diff -urN oldtree/Documentation/kernel-parameters.txt newtree/Documentation/kernel-parameters.txt
--- oldtree/Documentation/kernel-parameters.txt	2006-03-08 18:47:59.047819500 +0000
+++ newtree/Documentation/kernel-parameters.txt	2006-03-08 15:22:33.001489750 +0000
@@ -72,6 +72,7 @@
 	SERIAL	Serial support is enabled.
 	SMP	The kernel is an SMP kernel.
 	SPARC	Sparc architecture is enabled.
+	SUSPEND2 Suspend2 is enabled.
 	SWSUSP	Software suspend is enabled.
 	TS	Appropriate touchscreen support is enabled.
 	USB	USB support is enabled.
@@ -1051,6 +1052,8 @@
 	noresume	[SWSUSP] Disables resume and restores original swap
 			space.
 
+	noresume2	[SUSPEND2] Disables resume and restores the
+			original swap signature.
+
 	no-scroll	[VGA] Disables scrollback.
 			This is required for the Braillex ib80-piezo Braille
 			reader made by F.H. Papenmeier (Germany).
@@ -1315,6 +1318,11 @@
 	resume=		[SWSUSP]
 			Specify the partition device for software suspend
 
+	resume2=	[SUSPEND2] Specify the storage device for Suspend2.
+			Format: <writer>:<writer-parameters>.
+			See Documentation/power/suspend2.txt for details of
+			the formats for available image writers.
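+			Example (assuming the swap writer described in
+			that file): resume2=swap:/dev/hda2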
+
 	rhash_entries=	[KNL,NET]
 			Set number of hash buckets for route cache
 
diff -urN oldtree/Documentation/kernel-parameters.txt.orig newtree/Documentation/kernel-parameters.txt.orig
--- oldtree/Documentation/kernel-parameters.txt.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/Documentation/kernel-parameters.txt.orig	2006-03-08 15:21:14.488583000 +0000
@@ -0,0 +1,1687 @@
+February 2003             Kernel Parameters                     v2.5.59
+                          ~~~~~~~~~~~~~~~~~
+
+The following is a consolidated list of the kernel parameters as implemented
+(mostly) by the __setup() macro and sorted into English Dictionary order
+(defined as ignoring all punctuation and sorting digits before letters in a
+case insensitive manner), and with descriptions where known.
+
+Module parameters for loadable modules are specified only as the
+parameter name with optional '=' and value as appropriate, such as:
+
+	modprobe usbcore blinkenlights=1
+
+Module parameters for modules that are built into the kernel image
+are specified on the kernel command line with the module name plus
+'.' plus parameter name, with '=' and value if appropriate, such as:
+
+	usbcore.blinkenlights=1
+
+The text in square brackets at the beginning of the description states the
+restrictions on the kernel for the said kernel parameter to be valid. The
+restrictions referred to are that the relevant option is valid if:
+
+	ACPI	ACPI support is enabled.
+	ALSA	ALSA sound support is enabled.
+	APIC	APIC support is enabled.
+	APM	Advanced Power Management support is enabled.
+	AX25	Appropriate AX.25 support is enabled.
+	CD	Appropriate CD support is enabled.
+	DEVFS	devfs support is enabled.
+	DRM	Direct Rendering Management support is enabled.
+	EDD	BIOS Enhanced Disk Drive Services (EDD) is enabled
+	EFI	EFI Partitioning (GPT) is enabled
+	EIDE	EIDE/ATAPI support is enabled.
+	FB	The frame buffer device is enabled.
+	HW	Appropriate hardware is enabled.
+	IA-32	IA-32 aka i386 architecture is enabled.
+	IA-64	IA-64 architecture is enabled.
+	IOSCHED	More than one I/O scheduler is enabled.
+	IP_PNP	IP DHCP, BOOTP, or RARP is enabled.
+	ISAPNP	ISA PnP code is enabled.
+	ISDN	Appropriate ISDN support is enabled.
+	JOY	Appropriate joystick support is enabled.
+	LP	Printer support is enabled.
+	LOOP	Loopback device support is enabled.
+	M68k	M68k architecture is enabled.
+			These options have more detailed descriptions in
+			Documentation/m68k/kernel-options.txt.
+	MCA	MCA bus support is enabled.
+	MDA	MDA console support is enabled.
+	MOUSE	Appropriate mouse support is enabled.
+	MTD	MTD support is enabled.
+	NET	Appropriate network support is enabled.
+	NUMA	NUMA support is enabled.
+	GENERIC_TIME The generic timeofday code is enabled.
+	NFS	Appropriate NFS support is enabled.
+	OSS	OSS sound support is enabled.
+	PARIDE	The ParIDE subsystem is enabled.
+	PARISC	The PA-RISC architecture is enabled.
+	PCI	PCI bus support is enabled.
+	PCMCIA	The PCMCIA subsystem is enabled.
+	PNP	Plug & Play support is enabled.
+	PPC	PowerPC architecture is enabled.
+	PPT	Parallel port support is enabled.
+	PS2	Appropriate PS/2 support is enabled.
+	RAM	RAM disk support is enabled.
+	S390	S390 architecture is enabled.
+	SCSI	Appropriate SCSI support is enabled.
+			A lot of drivers have their options described inside of
+			Documentation/scsi/.
+	SELINUX SELinux support is enabled.
+	SERIAL	Serial support is enabled.
+	SMP	The kernel is an SMP kernel.
+	SPARC	Sparc architecture is enabled.
+	SWSUSP	Software suspend is enabled.
+	TS	Appropriate touchscreen support is enabled.
+	USB	USB support is enabled.
+	USBHID	USB Human Interface Device support is enabled.
+	V4L	Video For Linux support is enabled.
+	VGA	The VGA console has been enabled.
+	VT	Virtual terminal support is enabled.
+	WDT	Watchdog support is enabled.
+	XT	IBM PC/XT MFM hard disk support is enabled.
+	X86-64	X86-64 architecture is enabled.
+			More X86-64 boot options can be found in
+			Documentation/x86_64/boot-options.txt.
+
+In addition, the following text indicates that the option:
+
+	BUGS=	Relates to possible processor bugs on the said processor.
+	KNL	Is a kernel start-up parameter.
+	BOOT	Is a boot loader parameter.
+
+Parameters denoted with BOOT are actually interpreted by the boot
+loader, and have no meaning to the kernel directly.
+Do not modify the syntax of boot loader parameters without extreme
+need or coordination with <Documentation/i386/boot.txt>.
+
+Note that ALL kernel parameters listed below are CASE SENSITIVE, and that
+a trailing = on the name of any parameter states that that parameter will
+be entered as an environment variable, whereas its absence indicates that
+it will appear as a kernel argument readable via /proc/cmdline by programs
+running once the system is up.
+
+	53c7xx=		[HW,SCSI] Amiga SCSI controllers
+			See header of drivers/scsi/53c7xx.c.
+			See also Documentation/scsi/ncr53c7xx.txt.
+
+	acpi=		[HW,ACPI] Advanced Configuration and Power Interface
+			Format: { force | off | ht | strict | noirq }
+			force -- enable ACPI if default was off
+			off -- disable ACPI if default was on
+			noirq -- do not use ACPI for IRQ routing
+			ht -- run only enough ACPI to enable Hyper Threading
+			strict -- Be less tolerant of platforms that are not
+				strictly ACPI specification compliant.
+
+			See also Documentation/pm.txt, pci=noacpi
+
+	acpi_sleep=	[HW,ACPI] Sleep options
+			Format: { s3_bios, s3_mode }
+			See Documentation/power/video.txt
+
+	acpi_sci=	[HW,ACPI] ACPI System Control Interrupt trigger mode
+			Format: { level | edge | high | low }
+
+	acpi_irq_balance [HW,ACPI]
+			ACPI will balance active IRQs
+			default in APIC mode
+
+	acpi_irq_nobalance [HW,ACPI]
+			ACPI will not move active IRQs (default)
+			default in PIC mode
+
+	acpi_irq_pci=	[HW,ACPI] If irq_balance, clear listed IRQs for
+			use by PCI
+			Format: <irq>,<irq>...
+
+	acpi_irq_isa=	[HW,ACPI] If irq_balance, mark listed IRQs used by ISA
+			Format: <irq>,<irq>...
+
+	acpi_osi=	[HW,ACPI] empty param disables _OSI
+
+	acpi_serialize	[HW,ACPI] force serialization of AML methods
+
+	acpi_skip_timer_override [HW,ACPI]
+			Recognize and ignore IRQ0/pin2 Interrupt Override.
+			For broken nForce2 BIOS resulting in XT-PIC timer.
+
+	acpi_dbg_layer=	[HW,ACPI]
+			Format: <int>
+			Each bit of the <int> indicates an ACPI debug layer,
+			1: enable, 0: disable. It is useful for boot time
+			debugging. After system has booted up, it can be set
+			via /proc/acpi/debug_layer.
+
+	acpi_dbg_level=	[HW,ACPI]
+			Format: <int>
+			Each bit of the <int> indicates an ACPI debug level,
+			1: enable, 0: disable. It is useful for boot time
+			debugging. After system has booted up, it can be set
+			via /proc/acpi/debug_level.
+
+	acpi_fake_ecdt	[HW,ACPI] Workaround failure due to BIOS lacking ECDT
+
+	acpi_generic_hotkey [HW,ACPI]
+			Allow consolidated generic hotkey driver to
+			override platform specific driver.
+			See also Documentation/acpi-hotkey.txt.
+
+	enable_timer_pin_1 [i386,x86-64]
+			Enable PIN 1 of APIC timer
+			Can be useful to work around chipset bugs
+			(in particular on some ATI chipsets).
+			The kernel tries to set a reasonable default.
+
+	disable_timer_pin_1 [i386,x86-64]
+			Disable PIN 1 of APIC timer
+			Can be useful to work around chipset bugs.
+
+	ad1816=		[HW,OSS]
+			Format: <io>,<irq>,<dma>,<dma2>
+			See also Documentation/sound/oss/AD1816.
+
+	ad1848=		[HW,OSS]
+			Format: <io>,<irq>,<dma>,<dma2>,<type>
+
+	adlib=		[HW,OSS]
+			Format: <io>
+
+	advansys=	[HW,SCSI]
+			See header of drivers/scsi/advansys.c.
+
+	advwdt=		[HW,WDT] Advantech WDT
+			Format: <iostart>,<iostop>
+
+	aedsp16=	[HW,OSS] Audio Excel DSP 16
+			Format: <io>,<irq>,<dma>,<mss_io>,<mpu_io>,<mpu_irq>
+			See also header of sound/oss/aedsp16.c.
+
+	aha152x=	[HW,SCSI]
+			See Documentation/scsi/aha152x.txt.
+
+	aha1542=	[HW,SCSI]
+			Format: <portbase>[,<buson>,<busoff>[,<dmaspeed>]]
+
+	aic7xxx=	[HW,SCSI]
+			See Documentation/scsi/aic7xxx.txt.
+
+	aic79xx=	[HW,SCSI]
+			See Documentation/scsi/aic79xx.txt.
+
+	amijoy.map=	[HW,JOY] Amiga joystick support
+			Map of devices attached to JOY0DAT and JOY1DAT
+			Format: <a>,<b>
+			See also Documentation/input/joystick.txt
+
+	analog.map=	[HW,JOY] Analog joystick and gamepad support
+			Specifies type or capabilities of an analog joystick
+			connected to one of 16 gameports
+			Format: <type1>,<type2>,..<type16>
+
+	apc=		[HW,SPARC]
+			Power management functions (SPARCstation-4/5 + deriv.)
+			Format: noidle
+			Disable APC CPU standby support. SPARCstation-Fox does
+			not play well with APC CPU idle - disable it if you have
+			APC and your system crashes randomly.
+
+	apic=		[APIC,i386] Change the output verbosity whilst booting
+			Format: { quiet (default) | verbose | debug }
+			Change the amount of debugging information output
+			when initialising the APIC and IO-APIC components.
+
+	apm=		[APM] Advanced Power Management
+			See header of arch/i386/kernel/apm.c.
+
+	applicom=	[HW]
+			Format: <mem>,<irq>
+
+	arcrimi=	[HW,NET] ARCnet - "RIM I" (entirely mem-mapped) cards
+			Format: <io>,<irq>,<nodeID>
+
+	ataflop=	[HW,M68k]
+
+	atarimouse=	[HW,MOUSE] Atari Mouse
+
+	atascsi=	[HW,SCSI] Atari SCSI
+
+	atkbd.extra=	[HW] Enable extra LEDs and keys on IBM RapidAccess,
+			EzKey and similar keyboards
+
+	atkbd.reset=	[HW] Reset keyboard during initialization
+
+	atkbd.set=	[HW] Select keyboard code set
+			Format: <int> (2 = AT (default), 3 = PS/2)
+
+	atkbd.scroll=	[HW] Enable scroll wheel on MS Office and similar
+			keyboards
+
+	atkbd.softraw=	[HW] Choose between synthetic and real raw mode
+			Format: <bool> (0 = real, 1 = synthetic (default))
+
+	atkbd.softrepeat= [HW]
+			Use software keyboard repeat
+
+	autotest	[IA64]
+
+	awe=		[HW,OSS] AWE32/SB32/AWE64 wave table synth
+			Format: <io>,<memsize>,<isapnp>
+
+	aztcd=		[HW,CD] Aztech CD268 CDROM driver
+			Format: <io>,0x79 (?)
+
+	baycom_epp=	[HW,AX25]
+			Format: <io>,<mode>
+
+	baycom_par=	[HW,AX25] BayCom Parallel Port AX.25 Modem
+			Format: <io>,<mode>
+			See header of drivers/net/hamradio/baycom_par.c.
+
+	baycom_ser_fdx=	[HW,AX25]
+			BayCom Serial Port AX.25 Modem (Full Duplex Mode)
+			Format: <io>,<irq>,<mode>[,<baud>]
+			See header of drivers/net/hamradio/baycom_ser_fdx.c.
+
+	baycom_ser_hdx=	[HW,AX25]
+			BayCom Serial Port AX.25 Modem (Half Duplex Mode)
+			Format: <io>,<irq>,<mode>
+			See header of drivers/net/hamradio/baycom_ser_hdx.c.
+
+	blkmtd_device=	[HW,MTD]
+	blkmtd_erasesz=
+	blkmtd_ro=
+	blkmtd_bs=
+	blkmtd_count=
+
+	bttv.card=	[HW,V4L] bttv (bt848 + bt878 based grabber cards)
+	bttv.radio=	Most important insmod options are available as
+			kernel args too.
+	bttv.pll=	See Documentation/video4linux/bttv/Insmod-options
+	bttv.tuner=	and Documentation/video4linux/bttv/CARDLIST
+
+	BusLogic=	[HW,SCSI]
+			See drivers/scsi/BusLogic.c, comment before function
+			BusLogic_ParseDriverOptions().
+
+	c101=		[NET] Moxa C101 synchronous serial card
+
+	cachesize=	[BUGS=IA-32] Override level 2 CPU cache size detection.
+			Sometimes CPU hardware bugs make them report the cache
+			size incorrectly. The kernel will attempt work arounds
+			to fix known problems, but for some CPUs it is not
+			possible to determine what the correct size should be.
+			This option provides an override for these situations.
+
+	cdu31a=		[HW,CD]
+			Format: <io>,<irq>[,PAS]
+			See header of drivers/cdrom/cdu31a.c.
+
+	chandev=	[HW,NET] Generic channel device initialisation
+
+	checkreqprot	[SELINUX] Set initial checkreqprot flag value.
+			Format: { "0" | "1" }
+			See security/selinux/Kconfig help text.
+			0 -- check protection applied by kernel (includes
+				any implied execute protection).
+			1 -- check protection requested by application.
+			Default value is set via a kernel config option.
+			Value can be changed at runtime via
+				/selinux/checkreqprot.
+
+	clock=		[BUGS=IA-32, HW] gettimeofday clocksource override.
+			[Deprecated]
+			Forces specified clocksource (if available) to be used
+			when calculating gettimeofday(). If specified
+			clocksource is not available, it defaults to PIT.
+			Format: { pit | tsc | cyclone | pmtmr }
+
+	disable_8254_timer
+	enable_8254_timer
+			[IA32/X86_64] Disable/Enable interrupt 0 timer routing
+			over the 8254 in addition to over the IO-APIC. The
+			kernel tries to set a sensible default.
+
+	hpet=		[IA-32,HPET] option to disable HPET and use PIT.
+			Format: disable
+
+	cm206=		[HW,CD]
+			Format: { auto | [<io>,][<irq>] }
+
+	com20020=	[HW,NET] ARCnet - COM20020 chipset
+			Format:
+			<io>[,<irq>[,<nodeID>[,<backplane>[,<ckp>[,<timeout>]]]]]
+
+	com90io=	[HW,NET] ARCnet - COM90xx chipset (IO-mapped buffers)
+			Format: <io>[,<irq>]
+
+	com90xx=	[HW,NET]
+			ARCnet - COM90xx chipset (memory-mapped buffers)
+			Format: <io>[,<irq>[,<memstart>]]
+
+	condev=		[HW,S390] console device
+	conmode=
+
+	console=	[KNL] Output console device and options.
+
+		tty<n>	Use the virtual console device <n>.
+
+		ttyS<n>[,options]
+			Use the specified serial port.  The options are of
+			the form "bbbbpn", where "bbbb" is the baud rate,
+			"p" is parity ("n", "o", or "e"), and "n" is bits.
+			Default is "9600n8".
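+			For example, console=ttyS0,9600n8 selects the first
+			serial port at 9600 baud, no parity, 8 data bits.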
+
+			See also Documentation/serial-console.txt.
+
+		uart,io,<addr>[,options]
+		uart,mmio,<addr>[,options]
+			Start an early, polled-mode console on the 8250/16550
+			UART at the specified I/O port or MMIO address,
+			switching to the matching ttyS device later.  The
+			options are the same as for ttyS, above.
+
+	cpcihp_generic=	[HW,PCI] Generic port I/O CompactPCI driver
+			Format:
+			<first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>]
+
+	cpia_pp=	[HW,PPT]
+			Format: { parport<nr> | auto | none }
+
+	crashkernel=nn[KMG]@ss[KMG]
+			[KNL] Reserve a chunk of physical memory to
+			hold a kernel to switch to with kexec on panic.
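+			Example: crashkernel=64M@16M reserves 64MB of RAM,
+			starting at the 16MB boundary, for the crash kernel.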
+
+	cs4232=		[HW,OSS]
+			Format: <io>,<irq>,<dma>,<dma2>,<mpuio>,<mpuirq>
+
+	cs89x0_dma=	[HW,NET]
+			Format: <dma>
+
+	cs89x0_media=	[HW,NET]
+			Format: { rj45 | aui | bnc }
+
+	cyclades=	[HW,SERIAL] Cyclades multi-serial port adapter.
+
+	dasd=		[HW,NET]
+			See header of drivers/s390/block/dasd_devmap.c.
+
+	db9.dev[2|3]=	[HW,JOY] Multisystem joystick support via parallel port
+			(one device per port)
+			Format: <port#>,<type>
+			See also Documentation/input/joystick-parport.txt
+
+	debug		[KNL] Enable kernel debugging (events log level).
+
+	decnet=		[HW,NET]
+			Format: <area>[,<node>]
+			See also Documentation/networking/decnet.txt.
+
+	devfs=		[DEVFS]
+			See Documentation/filesystems/devfs/boot-options.
+
+	dhash_entries=	[KNL]
+			Set number of hash buckets for dentry cache.
+
+	digi=		[HW,SERIAL]
+			IO parameters + enable/disable command.
+
+	digiepca=	[HW,SERIAL]
+			See drivers/char/README.epca and
+			Documentation/digiepca.txt.
+
+	dmascc=		[HW,AX25,SERIAL] AX.25 Z80SCC driver with DMA
+			support available.
+			Format: <io_dev0>[,<io_dev1>[,..<io_dev32>]]
+
+	dmasound=	[HW,OSS] Sound subsystem buffers
+
+	dscc4.setup=	[NET]
+
+	dtc3181e=	[HW,SCSI]
+
+	earlyprintk=	[IA-32,X86-64]
+			earlyprintk=vga
+			earlyprintk=serial[,ttySn[,baudrate]]
+
+			Append ",keep" to not disable it when the real console
+			takes over.
+
+			Only vga or serial at a time, not both.
+
+			Currently only ttyS0 and ttyS1 are supported.
+
+			Interaction with the standard serial driver is not
+			very good.
+
+			The VGA output is eventually overwritten by the real
+			console.
+
+	eata=		[HW,SCSI]
+
+	ec_intr=	[HW,ACPI] ACPI Embedded Controller interrupt mode
+			Format: <int>
+			0: polling mode
+			non-0: interrupt mode (default)
+
+	eda=		[HW,PS2]
+
+	edb=		[HW,PS2]
+
+	edd=		[EDD]
+			Format: {"of[f]" | "sk[ipmbr]"}
+			See comment in arch/i386/boot/edd.S
+
+	eicon=		[HW,ISDN]
+			Format: <id>,<membase>,<irq>
+
+	eisa_irq_edge=	[PARISC,HW]
+			See header of drivers/parisc/eisa.c.
+
+	elanfreq=	[IA-32]
+			See comment before function elanfreq_setup() in
+			arch/i386/kernel/cpu/cpufreq/elanfreq.c.
+
+	elevator=	[IOSCHED]
+			Format: {"anticipatory" | "cfq" | "deadline" | "noop"}
+			See Documentation/block/as-iosched.txt and
+			Documentation/block/deadline-iosched.txt for details.
+
+	elfcorehdr=	[IA-32, X86_64]
+			Specifies physical address of start of kernel core
+			image elf header. Generally kexec loader will
+			pass this option to capture kernel.
+			See Documentation/kdump/kdump.txt for details.
+
+	enforcing	[SELINUX] Set initial enforcing status.
+			Format: {"0" | "1"}
+			See security/selinux/Kconfig help text.
+			0 -- permissive (log only, no denials).
+			1 -- enforcing (deny and log).
+			Default value is 0.
+			Value can be changed at runtime via /selinux/enforce.
+
+	es1370=		[HW,OSS]
+			Format: <lineout>[,<micbias>]
+			See also header of sound/oss/es1370.c.
+
+	es1371=		[HW,OSS]
+			Format: <spdif>,[<nomix>,[<amplifier>]]
+			See also header of sound/oss/es1371.c.
+
+	ether=		[HW,NET] Ethernet cards parameters
+			This option is obsoleted by the "netdev=" option, which
+			has equivalent usage. See its documentation for details.
+
+	eurwdt=		[HW,WDT] Eurotech CPU-1220/1410 onboard watchdog.
+			Format: <io>[,<irq>]
+
+	fd_mcs=		[HW,SCSI]
+			See header of drivers/scsi/fd_mcs.c.
+
+	fdomain=	[HW,SCSI]
+			See header of drivers/scsi/fdomain.c.
+
+	floppy=		[HW]
+			See Documentation/floppy.txt.
+
+	ftape=		[HW] Floppy Tape subsystem debugging options.
+			See Documentation/ftape.txt.
+
+	gamecon.map[2|3]=
+			[HW,JOY] Multisystem joystick and NES/SNES/PSX pad
+			support via parallel port (up to 5 devices per port)
+			Format: <port#>,<pad1>,<pad2>,<pad3>,<pad4>,<pad5>
+			See also Documentation/input/joystick-parport.txt
+
+	gamma=		[HW,DRM]
+
+	gdth=		[HW,SCSI]
+			See header of drivers/scsi/gdth.c.
+
+	gpt		[EFI] Forces disk with valid GPT signature but
+			invalid Protective MBR to be treated as GPT.
+
+	gscd=		[HW,CD]
+			Format: <io>
+
+	gt96100eth=	[NET] MIPS GT96100 Advanced Communication Controller
+
+	gus=		[HW,OSS]
+			Format: <io>,<irq>,<dma>,<dma16>
+
+	gvp11=		[HW,SCSI]
+
+	hashdist=	[KNL,NUMA] Large hashes allocated during boot
+			are distributed across NUMA nodes.  Defaults on
+			for IA-64, off otherwise.
+			Format: 0 | 1 (for off | on)
+
+	hcl=		[IA-64] SGI's Hardware Graph compatibility layer
+
+	hd=		[EIDE] (E)IDE hard drive subsystem geometry
+			Format: <cyl>,<head>,<sect>
+
+	hd?=		[HW] (E)IDE subsystem
+	hd?lun=		See Documentation/ide.txt.
+
+	highmem=nn[KMG]	[KNL,BOOT] forces the highmem zone to have an exact
+			size of <nn>. This works even on boxes that have no
+			highmem otherwise. This also works to reduce highmem
+			size on bigger boxes.
+
+	hisax=		[HW,ISDN]
+			See Documentation/isdn/README.HiSax.
+
+	hugepages=	[HW,IA-32,IA-64] Maximal number of HugeTLB pages.
+
+	noirqbalance	[IA-32,SMP,KNL] Disable kernel irq balancing
+
+	i8042.direct	[HW] Put keyboard port into non-translated mode
+	i8042.dumbkbd	[HW] Pretend that the controller can only read data
+			     from the keyboard and cannot control its state
+			     (don't attempt to blink the LEDs)
+	i8042.noaux	[HW] Don't check for auxiliary (== mouse) port
+	i8042.nokbd	[HW] Don't check/create keyboard port
+	i8042.nomux	[HW] Don't check presence of an active multiplexing
+			     controller
+	i8042.nopnp	[HW] Don't use ACPIPnP / PnPBIOS to discover KBD/AUX
+			     controllers
+	i8042.panicblink=
+			[HW] Frequency with which keyboard LEDs should blink
+			     when kernel panics (default is 0.5 sec)
+	i8042.reset	[HW] Reset the controller during init and cleanup
+	i8042.unlock	[HW] Unlock (ignore) the keylock
+
+	i810=		[HW,DRM]
+
+	i8k.ignore_dmi	[HW] Continue probing hardware even if DMI data
+			indicates that the driver is running on unsupported
+			hardware.
+	i8k.force	[HW] Activate i8k driver even if SMM BIOS signature
+			does not match list of supported models.
+	i8k.power_status
+			[HW] Report power status in /proc/i8k
+			(disabled by default)
+	i8k.restricted	[HW] Allow controlling fans only if SYS_ADMIN
+			capability is set.
+
+	ibmmcascsi=	[HW,MCA,SCSI] IBM MicroChannel SCSI adapter
+			See Documentation/mca.txt.
+
+	icn=		[HW,ISDN]
+			Format: <io>[,<membase>[,<icn_id>[,<icn_id2>]]]
+
+	ide=		[HW] (E)IDE subsystem
+			Format: ide=nodma or ide=doubler or ide=reverse
+			See Documentation/ide.txt.
+
+	ide?=		[HW] (E)IDE subsystem
+			Format: ide?=noprobe or chipset specific parameters.
+			See Documentation/ide.txt.
+
+	idebus=		[HW] (E)IDE subsystem - VLB/PCI bus speed
+			See Documentation/ide.txt.
+
+	idle=		[HW]
+			Format: idle=poll or idle=halt
+
+	ihash_entries=	[KNL]
+			Set number of hash buckets for inode cache.
+
+	in2000=		[HW,SCSI]
+			See header of drivers/scsi/in2000.c.
+
+	init=		[KNL]
+			Format: <full_path>
+			Run specified binary instead of /sbin/init as init
+			process.
+
+	initcall_debug	[KNL] Trace initcalls as they are executed.  Useful
+			for working out where the kernel is dying during
+			startup.
+
+	initrd=		[BOOT] Specify the location of the initial ramdisk
+
+	inport.irq=	[HW] Inport (ATI XL and Microsoft) busmouse driver
+			Format: <irq>
+
+	combined_mode=	[HW] control which driver uses IDE ports in combined
+			mode: legacy IDE driver, libata, or both
+			(in the libata case, libata.atapi_enabled=1 may be
+			useful as well).  Note that using the ide or libata
+			options may affect your device naming (e.g. by
+			changing hdc to sdb).
+			Format: combined (default), ide, or libata
+
+	inttest=	[IA64]
+
+	io7=		[HW] IO7 for Marvel based alpha systems
+			See comment before marvel_specify_io7 in
+			arch/alpha/kernel/core_marvel.c.
+
+	ip=		[IP_PNP]
+			See Documentation/nfsroot.txt.
+
+	ip2=		[HW] Set IO/IRQ pairs for up to 4 IntelliPort boards
+			See comment before ip2_setup() in drivers/char/ip2.c.
+
+	ips=		[HW,SCSI] Adaptec / IBM ServeRAID controller
+			See header of drivers/scsi/ips.c.
+
+	irqfixup	[HW]
+			When an interrupt is not handled search all handlers
+			for it. Intended to get systems with badly broken
+			firmware running.
+
+	irqpoll		[HW]
+			When an interrupt is not handled search all handlers
+			for it. Also check all handlers each timer
+			interrupt. Intended to get systems with badly broken
+			firmware running.
+
+	isapnp=		[ISAPNP]
+			Format: <RDP>,<reset>,<pci_scan>,<verbosity>
+
+	isolcpus=	[KNL,SMP] Isolate CPUs from the general scheduler.
+			Format: <cpu number>,...,<cpu number>
+			This option can be used to specify one or more CPUs
+			to isolate from the general SMP balancing and scheduling
+			algorithms. The only way to move a process onto or off
+			an "isolated" CPU is via the CPU affinity syscalls.
+			<cpu number> begins at 0 and the maximum value is
+			"number of CPUs in system - 1".
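+			Example: isolcpus=1,3 removes CPUs 1 and 3 from
+			general scheduling; tasks run there only when given
+			explicit CPU affinity.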
+
+			This option is the preferred way to isolate CPUs. The
+			alternative -- manually setting the CPU mask of all
+			tasks in the system -- can cause problems and
+			suboptimal load balancer performance.
+
+	isp16=		[HW,CD]
+			Format: <io>,<irq>,<dma>,<setup>
+
+	iucv=		[HW,NET]
+
+	js=		[HW,JOY] Analog joystick
+			See Documentation/input/joystick.txt.
+
+	keepinitrd	[HW,ARM]
+
+	kstack=N	[IA-32,X86-64] Print N words from the kernel stack
+			in oops dumps.
+
+	l2cr=		[PPC]
+
+	lapic		[IA-32,APIC] Enable the local APIC even if BIOS
+			disabled it.
+
+	lasi=		[HW,SCSI] PARISC LASI driver for the 53c700 chip
+			Format: addr:<io>,irq:<irq>
+
+	llsc*=		[IA64] See function print_params() in
+			arch/ia64/sn/kernel/llsc4.c.
+
+	load_ramdisk=	[RAM] List of ramdisks to load from floppy
+			See Documentation/ramdisk.txt.
+
+	lockd.nlm_grace_period=P  [NFS] Assign grace period.
+			Format: <integer>
+
+	lockd.nlm_tcpport=N	[NFS] Assign TCP port.
+			Format: <integer>
+
+	lockd.nlm_timeout=T	[NFS] Assign timeout value.
+			Format: <integer>
+
+	lockd.nlm_udpport=M	[NFS] Assign UDP port.
+			Format: <integer>
+
+	logibm.irq=	[HW,MOUSE] Logitech Bus Mouse Driver
+			Format: <irq>
+
+	loglevel=	All Kernel Messages with a loglevel smaller than the
+			console loglevel will be printed to the console. It can
+			also be changed with klogd or other programs. The
+			loglevels are defined as follows:
+
+			0 (KERN_EMERG)		system is unusable
+			1 (KERN_ALERT)		action must be taken immediately
+			2 (KERN_CRIT)		critical conditions
+			3 (KERN_ERR)		error conditions
+			4 (KERN_WARNING)	warning conditions
+			5 (KERN_NOTICE)		normal but significant condition
+			6 (KERN_INFO)		informational
+			7 (KERN_DEBUG)		debug-level messages
+
+	log_buf_len=n	Sets the size of the printk ring buffer, in bytes.
+			Format: { n | nk | nM }
+			n must be a power of two.  The default size
+			is set in the kernel config file.
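+			Example: log_buf_len=128k sets a 131072-byte buffer.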
+
+	lp=0		[LP]	Specify parallel ports to use, e.g,
+	lp=port[,port...]	lp=none,parport0 (lp0 not configured, lp1 uses
+	lp=reset		first parallel port). 'lp=0' disables the
+	lp=auto			printer driver. 'lp=reset' (which can be
+				specified in addition to the ports) causes
+				attached printers to be reset. Using
+				lp=port1,port2,... specifies the parallel ports
+				to associate lp devices with, starting with
+				lp0. A port specification may be 'none' to skip
+				that lp device, or a parport name such as
+				'parport0'. Specifying 'lp=auto' instead of a
+				port specification list means that device IDs
+				from each port should be examined, to see if
+				an IEEE 1284-compliant printer is attached; if
+				so, the driver will manage that printer.
+				See also header of drivers/char/lp.c.
+
+	lpj=n		[KNL]
+			Sets loops_per_jiffy to given constant, thus avoiding
+			time-consuming boot-time autodetection (up to 250 ms per
+			CPU). 0 enables autodetection (default). To determine
+			the correct value for your kernel, boot with normal
+			autodetection and see what value is printed. Note that
+			on SMP systems the preset will be applied to all CPUs,
+			which is likely to cause problems if your CPUs need
+			significantly divergent settings. An incorrect value
+			will cause delays in the kernel to be wrong, leading to
+			unpredictable I/O errors and other breakage. Although
+			unlikely, in the extreme case this might damage your
+			hardware.
+
+	ltpc=		[NET]
+			Format: <io>,<irq>,<dma>
+
+	mac5380=	[HW,SCSI] Format:
+			<can_queue>,<cmd_per_lun>,<sg_tablesize>,<hostid>,<use_tags>
+
+	mac53c9x=	[HW,SCSI] Format:
+			<num_esps>,<disconnect>,<nosync>,<can_queue>,<cmd_per_lun>,<sg_tablesize>,<hostid>,<use_tags>
+
+	machvec=	[IA64] Force the use of a particular machine-vector
+			(machvec) in a generic kernel.
+			Example: machvec=hpzx1_swiotlb
+
+	mad16=		[HW,OSS] Format:
+			<io>,<irq>,<dma>,<dma16>,<mpu_io>,<mpu_irq>,<joystick>
+
+	maui=		[HW,OSS]
+			Format: <io>,<irq>
+
+	max_loop=	[LOOP] Maximum number of loopback devices that can
+			be mounted
+			Format: <1-256>
+
+	maxcpus=	[SMP] Maximum number of processors that an SMP kernel
+			should make use of
+
+	max_addr=[KMG]	[KNL,BOOT,ia64] All physical memory greater than or
+			equal to this physical address is ignored.
+
+	max_luns=	[SCSI] Maximum number of LUNs to probe.
+			Should be between 1 and 2^32-1.
+
+	max_report_luns=
+			[SCSI] Maximum number of LUNs received.
+			Should be between 1 and 16384.
+
+	mca-pentium	[BUGS=IA-32]
+
+	mcatest=	[IA-64]
+
+	mcd=		[HW,CD]
+			Format: <port>,<irq>,<mitsumi_bug_93_wait>
+
+	mcdx=		[HW,CD]
+
+	mce		[IA-32] Machine Check Exception
+
+	md=		[HW] RAID subsystems devices and level
+			See Documentation/md.txt.
+
+	mdacon=		[MDA]
+			Format: <first>,<last>
+			Specifies range of consoles to be captured by the MDA.
+
+	mem=nn[KMG]	[KNL,BOOT] Force usage of a specific amount of memory
+			Amount of memory to be used when the kernel is not able
+			to see the whole system memory or for test.
+			[IA-32] Use together with memmap= to avoid physical
+			address space collisions. Without memmap= PCI devices
+			could be placed at addresses belonging to unused RAM.
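+			Example: mem=512M limits the kernel to the first
+			512MB of memory.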
+
+	mem=nopentium	[BUGS=IA-32] Disable usage of 4MB pages for kernel
+			memory.
+
+	memmap=exactmap	[KNL,IA-32,X86_64] Enable setting of an exact
+			E820 memory map, as specified by the user.
+			Such memmap=exactmap lines can be constructed based on
+			BIOS output or other requirements. See the memmap=nn@ss
+			option description.
+
+	memmap=nn[KMG]@ss[KMG]
+			[KNL] Force usage of a specific region of memory
+			Region of memory to be used, from ss to ss+nn.
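+			Example: memmap=64M@16M marks the 64MB region
+			starting at 16MB as usable RAM.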
+
+	memmap=nn[KMG]#ss[KMG]
+			[KNL,ACPI] Mark specific memory as ACPI data.
+			Region of memory to be used, from ss to ss+nn.
+
+	memmap=nn[KMG]$ss[KMG]
+			[KNL,ACPI] Mark specific memory as reserved.
+			Region of memory to be used, from ss to ss+nn.
+
+	meye.*=		[HW] Set MotionEye Camera parameters
+			See Documentation/video4linux/meye.txt.
+
+	mga=		[HW,DRM]
+
+	migration_cost=
+			[KNL,SMP] debug: override scheduler migration costs
+			Format: <level-1-usecs>,<level-2-usecs>,...
+			This debugging option can be used to override the
+			default scheduler migration cost matrix. The numbers
+			are indexed by 'CPU domain distance'.
+			E.g. migration_cost=1000,2000,3000 on an SMT NUMA
+			box will set up an intra-core migration cost of
+			1 msec, an inter-core migration cost of 2 msecs,
+			and an inter-node migration cost of 3 msecs.
+
+			WARNING: using the wrong values here can break
+			scheduler performance, so it's only for scheduler
+			development purposes, not production environments.
+
+	migration_debug=
+			[KNL,SMP] migration cost auto-detect verbosity
+			Format=<0|1|2>
+			If a system's migration matrix reported at bootup
+			seems erroneous then this option can be used to
+			increase verbosity of the detection process.
+			We default to 0 (no extra messages), 1 will print
+			some more information, and 2 will be really
+			verbose (probably only useful if you also have a
+			serial console attached to the system).
+
+	migration_factor=
+			[KNL,SMP] multiply/divide migration costs by a factor
+			Format=<percent>
+			This debug option can be used to proportionally
+			increase or decrease the auto-detected migration
+			costs for all entries of the migration matrix.
+			E.g. migration_factor=150 will increase migration
+			costs by 50%. (and thus the scheduler will be less
+			eager migrating cache-hot tasks)
+			migration_factor=80 will decrease migration costs
+			by 20%. (thus the scheduler will be more eager to
+			migrate tasks)
+
+			WARNING: using the wrong values here can break
+			scheduler performance, so it's only for scheduler
+			development purposes, not production environments.
+
+	mousedev.tap_time=
+			[MOUSE] Maximum time between finger touching and
+			leaving touchpad surface for touch to be considered
+			a tap and be reported as a left button click (for
+			touchpads working in absolute mode only).
+			Format: <msecs>
+	mousedev.xres=	[MOUSE] Horizontal screen resolution, used for devices
+			reporting absolute coordinates, such as tablets
+	mousedev.yres=	[MOUSE] Vertical screen resolution, used for devices
+			reporting absolute coordinates, such as tablets
+
+	mpu401=		[HW,OSS]
+			Format: <io>,<irq>
+
+	MTD_Partition=	[MTD]
+			Format: <name>,<region-number>,<size>,<offset>
+
+	MTD_Region=	[MTD] Format:
+			<name>,<region-number>[,<base>,<size>,<buswidth>,<altbuswidth>]
+
+	mtdparts=	[MTD]
+			See drivers/mtd/cmdline.c.
+
+	mtouchusb.raw_coordinates=
+			[HW] Make the MicroTouch USB driver use raw coordinates
+			('y', default) or cooked coordinates ('n')
+
+	n2=		[NET] SDL Inc. RISCom/N2 synchronous serial card
+
+	NCR_D700=	[HW,SCSI]
+			See header of drivers/scsi/NCR_D700.c.
+
+	ncr5380=	[HW,SCSI]
+
+	ncr53c400=	[HW,SCSI]
+
+	ncr53c400a=	[HW,SCSI]
+
+	ncr53c406a=	[HW,SCSI]
+
+	ncr53c8xx=	[HW,SCSI]
+
+	netdev=		[NET] Network devices parameters
+			Format: <irq>,<io>,<mem_start>,<mem_end>,<name>
+			Note that mem_start is often overloaded to mean
+			something different and driver-specific.
+			This usage is only documented in each driver source
+			file if at all.
+
+	nfsaddrs=	[NFS]
+			See Documentation/nfsroot.txt.
+
+	nfsroot=	[NFS] nfs root filesystem for disk-less boxes.
+			See Documentation/nfsroot.txt.
+
+	nfs.callback_tcpport=
+			[NFS] set the TCP port on which the NFSv4 callback
+			channel should listen.
+
+	nfs.idmap_cache_timeout=
+			[NFS] set the maximum lifetime for idmapper cache
+			entries.
+
+	nmi_watchdog=	[KNL,BUGS=IA-32] Debugging features for SMP kernels
+
+	no387		[BUGS=IA-32] Tells the kernel to use the 387 maths
+			emulation library even if a 387 maths coprocessor
+			is present.
+
+	noalign		[KNL,ARM]
+
+	noapic		[SMP,APIC] Tells the kernel to not make use of any
+			IOAPICs that may be present in the system.
+
+	noasync		[HW,M68K] Disables async and sync negotiation for
+			all devices.
+
+	nobats		[PPC] Do not use BATs for mapping kernel lowmem
+			on "Classic" PPC cores.
+
+	nocache		[ARM]
+
+	nodisconnect	[HW,SCSI,M68K] Disables SCSI disconnects.
+
+	noexec		[IA-64]
+
+	noexec		[IA-32,X86-64]
+			noexec=on: enable non-executable mappings (default)
+			noexec=off: disable non-executable mappings
+
+	nofxsr		[BUGS=IA-32] Disables x86 floating point extended
+			register save and restore. The kernel will only save
+			legacy floating-point registers on task switch.
+
+	nohlt		[BUGS=ARM]
+
+	no-hlt		[BUGS=IA-32] Tells the kernel that the hlt
+			instruction doesn't work correctly and not to
+			use it.
+
+	nohalt		[IA-64] Tells the kernel not to use the power saving
+			function PAL_HALT_LIGHT when idle. This increases
+			power-consumption. On the positive side, it reduces
+			interrupt wake-up latency, which may improve performance
+			in certain environments such as networked servers or
+			real-time systems.
+
+	noirqdebug	[IA-32] Disables the code which attempts to detect and
+			disable unhandled interrupt sources.
+
+	noisapnp	[ISAPNP] Disables ISA PnP code.
+
+	noinitrd	[RAM] Tells the kernel not to load any configured
+			initial RAM disk.
+
+	nointroute	[IA-64]
+
+	nolapic		[IA-32,APIC] Do not enable or use the local APIC.
+
+	noltlbs		[PPC] Do not use large page/tlb entries for kernel
+			lowmem mapping on PPC40x.
+
+	nomce		[IA-32] Machine Check Exception
+
+	nomca		[IA-64] Disable machine check abort handling
+
+	noresidual	[PPC] Don't use residual data on PReP machines.
+
+	noresume	[SWSUSP] Disables resume and restores original swap
+			space.
+
+	no-scroll	[VGA] Disables scrollback.
+			This is required for the Braillex ib80-piezo Braille
+			reader made by F.H. Papenmeier (Germany).
+
+	nosbagart	[IA-64]
+
+	nosep		[BUGS=IA-32] Disables x86 SYSENTER/SYSEXIT support.
+
+	nosmp		[SMP] Tells an SMP kernel to act as a UP kernel.
+
+	nosync		[HW,M68K] Disables sync negotiation for all devices.
+
+	notsc		[BUGS=IA-32] Disable Time Stamp Counter
+
+	nousb		[USB] Disable the USB subsystem
+
+	nowb		[ARM]
+
+	nr_uarts=	[SERIAL] maximum number of UARTs to be registered.
+
+	opl3=		[HW,OSS]
+			Format: <io>
+
+	opl3sa=		[HW,OSS]
+			Format: <io>,<irq>,<dma>,<dma2>,<mpu_io>,<mpu_irq>
+
+	opl3sa2=	[HW,OSS] Format:
+			<io>,<irq>,<dma>,<dma2>,<mss_io>,<mpu_io>,<ymode>,<loopback>[,<isapnp>,<multiple]
+
+	oprofile.timer=	[HW]
+			Use timer interrupt instead of performance counters
+
+	optcd=		[HW,CD]
+			Format: <io>
+
+	osst=		[HW,SCSI] SCSI Tape Driver
+			Format: <buffer_size>,<write_threshold>
+			See also Documentation/scsi/st.txt.
+
+	panic=		[KNL] Kernel behaviour on panic
+			Format: <timeout>
+
+	parkbd.port=	[HW] Parallel port number the keyboard adapter is
+			connected to, default is 0.
+			Format: <parport#>
+	parkbd.mode=	[HW] Parallel port keyboard adapter mode of operation,
+			0 for XT, 1 for AT (default is AT).
+			Format: <mode>
+
+	parport=	[HW,PPT] Specify parallel ports. 0 disables.
+			Format: { 0 | auto | 0xBBB[,IRQ[,DMA]] }
+			Use 'auto' to force the driver to use any
+			IRQ/DMA settings detected (the default is to
+			ignore detected IRQ/DMA settings because of
+			possible conflicts). You can specify the base
+			address, IRQ, and DMA settings; IRQ and DMA
+			should be numbers, or 'auto' (for using detected
+			settings on that particular port), or 'nofifo'
+			(to avoid using a FIFO even if it is detected).
+			Parallel ports are assigned in the order they
+			are specified on the command line, starting
+			with parport0.
+
+	parport_init_mode=	[HW,PPT]
+			Configure VIA parallel port to operate in
+			a specific mode. This is necessary on Pegasos
+			computer where firmware has no options for setting
+			up parallel port mode and sets it to spp.
+			Currently this function knows 686a and 8231 chips.
+			Format: [spp|ps2|epp|ecp|ecpepp]
+
+	pas2=		[HW,OSS] Format:
+			<io>,<irq>,<dma>,<dma16>,<sb_io>,<sb_irq>,<sb_dma>,<sb_dma16>
+
+	pas16=		[HW,SCSI]
+			See header of drivers/scsi/pas16.c.
+
+	pause_on_oops=
+			Halt all CPUs after the first oops has been printed for
+			the specified number of seconds.  This is to be used if
+			your oopses keep scrolling off the screen.
+
+	pcbit=		[HW,ISDN]
+
+	pcd.		[PARIDE]
+			See header of drivers/block/paride/pcd.c.
+			See also Documentation/paride.txt.
+
+	pci=option[,option...]	[PCI] various PCI subsystem options:
+		off		[IA-32] don't probe for the PCI bus
+		bios		[IA-32] force use of PCI BIOS, don't access
+				the hardware directly. Use this if your machine
+				has a non-standard PCI host bridge.
+		nobios		[IA-32] disallow use of PCI BIOS, only direct
+				hardware access methods are allowed. Use this
+				if you experience crashes upon bootup and you
+				suspect they are caused by the BIOS.
+		conf1		[IA-32] Force use of PCI Configuration
+				Mechanism 1.
+		conf2		[IA-32] Force use of PCI Configuration
+				Mechanism 2.
+		nommconf	[IA-32,X86_64] Disable use of MMCONFIG for PCI
+				Configuration
+		nosort		[IA-32] Don't sort PCI devices according to
+				order given by the PCI BIOS. This sorting is
+				done to get a device order compatible with
+				older kernels.
+		biosirq		[IA-32] Use PCI BIOS calls to get the interrupt
+				routing table. These calls are known to be buggy
+				on several machines and they hang the machine
+				when used, but on other computers it's the only
+				way to get the interrupt routing table. Try
+				this option if the kernel is unable to allocate
+				IRQs or discover secondary PCI buses on your
+				motherboard.
+		rom		[IA-32] Assign address space to expansion ROMs.
+				Use with caution as certain devices share
+				address decoders between ROMs and other
+				resources.
+		irqmask=0xMMMM	[IA-32] Set a bit mask of IRQs allowed to be
+				assigned automatically to PCI devices. You can
+				make the kernel exclude IRQs of your ISA cards
+				this way.
+		pirqaddr=0xAAAAA	[IA-32] Specify the physical address
+				of the PIRQ table (normally generated
+				by the BIOS) if it is outside the
+				F0000h-100000h range.
+		lastbus=N	[IA-32] Scan all buses thru bus #N. Can be
+				useful if the kernel is unable to find your
+				secondary buses and you want to tell it
+				explicitly which ones they are.
+		assign-busses	[IA-32] Always assign all PCI bus
+				numbers ourselves, overriding
+				whatever the firmware may have done.
+		usepirqmask	[IA-32] Honor the possible IRQ mask stored
+				in the BIOS $PIR table. This is needed on
+				some systems with broken BIOSes, notably
+				some HP Pavilion N5400 and Omnibook XE3
+				notebooks. This will have no effect if ACPI
+				IRQ routing is enabled.
+		noacpi		[IA-32] Do not use ACPI for IRQ routing
+				or for PCI scanning.
+		routeirq	Do IRQ routing for all PCI devices.
+				This is normally done in pci_enable_device(),
+				so this option is a temporary workaround
+				for broken drivers that don't call it.
+		firmware	[ARM] Do not re-enumerate the bus but instead
+				just use the configuration from the
+				bootloader. This is currently used on
+				IXP2000 systems where the bus has to be
+				configured a certain way for adjunct CPUs.
+
+	pcmv=		[HW,PCMCIA] BadgePAD 4
+
+	pd.		[PARIDE]
+			See Documentation/paride.txt.
+
+	pdcchassis=	[PARISC,HW] Disable/Enable PDC Chassis Status codes at
+			boot time.
+			Format: { 0 | 1 }
+			See arch/parisc/kernel/pdc_chassis.c
+
+	pf.		[PARIDE]
+			See Documentation/paride.txt.
+
+	pg.		[PARIDE]
+			See Documentation/paride.txt.
+
+	pirq=		[SMP,APIC] Manual mp-table setup
+			See Documentation/i386/IO-APIC.txt.
+
+	plip=		[PPT,NET] Parallel port network link
+			Format: { parport<nr> | timid | 0 }
+			See also Documentation/parport.txt.
+
+	pnpacpi=	[ACPI]
+			{ off }
+
+	pnpbios=	[ISAPNP]
+			{ on | off | curr | res | no-curr | no-res }
+
+	pnp_reserve_irq=
+			[ISAPNP] Exclude IRQs for the autoconfiguration
+
+	pnp_reserve_dma=
+			[ISAPNP] Exclude DMAs for the autoconfiguration
+
+	pnp_reserve_io=	[ISAPNP] Exclude I/O ports for the autoconfiguration
+			Ranges are in pairs (I/O port base and size).
+
+	pnp_reserve_mem=
+			[ISAPNP] Exclude memory regions for the
+			autoconfiguration.
+			Ranges are in pairs (memory base and size).
+
+	profile=	[KNL] Enable kernel profiling via /proc/profile
+			Format: [schedule,]<number>
+			Param: "schedule" - profile schedule points.
+			Param: <number> - step/bucket size as a power of 2 for
+				statistical time based profiling.
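+			Example: profile=2 enables time-based profiling with
+			buckets of 2^2 = 4 bytes of kernel text each.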
+
+	processor.max_cstate=	[HW,ACPI]
+			Limit processor to maximum C-state
+			max_cstate=9 overrides any DMI blacklist limit.
+
+	processor.nocst	[HW,ACPI]
+			Ignore the _CST method to determine C-states,
+			instead using the legacy FADT method
+
+	prompt_ramdisk=	[RAM] List of RAM disks to prompt for floppy disk
+			before loading.
+			See Documentation/ramdisk.txt.
+
+	psmouse.proto=	[HW,MOUSE] Highest PS2 mouse protocol extension to
+			probe for; one of (bare|imps|exps|lifebook|any).
+	psmouse.rate=	[HW,MOUSE] Set desired mouse report rate, in reports
+			per second.
+	psmouse.resetafter=	[HW,MOUSE]
+			Try to reset the device after so many bad packets
+			(0 = never).
+	psmouse.resolution=
+			[HW,MOUSE] Set desired mouse resolution, in dpi.
+	psmouse.smartscroll=
+			[HW,MOUSE] Controls Logitech smartscroll autorepeat.
+			0 = disabled, 1 = enabled (default).
+
+	pss=		[HW,OSS] Personal Sound System (ECHO ESC614)
+			Format:
+			<io>,<mss_io>,<mss_irq>,<mss_dma>,<mpu_io>,<mpu_irq>
+
+	pt.		[PARIDE]
+			See Documentation/paride.txt.
+
+	quiet=		[KNL] Disable log messages
+
+	r128=		[HW,DRM]
+
+	raid=		[HW,RAID]
+			See Documentation/md.txt.
+
+	ramdisk=	[RAM] Sizes of RAM disks in kilobytes [deprecated]
+			See Documentation/ramdisk.txt.
+
+	ramdisk_blocksize=	[RAM]
+			See Documentation/ramdisk.txt.
+
+	ramdisk_size=	[RAM] Sizes of RAM disks in kilobytes
+			New name for the ramdisk parameter.
+			See Documentation/ramdisk.txt.
+
+	rdinit=		[KNL]
+			Format: <full_path>
+			Run specified binary instead of /init from the ramdisk,
+			used for early userspace startup. See initrd.
+
+	reboot=		[BUGS=IA-32,BUGS=ARM,BUGS=IA-64] Rebooting mode
+			Format: <reboot_mode>[,<reboot_mode2>[,...]]
+			See arch/*/kernel/reboot.c.
+
+	reserve=	[KNL,BUGS] Force the kernel to ignore some iomem area
+
+	resume=		[SWSUSP]
+			Specify the partition device for software suspend
+
+	rhash_entries=	[KNL,NET]
+			Set number of hash buckets for route cache
+
+	riscom8=	[HW,SERIAL]
+			Format: <io_board1>[,<io_board2>[,...<io_boardN>]]
+
+	ro		[KNL] Mount root device read-only on boot
+
+	root=		[KNL] Root filesystem
+
+	rootdelay=	[KNL] Delay (in seconds) to pause before attempting to
+			mount the root filesystem
+
+	rootflags=	[KNL] Set root filesystem mount option string
+
+	rootfstype=	[KNL] Set root filesystem type
+
+	rw		[KNL] Mount root device read-write on boot
+
+	S		[KNL] Run init in single mode
+
+	sa1100ir	[NET]
+			See drivers/net/irda/sa1100_ir.c.
+
+	sb=		[HW,OSS]
+			Format: <io>,<irq>,<dma>,<dma2>
+
+	sbni=		[NET] Granch SBNI12 leased line adapter
+
+	sbpcd=		[HW,CD] Soundblaster CD adapter
+			Format: <io>,<type>
+			See a comment before function sbpcd_setup() in
+			drivers/cdrom/sbpcd.c.
+
+	sc1200wdt=	[HW,WDT] SC1200 WDT (watchdog) driver
+			Format: <io>[,<timeout>[,<isapnp>]]
+
+	scsi_debug_*=	[SCSI]
+			See drivers/scsi/scsi_debug.c.
+
+	scsi_default_dev_flags=
+			[SCSI] SCSI default device flags
+			Format: <integer>
+
+	scsi_dev_flags=	[SCSI] Black/white list entry for vendor and model
+			Format: <vendor>:<model>:<flags>
+			(flags are integer value)
+
+	scsi_logging=	[SCSI]
+
+	selinux		[SELINUX] Disable or enable SELinux at boot time.
+			Format: { "0" | "1" }
+			See security/selinux/Kconfig help text.
+			0 -- disable.
+			1 -- enable.
+			Default value is set via kernel config option.
+			If enabled at boot time, /selinux/disable can be used
+			later to disable prior to initial policy load.
+
+	serialnumber	[BUGS=IA-32]
+
+	sg_def_reserved_size=	[SCSI]
+
+	sgalaxy=	[HW,OSS]
+			Format: <io>,<irq>,<dma>,<dma2>,<sgbase>
+
+	shapers=	[NET]
+			Maximal number of shapers.
+
+	sim710=		[SCSI,HW]
+			See header of drivers/scsi/sim710.c.
+
+	simeth=		[IA-64]
+	simscsi=
+
+	sjcd=		[HW,CD]
+			Format: <io>,<irq>,<dma>
+			See header of drivers/cdrom/sjcd.c.
+
+	slram=		[HW,MTD]
+
+	smart2=		[HW]
+			Format: <io1>[,<io2>[,...,<io8>]]
+
+	snd-ad1816a=	[HW,ALSA]
+
+	snd-ad1848=	[HW,ALSA]
+
+	snd-ali5451=	[HW,ALSA]
+
+	snd-als100=	[HW,ALSA]
+
+	snd-als4000=	[HW,ALSA]
+
+	snd-azt2320=	[HW,ALSA]
+
+	snd-cmi8330=	[HW,ALSA]
+
+	snd-cmipci=	[HW,ALSA]
+
+	snd-cs4231=	[HW,ALSA]
+
+	snd-cs4232=	[HW,ALSA]
+
+	snd-cs4236=	[HW,ALSA]
+
+	snd-cs4281=	[HW,ALSA]
+
+	snd-cs46xx=	[HW,ALSA]
+
+	snd-dt019x=	[HW,ALSA]
+
+	snd-dummy=	[HW,ALSA]
+
+	snd-emu10k1=	[HW,ALSA]
+
+	snd-ens1370=	[HW,ALSA]
+
+	snd-ens1371=	[HW,ALSA]
+
+	snd-es968=	[HW,ALSA]
+
+	snd-es1688=	[HW,ALSA]
+
+	snd-es18xx=	[HW,ALSA]
+
+	snd-es1938=	[HW,ALSA]
+
+	snd-es1968=	[HW,ALSA]
+
+	snd-fm801=	[HW,ALSA]
+
+	snd-gusclassic=	[HW,ALSA]
+
+	snd-gusextreme=	[HW,ALSA]
+
+	snd-gusmax=	[HW,ALSA]
+
+	snd-hdsp=	[HW,ALSA]
+
+	snd-ice1712=	[HW,ALSA]
+
+	snd-intel8x0=	[HW,ALSA]
+
+	snd-interwave=	[HW,ALSA]
+
+	snd-interwave-stb=
+			[HW,ALSA]
+
+	snd-korg1212=	[HW,ALSA]
+
+	snd-maestro3=	[HW,ALSA]
+
+	snd-mpu401=	[HW,ALSA]
+
+	snd-mtpav=	[HW,ALSA]
+
+	snd-nm256=	[HW,ALSA]
+
+	snd-opl3sa2=	[HW,ALSA]
+
+	snd-opti92x-ad1848=
+			[HW,ALSA]
+
+	snd-opti92x-cs4231=
+			[HW,ALSA]
+
+	snd-opti93x=	[HW,ALSA]
+
+	snd-pmac=	[HW,ALSA]
+
+	snd-rme32=	[HW,ALSA]
+
+	snd-rme96=	[HW,ALSA]
+
+	snd-rme9652=	[HW,ALSA]
+
+	snd-sb8=	[HW,ALSA]
+
+	snd-sb16=	[HW,ALSA]
+
+	snd-sbawe=	[HW,ALSA]
+
+	snd-serial=	[HW,ALSA]
+
+	snd-sgalaxy=	[HW,ALSA]
+
+	snd-sonicvibes=	[HW,ALSA]
+
+	snd-sun-amd7930=
+			[HW,ALSA]
+
+	snd-sun-cs4231=	[HW,ALSA]
+
+	snd-trident=	[HW,ALSA]
+
+	snd-usb-audio=	[HW,ALSA,USB]
+
+	snd-via82xx=	[HW,ALSA]
+
+	snd-virmidi=	[HW,ALSA]
+
+	snd-wavefront=	[HW,ALSA]
+
+	snd-ymfpci=	[HW,ALSA]
+
+	sonicvibes=	[HW,OSS]
+			Format: <reverb>
+
+	sonycd535=	[HW,CD]
+			Format: <io>[,<irq>]
+
+	sonypi.*=	[HW] Sony Programmable I/O Control Device driver
+			See Documentation/sonypi.txt
+
+	specialix=	[HW,SERIAL] Specialix multi-serial port adapter
+			See Documentation/specialix.txt.
+
+	spia_io_base=	[HW,MTD]
+	spia_fio_base=
+	spia_pedr=
+	spia_peddr=
+
+	sscape=		[HW,OSS]
+			Format: <io>,<irq>,<dma>,<mpu_io>,<mpu_irq>
+
+	st=		[HW,SCSI] SCSI tape parameters (buffers, etc.)
+			See Documentation/scsi/st.txt.
+
+	st0x=		[HW,SCSI]
+			See header of drivers/scsi/seagate.c.
+
+	sti=		[PARISC,HW]
+			Format: <num>
+			Set the STI (builtin display/keyboard on the HP-PARISC
+			machines) console (graphic card) which should be used
+			as the initial boot-console.
+			See also comment in drivers/video/console/sticore.c.
+
+	sti_font=	[HW]
+			See comment in drivers/video/console/sticore.c.
+
+	stifb=		[HW]
+			Format: bpp:<bpp1>[:<bpp2>[:<bpp3>...]]
+
+	swiotlb=	[IA-64] Number of I/O TLB slabs
+
+	switches=	[HW,M68k]
+
+	sym53c416=	[HW,SCSI]
+			See header of drivers/scsi/sym53c416.c.
+
+	t128=		[HW,SCSI]
+			See header of drivers/scsi/t128.c.
+
+	tdfx=		[HW,DRM]
+
+	thash_entries=	[KNL,NET]
+			Set number of hash buckets for TCP connection
+
+	time		Show timing data prefixed to each printk message line
+
+	clocksource=	[GENERIC_TIME] Override the default clocksource
+			Override the default clocksource and use the clocksource
+			with the name specified.
+
+	tipar.timeout=	[HW,PPT]
+			Set communications timeout in tenths of a second
+			(default 15).
+
+	tipar.delay=	[HW,PPT]
+			Set inter-bit delay in microseconds (default 10).
+
+	tmc8xx=		[HW,SCSI]
+			See header of drivers/scsi/seagate.c.
+
+	tmscsim=	[HW,SCSI]
+			See comment before function dc390_setup() in
+			drivers/scsi/tmscsim.c.
+
+	tp720=		[HW,PS2]
+
+	trix=		[HW,OSS] MediaTrix AudioTrix Pro
+			Format:
+			<io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq>
+
+	tsdev.xres=	[TS] Horizontal screen resolution.
+	tsdev.yres=	[TS] Vertical screen resolution.
+
+	turbografx.map[2|3]=	[HW,JOY]
+			TurboGraFX parallel port interface
+			Format:
+			<port#>,<js1>,<js2>,<js3>,<js4>,<js5>,<js6>,<js7>
+			See also Documentation/input/joystick-parport.txt
+
+	u14-34f=	[HW,SCSI] UltraStor 14F/34F SCSI host adapter
+			See header of drivers/scsi/u14-34f.c.
+
+	uart401=	[HW,OSS]
+			Format: <io>,<irq>
+
+	uart6850=	[HW,OSS]
+			Format: <io>,<irq>
+
+	usbhid.mousepoll=
+			[USBHID] The interval at which mice are to be polled.
+
+	video=		[FB] Frame buffer configuration
+			See Documentation/fb/modedb.txt.
+
+	vga=		[BOOT,IA-32] Select a particular video mode
+			See Documentation/i386/boot.txt and
+			Documentation/svga.txt.
+			Use vga=ask for menu.
+			This is actually a boot loader parameter; the value is
+			passed to the kernel using a special protocol.
+
+	vmalloc=nn[KMG]	[KNL,BOOT] Forces the vmalloc area to have an exact
+			size of <nn>. This can be used to increase the
+			minimum size (128MB on x86). It can also be used to
+			decrease the size and leave more room for directly
+			mapped kernel RAM.
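+			Example: vmalloc=256M sets a 256MB vmalloc area.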
+
+	vmhalt=		[KNL,S390]
+
+	vmpoff=		[KNL,S390]
+
+	waveartist=	[HW,OSS]
+			Format: <io>,<irq>,<dma>,<dma2>
+
+	wd33c93=	[HW,SCSI]
+			See header of drivers/scsi/wd33c93.c.
+
+	wd7000=		[HW,SCSI]
+			See header of drivers/scsi/wd7000.c.
+
+	wdt=		[WDT] Watchdog
+			See Documentation/watchdog/watchdog.txt.
+
+	xd=		[HW,XT] Original XT pre-IDE (RLL encoded) disks.
+	xd_geo=		See header of drivers/block/xd.c.
+
+	xirc2ps_cs=	[NET,PCMCIA]
+			Format:
+			<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
+
+	norandmaps	Don't use address space randomization
+			Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space
+
+
+______________________________________________________________________
+Changelog:
+
+2000-06-??	Mr. Unknown
+	The last known update (for 2.4.0) - the changelog was not kept before.
+
+2002-11-24	Petr Baudis <pasky@ucw.cz>
+		Randy Dunlap <randy.dunlap@verizon.net>
+	Update for 2.5.49, description for most of the options introduced,
+	references to other documentation (C files, READMEs, ..), added S390,
+	PPC, SPARC, MTD, ALSA and OSS category. Minor corrections and
+	reformatting.
+
+2005-10-19	Randy Dunlap <rdunlap@xenotime.net>
+	Lots of typos, whitespace, some reformatting.
+
+TODO:
+
+	Add documentation for ALSA options.
+	Add more DRM drivers.
diff -urN oldtree/Documentation/power/internals.txt newtree/Documentation/power/internals.txt
--- oldtree/Documentation/power/internals.txt	1970-01-01 00:00:00.000000000 +0000
+++ newtree/Documentation/power/internals.txt	2006-03-08 15:22:33.005490000 +0000
@@ -0,0 +1,360 @@
+		Software Suspend 2.2 Internal Documentation.
+				Version 1
+
+1.  Introduction.
+
+    Software Suspend 2.2 is an addition to the Linux Kernel, designed to
+    allow the user to quickly shutdown and quickly boot a computer, without
+    needing to close documents or programs. It is equivalent to the
+    hibernate facility in some laptops. This implementation, however,
+    requires no special BIOS or hardware support.
+
+    The code in these files is based upon the original implementation
+    prepared by Gabor Kuti and additional work by Pavel Machek and a
+    host of others. This code has been substantially reworked by Nigel
+    Cunningham, again with the help and testing of many others, not the
+    least of whom is Michael Frank. At its heart, however, the operation is
+    essentially the same as Gabor's version.
+
+2.  Overview of operation.
+
+    The basic sequence of operations is as follows:
+
+	a. Quiesce all other activity.
+	b. Ensure enough memory and storage space are available, and attempt
+	   to free memory/storage if necessary.
+	c. Allocate the required memory and storage space.
+	d. Write the image.
+	e. Power down.
+
+    There are a number of complicating factors which mean that things are
+    not as simple as the above would imply, however...
+
+    o The activity of each process must be stopped at a point where it will
+    not be holding locks necessary for saving the image, or unexpectedly
+    restart operations due to something like a timeout and thereby make
+    our image inconsistent.
+
+    o It is desirable that we sync outstanding I/O to disk before calculating
+    image statistics. This reduces corruption if one should suspend but
+    then not resume, and also makes later parts of the operation safer (see
+    below).
+
+    o We need to get as close as we can to an atomic copy of the data.
+    Inconsistencies in the image will result in inconsistent memory contents
+    at resume time, and thus in instability of the system and/or file system
+    corruption. This would appear to imply a maximum image size of one half of
+    the amount of RAM, but we have a solution... (again, below).
+
+    o In 2.6, we must play nicely with the other suspend-to-disk
+    implementations.
+
+3.  Detailed description of internals.
+
+    a. Quiescing activity.
+
+    Safely quiescing the system is achieved using two methods.
+
+    First, we note that the vast majority of processes don't need to run during
+    suspend. They can be 'frozen'. We therefore implement a refrigerator
+    routine, which processes enter and in which they remain until the cycle is
+    complete. In the vanilla kernel, processes enter the refrigerator via
+    try_to_freeze() invocations at appropriate places.  A process cannot be
+    frozen in any old place. It must not be holding locks that will be needed
+    for writing the image or freezing other processes. For this reason,
+    userspace processes generally enter the refrigerator via the signal handling
+    code, and kernel threads at the place in their event loops where they drop
+    locks and yield to other processes or sleep.
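+
+    As an illustrative sketch (hypothetical code, not the exact kernel
+    source), a kernel thread's event loop typically takes part in freezing
+    like this:
+
+	while (!kthread_should_stop()) {
+		do_work();		/* no locks held at this point */
+		try_to_freeze();	/* enter the refrigerator if asked */
+		schedule_timeout_interruptible(HZ);
+	}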
+
+    In this revision of Suspend2, Christoph Lameter's todo list concept is
+    utilised to do the freezing. This means that we replace direct invocation of
+    the refrigerator function with a notifier list implementation, allowing
+    other applications of the hooks.
+
+    The second part of our method for quiescing the system involves freezing
+    the filesystems. We use the standard freeze_bdev and thaw_bdev functions to
+    ensure that all of the user's data is synced to disk before we begin to
+    write the image.
+
+    Quiescing the system works most quickly and reliably when we add one more
+    element to the algorithm: separating the freezing of userspace processes
+    from the freezing of kernel space processes, and doing the filesystem freeze
+    in between. The filesystem freeze needs to be done while kernel threads such
+    as kjournald can still run. At the same time, though, everything will be
+    less racy and run more quickly if we stop userspace submitting more I/O
+    work while we're trying to quiesce.
+
+    Quiescing the system is therefore done in three steps:
+	- Freeze userspace
+	- Freeze filesystems
+	- Freeze kernel threads
+
+    If we need to free memory, we thaw kernel threads and filesystems, but not
+    userspace. We can then free caches without worrying about deadlocks due to
+    swap files being on frozen filesystems, or the like.
+
+    b. Ensure enough memory & storage are available.
+
+    We have a number of constraints to meet to be able to successfully suspend
+    and resume.
+
+    First, the image will be written in two parts, described below. One of these
+    parts needs to have an atomic copy made, which of course implies a maximum
+    size of one half of the amount of system memory. The other part ('pageset')
+    is not atomically copied, and can therefore be as large or small as desired.
+
+    Second, we have constraints on the amount of storage available. In these
+    calculations, we may also consider any compression that will be done. The
+    cryptoapi plugin allows the user to configure an expected compression ratio.
+   
+    Third, the user can specify an arbitrary limit on the image size, in
+    megabytes. This limit is treated as a soft limit, so that we don't fail the
+    attempt to suspend if we cannot meet this constraint.
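+
+    To illustrate: on a 512MB machine, the atomically copied part must fit
+    in at most 256MB. If that part occupies 150MB and the page cache a
+    further 300MB, the image totals 450MB; with an expected compression
+    ratio of 50%, roughly 225MB of storage would need to be available.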
+
+    c. Allocate the required memory and storage space.
+
+    Having done the initial freeze, we determine whether the above constraints
+    are met, and seek to allocate the metadata for the image. If the constraints
+    are not met, or we fail to allocate the required space for the metadata, we
+    seek to free the amount of memory that we calculate is needed and try again.
+    We allow up to four iterations of this loop before aborting the cycle. If we
+    do fail, it should only be because of a bug in Suspend's calculations.
+    
+    These steps are merged together in the prepare_image function, found in
+    prepare_image.c. The functions are merged because of the cyclical nature
+    of the problem of calculating how much memory and storage is needed. Since
+    the data structures containing the information about the image must
+    themselves take memory and use storage, the amount of memory and storage
+    required changes as we prepare the image. Since the changes are not large,
+    only one or two iterations will be required to achieve a solution.
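+
+    In pseudocode, the loop is roughly the following (a simplification;
+    the helper names are illustrative, not the real symbols):
+
+	for (i = 0; i < 4; i++) {
+		if (constraints_met() && !allocate_image_metadata())
+			break;		/* ready to write the image */
+		free_estimated_shortfall();
+	}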
+
+    d. Write the image.
+
+    We previously mentioned the need to create an atomic copy of the data, and
+    the half-of-memory limitation that is implied in this. This limitation is
+    circumvented by dividing the memory to be saved into two parts, called
+    pagesets.
+
+    Pageset2 contains the page cache - the pages on the active and inactive
+    lists. These pages are saved first and reloaded last. While saving these
+    pages, the swapwriter plugin carefully ensures that the work of writing
+    the pages doesn't make the image inconsistent. Pages added to the LRU
+    lists are immediately shot down, and careful accounting for available
+    memory aids debugging. No atomic copy of these pages needs to be made.
+
+    Writing the image requires memory, of course, and at this point we have
+    also not yet suspended the drivers. To avoid the possibility of remaining
+    activity corrupting the image, we allocate a special memory pool. Calls
+    to __alloc_pages and __free_pages_ok are then diverted to use our memory
+    pool. Pages in the memory pool are saved as part of pageset1 regardless of
+    whether or not they are used.
+
+    Once pageset2 has been saved, we suspend the drivers and save the CPU
+    context before making an atomic copy of pageset1, resuming the drivers
+    and saving the atomic copy. After saving the two pagesets, we just need to
+    save our metadata before powering down.
+
+    Having saved pageset2 pages, we can safely overwrite their contents with
+    the atomic copy of pageset1. This is how we manage to overcome the half of
+    memory limitation. Pageset2 is normally far larger than pageset1, and
+    pageset1 is normally much smaller than half of the memory, with the result
+    that pageset2 pages can be safely overwritten with the atomic copy of
+    pageset1. This is where we need to be careful about syncing, however.
+    Pageset2 will probably contain filesystem meta data. If this is overwritten
+    with pageset1 and then a sync occurs, the filesystem will be corrupted -
+    at least until resume time and another sync of the restored data. Since
+    there is a possibility that the user might not resume or (may it never be!)
+    that suspend might oops, we do our utmost to avoid syncing filesystems after
+    copying pageset1.
+
+    e. Power down.
+
+    Powering down uses standard kernel routines. Prior to this, however, we
+    suspend drivers again, ensuring that write caches are flushed.
+
+4.  The method of writing the image.
+
+    Suspend2 contains an internal API which is designed to simplify the
+    implementation of new methods of transforming the image to be written and
+    writing the image itself. In early versions of Suspend2, compression support
+    was inlined in the image writing code, and the data structures and code for
+    managing swap were intertwined with the rest of the code. A number of people
+    had expressed interest in implementing image encryption, and alternative
+    methods of storing the image. This internal API makes that possible by
+    implementing 'plugins'.
+
+    A plugin is a single file which encapsulates the functionality needed
+    to transform a pageset of data (encryption or compression, for example),
+    or to write the pageset to a device. The former type of plugin is called
+    a 'page-transformer', the latter a 'writer'.
+
+    Plugins are linked together in pipeline fashion. There may be zero or more
+    page transformers in a pipeline, and there is always exactly one writer.
+    The pipeline follows this pattern:
+
+		---------------------------------
+		|          Suspend2 Core        |
+		---------------------------------
+				|
+				|
+		---------------------------------
+		|	Page transformer 1	|
+		---------------------------------
+				|
+				|
+		---------------------------------
+		|	Page transformer 2	|
+		---------------------------------
+				|
+				|
+		---------------------------------
+		|            Writer		|
+		---------------------------------
+
+    During the writing of an image, the core code feeds pages one at a time
+    to the first plugin. This plugin performs whatever transformations it
+    implements on the incoming data, completely consuming the incoming data and
+    feeding output in a similar manner to the next plugin. A plugin may buffer
+    its output.
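+
+    In sketch form, then, the core drives the write side of the pipeline
+    like this (illustrative only; the operations are defined below):
+
+	first->write_init(stream);
+	while ((page = next_pageset_page()) != NULL)
+		first->write_chunk(page_address(page));
+	first->write_cleanup();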
+
+    During reading, the pipeline works in the reverse direction. The core code
+    calls the first plugin with the address of a buffer which should be filled.
+    (Note that the buffer size is always PAGE_SIZE at this time). This plugin
+    will in turn request data from the next plugin and so on down until the
+    writer is made to read from the stored image.
+
+    Part of the definition of the structure of a plugin thus looks like this:
+
+	/* Writing the image proper */
+	int (*write_init) (int stream_number);
+	int (*write_chunk) (char *buffer_start);
+	int (*write_cleanup) (void);
+
+	/* Reading the image proper */
+	int (*read_init) (int stream_number);
+	int (*read_chunk) (char *buffer_start, int sync);
+	int (*read_cleanup) (void);
+
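+    On the read side, the same operations are driven in reverse, roughly
+    as follows (a sketch; the exact meaning of the 'sync' argument is an
+    assumption here):
+
+	first->read_init(stream);
+	while (more_pages_to_read())
+		first->read_chunk(buffer, 1);	/* fill buffer synchronously */
+	first->read_cleanup();
+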
+    It should be noted that the _cleanup routines may be called before the
+    full stream of data has been read or written. While writing the image,
+    the user may (depending upon settings) choose to abort suspending, and
+    if we are in the midst of writing the last portion of the image, a portion
+    of the second pageset may be reread.
+
+    In addition to the above routines for writing the data, all plugins have a
+    number of other routines:
+
+    TYPE indicates whether the plugin is a page transformer or a writer.
+    #define TRANSFORMER_PLUGIN 1
+    #define WRITER_PLUGIN 2
+
+    NAME is the name of the plugin, used in generic messages.
+
+    PLUGIN_LIST is used to link the plugin into the list of all plugins.
+
+    MEMORY_NEEDED returns the number of pages of memory required by the plugin
+    to do its work.
+
+    STORAGE_NEEDED returns the number of pages in the suspend header required
+    to store the plugin's configuration data.
+
+    PRINT_DEBUG_INFO fills a buffer with information to be displayed about the
+    operation or settings of the plugin.
+
+    SAVE_CONFIG_INFO returns a buffer of PAGE_SIZE or smaller (the size is the
+    return code), containing the plugin's configuration info. This information
+    will be written in the image header and restored at resume time. Since this
+    buffer is allocated after the atomic copy of the kernel is made, you don't
+    need to worry about the buffer being freed.
+
+    LOAD_CONFIG_INFO gives the plugin a pointer to the configuration info
+    which was saved during suspending. Once again, the plugin doesn't need to
+    worry about freeing the buffer. The kernel memory will be overwritten with
+    the original kernel at resume, so no memory leak will occur.
+
+    OPS contains the operations specific to transformers and writers. These are
+    described below.
+
+    The complete definition of struct suspend_plugin_ops is:
+
+	struct suspend_plugin_ops {
+		/* Functions common to transformers and writers */
+		int type;
+		char *name;
+		struct list_head plugin_list;
+		unsigned long (*memory_needed) (void);
+		unsigned long (*storage_needed) (void);
+		int (*print_debug_info) (char *buffer, int size);
+		int (*save_config_info) (char *buffer);
+		void (*load_config_info) (char *buffer, int len);
+	
+		/* Writing the image proper */
+		int (*write_init) (int stream_number);
+		int (*write_chunk) (char *buffer_start);
+		int (*write_cleanup) (void);
+
+		/* Reading the image proper */
+		int (*read_init) (int stream_number);
+		int (*read_chunk) (char *buffer_start, int sync);
+		int (*read_cleanup) (void);
+
+		union {
+			struct suspend_transformer_ops transformer;
+			struct suspend_writer_ops writer;
+		} ops;
+	};
+
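+	As an illustration, a minimal do-nothing transformer might declare
+	its operations along these lines (all names here are hypothetical):
+
+	static struct suspend_plugin_ops null_plugin_ops = {
+		.type		= TRANSFORMER_PLUGIN,
+		.name		= "null transformer",
+		.memory_needed	= null_memory_needed,
+		.storage_needed	= null_storage_needed,
+		.write_init	= null_write_init,
+		.write_chunk	= null_write_chunk,
+		.write_cleanup	= null_write_cleanup,
+		.read_init	= null_read_init,
+		.read_chunk	= null_read_chunk,
+		.read_cleanup	= null_read_cleanup,
+	};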
+
+	The operations specific to transformers are few in number:
+
+	struct suspend_transformer_ops {
+		int (*expected_compression) (void);
+		struct list_head transformer_list;
+	};
+
+	Expected compression returns the expected ratio between the amount of
+	data sent to this plugin and the amount of data it passes to the next
+	plugin. The value is used by the core code to calculate the amount of
+	space required to write the image. If the ratio is not achieved, the
+	writer will complain when it runs out of space with data still to
+	write, and the core code will abort the suspend.
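+
+	For example, if a compressor typically halves its input, its
+	expected_compression would indicate a 2:1 ratio, and the core would
+	then plan on a 400MB image requiring only about 200MB of storage.
+	(The exact encoding of the ratio is determined by the core code; the
+	figures here are purely illustrative.)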
+
+	transformer_list links together page transformers, in the order in
+	which they register, which is in turn determined by order in the
+	Makefile.
+	
+	There are many more operations specific to a writer:
+
+	struct suspend_writer_ops {
+
+		long (*storage_available) (void);
+	
+		unsigned long (*storage_allocated) (void);
+		
+		int (*release_storage) (void);
+
+		long (*allocate_header_space) (unsigned long space_requested);
+		int (*allocate_storage) (unsigned long space_requested);
+
+		int (*write_header_init) (void);
+		int (*write_header_chunk) (char *buffer_start, int buffer_size);
+		int (*write_header_cleanup) (void);
+
+		int (*read_header_init) (void);
+		int (*read_header_chunk) (char *buffer_start, int buffer_size);
+		int (*read_header_cleanup) (void);
+
+		int (*prepare_save) (void);
+		int (*post_load) (void);
+
+		int (*parse_image_location) (char *buffer);
+
+		int (*image_exists) (void);
+
+		int (*invalidate_image) (void);
+
+		int (*wait_on_io) (int flush_all);
+
+		struct list_head writer_list;
+	};
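+
+	During image preparation, the core typically drives a writer through
+	these operations in roughly the following way (a sketch only;
+	eat_memory() is an illustrative name and error handling is omitted):
+
+	while (writer->storage_available() < pages_needed)
+		eat_memory();		/* free page cache and the like */
+	writer->allocate_header_space(header_pages);
+	writer->allocate_storage(pages_needed);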
+
diff -urN oldtree/Documentation/power/kernel_threads.txt newtree/Documentation/power/kernel_threads.txt
--- oldtree/Documentation/power/kernel_threads.txt	2006-01-03 03:21:10.000000000 +0000
+++ newtree/Documentation/power/kernel_threads.txt	2006-03-08 15:22:33.005490000 +0000
@@ -4,15 +4,15 @@
 Freezer
 
 Upon entering a suspended state the system will freeze all
-tasks. This is done by delivering pseudosignals. This affects
-kernel threads, too. To successfully freeze a kernel thread
-the thread has to check for the pseudosignal and enter the
-refrigerator. Code to do this looks like this:
+tasks. This is done by making all processes execute a notifier (todo)
+list. This affects kernel threads, too. To successfully freeze a kernel
+thread, the thread has to check for pending notifications and run the
+process's todo list. Code to do this looks like this:
 
 	do {
 		hub_events();
 		wait_event_interruptible(khubd_wait, !list_empty(&hub_event_list));
-		try_to_freeze();
+		try_todo_list();
 	} while (!signal_pending(current));
 
 from drivers/usb/core/hub.c::hub_thread()
diff -urN oldtree/Documentation/power/suspend2.txt newtree/Documentation/power/suspend2.txt
--- oldtree/Documentation/power/suspend2.txt	1970-01-01 00:00:00.000000000 +0000
+++ newtree/Documentation/power/suspend2.txt	2006-03-08 15:22:33.009490250 +0000
@@ -0,0 +1,631 @@
+	--- Suspend2, version 2.1.9 ---
+
+1.  What is it?
+2.  Why would you want it?
+3.  What do you need to use it?
+4.  How do you use it?
+5.  What do all those entries in /proc/suspend2 do?
+6.  How do you get support?
+7.  I think I've found a bug. What should I do?
+8.  When will XXX be supported?
+9.  How does it work?
+10. Who wrote Suspend2?
+
+1. What is it?
+
+   Imagine you're sitting at your computer, working away. For some reason, you
+   need to turn off your computer for a while - perhaps it's time to go home
+   for the day. When you come back to your computer next, you're going to want
+   to carry on where you left off. Now imagine that you could push a button and
+   have your computer store the contents of its memory to disk and power down.
+   Then, when you next start up your computer, it loads that image back into
+   memory and you can carry on from where you were, just as if you'd never
+   turned the computer off. Far less time to start up, no reopening
+   applications and finding what directory you put that file in yesterday.
+   That's what Suspend2 does.
+
+2. Why would you want it?
+
+   Why wouldn't you want it?
+   
+   Being able to save the state of your system and quickly restore it improves
+   your productivity - you get a useful system in far less time than through
+   the normal boot process.
+   
+3. What do you need to use it?
+
+   a. Kernel Support.
+
+   i) The Suspend2 patch.
+   
+   Suspend2 is part of the Linux Kernel. This version is not part of Linus's
+   2.6 tree at the moment, so you will need to download the kernel source and
+   apply the latest patch. Having done that, enable the appropriate options in
+   make [menu|x]config (under General Setup), compile and install your kernel.
+   Suspend2 works with SMP, Highmem, preemption, x86-32, PPC and mac.
+   x86-64 support is coming.
+
+   Suspend2 patches are available from http://suspend2.net.
+
+   ii) Compression and encryption support.
+
+   As of 2.1.9.2, compression and encryption support are implemented via the
+   cryptoapi. You will therefore want to select any Cryptoapi transforms that
+   you want to use on your image from the Cryptoapi menu while configuring
+   your kernel.
+
+   You can also tell Suspend2 to write its image to an encrypted and/or
+   compressed filesystem/swap partition. In that case, you don't need to do
+   anything special for Suspend2 when it comes to kernel configuration.
+
+   iii) Configuring other options.
+
+   While you're configuring your kernel, try to configure as much as possible
+   to build as modules. We recommend this because there are a number of drivers
+   that are still in the process of implementing proper power management
+   support. In those cases, the best way to work around their current lack is
+   to build them as modules and remove the modules while suspending. You might
+   also bug the driver authors to get their support up to speed, or even help!
+
+   b. Storage.
+
+   i) Swap.
+
+   Suspend2 can store the suspend image in your swap partition, a swap file or
+   a combination thereof. Whichever combination you choose, you will probably
+   want to create enough swap space to store the largest image you could have,
+   plus the space you'd normally use for swap. A good rule of thumb would be
+   to calculate the amount of swap you'd want without using Suspend2, and then
+   add the amount of memory you have. This swapspace can be arranged in any way
+   you'd like. It can be in one partition or file, or spread over a number. The
+   only requirement is that they be active when you start a suspend cycle.
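+
+   For example, a machine with 512MB of RAM that normally uses up to 256MB
+   of swap would want roughly 768MB of swap space in total.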
+   
+   There is one exception to the requirement that all of this swap space be
+   active before you suspend: Suspend2 has the ability to turn on one swap
+   file or partition at the start of suspending and turn it back off at the
+   end. If you want to ensure you have enough storage to hold an image when
+   your memory is fully used, you might want to make one swap partition or
+   file for 'normal' use, and another for Suspend2 to activate & deactivate
+   automatically. (Further details below.)
+
+   ii) Normal files.
+
+   As of 2.1.8.5, Suspend2 includes a 'filewriter'. The filewriter can store
+   your image in a simple file. Since Linux has the idea of everything being
+   a file, this is more powerful than it initially sounds. If, for example,
+   you were to set up a network block device file, you could suspend to a
+   network server. This has been tested and works to a point, but nbd itself
+   isn't stateless enough for our purposes.
+
+   Take extra care when setting up the filewriter. If you just type commands
+   without thinking and then try to suspend, you could cause irreversible
+   corruption on your filesystems! Make sure you have backups. Also, because
+   the filewriter is comparatively new, it's not as well tested as the
+   swapwriter. Be aware that there may be bugs that could cause damage to your
+   data even if you are careful! You have been warned!
+
+   Most people will only want to suspend to a local file. To achieve that, do
+   something along the lines of:
+
+   echo Suspend2 > /suspend-file
+   dd if=/dev/zero bs=1M count=512 >> /suspend-file
+
+   This will create a 512MB file called /suspend-file. To get Suspend2 to use
+   it:
+
+   echo /suspend-file > /proc/suspend2/filewriter_target
+
+   Then
+
+   cat /proc/suspend2/resume2
+
+   Put the results of this into your bootloader's configuration (see also step
+   c, below):
+
+   ---EXAMPLE-ONLY-DON'T-COPY-AND-PASTE---
+   # cat /proc/suspend2/resume2
+   file:/dev/hda2:0x1e001
+   
+   In this example, we would edit the append= line of our lilo.conf|menu.lst
+   so that it included:
+
+   resume2=file:/dev/hda2:0x1e001
+   ---EXAMPLE-ONLY-DON'T-COPY-AND-PASTE---
+ 
+   For those who are thinking 'Could I make the file sparse?', the answer is
+   'No!'. At the moment, there is no way for Suspend2 to fill in the holes in
+   a sparse file while suspending. In the longer term (post merge!), I'd like
+   to change things so that the file could be dynamically resized as needed.
+   Right now, however, that's not possible.
+
+   c. Bootloader configuration.
+   
+   Using Suspend2 also requires that you add an extra parameter to 
+   your lilo.conf or equivalent. Here's an example for a swap partition:
+
+   append="resume2=swap:/dev/hda1"
+
+   This would tell Suspend2 that /dev/hda1 is a swap partition you 
+   have. Suspend2 will use the swap signature of this partition as a
+   pointer to your data when you suspend. This means that (in this example)
+   /dev/hda1 doesn't need to be _the_ swap partition where all of your data
+   is actually stored. It just needs to be a swap partition that has a
+   valid signature.
+
+   You don't need to have a swap partition for this purpose. Suspend2
+   can also use a swap file, but usage is a little more complex. Having made
+   your swap file, turn it on and do 
+
+   cat /proc/suspend2/headerlocations
+
+   (this assumes you've already compiled your kernel with Suspend2
+   support and booted it). The results of the cat command will tell you
+   what you need to put in lilo.conf:
+
+   For swap partitions like /dev/hda1, simply use resume2=swap:/dev/hda1.
+   For a swapfile, use the value printed by headerlocations, something like
+   resume2=swap:/dev/hda2:0x242d@4096.
+
+   If the swapfile changes for any reason (it is moved to a different
+   location, it is deleted and recreated, or the filesystem is
+   defragmented) then you will have to check
+   /proc/suspend2/headerlocations for a new resume2= value.
+
+   Once you've compiled and installed the kernel, adjusted your lilo.conf
+   and rerun lilo, you should only need to reboot for the most basic part
+   of Suspend2 to be ready.
+
+   If you only compile in the swapwriter, or only compile in the filewriter,
+   you don't need to add the "swap:" part of the resume2= parameters above.
+   resume2=/dev/hda2:0x242d@4096 will work just as well.
+
+   d. The hibernate script.
+
+   Since the driver model in 2.6 kernels is still being developed, you may need
+   to do more, however. Users of Suspend2 usually start the process via a script
+   which prepares for the suspend, tells the kernel to do its stuff and then
+   restores things afterwards. This script might involve:
+
+   - Switching to a text console and back if X doesn't like the video card
+     status on resume.
+   - Un/reloading PCMCIA support since it doesn't play well with suspend.
+  
+   Note that you might not be able to unload some drivers if there are
+   processes using them. You might have to kill off processes that hold
+   devices open. Hint: if your X server accesses a USB mouse, doing a
+   'chvt' to a text console releases the device and you can unload the
+   module.
+
+   Check out the latest script (available on suspend2.net).
+   
+4. How do you use it?
+
+   Once your script is properly set up, you should just be able to start it
+   and everything should go like clockwork. Of course things aren't always
+   that easy out of the box.
+
+   Check out (in the kernel source tree) include/linux/suspend2.h for
+   settings you can use to get detailed information about what suspend is doing.
+   The kernel parameters suspend_act, suspend_dbg and suspend_lvl allow you to
+   set the action and debugging parameters prior to starting a suspend and/or
+   at the lilo prompt before resuming. There is also a nice little program that
+   should be available from suspend2.net which makes it easier to turn these
+   debugging settings on and off. Note that to get any debugging output, you
+   need to enable CONFIG_PM_DEBUG when compiling the kernel.
+
+   A neat feature of Suspend2 is that you can press Escape at any time
+   during suspending, and the process will be aborted.
+   
+   Due to the way suspend works, this means you'll have your system back and
+   perfectly usable almost instantly. The only exception is when it's at
+   the very end of writing the image. Then it will need to reload a small
+   (usually 4-50MBs, depending upon the image characteristics) portion first.
+
+   If you run into problems with resuming, adding the "noresume2" option to
+   the kernel command line will let you skip the resume step and recover your
+   system.
+
+5. What do all those entries in /proc/suspend2 do?
+
+   /proc/suspend2 is the directory which contains files you can use to
+   tune and configure Suspend2 to your liking. The exact contents of
+   the directory will depend upon the version of Suspend2 you're
+   running and the options you selected at compile time. In the following
+   descriptions, names in brackets refer to compile time options.
+   (Note that they're all dependent upon you having selected CONFIG_SUSPEND2
+   in the first place!)
+
+   Since the values of these settings can open potential security risks, they
+   are usually accessible only to the root user. You can, however, enable a
+   compile time option which makes all of these files world-accessible. This
+   should only be done if you trust everyone with shell access to this
+   computer!
+  
+   - all_settings:
+
+   This file provides a convenient way to save and restore all of the other
+   settings in one hit. The contents include binary data, so you'll want to
+   redirect the output to a file:
+
+   cat /proc/suspend2/all_settings > /etc/hibernate/all_settings.conf
+
+   cat /etc/hibernate/all_settings.conf > /proc/suspend2/all_settings
+
+   - debug_info:
+  
+   This file returns information about your configuration that may be helpful
+   in diagnosing problems with suspending.
+
+   - debug_sections (CONFIG_PM_DEBUG):
+
+   This value, together with the console log level, controls what debugging
+   information is displayed. The console log level determines the level of
+   detail, and this value determines what detail is displayed. This value is
+   a bit vector, and the meaning of the bits can be found in the kernel tree
+   in include/linux/suspend2.h. It can be overridden using the kernel's
+   command line option suspend_dbg.
+
+   - default_console_level (CONFIG_PM_DEBUG):
+
+   This determines the value of the console log level at the start of a
+   suspend cycle. If debugging is compiled in, the console log level can be
+   changed during a cycle by pressing the digit keys. Meanings are:
+
+   0: Nice display.
+   1: Nice display plus numerical progress.
+   2: Errors only.
+   3: Low level debugging info.
+   4: Medium level debugging info.
+   5: High level debugging info.
+   6: Verbose debugging info.
+
+   This value can be overridden using the kernel command line option 
+   suspend_lvl.
+
+   - disable_*
+
+   These entries can be used to temporarily disable various parts of suspend.
+   Note that these flags can be set by restoring all_settings: If the saved
+   settings don't include any information about how a part of suspend should
+   be configured, that section will be disabled.
+
+   - do_resume:
+
+   When anything is written to this file, Suspend2 will attempt to read and
+   restore an image. If there is no image, it will return almost immediately.
+   If an image exists, the write will never return. Instead, the original
+   kernel context will be restored and the original write to do_suspend will
+   return.
+
+   - do_suspend:
+
+   When anything is written to this file, the kernel side of Suspend2 will
+   begin to attempt to write an image to disk and power down. You'll normally
+   want to run the hibernate script instead, to get modules unloaded first.
+
+   - enable_escape:
+
+   Setting this to "1" will enable you to abort a suspend by
+   pressing escape; "0" (default) disables this feature. Note that enabling
+   this option means that you cannot initiate a suspend and then walk away
+   from your computer, expecting it to be secure. With this feature disabled,
+   you can validly have this expectation once Suspend2 begins to write the
+   image to disk. (Prior to this point, it is possible that Suspend2 might
+   abort because of failure to freeze all processes or because constraints
+   on its ability to save the image are not met.)
+
+   - expected_compression:
+
+   This value allows you to set an expected compression ratio, which Suspend2
+   will use in calculating whether it meets constraints on the image size. If
+   this expected compression ratio is not attained, the suspend will abort, so
+   it is wise to allow some spare. You can see what compression ratio is
+   achieved in the logs after suspending.
+
+   - filewriter_target:
+
+   Read this value to get the current setting. Write to it to point Suspend
+   at a new storage location for the filewriter. See above for details of how
+   to set up the filewriter.
+
+   - headerlocations:
+
+   This option tells you the resume2= options to use for swap devices you
+   currently have activated. It is particularly useful when you only want to
+   use a swap file to store your image. See above for further details.
+
+   - image_exists:
+
+   Can be used in a script to determine whether a valid image exists at the
+   location currently pointed to by resume2=.  Echoing anything to this entry
+   removes any current image.
+
+   - image_size_limit:
+
+   The maximum size of the suspend image written to disk, measured in
+   megabytes (1024*1024 bytes).
+
+   - interface_version:
+
+   The value returned by this file can be used by scripts and configuration
+   tools to determine what entries should be looked for. The value is
+   incremented whenever an entry in /proc/suspend2 is obsoleted or 
+   added.
+
+   - last_result:
+
+   The result of the last suspend, as defined in
+   include/linux/suspend-debug.h with the values SUSPEND_ABORTED to
+   SUSPEND_KEPT_IMAGE. This is a bitmask.
+
+   - log_everything (CONFIG_PM_DEBUG):
+
+   Setting this option results in all messages printed being logged. Normally,
+   only a subset are logged, so as to not slow the process and not clutter the
+   logs. Useful for debugging. It can be toggled during a cycle by pressing
+   'L'.
+
+   - pause_between_steps (CONFIG_PM_DEBUG):
+
+   This option is used during debugging, to make Suspend2 pause between
+   each step of the process. It is ignored when the nice display is on.
+
+   - powerdown_method:
+
+   Used to select a method by which Suspend2 should power down after writing
+   the image. Currently:
+
+   3: Attempt to enter Suspend-to-ram.
+   4: Attempt to enter ACPI S4 mode.
+   5: Normal power down.
+
+   Note that these options are highly dependent upon your hardware & software.
+
+   - progressbar_granularity_limit:
+
+   This option can be used to limit the granularity of the progress bar
+   displayed with a bootsplash screen. The value is the maximum number of
+   steps. That is, 10 will make the progress bar jump in 10% increments.
+
+   - reboot:
+
+   This option causes Suspend2 to reboot rather than powering down
+   at the end of saving an image. It can be toggled during a cycle by pressing
+   'R'.
+
+   - resume_commandline:
+
+   This entry can be read after resuming to see the commandline that was used
+   when resuming began. You might use this to set up two bootloader entries
+   that are the same apart from the fact that one includes an extra append=
+   argument "at_work=1". You could then grep resume_commandline in your
+   post-resume scripts and configure networking (for example) differently
+   depending upon whether you're at home or work. resume_commandline can be
+   set to arbitrary text if you wish to remove sensitive contents.
+
+   - swapfile:
+
+   This entry is used to specify the swapfile or partition that
+   Suspend2 will attempt to swapon/swapoff automatically. Thus, if
+   I normally use /dev/hda1 for swap, and want to use /dev/hda2 specifically
+   for my suspend image, I would
+  
+   echo /dev/hda2 > /proc/suspend2/swapfile
+
+   /dev/hda2 would then be automatically swapon'd and swapoff'd. Note that the
+   swapon and swapoff occur while other processes are frozen (including kswapd)
+   so this swap file will not be used up when attempting to free memory. The
+   partition/file is also given the highest priority, so other
+   swapfiles/partitions will only be used to save the image when this one is
+   filled.
+
+   The value of this file is used by headerlocations along with any currently
+   activated swapfiles/partitions.
+
+   - toggle_process_nofreeze
+
+   This entry can be used to toggle the NOFREEZE flag on a process, to allow it
+   to run during Suspending. It should be used with extreme caution. There are
+   strict limitations on what a process running during suspend can do. This is
+   really only intended for use by Suspend's helpers (userui in particular).
+
+   - userui_program
+
+   This entry is used to tell Suspend2 what userspace program to use for
+   providing a user interface while suspending. The program uses a netlink
+   socket to pass messages back and forth to the kernel, allowing it to
+   provide all of the functionality formerly implemented in the kernel's
+   user interface components.
+
+   - version:
+  
+   The version of suspend you have compiled into the currently running kernel.
+
+6. How do you get support?
+
+   Glad you asked. Suspend2 is being actively maintained and supported
+   by Nigel (the guy doing most of the kernel coding at the moment), Bernard
+   (who maintains the hibernate script and userspace user interface components)
+   and its users.
+
+   Resources available include HowTos, FAQs and a Wiki, all accessible via
+   suspend2.net. You can find the mailing lists there.
+
+7. I think I've found a bug. What should I do?
+
+   By far and away, the most common problems people have with Suspend2
+   relate to drivers not having adequate power management support. In this
+   case, it is not a bug in Suspend2, but we can still help you. As we
+   mentioned above, such issues can usually be worked around by building the
+   functionality as modules and unloading them while suspending. Please visit
+   the Wiki for up-to-date lists of known issues and workarounds.
+
+   If this information doesn't help, try running:
+
+   hibernate --bug-report
+
+   ...and sending the output to the users mailing list.
+
+   Good information on how to provide us with useful information from an
+   oops is found in the file REPORTING-BUGS, in the top level directory
+   of the kernel tree. If you get an oops, please especially note the
+   information about running what is printed on the screen through ksymoops.
+   The raw information is useless.
+
+8. When will XXX be supported?
+
+   Suspend2 currently lacks support for x86-64. It is work in progress, but
+   hasn't been made a great priority because debugging is difficult (Nigel
+   doesn't have access to the hardware). 64GB Highmem and discontig-mem are
+   also not supported at the moment.
+
+   Patches for the other items (and anything that's been missed) are welcome. 
+   Please send to the list.
+
+9. How does it work?
+
+   Suspend2 does its work in a number of steps.
+
+   a. Freezing system activity.
+
+   The first main stage in suspending is to stop all other activity. This is
+   achieved in stages. Processes are considered in four groups, which we will
+   describe in reverse order for clarity's sake: Threads with the PF_NOFREEZE
+   flag, kernel threads without this flag, userspace processes with the
+   PF_SYNCTHREAD flag and all other processes. The first set (PF_NOFREEZE) are
+   untouched by the refrigerator code. They are allowed to run during suspending
+   and resuming, and are used to support user interaction, storage access or the
+   like. Other kernel threads (those unneeded while suspending) are frozen last.
+   This leaves us with userspace processes that need to be frozen. When a
+   process enters one of the *_sync system calls, we set a PF_SYNCTHREAD flag on
+   that process for the duration of that call. Processes that have this flag are
+   frozen after processes without it, so that we can seek to ensure that dirty
+   data is synced to disk as quickly as possible in a situation where other
+   processes may be submitting writes at the same time. Freezing the processes
+   that are submitting data stops new I/O from being submitted. Syncthreads can
+   then cleanly finish their work. So the order is:
+
+   - Userspace processes without PF_SYNCTHREAD or PF_NOFREEZE;
+   - Userspace processes with PF_SYNCTHREAD (they won't have NOFREEZE);
+   - Kernel processes without PF_NOFREEZE.
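+
+   In sketch form, the flag is set for the duration of a sync call like
+   this (illustrative only):
+
+	current->flags |= PF_SYNCTHREAD;
+	/* ... do the sync work ... */
+	current->flags &= ~PF_SYNCTHREAD;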
+
+   b. Eating memory.
+
+   For a successful suspend, you need to have enough disk space to store the
+   image and enough memory for the various limitations of Suspend2's
+   algorithm. You can also specify a maximum image size. In order to meet
+   those constraints, Suspend2 may 'eat' memory. If, after freezing
+   processes, the constraints aren't met, Suspend2 will thaw all the
+   other processes and begin to eat memory until its calculations indicate
+   the constraints are met. It will then freeze processes again and recheck
+   its calculations.
+
+   c. Allocation of storage.
+
+   Next, Suspend2 allocates the storage that will be used to save
+   the image.
+
+   The core of Suspend2 knows nothing about how or where pages are stored. We
+   therefore request the active writer (remember you might have compiled in
+   more than one!) to allocate enough storage for our expected image size. If
+   this request cannot be fulfilled, we eat more memory and try again. If it
+   is fulfilled, we seek to allocate additional storage, just in case our
+   expected compression ratio (if any) isn't achieved. This time, however, we
+   just continue if we can't allocate enough storage.
+
+   If these calls to our writer change the characteristics of the image such
+   that we haven't allocated enough memory, we also loop. (The writer may well
+   need to allocate space for its storage information).
+
+   d. Write the first part of the image.
+
+   Suspend2 stores the image in two sets of pages called 'pagesets'.
+   Pageset 2 contains pages on the active and inactive lists; essentially
+   the page cache. Pageset 1 contains all other pages, including the kernel.
+   We use two pagesets for one important reason: We need to make an atomic copy
+   of the kernel to ensure consistency of the image. Without a second pageset,
+   that would limit us to an image that was at most half the amount of memory
+   available. Using two pagesets allows us to store a full image. Since pageset
+   2 pages won't be needed in saving pageset 1, we first save pageset 2 pages.
+   We can then make our atomic copy of the remaining pages using both pageset 2
+   pages and any other pages that are free. While saving both pagesets, we are
+   careful not to corrupt the image. Among other things, we use lowlevel block
+   I/O routines that don't change the pagecache contents.
+
+   The next step, then, is writing pageset 2.
+
+   e. Suspending drivers and storing processor context.
+
+   Having written pageset2, Suspend2 calls the power management functions to
+   notify drivers of the suspend, and saves the processor state in preparation
+   for the atomic copy of memory we are about to make.
+
+   f. Atomic copy.
+
+   At this stage, everything else but the Suspend2 code is halted. Processes
+   are frozen or idling, drivers are quiesced and have stored (ideally and where
+   necessary) their configuration in memory we are about to atomically copy.
+   In our lowlevel architecture specific code, we have saved the CPU state.
+   We can therefore now do our atomic copy before resuming drivers etc.
+
+   g. Save the atomic copy (pageset 1).
+
+   Suspend2 can then write the atomic copy of the remaining pages. Since we
+   have copied the pages into other locations, we can continue to use the
+   normal block I/O routines without fear of corrupting our image.
+
+   h. Save the suspend header.
+
+   Nearly there! We save our settings and other parameters needed for
+   reloading pageset 1 in a 'suspend header'. We also tell our writer to
+   serialise its data at this stage, so that it can reread the image at resume
+   time. Note that the writer can write this data in any format - in the case
+   of the swapwriter, for example, it splits header pages into 4092-byte
+   blocks, using the last four bytes of each page to link the pages together.
+   This is completely transparent to the core.
+
+   i. Set the image header.
+
+   Finally, we edit the header at our resume2= location. The signature is
+   changed by the writer to reflect the fact that an image exists, and to point
+   to the start of that data if necessary (swapwriter).
+
+   j. Power down.
+
+   Or reboot if we're debugging and the appropriate option is selected.
+
+   Whew!
+
+   Reloading the image.
+   --------------------
+
+   Reloading the image is essentially the reverse of all the above. We load
+   our copy of pageset 1, being careful to choose locations that aren't going
+   to be overwritten as we copy it back (We start very early in the boot
+   process, so there are no other processes to quiesce here). We then copy
+   pageset 1 back to its original location in memory and restore the process
+   context. We are now running with the original kernel. Next, we reload the
+   pageset 2 pages, free the memory and swap used by Suspend2, restore
+   the pageset header and restart processes. Sounds easy in comparison to
+   suspending, doesn't it!
+
+   There is of course more to Suspend2 than this, but this explanation
+   should be a good start. If there's interest, I'll write further
+   documentation on range pages and the low level I/O.
+
+10. Who wrote Suspend2?
+
+   (Answer based on the writings of Florent Chabaud, credits in files and
+   Nigel's limited knowledge; apologies to anyone missed out!)
+
+   The main developers of Suspend2 have been...
+
+   Gabor Kuti
+   Pavel Machek
+   Florent Chabaud
+   Bernard Blackham
+   Nigel Cunningham
+
+   They have been aided in their efforts by a host of hundreds, if not
+   thousands, of testers and people who have submitted bug fixes &
+   suggestions. Of special note are the efforts of Michael Frank, who had his
+   computers repeatedly suspend and resume for literally tens of thousands of
+   cycles, and who developed scripts to stress the system and test Suspend2
+   far beyond the point most of us (Nigel included!) would consider testing.
+   His efforts have contributed as much to Suspend2 as any of the names above.
diff -urN oldtree/Documentation/power/swsusp.txt newtree/Documentation/power/swsusp.txt
--- oldtree/Documentation/power/swsusp.txt	2006-03-08 18:47:59.055820000 +0000
+++ newtree/Documentation/power/swsusp.txt	2006-03-08 15:22:33.013490500 +0000
@@ -139,7 +139,8 @@
 website, and not to the Linux Kernel Mailing List. We are working
 toward merging suspend2 into the mainline kernel.
 
-Q: A kernel thread must voluntarily freeze itself (call 'refrigerator').
+Q: A kernel thread must work on the todo list (call 'try_todo_list')
+to enter the refrigerator.
 I found some kernel threads that don't do it, and they don't freeze
 so the system can't sleep. Is this a known behavior?
 
@@ -148,7 +149,7 @@
 should be held at that point and it must be safe to sleep there), and
 add:
 
-       try_to_freeze();
+       try_todo_list();
 
 If the thread is needed for writing the image to storage, you should
 instead set the PF_NOFREEZE process flag when creating the thread (and
diff -urN oldtree/Documentation/power/swsusp.txt.orig newtree/Documentation/power/swsusp.txt.orig
--- oldtree/Documentation/power/swsusp.txt.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/Documentation/power/swsusp.txt.orig	2006-03-08 15:21:14.496583500 +0000
@@ -0,0 +1,373 @@
+Some warnings, first.
+
+ * BIG FAT WARNING *********************************************************
+ *
+ * If you touch anything on disk between suspend and resume...
+ *				...kiss your data goodbye.
+ *
+ * If you do resume from initrd after your filesystems are mounted...
+ *				...bye bye root partition.
+ *			[this is actually same case as above]
+ *
+ * If you have unsupported (*) devices using DMA, you may have some
+ * problems. If your disk driver does not support suspend... (IDE does),
+ * it may cause some problems, too. If you change kernel command line
+ * between suspend and resume, it may do something wrong. If you change
+ * your hardware while system is suspended... well, it was not good idea;
+ * but it will probably only crash.
+ *
+ * (*) suspend/resume support is needed to make it safe.
+ *
+ * If you have any filesystems on USB devices mounted before suspend,
+ * they won't be accessible after resume and you may lose data, as though
+ * you have unplugged the USB devices with mounted filesystems on them
+ * (see the FAQ below for details).
+
+You need to append resume=/dev/your_swap_partition to kernel command
+line. Then you suspend by
+
+echo shutdown > /sys/power/disk; echo disk > /sys/power/state
+
+. If you feel ACPI works pretty well on your system, you might try
+
+echo platform > /sys/power/disk; echo disk > /sys/power/state
+
+. If you have SATA disks, you'll need recent kernels with SATA suspend
+support. For suspend and resume to work, make sure your disk drivers
+are built into kernel -- not modules. [There's way to make
+suspend/resume with modular disk drivers, see FAQ, but you probably
+should not do that.]
+
+If you want to limit the suspend image size to N bytes, do
+
+echo N > /sys/power/image_size
+
+before suspend (it is limited to 500 MB by default).
+
+
+Article about goals and implementation of Software Suspend for Linux
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Author: G‚ábor Kuti
+Last revised: 2003-10-20 by Pavel Machek
+
+Idea and goals to achieve
+
+Nowadays it is common in several laptops that they have a suspend button. It
+saves the state of the machine to a filesystem or to a partition and switches
+to standby mode. Later resuming the machine the saved state is loaded back to
+ram and the machine can continue its work. It has two real benefits. First we
+save ourselves the time machine goes down and later boots up, energy costs
+are real high when running from batteries. The other gain is that we don't have to
+interrupt our programs so processes that are calculating something for a long
+time shouldn't need to be written interruptible.
+
+swsusp saves the state of the machine into active swaps and then reboots or
+powerdowns.  You must explicitly specify the swap partition to resume from with
+``resume='' kernel option. If signature is found it loads and restores saved
+state. If the option ``noresume'' is specified as a boot parameter, it skips
+the resuming.
+
+In the meantime while the system is suspended you should not add/remove any
+of the hardware, write to the filesystems, etc.
+
+Sleep states summary
+====================
+
+There are three different interfaces you can use, /proc/acpi should
+work like this:
+
+In a really perfect world:
+echo 1 > /proc/acpi/sleep       # for standby
+echo 2 > /proc/acpi/sleep       # for suspend to ram
+echo 3 > /proc/acpi/sleep       # for suspend to ram, but with more power conservative
+echo 4 > /proc/acpi/sleep       # for suspend to disk
+echo 5 > /proc/acpi/sleep       # for shutdown unfriendly the system
+
+and perhaps
+echo 4b > /proc/acpi/sleep      # for suspend to disk via s4bios
+
+Frequently Asked Questions
+==========================
+
+Q: well, suspending a server is IMHO a really stupid thing,
+but... (Diego Zuccato):
+
+A: You bought new UPS for your server. How do you install it without
+bringing machine down? Suspend to disk, rearrange power cables,
+resume.
+
+You have your server on UPS. Power died, and UPS is indicating 30
+seconds to failure. What do you do? Suspend to disk.
+
+
+Q: Maybe I'm missing something, but why don't the regular I/O paths work?
+
+A: We do use the regular I/O paths. However we cannot restore the data
+to its original location as we load it. That would create an
+inconsistent kernel state which would certainly result in an oops.
+Instead, we load the image into unused memory and then atomically copy
+it back to it original location. This implies, of course, a maximum
+image size of half the amount of memory.
+
+There are two solutions to this:
+
+* require half of memory to be free during suspend. That way you can
+read "new" data onto free spots, then cli and copy
+
+* assume we had special "polling" ide driver that only uses memory
+between 0-640KB. That way, I'd have to make sure that 0-640KB is free
+during suspending, but otherwise it would work...
+
+suspend2 shares this fundamental limitation, but does not include user
+data and disk caches into "used memory" by saving them in
+advance. That means that the limitation goes away in practice.
+
+Q: Does linux support ACPI S4?
+
+A: Yes. That's what echo platform > /sys/power/disk does.
+
+Q: What is 'suspend2'?
+
+A: suspend2 is 'Software Suspend 2', a forked implementation of
+suspend-to-disk which is available as separate patches for 2.4 and 2.6
+kernels from swsusp.sourceforge.net. It includes support for SMP, 4GB
+highmem and preemption. It also has a extensible architecture that
+allows for arbitrary transformations on the image (compression,
+encryption) and arbitrary backends for writing the image (eg to swap
+or an NFS share[Work In Progress]). Questions regarding suspend2
+should be sent to the mailing list available through the suspend2
+website, and not to the Linux Kernel Mailing List. We are working
+toward merging suspend2 into the mainline kernel.
+
+Q: A kernel thread must voluntarily freeze itself (call 'refrigerator').
+I found some kernel threads that don't do it, and they don't freeze
+so the system can't sleep. Is this a known behavior?
+
+A: All such kernel threads need to be fixed, one by one. Select the
+place where the thread is safe to be frozen (no kernel semaphores
+should be held at that point and it must be safe to sleep there), and
+add:
+
+       try_to_freeze();
+
+If the thread is needed for writing the image to storage, you should
+instead set the PF_NOFREEZE process flag when creating the thread (and
+be very carefull).
+
+
+Q: What is the difference between between "platform", "shutdown" and
+"firmware" in /sys/power/disk?
+
+A:
+
+shutdown: save state in linux, then tell bios to powerdown
+
+platform: save state in linux, then tell bios to powerdown and blink
+          "suspended led"
+
+firmware: tell bios to save state itself [needs BIOS-specific suspend
+	  partition, and has very little to do with swsusp]
+
+"platform" is actually right thing to do, but "shutdown" is most
+reliable.
+
+Q: I do not understand why you have such strong objections to idea of
+selective suspend.
+
+A: Do selective suspend during runtime power managment, that's okay. But
+its useless for suspend-to-disk. (And I do not see how you could use
+it for suspend-to-ram, I hope you do not want that).
+
+Lets see, so you suggest to
+
+* SUSPEND all but swap device and parents
+* Snapshot
+* Write image to disk
+* SUSPEND swap device and parents
+* Powerdown
+
+Oh no, that does not work, if swap device or its parents uses DMA,
+you've corrupted data. You'd have to do
+
+* SUSPEND all but swap device and parents
+* FREEZE swap device and parents
+* Snapshot
+* UNFREEZE swap device and parents
+* Write
+* SUSPEND swap device and parents
+
+Which means that you still need that FREEZE state, and you get more
+complicated code. (And I have not yet introduce details like system
+devices).
+
+Q: There don't seem to be any generally useful behavioral
+distinctions between SUSPEND and FREEZE.
+
+A: Doing SUSPEND when you are asked to do FREEZE is always correct,
+but it may be unneccessarily slow. If you want USB to stay simple,
+slowness may not matter to you. It can always be fixed later.
+
+For devices like disk it does matter, you do not want to spindown for
+FREEZE.
+
+Q: After resuming, system is paging heavilly, leading to very bad interactivity.
+
+A: Try running
+
+cat `cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u` > /dev/null
+
+after resume. swapoff -a; swapon -a may also be useful.
+
+Q: What happens to devices during swsusp? They seem to be resumed
+during system suspend?
+
+A: That's correct. We need to resume them if we want to write image to
+disk. Whole sequence goes like
+
+      Suspend part
+      ~~~~~~~~~~~~
+      running system, user asks for suspend-to-disk
+
+      user processes are stopped
+
+      suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
+      		      with state snapshot
+
+      state snapshot: copy of whole used memory is taken with interrupts disabled
+
+      resume(): devices are woken up so that we can write image to swap
+
+      write image to swap
+
+      suspend(PMSG_SUSPEND): suspend devices so that we can power off
+
+      turn the power off
+
+      Resume part
+      ~~~~~~~~~~~
+      (is actually pretty similar)
+
+      running system, user asks for suspend-to-disk
+
+      user processes are stopped (in common case there are none, but with resume-from-initrd, noone knows)
+
+      read image from disk
+
+      suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
+      		      with image restoration
+
+      image restoration: rewrite memory with image
+
+      resume(): devices are woken up so that system can continue
+
+      thaw all user processes
+
+Q: What is this 'Encrypt suspend image' for?
+
+A: First of all: it is not a replacement for dm-crypt encrypted swap.
+It cannot protect your computer while it is suspended. Instead it does
+protect from leaking sensitive data after resume from suspend.
+
+Think of the following: you suspend while an application is running
+that keeps sensitive data in memory. The application itself prevents
+the data from being swapped out. Suspend, however, must write these
+data to swap to be able to resume later on. Without suspend encryption
+your sensitive data are then stored in plaintext on disk.  This means
+that after resume your sensitive data are accessible to all
+applications having direct access to the swap device which was used
+for suspend. If you don't need swap after resume these data can remain
+on disk virtually forever. Thus it can happen that your system gets
+broken in weeks later and sensitive data which you thought were
+encrypted and protected are retrieved and stolen from the swap device.
+To prevent this situation you should use 'Encrypt suspend image'.
+
+During suspend a temporary key is created and this key is used to
+encrypt the data written to disk. When, during resume, the data was
+read back into memory the temporary key is destroyed which simply
+means that all data written to disk during suspend are then
+inaccessible so they can't be stolen later on.  The only thing that
+you must then take care of is that you call 'mkswap' for the swap
+partition used for suspend as early as possible during regular
+boot. This asserts that any temporary key from an oopsed suspend or
+from a failed or aborted resume is erased from the swap device.
+
+As a rule of thumb use encrypted swap to protect your data while your
+system is shut down or suspended. Additionally use the encrypted
+suspend image to prevent sensitive data from being stolen after
+resume.
+
+Q: Why can't we suspend to a swap file?
+
+A: Because accessing swap file needs the filesystem mounted, and
+filesystem might do something wrong (like replaying the journal)
+during mount.
+
+There are few ways to get that fixed:
+
+1) Probably could be solved by modifying every filesystem to support
+some kind of "really read-only!" option. Patches welcome.
+
+2) suspend2 gets around that by storing absolute positions in on-disk
+image (and blocksize), with resume parameter pointing directly to
+suspend header.
+
+Q: Is there a maximum system RAM size that is supported by swsusp?
+
+A: It should work okay with highmem.
+
+Q: Does swsusp (to disk) use only one swap partition or can it use
+multiple swap partitions (aggregate them into one logical space)?
+
+A: Only one swap partition, sorry.
+
+Q: If my application(s) causes lots of memory & swap space to be used
+(over half of the total system RAM), is it correct that it is likely
+to be useless to try to suspend to disk while that app is running?
+
+A: No, it should work okay, as long as your app does not mlock()
+it. Just prepare big enough swap partition.
+
+Q: What information is useful for debugging suspend-to-disk problems?
+
+A: Well, last messages on the screen are always useful. If something
+is broken, it is usually some kernel driver, therefore trying with as
+little as possible modules loaded helps a lot. I also prefer people to
+suspend from console, preferably without X running. Booting with
+init=/bin/bash, then swapon and starting suspend sequence manually
+usually does the trick. Then it is good idea to try with latest
+vanilla kernel.
+
+Q: How can distributions ship a swsusp-supporting kernel with modular
+disk drivers (especially SATA)?
+
+A: Well, it can be done, load the drivers, then do echo into
+/sys/power/disk/resume file from initrd. Be sure not to mount
+anything, not even read-only mount, or you are going to lose your
+data.
+
+Q: How do I make suspend more verbose?
+
+A: If you want to see any non-error kernel messages on the virtual
+terminal the kernel switches to during suspend, you have to set the
+kernel console loglevel to at least 5, for example by doing
+
+	echo 5 > /proc/sys/kernel/printk
+
+Q: Is it true that if I have a mounted filesystem on a USB device and
+I suspend to disk, I can lose data unless the filesystem has been
+mounted with "sync"?
+
+A: That's right.  It depends on your hardware, and it could be true even for
+suspend-to-RAM.  In fact, even with "-o sync" you can lose data if your
+programs have information in buffers they haven't written out to disk.
+
+If you're lucky, your hardware will support low-power modes for USB
+controllers while the system is asleep.  Lots of hardware doesn't,
+however.  Shutting off the power to a USB controller is equivalent to
+unplugging all the attached devices.
+
+Remember that it's always a bad idea to unplug a disk drive containing a
+mounted filesystem.  With USB that's true even when your system is asleep!
+The safest thing is to unmount all USB-based filesystems before suspending
+and remount them after resuming.
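+
+For example, assuming the filesystem is listed in /etc/fstab and
+mounted on /mnt/usbdisk:
+
+	umount /mnt/usbdisk
+	echo disk > /sys/power/state
+	# ... and after resume:
+	mount /mnt/usbdisk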
+
diff -urN oldtree/arch/arm/mm/init.c newtree/arch/arm/mm/init.c
--- oldtree/arch/arm/mm/init.c	2006-03-08 18:47:58.591791000 +0000
+++ newtree/arch/arm/mm/init.c	2006-03-08 15:22:33.013490500 +0000
@@ -17,6 +17,7 @@
 #include <linux/mman.h>
 #include <linux/nodemask.h>
 #include <linux/initrd.h>
+#include <linux/suspend.h>
 
 #include <asm/mach-types.h>
 #include <asm/hardware.h>
@@ -86,6 +87,11 @@
 	printk("%d pages swap cached\n", cached);
 }
 
+/*
+ * Used by the suspend code: on ARM, every pfn that has a valid
+ * struct page is treated as saveable RAM.
+ */
+int page_is_ram(unsigned long pfn)
+{
+	return pfn_valid(pfn);
+}
+
 static inline pmd_t *pmd_off(pgd_t *pgd, unsigned long virt)
 {
 	return pmd_offset(pgd, virt);
@@ -660,6 +666,15 @@
 		 */
 		sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
 	}
+#ifdef CONFIG_SUSPEND2
+	{
+		unsigned long addr;
+		/* Don't let the __nosave section end up in the image. */
+		for (addr = (unsigned long)&__nosave_begin;
+		     addr < (unsigned long)&__nosave_end;
+		     addr += PAGE_SIZE) {
+			SetPageNosave(virt_to_page(addr));
+		}
+	}
+#endif
 }
 
 void free_initmem(void)
diff -urN oldtree/arch/i386/kernel/time.c newtree/arch/i386/kernel/time.c
--- oldtree/arch/i386/kernel/time.c	2006-03-08 18:47:58.643794250 +0000
+++ newtree/arch/i386/kernel/time.c	2006-03-08 15:22:33.021491000 +0000
@@ -251,7 +251,8 @@
 
 
 
-static long clock_cmos_diff, sleep_start;
+static long clock_cmos_diff;
+static unsigned long sleep_start;
 
 static int timer_suspend(struct sys_device *dev, pm_message_t state)
 {
@@ -269,14 +270,16 @@
 	unsigned long flags;
 	unsigned long sec;
 	unsigned long sleep_length;
+	unsigned long cmos_time;
 
 #ifdef CONFIG_HPET_TIMER
 	if (is_hpet_enabled())
 		hpet_reenable();
 #endif
+	cmos_time = get_cmos_time();
+	sec = cmos_time + clock_cmos_diff;
+	sleep_length = (cmos_time - sleep_start) * HZ;
 	setup_pit_timer();
-	sec = get_cmos_time() + clock_cmos_diff;
-	sleep_length = (get_cmos_time() - sleep_start) * HZ;
 	write_seqlock_irqsave(&xtime_lock, flags);
 	jiffies_64 += sleep_length;
 	wall_jiffies += sleep_length;
diff -urN oldtree/arch/i386/mm/init.c newtree/arch/i386/mm/init.c
--- oldtree/arch/i386/mm/init.c	2006-03-08 18:47:58.659795250 +0000
+++ newtree/arch/i386/mm/init.c	2006-03-08 15:25:27.324384250 +0000
@@ -29,6 +29,7 @@
 #include <linux/efi.h>
 #include <linux/memory_hotplug.h>
 #include <linux/initrd.h>
+#include <linux/suspend.h>
 
 #include <asm/processor.h>
 #include <asm/system.h>
@@ -48,6 +49,7 @@
 unsigned long highstart_pfn, highend_pfn;
 
 static int noinline do_test_wp_bit(void);
+int bad_ppro;
 
 /*
  * Creates a middle page table and puts a pointer to it in the
@@ -279,9 +281,12 @@
 {
 	if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
 		ClearPageReserved(page);
+		ClearPageNosave(page);
 		free_new_highpage(page);
-	} else
+	} else {
 		SetPageReserved(page);
+		SetPageNosave(page);
+	}
 }
 
 static int add_one_highpage_hotplug(struct page *page, unsigned long pfn)
@@ -384,7 +389,7 @@
 #endif
 }
 
-#ifdef CONFIG_SOFTWARE_SUSPEND
+#ifdef CONFIG_PM
 /*
  * Swap suspend & friends need this for resume because things like the intel-agp
  * driver might have split up a kernel 4MB mapping.
@@ -570,7 +575,7 @@
 	extern int ppro_with_ram_bug(void);
 	int codesize, reservedpages, datasize, initsize;
 	int tmp;
-	int bad_ppro;
+	struct page *tmp_page;
 
 #ifdef CONFIG_FLATMEM
 	if (!mem_map)
@@ -601,12 +606,23 @@
 	totalram_pages += free_all_bootmem();
 
 	reservedpages = 0;
-	for (tmp = 0; tmp < max_low_pfn; tmp++)
-		/*
-		 * Only count reserved RAM pages
-		 */
-		if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
-			reservedpages++;
+	for (tmp = 0; tmp < max_low_pfn; tmp++) {
+		if (page_is_ram(tmp)) {
+			/*
+			 * Only count reserved RAM pages
+			 */
+			if (PageReserved(pfn_to_page(tmp)))
+				reservedpages++;
+		} else
+			/*
+			 * Non-RAM pages are always nosave
+			 */
+			SetPageNosave(pfn_to_page(tmp));
+	}
+
+	for (tmp_page = virt_to_page(&__nosave_begin);
+			tmp_page < virt_to_page(&__nosave_end); tmp_page++)
+		SetPageNosave(tmp_page);
 
 	set_highmem_pages_init(bad_ppro);
 
@@ -749,7 +765,8 @@
 
 	for (addr = begin; addr < end; addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
-		init_page_count(virt_to_page(addr));
+		ClearPageNosave(virt_to_page(addr));
+		init_page_count(virt_to_page(addr));
 #ifdef CONFIG_DEBUG_INITDATA
 		/*
 		 * Unmap the page, and leak it.  So any further accesses will
diff -urN oldtree/arch/ppc/mm/init.c newtree/arch/ppc/mm/init.c
--- oldtree/arch/ppc/mm/init.c	2006-03-08 18:47:58.831806000 +0000
+++ newtree/arch/ppc/mm/init.c	2006-03-08 15:22:33.037492000 +0000
@@ -32,6 +32,7 @@
 #include <linux/highmem.h>
 #include <linux/initrd.h>
 #include <linux/pagemap.h>
+#include <linux/suspend.h>
 
 #include <asm/pgalloc.h>
 #include <asm/prom.h>
@@ -407,8 +408,10 @@
 	/* if we are booted from BootX with an initial ramdisk,
 	   make sure the ramdisk pages aren't reserved. */
 	if (initrd_start) {
-		for (addr = initrd_start; addr < initrd_end; addr += PAGE_SIZE)
+		for (addr = initrd_start; addr < initrd_end; addr += PAGE_SIZE) {
 			ClearPageReserved(virt_to_page(addr));
+			ClearPageNosave(virt_to_page(addr));
+		}
 	}
 #endif /* CONFIG_BLK_DEV_INITRD */
 
@@ -417,13 +420,21 @@
 	if ( rtas_data )
 		for (addr = (ulong)__va(rtas_data);
 		     addr < PAGE_ALIGN((ulong)__va(rtas_data)+rtas_size) ;
-		     addr += PAGE_SIZE)
+		     addr += PAGE_SIZE) {
 			SetPageReserved(virt_to_page(addr));
+			SetPageNosave(virt_to_page(addr));
+		}
 #endif
 	for (addr = PAGE_OFFSET; addr < (unsigned long)high_memory;
 	     addr += PAGE_SIZE) {
 		if (!PageReserved(virt_to_page(addr)))
 			continue;
+		/*
+		 * Mark nosave pages
+		 */
+		if (addr >= (unsigned long)&__nosave_begin &&
+		    addr < (unsigned long)&__nosave_end)
+			SetPageNosave(virt_to_page(addr));
+
 		if (addr < (ulong) etext)
 			codepages++;
 		else if (addr >= (unsigned long)&__init_begin
diff -urN oldtree/arch/x86_64/kernel/e820.c newtree/arch/x86_64/kernel/e820.c
--- oldtree/arch/x86_64/kernel/e820.c	2006-03-08 18:47:11.432843750 +0000
+++ newtree/arch/x86_64/kernel/e820.c	2006-03-08 15:22:33.041492250 +0000
@@ -186,6 +186,23 @@
 	return end_pfn;	
 }
 
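+/* Return 1 if page frame 'pagenr' lies in a usable (E820_RAM) region. */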
+int page_is_ram(unsigned long pagenr)
+{
+	unsigned long start = pagenr << PAGE_SHIFT;
+	int i;
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+
+		if (ei->addr+ei->size <= start ||
+		    ei->addr >= (start + PAGE_SIZE))
+			continue;
+
+		return (ei->type == E820_RAM);
+	}
+
+	return 0;
+}
+
 /* 
  * Compute how much memory is missing in a range.
  * Unlike the other functions in this file the arguments are in page numbers.
diff -urN oldtree/arch/x86_64/kernel/suspend.c newtree/arch/x86_64/kernel/suspend.c
--- oldtree/arch/x86_64/kernel/suspend.c	2006-03-08 18:47:11.448844750 +0000
+++ newtree/arch/x86_64/kernel/suspend.c	2006-03-08 15:22:33.041492250 +0000
@@ -13,6 +13,7 @@
 #include <asm/proto.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
+#include <asm/i387.h>
 
 struct saved_context saved_context;
 
@@ -22,6 +23,8 @@
 unsigned long saved_context_r12, saved_context_r13, saved_context_r14, saved_context_r15;
 unsigned long saved_context_eflags;
 
+void fix_processor_context(void);
+
 void __save_processor_state(struct saved_context *ctxt)
 {
 	kernel_fpu_begin();
@@ -141,7 +144,7 @@
 
 }
 
-#ifdef CONFIG_SOFTWARE_SUSPEND
+#if defined(CONFIG_SOFTWARE_SUSPEND)
 /* Defined in arch/x86_64/kernel/suspend_asm.S */
 extern int restore_image(void);
 
@@ -220,4 +223,5 @@
 	restore_image();
 	return 0;
 }
+
 #endif /* CONFIG_SOFTWARE_SUSPEND */
diff -urN oldtree/arch/x86_64/kernel/time.c newtree/arch/x86_64/kernel/time.c
--- oldtree/arch/x86_64/kernel/time.c	2006-03-08 18:47:58.999816500 +0000
+++ newtree/arch/x86_64/kernel/time.c	2006-03-08 15:22:33.057493250 +0000
@@ -517,8 +517,6 @@
  * standard 8.3 MHz ISA bus.
  */
 
-	spin_lock_irqsave(&rtc_lock, flags);
-
 	while (timeout && (!uip || this)) {
 		uip |= this;
 		this = CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP;
@@ -541,8 +539,6 @@
 		extyear = CMOS_READ(acpi_fadt.century);
 #endif
 
-	spin_unlock_irqrestore(&rtc_lock, flags);
-
 	/*
 	 * We know that x86-64 always uses BCD format, no need to check the
 	 * config register.
@@ -570,6 +566,20 @@
 	return mktime(year, mon, day, hour, min, sec);
 }
 
+static unsigned long get_cmos_time(void)
+{
+	unsigned long flags;
+	unsigned long result;
+
+	spin_lock_irqsave(&rtc_lock, flags);
+
+	result = __get_cmos_time();
+
+	spin_unlock_irqrestore(&rtc_lock, flags);
+
+	return result;
+}
+
 #ifdef CONFIG_CPU_FREQ
 
 /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
@@ -1030,7 +1040,7 @@
 	/*
 	 * Estimate time zone so that set_time can update the clock
 	 */
-	long cmos_time =  get_cmos_time();
+	long cmos_time =  __get_cmos_time();
 
 	clock_cmos_diff = -cmos_time;
 	clock_cmos_diff += get_seconds();
diff -urN oldtree/arch/x86_64/mm/init.c newtree/arch/x86_64/mm/init.c
--- oldtree/arch/x86_64/mm/init.c	2006-03-08 18:47:59.003816750 +0000
+++ newtree/arch/x86_64/mm/init.c	2006-03-08 15:26:45.337259750 +0000
@@ -593,7 +593,8 @@
 	addr = (unsigned long)(&__init_begin);
 	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
-		init_page_count(virt_to_page(addr));
+		ClearPageNosave(virt_to_page(addr));
+		init_page_count(virt_to_page(addr));
 		memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); 
 		free_page(addr);
 		totalram_pages++;
@@ -744,3 +745,21 @@
 {
 	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
 }
+
+#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_SUSPEND2)
+/*
+ * Software suspend & friends need this for resume, because things like
+ * the intel-agp driver might have split up a kernel large (2MB) mapping.
+ */
+char __nosavedata swsusp_pg_dir[PAGE_SIZE]
+	__attribute__ ((aligned (PAGE_SIZE)));
+
+static inline void save_pg_dir(void)
+{
+	memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
+}
+#else
+static inline void save_pg_dir(void)
+{
+}
+#endif
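In the free_initmem() hunk above, ClearPageNosave() is added alongside
ClearPageReserved() so freed init pages become eligible for the suspend image
again; a page left with PG_nosave set would be skipped when the image is
written. A minimal sketch of the other half of that pairing, using the
2.6.16-era PG_nosave accessors (mark_nosave_range() is a hypothetical helper,
not part of this patch):

	/* Exclude a pfn range from the suspend image. Pages marked this
	 * way must have the flag cleared again (as free_initmem() does)
	 * before being handed back to the page allocator. */
	static void mark_nosave_range(unsigned long start_pfn,
				      unsigned long end_pfn)
	{
		unsigned long pfn;

		for (pfn = start_pfn; pfn < end_pfn; pfn++)
			SetPageNosave(pfn_to_page(pfn));
	}
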
diff -urN oldtree/block/ll_rw_blk.c newtree/block/ll_rw_blk.c
--- oldtree/block/ll_rw_blk.c	2006-03-08 18:47:59.019817750 +0000
+++ newtree/block/ll_rw_blk.c	2006-03-08 15:29:40.012176250 +0000
@@ -29,6 +29,10 @@
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
 #include <linux/blktrace_api.h>
+#include <linux/freezer.h>
+#include <linux/mount.h>
+#include <linux/suspend2.h>
+
 
 /*
  * for max sense size
@@ -3136,12 +3140,26 @@
 	else
 		mod_page_state(pgpgin, count);
 
+	if (unlikely((bio->bi_flags & (1 << BIO_SUSPEND2)) &&
+			test_action_state(SUSPEND_TEST_BIO) &&
+			(rw & WRITE))) {
+		char b[BDEVNAME_SIZE];
+		printk("FAKEDWRITE: %s(%d): %s block %Lu on %s\n",
+			current->comm, current->pid,
+			(rw & WRITE) ? "WRITE" : "READ",
+			(unsigned long long)bio->bi_sector,
+			bdevname(bio->bi_bdev,b));
+		bio_endio(bio, PAGE_SIZE, 0);
+		return;
+	}
+
 	if (unlikely(block_dump)) {
 		char b[BDEVNAME_SIZE];
-		printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
+		printk(KERN_DEBUG "%s(%d): %s block %Lu size %d on %s\n",
 			current->comm, current->pid,
 			(rw & WRITE) ? "WRITE" : "READ",
 			(unsigned long long)bio->bi_sector,
+			bio->bi_size,
 			bdevname(bio->bi_bdev,b));
 	}
 
@@ -3535,7 +3553,7 @@
 {
 	int i;
 
-	kblockd_workqueue = create_workqueue("kblockd");
+	kblockd_workqueue = create_nofreeze_workqueue("kblockd");
 	if (!kblockd_workqueue)
 		panic("Failed to create kblockd\n");
 
diff -urN oldtree/block/ll_rw_blk.c.orig newtree/block/ll_rw_blk.c.orig
--- oldtree/block/ll_rw_blk.c.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/block/ll_rw_blk.c.orig	2006-03-08 15:21:14.456581000 +0000
@@ -0,0 +1,3939 @@
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
+ * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
+ * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
+ * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July 2000
+ * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
+ */
+
+/*
+ * This handles all read/write requests to block devices
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/kernel_stat.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>	/* for max_pfn/max_low_pfn */
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/blktrace_api.h>
+
+/*
+ * for max sense size
+ */
+#include <scsi/scsi_cmnd.h>
+
+static void blk_unplug_work(void *data);
+static void blk_unplug_timeout(unsigned long data);
+static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io);
+static void init_request_from_bio(struct request *req, struct bio *bio);
+static int __make_request(request_queue_t *q, struct bio *bio);
+
+/*
+ * For the allocated request tables
+ */
+static kmem_cache_t *request_cachep;
+
+/*
+ * For queue allocation
+ */
+static kmem_cache_t *requestq_cachep;
+
+/*
+ * For io context allocations
+ */
+static kmem_cache_t *iocontext_cachep;
+
+static wait_queue_head_t congestion_wqh[2] = {
+		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
+		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
+	};
+
+/*
+ * Controlling structure to kblockd
+ */
+static struct workqueue_struct *kblockd_workqueue;
+
+unsigned long blk_max_low_pfn, blk_max_pfn;
+
+EXPORT_SYMBOL(blk_max_low_pfn);
+EXPORT_SYMBOL(blk_max_pfn);
+
+static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
+
+/* Amount of time in which a process may batch requests */
+#define BLK_BATCH_TIME	(HZ/50UL)
+
+/* Number of requests a "batching" process may submit */
+#define BLK_BATCH_REQ	32
+
+/*
+ * Return the threshold (number of used requests) at which the queue is
+ * considered to be congested.  It includes a little hysteresis to keep the
+ * context switch rate down.
+ */
+static inline int queue_congestion_on_threshold(struct request_queue *q)
+{
+	return q->nr_congestion_on;
+}
+
+/*
+ * The threshold at which a queue is considered to be uncongested
+ */
+static inline int queue_congestion_off_threshold(struct request_queue *q)
+{
+	return q->nr_congestion_off;
+}
+
+static void blk_queue_congestion_threshold(struct request_queue *q)
+{
+	int nr;
+
+	nr = q->nr_requests - (q->nr_requests / 8) + 1;
+	if (nr > q->nr_requests)
+		nr = q->nr_requests;
+	q->nr_congestion_on = nr;
+
+	nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
+	if (nr < 1)
+		nr = 1;
+	q->nr_congestion_off = nr;
+}
+
+/*
+ * A queue has just exited congestion.  Note this in the global counter of
+ * congested queues, and wake up anyone who was waiting for requests to be
+ * put back.
+ */
+static void clear_queue_congested(request_queue_t *q, int rw)
+{
+	enum bdi_state bit;
+	wait_queue_head_t *wqh = &congestion_wqh[rw];
+
+	bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
+	clear_bit(bit, &q->backing_dev_info.state);
+	smp_mb__after_clear_bit();
+	if (waitqueue_active(wqh))
+		wake_up(wqh);
+}
+
+/*
+ * A queue has just entered congestion.  Flag that in the queue's VM-visible
+ * state flags and increment the global counter of congested queues.
+ */
+static void set_queue_congested(request_queue_t *q, int rw)
+{
+	enum bdi_state bit;
+
+	bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
+	set_bit(bit, &q->backing_dev_info.state);
+}
+
+/**
+ * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
+ * @bdev:	device
+ *
+ * Locates the passed device's request queue and returns the address of its
+ * backing_dev_info
+ *
+ * Will return NULL if the request queue cannot be located.
+ */
+struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
+{
+	struct backing_dev_info *ret = NULL;
+	request_queue_t *q = bdev_get_queue(bdev);
+
+	if (q)
+		ret = &q->backing_dev_info;
+	return ret;
+}
+
+EXPORT_SYMBOL(blk_get_backing_dev_info);
+
+void blk_queue_activity_fn(request_queue_t *q, activity_fn *fn, void *data)
+{
+	q->activity_fn = fn;
+	q->activity_data = data;
+}
+
+EXPORT_SYMBOL(blk_queue_activity_fn);
+
+/**
+ * blk_queue_prep_rq - set a prepare_request function for queue
+ * @q:		queue
+ * @pfn:	prepare_request function
+ *
+ * It's possible for a queue to register a prepare_request callback which
+ * is invoked before the request is handed to the request_fn. The goal of
+ * the function is to prepare a request for I/O, it can be used to build a
+ * cdb from the request data for instance.
+ *
+ */
+void blk_queue_prep_rq(request_queue_t *q, prep_rq_fn *pfn)
+{
+	q->prep_rq_fn = pfn;
+}
+
+EXPORT_SYMBOL(blk_queue_prep_rq);
+
+/**
+ * blk_queue_merge_bvec - set a merge_bvec function for queue
+ * @q:		queue
+ * @mbfn:	merge_bvec_fn
+ *
+ * Usually queues have static limitations on the max sectors or segments that
+ * we can put in a request. Stacking drivers may have some settings that
+ * are dynamic, and thus we have to query the queue whether it is ok to
+ * add a new bio_vec to a bio at a given offset or not. If the block device
+ * has such limitations, it needs to register a merge_bvec_fn to control
+ * the size of bio's sent to it. Note that a block device *must* allow a
+ * single page to be added to an empty bio. The block device driver may want
+ * to use the bio_split() function to deal with these bio's. By default
+ * no merge_bvec_fn is defined for a queue, and only the fixed limits are
+ * honored.
+ */
+void blk_queue_merge_bvec(request_queue_t *q, merge_bvec_fn *mbfn)
+{
+	q->merge_bvec_fn = mbfn;
+}
+
+EXPORT_SYMBOL(blk_queue_merge_bvec);
+
+void blk_queue_softirq_done(request_queue_t *q, softirq_done_fn *fn)
+{
+	q->softirq_done_fn = fn;
+}
+
+EXPORT_SYMBOL(blk_queue_softirq_done);
+
+/**
+ * blk_queue_make_request - define an alternate make_request function for a device
+ * @q:  the request queue for the device to be affected
+ * @mfn: the alternate make_request function
+ *
+ * Description:
+ *    The normal way for &struct bios to be passed to a device
+ *    driver is for them to be collected into requests on a request
+ *    queue, and then to allow the device driver to select requests
+ *    off that queue when it is ready.  This works well for many block
+ *    devices. However some block devices (typically virtual devices
+ *    such as md or lvm) do not benefit from the processing on the
+ *    request queue, and are served best by having the requests passed
+ *    directly to them.  This can be achieved by providing a function
+ *    to blk_queue_make_request().
+ *
+ * Caveat:
+ *    The driver that does this *must* be able to deal appropriately
+ *    with buffers in "highmemory". This can be accomplished by either calling
+ *    __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
+ *    blk_queue_bounce() to create a buffer in normal memory.
+ **/
+void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
+{
+	/*
+	 * set defaults
+	 */
+	q->nr_requests = BLKDEV_MAX_RQ;
+	blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
+	blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
+	q->make_request_fn = mfn;
+	q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+	q->backing_dev_info.state = 0;
+	q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
+	blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
+	blk_queue_hardsect_size(q, 512);
+	blk_queue_dma_alignment(q, 511);
+	blk_queue_congestion_threshold(q);
+	q->nr_batching = BLK_BATCH_REQ;
+
+	q->unplug_thresh = 4;		/* hmm */
+	q->unplug_delay = (3 * HZ) / 1000;	/* 3 milliseconds */
+	if (q->unplug_delay == 0)
+		q->unplug_delay = 1;
+
+	INIT_WORK(&q->unplug_work, blk_unplug_work, q);
+
+	q->unplug_timer.function = blk_unplug_timeout;
+	q->unplug_timer.data = (unsigned long)q;
+
+	/*
+	 * by default assume old behaviour and bounce for any highmem page
+	 */
+	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
+
+	blk_queue_activity_fn(q, NULL, NULL);
+}
+
+EXPORT_SYMBOL(blk_queue_make_request);
+
+static inline void rq_init(request_queue_t *q, struct request *rq)
+{
+	INIT_LIST_HEAD(&rq->queuelist);
+	INIT_LIST_HEAD(&rq->donelist);
+
+	rq->errors = 0;
+	rq->rq_status = RQ_ACTIVE;
+	rq->bio = rq->biotail = NULL;
+	rq->ioprio = 0;
+	rq->buffer = NULL;
+	rq->ref_count = 1;
+	rq->q = q;
+	rq->waiting = NULL;
+	rq->special = NULL;
+	rq->data_len = 0;
+	rq->data = NULL;
+	rq->nr_phys_segments = 0;
+	rq->sense = NULL;
+	rq->end_io = NULL;
+	rq->end_io_data = NULL;
+	rq->completion_data = NULL;
+}
+
+/**
+ * blk_queue_ordered - does this queue support ordered writes
+ * @q:        the request queue
+ * @ordered:  one of QUEUE_ORDERED_*
+ * @prepare_flush_fn: rq setup helper for cache flush ordered writes
+ *
+ * Description:
+ *   For journalled file systems, doing ordered writes on a commit
+ *   block instead of explicitly doing wait_on_buffer (which is bad
+ *   for performance) can be a big win. Block drivers supporting this
+ *   feature should call this function and indicate so.
+ *
+ **/
+int blk_queue_ordered(request_queue_t *q, unsigned ordered,
+		      prepare_flush_fn *prepare_flush_fn)
+{
+	if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) &&
+	    prepare_flush_fn == NULL) {
+		printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n");
+		return -EINVAL;
+	}
+
+	if (ordered != QUEUE_ORDERED_NONE &&
+	    ordered != QUEUE_ORDERED_DRAIN &&
+	    ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
+	    ordered != QUEUE_ORDERED_DRAIN_FUA &&
+	    ordered != QUEUE_ORDERED_TAG &&
+	    ordered != QUEUE_ORDERED_TAG_FLUSH &&
+	    ordered != QUEUE_ORDERED_TAG_FUA) {
+		printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
+		return -EINVAL;
+	}
+
+	q->ordered = ordered;
+	q->next_ordered = ordered;
+	q->prepare_flush_fn = prepare_flush_fn;
+
+	return 0;
+}
+
+EXPORT_SYMBOL(blk_queue_ordered);
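+
+/*
+ * For example, a driver for a write-back caching disk that can issue cache
+ * flush commands might enable ordered writes with something like:
+ *
+ *	blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH, my_prepare_flush_fn);
+ *
+ * where my_prepare_flush_fn (driver-supplied, name illustrative) sets up
+ * the passed request as a cache flush command.
+ */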
+
+/**
+ * blk_queue_issue_flush_fn - set function for issuing a flush
+ * @q:     the request queue
+ * @iff:   the function to be called issuing the flush
+ *
+ * Description:
+ *   If a driver supports issuing a flush command, the support is notified
+ *   to the block layer by defining it through this call.
+ *
+ **/
+void blk_queue_issue_flush_fn(request_queue_t *q, issue_flush_fn *iff)
+{
+	q->issue_flush_fn = iff;
+}
+
+EXPORT_SYMBOL(blk_queue_issue_flush_fn);
+
+/*
+ * Cache flushing for ordered writes handling
+ */
+inline unsigned blk_ordered_cur_seq(request_queue_t *q)
+{
+	if (!q->ordseq)
+		return 0;
+	return 1 << ffz(q->ordseq);
+}
+
+unsigned blk_ordered_req_seq(struct request *rq)
+{
+	request_queue_t *q = rq->q;
+
+	BUG_ON(q->ordseq == 0);
+
+	if (rq == &q->pre_flush_rq)
+		return QUEUE_ORDSEQ_PREFLUSH;
+	if (rq == &q->bar_rq)
+		return QUEUE_ORDSEQ_BAR;
+	if (rq == &q->post_flush_rq)
+		return QUEUE_ORDSEQ_POSTFLUSH;
+
+	if ((rq->flags & REQ_ORDERED_COLOR) ==
+	    (q->orig_bar_rq->flags & REQ_ORDERED_COLOR))
+		return QUEUE_ORDSEQ_DRAIN;
+	else
+		return QUEUE_ORDSEQ_DONE;
+}
+
+void blk_ordered_complete_seq(request_queue_t *q, unsigned seq, int error)
+{
+	struct request *rq;
+	int uptodate;
+
+	if (error && !q->orderr)
+		q->orderr = error;
+
+	BUG_ON(q->ordseq & seq);
+	q->ordseq |= seq;
+
+	if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
+		return;
+
+	/*
+	 * Okay, sequence complete.
+	 */
+	rq = q->orig_bar_rq;
+	uptodate = q->orderr ? q->orderr : 1;
+
+	q->ordseq = 0;
+
+	end_that_request_first(rq, uptodate, rq->hard_nr_sectors);
+	end_that_request_last(rq, uptodate);
+}
+
+static void pre_flush_end_io(struct request *rq, int error)
+{
+	elv_completed_request(rq->q, rq);
+	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
+}
+
+static void bar_end_io(struct request *rq, int error)
+{
+	elv_completed_request(rq->q, rq);
+	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
+}
+
+static void post_flush_end_io(struct request *rq, int error)
+{
+	elv_completed_request(rq->q, rq);
+	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
+}
+
+static void queue_flush(request_queue_t *q, unsigned which)
+{
+	struct request *rq;
+	rq_end_io_fn *end_io;
+
+	if (which == QUEUE_ORDERED_PREFLUSH) {
+		rq = &q->pre_flush_rq;
+		end_io = pre_flush_end_io;
+	} else {
+		rq = &q->post_flush_rq;
+		end_io = post_flush_end_io;
+	}
+
+	rq_init(q, rq);
+	rq->flags = REQ_HARDBARRIER;
+	rq->elevator_private = NULL;
+	rq->rq_disk = q->bar_rq.rq_disk;
+	rq->rl = NULL;
+	rq->end_io = end_io;
+	q->prepare_flush_fn(q, rq);
+
+	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
+}
+
+static inline struct request *start_ordered(request_queue_t *q,
+					    struct request *rq)
+{
+	q->bi_size = 0;
+	q->orderr = 0;
+	q->ordered = q->next_ordered;
+	q->ordseq |= QUEUE_ORDSEQ_STARTED;
+
+	/*
+	 * Prep proxy barrier request.
+	 */
+	blkdev_dequeue_request(rq);
+	q->orig_bar_rq = rq;
+	rq = &q->bar_rq;
+	rq_init(q, rq);
+	rq->flags = bio_data_dir(q->orig_bar_rq->bio);
+	rq->flags |= q->ordered & QUEUE_ORDERED_FUA ? REQ_FUA : 0;
+	rq->elevator_private = NULL;
+	rq->rl = NULL;
+	init_request_from_bio(rq, q->orig_bar_rq->bio);
+	rq->end_io = bar_end_io;
+
+	/*
+	 * Queue ordered sequence.  As we stack them at the head, we
+	 * need to queue in reverse order.  Note that we rely on that
+	 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
+	 * request gets inbetween ordered sequence.
+	 */
+	if (q->ordered & QUEUE_ORDERED_POSTFLUSH)
+		queue_flush(q, QUEUE_ORDERED_POSTFLUSH);
+	else
+		q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH;
+
+	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
+
+	if (q->ordered & QUEUE_ORDERED_PREFLUSH) {
+		queue_flush(q, QUEUE_ORDERED_PREFLUSH);
+		rq = &q->pre_flush_rq;
+	} else
+		q->ordseq |= QUEUE_ORDSEQ_PREFLUSH;
+
+	if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0)
+		q->ordseq |= QUEUE_ORDSEQ_DRAIN;
+	else
+		rq = NULL;
+
+	return rq;
+}
+
+int blk_do_ordered(request_queue_t *q, struct request **rqp)
+{
+	struct request *rq = *rqp;
+	int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);
+
+	if (!q->ordseq) {
+		if (!is_barrier)
+			return 1;
+
+		if (q->next_ordered != QUEUE_ORDERED_NONE) {
+			*rqp = start_ordered(q, rq);
+			return 1;
+		} else {
+			/*
+			 * This can happen when the queue switches to
+			 * ORDERED_NONE while this request is on it.
+			 */
+			blkdev_dequeue_request(rq);
+			end_that_request_first(rq, -EOPNOTSUPP,
+					       rq->hard_nr_sectors);
+			end_that_request_last(rq, -EOPNOTSUPP);
+			*rqp = NULL;
+			return 0;
+		}
+	}
+
+	/*
+	 * Ordered sequence in progress
+	 */
+
+	/* Special requests are not subject to ordering rules. */
+	if (!blk_fs_request(rq) &&
+	    rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
+		return 1;
+
+	if (q->ordered & QUEUE_ORDERED_TAG) {
+		/* Ordered by tag.  Blocking the next barrier is enough. */
+		if (is_barrier && rq != &q->bar_rq)
+			*rqp = NULL;
+	} else {
+		/* Ordered by draining.  Wait for turn. */
+		WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
+		if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
+			*rqp = NULL;
+	}
+
+	return 1;
+}
+
+static int flush_dry_bio_endio(struct bio *bio, unsigned int bytes, int error)
+{
+	request_queue_t *q = bio->bi_private;
+	struct bio_vec *bvec;
+	int i;
+
+	/*
+	 * This is a dry run; restore bi_sector and bi_size.  We'll finish
+	 * this request again with the original bi_end_io after an
+	 * error occurs or post flush is complete.
+	 */
+	q->bi_size += bytes;
+
+	if (bio->bi_size)
+		return 1;
+
+	/* Rewind bvec's */
+	bio->bi_idx = 0;
+	bio_for_each_segment(bvec, bio, i) {
+		bvec->bv_len += bvec->bv_offset;
+		bvec->bv_offset = 0;
+	}
+
+	/* Reset bio */
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio->bi_size = q->bi_size;
+	bio->bi_sector -= (q->bi_size >> 9);
+	q->bi_size = 0;
+
+	return 0;
+}
+
+static inline int ordered_bio_endio(struct request *rq, struct bio *bio,
+				    unsigned int nbytes, int error)
+{
+	request_queue_t *q = rq->q;
+	bio_end_io_t *endio;
+	void *private;
+
+	if (&q->bar_rq != rq)
+		return 0;
+
+	/*
+	 * Okay, this is the barrier request in progress, dry finish it.
+	 */
+	if (error && !q->orderr)
+		q->orderr = error;
+
+	endio = bio->bi_end_io;
+	private = bio->bi_private;
+	bio->bi_end_io = flush_dry_bio_endio;
+	bio->bi_private = q;
+
+	bio_endio(bio, nbytes, error);
+
+	bio->bi_end_io = endio;
+	bio->bi_private = private;
+
+	return 1;
+}
+
+/**
+ * blk_queue_bounce_limit - set bounce buffer limit for queue
+ * @q:  the request queue for the device
+ * @dma_addr:   bus address limit
+ *
+ * Description:
+ *    Different hardware can have different requirements as to what pages
+ *    it can do I/O directly to. A low level driver can call
+ *    blk_queue_bounce_limit to have lower memory pages allocated as bounce
+ *    buffers for doing I/O to pages residing above @dma_addr.
+ **/
+void blk_queue_bounce_limit(request_queue_t *q, u64 dma_addr)
+{
+	unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT;
+	int dma = 0;
+
+	q->bounce_gfp = GFP_NOIO;
+#if BITS_PER_LONG == 64
+	/* Assume anything >= 4GB can be handled by IOMMU. 
+	   Actually some IOMMUs can handle everything, but I don't
+	   know of a way to test this here. */
+	if (bounce_pfn < (0xffffffff>>PAGE_SHIFT))
+		dma = 1;
+	q->bounce_pfn = max_low_pfn;
+#else
+	if (bounce_pfn < blk_max_low_pfn)
+		dma = 1;
+	q->bounce_pfn = bounce_pfn;
+#endif
+	if (dma) {	
+		init_emergency_isa_pool();
+		q->bounce_gfp = GFP_NOIO | GFP_DMA;
+		q->bounce_pfn = bounce_pfn;
+	}
+}
+
+EXPORT_SYMBOL(blk_queue_bounce_limit);
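+
+/*
+ * For example, an ISA device limited to 24-bit DMA addressing might call:
+ *
+ *	blk_queue_bounce_limit(q, BLK_BOUNCE_ISA);
+ *
+ * so that pages above the ISA DMA threshold are bounced through low memory.
+ */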
+
+/**
+ * blk_queue_max_sectors - set max sectors for a request for this queue
+ * @q:  the request queue for the device
+ * @max_sectors:  max sectors in the usual 512b unit
+ *
+ * Description:
+ *    Enables a low level driver to set an upper limit on the size of
+ *    received requests.
+ **/
+void blk_queue_max_sectors(request_queue_t *q, unsigned int max_sectors)
+{
+	if ((max_sectors << 9) < PAGE_CACHE_SIZE) {
+		max_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
+		printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors);
+	}
+
+	if (BLK_DEF_MAX_SECTORS > max_sectors)
+		q->max_hw_sectors = q->max_sectors = max_sectors;
+ 	else {
+		q->max_sectors = BLK_DEF_MAX_SECTORS;
+		q->max_hw_sectors = max_sectors;
+	}
+}
+
+EXPORT_SYMBOL(blk_queue_max_sectors);
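+
+/*
+ * For example, a controller that cannot transfer more than 64KiB per
+ * command might call:
+ *
+ *	blk_queue_max_sectors(q, 128);	(128 * 512b = 64KiB)
+ */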
+
+/**
+ * blk_queue_max_phys_segments - set max phys segments for a request for this queue
+ * @q:  the request queue for the device
+ * @max_segments:  max number of segments
+ *
+ * Description:
+ *    Enables a low level driver to set an upper limit on the number of
+ *    physical data segments in a request.  This would be the largest sized
+ *    scatter list the driver could handle.
+ **/
+void blk_queue_max_phys_segments(request_queue_t *q, unsigned short max_segments)
+{
+	if (!max_segments) {
+		max_segments = 1;
+		printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
+	}
+
+	q->max_phys_segments = max_segments;
+}
+
+EXPORT_SYMBOL(blk_queue_max_phys_segments);
+
+/**
+ * blk_queue_max_hw_segments - set max hw segments for a request for this queue
+ * @q:  the request queue for the device
+ * @max_segments:  max number of segments
+ *
+ * Description:
+ *    Enables a low level driver to set an upper limit on the number of
+ *    hw data segments in a request.  This would be the largest number of
+ *    address/length pairs the host adapter can actually give as once
+ *    to the device.
+ **/
+void blk_queue_max_hw_segments(request_queue_t *q, unsigned short max_segments)
+{
+	if (!max_segments) {
+		max_segments = 1;
+		printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
+	}
+
+	q->max_hw_segments = max_segments;
+}
+
+EXPORT_SYMBOL(blk_queue_max_hw_segments);
+
+/**
+ * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
+ * @q:  the request queue for the device
+ * @max_size:  max size of segment in bytes
+ *
+ * Description:
+ *    Enables a low level driver to set an upper limit on the size of a
+ *    coalesced segment
+ **/
+void blk_queue_max_segment_size(request_queue_t *q, unsigned int max_size)
+{
+	if (max_size < PAGE_CACHE_SIZE) {
+		max_size = PAGE_CACHE_SIZE;
+		printk("%s: set to minimum %d\n", __FUNCTION__, max_size);
+	}
+
+	q->max_segment_size = max_size;
+}
+
+EXPORT_SYMBOL(blk_queue_max_segment_size);
+
+/**
+ * blk_queue_hardsect_size - set hardware sector size for the queue
+ * @q:  the request queue for the device
+ * @size:  the hardware sector size, in bytes
+ *
+ * Description:
+ *   This should typically be set to the lowest possible sector size
+ *   that the hardware can operate on (possibly without resorting to
+ *   internal read-modify-write operations). Usually the default
+ *   of 512 covers most hardware.
+ **/
+void blk_queue_hardsect_size(request_queue_t *q, unsigned short size)
+{
+	q->hardsect_size = size;
+}
+
+EXPORT_SYMBOL(blk_queue_hardsect_size);
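+
+/*
+ * For example, CD-ROM drivers typically operate on 2KiB sectors:
+ *
+ *	blk_queue_hardsect_size(q, 2048);
+ */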
+
+/*
+ * Returns the minimum that is _not_ zero, unless both are zero.
+ */
+#define min_not_zero(l, r) ((l) == 0 ? (r) : ((r) == 0 ? (l) : min(l, r)))
+
+/**
+ * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
+ * @t:	the stacking driver (top)
+ * @b:  the underlying device (bottom)
+ **/
+void blk_queue_stack_limits(request_queue_t *t, request_queue_t *b)
+{
+	/* zero is "infinity" */
+	t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors);
+	t->max_hw_sectors = min_not_zero(t->max_hw_sectors,b->max_hw_sectors);
+
+	t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments);
+	t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments);
+	t->max_segment_size = min(t->max_segment_size,b->max_segment_size);
+	t->hardsect_size = max(t->hardsect_size,b->hardsect_size);
+	if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags))
+		clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags);
+}
+
+EXPORT_SYMBOL(blk_queue_stack_limits);
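+
+/*
+ * A stacking driver such as md or dm would typically call this once per
+ * underlying device while assembling the array or table, e.g.
+ * (component_bdev being the driver's handle on one member device):
+ *
+ *	blk_queue_stack_limits(t, bdev_get_queue(component_bdev));
+ */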
+
+/**
+ * blk_queue_segment_boundary - set boundary rules for segment merging
+ * @q:  the request queue for the device
+ * @mask:  the memory boundary mask
+ **/
+void blk_queue_segment_boundary(request_queue_t *q, unsigned long mask)
+{
+	if (mask < PAGE_CACHE_SIZE - 1) {
+		mask = PAGE_CACHE_SIZE - 1;
+		printk("%s: set to minimum %lx\n", __FUNCTION__, mask);
+	}
+
+	q->seg_boundary_mask = mask;
+}
+
+EXPORT_SYMBOL(blk_queue_segment_boundary);
+
+/**
+ * blk_queue_dma_alignment - set dma length and memory alignment
+ * @q:     the request queue for the device
+ * @mask:  alignment mask
+ *
+ * description:
+ *    set required memory and length alignment for direct dma transactions.
+ *    this is used when building direct io requests for the queue.
+ *
+ **/
+void blk_queue_dma_alignment(request_queue_t *q, int mask)
+{
+	q->dma_alignment = mask;
+}
+
+EXPORT_SYMBOL(blk_queue_dma_alignment);
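+
+/*
+ * The mask is alignment - 1.  For example, hardware that requires buffers
+ * aligned to 4 bytes would use:
+ *
+ *	blk_queue_dma_alignment(q, 3);
+ */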
+
+/**
+ * blk_queue_find_tag - find a request by its tag and queue
+ * @q:	 The request queue for the device
+ * @tag: The tag of the request
+ *
+ * Notes:
+ *    Should be used when a device returns a tag and you want to match
+ *    it with a request.
+ *
+ *    no locks need be held.
+ **/
+struct request *blk_queue_find_tag(request_queue_t *q, int tag)
+{
+	struct blk_queue_tag *bqt = q->queue_tags;
+
+	if (unlikely(bqt == NULL || tag >= bqt->real_max_depth))
+		return NULL;
+
+	return bqt->tag_index[tag];
+}
+
+EXPORT_SYMBOL(blk_queue_find_tag);
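+
+/*
+ * For example, a driver's completion interrupt, given only the hardware
+ * tag number, might recover the matching request with:
+ *
+ *	rq = blk_queue_find_tag(q, completed_tag);
+ *	if (rq)
+ *		... end-tag and complete the request ...
+ */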
+
+/**
+ * __blk_queue_free_tags - release tag maintenance info
+ * @q:  the request queue for the device
+ *
+ *  Notes:
+ *    blk_cleanup_queue() will take care of calling this function, if tagging
+ *    has been used. So there's no need to call this directly.
+ **/
+static void __blk_queue_free_tags(request_queue_t *q)
+{
+	struct blk_queue_tag *bqt = q->queue_tags;
+
+	if (!bqt)
+		return;
+
+	if (atomic_dec_and_test(&bqt->refcnt)) {
+		BUG_ON(bqt->busy);
+		BUG_ON(!list_empty(&bqt->busy_list));
+
+		kfree(bqt->tag_index);
+		bqt->tag_index = NULL;
+
+		kfree(bqt->tag_map);
+		bqt->tag_map = NULL;
+
+		kfree(bqt);
+	}
+
+	q->queue_tags = NULL;
+	q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED);
+}
+
+/**
+ * blk_queue_free_tags - release tag maintenance info
+ * @q:  the request queue for the device
+ *
+ *  Notes:
+ *	This is used to disable tagged queuing on a device, yet leave the
+ *	queue functioning.
+ **/
+void blk_queue_free_tags(request_queue_t *q)
+{
+	clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
+}
+
+EXPORT_SYMBOL(blk_queue_free_tags);
+
+static int
+init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth)
+{
+	struct request **tag_index;
+	unsigned long *tag_map;
+	int nr_ulongs;
+
+	if (depth > q->nr_requests * 2) {
+		depth = q->nr_requests * 2;
+		printk(KERN_ERR "%s: adjusted depth to %d\n",
+				__FUNCTION__, depth);
+	}
+
+	tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC);
+	if (!tag_index)
+		goto fail;
+
+	nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
+	tag_map = kmalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC);
+	if (!tag_map)
+		goto fail;
+
+	memset(tag_index, 0, depth * sizeof(struct request *));
+	memset(tag_map, 0, nr_ulongs * sizeof(unsigned long));
+	tags->real_max_depth = depth;
+	tags->max_depth = depth;
+	tags->tag_index = tag_index;
+	tags->tag_map = tag_map;
+
+	return 0;
+fail:
+	kfree(tag_index);
+	return -ENOMEM;
+}
+
+/**
+ * blk_queue_init_tags - initialize the queue tag info
+ * @q:  the request queue for the device
+ * @depth:  the maximum queue depth supported
+ * @tags: the tag to use
+ **/
+int blk_queue_init_tags(request_queue_t *q, int depth,
+			struct blk_queue_tag *tags)
+{
+	int rc;
+
+	BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
+
+	if (!tags && !q->queue_tags) {
+		tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
+		if (!tags)
+			goto fail;
+
+		if (init_tag_map(q, tags, depth))
+			goto fail;
+
+		INIT_LIST_HEAD(&tags->busy_list);
+		tags->busy = 0;
+		atomic_set(&tags->refcnt, 1);
+	} else if (q->queue_tags) {
+		if ((rc = blk_queue_resize_tags(q, depth)))
+			return rc;
+		set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
+		return 0;
+	} else
+		atomic_inc(&tags->refcnt);
+
+	/*
+	 * assign it, all done
+	 */
+	q->queue_tags = tags;
+	q->queue_flags |= (1 << QUEUE_FLAG_QUEUED);
+	return 0;
+fail:
+	kfree(tags);
+	return -ENOMEM;
+}
+
+EXPORT_SYMBOL(blk_queue_init_tags);
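+
+/*
+ * For example, a driver whose hardware supports a command queue depth of
+ * 64 might enable tagged queuing at probe time with something like:
+ *
+ *	if (blk_queue_init_tags(q, 64, NULL))
+ *		... allocation failed, fall back to untagged operation ...
+ */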
+
+/**
+ * blk_queue_resize_tags - change the queueing depth
+ * @q:  the request queue for the device
+ * @new_depth: the new max command queueing depth
+ *
+ *  Notes:
+ *    Must be called with the queue lock held.
+ **/
+int blk_queue_resize_tags(request_queue_t *q, int new_depth)
+{
+	struct blk_queue_tag *bqt = q->queue_tags;
+	struct request **tag_index;
+	unsigned long *tag_map;
+	int max_depth, nr_ulongs;
+
+	if (!bqt)
+		return -ENXIO;
+
+	/*
+	 * If we already have a large enough real_max_depth, just
+	 * adjust max_depth.  *NOTE* as requests with tag values
+	 * between new_depth and real_max_depth can be in flight, the
+	 * tag map cannot be shrunk blindly here.
+	 */
+	if (new_depth <= bqt->real_max_depth) {
+		bqt->max_depth = new_depth;
+		return 0;
+	}
+
+	/*
+	 * save the old state info, so we can copy it back
+	 */
+	tag_index = bqt->tag_index;
+	tag_map = bqt->tag_map;
+	max_depth = bqt->real_max_depth;
+
+	if (init_tag_map(q, bqt, new_depth))
+		return -ENOMEM;
+
+	memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
+	nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG;
+	memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long));
+
+	kfree(tag_index);
+	kfree(tag_map);
+	return 0;
+}
+
+EXPORT_SYMBOL(blk_queue_resize_tags);
+
+/**
+ * blk_queue_end_tag - end tag operations for a request
+ * @q:  the request queue for the device
+ * @rq: the request that has completed
+ *
+ *  Description:
+ *    Typically called when end_that_request_first() returns 0, meaning
+ *    all transfers have been done for a request. It's important to call
+ *    this function before end_that_request_last(), as that will put the
+ *    request back on the free list thus corrupting the internal tag list.
+ *
+ *  Notes:
+ *   queue lock must be held.
+ **/
+void blk_queue_end_tag(request_queue_t *q, struct request *rq)
+{
+	struct blk_queue_tag *bqt = q->queue_tags;
+	int tag = rq->tag;
+
+	BUG_ON(tag == -1);
+
+	if (unlikely(tag >= bqt->real_max_depth))
+		/*
+		 * This can happen after tag depth has been reduced.
+		 * FIXME: how about a warning or info message here?
+		 */
+		return;
+
+	if (unlikely(!__test_and_clear_bit(tag, bqt->tag_map))) {
+		printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n",
+		       __FUNCTION__, tag);
+		return;
+	}
+
+	list_del_init(&rq->queuelist);
+	rq->flags &= ~REQ_QUEUED;
+	rq->tag = -1;
+
+	if (unlikely(bqt->tag_index[tag] == NULL))
+		printk(KERN_ERR "%s: tag %d is missing\n",
+		       __FUNCTION__, tag);
+
+	bqt->tag_index[tag] = NULL;
+	bqt->busy--;
+}
+
+EXPORT_SYMBOL(blk_queue_end_tag);
+
+/**
+ * blk_queue_start_tag - find a free tag and assign it
+ * @q:  the request queue for the device
+ * @rq:  the block request that needs tagging
+ *
+ *  Description:
+ *    This can either be used as a stand-alone helper, or possibly be
+ *    assigned as the queue &prep_rq_fn (in which case &struct request
+ *    automagically gets a tag assigned). Note that this function
+ *    assumes that any type of request can be queued!  If this is not
+ *    true for your device, you must check the request type before
+ *    calling this function.  The request will also be removed from
+ *    the request queue, so it is the driver's responsibility to re-add
+ *    it if it should need to be restarted for some reason.
+ *
+ *  Notes:
+ *   queue lock must be held.
+ **/
+int blk_queue_start_tag(request_queue_t *q, struct request *rq)
+{
+	struct blk_queue_tag *bqt = q->queue_tags;
+	int tag;
+
+	if (unlikely((rq->flags & REQ_QUEUED))) {
+		printk(KERN_ERR 
+		       "%s: request %p for device [%s] already tagged %d",
+		       __FUNCTION__, rq,
+		       rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag);
+		BUG();
+	}
+
+	tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth);
+	if (tag >= bqt->max_depth)
+		return 1;
+
+	__set_bit(tag, bqt->tag_map);
+
+	rq->flags |= REQ_QUEUED;
+	rq->tag = tag;
+	bqt->tag_index[tag] = rq;
+	blkdev_dequeue_request(rq);
+	list_add(&rq->queuelist, &bqt->busy_list);
+	bqt->busy++;
+	return 0;
+}
+
+EXPORT_SYMBOL(blk_queue_start_tag);
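+
+/*
+ * A tag-aware request_fn might use this roughly as follows (illustrative):
+ *
+ *	while ((rq = elv_next_request(q)) != NULL) {
+ *		if (blk_queue_start_tag(q, rq))
+ *			break;	(out of tags, retry on next completion)
+ *		... hand rq to the hardware ...
+ *	}
+ */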
+
+/**
+ * blk_queue_invalidate_tags - invalidate all pending tags
+ * @q:  the request queue for the device
+ *
+ *  Description:
+ *   Hardware conditions may dictate a need to stop all pending requests.
+ *   In this case, we will safely clear the block side of the tag queue and
+ *   re-add all requests to the request queue in the right order.
+ *
+ *  Notes:
+ *   queue lock must be held.
+ **/
+void blk_queue_invalidate_tags(request_queue_t *q)
+{
+	struct blk_queue_tag *bqt = q->queue_tags;
+	struct list_head *tmp, *n;
+	struct request *rq;
+
+	list_for_each_safe(tmp, n, &bqt->busy_list) {
+		rq = list_entry_rq(tmp);
+
+		if (rq->tag == -1) {
+			printk(KERN_ERR
+			       "%s: bad tag found on list\n", __FUNCTION__);
+			list_del_init(&rq->queuelist);
+			rq->flags &= ~REQ_QUEUED;
+		} else
+			blk_queue_end_tag(q, rq);
+
+		rq->flags &= ~REQ_STARTED;
+		__elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
+	}
+}
+
+EXPORT_SYMBOL(blk_queue_invalidate_tags);
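+
+/*
+ * For example, a driver's bus- or host-reset handler might requeue all
+ * outstanding tagged commands with:
+ *
+ *	spin_lock_irqsave(q->queue_lock, flags);
+ *	blk_queue_invalidate_tags(q);
+ *	spin_unlock_irqrestore(q->queue_lock, flags);
+ */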
+
+static const char * const rq_flags[] = {
+	"REQ_RW",
+	"REQ_FAILFAST",
+	"REQ_SORTED",
+	"REQ_SOFTBARRIER",
+	"REQ_HARDBARRIER",
+	"REQ_FUA",
+	"REQ_CMD",
+	"REQ_NOMERGE",
+	"REQ_STARTED",
+	"REQ_DONTPREP",
+	"REQ_QUEUED",
+	"REQ_ELVPRIV",
+	"REQ_PC",
+	"REQ_BLOCK_PC",
+	"REQ_SENSE",
+	"REQ_FAILED",
+	"REQ_QUIET",
+	"REQ_SPECIAL",
+	"REQ_DRIVE_CMD",
+	"REQ_DRIVE_TASK",
+	"REQ_DRIVE_TASKFILE",
+	"REQ_PREEMPT",
+	"REQ_PM_SUSPEND",
+	"REQ_PM_RESUME",
+	"REQ_PM_SHUTDOWN",
+	"REQ_ORDERED_COLOR",
+};
+
+void blk_dump_rq_flags(struct request *rq, char *msg)
+{
+	int bit;
+
+	printk("%s: dev %s: flags = ", msg,
+		rq->rq_disk ? rq->rq_disk->disk_name : "?");
+	bit = 0;
+	do {
+		if (rq->flags & (1 << bit))
+			printk("%s ", rq_flags[bit]);
+		bit++;
+	} while (bit < __REQ_NR_BITS);
+
+	printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector,
+						       rq->nr_sectors,
+						       rq->current_nr_sectors);
+	printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len);
+
+	if (rq->flags & (REQ_BLOCK_PC | REQ_PC)) {
+		printk("cdb: ");
+		for (bit = 0; bit < sizeof(rq->cmd); bit++)
+			printk("%02x ", rq->cmd[bit]);
+		printk("\n");
+	}
+}
+
+EXPORT_SYMBOL(blk_dump_rq_flags);
+
+void blk_recount_segments(request_queue_t *q, struct bio *bio)
+{
+	struct bio_vec *bv, *bvprv = NULL;
+	int i, nr_phys_segs, nr_hw_segs, seg_size, hw_seg_size, cluster;
+	int high, highprv = 1;
+
+	if (unlikely(!bio->bi_io_vec))
+		return;
+
+	cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
+	hw_seg_size = seg_size = nr_phys_segs = nr_hw_segs = 0;
+	bio_for_each_segment(bv, bio, i) {
+		/*
+		 * the trick here is making sure that a high page is never
+		 * considered part of another segment, since that might
+		 * change with the bounce page.
+		 */
+		high = page_to_pfn(bv->bv_page) >= q->bounce_pfn;
+		if (high || highprv)
+			goto new_hw_segment;
+		if (cluster) {
+			if (seg_size + bv->bv_len > q->max_segment_size)
+				goto new_segment;
+			if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv))
+				goto new_segment;
+			if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
+				goto new_segment;
+			if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
+				goto new_hw_segment;
+
+			seg_size += bv->bv_len;
+			hw_seg_size += bv->bv_len;
+			bvprv = bv;
+			continue;
+		}
+new_segment:
+		if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) &&
+		    !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) {
+			hw_seg_size += bv->bv_len;
+		} else {
+new_hw_segment:
+			if (hw_seg_size > bio->bi_hw_front_size)
+				bio->bi_hw_front_size = hw_seg_size;
+			hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len;
+			nr_hw_segs++;
+		}
+
+		nr_phys_segs++;
+		bvprv = bv;
+		seg_size = bv->bv_len;
+		highprv = high;
+	}
+	if (hw_seg_size > bio->bi_hw_back_size)
+		bio->bi_hw_back_size = hw_seg_size;
+	if (nr_hw_segs == 1 && hw_seg_size > bio->bi_hw_front_size)
+		bio->bi_hw_front_size = hw_seg_size;
+	bio->bi_phys_segments = nr_phys_segs;
+	bio->bi_hw_segments = nr_hw_segs;
+	bio->bi_flags |= (1 << BIO_SEG_VALID);
+}
+
+
+static int blk_phys_contig_segment(request_queue_t *q, struct bio *bio,
+				   struct bio *nxt)
+{
+	if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER)))
+		return 0;
+
+	if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
+		return 0;
+	if (bio->bi_size + nxt->bi_size > q->max_segment_size)
+		return 0;
+
+	/*
+	 * bio and nxt are contiguous in memory; check if the queue allows
+	 * these two to be merged into one
+	 */
+	if (BIO_SEG_BOUNDARY(q, bio, nxt))
+		return 1;
+
+	return 0;
+}
+
+static int blk_hw_contig_segment(request_queue_t *q, struct bio *bio,
+				 struct bio *nxt)
+{
+	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
+		blk_recount_segments(q, bio);
+	if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID)))
+		blk_recount_segments(q, nxt);
+	if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) ||
+	    BIOVEC_VIRT_OVERSIZE(bio->bi_hw_front_size + bio->bi_hw_back_size))
+		return 0;
+	if (bio->bi_size + nxt->bi_size > q->max_segment_size)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * map a request to scatterlist, return number of sg entries set up. Caller
+ * must make sure sg can hold rq->nr_phys_segments entries
+ */
+int blk_rq_map_sg(request_queue_t *q, struct request *rq, struct scatterlist *sg)
+{
+	struct bio_vec *bvec, *bvprv;
+	struct bio *bio;
+	int nsegs, i, cluster;
+
+	nsegs = 0;
+	cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
+
+	/*
+	 * for each bio in rq
+	 */
+	bvprv = NULL;
+	rq_for_each_bio(bio, rq) {
+		/*
+		 * for each segment in bio
+		 */
+		bio_for_each_segment(bvec, bio, i) {
+			int nbytes = bvec->bv_len;
+
+			if (bvprv && cluster) {
+				if (sg[nsegs - 1].length + nbytes > q->max_segment_size)
+					goto new_segment;
+
+				if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
+					goto new_segment;
+				if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
+					goto new_segment;
+
+				sg[nsegs - 1].length += nbytes;
+			} else {
+new_segment:
+				memset(&sg[nsegs],0,sizeof(struct scatterlist));
+				sg[nsegs].page = bvec->bv_page;
+				sg[nsegs].length = nbytes;
+				sg[nsegs].offset = bvec->bv_offset;
+
+				nsegs++;
+			}
+			bvprv = bvec;
+		} /* segments in bio */
+	} /* bios in rq */
+
+	return nsegs;
+}
+
+EXPORT_SYMBOL(blk_rq_map_sg);
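+
+/*
+ * Typical use (illustrative): fill a driver-sized scatterlist and hand it
+ * to the DMA API:
+ *
+ *	nents = blk_rq_map_sg(q, rq, drv->sg_table);
+ *	nents = dma_map_sg(dev, drv->sg_table, nents, direction);
+ *
+ * where drv->sg_table (driver-supplied) must hold at least
+ * rq->nr_phys_segments entries.
+ */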
+
+/*
+ * the standard queue merge functions, can be overridden with device
+ * specific ones if so desired
+ */
+
+static inline int ll_new_mergeable(request_queue_t *q,
+				   struct request *req,
+				   struct bio *bio)
+{
+	int nr_phys_segs = bio_phys_segments(q, bio);
+
+	if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
+		req->flags |= REQ_NOMERGE;
+		if (req == q->last_merge)
+			q->last_merge = NULL;
+		return 0;
+	}
+
+	/*
+	 * A hw segment is just getting larger, bump just the phys
+	 * counter.
+	 */
+	req->nr_phys_segments += nr_phys_segs;
+	return 1;
+}
+
+static inline int ll_new_hw_segment(request_queue_t *q,
+				    struct request *req,
+				    struct bio *bio)
+{
+	int nr_hw_segs = bio_hw_segments(q, bio);
+	int nr_phys_segs = bio_phys_segments(q, bio);
+
+	if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments
+	    || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
+		req->flags |= REQ_NOMERGE;
+		if (req == q->last_merge)
+			q->last_merge = NULL;
+		return 0;
+	}
+
+	/*
+	 * This will form the start of a new hw segment.  Bump both
+	 * counters.
+	 */
+	req->nr_hw_segments += nr_hw_segs;
+	req->nr_phys_segments += nr_phys_segs;
+	return 1;
+}
+
+static int ll_back_merge_fn(request_queue_t *q, struct request *req, 
+			    struct bio *bio)
+{
+	unsigned short max_sectors;
+	int len;
+
+	if (unlikely(blk_pc_request(req)))
+		max_sectors = q->max_hw_sectors;
+	else
+		max_sectors = q->max_sectors;
+
+	if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
+		req->flags |= REQ_NOMERGE;
+		if (req == q->last_merge)
+			q->last_merge = NULL;
+		return 0;
+	}
+	if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID)))
+		blk_recount_segments(q, req->biotail);
+	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
+		blk_recount_segments(q, bio);
+	len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size;
+	if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) &&
+	    !BIOVEC_VIRT_OVERSIZE(len)) {
+		int mergeable =  ll_new_mergeable(q, req, bio);
+
+		if (mergeable) {
+			if (req->nr_hw_segments == 1)
+				req->bio->bi_hw_front_size = len;
+			if (bio->bi_hw_segments == 1)
+				bio->bi_hw_back_size = len;
+		}
+		return mergeable;
+	}
+
+	return ll_new_hw_segment(q, req, bio);
+}
+
+static int ll_front_merge_fn(request_queue_t *q, struct request *req, 
+			     struct bio *bio)
+{
+	unsigned short max_sectors;
+	int len;
+
+	if (unlikely(blk_pc_request(req)))
+		max_sectors = q->max_hw_sectors;
+	else
+		max_sectors = q->max_sectors;
+
+
+	if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
+		req->flags |= REQ_NOMERGE;
+		if (req == q->last_merge)
+			q->last_merge = NULL;
+		return 0;
+	}
+	len = bio->bi_hw_back_size + req->bio->bi_hw_front_size;
+	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
+		blk_recount_segments(q, bio);
+	if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID)))
+		blk_recount_segments(q, req->bio);
+	if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) &&
+	    !BIOVEC_VIRT_OVERSIZE(len)) {
+		int mergeable =  ll_new_mergeable(q, req, bio);
+
+		if (mergeable) {
+			if (bio->bi_hw_segments == 1)
+				bio->bi_hw_front_size = len;
+			if (req->nr_hw_segments == 1)
+				req->biotail->bi_hw_back_size = len;
+		}
+		return mergeable;
+	}
+
+	return ll_new_hw_segment(q, req, bio);
+}
+
+static int ll_merge_requests_fn(request_queue_t *q, struct request *req,
+				struct request *next)
+{
+	int total_phys_segments;
+	int total_hw_segments;
+
+	/*
+	 * First check if either of the requests is a re-queued
+	 * request.  We can't merge if so.
+	 */
+	if (req->special || next->special)
+		return 0;
+
+	/*
+	 * Will it become too large?
+	 */
+	if ((req->nr_sectors + next->nr_sectors) > q->max_sectors)
+		return 0;
+
+	total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
+	if (blk_phys_contig_segment(q, req->biotail, next->bio))
+		total_phys_segments--;
+
+	if (total_phys_segments > q->max_phys_segments)
+		return 0;
+
+	total_hw_segments = req->nr_hw_segments + next->nr_hw_segments;
+	if (blk_hw_contig_segment(q, req->biotail, next->bio)) {
+		int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size;
+		/*
+		 * propagate the combined length to the end of the requests
+		 */
+		if (req->nr_hw_segments == 1)
+			req->bio->bi_hw_front_size = len;
+		if (next->nr_hw_segments == 1)
+			next->biotail->bi_hw_back_size = len;
+		total_hw_segments--;
+	}
+
+	if (total_hw_segments > q->max_hw_segments)
+		return 0;
+
+	/* Merge is OK... */
+	req->nr_phys_segments = total_phys_segments;
+	req->nr_hw_segments = total_hw_segments;
+	return 1;
+}
+
+/*
+ * "plug" the device if there are no outstanding requests: this will
+ * force the transfer to start only after we have put all the requests
+ * on the list.
+ *
+ * This is called with interrupts off and no requests on the queue and
+ * with the queue lock held.
+ */
+void blk_plug_device(request_queue_t *q)
+{
+	WARN_ON(!irqs_disabled());
+
+	/*
+	 * don't plug a stopped queue, it must be paired with blk_start_queue()
+	 * which will restart the queueing
+	 */
+	if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
+		return;
+
+	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
+		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
+		blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
+	}
+}
+
+EXPORT_SYMBOL(blk_plug_device);
+
+/*
+ * remove the queue from the plugged list, if present. called with
+ * queue lock held and interrupts disabled.
+ */
+int blk_remove_plug(request_queue_t *q)
+{
+	WARN_ON(!irqs_disabled());
+
+	if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
+		return 0;
+
+	del_timer(&q->unplug_timer);
+	return 1;
+}
+
+EXPORT_SYMBOL(blk_remove_plug);
+
+/*
+ * remove the plug and let it rip..
+ */
+void __generic_unplug_device(request_queue_t *q)
+{
+	if (unlikely(test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)))
+		return;
+
+	if (!blk_remove_plug(q))
+		return;
+
+	q->request_fn(q);
+}
+EXPORT_SYMBOL(__generic_unplug_device);
+
+/**
+ * generic_unplug_device - fire a request queue
+ * @q:    The &request_queue_t in question
+ *
+ * Description:
+ *   Linux uses plugging to build bigger request queues before letting
+ *   the device have at them. If a queue is plugged, the I/O scheduler
+ *   is still adding and merging requests on the queue. Once the queue
+ *   gets unplugged, the request_fn defined for the queue is invoked and
+ *   transfers started.
+ **/
+void generic_unplug_device(request_queue_t *q)
+{
+	might_sleep();
+	spin_lock_irq(q->queue_lock);
+	__generic_unplug_device(q);
+	spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL(generic_unplug_device);
+
+static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
+				   struct page *page)
+{
+	request_queue_t *q = bdi->unplug_io_data;
+
+	/*
+	 * devices don't necessarily have an ->unplug_fn defined
+	 */
+	if (q->unplug_fn) {
+		blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+					q->rq.count[READ] + q->rq.count[WRITE]);
+
+		q->unplug_fn(q);
+	}
+}
+
+static void blk_unplug_work(void *data)
+{
+	request_queue_t *q = data;
+
+	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+				q->rq.count[READ] + q->rq.count[WRITE]);
+
+	q->unplug_fn(q);
+}
+
+static void blk_unplug_timeout(unsigned long data)
+{
+	request_queue_t *q = (request_queue_t *)data;
+
+	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
+				q->rq.count[READ] + q->rq.count[WRITE]);
+
+	kblockd_schedule_work(&q->unplug_work);
+}
+
+/**
+ * blk_start_queue - restart a previously stopped queue
+ * @q:    The &request_queue_t in question
+ *
+ * Description:
+ *   blk_start_queue() will clear the stop flag on the queue, and call
+ *   the request_fn for the queue if it was in a stopped state when
+ *   entered. Also see blk_stop_queue(). Queue lock must be held.
+ **/
+void blk_start_queue(request_queue_t *q)
+{
+	clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
+
+	/*
+	 * one level of recursion is ok and is much faster than kicking
+	 * the unplug handling
+	 */
+	if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
+		q->request_fn(q);
+		clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
+	} else {
+		blk_plug_device(q);
+		kblockd_schedule_work(&q->unplug_work);
+	}
+}
+
+EXPORT_SYMBOL(blk_start_queue);
+
+/**
+ * blk_stop_queue - stop a queue
+ * @q:    The &request_queue_t in question
+ *
+ * Description:
+ *   The Linux block layer assumes that a block driver will consume all
+ *   entries on the request queue when the request_fn strategy is called.
+ *   Often this will not happen, because of hardware limitations (queue
+ *   depth settings). If a device driver gets a 'queue full' response,
+ *   or if it simply chooses not to queue more I/O at one point, it can
+ *   call this function to prevent the request_fn from being called until
+ *   the driver has signalled it's ready to go again. This happens by calling
+ *   blk_start_queue() to restart queue operations. Queue lock must be held.
+ **/
+void blk_stop_queue(request_queue_t *q)
+{
+	blk_remove_plug(q);
+	set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
+}
+EXPORT_SYMBOL(blk_stop_queue);
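+
+/*
+ * For example (illustrative), with the queue lock held, a driver seeing a
+ * 'queue full' response from its hardware would do:
+ *
+ *	blk_stop_queue(q);
+ *
+ * and later, from its completion interrupt once room is available again:
+ *
+ *	blk_start_queue(q);
+ */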
+
+/**
+ * blk_sync_queue - cancel any pending callbacks on a queue
+ * @q: the queue
+ *
+ * Description:
+ *     The block layer may perform asynchronous callback activity
+ *     on a queue, such as calling the unplug function after a timeout.
+ *     A block device may call blk_sync_queue to ensure that any
+ *     such activity is cancelled, thus allowing it to release resources
+ *     that the callbacks might use. The caller must already have made sure
+ *     that its ->make_request_fn will not re-add plugging prior to calling
+ *     this function.
+ *
+ */
+void blk_sync_queue(struct request_queue *q)
+{
+	del_timer_sync(&q->unplug_timer);
+	kblockd_flush();
+}
+EXPORT_SYMBOL(blk_sync_queue);
+
+/**
+ * blk_run_queue - run a single device queue
+ * @q:	The queue to run
+ */
+void blk_run_queue(struct request_queue *q)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	blk_remove_plug(q);
+	if (!elv_queue_empty(q))
+		q->request_fn(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+EXPORT_SYMBOL(blk_run_queue);
+
+/**
+ * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed
+ * @q:    the request queue to be released
+ *
+ * Description:
+ *     blk_cleanup_queue is the pair to blk_init_queue() or
+ *     blk_queue_make_request().  It should be called when a request queue is
+ *     being released; typically when a block device is being de-registered.
+ *     Currently, its primary task it to free all the &struct request
+ *     structures that were allocated to the queue and the queue itself.
+ *
+ * Caveat:
+ *     Hopefully the low level driver will have finished any
+ *     outstanding requests first...
+ **/
+void blk_cleanup_queue(request_queue_t * q)
+{
+	struct request_list *rl = &q->rq;
+
+	if (!atomic_dec_and_test(&q->refcnt))
+		return;
+
+	if (q->elevator)
+		elevator_exit(q->elevator);
+
+	blk_sync_queue(q);
+
+	if (rl->rq_pool)
+		mempool_destroy(rl->rq_pool);
+
+	if (q->queue_tags)
+		__blk_queue_free_tags(q);
+
+	if (q->blk_trace)
+		blk_trace_shutdown(q);
+
+	kmem_cache_free(requestq_cachep, q);
+}
+
+EXPORT_SYMBOL(blk_cleanup_queue);
+
+static int blk_init_free_list(request_queue_t *q)
+{
+	struct request_list *rl = &q->rq;
+
+	rl->count[READ] = rl->count[WRITE] = 0;
+	rl->starved[READ] = rl->starved[WRITE] = 0;
+	rl->elvpriv = 0;
+	init_waitqueue_head(&rl->wait[READ]);
+	init_waitqueue_head(&rl->wait[WRITE]);
+
+	rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
+				mempool_free_slab, request_cachep, q->node);
+
+	if (!rl->rq_pool)
+		return -ENOMEM;
+
+	return 0;
+}
+
+request_queue_t *blk_alloc_queue(gfp_t gfp_mask)
+{
+	return blk_alloc_queue_node(gfp_mask, -1);
+}
+EXPORT_SYMBOL(blk_alloc_queue);
+
+request_queue_t *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
+{
+	request_queue_t *q;
+
+	q = kmem_cache_alloc_node(requestq_cachep, gfp_mask, node_id);
+	if (!q)
+		return NULL;
+
+	memset(q, 0, sizeof(*q));
+	init_timer(&q->unplug_timer);
+	atomic_set(&q->refcnt, 1);
+
+	q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
+	q->backing_dev_info.unplug_io_data = q;
+
+	return q;
+}
+EXPORT_SYMBOL(blk_alloc_queue_node);
+
+/**
+ * blk_init_queue  - prepare a request queue for use with a block device
+ * @rfn:  The function to be called to process requests that have been
+ *        placed on the queue.
+ * @lock: Request queue spin lock
+ *
+ * Description:
+ *    If a block device wishes to use the standard request handling procedures,
+ *    which sorts requests and coalesces adjacent requests, then it must
+ *    call blk_init_queue().  The function @rfn will be called when there
+ *    are requests on the queue that need to be processed.  If the device
+ *    supports plugging, then @rfn may not be called immediately when requests
+ *    are available on the queue, but may be called at some time later instead.
+ *    Plugged queues are generally unplugged when a buffer belonging to one
+ *    of the requests on the queue is needed, or due to memory pressure.
+ *
+ *    @rfn is not required, or even expected, to remove all requests off the
+ *    queue, but only as many as it can handle at a time.  If it does leave
+ *    requests on the queue, it is responsible for arranging that the requests
+ *    get dealt with eventually.
+ *
+ *    The queue spin lock must be held while manipulating the requests on the
+ *    request queue.
+ *
+ *    Function returns a pointer to the initialized request queue, or NULL if
+ *    it didn't succeed.
+ *
+ * Note:
+ *    blk_init_queue() must be paired with a blk_cleanup_queue() call
+ *    when the block device is deactivated (such as at module unload).
+ **/
+
+request_queue_t *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
+{
+	return blk_init_queue_node(rfn, lock, -1);
+}
+EXPORT_SYMBOL(blk_init_queue);
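+
+/*
+ * Typical initialization in a driver's probe path might look like this
+ * (my_request_fn and my_lock being driver-supplied):
+ *
+ *	static DEFINE_SPINLOCK(my_lock);
+ *
+ *	q = blk_init_queue(my_request_fn, &my_lock);
+ *	if (!q)
+ *		return -ENOMEM;
+ */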
+
+request_queue_t *
+blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
+{
+	request_queue_t *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
+
+	if (!q)
+		return NULL;
+
+	q->node = node_id;
+	if (blk_init_free_list(q))
+		goto out_init;
+
+	/*
+	 * if caller didn't supply a lock, they get per-queue locking with
+	 * our embedded lock
+	 */
+	if (!lock) {
+		spin_lock_init(&q->__queue_lock);
+		lock = &q->__queue_lock;
+	}
+
+	q->request_fn		= rfn;
+	q->back_merge_fn       	= ll_back_merge_fn;
+	q->front_merge_fn      	= ll_front_merge_fn;
+	q->merge_requests_fn	= ll_merge_requests_fn;
+	q->prep_rq_fn		= NULL;
+	q->unplug_fn		= generic_unplug_device;
+	q->queue_flags		= (1 << QUEUE_FLAG_CLUSTER);
+	q->queue_lock		= lock;
+
+	blk_queue_segment_boundary(q, 0xffffffff);
+
+	blk_queue_make_request(q, __make_request);
+	blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);
+
+	blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
+	blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
+
+	/*
+	 * all done
+	 */
+	if (!elevator_init(q, NULL)) {
+		blk_queue_congestion_threshold(q);
+		return q;
+	}
+
+	blk_cleanup_queue(q);
+out_init:
+	kmem_cache_free(requestq_cachep, q);
+	return NULL;
+}
+EXPORT_SYMBOL(blk_init_queue_node);
+
+int blk_get_queue(request_queue_t *q)
+{
+	if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
+		atomic_inc(&q->refcnt);
+		return 0;
+	}
+
+	return 1;
+}
+
+EXPORT_SYMBOL(blk_get_queue);
+
+static inline void blk_free_request(request_queue_t *q, struct request *rq)
+{
+	if (rq->flags & REQ_ELVPRIV)
+		elv_put_request(q, rq);
+	mempool_free(rq, q->rq.rq_pool);
+}
+
+static inline struct request *
+blk_alloc_request(request_queue_t *q, int rw, struct bio *bio,
+		  int priv, gfp_t gfp_mask)
+{
+	struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
+
+	if (!rq)
+		return NULL;
+
+	/*
+	 * first three bits are identical in rq->flags and bio->bi_rw,
+	 * see bio.h and blkdev.h
+	 */
+	rq->flags = rw;
+
+	if (priv) {
+		if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) {
+			mempool_free(rq, q->rq.rq_pool);
+			return NULL;
+		}
+		rq->flags |= REQ_ELVPRIV;
+	}
+
+	return rq;
+}
+
+/*
+ * ioc_batching returns true if the ioc is a valid batching request and
+ * should be given priority access to a request.
+ */
+static inline int ioc_batching(request_queue_t *q, struct io_context *ioc)
+{
+	if (!ioc)
+		return 0;
+
+	/*
+	 * Make sure the process is able to allocate at least 1 request
+	 * even if the batch times out, otherwise we could theoretically
+	 * lose wakeups.
+	 */
+	return ioc->nr_batch_requests == q->nr_batching ||
+		(ioc->nr_batch_requests > 0
+		&& time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
+}
+
+/*
+ * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
+ * will cause the process to be a "batcher" on all queues in the system. This
+ * is the behaviour we want though - once it gets a wakeup it should be given
+ * a nice run.
+ */
+static void ioc_set_batching(request_queue_t *q, struct io_context *ioc)
+{
+	if (!ioc || ioc_batching(q, ioc))
+		return;
+
+	ioc->nr_batch_requests = q->nr_batching;
+	ioc->last_waited = jiffies;
+}
+
+static void __freed_request(request_queue_t *q, int rw)
+{
+	struct request_list *rl = &q->rq;
+
+	if (rl->count[rw] < queue_congestion_off_threshold(q))
+		clear_queue_congested(q, rw);
+
+	if (rl->count[rw] + 1 <= q->nr_requests) {
+		if (waitqueue_active(&rl->wait[rw]))
+			wake_up(&rl->wait[rw]);
+
+		blk_clear_queue_full(q, rw);
+	}
+}
+
+/*
+ * A request has just been released.  Account for it, update the full and
+ * congestion status, wake up any waiters.   Called under q->queue_lock.
+ */
+static void freed_request(request_queue_t *q, int rw, int priv)
+{
+	struct request_list *rl = &q->rq;
+
+	rl->count[rw]--;
+	if (priv)
+		rl->elvpriv--;
+
+	__freed_request(q, rw);
+
+	if (unlikely(rl->starved[rw ^ 1]))
+		__freed_request(q, rw ^ 1);
+}
+
+#define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)
+/*
+ * Get a free request, queue_lock must be held.
+ * Returns NULL on failure, with queue_lock held.
+ * Returns !NULL on success, with queue_lock *not held*.
+ */
+static struct request *get_request(request_queue_t *q, int rw, struct bio *bio,
+				   gfp_t gfp_mask)
+{
+	struct request *rq = NULL;
+	struct request_list *rl = &q->rq;
+	struct io_context *ioc = NULL;
+	int may_queue, priv;
+
+	may_queue = elv_may_queue(q, rw, bio);
+	if (may_queue == ELV_MQUEUE_NO)
+		goto rq_starved;
+
+	if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) {
+		if (rl->count[rw]+1 >= q->nr_requests) {
+			ioc = current_io_context(GFP_ATOMIC);
+			/*
+			 * The queue will fill after this allocation, so set
+			 * it as full, and mark this process as "batching".
+			 * This process will be allowed to complete a batch of
+			 * requests, others will be blocked.
+			 */
+			if (!blk_queue_full(q, rw)) {
+				ioc_set_batching(q, ioc);
+				blk_set_queue_full(q, rw);
+			} else {
+				if (may_queue != ELV_MQUEUE_MUST
+						&& !ioc_batching(q, ioc)) {
+					/*
+					 * The queue is full and the allocating
+					 * process is not a "batcher", and not
+					 * exempted by the IO scheduler
+					 */
+					goto out;
+				}
+			}
+		}
+		set_queue_congested(q, rw);
+	}
+
+	/*
+	 * Only allow batching queuers to allocate up to 50% over the defined
+	 * limit of requests, otherwise we could have thousands of requests
+	 * allocated with any setting of ->nr_requests
+	 */
+	if (rl->count[rw] >= (3 * q->nr_requests / 2))
+		goto out;
+
+	rl->count[rw]++;
+	rl->starved[rw] = 0;
+
+	priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
+	if (priv)
+		rl->elvpriv++;
+
+	spin_unlock_irq(q->queue_lock);
+
+	rq = blk_alloc_request(q, rw, bio, priv, gfp_mask);
+	if (unlikely(!rq)) {
+		/*
+		 * Allocation failed presumably due to memory. Undo anything
+		 * we might have messed up.
+		 *
+		 * Allocating task should really be put onto the front of the
+		 * wait queue, but this is pretty rare.
+		 */
+		spin_lock_irq(q->queue_lock);
+		freed_request(q, rw, priv);
+
+		/*
+		 * in the very unlikely event that allocation failed and no
+		 * requests for this direction were pending, mark us starved
+		 * so that freeing of a request in the other direction will
+		 * notice us. another possible fix would be to split the
+		 * rq mempool into READ and WRITE
+		 */
+rq_starved:
+		if (unlikely(rl->count[rw] == 0))
+			rl->starved[rw] = 1;
+
+		goto out;
+	}
+
+	/*
+	 * ioc may be NULL here, and ioc_batching will be false. That's
+	 * OK, if the queue is under the request limit then requests need
+	 * not count toward the nr_batch_requests limit. There will always
+	 * be some limit enforced by BLK_BATCH_TIME.
+	 */
+	if (ioc_batching(q, ioc))
+		ioc->nr_batch_requests--;
+
+	rq_init(q, rq);
+	rq->rl = rl;
+
+	blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
+out:
+	return rq;
+}
+
+/*
+ * No available requests for this queue, unplug the device and wait for some
+ * requests to become available.
+ *
+ * Called with q->queue_lock held, and returns with it unlocked.
+ */
+static struct request *get_request_wait(request_queue_t *q, int rw,
+					struct bio *bio)
+{
+	struct request *rq;
+
+	rq = get_request(q, rw, bio, GFP_NOIO);
+	while (!rq) {
+		DEFINE_WAIT(wait);
+		struct request_list *rl = &q->rq;
+
+		prepare_to_wait_exclusive(&rl->wait[rw], &wait,
+				TASK_UNINTERRUPTIBLE);
+
+		rq = get_request(q, rw, bio, GFP_NOIO);
+
+		if (!rq) {
+			struct io_context *ioc;
+
+			blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
+
+			__generic_unplug_device(q);
+			spin_unlock_irq(q->queue_lock);
+			io_schedule();
+
+			/*
+			 * After sleeping, we become a "batching" process and
+			 * will be able to allocate at least one request, and
+			 * up to a big batch of them for a small period of time.
+			 * See ioc_batching, ioc_set_batching
+			 */
+			ioc = current_io_context(GFP_NOIO);
+			ioc_set_batching(q, ioc);
+
+			spin_lock_irq(q->queue_lock);
+		}
+		finish_wait(&rl->wait[rw], &wait);
+	}
+
+	return rq;
+}
+
+struct request *blk_get_request(request_queue_t *q, int rw, gfp_t gfp_mask)
+{
+	struct request *rq;
+
+	BUG_ON(rw != READ && rw != WRITE);
+
+	spin_lock_irq(q->queue_lock);
+	if (gfp_mask & __GFP_WAIT) {
+		rq = get_request_wait(q, rw, NULL);
+	} else {
+		rq = get_request(q, rw, NULL, gfp_mask);
+		if (!rq)
+			spin_unlock_irq(q->queue_lock);
+	}
+	/* q->queue_lock is unlocked at this point */
+
+	return rq;
+}
+EXPORT_SYMBOL(blk_get_request);
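
A short sketch of the allocate/free pairing; with __GFP_WAIT the call funnels into get_request_wait() above and will not return NULL (the queue pointer is assumed to be owned by the caller):

/* illustrative only, not part of the patch */
static int example_get_put(request_queue_t *q)
{
	struct request *rq = blk_get_request(q, READ, __GFP_WAIT);

	if (!rq)	/* defensive; cannot trigger with __GFP_WAIT */
		return -ENOMEM;

	/* ... fill in rq->cmd / rq->flags here ... */

	blk_put_request(rq);
	return 0;
}
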
+
+/**
+ * blk_requeue_request - put a request back on queue
+ * @q:		request queue where request should be inserted
+ * @rq:		request to be inserted
+ *
+ * Description:
+ *    Drivers often keep queueing requests until the hardware cannot accept
+ *    more, when that condition happens we need to put the request back
+ *    on the queue. Must be called with queue lock held.
+ */
+void blk_requeue_request(request_queue_t *q, struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+
+	if (blk_rq_tagged(rq))
+		blk_queue_end_tag(q, rq);
+
+	elv_requeue_request(q, rq);
+}
+
+EXPORT_SYMBOL(blk_requeue_request);
+
+/**
+ * blk_insert_request - insert a special request into a request queue
+ * @q:		request queue where request should be inserted
+ * @rq:		request to be inserted
+ * @at_head:	insert request at head or tail of queue
+ * @data:	private data
+ *
+ * Description:
+ *    Many block devices need to execute commands asynchronously, so they don't
+ *    block the whole kernel from preemption during request execution.  This is
+ *    normally accomplished by inserting artificial requests tagged as
+ *    REQ_SPECIAL into the corresponding request queue, and letting them be
+ *    scheduled for actual execution by the request queue.
+ *
+ *    We have the option of inserting at the head or the tail of the queue.
+ *    Typically we use the tail for new ioctls and so forth.  We use the head
+ *    of the queue for things like a QUEUE_FULL message from a device, or a
+ *    host that is unable to accept a particular command.
+ */
+void blk_insert_request(request_queue_t *q, struct request *rq,
+			int at_head, void *data)
+{
+	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
+	unsigned long flags;
+
+	/*
+	 * tell I/O scheduler that this isn't a regular read/write (ie it
+	 * must not attempt merges on this) and that it acts as a soft
+	 * barrier
+	 */
+	rq->flags |= REQ_SPECIAL | REQ_SOFTBARRIER;
+
+	rq->special = data;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+
+	/*
+	 * If command is tagged, release the tag
+	 */
+	if (blk_rq_tagged(rq))
+		blk_queue_end_tag(q, rq);
+
+	drive_stat_acct(rq, rq->nr_sectors, 1);
+	__elv_add_request(q, rq, where, 0);
+
+	if (blk_queue_plugged(q))
+		__generic_unplug_device(q);
+	else
+		q->request_fn(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+EXPORT_SYMBOL(blk_insert_request);
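
A sketch of the intended calling pattern, with cmd standing in for a hypothetical driver-private structure that comes back to the driver through rq->special:

/* illustrative only */
static int example_issue_special(request_queue_t *q, void *cmd)
{
	struct request *rq = blk_get_request(q, WRITE, __GFP_WAIT);

	if (!rq)
		return -ENOMEM;

	/* at_head = 0: queue behind already-pending requests */
	blk_insert_request(q, rq, 0, cmd);
	return 0;
}
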
+
+/**
+ * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage
+ * @q:		request queue where request should be inserted
+ * @rq:		request structure to fill
+ * @ubuf:	the user buffer
+ * @len:	length of user data
+ *
+ * Description:
+ *    Data will be mapped directly for zero copy io, if possible. Otherwise
+ *    a kernel bounce buffer is used.
+ *
+ *    A matching blk_rq_unmap_user() must be issued at the end of io, while
+ *    still in process context.
+ *
+ *    Note: The mapped bio may need to be bounced through blk_queue_bounce()
+ *    before being submitted to the device, as pages mapped may be out of
+ *    reach. It's the caller's responsibility to make sure this happens. The
+ *    original bio must be passed back in to blk_rq_unmap_user() for proper
+ *    unmapping.
+ */
+int blk_rq_map_user(request_queue_t *q, struct request *rq, void __user *ubuf,
+		    unsigned int len)
+{
+	unsigned long uaddr;
+	struct bio *bio;
+	int reading;
+
+	if (len > (q->max_hw_sectors << 9))
+		return -EINVAL;
+	if (!len || !ubuf)
+		return -EINVAL;
+
+	reading = rq_data_dir(rq) == READ;
+
+	/*
+	 * if alignment requirement is satisfied, map in user pages for
+	 * direct dma. else, set up kernel bounce buffers
+	 */
+	uaddr = (unsigned long) ubuf;
+	if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q)))
+		bio = bio_map_user(q, NULL, uaddr, len, reading);
+	else
+		bio = bio_copy_user(q, uaddr, len, reading);
+
+	if (!IS_ERR(bio)) {
+		rq->bio = rq->biotail = bio;
+		blk_rq_bio_prep(q, rq, bio);
+
+		rq->buffer = rq->data = NULL;
+		rq->data_len = len;
+		return 0;
+	}
+
+	/*
+	 * bio is the err-ptr
+	 */
+	return PTR_ERR(bio);
+}
+
+EXPORT_SYMBOL(blk_rq_map_user);
+
+/**
+ * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage
+ * @q:		request queue where request should be inserted
+ * @rq:		request to map data to
+ * @iov:	pointer to the iovec
+ * @iov_count:	number of elements in the iovec
+ *
+ * Description:
+ *    Data will be mapped directly for zero copy io, if possible. Otherwise
+ *    a kernel bounce buffer is used.
+ *
+ *    A matching blk_rq_unmap_user() must be issued at the end of io, while
+ *    still in process context.
+ *
+ *    Note: The mapped bio may need to be bounced through blk_queue_bounce()
+ *    before being submitted to the device, as pages mapped may be out of
+ *    reach. It's the caller's responsibility to make sure this happens. The
+ *    original bio must be passed back in to blk_rq_unmap_user() for proper
+ *    unmapping.
+ */
+int blk_rq_map_user_iov(request_queue_t *q, struct request *rq,
+			struct sg_iovec *iov, int iov_count)
+{
+	struct bio *bio;
+
+	if (!iov || iov_count <= 0)
+		return -EINVAL;
+
+	/* we don't allow misaligned data like bio_map_user() does.  If the
+	 * user is using sg, they're expected to know the alignment constraints
+	 * and respect them accordingly */
+	bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ);
+	if (IS_ERR(bio))
+		return PTR_ERR(bio);
+
+	rq->bio = rq->biotail = bio;
+	blk_rq_bio_prep(q, rq, bio);
+	rq->buffer = rq->data = NULL;
+	rq->data_len = bio->bi_size;
+	return 0;
+}
+
+EXPORT_SYMBOL(blk_rq_map_user_iov);
+
+/**
+ * blk_rq_unmap_user - unmap a request with user data
+ * @bio:	bio to be unmapped
+ * @ulen:	length of user buffer
+ *
+ * Description:
+ *    Unmap a bio previously mapped by blk_rq_map_user().
+ */
+int blk_rq_unmap_user(struct bio *bio, unsigned int ulen)
+{
+	int ret = 0;
+
+	if (bio) {
+		if (bio_flagged(bio, BIO_USER_MAPPED))
+			bio_unmap_user(bio);
+		else
+			ret = bio_uncopy_user(bio);
+	}
+
+	return ret;
+}
+
+EXPORT_SYMBOL(blk_rq_unmap_user);
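
Putting the halves together, a hedged sketch of the map/execute/unmap cycle for a packet command, in the style of the SCSI ioctl paths (queue, disk, and user buffer are assumed to come from an ioctl handler; blk_execute_rq() is defined further down in this file):

/* illustrative only */
static int example_pc_read(request_queue_t *q, struct gendisk *disk,
			   void __user *ubuf, unsigned int len)
{
	struct request *rq = blk_get_request(q, READ, __GFP_WAIT);
	struct bio *bio;
	int err;

	if (!rq)
		return -ENOMEM;

	rq->flags |= REQ_BLOCK_PC;
	/* ... fill in rq->cmd[] with the packet command ... */

	err = blk_rq_map_user(q, rq, ubuf, len);
	if (!err) {
		bio = rq->bio;	/* saved for unmapping */
		err = blk_execute_rq(q, disk, rq, 0);
		blk_rq_unmap_user(bio, len);
	}
	blk_put_request(rq);
	return err;
}
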
+
+/**
+ * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage
+ * @q:		request queue where request should be inserted
+ * @rq:		request to fill
+ * @kbuf:	the kernel buffer
+ * @len:	length of user data
+ * @gfp_mask:	memory allocation flags
+ */
+int blk_rq_map_kern(request_queue_t *q, struct request *rq, void *kbuf,
+		    unsigned int len, gfp_t gfp_mask)
+{
+	struct bio *bio;
+
+	if (len > (q->max_hw_sectors << 9))
+		return -EINVAL;
+	if (!len || !kbuf)
+		return -EINVAL;
+
+	bio = bio_map_kern(q, kbuf, len, gfp_mask);
+	if (IS_ERR(bio))
+		return PTR_ERR(bio);
+
+	if (rq_data_dir(rq) == WRITE)
+		bio->bi_rw |= (1 << BIO_RW);
+
+	rq->bio = rq->biotail = bio;
+	blk_rq_bio_prep(q, rq, bio);
+
+	rq->buffer = rq->data = NULL;
+	rq->data_len = len;
+	return 0;
+}
+
+EXPORT_SYMBOL(blk_rq_map_kern);
+
+/**
+ * blk_execute_rq_nowait - insert a request into queue for execution
+ * @q:		queue to insert the request in
+ * @bd_disk:	matching gendisk
+ * @rq:		request to insert
+ * @at_head:    insert request at head or tail of queue
+ * @done:	I/O completion handler
+ *
+ * Description:
+ *    Insert a fully prepared request at the back of the io scheduler queue
+ *    for execution.  Don't wait for completion.
+ */
+void blk_execute_rq_nowait(request_queue_t *q, struct gendisk *bd_disk,
+			   struct request *rq, int at_head,
+			   rq_end_io_fn *done)
+{
+	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
+
+	rq->rq_disk = bd_disk;
+	rq->flags |= REQ_NOMERGE;
+	rq->end_io = done;
+	WARN_ON(irqs_disabled());
+	spin_lock_irq(q->queue_lock);
+	__elv_add_request(q, rq, where, 1);
+	__generic_unplug_device(q);
+	spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
+
+/**
+ * blk_execute_rq - insert a request into queue for execution
+ * @q:		queue to insert the request in
+ * @bd_disk:	matching gendisk
+ * @rq:		request to insert
+ * @at_head:    insert request at head or tail of queue
+ *
+ * Description:
+ *    Insert a fully prepared request at the back of the io scheduler queue
+ *    for execution and wait for completion.
+ */
+int blk_execute_rq(request_queue_t *q, struct gendisk *bd_disk,
+		   struct request *rq, int at_head)
+{
+	DECLARE_COMPLETION(wait);
+	char sense[SCSI_SENSE_BUFFERSIZE];
+	int err = 0;
+
+	/*
+	 * we need an extra reference to the request, so we can look at
+	 * it after io completion
+	 */
+	rq->ref_count++;
+
+	if (!rq->sense) {
+		memset(sense, 0, sizeof(sense));
+		rq->sense = sense;
+		rq->sense_len = 0;
+	}
+
+	rq->waiting = &wait;
+	blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
+	wait_for_completion(&wait);
+	rq->waiting = NULL;
+
+	if (rq->errors)
+		err = -EIO;
+
+	return err;
+}
+
+EXPORT_SYMBOL(blk_execute_rq);
+
+/**
+ * blkdev_issue_flush - queue a flush
+ * @bdev:	blockdev to issue flush for
+ * @error_sector:	error sector
+ *
+ * Description:
+ *    Issue a flush for the block device in question. Caller can supply
+ *    room for storing the error offset in case of a flush error, if they
+ *    wish to.  Caller must run wait_for_completion() on its own.
+ */
+int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
+{
+	request_queue_t *q;
+
+	if (bdev->bd_disk == NULL)
+		return -ENXIO;
+
+	q = bdev_get_queue(bdev);
+	if (!q)
+		return -ENXIO;
+	if (!q->issue_flush_fn)
+		return -EOPNOTSUPP;
+
+	return q->issue_flush_fn(q, bdev->bd_disk, error_sector);
+}
+
+EXPORT_SYMBOL(blkdev_issue_flush);
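
A small sketch of the caller side, assuming bdev is an open block device; -EOPNOTSUPP just means the queue has no issue_flush_fn and is normally treated as success:

/* illustrative only */
static int example_flush_cache(struct block_device *bdev)
{
	sector_t error_sector;
	int ret = blkdev_issue_flush(bdev, &error_sector);

	if (ret == -EOPNOTSUPP)
		ret = 0;	/* no flush support: nothing to sync */
	return ret;
}
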
+
+static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io)
+{
+	int rw = rq_data_dir(rq);
+
+	if (!blk_fs_request(rq) || !rq->rq_disk)
+		return;
+
+	if (!new_io) {
+		__disk_stat_inc(rq->rq_disk, merges[rw]);
+	} else {
+		disk_round_stats(rq->rq_disk);
+		rq->rq_disk->in_flight++;
+	}
+}
+
+/*
+ * add-request adds a request to the linked list.
+ * queue lock is held and interrupts disabled, as we muck with the
+ * request queue list.
+ */
+static inline void add_request(request_queue_t * q, struct request * req)
+{
+	drive_stat_acct(req, req->nr_sectors, 1);
+
+	if (q->activity_fn)
+		q->activity_fn(q->activity_data, rq_data_dir(req));
+
+	/*
+	 * elevator indicated where it wants this request to be
+	 * inserted at elevator_merge time
+	 */
+	__elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
+}
+
+/*
+ * disk_round_stats()	- Round off the performance stats on a struct
+ * disk_stats.
+ *
+ * The average IO queue length and utilisation statistics are maintained
+ * by observing the current state of the queue length and the amount of
+ * time it has been in this state.
+ *
+ * Normally, that accounting is done on IO completion, but that can result
+ * in more than a second's worth of IO being accounted for within any one
+ * second, leading to >100% utilisation.  To deal with that, we call this
+ * function to do a round-off before returning the results when reading
+ * /proc/diskstats.  This accounts immediately for all queue usage up to
+ * the current jiffies and restarts the counters again.
+ */
+void disk_round_stats(struct gendisk *disk)
+{
+	unsigned long now = jiffies;
+
+	if (now == disk->stamp)
+		return;
+
+	if (disk->in_flight) {
+		__disk_stat_add(disk, time_in_queue,
+				disk->in_flight * (now - disk->stamp));
+		__disk_stat_add(disk, io_ticks, (now - disk->stamp));
+	}
+	disk->stamp = now;
+}
+
+EXPORT_SYMBOL_GPL(disk_round_stats);
+
+/*
+ * queue lock must be held
+ */
+void __blk_put_request(request_queue_t *q, struct request *req)
+{
+	struct request_list *rl = req->rl;
+
+	if (unlikely(!q))
+		return;
+	if (unlikely(--req->ref_count))
+		return;
+
+	elv_completed_request(q, req);
+
+	req->rq_status = RQ_INACTIVE;
+	req->rl = NULL;
+
+	/*
+	 * Request may not have originated from ll_rw_blk. if not,
+	 * it didn't come out of our reserved rq pools
+	 */
+	if (rl) {
+		int rw = rq_data_dir(req);
+		int priv = req->flags & REQ_ELVPRIV;
+
+		BUG_ON(!list_empty(&req->queuelist));
+
+		blk_free_request(q, req);
+		freed_request(q, rw, priv);
+	}
+}
+
+EXPORT_SYMBOL_GPL(__blk_put_request);
+
+void blk_put_request(struct request *req)
+{
+	unsigned long flags;
+	request_queue_t *q = req->q;
+
+	/*
+	 * Gee, IDE calls in w/ NULL q.  Fix IDE and remove the
+	 * following if (q) test.
+	 */
+	if (q) {
+		spin_lock_irqsave(q->queue_lock, flags);
+		__blk_put_request(q, req);
+		spin_unlock_irqrestore(q->queue_lock, flags);
+	}
+}
+
+EXPORT_SYMBOL(blk_put_request);
+
+/**
+ * blk_end_sync_rq - executes a completion event on a request
+ * @rq: request to complete
+ * @error: end io status of the request
+ */
+void blk_end_sync_rq(struct request *rq, int error)
+{
+	struct completion *waiting = rq->waiting;
+
+	rq->waiting = NULL;
+	__blk_put_request(rq->q, rq);
+
+	/*
+	 * complete last, if this is a stack request the process (and thus
+	 * the rq pointer) could be invalid right after this complete()
+	 */
+	complete(waiting);
+}
+EXPORT_SYMBOL(blk_end_sync_rq);
+
+/**
+ * blk_congestion_wait - wait for a queue to become uncongested
+ * @rw: READ or WRITE
+ * @timeout: timeout in jiffies
+ *
+ * Waits for up to @timeout jiffies for a queue (any queue) to exit congestion.
+ * If no queues are congested then just wait for the next request to be
+ * returned.
+ */
+long blk_congestion_wait(int rw, long timeout)
+{
+	long ret;
+	DEFINE_WAIT(wait);
+	wait_queue_head_t *wqh = &congestion_wqh[rw];
+
+	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+	ret = io_schedule_timeout(timeout);
+	finish_wait(wqh, &wait);
+	return ret;
+}
+
+EXPORT_SYMBOL(blk_congestion_wait);
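
A sketch of the writeback-style throttling loop this helper exists for; both predicates are hypothetical stand-ins for the caller's own progress checks:

static int example_pages_dirty(void);		/* hypothetical */
static int example_write_some_pages(void);	/* hypothetical */

static void example_throttled_writeout(void)
{
	while (example_pages_dirty()) {
		if (example_write_some_pages())
			continue;
		/* no progress: back off up to 50ms, or until some
		 * queue leaves congestion */
		blk_congestion_wait(WRITE, HZ / 20);
	}
}
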
+
+/*
+ * Has to be called with the request spinlock acquired
+ */
+static int attempt_merge(request_queue_t *q, struct request *req,
+			  struct request *next)
+{
+	if (!rq_mergeable(req) || !rq_mergeable(next))
+		return 0;
+
+	/*
+	 * not contiguous
+	 */
+	if (req->sector + req->nr_sectors != next->sector)
+		return 0;
+
+	if (rq_data_dir(req) != rq_data_dir(next)
+	    || req->rq_disk != next->rq_disk
+	    || next->waiting || next->special)
+		return 0;
+
+	/*
+	 * If we are allowed to merge, then append bio list
+	 * from next to rq and release next. merge_requests_fn
+	 * will have updated segment counts, update sector
+	 * counts here.
+	 */
+	if (!q->merge_requests_fn(q, req, next))
+		return 0;
+
+	/*
+	 * At this point we have either done a back merge
+	 * or front merge. We need the smaller start_time of
+	 * the merged requests to be the current request
+	 * for accounting purposes.
+	 */
+	if (time_after(req->start_time, next->start_time))
+		req->start_time = next->start_time;
+
+	req->biotail->bi_next = next->bio;
+	req->biotail = next->biotail;
+
+	req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
+
+	elv_merge_requests(q, req, next);
+
+	if (req->rq_disk) {
+		disk_round_stats(req->rq_disk);
+		req->rq_disk->in_flight--;
+	}
+
+	req->ioprio = ioprio_best(req->ioprio, next->ioprio);
+
+	__blk_put_request(q, next);
+	return 1;
+}
+
+static inline int attempt_back_merge(request_queue_t *q, struct request *rq)
+{
+	struct request *next = elv_latter_request(q, rq);
+
+	if (next)
+		return attempt_merge(q, rq, next);
+
+	return 0;
+}
+
+static inline int attempt_front_merge(request_queue_t *q, struct request *rq)
+{
+	struct request *prev = elv_former_request(q, rq);
+
+	if (prev)
+		return attempt_merge(q, prev, rq);
+
+	return 0;
+}
+
+static void init_request_from_bio(struct request *req, struct bio *bio)
+{
+	req->flags |= REQ_CMD;
+
+	/*
+	 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
+	 */
+	if (bio_rw_ahead(bio) || bio_failfast(bio))
+		req->flags |= REQ_FAILFAST;
+
+	/*
+	 * REQ_BARRIER implies no merging, but lets make it explicit
+	 */
+	if (unlikely(bio_barrier(bio)))
+		req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
+
+	req->errors = 0;
+	req->hard_sector = req->sector = bio->bi_sector;
+	req->hard_nr_sectors = req->nr_sectors = bio_sectors(bio);
+	req->current_nr_sectors = req->hard_cur_sectors = bio_cur_sectors(bio);
+	req->nr_phys_segments = bio_phys_segments(req->q, bio);
+	req->nr_hw_segments = bio_hw_segments(req->q, bio);
+	req->buffer = bio_data(bio);	/* see ->buffer comment above */
+	req->waiting = NULL;
+	req->bio = req->biotail = bio;
+	req->ioprio = bio_prio(bio);
+	req->rq_disk = bio->bi_bdev->bd_disk;
+	req->start_time = jiffies;
+}
+
+static int __make_request(request_queue_t *q, struct bio *bio)
+{
+	struct request *req;
+	int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync;
+	unsigned short prio;
+	sector_t sector;
+
+	sector = bio->bi_sector;
+	nr_sectors = bio_sectors(bio);
+	cur_nr_sectors = bio_cur_sectors(bio);
+	prio = bio_prio(bio);
+
+	rw = bio_data_dir(bio);
+	sync = bio_sync(bio);
+
+	/*
+	 * low level driver can indicate that it wants pages above a
+	 * certain limit bounced to low memory (ie for highmem, or even
+	 * ISA dma in theory)
+	 */
+	blk_queue_bounce(q, &bio);
+
+	spin_lock_prefetch(q->queue_lock);
+
+	barrier = bio_barrier(bio);
+	if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {
+		err = -EOPNOTSUPP;
+		goto end_io;
+	}
+
+	spin_lock_irq(q->queue_lock);
+
+	if (unlikely(barrier) || elv_queue_empty(q))
+		goto get_rq;
+
+	el_ret = elv_merge(q, &req, bio);
+	switch (el_ret) {
+		case ELEVATOR_BACK_MERGE:
+			BUG_ON(!rq_mergeable(req));
+
+			if (!q->back_merge_fn(q, req, bio))
+				break;
+
+			blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+
+			req->biotail->bi_next = bio;
+			req->biotail = bio;
+			req->nr_sectors = req->hard_nr_sectors += nr_sectors;
+			req->ioprio = ioprio_best(req->ioprio, prio);
+			drive_stat_acct(req, nr_sectors, 0);
+			if (!attempt_back_merge(q, req))
+				elv_merged_request(q, req);
+			goto out;
+
+		case ELEVATOR_FRONT_MERGE:
+			BUG_ON(!rq_mergeable(req));
+
+			if (!q->front_merge_fn(q, req, bio))
+				break;
+
+			blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+
+			bio->bi_next = req->bio;
+			req->bio = bio;
+
+			/*
+			 * may not be valid. if the low level driver said
+			 * it didn't need a bounce buffer then it better
+			 * not touch req->buffer either...
+			 */
+			req->buffer = bio_data(bio);
+			req->current_nr_sectors = cur_nr_sectors;
+			req->hard_cur_sectors = cur_nr_sectors;
+			req->sector = req->hard_sector = sector;
+			req->nr_sectors = req->hard_nr_sectors += nr_sectors;
+			req->ioprio = ioprio_best(req->ioprio, prio);
+			drive_stat_acct(req, nr_sectors, 0);
+			if (!attempt_front_merge(q, req))
+				elv_merged_request(q, req);
+			goto out;
+
+		/* ELV_NO_MERGE: elevator says don't/can't merge. */
+		default:
+			;
+	}
+
+get_rq:
+	/*
+	 * Grab a free request. This might sleep but cannot fail.
+	 * Returns with the queue unlocked.
+	 */
+	req = get_request_wait(q, rw, bio);
+
+	/*
+	 * After dropping the lock and possibly sleeping here, our request
+	 * may now be mergeable after it had proven unmergeable (above).
+	 * We don't worry about that case for efficiency. It won't happen
+	 * often, and the elevators are able to handle it.
+	 */
+	init_request_from_bio(req, bio);
+
+	spin_lock_irq(q->queue_lock);
+	if (elv_queue_empty(q))
+		blk_plug_device(q);
+	add_request(q, req);
+out:
+	if (sync)
+		__generic_unplug_device(q);
+
+	spin_unlock_irq(q->queue_lock);
+	return 0;
+
+end_io:
+	bio_endio(bio, nr_sectors << 9, err);
+	return 0;
+}
+
+/*
+ * If bio->bi_bdev is a partition, remap the location
+ */
+static inline void blk_partition_remap(struct bio *bio)
+{
+	struct block_device *bdev = bio->bi_bdev;
+
+	if (bdev != bdev->bd_contains) {
+		struct hd_struct *p = bdev->bd_part;
+		const int rw = bio_data_dir(bio);
+
+		p->sectors[rw] += bio_sectors(bio);
+		p->ios[rw]++;
+
+		bio->bi_sector += p->start_sect;
+		bio->bi_bdev = bdev->bd_contains;
+	}
+}
+
+static void handle_bad_sector(struct bio *bio)
+{
+	char b[BDEVNAME_SIZE];
+
+	printk(KERN_INFO "attempt to access beyond end of device\n");
+	printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
+			bdevname(bio->bi_bdev, b),
+			bio->bi_rw,
+			(unsigned long long)bio->bi_sector + bio_sectors(bio),
+			(long long)(bio->bi_bdev->bd_inode->i_size >> 9));
+
+	set_bit(BIO_EOF, &bio->bi_flags);
+}
+
+/**
+ * generic_make_request: hand a buffer to its device driver for I/O
+ * @bio:  The bio describing the location in memory and on the device.
+ *
+ * generic_make_request() is used to make I/O requests of block
+ * devices. It is passed a &struct bio, which describes the I/O that needs
+ * to be done.
+ *
+ * generic_make_request() does not return any status.  The
+ * success/failure status of the request, along with notification of
+ * completion, is delivered asynchronously through the bio->bi_end_io
+ * function described (one day) elsewhere.
+ *
+ * The caller of generic_make_request must make sure that bi_io_vec
+ * is set to describe the memory buffer, that bi_bdev and bi_sector are
+ * set to describe the device address, and that bi_end_io and
+ * optionally bi_private are set to describe how
+ * completion notification should be signaled.
+ *
+ * generic_make_request and the drivers it calls may use bi_next if this
+ * bio happens to be merged with someone else, and may change bi_bdev and
+ * bi_sector for remaps as it sees fit.  So the values of these fields
+ * should NOT be depended on after the call to generic_make_request.
+ */
+static inline void __generic_make_request(struct bio *bio)
+{
+	request_queue_t *q;
+	sector_t maxsector;
+	int ret, nr_sectors = bio_sectors(bio);
+	dev_t old_dev;
+
+	might_sleep();
+	/* Test device or partition size, when known. */
+	maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
+	if (maxsector) {
+		sector_t sector = bio->bi_sector;
+
+		if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
+			/*
+			 * This may well happen - the kernel calls bread()
+			 * without checking the size of the device, e.g., when
+			 * mounting a device.
+			 */
+			handle_bad_sector(bio);
+			goto end_io;
+		}
+	}
+
+	/*
+	 * Resolve the mapping until finished. (drivers are
+	 * still free to implement/resolve their own stacking
+	 * by explicitly returning 0)
+	 *
+	 * NOTE: we don't repeat the blk_size check for each new device.
+	 * Stacking drivers are expected to know what they are doing.
+	 */
+	maxsector = -1;
+	old_dev = 0;
+	do {
+		char b[BDEVNAME_SIZE];
+
+		q = bdev_get_queue(bio->bi_bdev);
+		if (!q) {
+			printk(KERN_ERR
+			       "generic_make_request: Trying to access "
+				"nonexistent block-device %s (%Lu)\n",
+				bdevname(bio->bi_bdev, b),
+				(long long) bio->bi_sector);
+end_io:
+			bio_endio(bio, bio->bi_size, -EIO);
+			break;
+		}
+
+		if (unlikely(bio_sectors(bio) > q->max_hw_sectors)) {
+			printk("bio too big for device %s (%u > %u)\n",
+				bdevname(bio->bi_bdev, b),
+				bio_sectors(bio),
+				q->max_hw_sectors);
+			goto end_io;
+		}
+
+		if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+			goto end_io;
+
+		/*
+		 * If this device has partitions, remap block n
+		 * of partition p to block n+start(p) of the disk.
+		 */
+		blk_partition_remap(bio);
+
+		if (maxsector != -1)
+			blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, 
+					    maxsector);
+
+		blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+
+		maxsector = bio->bi_sector;
+		old_dev = bio->bi_bdev->bd_dev;
+
+		ret = q->make_request_fn(q, bio);
+	} while (ret);
+}
+
+/*
+ * We only want one ->make_request_fn to be active at a time,
+ * else stack usage with stacked devices could be a problem.
+ * So use current->bio_{list,tail} to keep a list of requests
+ * submitted by a make_request_fn function.
+ * current->bio_tail is also used as a flag to say if
+ * generic_make_request is currently active in this task or not.
+ * If it is NULL, then no make_request is active.  If it is non-NULL,
+ * then a make_request is active, and new requests should be added
+ * at the tail
+ */
+void generic_make_request(struct bio *bio)
+{
+	if (current->bio_tail) {
+		/* make_request is active */
+		*(current->bio_tail) = bio;
+		bio->bi_next = NULL;
+		current->bio_tail = &bio->bi_next;
+		return;
+	}
+	/* following loop may be a bit non-obvious, and so deserves some
+	 * explanation.
+	 * Before entering the loop, bio->bi_next is NULL (as all callers
+	 * ensure that) so we have a list with a single bio.
+	 * We pretend that we have just taken it off a longer list, so
+	 * we assign bio_list to the next (which is NULL) and bio_tail
+	 * to &bio_list, thus initialising the bio_list of new bios to be
+	 * added.  __generic_make_request may indeed add some more bios
+	 * through a recursive call to generic_make_request.  If it
+	 * did, we find a non-NULL value in bio_list and re-enter the loop
+	 * from the top.  In this case we really did just take the bio
+ * off the top of the list (no pretending) and so fix up bio_list and
+	 * bio_tail or bi_next, and call into __generic_make_request again.
+	 *
+	 * The loop was structured like this to make only one call to
+	 * __generic_make_request (which is important as it is large and
+	 * inlined) and to keep the structure simple.
+	 */
+	BUG_ON(bio->bi_next);
+	do {
+		current->bio_list = bio->bi_next;
+		if (bio->bi_next == NULL)
+			current->bio_tail = &current->bio_list;
+		else
+			bio->bi_next = NULL;
+		__generic_make_request(bio);
+		bio = current->bio_list;
+	} while (bio);
+	current->bio_tail = NULL; /* deactivate */
+}
+
+EXPORT_SYMBOL(generic_make_request);
+
+/**
+ * submit_bio: submit a bio to the block device layer for I/O
+ * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
+ * @bio: The &struct bio which describes the I/O
+ *
+ * submit_bio() is very similar in purpose to generic_make_request(), and
+ * uses that function to do most of the work. Both are fairly rough
+ * interfaces; @bio must be set up and ready for I/O.
+ *
+ */
+void submit_bio(int rw, struct bio *bio)
+{
+	int count = bio_sectors(bio);
+
+	BIO_BUG_ON(!bio->bi_size);
+	BIO_BUG_ON(!bio->bi_io_vec);
+	bio->bi_rw |= rw;
+	if (rw & WRITE)
+		mod_page_state(pgpgout, count);
+	else
+		mod_page_state(pgpgin, count);
+
+	if (unlikely(block_dump)) {
+		char b[BDEVNAME_SIZE];
+		printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
+			current->comm, current->pid,
+			(rw & WRITE) ? "WRITE" : "READ",
+			(unsigned long long)bio->bi_sector,
+			bdevname(bio->bi_bdev,b));
+	}
+
+	generic_make_request(bio);
+}
+
+EXPORT_SYMBOL(submit_bio);
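
A hedged sketch of driving this interface directly: read the first page of a device, with the page and completion handler supplied by a hypothetical caller:

/* illustrative only */
static int example_read_first_page(struct block_device *bdev,
				   struct page *page,
				   bio_end_io_t *end_io, void *private)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	if (!bio)
		return -ENOMEM;

	bio->bi_bdev = bdev;
	bio->bi_sector = 0;
	bio->bi_end_io = end_io;
	bio->bi_private = private;

	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
		bio_put(bio);
		return -EIO;
	}

	submit_bio(READ, bio);
	return 0;
}
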
+
+static void blk_recalc_rq_segments(struct request *rq)
+{
+	struct bio *bio, *prevbio = NULL;
+	int nr_phys_segs, nr_hw_segs;
+	unsigned int phys_size, hw_size;
+	request_queue_t *q = rq->q;
+
+	if (!rq->bio)
+		return;
+
+	phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0;
+	rq_for_each_bio(bio, rq) {
+		/* Force bio hw/phys segs to be recalculated. */
+		bio->bi_flags &= ~(1 << BIO_SEG_VALID);
+
+		nr_phys_segs += bio_phys_segments(q, bio);
+		nr_hw_segs += bio_hw_segments(q, bio);
+		if (prevbio) {
+			int pseg = phys_size + prevbio->bi_size + bio->bi_size;
+			int hseg = hw_size + prevbio->bi_size + bio->bi_size;
+
+			if (blk_phys_contig_segment(q, prevbio, bio) &&
+			    pseg <= q->max_segment_size) {
+				nr_phys_segs--;
+				phys_size += prevbio->bi_size + bio->bi_size;
+			} else
+				phys_size = 0;
+
+			if (blk_hw_contig_segment(q, prevbio, bio) &&
+			    hseg <= q->max_segment_size) {
+				nr_hw_segs--;
+				hw_size += prevbio->bi_size + bio->bi_size;
+			} else
+				hw_size = 0;
+		}
+		prevbio = bio;
+	}
+
+	rq->nr_phys_segments = nr_phys_segs;
+	rq->nr_hw_segments = nr_hw_segs;
+}
+
+static void blk_recalc_rq_sectors(struct request *rq, int nsect)
+{
+	if (blk_fs_request(rq)) {
+		rq->hard_sector += nsect;
+		rq->hard_nr_sectors -= nsect;
+
+		/*
+		 * Move the I/O submission pointers ahead if required.
+		 */
+		if ((rq->nr_sectors >= rq->hard_nr_sectors) &&
+		    (rq->sector <= rq->hard_sector)) {
+			rq->sector = rq->hard_sector;
+			rq->nr_sectors = rq->hard_nr_sectors;
+			rq->hard_cur_sectors = bio_cur_sectors(rq->bio);
+			rq->current_nr_sectors = rq->hard_cur_sectors;
+			rq->buffer = bio_data(rq->bio);
+		}
+
+		/*
+		 * if total number of sectors is less than the first segment
+		 * size, something has gone terribly wrong
+		 */
+		if (rq->nr_sectors < rq->current_nr_sectors) {
+			printk("blk: request botched\n");
+			rq->nr_sectors = rq->current_nr_sectors;
+		}
+	}
+}
+
+static int __end_that_request_first(struct request *req, int uptodate,
+				    int nr_bytes)
+{
+	int total_bytes, bio_nbytes, error, next_idx = 0;
+	struct bio *bio;
+
+	blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
+
+	/*
+	 * extend uptodate bool to allow < 0 value to be direct io error
+	 */
+	error = 0;
+	if (end_io_error(uptodate))
+		error = !uptodate ? -EIO : uptodate;
+
+	/*
+	 * for a REQ_BLOCK_PC request, we want to carry any eventual
+	 * sense key with us all the way through
+	 */
+	if (!blk_pc_request(req))
+		req->errors = 0;
+
+	if (!uptodate) {
+		if (blk_fs_request(req) && !(req->flags & REQ_QUIET))
+			printk("end_request: I/O error, dev %s, sector %llu\n",
+				req->rq_disk ? req->rq_disk->disk_name : "?",
+				(unsigned long long)req->sector);
+	}
+
+	if (blk_fs_request(req) && req->rq_disk) {
+		const int rw = rq_data_dir(req);
+
+		disk_stat_add(req->rq_disk, sectors[rw], nr_bytes >> 9);
+	}
+
+	total_bytes = bio_nbytes = 0;
+	while ((bio = req->bio) != NULL) {
+		int nbytes;
+
+		if (nr_bytes >= bio->bi_size) {
+			req->bio = bio->bi_next;
+			nbytes = bio->bi_size;
+			if (!ordered_bio_endio(req, bio, nbytes, error))
+				bio_endio(bio, nbytes, error);
+			next_idx = 0;
+			bio_nbytes = 0;
+		} else {
+			int idx = bio->bi_idx + next_idx;
+
+			if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
+				blk_dump_rq_flags(req, "__end_that");
+				printk("%s: bio idx %d >= vcnt %d\n",
+						__FUNCTION__,
+						bio->bi_idx, bio->bi_vcnt);
+				break;
+			}
+
+			nbytes = bio_iovec_idx(bio, idx)->bv_len;
+			BIO_BUG_ON(nbytes > bio->bi_size);
+
+			/*
+			 * not a complete bvec done
+			 */
+			if (unlikely(nbytes > nr_bytes)) {
+				bio_nbytes += nr_bytes;
+				total_bytes += nr_bytes;
+				break;
+			}
+
+			/*
+			 * advance to the next vector
+			 */
+			next_idx++;
+			bio_nbytes += nbytes;
+		}
+
+		total_bytes += nbytes;
+		nr_bytes -= nbytes;
+
+		if ((bio = req->bio)) {
+			/*
+			 * end more in this run, or just return 'not-done'
+			 */
+			if (unlikely(nr_bytes <= 0))
+				break;
+		}
+	}
+
+	/*
+	 * completely done
+	 */
+	if (!req->bio)
+		return 0;
+
+	/*
+	 * if the request wasn't completed, update state
+	 */
+	if (bio_nbytes) {
+		if (!ordered_bio_endio(req, bio, bio_nbytes, error))
+			bio_endio(bio, bio_nbytes, error);
+		bio->bi_idx += next_idx;
+		bio_iovec(bio)->bv_offset += nr_bytes;
+		bio_iovec(bio)->bv_len -= nr_bytes;
+	}
+
+	blk_recalc_rq_sectors(req, total_bytes >> 9);
+	blk_recalc_rq_segments(req);
+	return 1;
+}
+
+/**
+ * end_that_request_first - end I/O on a request
+ * @req:      the request being processed
+ * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
+ * @nr_sectors: number of sectors to end I/O on
+ *
+ * Description:
+ *     Ends I/O on a number of sectors attached to @req, and sets it up
+ *     for the next range of segments (if any) in the cluster.
+ *
+ * Return:
+ *     0 - we are done with this request, call end_that_request_last()
+ *     1 - still buffers pending for this request
+ **/
+int end_that_request_first(struct request *req, int uptodate, int nr_sectors)
+{
+	return __end_that_request_first(req, uptodate, nr_sectors << 9);
+}
+
+EXPORT_SYMBOL(end_that_request_first);
+
+/**
+ * end_that_request_chunk - end I/O on a request
+ * @req:      the request being processed
+ * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
+ * @nr_bytes: number of bytes to complete
+ *
+ * Description:
+ *     Ends I/O on a number of bytes attached to @req, and sets it up
+ *     for the next range of segments (if any). Like end_that_request_first(),
+ *     but deals with bytes instead of sectors.
+ *
+ * Return:
+ *     0 - we are done with this request, call end_that_request_last()
+ *     1 - still buffers pending for this request
+ **/
+int end_that_request_chunk(struct request *req, int uptodate, int nr_bytes)
+{
+	return __end_that_request_first(req, uptodate, nr_bytes);
+}
+
+EXPORT_SYMBOL(end_that_request_chunk);
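
An interrupt-path sketch of the two-stage completion these helpers give drivers, called under the queue lock as end_that_request_last() requires:

/* illustrative only */
static void example_complete_bytes(struct request *rq, unsigned int bytes)
{
	/* stage one: retire `bytes` worth of bios ... */
	if (!end_that_request_chunk(rq, 1, bytes)) {
		/* ... stage two: the whole request is finished */
		blkdev_dequeue_request(rq);
		end_that_request_last(rq, 1);
	}
}
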
+
+/*
+ * splice the completion data to a local structure and hand off to
+ * process_completion_queue() to complete the requests
+ */
+static void blk_done_softirq(struct softirq_action *h)
+{
+	struct list_head *cpu_list;
+	LIST_HEAD(local_list);
+
+	local_irq_disable();
+	cpu_list = &__get_cpu_var(blk_cpu_done);
+	list_splice_init(cpu_list, &local_list);
+	local_irq_enable();
+
+	while (!list_empty(&local_list)) {
+		struct request *rq = list_entry(local_list.next, struct request, donelist);
+
+		list_del_init(&rq->donelist);
+		rq->q->softirq_done_fn(rq);
+	}
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static int blk_cpu_notify(struct notifier_block *self, unsigned long action,
+			  void *hcpu)
+{
+	/*
+	 * If a CPU goes away, splice its entries to the current CPU
+	 * and trigger a run of the softirq
+	 */
+	if (action == CPU_DEAD) {
+		int cpu = (unsigned long) hcpu;
+
+		local_irq_disable();
+		list_splice_init(&per_cpu(blk_cpu_done, cpu),
+				 &__get_cpu_var(blk_cpu_done));
+		raise_softirq_irqoff(BLOCK_SOFTIRQ);
+		local_irq_enable();
+	}
+
+	return NOTIFY_OK;
+}
+
+
+static struct notifier_block __devinitdata blk_cpu_notifier = {
+	.notifier_call	= blk_cpu_notify,
+};
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/**
+ * blk_complete_request - end I/O on a request
+ * @req:      the request being processed
+ *
+ * Description:
+ *     Ends all I/O on a request. It does not handle partial completions,
+ *     unless the driver actually implements this in its completion callback
+ *     through requeueing. The actual completion happens out-of-order,
+ *     through a softirq handler. The user must have registered a completion
+ *     callback through blk_queue_softirq_done().
+ **/
+
+void blk_complete_request(struct request *req)
+{
+	struct list_head *cpu_list;
+	unsigned long flags;
+
+	BUG_ON(!req->q->softirq_done_fn);
+
+	local_irq_save(flags);
+
+	cpu_list = &__get_cpu_var(blk_cpu_done);
+	list_add_tail(&req->donelist, cpu_list);
+	raise_softirq_irqoff(BLOCK_SOFTIRQ);
+
+	local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(blk_complete_request);
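
A sketch of the registration and completion sides, assuming the driver has already dequeued the request when the hardware interrupt fires:

/* softirq context, off the hard-irq path; illustrative only */
static void example_softirq_done(struct request *rq)
{
	unsigned long flags;

	spin_lock_irqsave(rq->q->queue_lock, flags);
	end_that_request_last(rq, !rq->errors);
	spin_unlock_irqrestore(rq->q->queue_lock, flags);
}

/* at queue setup:	blk_queue_softirq_done(q, example_softirq_done);
 * in the hard irq:	blk_complete_request(rq);
 */
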
+
+/*
+ * queue lock must be held
+ */
+void end_that_request_last(struct request *req, int uptodate)
+{
+	struct gendisk *disk = req->rq_disk;
+	int error;
+
+	/*
+	 * extend uptodate bool to allow < 0 value to be direct io error
+	 */
+	error = 0;
+	if (end_io_error(uptodate))
+		error = !uptodate ? -EIO : uptodate;
+
+	if (unlikely(laptop_mode) && blk_fs_request(req))
+		laptop_io_completion();
+
+	if (disk && blk_fs_request(req)) {
+		unsigned long duration = jiffies - req->start_time;
+		const int rw = rq_data_dir(req);
+
+		__disk_stat_inc(disk, ios[rw]);
+		__disk_stat_add(disk, ticks[rw], duration);
+		disk_round_stats(disk);
+		disk->in_flight--;
+	}
+	if (req->end_io)
+		req->end_io(req, error);
+	else
+		__blk_put_request(req->q, req);
+}
+
+EXPORT_SYMBOL(end_that_request_last);
+
+void end_request(struct request *req, int uptodate)
+{
+	if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) {
+		add_disk_randomness(req->rq_disk);
+		blkdev_dequeue_request(req);
+		end_that_request_last(req, uptodate);
+	}
+}
+
+EXPORT_SYMBOL(end_request);
+
+void blk_rq_bio_prep(request_queue_t *q, struct request *rq, struct bio *bio)
+{
+	/* first three bits are identical in rq->flags and bio->bi_rw */
+	rq->flags |= (bio->bi_rw & 7);
+
+	rq->nr_phys_segments = bio_phys_segments(q, bio);
+	rq->nr_hw_segments = bio_hw_segments(q, bio);
+	rq->current_nr_sectors = bio_cur_sectors(bio);
+	rq->hard_cur_sectors = rq->current_nr_sectors;
+	rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
+	rq->buffer = bio_data(bio);
+
+	rq->bio = rq->biotail = bio;
+}
+
+EXPORT_SYMBOL(blk_rq_bio_prep);
+
+int kblockd_schedule_work(struct work_struct *work)
+{
+	return queue_work(kblockd_workqueue, work);
+}
+
+EXPORT_SYMBOL(kblockd_schedule_work);
+
+void kblockd_flush(void)
+{
+	flush_workqueue(kblockd_workqueue);
+}
+EXPORT_SYMBOL(kblockd_flush);
+
+int __init blk_dev_init(void)
+{
+	int i;
+
+	kblockd_workqueue = create_workqueue("kblockd");
+	if (!kblockd_workqueue)
+		panic("Failed to create kblockd\n");
+
+	request_cachep = kmem_cache_create("blkdev_requests",
+			sizeof(struct request), 0, SLAB_PANIC, NULL, NULL);
+
+	requestq_cachep = kmem_cache_create("blkdev_queue",
+			sizeof(request_queue_t), 0, SLAB_PANIC, NULL, NULL);
+
+	iocontext_cachep = kmem_cache_create("blkdev_ioc",
+			sizeof(struct io_context), 0, SLAB_PANIC, NULL, NULL);
+
+	for_each_cpu(i)
+		INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
+
+	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
+#ifdef CONFIG_HOTPLUG_CPU
+	register_cpu_notifier(&blk_cpu_notifier);
+#endif
+
+	blk_max_low_pfn = max_low_pfn;
+	blk_max_pfn = max_pfn;
+
+	return 0;
+}
+
+/*
+ * IO Context helper functions
+ */
+void put_io_context(struct io_context *ioc)
+{
+	if (ioc == NULL)
+		return;
+
+	BUG_ON(atomic_read(&ioc->refcount) == 0);
+
+	if (atomic_dec_and_test(&ioc->refcount)) {
+		struct cfq_io_context *cic;
+
+		if (ioc->aic && ioc->aic->dtor)
+			ioc->aic->dtor(ioc->aic);
+
+		if (ioc->cic_root.rb_node != NULL) {
+			cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node);
+			cic->dtor(ioc);
+		}
+
+		kmem_cache_free(iocontext_cachep, ioc);
+	}
+}
+EXPORT_SYMBOL(put_io_context);
+
+/* Called by the exiting task */
+void exit_io_context(void)
+{
+	unsigned long flags;
+	struct io_context *ioc;
+	struct cfq_io_context *cic;
+
+	local_irq_save(flags);
+	task_lock(current);
+	ioc = current->io_context;
+	current->io_context = NULL;
+	ioc->task = NULL;
+	task_unlock(current);
+	local_irq_restore(flags);
+
+	if (ioc->aic && ioc->aic->exit)
+		ioc->aic->exit(ioc->aic);
+
+	if (ioc->cic_root.rb_node != NULL) {
+		cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node);
+		cic->exit(ioc);
+	}
+
+	put_io_context(ioc);
+}
+
+/*
+ * If the current task has no IO context then create one and initialise it.
+ * Otherwise, return its existing IO context.
+ *
+ * This returned IO context doesn't have a specifically elevated refcount,
+ * but since the current task itself holds a reference, the context can be
+ * used in general code, so long as it stays within `current` context.
+ */
+struct io_context *current_io_context(gfp_t gfp_flags)
+{
+	struct task_struct *tsk = current;
+	struct io_context *ret;
+
+	ret = tsk->io_context;
+	if (likely(ret))
+		return ret;
+
+	ret = kmem_cache_alloc(iocontext_cachep, gfp_flags);
+	if (ret) {
+		atomic_set(&ret->refcount, 1);
+		ret->task = current;
+		ret->set_ioprio = NULL;
+		ret->last_waited = jiffies; /* doesn't matter... */
+		ret->nr_batch_requests = 0; /* because this is 0 */
+		ret->aic = NULL;
+		ret->cic_root.rb_node = NULL;
+
+		tsk->io_context = ret;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(current_io_context);
+
+/*
+ * If the current task has no IO context then create one and initialise it.
+ * If it does have a context, take a ref on it.
+ *
+ * This is always called in the context of the task which submitted the I/O.
+ */
+struct io_context *get_io_context(gfp_t gfp_flags)
+{
+	struct io_context *ret;
+	ret = current_io_context(gfp_flags);
+	if (likely(ret))
+		atomic_inc(&ret->refcount);
+	return ret;
+}
+EXPORT_SYMBOL(get_io_context);
+
+void copy_io_context(struct io_context **pdst, struct io_context **psrc)
+{
+	struct io_context *src = *psrc;
+	struct io_context *dst = *pdst;
+
+	if (src) {
+		BUG_ON(atomic_read(&src->refcount) == 0);
+		atomic_inc(&src->refcount);
+		put_io_context(dst);
+		*pdst = src;
+	}
+}
+EXPORT_SYMBOL(copy_io_context);
+
+void swap_io_context(struct io_context **ioc1, struct io_context **ioc2)
+{
+	struct io_context *temp;
+	temp = *ioc1;
+	*ioc1 = *ioc2;
+	*ioc2 = temp;
+}
+EXPORT_SYMBOL(swap_io_context);
+
+/*
+ * sysfs parts below
+ */
+struct queue_sysfs_entry {
+	struct attribute attr;
+	ssize_t (*show)(struct request_queue *, char *);
+	ssize_t (*store)(struct request_queue *, const char *, size_t);
+};
+
+static ssize_t
+queue_var_show(unsigned int var, char *page)
+{
+	return sprintf(page, "%d\n", var);
+}
+
+static ssize_t
+queue_var_store(unsigned long *var, const char *page, size_t count)
+{
+	char *p = (char *) page;
+
+	*var = simple_strtoul(p, &p, 10);
+	return count;
+}
+
+static ssize_t queue_requests_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(q->nr_requests, (page));
+}
+
+static ssize_t
+queue_requests_store(struct request_queue *q, const char *page, size_t count)
+{
+	struct request_list *rl = &q->rq;
+
+	int ret = queue_var_store(&q->nr_requests, page, count);
+	if (q->nr_requests < BLKDEV_MIN_RQ)
+		q->nr_requests = BLKDEV_MIN_RQ;
+	blk_queue_congestion_threshold(q);
+
+	if (rl->count[READ] >= queue_congestion_on_threshold(q))
+		set_queue_congested(q, READ);
+	else if (rl->count[READ] < queue_congestion_off_threshold(q))
+		clear_queue_congested(q, READ);
+
+	if (rl->count[WRITE] >= queue_congestion_on_threshold(q))
+		set_queue_congested(q, WRITE);
+	else if (rl->count[WRITE] < queue_congestion_off_threshold(q))
+		clear_queue_congested(q, WRITE);
+
+	if (rl->count[READ] >= q->nr_requests) {
+		blk_set_queue_full(q, READ);
+	} else if (rl->count[READ]+1 <= q->nr_requests) {
+		blk_clear_queue_full(q, READ);
+		wake_up(&rl->wait[READ]);
+	}
+
+	if (rl->count[WRITE] >= q->nr_requests) {
+		blk_set_queue_full(q, WRITE);
+	} else if (rl->count[WRITE]+1 <= q->nr_requests) {
+		blk_clear_queue_full(q, WRITE);
+		wake_up(&rl->wait[WRITE]);
+	}
+	return ret;
+}
+
+static ssize_t queue_ra_show(struct request_queue *q, char *page)
+{
+	int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10);
+
+	return queue_var_show(ra_kb, (page));
+}
+
+static ssize_t
+queue_ra_store(struct request_queue *q, const char *page, size_t count)
+{
+	unsigned long ra_kb;
+	ssize_t ret = queue_var_store(&ra_kb, page, count);
+
+	spin_lock_irq(q->queue_lock);
+	if (ra_kb > (q->max_sectors >> 1))
+		ra_kb = (q->max_sectors >> 1);
+
+	q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10);
+	spin_unlock_irq(q->queue_lock);
+
+	return ret;
+}
+
+static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
+{
+	int max_sectors_kb = q->max_sectors >> 1;
+
+	return queue_var_show(max_sectors_kb, (page));
+}
+
+static ssize_t
+queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
+{
+	unsigned long max_sectors_kb,
+			max_hw_sectors_kb = q->max_hw_sectors >> 1,
+			page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
+	ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
+	int ra_kb;
+
+	if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
+		return -EINVAL;
+	/*
+	 * Take the queue lock to update the readahead and max_sectors
+	 * values synchronously:
+	 */
+	spin_lock_irq(q->queue_lock);
+	/*
+	 * Trim readahead window as well, if necessary:
+	 */
+	ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10);
+	if (ra_kb > max_sectors_kb)
+		q->backing_dev_info.ra_pages =
+				max_sectors_kb >> (PAGE_CACHE_SHIFT - 10);
+
+	q->max_sectors = max_sectors_kb << 1;
+	spin_unlock_irq(q->queue_lock);
+
+	return ret;
+}
+
+static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
+{
+	int max_hw_sectors_kb = q->max_hw_sectors >> 1;
+
+	return queue_var_show(max_hw_sectors_kb, (page));
+}
+
+
+static struct queue_sysfs_entry queue_requests_entry = {
+	.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
+	.show = queue_requests_show,
+	.store = queue_requests_store,
+};
+
+static struct queue_sysfs_entry queue_ra_entry = {
+	.attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR },
+	.show = queue_ra_show,
+	.store = queue_ra_store,
+};
+
+static struct queue_sysfs_entry queue_max_sectors_entry = {
+	.attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR },
+	.show = queue_max_sectors_show,
+	.store = queue_max_sectors_store,
+};
+
+static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
+	.attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO },
+	.show = queue_max_hw_sectors_show,
+};
+
+static struct queue_sysfs_entry queue_iosched_entry = {
+	.attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
+	.show = elv_iosched_show,
+	.store = elv_iosched_store,
+};
+
+static struct attribute *default_attrs[] = {
+	&queue_requests_entry.attr,
+	&queue_ra_entry.attr,
+	&queue_max_hw_sectors_entry.attr,
+	&queue_max_sectors_entry.attr,
+	&queue_iosched_entry.attr,
+	NULL,
+};
+
+#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr)
+
+static ssize_t
+queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+	struct queue_sysfs_entry *entry = to_queue(attr);
+	struct request_queue *q;
+
+	q = container_of(kobj, struct request_queue, kobj);
+	if (!entry->show)
+		return -EIO;
+
+	return entry->show(q, page);
+}
+
+static ssize_t
+queue_attr_store(struct kobject *kobj, struct attribute *attr,
+		    const char *page, size_t length)
+{
+	struct queue_sysfs_entry *entry = to_queue(attr);
+	struct request_queue *q;
+
+	q = container_of(kobj, struct request_queue, kobj);
+	if (!entry->store)
+		return -EIO;
+
+	return entry->store(q, page, length);
+}
+
+static struct sysfs_ops queue_sysfs_ops = {
+	.show	= queue_attr_show,
+	.store	= queue_attr_store,
+};
+
+static struct kobj_type queue_ktype = {
+	.sysfs_ops	= &queue_sysfs_ops,
+	.default_attrs	= default_attrs,
+};
+
+int blk_register_queue(struct gendisk *disk)
+{
+	int ret;
+
+	request_queue_t *q = disk->queue;
+
+	if (!q || !q->request_fn)
+		return -ENXIO;
+
+	q->kobj.parent = kobject_get(&disk->kobj);
+	if (!q->kobj.parent)
+		return -EBUSY;
+
+	snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue");
+	q->kobj.ktype = &queue_ktype;
+
+	ret = kobject_register(&q->kobj);
+	if (ret < 0)
+		return ret;
+
+	ret = elv_register_queue(q);
+	if (ret) {
+		kobject_unregister(&q->kobj);
+		return ret;
+	}
+
+	return 0;
+}
+
+void blk_unregister_queue(struct gendisk *disk)
+{
+	request_queue_t *q = disk->queue;
+
+	if (q && q->request_fn) {
+		elv_unregister_queue(q);
+
+		kobject_unregister(&q->kobj);
+		kobject_put(&disk->kobj);
+	}
+}
diff -urN oldtree/crypto/Kconfig newtree/crypto/Kconfig
--- oldtree/crypto/Kconfig	2006-03-08 18:47:11.524849500 +0000
+++ newtree/crypto/Kconfig	2006-03-08 15:22:33.085495000 +0000
@@ -316,6 +316,13 @@
 	  
 	  You will most probably want this if using IPSec.
 
+config CRYPTO_LZF
+	tristate "LZF compression algorithm"
+	depends on CRYPTO
+	help
+	  This is the LZF algorithm. It is especially useful for Suspend2,
+	  because it achieves good compression quickly.
+
 config CRYPTO_MICHAEL_MIC
 	tristate "Michael MIC keyed digest algorithm"
 	depends on CRYPTO
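
For completeness, a hedged sketch of exercising the new algorithm through the same cryptoapi compression calls that deflate uses; buffer sizing is the caller's problem and elided here:

#include <linux/crypto.h>

/* illustrative only */
static int example_lzf_compress(const u8 *src, unsigned int slen,
				u8 *dst, unsigned int *dlen)
{
	struct crypto_tfm *tfm = crypto_alloc_tfm("lzf", 0);
	int ret;

	if (!tfm)
		return -ENOMEM;
	ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
	crypto_free_tfm(tfm);
	return ret;
}
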
diff -urN oldtree/crypto/Makefile newtree/crypto/Makefile
--- oldtree/crypto/Makefile	2006-01-03 03:21:10.000000000 +0000
+++ newtree/crypto/Makefile	2006-03-08 15:22:33.089495250 +0000
@@ -30,5 +30,6 @@
 obj-$(CONFIG_CRYPTO_DEFLATE) += deflate.o
 obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o
 obj-$(CONFIG_CRYPTO_CRC32C) += crc32c.o
+obj-$(CONFIG_CRYPTO_LZF) += lzf.o
 
 obj-$(CONFIG_CRYPTO_TEST) += tcrypt.o
diff -urN oldtree/crypto/deflate.c newtree/crypto/deflate.c
--- oldtree/crypto/deflate.c	2006-03-08 18:47:59.023818000 +0000
+++ newtree/crypto/deflate.c	2006-03-08 15:22:33.093495500 +0000
@@ -142,8 +142,15 @@
 
 	ret = zlib_deflate(stream, Z_FINISH);
 	if (ret != Z_STREAM_END) {
-		ret = -EINVAL;
-		goto out;
+		if (!(ret == Z_OK && !stream->avail_in && !stream->avail_out)) {
+			ret = -EINVAL;
+			goto out;
+		} else {
+			u8 zerostuff = 0;
+			stream->next_out = &zerostuff;
+			stream->avail_out = 1;
+			ret = zlib_deflate(stream, Z_FINISH);
+		}
 	}
 	ret = 0;
 	*dlen = stream->total_out;
diff -urN oldtree/crypto/deflate.c.orig newtree/crypto/deflate.c.orig
--- oldtree/crypto/deflate.c.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/crypto/deflate.c.orig	2006-03-08 15:21:14.460581250 +0000
@@ -0,0 +1,224 @@
+/* 
+ * Cryptographic API.
+ *
+ * Deflate algorithm (RFC 1951), implemented here primarily for use
+ * by IPCOMP (RFC 3173 & RFC 2394).
+ *
+ * Copyright (c) 2003 James Morris <jmorris@intercode.com.au>
+ * 
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) 
+ * any later version.
+ *
+ * FIXME: deflate transforms will require up to a total of about 436k of kernel
+ * memory on i386 (390k for compression, the rest for decompression), as the
+ * current zlib kernel code uses a worst case pre-allocation system by default.
+ * This needs to be fixed so that the amount of memory required is properly
+ * related to the  winbits and memlevel parameters.
+ *
+ * The default winbits of 11 should suit most packets, and it may be something
+ * to configure on a per-tfm basis in the future.
+ *
+ * Currently, compression history is not maintained between tfm calls, as
+ * it is not needed for IPCOMP and keeps the code simpler.  It can be
+ * implemented if someone wants it.
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/zlib.h>
+#include <linux/vmalloc.h>
+#include <linux/interrupt.h>
+#include <linux/mm.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+
+#define DEFLATE_DEF_LEVEL		Z_DEFAULT_COMPRESSION
+#define DEFLATE_DEF_WINBITS		11
+#define DEFLATE_DEF_MEMLEVEL		MAX_MEM_LEVEL
+
+struct deflate_ctx {
+	struct z_stream_s comp_stream;
+	struct z_stream_s decomp_stream;
+};
+
+static int deflate_comp_init(struct deflate_ctx *ctx)
+{
+	int ret = 0;
+	struct z_stream_s *stream = &ctx->comp_stream;
+
+	stream->workspace = vmalloc(zlib_deflate_workspacesize());
+	if (!stream->workspace ) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	memset(stream->workspace, 0, zlib_deflate_workspacesize());
+	ret = zlib_deflateInit2(stream, DEFLATE_DEF_LEVEL, Z_DEFLATED,
+	                        -DEFLATE_DEF_WINBITS, DEFLATE_DEF_MEMLEVEL,
+	                        Z_DEFAULT_STRATEGY);
+	if (ret != Z_OK) {
+		ret = -EINVAL;
+		goto out_free;
+	}
+out:	
+	return ret;
+out_free:
+	vfree(stream->workspace);
+	goto out;
+}
+
+static int deflate_decomp_init(struct deflate_ctx *ctx)
+{
+	int ret = 0;
+	struct z_stream_s *stream = &ctx->decomp_stream;
+
+	stream->workspace = kzalloc(zlib_inflate_workspacesize(), GFP_KERNEL);
+	if (!stream->workspace ) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	ret = zlib_inflateInit2(stream, -DEFLATE_DEF_WINBITS);
+	if (ret != Z_OK) {
+		ret = -EINVAL;
+		goto out_free;
+	}
+out:
+	return ret;
+out_free:
+	kfree(stream->workspace);
+	goto out;
+}
+
+static void deflate_comp_exit(struct deflate_ctx *ctx)
+{
+	zlib_deflateEnd(&ctx->comp_stream);
+	vfree(ctx->comp_stream.workspace);
+}
+
+static void deflate_decomp_exit(struct deflate_ctx *ctx)
+{
+	zlib_inflateEnd(&ctx->decomp_stream);
+	kfree(ctx->decomp_stream.workspace);
+}
+
+static int deflate_init(void *ctx)
+{
+	int ret;
+	
+	ret = deflate_comp_init(ctx);
+	if (ret)
+		goto out;
+	ret = deflate_decomp_init(ctx);
+	if (ret)
+		deflate_comp_exit(ctx);
+out:
+	return ret;
+}
+
+static void deflate_exit(void *ctx)
+{
+	deflate_comp_exit(ctx);
+	deflate_decomp_exit(ctx);
+}
+
+static int deflate_compress(void *ctx, const u8 *src, unsigned int slen,
+	                    u8 *dst, unsigned int *dlen)
+{
+	int ret = 0;
+	struct deflate_ctx *dctx = ctx;
+	struct z_stream_s *stream = &dctx->comp_stream;
+
+	ret = zlib_deflateReset(stream);
+	if (ret != Z_OK) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	stream->next_in = (u8 *)src;
+	stream->avail_in = slen;
+	stream->next_out = (u8 *)dst;
+	stream->avail_out = *dlen;
+
+	ret = zlib_deflate(stream, Z_FINISH);
+	if (ret != Z_STREAM_END) {
+		ret = -EINVAL;
+		goto out;
+	}
+	ret = 0;
+	*dlen = stream->total_out;
+out:
+	return ret;
+}
+ 
+static int deflate_decompress(void *ctx, const u8 *src, unsigned int slen,
+                              u8 *dst, unsigned int *dlen)
+{
+	
+	int ret = 0;
+	struct deflate_ctx *dctx = ctx;
+	struct z_stream_s *stream = &dctx->decomp_stream;
+
+	ret = zlib_inflateReset(stream);
+	if (ret != Z_OK) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	stream->next_in = (u8 *)src;
+	stream->avail_in = slen;
+	stream->next_out = (u8 *)dst;
+	stream->avail_out = *dlen;
+
+	ret = zlib_inflate(stream, Z_SYNC_FLUSH);
+	/*
+	 * Work around a bug in zlib, which sometimes wants to taste an extra
+	 * byte when being used in the (undocumented) raw deflate mode.
+	 * (From USAGI).
+	 */
+	if (ret == Z_OK && !stream->avail_in && stream->avail_out) {
+		u8 zerostuff = 0;
+		stream->next_in = &zerostuff;
+		stream->avail_in = 1;
+		ret = zlib_inflate(stream, Z_FINISH);
+	}
+	if (ret != Z_STREAM_END) {
+		ret = -EINVAL;
+		goto out;
+	}
+	ret = 0;
+	*dlen = stream->total_out;
+out:
+	return ret;
+}
+
+static struct crypto_alg alg = {
+	.cra_name		= "deflate",
+	.cra_flags		= CRYPTO_ALG_TYPE_COMPRESS,
+	.cra_ctxsize		= sizeof(struct deflate_ctx),
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(alg.cra_list),
+	.cra_u			= { .compress = {
+	.coa_init		= deflate_init,
+	.coa_exit		= deflate_exit,
+	.coa_compress		= deflate_compress,
+	.coa_decompress		= deflate_decompress } }
+};
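+
+/*
+ * Consumer-side usage sketch (error handling elided); these are the
+ * generic crypto layer calls that e.g. IPCOMP makes against this
+ * algorithm:
+ *
+ *	struct crypto_tfm *tfm = crypto_alloc_tfm("deflate", 0);
+ *	unsigned int dlen = dst_size;
+ *	int err = crypto_comp_compress(tfm, src, slen, dst, &dlen);
+ *	...
+ *	crypto_free_tfm(tfm);
+ *
+ * crypto_comp_decompress() mirrors the compress call.
+ */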
+
+static int __init init(void)
+{
+	return crypto_register_alg(&alg);
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_alg(&alg);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Deflate Compression Algorithm for IPCOMP");
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+
diff -urN oldtree/crypto/lzf.c newtree/crypto/lzf.c
--- oldtree/crypto/lzf.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/crypto/lzf.c	2006-03-08 15:22:33.097495750 +0000
@@ -0,0 +1,335 @@
+/* 
+ * Cryptoapi LZF compression module.
+ *
+ * Copyright (c) 2004-2005 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * based on the deflate.c file:
+ * 
+ * Copyright (c) 2003 James Morris <jmorris@intercode.com.au>
+ * 
+ * and upon the LZF compression module donated to the Suspend2 project with
+ * the following copyright:
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) 
+ * any later version.
+ * Copyright (c) 2000-2003 Marc Alexander Lehmann <pcg@goof.com>
+ * 
+ * Redistribution and use in source and binary forms, with or without modifica-
+ * tion, are permitted provided that the following conditions are met:
+ * 
+ *   1.  Redistributions of source code must retain the above copyright notice,
+ *       this list of conditions and the following disclaimer.
+ * 
+ *   2.  Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ * 
+ *   3.  The name of the author may not be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
+ * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
+ * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
+ * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * the GNU General Public License version 2 (the "GPL"), in which case the
+ * provisions of the GPL are applicable instead of the above. If you wish to
+ * allow the use of your version of this file only under the terms of the
+ * GPL and not to allow others to use your version of this file under the
+ * BSD license, indicate your decision by deleting the provisions above and
+ * replace them with the notice and other provisions required by the GPL. If
+ * you do not delete the provisions above, a recipient may use your version
+ * of this file under either the BSD or the GPL.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <asm/string.h>
+
+struct lzf_ctx {
+	void *hbuf;
+	unsigned int bufofs;
+};
+
+/*
+ * The hash table occupies (1 << hlog) * sizeof(char *) bytes;
+ * decompression is independent of the hash table size.  The
+ * difference between hlog 15 and 14 is very small for small blocks
+ * (and 14 is also faster).  For a low-memory configuration use
+ * hlog == 13; for best compression use 15 or 16.
+ */
+static const int hlog = 14;
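+/* With hlog == 14 that is 16384 slots: 64 KiB of hash table on
+ * 32-bit kernels, 128 KiB on 64-bit. */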
+
+/*
+ * Don't play with this unless you benchmark!  Decompression is not
+ * dependent on the hash function.  The hashing function might seem
+ * strange; just believe me, it works ;)
+ */
+/* The rolling hash covers three input bytes; cf. the 3*8 in idx(). */
+static inline u32 first(const u8 *p)
+{
+	return (p[0] << 8) + p[1];
+}
+
+static inline u32 next(u32 v, const u8 *p)
+{
+	return (v << 8) + p[2];
+}
+
+static inline u32 idx(unsigned int h)
+{
+	return (((h ^ (h << 5)) >> (3*8 - hlog)) + h*3) & ((1 << hlog) - 1);
+}
+
+/*
+ * IDX works because it is very similar to a multiplicative hash, e.g.
+ * (h * 57321 >> (3*8 - hlog))
+ * the next one is also quite good, albeit slow ;)
+ * (int)(cos(h & 0xffffff) * 1e6)
+ */
+
+static const int max_lit = (1 <<  5);
+static const int max_off = (1 << 13);
+static const int max_ref = ((1 <<  8) + (1 << 3));
+
+/*
+ * compressed format
+ *
+ * 000LLLLL <L+1>    ; literal
+ * LLLOOOOO oooooooo ; backref L
+ * 111OOOOO LLLLLLLL oooooooo ; backref L+7
+ *
+ */
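+
+/*
+ * So a literal run covers up to 32 bytes (5 L bits), an offset reaches
+ * up to 8192 bytes back (13 O bits, stored as off - 1) and a match
+ * copies up to 264 bytes (255 extended + 7 + the implicit 2).
+ *
+ * Worked example (constructed by hand): the control byte 0x02
+ * (000LLLLL, L = 2) is followed by L + 1 = 3 literal bytes, while the
+ * pair 0x45 0x10 (LLLOOOOO oooooooo, L = 2, O = 0x0510) copies
+ * L + 2 = 4 bytes starting O + 1 = 0x511 bytes back in the output.
+ */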
+
+static void lzf_compress_exit(void *context)
+{
+	struct lzf_ctx *ctx = (struct lzf_ctx *)context;
+
+	if (ctx->hbuf) {
+		vfree(ctx->hbuf);
+		ctx->hbuf = NULL;
+	}
+}
+
+static int lzf_compress_init(void *context)
+{
+	struct lzf_ctx *ctx = (struct lzf_ctx *)context;
+
+	/* Get LZF ready to go */
+	ctx->hbuf = vmalloc_32((1 << hlog) * sizeof(char *));
+	if (!ctx->hbuf) {
+		printk(KERN_WARNING
+		       "Failed to allocate %ld bytes for lzf workspace\n",
+		       (long) ((1 << hlog) * sizeof(char *)));
+		return -ENOMEM;
+	}
+	return 0;
+}
+
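+/*
+ * Note on the failure convention used below: when the output buffer
+ * would overflow, these routines still return 0 but force the result
+ * length to PAGE_SIZE; the intent appears to be that a caller
+ * compressing page-sized chunks treats such a result as
+ * "incompressible, store the page raw" (assumption -- the callers
+ * live elsewhere in this patch).
+ */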
+static int lzf_compress(void *context, const u8 *in_data, unsigned int in_len,
+			u8 *out_data, unsigned int *out_len)
+{
+	struct lzf_ctx *ctx = (struct lzf_ctx *)context;
+	const u8 **htab = ctx->hbuf;
+	const u8 **hslot;
+	const u8 *ip = in_data;
+	u8 *op = out_data;
+	const u8 *in_end = ip + in_len;
+	u8 *out_end = op + *out_len - 3;
+	const u8 *ref;
+
+	unsigned int hval = first(ip);
+	unsigned long off;
+	int lit = 0;
+
+	/* Zero the whole table; sizeof(htab) is only a pointer's size. */
+	memset(htab, 0, (1 << hlog) * sizeof(char *));
+
+	for (;;) {
+		if (ip < in_end - 2) {
+			hval = next(hval, ip);
+			hslot = htab + idx(hval);
+			ref = *hslot;
+			*hslot = ip;
+
+			if ((off = ip - ref - 1) < max_off
+			    && ip + 4 < in_end && ref > in_data
+			    && *(u16 *) ref == *(u16 *) ip && ref[2] == ip[2]
+			    ) {
+				/* match found at *ref++ */
+				unsigned int len = 2;
+				unsigned int maxlen = in_end - ip - len;
+				maxlen = maxlen > max_ref ? max_ref : maxlen;
+
+				do
+					len++;
+				while (len < maxlen && ref[len] == ip[len]);
+
+				if (op + lit + 1 + 3 >= out_end) {
+					*out_len = PAGE_SIZE;
+					return 0;
+				}
+
+				if (lit) {
+					*op++ = lit - 1;
+					lit = -lit;
+					do
+						*op++ = ip[lit];
+					while (++lit);
+				}
+
+				len -= 2;
+				ip++;
+
+				if (len < 7) {
+					*op++ = (off >> 8) + (len << 5);
+				} else {
+					*op++ = (off >> 8) + (7 << 5);
+					*op++ = len - 7;
+				}
+
+				*op++ = off;
+
+				ip += len;
+				hval = first(ip);
+				hval = next(hval, ip);
+				htab[idx(hval)] = ip;
+				ip++;
+				continue;
+			}
+		} else if (ip == in_end)
+			break;
+
+		/* one more literal byte we must copy */
+		lit++;
+		ip++;
+
+		if (lit == max_lit) {
+			if (op + 1 + max_lit >= out_end) {
+				*out_len = PAGE_SIZE;
+				return 0;
+			}
+
+			*op++ = max_lit - 1;
+			memcpy(op, ip - max_lit, max_lit);
+			op += max_lit;
+			lit = 0;
+		}
+	}
+
+	if (lit) {
+		if (op + lit + 1 >= out_end) {
+			*out_len = PAGE_SIZE;
+			return 0;
+		}
+
+		*op++ = lit - 1;
+		lit = -lit;
+		do
+			*op++ = ip[lit];
+		while (++lit);
+	}
+
+	*out_len = op - out_data;
+	return 0;
+}
+
+static int lzf_decompress(void *context, const u8 *src, unsigned int slen,
+			  u8 *dst, unsigned int *dlen)
+{
+	u8 const *ip = src;
+	u8 *op = dst;
+	u8 const *const in_end = ip + slen;
+	u8 *const out_end = op + *dlen;
+
+	do {
+		unsigned int ctrl = *ip++;
+
+		if (ctrl < (1 << 5)) {	/* literal run */
+			ctrl++;
+
+			if (op + ctrl > out_end) {
+				*dlen = PAGE_SIZE;
+				return 0;
+			}
+			memcpy(op, ip, ctrl);
+			op += ctrl;
+			ip += ctrl;
+		} else {	/* back reference */
+
+			unsigned int len = ctrl >> 5;
+
+			u8 *ref = op - ((ctrl & 0x1f) << 8) - 1;
+
+			if (len == 7)
+				len += *ip++;
+
+			ref -= *ip++;
+
+			if (op + len + 2 > out_end) {
+				*dlen = PAGE_SIZE;
+				return 0;
+			}
+
+			if (ref < (u8 *) dst) {
+				*dlen = PAGE_SIZE;
+				return 0;
+			}
+
+			*op++ = *ref++;
+			*op++ = *ref++;
+
+			do
+				*op++ = *ref++;
+			while (--len);
+		}
+	} while (op < out_end && ip < in_end);
+
+	*dlen = op - (u8 *) dst;
+	return 0;
+}
+
+static struct crypto_alg alg = {
+	.cra_name = "lzf",
+	.cra_flags = CRYPTO_ALG_TYPE_COMPRESS,
+	.cra_ctxsize = sizeof(struct lzf_ctx),
+	.cra_module = THIS_MODULE,
+	.cra_list = LIST_HEAD_INIT(alg.cra_list),
+	.cra_u = {.compress = {
+			       .coa_init = lzf_compress_init,
+			       .coa_exit = lzf_compress_exit,
+			       .coa_compress = lzf_compress,
+			       .coa_decompress = lzf_decompress}}
+};
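+
+/* The consumer-side crypto_comp_*() sketch shown in deflate.c above
+ * applies unchanged here, requesting "lzf" instead of "deflate". */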
+
+static int __init init(void)
+{
+	return crypto_register_alg(&alg);
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_alg(&alg);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("LZF Compression Algorithm");
+MODULE_AUTHOR("Marc Alexander Lehmann & Nigel Cunningham");
diff -urN oldtree/drivers/acpi/osl.c newtree/drivers/acpi/osl.c
--- oldtree/drivers/acpi/osl.c	2006-03-08 18:47:59.115823750 +0000
+++ newtree/drivers/acpi/osl.c	2006-03-08 15:22:33.101496000 +0000
@@ -91,7 +91,7 @@
 		       "Access to PCI configuration space unavailable\n");
 		return AE_NULL_ENTRY;
 	}
-	kacpid_wq = create_singlethread_workqueue("kacpid");
+	kacpid_wq = create_nofreeze_singlethread_workqueue("kacpid");
 	BUG_ON(!kacpid_wq);
 
 	return AE_OK;
diff -urN oldtree/drivers/acpi/sleep/proc.c newtree/drivers/acpi/sleep/proc.c
--- oldtree/drivers/acpi/sleep/proc.c	2006-03-08 18:47:59.131824750 +0000
+++ newtree/drivers/acpi/sleep/proc.c	2006-03-08 15:22:33.101496000 +0000
@@ -58,6 +58,15 @@
 		goto Done;
 	}
 	state = simple_strtoul(str, NULL, 0);
+
+	/*
+	 * I used to put this after the CONFIG_SOFTWARE_SUSPEND
+	 * test, but people who compile in Suspend2 usually want
+	 * to use it instead of swsusp. --NC
+	 */
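+	/*
+	 * may_try_suspend2() presumably returns nonzero once Suspend2
+	 * has consumed the request for this sleep state; its definition
+	 * is added elsewhere in this patch (assumption).
+	 */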
+	if (may_try_suspend2(state))
+		goto Done;
+
 #ifdef CONFIG_SOFTWARE_SUSPEND
 	if (state == 4) {
 		error = software_suspend();
diff -urN oldtree/drivers/base/sys.c newtree/drivers/base/sys.c
--- oldtree/drivers/base/sys.c	2006-03-08 18:47:11.692860000 +0000
+++ newtree/drivers/base/sys.c	2006-03-08 15:22:33.105496250 +0000
@@ -302,16 +302,14 @@
 		cls->resume(dev);
 
 	/* Call auxiliary drivers next. */
-	list_for_each_entry(drv, &cls->drivers, entry) {
+	list_for_each_entry(drv, &cls->drivers, entry)
 		if (drv->resume)
 			drv->resume(dev);
-	}
 
 	/* Call global drivers. */
-	list_for_each_entry(drv, &sysdev_drivers, entry) {
+	list_for_each_entry(drv, &sysdev_drivers, entry)
 		if (drv->resume)
 			drv->resume(dev);
-	}
 }
 
 /**
diff -urN oldtree/drivers/char/hvc_console.c newtree/drivers/char/hvc_console.c
--- oldtree/drivers/char/hvc_console.c	2006-03-08 18:47:11.852870000 +0000
+++ newtree/drivers/char/hvc_console.c	2006-03-08 15:22:33.109496500 +0000
@@ -841,7 +841,7 @@
 
 	/* Always start the kthread because there can be hotplug vty adapters
 	 * added later. */
-	hvc_task = kthread_run(khvcd, NULL, "khvcd");
+	hvc_task = kthread_nofreeze_run(khvcd, NULL, "khvcd");
 	if (IS_ERR(hvc_task)) {
 		panic("Couldn't create kthread for console.\n");
 		put_tty_driver(hvc_driver);
diff -urN oldtree/drivers/char/hvcs.c newtree/drivers/char/hvcs.c
--- oldtree/drivers/char/hvcs.c	2006-03-08 18:47:59.215830000 +0000
+++ newtree/drivers/char/hvcs.c	2006-03-08 15:22:33.113496750 +0000
@@ -1405,7 +1405,7 @@
 		return -ENOMEM;
 	}
 
-	hvcs_task = kthread_run(khvcsd, NULL, "khvcsd");
+	hvcs_task = kthread_nofreeze_run(khvcsd, NULL, "khvcsd");
 	if (IS_ERR(hvcs_task)) {
 		printk(KERN_ERR "HVCS: khvcsd creation failed.  Driver not loaded.\n");
 		kfree(hvcs_pi_buff);
diff -urN oldtree/drivers/char/hvcs.c.orig newtree/drivers/char/hvcs.c.orig
--- oldtree/drivers/char/hvcs.c.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/drivers/char/hvcs.c.orig	2006-03-08 15:21:14.720597500 +0000
@@ -0,0 +1,1651 @@
+/*
+ * IBM eServer Hypervisor Virtual Console Server Device Driver
+ * Copyright (C) 2003, 2004 IBM Corp.
+ *  Ryan S. Arnold (rsa@us.ibm.com)
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ * Author(s) :  Ryan S. Arnold <rsa@us.ibm.com>
+ *
+ * This is the device driver for the IBM Hypervisor Virtual Console Server,
+ * "hvcs".  The IBM hvcs provides a tty driver interface to allow Linux
+ * user space applications access to the system consoles of logically
+ * partitioned operating systems, e.g. Linux, running on the same partitioned
+ * Power5 ppc64 system.  Physical hardware consoles per partition are not
+ * practical on this hardware so system consoles are accessed by this driver
+ * using inter-partition firmware interfaces to virtual terminal devices.
+ *
+ * A vty is known to the HMC as a "virtual serial server adapter".  It is a
+ * virtual terminal device that is created by firmware upon partition creation
+ * to act as a partitioned OS's console device.
+ *
+ * Firmware dynamically (via hotplug) exposes vty-servers to a running ppc64
+ * Linux system upon their creation by the HMC or their exposure during boot.
+ * The non-user interactive backend of this driver is implemented as a vio
+ * device driver so that it can receive notification of vty-server lifetimes
+ * after it registers with the vio bus to handle vty-server probe and remove
+ * callbacks.
+ *
+ * Many vty-servers can be configured to connect to one vty, but a vty can
+ * only be actively connected to by a single vty-server, in any manner, at one
+ * time.  If the HMC is currently hosting the console for a target Linux
+ * partition; attempts to open the tty device to the partition's console using
+ * the hvcs on any partition will return -EBUSY with every open attempt until
+ * the HMC frees the connection between its vty-server and the desired
+ * partition's vty device.  Conversely, a vty-server may only be connected to
+ * a single vty at one time even though it may have several configured vty
+ * partner possibilities.
+ *
+ * Firmware does not provide notification of vty partner changes to this
+ * driver.  This means that an HMC Super Admin may add or remove partner vtys
+ * from a vty-server's partner list but the changes will not be signaled to
+ * the vty-server.  Firmware only notifies the driver when a vty-server is
+ * added or removed from the system.  To compensate for this deficiency, this
+ * driver implements a sysfs update attribute which provides a method for
+ * rescanning partner information upon a user's request.
+ *
+ * Each vty-server, prior to being exposed to this driver is reference counted
+ * using the 2.6 Linux kernel kobject construct.  This kobject is also used by
+ * the vio bus to provide a vio device sysfs entry that this driver attaches
+ * device specific attributes to, including partner information.  The vio bus
+ * framework also provides a sysfs entry for each vio driver.  The hvcs driver
+ * provides driver attributes in this entry.
+ *
+ * For direction on installation and usage of this driver please reference
+ * Documentation/powerpc/hvcs.txt.
+ */
+
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/kobject.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/major.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/stat.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+#include <asm/hvconsole.h>
+#include <asm/hvcserver.h>
+#include <asm/uaccess.h>
+#include <asm/vio.h>
+
+/*
+ * 1.3.0 -> 1.3.1 In hvcs_open memset(..,0x00,..) instead of memset(..,0x3F,00).
+ * Removed braces around single statements following conditionals.  Removed '=
+ * 0' after static int declarations since these default to zero.  Removed
+ * list_for_each_safe() and replaced with list_for_each_entry() in
+ * hvcs_get_by_index().  The 'safe' version is un-needed now that the driver is
+ * using spinlocks.  Changed spin_lock_irqsave() to spin_lock() when locking
+ * hvcs_structs_lock and hvcs_pi_lock since these are not touched in an int
+ * handler.  Initialized hvcs_structs_lock and hvcs_pi_lock to
+ * SPIN_LOCK_UNLOCKED at declaration time rather than in hvcs_module_init().
+ * Added spin_lock around list_del() in destroy_hvcs_struct() to protect the
+ * list traversals from a deletion.  Removed '= NULL' from pointer declaration
+ * statements since they are initialized NULL by default.  Removed wmb()
+ * instances from hvcs_try_write().  They probably aren't needed with locking in
+ * place.  Added check and cleanup for hvcs_pi_buff = kmalloc() in
+ * hvcs_module_init().  Exposed hvcs_struct.index via a sysfs attribute so that
+ * the coupling between /dev/hvcs* and a vty-server can be automatically
+ * determined.  Moved kobject_put() in hvcs_open outside of the
+ * spin_unlock_irqrestore().
+ *
+ * 1.3.1 -> 1.3.2 Changed method for determining hvcs_struct->index and had it
+ * align with how the tty layer always assigns the lowest index available.  This
+ * change resulted in a list of ints that denotes which indexes are available.
+ * Device additions and removals use the new hvcs_get_index() and
+ * hvcs_return_index() helper functions.  The list is created with
+ * hvcs_alloc_index_list() and it is destroyed with hvcs_free_index_list().
+ * Without these fixes hotplug vty-server adapter support goes crazy with this
+ * driver if the user removes a vty-server adapter.  Moved free_irq() outside of
+ * the hvcs_final_close() function in order to get it out of the spinlock.
+ * Rearranged hvcs_close().  Cleaned up some printks and did some housekeeping
+ * on the changelog.  Removed local CLC_LENGTH and used HVCS_CLC_LENGTH from
+ * include/asm-powerpc/hvcserver.h 
+ *
+ * 1.3.2 -> 1.3.3 Replaced yield() in hvcs_close() with tty_wait_until_sent() to
+ * prevent possible lockup with realtime scheduling as similarly pointed out by
+ * akpm in hvc_console.  The change resulted in the removal of hvcs_final_close()
+ * to reorder cleanup operations and prevent discarding of pending data during
+ * an hvcs_close().  Removed spinlock protection of hvcs_struct data members in
+ * hvcs_write_room() and hvcs_chars_in_buffer() because they aren't needed.
+ */
+
+#define HVCS_DRIVER_VERSION "1.3.3"
+
+MODULE_AUTHOR("Ryan S. Arnold <rsa@us.ibm.com>");
+MODULE_DESCRIPTION("IBM hvcs (Hypervisor Virtual Console Server) Driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(HVCS_DRIVER_VERSION);
+
+/*
+ * Wait this long per iteration while trying to push buffered data to the
+ * hypervisor before allowing the tty to complete a close operation.
+ */
+#define HVCS_CLOSE_WAIT (HZ/100) /* 1/10 of a second */
+
+/*
+ * Since the Linux TTY code does not currently (2-04-2004) support dynamic
+ * addition of tty derived devices and we shouldn't allocate thousands of
+ * tty_device pointers when the number of vty-server & vty partner connections
+ * will most often be much lower than this, we'll arbitrarily allocate
+ * HVCS_DEFAULT_SERVER_ADAPTERS tty_structs and cdev's by default when we
+ * register the tty_driver. This can be overridden using an insmod parameter.
+ */
+#define HVCS_DEFAULT_SERVER_ADAPTERS	64
+
+/*
+ * The user can't insmod with more than HVCS_MAX_SERVER_ADAPTERS hvcs device
+ * nodes as a sanity check.  Theoretically there can be over 1 Billion
+ * vty-server & vty partner connections.
+ */
+#define HVCS_MAX_SERVER_ADAPTERS	1024
+
+/*
+ * We let Linux assign us a major number and we start the minors at zero.  There
+ * is no intuitive mapping between minor number and the target vty-server
+ * adapter except that each new vty-server adapter is always assigned to the
+ * smallest minor number available.
+ */
+#define HVCS_MINOR_START	0
+
+/*
+ * The hcall interface involves putting 8 chars into each of two registers.
+ * We load up those 2 registers (in arch/powerpc/platforms/pseries/hvconsole.c)
+ * by casting char[16] to long[2].  It would work without __ALIGNED__, but
+ * would be a little (tiny) bit slower, because an unaligned load is slower
+ * than an aligned load.
+ */
+#define __ALIGNED__	__attribute__((__aligned__(8)))
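+
+/*
+ * Sketch of the arch side (assumed from pseries hvconsole.c): the
+ * aligned buffer is reinterpreted as two longs for the hcall --
+ *
+ *	unsigned long *lbuf = (unsigned long *)buf;
+ *	plpar_hcall_norets(H_PUT_TERM_CHAR, unit_address,
+ *			   count, lbuf[0], lbuf[1]);
+ */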
+
+/*
+ * How much data can firmware send with each hvc_put_chars()?  Maybe this
+ * should be moved into an architecture specific area.
+ */
+#define HVCS_BUFF_LEN	16
+
+/*
+ * This is the maximum amount of data we'll let the user send us (hvcs_write) at
+ * once in a chunk as a sanity check.
+ */
+#define HVCS_MAX_FROM_USER	4096
+
+/*
+ * Be careful when adding flags to this line discipline.  Don't add anything
+ * that will cause echoing or we'll go into recursive loop echoing chars back
+ * and forth with the console drivers.
+ */
+static struct termios hvcs_tty_termios = {
+	.c_iflag = IGNBRK | IGNPAR,
+	.c_oflag = OPOST,
+	.c_cflag = B38400 | CS8 | CREAD | HUPCL,
+	.c_cc = INIT_C_CC
+};
+
+/*
+ * This value is used to take the place of a command line parameter when the
+ * module is inserted.  It starts as -1 and stays as such if the user doesn't
+ * specify a module insmod parameter.  If they DO specify one then it is set to
+ * the value of the integer passed in.
+ */
+static int hvcs_parm_num_devs = -1;
+module_param(hvcs_parm_num_devs, int, 0);
+
+char hvcs_driver_name[] = "hvcs";
+char hvcs_device_node[] = "hvcs";
+char hvcs_driver_string[]
+	= "IBM hvcs (Hypervisor Virtual Console Server) Driver";
+
+/* Status of partner info rescan triggered via sysfs. */
+static int hvcs_rescan_status;
+
+static struct tty_driver *hvcs_tty_driver;
+
+/*
+ * In order to be somewhat sane this driver always associates the hvcs_struct
+ * index element with the numerically equal tty->index.  This means that a
+ * hotplugged vty-server adapter will always map to the lowest index valued
+ * device node.  If vty-servers were hotplug removed from the system and then
+ * new ones added the new vty-server may have the largest slot number of all
+ * the vty-server adapters in the partition but it may have the lowest dev node
+ * index of all the adapters due to the hole left by the hotplug removed
+ * adapter.  There are a set of functions provided to get the lowest index for
+ * a new device as well as return the index to the list.  This list is allocated
+ * with a number of elements equal to the number of device nodes requested when
+ * the module was inserted.
+ */
+static int *hvcs_index_list;
+
+/*
+ * How large is the list?  This is kept for traversal since the list is
+ * dynamically created.
+ */
+static int hvcs_index_count;
+
+/*
+ * Used by the khvcsd to pick up I/O operations when the kernel_thread is
+ * already awake but potentially shifted to TASK_INTERRUPTIBLE state.
+ */
+static int hvcs_kicked;
+
+/*
+ * Used by the kthread construct for task operations like waking the sleeping
+ * thread and stopping the kthread.
+ */
+static struct task_struct *hvcs_task;
+
+/*
+ * We allocate this for the use of all of the hvcs_structs when they fetch
+ * partner info.
+ */
+static unsigned long *hvcs_pi_buff;
+
+/* Only allow one hvcs_struct to use the hvcs_pi_buff at a time. */
+static DEFINE_SPINLOCK(hvcs_pi_lock);
+
+/* One vty-server per hvcs_struct */
+struct hvcs_struct {
+	spinlock_t lock;
+
+	/*
+	 * This index identifies this hvcs device as the complement to a
+	 * specific tty index.
+	 */
+	unsigned int index;
+
+	struct tty_struct *tty;
+	unsigned int open_count;
+
+	/*
+	 * Used to tell the driver kernel_thread what operations need to take
+	 * place upon this hvcs_struct instance.
+	 */
+	int todo_mask;
+
+	/*
+	 * This buffer is required so that when hvcs_write_room() reports that
+	 * it can send HVCS_BUFF_LEN characters that it will buffer the full
+	 * HVCS_BUFF_LEN characters if need be.  This is essential for opost
+	 * writes since they do not do high level buffering and expect to be
+	 * able to send what the driver commits to sending buffering
+	 * [e.g. tab to space conversions in n_tty.c opost()].
+	 */
+	char buffer[HVCS_BUFF_LEN];
+	int chars_in_buffer;
+
+	/*
+	 * Any variable below the kobject is valid before a tty is connected and
+	 * stays valid after the tty is disconnected.  These shouldn't be
+	 * whacked until the kobject refcount reaches zero, though some entries
+	 * may be changed via sysfs initiatives.
+	 */
+	struct kobject kobj; /* ref count & hvcs_struct lifetime */
+	int connected; /* is the vty-server currently connected to a vty? */
+	uint32_t p_unit_address; /* partner unit address */
+	uint32_t p_partition_ID; /* partner partition ID */
+	char p_location_code[HVCS_CLC_LENGTH + 1]; /* CLC + Null Term */
+	struct list_head next; /* list management */
+	struct vio_dev *vdev;
+};
+
+/* Required to back map a kobject to its containing object */
+#define from_kobj(kobj) container_of(kobj, struct hvcs_struct, kobj)
+
+static struct list_head hvcs_structs = LIST_HEAD_INIT(hvcs_structs);
+static DEFINE_SPINLOCK(hvcs_structs_lock);
+
+static void hvcs_unthrottle(struct tty_struct *tty);
+static void hvcs_throttle(struct tty_struct *tty);
+static irqreturn_t hvcs_handle_interrupt(int irq, void *dev_instance,
+		struct pt_regs *regs);
+
+static int hvcs_write(struct tty_struct *tty,
+		const unsigned char *buf, int count);
+static int hvcs_write_room(struct tty_struct *tty);
+static int hvcs_chars_in_buffer(struct tty_struct *tty);
+
+static int hvcs_has_pi(struct hvcs_struct *hvcsd);
+static void hvcs_set_pi(struct hvcs_partner_info *pi,
+		struct hvcs_struct *hvcsd);
+static int hvcs_get_pi(struct hvcs_struct *hvcsd);
+static int hvcs_rescan_devices_list(void);
+
+static int hvcs_partner_connect(struct hvcs_struct *hvcsd);
+static void hvcs_partner_free(struct hvcs_struct *hvcsd);
+
+static int hvcs_enable_device(struct hvcs_struct *hvcsd,
+		uint32_t unit_address, unsigned int irq, struct vio_dev *dev);
+
+static void destroy_hvcs_struct(struct kobject *kobj);
+static int hvcs_open(struct tty_struct *tty, struct file *filp);
+static void hvcs_close(struct tty_struct *tty, struct file *filp);
+static void hvcs_hangup(struct tty_struct * tty);
+
+static void hvcs_create_device_attrs(struct hvcs_struct *hvcsd);
+static void hvcs_remove_device_attrs(struct vio_dev *vdev);
+static void hvcs_create_driver_attrs(void);
+static void hvcs_remove_driver_attrs(void);
+
+static int __devinit hvcs_probe(struct vio_dev *dev,
+		const struct vio_device_id *id);
+static int __devexit hvcs_remove(struct vio_dev *dev);
+static int __init hvcs_module_init(void);
+static void __exit hvcs_module_exit(void);
+
+#define HVCS_SCHED_READ	0x00000001
+#define HVCS_QUICK_READ	0x00000002
+#define HVCS_TRY_WRITE	0x00000004
+#define HVCS_READ_MASK	(HVCS_SCHED_READ | HVCS_QUICK_READ)
+
+static void hvcs_kick(void)
+{
+	hvcs_kicked = 1;
+	wmb();
+	wake_up_process(hvcs_task);
+}
+
+static void hvcs_unthrottle(struct tty_struct *tty)
+{
+	struct hvcs_struct *hvcsd = tty->driver_data;
+	unsigned long flags;
+
+	spin_lock_irqsave(&hvcsd->lock, flags);
+	hvcsd->todo_mask |= HVCS_SCHED_READ;
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+	hvcs_kick();
+}
+
+static void hvcs_throttle(struct tty_struct *tty)
+{
+	struct hvcs_struct *hvcsd = tty->driver_data;
+	unsigned long flags;
+
+	spin_lock_irqsave(&hvcsd->lock, flags);
+	vio_disable_interrupts(hvcsd->vdev);
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+}
+
+/*
+ * If the device is being removed we don't have to worry about this interrupt
+ * handler taking any further interrupts because they are disabled which means
+ * the hvcs_struct will always be valid in this handler.
+ */
+static irqreturn_t hvcs_handle_interrupt(int irq, void *dev_instance,
+		struct pt_regs *regs)
+{
+	struct hvcs_struct *hvcsd = dev_instance;
+
+	spin_lock(&hvcsd->lock);
+	vio_disable_interrupts(hvcsd->vdev);
+	hvcsd->todo_mask |= HVCS_SCHED_READ;
+	spin_unlock(&hvcsd->lock);
+	hvcs_kick();
+
+	return IRQ_HANDLED;
+}
+
+/* This function must be called with the hvcsd->lock held */
+static void hvcs_try_write(struct hvcs_struct *hvcsd)
+{
+	uint32_t unit_address = hvcsd->vdev->unit_address;
+	struct tty_struct *tty = hvcsd->tty;
+	int sent;
+
+	if (hvcsd->todo_mask & HVCS_TRY_WRITE) {
+		/* won't send partial writes */
+		sent = hvc_put_chars(unit_address,
+				&hvcsd->buffer[0],
+				hvcsd->chars_in_buffer );
+		if (sent > 0) {
+			hvcsd->chars_in_buffer = 0;
+			/* wmb(); */
+			hvcsd->todo_mask &= ~(HVCS_TRY_WRITE);
+			/* wmb(); */
+
+			/*
+			 * We are still obligated to deliver the data to the
+			 * hypervisor even if the tty has been closed because
+			 * we committed to delivering it.  But don't try to wake
+			 * a non-existent tty.
+			 */
+			if (tty) {
+				tty_wakeup(tty);
+			}
+		}
+	}
+}
+
+static int hvcs_io(struct hvcs_struct *hvcsd)
+{
+	uint32_t unit_address;
+	struct tty_struct *tty;
+	char buf[HVCS_BUFF_LEN] __ALIGNED__;
+	unsigned long flags;
+	int got = 0;
+	int i;
+
+	spin_lock_irqsave(&hvcsd->lock, flags);
+
+	unit_address = hvcsd->vdev->unit_address;
+	tty = hvcsd->tty;
+
+	hvcs_try_write(hvcsd);
+
+	if (!tty || test_bit(TTY_THROTTLED, &tty->flags)) {
+		hvcsd->todo_mask &= ~(HVCS_READ_MASK);
+		goto bail;
+	} else if (!(hvcsd->todo_mask & (HVCS_READ_MASK)))
+		goto bail;
+
+	/* remove the read masks */
+	hvcsd->todo_mask &= ~(HVCS_READ_MASK);
+
+	if (tty_buffer_request_room(tty, HVCS_BUFF_LEN) >= HVCS_BUFF_LEN) {
+		got = hvc_get_chars(unit_address,
+				&buf[0],
+				HVCS_BUFF_LEN);
+		tty_insert_flip_string(tty, buf, got);
+	}
+
+	/* Give the TTY time to process the data we just sent. */
+	if (got)
+		hvcsd->todo_mask |= HVCS_QUICK_READ;
+
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+	/* This is synchronous because tty->low_latency == 1 */
+	if (got)
+		tty_flip_buffer_push(tty);
+
+	if (!got) {
+		/* Do this _after_ the flip_buffer_push */
+		spin_lock_irqsave(&hvcsd->lock, flags);
+		vio_enable_interrupts(hvcsd->vdev);
+		spin_unlock_irqrestore(&hvcsd->lock, flags);
+	}
+
+	return hvcsd->todo_mask;
+
+ bail:
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+	return hvcsd->todo_mask;
+}
+
+static int khvcsd(void *unused)
+{
+	struct hvcs_struct *hvcsd;
+	int hvcs_todo_mask;
+
+	__set_current_state(TASK_RUNNING);
+
+	do {
+		hvcs_todo_mask = 0;
+		hvcs_kicked = 0;
+		wmb();
+
+		spin_lock(&hvcs_structs_lock);
+		list_for_each_entry(hvcsd, &hvcs_structs, next) {
+			hvcs_todo_mask |= hvcs_io(hvcsd);
+		}
+		spin_unlock(&hvcs_structs_lock);
+
+		/*
+		 * If any of the hvcs adapters want to try a write or quick read
+		 * don't schedule(), yield a smidgen then execute the hvcs_io
+		 * thread again for those that want the write.
+		 */
+		if (hvcs_todo_mask & (HVCS_TRY_WRITE | HVCS_QUICK_READ)) {
+			yield();
+			continue;
+		}
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!hvcs_kicked)
+			schedule();
+		__set_current_state(TASK_RUNNING);
+	} while (!kthread_should_stop());
+
+	return 0;
+}
+
+static struct vio_device_id hvcs_driver_table[] __devinitdata= {
+	{"serial-server", "hvterm2"},
+	{ "", "" }
+};
+MODULE_DEVICE_TABLE(vio, hvcs_driver_table);
+
+static void hvcs_return_index(int index)
+{
+	/* Paranoia check */
+	if (!hvcs_index_list)
+		return;
+	if (index < 0 || index >= hvcs_index_count)
+		return;
+	if (hvcs_index_list[index] == -1)
+		return;
+	else
+		hvcs_index_list[index] = -1;
+}
+
+/* callback when the kobject ref count reaches zero */
+static void destroy_hvcs_struct(struct kobject *kobj)
+{
+	struct hvcs_struct *hvcsd = from_kobj(kobj);
+	struct vio_dev *vdev;
+	unsigned long flags;
+
+	spin_lock(&hvcs_structs_lock);
+	spin_lock_irqsave(&hvcsd->lock, flags);
+
+	/* the list_del poisons the pointers */
+	list_del(&(hvcsd->next));
+
+	if (hvcsd->connected == 1) {
+		hvcs_partner_free(hvcsd);
+		printk(KERN_INFO "HVCS: Closed vty-server@%X and"
+				" partner vty@%X:%d connection.\n",
+				hvcsd->vdev->unit_address,
+				hvcsd->p_unit_address,
+				(uint32_t)hvcsd->p_partition_ID);
+	}
+	printk(KERN_INFO "HVCS: Destroyed hvcs_struct for vty-server@%X.\n",
+			hvcsd->vdev->unit_address);
+
+	vdev = hvcsd->vdev;
+	hvcsd->vdev = NULL;
+
+	hvcsd->p_unit_address = 0;
+	hvcsd->p_partition_ID = 0;
+	hvcs_return_index(hvcsd->index);
+	memset(&hvcsd->p_location_code[0], 0x00, HVCS_CLC_LENGTH + 1);
+
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+	spin_unlock(&hvcs_structs_lock);
+
+	hvcs_remove_device_attrs(vdev);
+
+	kfree(hvcsd);
+}
+
+static struct kobj_type hvcs_kobj_type = {
+	.release = destroy_hvcs_struct,
+};
+
+static int hvcs_get_index(void)
+{
+	int i;
+	/* Paranoia check */
+	if (!hvcs_index_list) {
+		printk(KERN_ERR "HVCS: hvcs_index_list NOT valid!.\n");
+		return -EFAULT;
+	}
+	/* Find the numerically lowest first free index. */
+	for (i = 0; i < hvcs_index_count; i++) {
+		if (hvcs_index_list[i] == -1) {
+			hvcs_index_list[i] = 0;
+			return i;
+		}
+	}
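+	/* No free index was found. */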
+	return -1;
+}
+
+static int __devinit hvcs_probe(
+	struct vio_dev *dev,
+	const struct vio_device_id *id)
+{
+	struct hvcs_struct *hvcsd;
+	int index;
+
+	if (!dev || !id) {
+		printk(KERN_ERR "HVCS: probed with invalid parameter.\n");
+		return -EPERM;
+	}
+
+	/* Grab the index early so failure here requires no cleanup. */
+	index = hvcs_get_index();
+	if (index < 0) {
+		return -EFAULT;
+	}
+
+	hvcsd = kmalloc(sizeof(*hvcsd), GFP_KERNEL);
+	if (!hvcsd)
+		return -ENODEV;
+
+	/* hvcsd->tty is zeroed out with the memset */
+	memset(hvcsd, 0x00, sizeof(*hvcsd));
+
+	spin_lock_init(&hvcsd->lock);
+	/* Automatically incs the refcount the first time */
+	kobject_init(&hvcsd->kobj);
+	/* Set up the callback for terminating the hvcs_struct's life */
+	hvcsd->kobj.ktype = &hvcs_kobj_type;
+
+	hvcsd->vdev = dev;
+	dev->dev.driver_data = hvcsd;
+
+	hvcsd->index = index;
+
+	/* hvcsd->index = ++hvcs_struct_count; */
+	hvcsd->chars_in_buffer = 0;
+	hvcsd->todo_mask = 0;
+	hvcsd->connected = 0;
+
+	/*
+	 * This will populate the hvcs_struct's partner info fields for the
+	 * first time.
+	 */
+	if (hvcs_get_pi(hvcsd)) {
+		printk(KERN_ERR "HVCS: Failed to fetch partner"
+			" info for vty-server@%X on device probe.\n",
+			hvcsd->vdev->unit_address);
+	}
+
+	/*
+	 * If a user app opens a tty that corresponds to this vty-server before
+	 * the hvcs_struct has been added to the devices list then the user app
+	 * will get -ENODEV.
+	 */
+
+	spin_lock(&hvcs_structs_lock);
+
+	list_add_tail(&(hvcsd->next), &hvcs_structs);
+
+	spin_unlock(&hvcs_structs_lock);
+
+	hvcs_create_device_attrs(hvcsd);
+
+	printk(KERN_INFO "HVCS: vty-server@%X added to the vio bus.\n", dev->unit_address);
+
+	/*
+	 * DON'T enable interrupts here because there is no user to receive the
+	 * data.
+	 */
+	return 0;
+}
+
+static int __devexit hvcs_remove(struct vio_dev *dev)
+{
+	struct hvcs_struct *hvcsd = dev->dev.driver_data;
+	unsigned long flags;
+	struct kobject *kobjp;
+	struct tty_struct *tty;
+
+	if (!hvcsd)
+		return -ENODEV;
+
+	/* By this time the vty-server won't be getting any more interrupts */
+
+	spin_lock_irqsave(&hvcsd->lock, flags);
+
+	tty = hvcsd->tty;
+
+	kobjp = &hvcsd->kobj;
+
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+
+	/*
+	 * Let the last holder of this object cause it to be removed, which
+	 * would probably be tty_hangup below.
+	 */
+	kobject_put(kobjp);
+
+	/*
+	 * The hangup is a scheduled function which will auto chain call
+	 * hvcs_hangup.  The tty should always be valid at this time unless a
+	 * simultaneous tty close already cleaned up the hvcs_struct.
+	 */
+	if (tty)
+		tty_hangup(tty);
+
+	printk(KERN_INFO "HVCS: vty-server@%X removed from the"
+			" vio bus.\n", dev->unit_address);
+	return 0;
+};
+
+static struct vio_driver hvcs_vio_driver = {
+	.id_table	= hvcs_driver_table,
+	.probe		= hvcs_probe,
+	.remove		= hvcs_remove,
+	.driver		= {
+		.name	= hvcs_driver_name,
+		.owner	= THIS_MODULE,
+	}
+};
+
+/* Only called from hvcs_get_pi please */
+static void hvcs_set_pi(struct hvcs_partner_info *pi, struct hvcs_struct *hvcsd)
+{
+	int clclength;
+
+	hvcsd->p_unit_address = pi->unit_address;
+	hvcsd->p_partition_ID  = pi->partition_ID;
+	clclength = strlen(&pi->location_code[0]);
+	if (clclength > HVCS_CLC_LENGTH)
+		clclength = HVCS_CLC_LENGTH;
+
+	/* copy the null-term char too */
+	strncpy(&hvcsd->p_location_code[0],
+			&pi->location_code[0], clclength + 1);
+}
+
+/*
+ * Traverse the list and add the partner info that is found to the hvcs_struct
+ * entry. NOTE: At this time partner info queries return a single entry, but
+ * in the future there may be multiple partner info entries per vty-server,
+ * in which case that list should be zeroed out and reset.  If for some
+ * reason an old version of this driver is used and there IS more than one
+ * partner info entry, then hvcsd->p_* will hold the last partner info data
+ * from the firmware query.  A good way to update this code would be to
+ * replace the three partner info fields in hvcs_struct with a list of
+ * hvcs_partner_info instances.
+ *
+ * This function must be called with the hvcsd->lock held.
+ */
+static int hvcs_get_pi(struct hvcs_struct *hvcsd)
+{
+	struct hvcs_partner_info *pi;
+	uint32_t unit_address = hvcsd->vdev->unit_address;
+	struct list_head head;
+	int retval;
+
+	spin_lock(&hvcs_pi_lock);
+	if (!hvcs_pi_buff) {
+		spin_unlock(&hvcs_pi_lock);
+		return -EFAULT;
+	}
+	retval = hvcs_get_partner_info(unit_address, &head, hvcs_pi_buff);
+	spin_unlock(&hvcs_pi_lock);
+	if (retval) {
+		printk(KERN_ERR "HVCS: Failed to fetch partner"
+			" info for vty-server@%x.\n", unit_address);
+		return retval;
+	}
+
+	/* nixes the values if the partner vty went away */
+	hvcsd->p_unit_address = 0;
+	hvcsd->p_partition_ID = 0;
+
+	list_for_each_entry(pi, &head, node)
+		hvcs_set_pi(pi, hvcsd);
+
+	hvcs_free_partner_info(&head);
+	return 0;
+}
+
+/*
+ * This function is executed by the driver "rescan" sysfs entry.  It shouldn't
+ * be executed elsewhere, in order to prevent deadlock issues.
+ */
+static int hvcs_rescan_devices_list(void)
+{
+	struct hvcs_struct *hvcsd;
+	unsigned long flags;
+
+	spin_lock(&hvcs_structs_lock);
+
+	list_for_each_entry(hvcsd, &hvcs_structs, next) {
+		spin_lock_irqsave(&hvcsd->lock, flags);
+		hvcs_get_pi(hvcsd);
+		spin_unlock_irqrestore(&hvcsd->lock, flags);
+	}
+
+	spin_unlock(&hvcs_structs_lock);
+
+	return 0;
+}
+
+/*
+ * Farm this off into its own function because it could be more complex once
+ * multiple partners support is added. This function should be called with
+ * the hvcsd->lock held.
+ */
+static int hvcs_has_pi(struct hvcs_struct *hvcsd)
+{
+	if ((!hvcsd->p_unit_address) || (!hvcsd->p_partition_ID))
+		return 0;
+	return 1;
+}
+
+/*
+ * NOTE: It is possible that the super admin removed a partner vty and then
+ * added a different vty as the new partner.
+ *
+ * This function must be called with the hvcsd->lock held.
+ */
+static int hvcs_partner_connect(struct hvcs_struct *hvcsd)
+{
+	int retval;
+	unsigned int unit_address = hvcsd->vdev->unit_address;
+
+	/*
+	 * If there wasn't any pi when the device was added it doesn't mean
+	 * there isn't any now.  This driver isn't notified when a new partner
+	 * vty is added to a vty-server so we discover changes on our own.
+	 * Please see comments in hvcs_register_connection() for justification
+	 * of this bizarre code.
+	 */
+	retval = hvcs_register_connection(unit_address,
+			hvcsd->p_partition_ID,
+			hvcsd->p_unit_address);
+	if (!retval) {
+		hvcsd->connected = 1;
+		return 0;
+	} else if (retval != -EINVAL)
+		return retval;
+
+	/*
+	 * As per the spec re-get the pi and try again if -EINVAL after the
+	 * first connection attempt.
+	 */
+	if (hvcs_get_pi(hvcsd))
+		return -ENOMEM;
+
+	if (!hvcs_has_pi(hvcsd))
+		return -ENODEV;
+
+	retval = hvcs_register_connection(unit_address,
+			hvcsd->p_partition_ID,
+			hvcsd->p_unit_address);
+	if (retval != -EINVAL) {
+		hvcsd->connected = 1;
+		return retval;
+	}
+
+	/*
+	 * EBUSY is the most likely scenario though the vty could have been
+	 * removed or there really could be an hcall error due to the parameter
+	 * data but thanks to ambiguous firmware return codes we can't really
+	 * tell.
+	 */
+	printk(KERN_INFO "HVCS: vty-server or partner"
+			" vty is busy.  Try again later.\n");
+	return -EBUSY;
+}
+
+/* This function must be called with the hvcsd->lock held */
+static void hvcs_partner_free(struct hvcs_struct *hvcsd)
+{
+	int retval;
+	do {
+		retval = hvcs_free_connection(hvcsd->vdev->unit_address);
+	} while (retval == -EBUSY);
+	hvcsd->connected = 0;
+}
+
+/* This helper function must be called WITHOUT the hvcsd->lock held */
+static int hvcs_enable_device(struct hvcs_struct *hvcsd, uint32_t unit_address,
+		unsigned int irq, struct vio_dev *vdev)
+{
+	unsigned long flags;
+	int rc;
+
+	/*
+	 * It is possible that the vty-server was removed between the time that
+	 * the conn was registered and now.
+	 */
+	if (!(rc = request_irq(irq, &hvcs_handle_interrupt,
+				SA_INTERRUPT, "ibmhvcs", hvcsd))) {
+		/*
+		 * It is possible the vty-server was removed after the irq was
+		 * requested but before we have time to enable interrupts.
+		 */
+		if (vio_enable_interrupts(vdev) == H_Success)
+			return 0;
+		else {
+			printk(KERN_ERR "HVCS: int enable failed for"
+					" vty-server@%X.\n", unit_address);
+			free_irq(irq, hvcsd);
+		}
+	} else
+		printk(KERN_ERR "HVCS: irq req failed for"
+				" vty-server@%X.\n", unit_address);
+
+	spin_lock_irqsave(&hvcsd->lock, flags);
+	hvcs_partner_free(hvcsd);
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+
+	return rc;
+
+}
+
+/*
+ * This always increments the kobject ref count if the call is successful.
+ * Please remember to dec when you are done with the instance.
+ *
+ * NOTICE: Do NOT hold either the hvcs_struct.lock or hvcs_structs_lock when
+ * calling this function or you will get deadlock.
+ */
+struct hvcs_struct *hvcs_get_by_index(int index)
+{
+	struct hvcs_struct *hvcsd = NULL;
+	unsigned long flags;
+
+	spin_lock(&hvcs_structs_lock);
+	/* We can immediately discard OOB requests */
+	if (index >= 0 && index < HVCS_MAX_SERVER_ADAPTERS) {
+		list_for_each_entry(hvcsd, &hvcs_structs, next) {
+			spin_lock_irqsave(&hvcsd->lock, flags);
+			if (hvcsd->index == index) {
+				kobject_get(&hvcsd->kobj);
+				spin_unlock_irqrestore(&hvcsd->lock, flags);
+				spin_unlock(&hvcs_structs_lock);
+				return hvcsd;
+			}
+			spin_unlock_irqrestore(&hvcsd->lock, flags);
+		}
+		hvcsd = NULL;
+	}
+
+	spin_unlock(&hvcs_structs_lock);
+	return hvcsd;
+}
+
+/*
+ * This is invoked via the tty_open interface when a user app connects to the
+ * /dev node.
+ */
+static int hvcs_open(struct tty_struct *tty, struct file *filp)
+{
+	struct hvcs_struct *hvcsd;
+	int rc, retval = 0;
+	unsigned long flags;
+	unsigned int irq;
+	struct vio_dev *vdev;
+	unsigned long unit_address;
+	struct kobject *kobjp;
+
+	if (tty->driver_data)
+		goto fast_open;
+
+	/*
+	 * Is there a vty-server that shares the same index?
+	 * This function increments the kobject ref count.
+	 */
+	if (!(hvcsd = hvcs_get_by_index(tty->index))) {
+		printk(KERN_WARNING "HVCS: open failed, no device associated"
+				" with tty->index %d.\n", tty->index);
+		return -ENODEV;
+	}
+
+	spin_lock_irqsave(&hvcsd->lock, flags);
+
+	if (hvcsd->connected == 0)
+		if ((retval = hvcs_partner_connect(hvcsd)))
+			goto error_release;
+
+	hvcsd->open_count = 1;
+	hvcsd->tty = tty;
+	tty->driver_data = hvcsd;
+
+	/*
+	 * Set this driver to low latency so that we actually have a chance at
+	 * catching a throttled TTY after we flip_buffer_push.  Otherwise the
+	 * flush_to_async may not execute until after the kernel_thread has
+	 * yielded and resumed the next flip_buffer_push resulting in data
+	 * loss.
+	 */
+	tty->low_latency = 1;
+
+	memset(&hvcsd->buffer[0], 0x00, HVCS_BUFF_LEN);
+
+	/*
+	 * Save these in the spinlock for the enable operations that need them
+	 * outside of the spinlock.
+	 */
+	irq = hvcsd->vdev->irq;
+	vdev = hvcsd->vdev;
+	unit_address = hvcsd->vdev->unit_address;
+
+	hvcsd->todo_mask |= HVCS_SCHED_READ;
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+
+	/*
+	 * This must be done outside of the spinlock because it requests irqs
+	 * and will grab the spinlock and free the connection if it fails.
+	 */
+	if ((rc = hvcs_enable_device(hvcsd, unit_address, irq, vdev))) {
+		kobject_put(&hvcsd->kobj);
+		printk(KERN_WARNING "HVCS: enable device failed.\n");
+		return rc;
+	}
+
+	goto open_success;
+
+fast_open:
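+	/* Already open: just take another reference and reschedule a read. */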
+	hvcsd = tty->driver_data;
+
+	spin_lock_irqsave(&hvcsd->lock, flags);
+	if (!kobject_get(&hvcsd->kobj)) {
+		spin_unlock_irqrestore(&hvcsd->lock, flags);
+		printk(KERN_ERR "HVCS: Kobject of open"
+			" hvcs doesn't exist.\n");
+		return -EFAULT; /* Is this the right return value? */
+	}
+
+	hvcsd->open_count++;
+
+	hvcsd->todo_mask |= HVCS_SCHED_READ;
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+open_success:
+	hvcs_kick();
+
+	printk(KERN_INFO "HVCS: vty-server@%X connection opened.\n",
+		hvcsd->vdev->unit_address);
+
+	return 0;
+
+error_release:
+	kobjp = &hvcsd->kobj;
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+	kobject_put(&hvcsd->kobj);
+
+	printk(KERN_WARNING "HVCS: partner connect failed.\n");
+	return retval;
+}
+
+static void hvcs_close(struct tty_struct *tty, struct file *filp)
+{
+	struct hvcs_struct *hvcsd;
+	unsigned long flags;
+	struct kobject *kobjp;
+	int irq = NO_IRQ;
+
+	/*
+	 * Is someone trying to close the file associated with this device after
+	 * we have hung up?  If so, tty->driver_data wouldn't be valid.
+	 */
+	if (tty_hung_up_p(filp))
+		return;
+
+	/*
+	 * No driver_data means that this close was probably issued after a
+	 * failed hvcs_open by the tty layer's release_dev() api and we can just
+	 * exit cleanly.
+	 */
+	if (!tty->driver_data)
+		return;
+
+	hvcsd = tty->driver_data;
+
+	spin_lock_irqsave(&hvcsd->lock, flags);
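+	/* Save the kobject pointer to put after the lock is dropped. */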
+	kobjp = &hvcsd->kobj;
+	if (--hvcsd->open_count == 0) {
+
+		vio_disable_interrupts(hvcsd->vdev);
+
+		/*
+		 * NULL this early so that the kernel_thread doesn't try to
+		 * execute any operations on the TTY even though it is obligated
+		 * to deliver any pending I/O to the hypervisor.
+		 */
+		hvcsd->tty = NULL;
+
+		irq = hvcsd->vdev->irq;
+		spin_unlock_irqrestore(&hvcsd->lock, flags);
+
+		tty_wait_until_sent(tty, HVCS_CLOSE_WAIT);
+
+		/*
+		 * This line is important because it tells hvcs_open that this
+		 * device needs to be re-configured the next time hvcs_open is
+		 * called.
+		 */
+		tty->driver_data = NULL;
+
+		free_irq(irq, hvcsd);
+		kobject_put(kobjp);
+		return;
+	} else if (hvcsd->open_count < 0) {
+		printk(KERN_ERR "HVCS: vty-server@%X open_count: %d"
+				" is missmanaged.\n",
+		hvcsd->vdev->unit_address, hvcsd->open_count);
+	}
+
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+	kobject_put(kobjp);
+}
+
+static void hvcs_hangup(struct tty_struct * tty)
+{
+	struct hvcs_struct *hvcsd = tty->driver_data;
+	unsigned long flags;
+	int temp_open_count;
+	struct kobject *kobjp;
+	int irq = NO_IRQ;
+
+	spin_lock_irqsave(&hvcsd->lock, flags);
+	/* Preserve this so that we know how many kobject refs to put */
+	temp_open_count = hvcsd->open_count;
+
+	/*
+	 * Don't kobject put inside the spinlock because the destruction
+	 * callback may use the spinlock and it may get called before the
+	 * spinlock has been released.  Get a pointer to the kobject and
+	 * kobject_put on that after releasing the spinlock.
+	 */
+	kobjp = &hvcsd->kobj;
+
+	vio_disable_interrupts(hvcsd->vdev);
+
+	hvcsd->todo_mask = 0;
+
+	/* I don't think the tty needs the hvcs_struct pointer after a hangup */
+	hvcsd->tty->driver_data = NULL;
+	hvcsd->tty = NULL;
+
+	hvcsd->open_count = 0;
+
+	/*
+	 * This will drop any buffered data on the floor, which is OK in a
+	 * hangup scenario.
+	 */
+	memset(&hvcsd->buffer[0], 0x00, HVCS_BUFF_LEN);
+	hvcsd->chars_in_buffer = 0;
+
+	irq = hvcsd->vdev->irq;
+
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+
+	free_irq(irq, hvcsd);
+
+	/*
+	 * We need to kobject_put() for every open_count we have since the
+	 * tty_hangup() function doesn't invoke a close per open connection on a
+	 * non-console device.
+	 */
+	while (temp_open_count) {
+		--temp_open_count;
+		/*
+		 * The final put will trigger destruction of the hvcs_struct.
+		 * NOTE:  If this hangup was signaled from user space then the
+		 * final put will never happen.
+		 */
+		kobject_put(kobjp);
+	}
+}
+
+/*
+ * NOTE: This is almost always from_user since user-level apps interact with the
+ * /dev nodes. I'm trusting that if hvcs_write gets called and interrupted by
+ * hvcs_remove (which removes the target device and executes tty_hangup()) that
+ * tty_hangup will allow hvcs_write time to complete execution before it
+ * terminates our device.
+ */
+static int hvcs_write(struct tty_struct *tty,
+		const unsigned char *buf, int count)
+{
+	struct hvcs_struct *hvcsd = tty->driver_data;
+	unsigned int unit_address;
+	const unsigned char *charbuf;
+	unsigned long flags;
+	int total_sent = 0;
+	int tosend = 0;
+	int result = 0;
+
+	/*
+	 * If they don't check the return code off of their open they may
+	 * attempt this even if there is no connected device.
+	 */
+	if (!hvcsd)
+		return -ENODEV;
+
+	/* Reasonable size to prevent user level flooding */
+	if (count > HVCS_MAX_FROM_USER) {
+		printk(KERN_WARNING "HVCS write: count being truncated to"
+				" HVCS_MAX_FROM_USER.\n");
+		count = HVCS_MAX_FROM_USER;
+	}
+
+	charbuf = buf;
+
+	spin_lock_irqsave(&hvcsd->lock, flags);
+
+	/*
+	 * Somehow an open succeeded but the device was removed or the
+	 * connection terminated between the vty-server and partner vty in
+	 * the middle of a write operation?  This is a crummy place to do this
+	 * but we want to keep it all in the spinlock.
+	 */
+	if (hvcsd->open_count <= 0) {
+		spin_unlock_irqrestore(&hvcsd->lock, flags);
+		return -ENODEV;
+	}
+
+	unit_address = hvcsd->vdev->unit_address;
+
+	while (count > 0) {
+		tosend = min(count, (HVCS_BUFF_LEN - hvcsd->chars_in_buffer));
+		/*
+		 * No more space, this probably means that the last call to
+		 * hvcs_write() didn't succeed and the buffer was filled up.
+		 */
+		if (!tosend)
+			break;
+
+		memcpy(&hvcsd->buffer[hvcsd->chars_in_buffer],
+				&charbuf[total_sent],
+				tosend);
+
+		hvcsd->chars_in_buffer += tosend;
+
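+		/* result == 0 means the data was only buffered this pass. */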
+		result = 0;
+
+		/*
+		 * If this is true then we don't want to try writing to the
+		 * hypervisor because that is the kernel_threads job now.  We'll
+		 * just add to the buffer.
+		 */
+		if (!(hvcsd->todo_mask & HVCS_TRY_WRITE))
+			/* won't send partial writes */
+			result = hvc_put_chars(unit_address,
+					&hvcsd->buffer[0],
+					hvcsd->chars_in_buffer);
+
+		/*
+		 * Since we know we have enough room in hvcsd->buffer for
+		 * tosend we record that it was sent regardless of whether the
+		 * hypervisor actually took it because we have it buffered.
+		 */
+		total_sent += tosend;
+		count -= tosend;
+		if (result == 0) {
+			hvcsd->todo_mask |= HVCS_TRY_WRITE;
+			hvcs_kick();
+			break;
+		}
+
+		hvcsd->chars_in_buffer = 0;
+		/*
+		 * Test after the chars_in_buffer reset otherwise this could
+		 * deadlock our writes if hvc_put_chars fails.
+		 */
+		if (result < 0)
+			break;
+	}
+
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+
+	if (result == -1)
+		return -EIO;
+	else
+		return total_sent;
+}
+
+/*
+ * This is really asking how much we can guarantee that we can send or that we
+ * absolutely WILL BUFFER if we can't send it.  This driver MUST honor the
+ * return value, hence the reason for hvcs_struct buffering.
+ */
+static int hvcs_write_room(struct tty_struct *tty)
+{
+	struct hvcs_struct *hvcsd = tty->driver_data;
+
+	if (!hvcsd || hvcsd->open_count <= 0)
+		return 0;
+
+	return HVCS_BUFF_LEN - hvcsd->chars_in_buffer;
+}
+
+static int hvcs_chars_in_buffer(struct tty_struct *tty)
+{
+	struct hvcs_struct *hvcsd = tty->driver_data;
+
+	return hvcsd->chars_in_buffer;
+}
+
+static struct tty_operations hvcs_ops = {
+	.open = hvcs_open,
+	.close = hvcs_close,
+	.hangup = hvcs_hangup,
+	.write = hvcs_write,
+	.write_room = hvcs_write_room,
+	.chars_in_buffer = hvcs_chars_in_buffer,
+	.unthrottle = hvcs_unthrottle,
+	.throttle = hvcs_throttle,
+};
+
+static int hvcs_alloc_index_list(int n)
+{
+	int i;
+	hvcs_index_list = kmalloc(n * sizeof(hvcs_index_count), GFP_KERNEL);
+	if (!hvcs_index_list)
+		return -ENOMEM;
+	hvcs_index_count = n;
+	for (i = 0; i < hvcs_index_count; i++)
+		hvcs_index_list[i] = -1;
+	return 0;
+}
+
+static void hvcs_free_index_list(void)
+{
+	/* Paranoia check to be thorough. */
+	if (hvcs_index_list) {
+		kfree(hvcs_index_list);
+		hvcs_index_list = NULL;
+		hvcs_index_count = 0;
+	}
+}
+
+static int __init hvcs_module_init(void)
+{
+	int rc;
+	int num_ttys_to_alloc;
+
+	printk(KERN_INFO "Initializing %s\n", hvcs_driver_string);
+
+	/* Has the user specified an overload with an insmod param? */
+	if (hvcs_parm_num_devs <= 0 ||
+		(hvcs_parm_num_devs > HVCS_MAX_SERVER_ADAPTERS)) {
+		num_ttys_to_alloc = HVCS_DEFAULT_SERVER_ADAPTERS;
+	} else
+		num_ttys_to_alloc = hvcs_parm_num_devs;
+
+	hvcs_tty_driver = alloc_tty_driver(num_ttys_to_alloc);
+	if (!hvcs_tty_driver)
+		return -ENOMEM;
+
+	if (hvcs_alloc_index_list(num_ttys_to_alloc)) {
+		put_tty_driver(hvcs_tty_driver);
+		return -ENOMEM;
+	}
+
+	hvcs_tty_driver->owner = THIS_MODULE;
+
+	hvcs_tty_driver->driver_name = hvcs_driver_name;
+	hvcs_tty_driver->name = hvcs_device_node;
+	hvcs_tty_driver->devfs_name = hvcs_device_node;
+
+	/*
+	 * We'll let the system assign us a major number, indicated by leaving
+	 * it blank.
+	 */
+
+	hvcs_tty_driver->minor_start = HVCS_MINOR_START;
+	hvcs_tty_driver->type = TTY_DRIVER_TYPE_SYSTEM;
+
+	/*
+	 * We roll our own so that we DON'T ECHO.  We can't echo because the
+	 * device we are connecting to already echoes by default and this would
+	 * throw us into a horrible recursive echo-echo-echo loop.
+	 */
+	hvcs_tty_driver->init_termios = hvcs_tty_termios;
+	hvcs_tty_driver->flags = TTY_DRIVER_REAL_RAW;
+
+	tty_set_operations(hvcs_tty_driver, &hvcs_ops);
+
+	/*
+	 * The following call will result in sysfs entries that denote the
+	 * dynamically assigned major and minor numbers for our devices.
+	 */
+	if (tty_register_driver(hvcs_tty_driver)) {
+		printk(KERN_ERR "HVCS: registration "
+			" as a tty driver failed.\n");
+		hvcs_free_index_list();
+		put_tty_driver(hvcs_tty_driver);
+		return -EIO;
+	}
+
+	hvcs_pi_buff = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!hvcs_pi_buff) {
+		tty_unregister_driver(hvcs_tty_driver);
+		hvcs_free_index_list();
+		put_tty_driver(hvcs_tty_driver);
+		return -ENOMEM;
+	}
+
+	hvcs_task = kthread_run(khvcsd, NULL, "khvcsd");
+	if (IS_ERR(hvcs_task)) {
+		printk(KERN_ERR "HVCS: khvcsd creation failed.  Driver not loaded.\n");
+		kfree(hvcs_pi_buff);
+		tty_unregister_driver(hvcs_tty_driver);
+		hvcs_free_index_list();
+		put_tty_driver(hvcs_tty_driver);
+		return -EIO;
+	}
+
+	rc = vio_register_driver(&hvcs_vio_driver);
+
+	/*
+	 * This needs to be done AFTER the vio_register_driver() call or else
+	 * the kobjects won't be initialized properly.
+	 */
+	hvcs_create_driver_attrs();
+
+	printk(KERN_INFO "HVCS: driver module inserted.\n");
+
+	return rc;
+}
+
+static void __exit hvcs_module_exit(void)
+{
+	/*
+	 * This driver receives hvcs_remove callbacks for each device upon
+	 * module removal.
+	 */
+
+	/*
+	 * This synchronous operation will wake the khvcsd kthread if it is
+	 * asleep and will return when khvcsd has terminated.
+	 */
+	kthread_stop(hvcs_task);
+
+	spin_lock(&hvcs_pi_lock);
+	kfree(hvcs_pi_buff);
+	hvcs_pi_buff = NULL;
+	spin_unlock(&hvcs_pi_lock);
+
+	hvcs_remove_driver_attrs();
+
+	vio_unregister_driver(&hvcs_vio_driver);
+
+	tty_unregister_driver(hvcs_tty_driver);
+
+	hvcs_free_index_list();
+
+	put_tty_driver(hvcs_tty_driver);
+
+	printk(KERN_INFO "HVCS: driver module removed.\n");
+}
+
+module_init(hvcs_module_init);
+module_exit(hvcs_module_exit);
+
+static inline struct hvcs_struct *from_vio_dev(struct vio_dev *viod)
+{
+	return viod->dev.driver_data;
+}
+/* The sysfs interface for the driver and devices */
+
+static ssize_t hvcs_partner_vtys_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct vio_dev *viod = to_vio_dev(dev);
+	struct hvcs_struct *hvcsd = from_vio_dev(viod);
+	unsigned long flags;
+	int retval;
+
+	spin_lock_irqsave(&hvcsd->lock, flags);
+	retval = sprintf(buf, "%X\n", hvcsd->p_unit_address);
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+	return retval;
+}
+static DEVICE_ATTR(partner_vtys, S_IRUGO, hvcs_partner_vtys_show, NULL);
+
+static ssize_t hvcs_partner_clcs_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct vio_dev *viod = to_vio_dev(dev);
+	struct hvcs_struct *hvcsd = from_vio_dev(viod);
+	unsigned long flags;
+	int retval;
+
+	spin_lock_irqsave(&hvcsd->lock, flags);
+	retval = sprintf(buf, "%s\n", &hvcsd->p_location_code[0]);
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+	return retval;
+}
+static DEVICE_ATTR(partner_clcs, S_IRUGO, hvcs_partner_clcs_show, NULL);
+
+static ssize_t hvcs_current_vty_store(struct device *dev, struct device_attribute *attr, const char * buf,
+		size_t count)
+{
+	/*
+	 * Don't need this feature at the present time because firmware doesn't
+	 * yet support multiple partners.
+	 */
+	printk(KERN_INFO "HVCS: Denied current_vty change: -EPERM.\n");
+	return -EPERM;
+}
+
+static ssize_t hvcs_current_vty_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct vio_dev *viod = to_vio_dev(dev);
+	struct hvcs_struct *hvcsd = from_vio_dev(viod);
+	unsigned long flags;
+	int retval;
+
+	spin_lock_irqsave(&hvcsd->lock, flags);
+	retval = sprintf(buf, "%s\n", &hvcsd->p_location_code[0]);
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+	return retval;
+}
+
+static DEVICE_ATTR(current_vty,
+	S_IRUGO | S_IWUSR, hvcs_current_vty_show, hvcs_current_vty_store);
+
+static ssize_t hvcs_vterm_state_store(struct device *dev, struct device_attribute *attr, const char *buf,
+		size_t count)
+{
+	struct vio_dev *viod = to_vio_dev(dev);
+	struct hvcs_struct *hvcsd = from_vio_dev(viod);
+	unsigned long flags;
+
+	/* writing a '0' to this sysfs entry will result in the disconnect. */
+	if (simple_strtol(buf, NULL, 0) != 0)
+		return -EINVAL;
+
+	spin_lock_irqsave(&hvcsd->lock, flags);
+
+	if (hvcsd->open_count > 0) {
+		spin_unlock_irqrestore(&hvcsd->lock, flags);
+		printk(KERN_INFO "HVCS: vterm state unchanged.  "
+				"The hvcs device node is still in use.\n");
+		return -EPERM;
+	}
+
+	if (hvcsd->connected == 0) {
+		spin_unlock_irqrestore(&hvcsd->lock, flags);
+		printk(KERN_INFO "HVCS: vterm state unchanged. The"
+				" vty-server is not connected to a vty.\n");
+		return -EPERM;
+	}
+
+	hvcs_partner_free(hvcsd);
+	printk(KERN_INFO "HVCS: Closed vty-server@%X and"
+			" partner vty@%X:%d connection.\n",
+			hvcsd->vdev->unit_address,
+			hvcsd->p_unit_address,
+			(uint32_t)hvcsd->p_partition_ID);
+
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+	return count;
+}
+
+static ssize_t hvcs_vterm_state_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct vio_dev *viod = to_vio_dev(dev);
+	struct hvcs_struct *hvcsd = from_vio_dev(viod);
+	unsigned long flags;
+	int retval;
+
+	spin_lock_irqsave(&hvcsd->lock, flags);
+	retval = sprintf(buf, "%d\n", hvcsd->connected);
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+	return retval;
+}
+static DEVICE_ATTR(vterm_state, S_IRUGO | S_IWUSR,
+		hvcs_vterm_state_show, hvcs_vterm_state_store);
+
+static ssize_t hvcs_index_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct vio_dev *viod = to_vio_dev(dev);
+	struct hvcs_struct *hvcsd = from_vio_dev(viod);
+	unsigned long flags;
+	int retval;
+
+	spin_lock_irqsave(&hvcsd->lock, flags);
+	retval = sprintf(buf, "%d\n", hvcsd->index);
+	spin_unlock_irqrestore(&hvcsd->lock, flags);
+	return retval;
+}
+
+static DEVICE_ATTR(index, S_IRUGO, hvcs_index_show, NULL);
+
+static struct attribute *hvcs_attrs[] = {
+	&dev_attr_partner_vtys.attr,
+	&dev_attr_partner_clcs.attr,
+	&dev_attr_current_vty.attr,
+	&dev_attr_vterm_state.attr,
+	&dev_attr_index.attr,
+	NULL,
+};
+
+static struct attribute_group hvcs_attr_group = {
+	.attrs = hvcs_attrs,
+};
+
+static void hvcs_create_device_attrs(struct hvcs_struct *hvcsd)
+{
+	struct vio_dev *vdev = hvcsd->vdev;
+	sysfs_create_group(&vdev->dev.kobj, &hvcs_attr_group);
+}
+
+static void hvcs_remove_device_attrs(struct vio_dev *vdev)
+{
+	sysfs_remove_group(&vdev->dev.kobj, &hvcs_attr_group);
+}
+
+static ssize_t hvcs_rescan_show(struct device_driver *ddp, char *buf)
+{
+	/* A 1 means it is updating, a 0 means it is done updating */
+	return snprintf(buf, PAGE_SIZE, "%d\n", hvcs_rescan_status);
+}
+
+static ssize_t hvcs_rescan_store(struct device_driver *ddp, const char * buf,
+		size_t count)
+{
+	if ((simple_strtol(buf, NULL, 0) != 1)
+		&& (hvcs_rescan_status != 0))
+		return -EINVAL;
+
+	hvcs_rescan_status = 1;
+	printk(KERN_INFO "HVCS: rescanning partner info for all"
+		" vty-servers.\n");
+	hvcs_rescan_devices_list();
+	hvcs_rescan_status = 0;
+	return count;
+}
+static DRIVER_ATTR(rescan,
+	S_IRUGO | S_IWUSR, hvcs_rescan_show, hvcs_rescan_store);
+
+static void hvcs_create_driver_attrs(void)
+{
+	struct device_driver *driverfs = &(hvcs_vio_driver.driver);
+	driver_create_file(driverfs, &driver_attr_rescan);
+}
+
+static void hvcs_remove_driver_attrs(void)
+{
+	struct device_driver *driverfs = &(hvcs_vio_driver.driver);
+	driver_remove_file(driverfs, &driver_attr_rescan);
+}
diff -urN oldtree/drivers/input/serio/serio.c newtree/drivers/input/serio/serio.c
--- oldtree/drivers/input/serio/serio.c	2006-03-08 18:47:59.511848500 +0000
+++ newtree/drivers/input/serio/serio.c	2006-03-08 15:22:33.121497250 +0000
@@ -902,7 +902,7 @@
 
 static int __init serio_init(void)
 {
-	serio_task = kthread_run(serio_thread, NULL, "kseriod");
+	serio_task = kthread_nofreeze_run(serio_thread, NULL, "kseriod");
 	if (IS_ERR(serio_task)) {
 		printk(KERN_ERR "serio: Failed to start kseriod\n");
 		return PTR_ERR(serio_task);
diff -urN oldtree/drivers/input/serio/serio.c.orig newtree/drivers/input/serio/serio.c.orig
--- oldtree/drivers/input/serio/serio.c.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/drivers/input/serio/serio.c.orig	2006-03-08 15:21:15.032617000 +0000
@@ -0,0 +1,928 @@
+/*
+ *  The Serio abstraction module
+ *
+ *  Copyright (c) 1999-2004 Vojtech Pavlik
+ *  Copyright (c) 2004 Dmitry Torokhov
+ *  Copyright (c) 2003 Daniele Bellucci
+ */
+
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Should you need to contact me, the author, you can do so either by
+ * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
+ * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
+ */
+
+#include <linux/stddef.h>
+#include <linux/module.h>
+#include <linux/serio.h>
+#include <linux/errno.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+
+MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>");
+MODULE_DESCRIPTION("Serio abstraction core");
+MODULE_LICENSE("GPL");
+
+EXPORT_SYMBOL(serio_interrupt);
+EXPORT_SYMBOL(__serio_register_port);
+EXPORT_SYMBOL(serio_unregister_port);
+EXPORT_SYMBOL(serio_unregister_child_port);
+EXPORT_SYMBOL(__serio_unregister_port_delayed);
+EXPORT_SYMBOL(__serio_register_driver);
+EXPORT_SYMBOL(serio_unregister_driver);
+EXPORT_SYMBOL(serio_open);
+EXPORT_SYMBOL(serio_close);
+EXPORT_SYMBOL(serio_rescan);
+EXPORT_SYMBOL(serio_reconnect);
+
+/*
+ * serio_mutex protects the entire serio subsystem and is taken every time
+ * a serio port or driver is registered or unregistered.
+ */
+static DEFINE_MUTEX(serio_mutex);
+
+static LIST_HEAD(serio_list);
+
+static struct bus_type serio_bus;
+
+static void serio_add_port(struct serio *serio);
+static void serio_destroy_port(struct serio *serio);
+static void serio_reconnect_port(struct serio *serio);
+static void serio_disconnect_port(struct serio *serio);
+
+static int serio_connect_driver(struct serio *serio, struct serio_driver *drv)
+{
+	int retval;
+
+	mutex_lock(&serio->drv_mutex);
+	retval = drv->connect(serio, drv);
+	mutex_unlock(&serio->drv_mutex);
+
+	return retval;
+}
+
+static int serio_reconnect_driver(struct serio *serio)
+{
+	int retval = -1;
+
+	mutex_lock(&serio->drv_mutex);
+	if (serio->drv && serio->drv->reconnect)
+		retval = serio->drv->reconnect(serio);
+	mutex_unlock(&serio->drv_mutex);
+
+	return retval;
+}
+
+static void serio_disconnect_driver(struct serio *serio)
+{
+	mutex_lock(&serio->drv_mutex);
+	if (serio->drv)
+		serio->drv->disconnect(serio);
+	mutex_unlock(&serio->drv_mutex);
+}
+
+static int serio_match_port(const struct serio_device_id *ids, struct serio *serio)
+{
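+	/* An entry with both type and proto zero terminates the table. */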
+	while (ids->type || ids->proto) {
+		if ((ids->type == SERIO_ANY || ids->type == serio->id.type) &&
+		    (ids->proto == SERIO_ANY || ids->proto == serio->id.proto) &&
+		    (ids->extra == SERIO_ANY || ids->extra == serio->id.extra) &&
+		    (ids->id == SERIO_ANY || ids->id == serio->id.id))
+			return 1;
+		ids++;
+	}
+	return 0;
+}
+
+/*
+ * Basic serio -> driver core mappings
+ */
+
+static void serio_bind_driver(struct serio *serio, struct serio_driver *drv)
+{
+	down_write(&serio_bus.subsys.rwsem);
+
+	if (serio_match_port(drv->id_table, serio)) {
+		serio->dev.driver = &drv->driver;
+		if (serio_connect_driver(serio, drv)) {
+			serio->dev.driver = NULL;
+			goto out;
+		}
+		device_bind_driver(&serio->dev);
+	}
+out:
+	up_write(&serio_bus.subsys.rwsem);
+}
+
+static void serio_release_driver(struct serio *serio)
+{
+	down_write(&serio_bus.subsys.rwsem);
+	device_release_driver(&serio->dev);
+	up_write(&serio_bus.subsys.rwsem);
+}
+
+static void serio_find_driver(struct serio *serio)
+{
+	down_write(&serio_bus.subsys.rwsem);
+	device_attach(&serio->dev);
+	up_write(&serio_bus.subsys.rwsem);
+}
+
+
+/*
+ * Serio event processing.
+ */
+
+enum serio_event_type {
+	SERIO_RESCAN,
+	SERIO_RECONNECT,
+	SERIO_REGISTER_PORT,
+	SERIO_UNREGISTER_PORT,
+	SERIO_REGISTER_DRIVER,
+};
+
+struct serio_event {
+	enum serio_event_type type;
+	void *object;
+	struct module *owner;
+	struct list_head node;
+};
+
+static DEFINE_SPINLOCK(serio_event_lock);	/* protects serio_event_list */
+static LIST_HEAD(serio_event_list);
+static DECLARE_WAIT_QUEUE_HEAD(serio_wait);
+static struct task_struct *serio_task;
+
+static void serio_queue_event(void *object, struct module *owner,
+			      enum serio_event_type event_type)
+{
+	unsigned long flags;
+	struct serio_event *event;
+
+	spin_lock_irqsave(&serio_event_lock, flags);
+
+	/*
+	 * Scan the event list for other events for the same serio port,
+	 * starting with the most recent one.  If the event is the same we
+	 * do not need to add a new one.  If the event is of a different
+	 * type we need to add this event and should not look further,
+	 * because we must preserve the sequence of distinct events.
+	 */
+	list_for_each_entry_reverse(event, &serio_event_list, node) {
+		if (event->object == object) {
+			if (event->type == event_type)
+				goto out;
+			break;
+		}
+	}
+
+	if ((event = kmalloc(sizeof(struct serio_event), GFP_ATOMIC))) {
+		if (!try_module_get(owner)) {
+			printk(KERN_WARNING "serio: Can't get module reference, dropping event %d\n", event_type);
+			goto out;
+		}
+
+		event->type = event_type;
+		event->object = object;
+		event->owner = owner;
+
+		list_add_tail(&event->node, &serio_event_list);
+		wake_up(&serio_wait);
+	} else {
+		printk(KERN_ERR "serio: Not enough memory to queue event %d\n", event_type);
+	}
+out:
+	spin_unlock_irqrestore(&serio_event_lock, flags);
+}
+
+static void serio_free_event(struct serio_event *event)
+{
+	module_put(event->owner);
+	kfree(event);
+}
+
+static void serio_remove_duplicate_events(struct serio_event *event)
+{
+	struct list_head *node, *next;
+	struct serio_event *e;
+	unsigned long flags;
+
+	spin_lock_irqsave(&serio_event_lock, flags);
+
+	list_for_each_safe(node, next, &serio_event_list) {
+		e = list_entry(node, struct serio_event, node);
+		if (event->object == e->object) {
+			/*
+			 * If this event is of different type we should not
+			 * look further - we only suppress duplicate events
+			 * that were sent back-to-back.
+			 */
+			if (event->type != e->type)
+				break;
+
+			list_del_init(node);
+			serio_free_event(e);
+		}
+	}
+
+	spin_unlock_irqrestore(&serio_event_lock, flags);
+}
+
+
+static struct serio_event *serio_get_event(void)
+{
+	struct serio_event *event;
+	struct list_head *node;
+	unsigned long flags;
+
+	spin_lock_irqsave(&serio_event_lock, flags);
+
+	if (list_empty(&serio_event_list)) {
+		spin_unlock_irqrestore(&serio_event_lock, flags);
+		return NULL;
+	}
+
+	node = serio_event_list.next;
+	event = list_entry(node, struct serio_event, node);
+	list_del_init(node);
+
+	spin_unlock_irqrestore(&serio_event_lock, flags);
+
+	return event;
+}
+
+static void serio_handle_event(void)
+{
+	struct serio_event *event;
+	struct serio_driver *serio_drv;
+
+	mutex_lock(&serio_mutex);
+
+	/*
+	 * Note that we handle only one event here to give swsusp
+	 * a chance to freeze kseriod thread. Serio events should
+	 * be pretty rare so we are not concerned about taking a
+	 * performance hit.
+	 */
+	if ((event = serio_get_event())) {
+
+		switch (event->type) {
+			case SERIO_REGISTER_PORT:
+				serio_add_port(event->object);
+				break;
+
+			case SERIO_UNREGISTER_PORT:
+				serio_disconnect_port(event->object);
+				serio_destroy_port(event->object);
+				break;
+
+			case SERIO_RECONNECT:
+				serio_reconnect_port(event->object);
+				break;
+
+			case SERIO_RESCAN:
+				serio_disconnect_port(event->object);
+				serio_find_driver(event->object);
+				break;
+
+			case SERIO_REGISTER_DRIVER:
+				serio_drv = event->object;
+				driver_register(&serio_drv->driver);
+				break;
+
+			default:
+				break;
+		}
+
+		serio_remove_duplicate_events(event);
+		serio_free_event(event);
+	}
+
+	mutex_unlock(&serio_mutex);
+}
+
+/*
+ * Remove all events that have been submitted for a given serio port.
+ */
+static void serio_remove_pending_events(struct serio *serio)
+{
+	struct list_head *node, *next;
+	struct serio_event *event;
+	unsigned long flags;
+
+	spin_lock_irqsave(&serio_event_lock, flags);
+
+	list_for_each_safe(node, next, &serio_event_list) {
+		event = list_entry(node, struct serio_event, node);
+		if (event->object == serio) {
+			list_del_init(node);
+			serio_free_event(event);
+		}
+	}
+
+	spin_unlock_irqrestore(&serio_event_lock, flags);
+}
+
+/*
+ * Destroy child serio port (if any) that has not been fully registered yet.
+ *
+ * Note that we rely on the fact that a port can have only one child and therefore
+ * only one child registration request can be pending. Additionally, children
+ * are registered by driver's connect() handler so there can't be a grandchild
+ * pending registration together with a child.
+ */
+static struct serio *serio_get_pending_child(struct serio *parent)
+{
+	struct serio_event *event;
+	struct serio *serio, *child = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&serio_event_lock, flags);
+
+	list_for_each_entry(event, &serio_event_list, node) {
+		if (event->type == SERIO_REGISTER_PORT) {
+			serio = event->object;
+			if (serio->parent == parent) {
+				child = serio;
+				break;
+			}
+		}
+	}
+
+	spin_unlock_irqrestore(&serio_event_lock, flags);
+	return child;
+}
+
+static int serio_thread(void *nothing)
+{
+	do {
+		serio_handle_event();
+		wait_event_interruptible(serio_wait,
+			kthread_should_stop() || !list_empty(&serio_event_list));
+		try_to_freeze();
+	} while (!kthread_should_stop());
+
+	printk(KERN_DEBUG "serio: kseriod exiting\n");
+	return 0;
+}
+
+
+/*
+ * Serio port operations
+ */
+
+static ssize_t serio_show_description(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct serio *serio = to_serio_port(dev);
+	return sprintf(buf, "%s\n", serio->name);
+}
+
+static ssize_t serio_show_modalias(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct serio *serio = to_serio_port(dev);
+
+	return sprintf(buf, "serio:ty%02Xpr%02Xid%02Xex%02X\n",
+			serio->id.type, serio->id.proto, serio->id.id, serio->id.extra);
+}
+
+static ssize_t serio_show_id_type(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct serio *serio = to_serio_port(dev);
+	return sprintf(buf, "%02x\n", serio->id.type);
+}
+
+static ssize_t serio_show_id_proto(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct serio *serio = to_serio_port(dev);
+	return sprintf(buf, "%02x\n", serio->id.proto);
+}
+
+static ssize_t serio_show_id_id(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct serio *serio = to_serio_port(dev);
+	return sprintf(buf, "%02x\n", serio->id.id);
+}
+
+static ssize_t serio_show_id_extra(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct serio *serio = to_serio_port(dev);
+	return sprintf(buf, "%02x\n", serio->id.extra);
+}
+
+static DEVICE_ATTR(type, S_IRUGO, serio_show_id_type, NULL);
+static DEVICE_ATTR(proto, S_IRUGO, serio_show_id_proto, NULL);
+static DEVICE_ATTR(id, S_IRUGO, serio_show_id_id, NULL);
+static DEVICE_ATTR(extra, S_IRUGO, serio_show_id_extra, NULL);
+
+static struct attribute *serio_device_id_attrs[] = {
+	&dev_attr_type.attr,
+	&dev_attr_proto.attr,
+	&dev_attr_id.attr,
+	&dev_attr_extra.attr,
+	NULL
+};
+
+static struct attribute_group serio_id_attr_group = {
+	.name	= "id",
+	.attrs	= serio_device_id_attrs,
+};
+
+static ssize_t serio_rebind_driver(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct serio *serio = to_serio_port(dev);
+	struct device_driver *drv;
+	int retval;
+
+	retval = mutex_lock_interruptible(&serio_mutex);
+	if (retval)
+		return retval;
+
+	retval = count;
+	if (!strncmp(buf, "none", count)) {
+		serio_disconnect_port(serio);
+	} else if (!strncmp(buf, "reconnect", count)) {
+		serio_reconnect_port(serio);
+	} else if (!strncmp(buf, "rescan", count)) {
+		serio_disconnect_port(serio);
+		serio_find_driver(serio);
+	} else if ((drv = driver_find(buf, &serio_bus)) != NULL) {
+		serio_disconnect_port(serio);
+		serio_bind_driver(serio, to_serio_driver(drv));
+		put_driver(drv);
+	} else {
+		retval = -EINVAL;
+	}
+
+	mutex_unlock(&serio_mutex);
+
+	return retval;
+}
+
+static ssize_t serio_show_bind_mode(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct serio *serio = to_serio_port(dev);
+	return sprintf(buf, "%s\n", serio->manual_bind ? "manual" : "auto");
+}
+
+static ssize_t serio_set_bind_mode(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct serio *serio = to_serio_port(dev);
+	int retval;
+
+	retval = count;
+	if (!strncmp(buf, "manual", count)) {
+		serio->manual_bind = 1;
+	} else if (!strncmp(buf, "auto", count)) {
+		serio->manual_bind = 0;
+	} else {
+		retval = -EINVAL;
+	}
+
+	return retval;
+}
+
+static struct device_attribute serio_device_attrs[] = {
+	__ATTR(description, S_IRUGO, serio_show_description, NULL),
+	__ATTR(modalias, S_IRUGO, serio_show_modalias, NULL),
+	__ATTR(drvctl, S_IWUSR, NULL, serio_rebind_driver),
+	__ATTR(bind_mode, S_IWUSR | S_IRUGO, serio_show_bind_mode, serio_set_bind_mode),
+	__ATTR_NULL
+};
+
+
+static void serio_release_port(struct device *dev)
+{
+	struct serio *serio = to_serio_port(dev);
+
+	kfree(serio);
+	module_put(THIS_MODULE);
+}
+
+/*
+ * Prepare serio port for registration.
+ */
+static void serio_init_port(struct serio *serio)
+{
+	static atomic_t serio_no = ATOMIC_INIT(0);
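+	/* Provides the unique N in each port's "serioN" bus id. */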
+
+	__module_get(THIS_MODULE);
+
+	spin_lock_init(&serio->lock);
+	mutex_init(&serio->drv_mutex);
+	device_initialize(&serio->dev);
+	snprintf(serio->dev.bus_id, sizeof(serio->dev.bus_id),
+		 "serio%ld", (long)atomic_inc_return(&serio_no) - 1);
+	serio->dev.bus = &serio_bus;
+	serio->dev.release = serio_release_port;
+	if (serio->parent)
+		serio->dev.parent = &serio->parent->dev;
+}
+
+/*
+ * Complete serio port registration.
+ * Driver core will attempt to find appropriate driver for the port.
+ */
+static void serio_add_port(struct serio *serio)
+{
+	if (serio->parent) {
+		serio_pause_rx(serio->parent);
+		serio->parent->child = serio;
+		serio_continue_rx(serio->parent);
+	}
+
+	list_add_tail(&serio->node, &serio_list);
+	if (serio->start)
+		serio->start(serio);
+	device_add(&serio->dev);
+	sysfs_create_group(&serio->dev.kobj, &serio_id_attr_group);
+	serio->registered = 1;
+}
+
+/*
+ * serio_destroy_port() completes deregistration process and removes
+ * port from the system
+ */
+static void serio_destroy_port(struct serio *serio)
+{
+	struct serio *child;
+
+	child = serio_get_pending_child(serio);
+	if (child) {
+		serio_remove_pending_events(child);
+		put_device(&child->dev);
+	}
+
+	if (serio->stop)
+		serio->stop(serio);
+
+	if (serio->parent) {
+		serio_pause_rx(serio->parent);
+		serio->parent->child = NULL;
+		serio_continue_rx(serio->parent);
+		serio->parent = NULL;
+	}
+
+	if (serio->registered) {
+		sysfs_remove_group(&serio->dev.kobj, &serio_id_attr_group);
+		device_del(&serio->dev);
+		list_del_init(&serio->node);
+		serio->registered = 0;
+	}
+
+	serio_remove_pending_events(serio);
+	put_device(&serio->dev);
+}
+
+/*
+ * Reconnect serio port and all its children (re-initialize attached devices)
+ */
+static void serio_reconnect_port(struct serio *serio)
+{
+	do {
+		if (serio_reconnect_driver(serio)) {
+			serio_disconnect_port(serio);
+			serio_find_driver(serio);
+			/* Ok, old children are now gone, we are done */
+			break;
+		}
+		serio = serio->child;
+	} while (serio);
+}
+
+/*
+ * serio_disconnect_port() unbinds a port from its driver. As a side effect
+ * all child ports are unbound and destroyed.
+ */
+static void serio_disconnect_port(struct serio *serio)
+{
+	struct serio *s, *parent;
+
+	if (serio->child) {
+		/*
+		 * Child ports should be disconnected and destroyed first,
+		 * starting with the leaf one, since we don't want to do
+		 * recursion.
+		 */
+		for (s = serio; s->child; s = s->child)
+			/* empty */;
+
+		do {
+			parent = s->parent;
+
+			serio_release_driver(s);
+			serio_destroy_port(s);
+		} while ((s = parent) != serio);
+	}
+
+	/*
+	 * Ok, no children left, now disconnect this port
+	 */
+	serio_release_driver(serio);
+}
+
+void serio_rescan(struct serio *serio)
+{
+	serio_queue_event(serio, NULL, SERIO_RESCAN);
+}
+
+void serio_reconnect(struct serio *serio)
+{
+	serio_queue_event(serio, NULL, SERIO_RECONNECT);
+}
+
+/*
+ * Submits register request to kseriod for subsequent execution.
+ * Note that port registration is always asynchronous.
+ */
+void __serio_register_port(struct serio *serio, struct module *owner)
+{
+	serio_init_port(serio);
+	serio_queue_event(serio, owner, SERIO_REGISTER_PORT);
+}
+
+/*
+ * Synchronously unregisters serio port.
+ */
+void serio_unregister_port(struct serio *serio)
+{
+	mutex_lock(&serio_mutex);
+	serio_disconnect_port(serio);
+	serio_destroy_port(serio);
+	mutex_unlock(&serio_mutex);
+}
+
+/*
+ * Safely unregisters child port if one is present.
+ */
+void serio_unregister_child_port(struct serio *serio)
+{
+	mutex_lock(&serio_mutex);
+	if (serio->child) {
+		serio_disconnect_port(serio->child);
+		serio_destroy_port(serio->child);
+	}
+	mutex_unlock(&serio_mutex);
+}
+
+/*
+ * Submits register request to kseriod for subsequent execution.
+ * Can be used when it is not obvious whether the serio_mutex is
+ * taken or not and when delayed execution is feasible.
+ */
+void __serio_unregister_port_delayed(struct serio *serio, struct module *owner)
+{
+	serio_queue_event(serio, owner, SERIO_UNREGISTER_PORT);
+}
+
+
+/*
+ * Serio driver operations
+ */
+
+static ssize_t serio_driver_show_description(struct device_driver *drv, char *buf)
+{
+	struct serio_driver *driver = to_serio_driver(drv);
+	return sprintf(buf, "%s\n", driver->description ? driver->description : "(none)");
+}
+
+static ssize_t serio_driver_show_bind_mode(struct device_driver *drv, char *buf)
+{
+	struct serio_driver *serio_drv = to_serio_driver(drv);
+	return sprintf(buf, "%s\n", serio_drv->manual_bind ? "manual" : "auto");
+}
+
+static ssize_t serio_driver_set_bind_mode(struct device_driver *drv, const char *buf, size_t count)
+{
+	struct serio_driver *serio_drv = to_serio_driver(drv);
+	int retval;
+
+	retval = count;
+	if (!strncmp(buf, "manual", count)) {
+		serio_drv->manual_bind = 1;
+	} else if (!strncmp(buf, "auto", count)) {
+		serio_drv->manual_bind = 0;
+	} else {
+		retval = -EINVAL;
+	}
+
+	return retval;
+}
+
+
+static struct driver_attribute serio_driver_attrs[] = {
+	__ATTR(description, S_IRUGO, serio_driver_show_description, NULL),
+	__ATTR(bind_mode, S_IWUSR | S_IRUGO,
+		serio_driver_show_bind_mode, serio_driver_set_bind_mode),
+	__ATTR_NULL
+};
+
+static int serio_driver_probe(struct device *dev)
+{
+	struct serio *serio = to_serio_port(dev);
+	struct serio_driver *drv = to_serio_driver(dev->driver);
+
+	return serio_connect_driver(serio, drv);
+}
+
+static int serio_driver_remove(struct device *dev)
+{
+	struct serio *serio = to_serio_port(dev);
+
+	serio_disconnect_driver(serio);
+	return 0;
+}
+
+static struct bus_type serio_bus = {
+	.name =	"serio",
+	.probe = serio_driver_probe,
+	.remove = serio_driver_remove,
+};
+
+void __serio_register_driver(struct serio_driver *drv, struct module *owner)
+{
+	drv->driver.bus = &serio_bus;
+
+	serio_queue_event(drv, owner, SERIO_REGISTER_DRIVER);
+}
+
+void serio_unregister_driver(struct serio_driver *drv)
+{
+	struct serio *serio;
+
+	mutex_lock(&serio_mutex);
+	drv->manual_bind = 1;	/* so serio_find_driver ignores it */
+
+start_over:
+	list_for_each_entry(serio, &serio_list, node) {
+		if (serio->drv == drv) {
+			serio_disconnect_port(serio);
+			serio_find_driver(serio);
+			/* we could've deleted some ports, restart */
+			goto start_over;
+		}
+	}
+
+	driver_unregister(&drv->driver);
+	mutex_unlock(&serio_mutex);
+}
+
+static void serio_set_drv(struct serio *serio, struct serio_driver *drv)
+{
+	serio_pause_rx(serio);
+	serio->drv = drv;
+	serio_continue_rx(serio);
+}
+
+static int serio_bus_match(struct device *dev, struct device_driver *drv)
+{
+	struct serio *serio = to_serio_port(dev);
+	struct serio_driver *serio_drv = to_serio_driver(drv);
+
+	if (serio->manual_bind || serio_drv->manual_bind)
+		return 0;
+
+	return serio_match_port(serio_drv->id_table, serio);
+}
+
+#ifdef CONFIG_HOTPLUG
+
+#define SERIO_ADD_UEVENT_VAR(fmt, val...)				\
+	do {								\
+		int err = add_uevent_var(envp, num_envp, &i,	\
+					buffer, buffer_size, &len,	\
+					fmt, val);			\
+		if (err)						\
+			return err;					\
+	} while (0)
+
+static int serio_uevent(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size)
+{
+	struct serio *serio;
+	int i = 0;
+	int len = 0;
+
+	if (!dev)
+		return -ENODEV;
+
+	serio = to_serio_port(dev);
+
+	SERIO_ADD_UEVENT_VAR("SERIO_TYPE=%02x", serio->id.type);
+	SERIO_ADD_UEVENT_VAR("SERIO_PROTO=%02x", serio->id.proto);
+	SERIO_ADD_UEVENT_VAR("SERIO_ID=%02x", serio->id.id);
+	SERIO_ADD_UEVENT_VAR("SERIO_EXTRA=%02x", serio->id.extra);
+	SERIO_ADD_UEVENT_VAR("MODALIAS=serio:ty%02Xpr%02Xid%02Xex%02X",
+				serio->id.type, serio->id.proto, serio->id.id, serio->id.extra);
+	envp[i] = NULL;
+
+	return 0;
+}
+#undef SERIO_ADD_UEVENT_VAR
+
+#else
+
+static int serio_uevent(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size)
+{
+	return -ENODEV;
+}
+
+#endif /* CONFIG_HOTPLUG */
+
+static int serio_resume(struct device *dev)
+{
+	struct serio *serio = to_serio_port(dev);
+
+	if (serio_reconnect_driver(serio)) {
+		/*
+		 * Driver re-probing can take a while, so better let kseriod
+		 * deal with it.
+		 */
+		serio_rescan(serio);
+	}
+
+	return 0;
+}
+
+/* called from serio_driver->connect/disconnect methods under serio_mutex */
+int serio_open(struct serio *serio, struct serio_driver *drv)
+{
+	serio_set_drv(serio, drv);
+
+	if (serio->open && serio->open(serio)) {
+		serio_set_drv(serio, NULL);
+		return -1;
+	}
+	return 0;
+}
+
+/* called from serio_driver->connect/disconnect methods under serio_mutex */
+void serio_close(struct serio *serio)
+{
+	if (serio->close)
+		serio->close(serio);
+
+	serio_set_drv(serio, NULL);
+}
+
+irqreturn_t serio_interrupt(struct serio *serio,
+		unsigned char data, unsigned int dfl, struct pt_regs *regs)
+{
+	unsigned long flags;
+	irqreturn_t ret = IRQ_NONE;
+
+	spin_lock_irqsave(&serio->lock, flags);
+
+	if (likely(serio->drv)) {
+		ret = serio->drv->interrupt(serio, data, dfl, regs);
+	} else if (!dfl && serio->registered) {
+		serio_rescan(serio);
+		ret = IRQ_HANDLED;
+	}
+
+	spin_unlock_irqrestore(&serio->lock, flags);
+
+	return ret;
+}
+
+static int __init serio_init(void)
+{
+	serio_task = kthread_run(serio_thread, NULL, "kseriod");
+	if (IS_ERR(serio_task)) {
+		printk(KERN_ERR "serio: Failed to start kseriod\n");
+		return PTR_ERR(serio_task);
+	}
+
+	serio_bus.dev_attrs = serio_device_attrs;
+	serio_bus.drv_attrs = serio_driver_attrs;
+	serio_bus.match = serio_bus_match;
+	serio_bus.uevent = serio_uevent;
+	serio_bus.resume = serio_resume;
+	bus_register(&serio_bus);
+
+	return 0;
+}
+
+static void __exit serio_exit(void)
+{
+	bus_unregister(&serio_bus);
+	kthread_stop(serio_task);
+}
+
+subsys_initcall(serio_init);
+module_exit(serio_exit);
diff -urN oldtree/drivers/macintosh/Kconfig newtree/drivers/macintosh/Kconfig
--- oldtree/drivers/macintosh/Kconfig	2006-03-08 18:47:12.300898000 +0000
+++ newtree/drivers/macintosh/Kconfig	2006-03-08 15:22:33.121497250 +0000
@@ -200,4 +200,8 @@
 	tristate "Support for ANS LCD display"
 	depends on ADB_CUDA && PPC_PMAC
 
+config SOFTWARE_REPLACE_SLEEP
+	bool "Using Software suspend replace broken sleep function"
+	depends on SUSPEND2
+
 endmenu
diff -urN oldtree/drivers/macintosh/via-pmu.c newtree/drivers/macintosh/via-pmu.c
--- oldtree/drivers/macintosh/via-pmu.c	2006-03-08 18:47:59.615855000 +0000
+++ newtree/drivers/macintosh/via-pmu.c	2006-03-08 15:22:33.129497750 +0000
@@ -2654,6 +2654,13 @@
 			return -EACCES;
 		if (sleep_in_progress)
 			return -EBUSY;
+#ifdef CONFIG_SOFTWARE_REPLACE_SLEEP
+		{
+		extern void software_suspend_pending(void);
+		software_suspend_pending();
+		return (0);
+		}
+#endif
 		sleep_in_progress = 1;
 		switch (pmu_kind) {
 		case PMU_OHARE_BASED:
diff -urN oldtree/drivers/macintosh/via-pmu.c.orig newtree/drivers/macintosh/via-pmu.c.orig
--- oldtree/drivers/macintosh/via-pmu.c.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/drivers/macintosh/via-pmu.c.orig	2006-03-08 15:21:15.132623250 +0000
@@ -0,0 +1,2918 @@
+/*
+ * Device driver for the via-pmu on Apple Powermacs.
+ *
+ * The VIA (versatile interface adapter) interfaces to the PMU,
+ * a 6805 microprocessor core whose primary function is to control
+ * battery charging and system power on the PowerBook 3400 and 2400.
+ * The PMU also controls the ADB (Apple Desktop Bus) which connects
+ * to the keyboard and mouse, as well as the non-volatile RAM
+ * and the RTC (real time clock) chip.
+ *
+ * Copyright (C) 1998 Paul Mackerras and Fabio Riccardi.
+ * Copyright (C) 2001-2002 Benjamin Herrenschmidt
+ *
+ * THIS DRIVER IS BECOMING A TOTAL MESS !
+ *  - Cleanup atomically disabling reply to PMU events after
+ *    a sleep or a freq. switch
+ *  - Move sleep code out of here to pmac_pm, merge into new
+ *    common PM infrastructure
+ *  - Move backlight code out as well
+ *  - Save/Restore PCI space properly
+ *
+ */
+#include <stdarg.h>
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/miscdevice.h>
+#include <linux/blkdev.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/adb.h>
+#include <linux/pmu.h>
+#include <linux/cuda.h>
+#include <linux/smp_lock.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/pm.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/device.h>
+#include <linux/sysdev.h>
+#include <linux/suspend.h>
+#include <linux/syscalls.h>
+#include <linux/cpu.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/system.h>
+#include <asm/sections.h>
+#include <asm/irq.h>
+#include <asm/pmac_feature.h>
+#include <asm/pmac_pfunc.h>
+#include <asm/pmac_low_i2c.h>
+#include <asm/uaccess.h>
+#include <asm/mmu_context.h>
+#include <asm/cputable.h>
+#include <asm/time.h>
+#ifdef CONFIG_PMAC_BACKLIGHT
+#include <asm/backlight.h>
+#endif
+
+#ifdef CONFIG_PPC32
+#include <asm/open_pic.h>
+#endif
+
+/* Some compile options */
+#undef SUSPEND_USES_PMU
+#define DEBUG_SLEEP
+#undef HACKED_PCI_SAVE
+
+/* Misc minor number allocated for /dev/pmu */
+#define PMU_MINOR		154
+
+/* How many iterations between battery polls */
+#define BATTERY_POLLING_COUNT	2
+
+static volatile unsigned char __iomem *via;
+
+/* VIA registers - spaced 0x200 bytes apart */
+#define RS		0x200		/* skip between registers */
+#define B		0		/* B-side data */
+#define A		RS		/* A-side data */
+#define DIRB		(2*RS)		/* B-side direction (1=output) */
+#define DIRA		(3*RS)		/* A-side direction (1=output) */
+#define T1CL		(4*RS)		/* Timer 1 ctr/latch (low 8 bits) */
+#define T1CH		(5*RS)		/* Timer 1 counter (high 8 bits) */
+#define T1LL		(6*RS)		/* Timer 1 latch (low 8 bits) */
+#define T1LH		(7*RS)		/* Timer 1 latch (high 8 bits) */
+#define T2CL		(8*RS)		/* Timer 2 ctr/latch (low 8 bits) */
+#define T2CH		(9*RS)		/* Timer 2 counter (high 8 bits) */
+#define SR		(10*RS)		/* Shift register */
+#define ACR		(11*RS)		/* Auxiliary control register */
+#define PCR		(12*RS)		/* Peripheral control register */
+#define IFR		(13*RS)		/* Interrupt flag register */
+#define IER		(14*RS)		/* Interrupt enable register */
+#define ANH		(15*RS)		/* A-side data, no handshake */
+
+/* Bits in B data register: both active low */
+#define TACK		0x08		/* Transfer acknowledge (input) */
+#define TREQ		0x10		/* Transfer request (output) */
+
+/* Bits in ACR */
+#define SR_CTRL		0x1c		/* Shift register control bits */
+#define SR_EXT		0x0c		/* Shift on external clock */
+#define SR_OUT		0x10		/* Shift out if 1 */
+
+/* Bits in IFR and IER */
+#define IER_SET		0x80		/* set bits in IER */
+#define IER_CLR		0		/* clear bits in IER */
+#define SR_INT		0x04		/* Shift register full/empty */
+#define CB2_INT		0x08
+#define CB1_INT		0x10		/* transition on CB1 input */
+
+static volatile enum pmu_state {
+	idle,
+	sending,
+	intack,
+	reading,
+	reading_intr,
+	locked,
+} pmu_state;
+
+static volatile enum int_data_state {
+	int_data_empty,
+	int_data_fill,
+	int_data_ready,
+	int_data_flush
+} int_data_state[2] = { int_data_empty, int_data_empty };
+
+static struct adb_request *current_req;
+static struct adb_request *last_req;
+static struct adb_request *req_awaiting_reply;
+static unsigned char interrupt_data[2][32];
+static int interrupt_data_len[2];
+static int int_data_last;
+static unsigned char *reply_ptr;
+static int data_index;
+static int data_len;
+static volatile int adb_int_pending;
+static volatile int disable_poll;
+static struct adb_request bright_req_1, bright_req_2;
+static struct device_node *vias;
+static int pmu_kind = PMU_UNKNOWN;
+static int pmu_fully_inited = 0;
+static int pmu_has_adb;
+static struct device_node *gpio_node;
+static unsigned char __iomem *gpio_reg = NULL;
+static int gpio_irq = -1;
+static int gpio_irq_enabled = -1;
+static volatile int pmu_suspended = 0;
+static spinlock_t pmu_lock;
+static u8 pmu_intr_mask;
+static int pmu_version;
+static int drop_interrupts;
+#if defined(CONFIG_PM) && defined(CONFIG_PPC32)
+static int option_lid_wakeup = 1;
+#endif /* CONFIG_PM && CONFIG_PPC32 */
+#if (defined(CONFIG_PM)&&defined(CONFIG_PPC32))||defined(CONFIG_PMAC_BACKLIGHT)
+static int sleep_in_progress;
+#endif
+static unsigned long async_req_locks;
+static unsigned int pmu_irq_stats[11];
+
+static struct proc_dir_entry *proc_pmu_root;
+static struct proc_dir_entry *proc_pmu_info;
+static struct proc_dir_entry *proc_pmu_irqstats;
+static struct proc_dir_entry *proc_pmu_options;
+static int option_server_mode;
+
+int pmu_battery_count;
+int pmu_cur_battery;
+unsigned int pmu_power_flags;
+struct pmu_battery_info pmu_batteries[PMU_MAX_BATTERIES];
+static int query_batt_timer = BATTERY_POLLING_COUNT;
+static struct adb_request batt_req;
+static struct proc_dir_entry *proc_pmu_batt[PMU_MAX_BATTERIES];
+
+#if defined(CONFIG_INPUT_ADBHID) && defined(CONFIG_PMAC_BACKLIGHT)
+extern int disable_kernel_backlight;
+#endif /* defined(CONFIG_INPUT_ADBHID) && defined(CONFIG_PMAC_BACKLIGHT) */
+
+int __fake_sleep;
+int asleep;
+BLOCKING_NOTIFIER_HEAD(sleep_notifier_list);
+
+#ifdef CONFIG_ADB
+static int adb_dev_map = 0;
+static int pmu_adb_flags;
+
+static int pmu_probe(void);
+static int pmu_init(void);
+static int pmu_send_request(struct adb_request *req, int sync);
+static int pmu_adb_autopoll(int devs);
+static int pmu_adb_reset_bus(void);
+#endif /* CONFIG_ADB */
+
+static int init_pmu(void);
+static void pmu_start(void);
+static irqreturn_t via_pmu_interrupt(int irq, void *arg, struct pt_regs *regs);
+static irqreturn_t gpio1_interrupt(int irq, void *arg, struct pt_regs *regs);
+static int proc_get_info(char *page, char **start, off_t off,
+			  int count, int *eof, void *data);
+static int proc_get_irqstats(char *page, char **start, off_t off,
+			  int count, int *eof, void *data);
+#ifdef CONFIG_PMAC_BACKLIGHT
+static int pmu_set_backlight_level(int level, void* data);
+static int pmu_set_backlight_enable(int on, int level, void* data);
+#endif /* CONFIG_PMAC_BACKLIGHT */
+static void pmu_pass_intr(unsigned char *data, int len);
+static int proc_get_batt(char *page, char **start, off_t off,
+			int count, int *eof, void *data);
+static int proc_read_options(char *page, char **start, off_t off,
+			int count, int *eof, void *data);
+static int proc_write_options(struct file *file, const char __user *buffer,
+			unsigned long count, void *data);
+
+#ifdef CONFIG_ADB
+struct adb_driver via_pmu_driver = {
+	"PMU",
+	pmu_probe,
+	pmu_init,
+	pmu_send_request,
+	pmu_adb_autopoll,
+	pmu_poll_adb,
+	pmu_adb_reset_bus
+};
+#endif /* CONFIG_ADB */
+
+extern void low_sleep_handler(void);
+extern void enable_kernel_altivec(void);
+extern void enable_kernel_fp(void);
+
+#ifdef DEBUG_SLEEP
+int pmu_polled_request(struct adb_request *req);
+int pmu_wink(struct adb_request *req);
+#endif
+
+/*
+ * This table indicates for each PMU opcode:
+ * - the number of data bytes to be sent with the command, or -1
+ *   if a length byte should be sent,
+ * - the number of response bytes which the PMU will return, or
+ *   -1 if it will send a length byte.
+ */
+static const s8 pmu_data_len[256][2] = {
+/*	   0	   1	   2	   3	   4	   5	   6	   7  */
+/*00*/	{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
+/*08*/	{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},
+/*10*/	{ 1, 0},{ 1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
+/*18*/	{ 0, 1},{ 0, 1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{ 0, 0},
+/*20*/	{-1, 0},{ 0, 0},{ 2, 0},{ 1, 0},{ 1, 0},{-1, 0},{-1, 0},{-1, 0},
+/*28*/	{ 0,-1},{ 0,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{ 0,-1},
+/*30*/	{ 4, 0},{20, 0},{-1, 0},{ 3, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
+/*38*/	{ 0, 4},{ 0,20},{ 2,-1},{ 2, 1},{ 3,-1},{-1,-1},{-1,-1},{ 4, 0},
+/*40*/	{ 1, 0},{ 1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
+/*48*/	{ 0, 1},{ 0, 1},{-1,-1},{ 1, 0},{ 1, 0},{-1,-1},{-1,-1},{-1,-1},
+/*50*/	{ 1, 0},{ 0, 0},{ 2, 0},{ 2, 0},{-1, 0},{ 1, 0},{ 3, 0},{ 1, 0},
+/*58*/	{ 0, 1},{ 1, 0},{ 0, 2},{ 0, 2},{ 0,-1},{-1,-1},{-1,-1},{-1,-1},
+/*60*/	{ 2, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
+/*68*/	{ 0, 3},{ 0, 3},{ 0, 2},{ 0, 8},{ 0,-1},{ 0,-1},{-1,-1},{-1,-1},
+/*70*/	{ 1, 0},{ 1, 0},{ 1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
+/*78*/	{ 0,-1},{ 0,-1},{-1,-1},{-1,-1},{-1,-1},{ 5, 1},{ 4, 1},{ 4, 1},
+/*80*/	{ 4, 0},{-1, 0},{ 0, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
+/*88*/	{ 0, 5},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},
+/*90*/	{ 1, 0},{ 2, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
+/*98*/	{ 0, 1},{ 0, 1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},
+/*a0*/	{ 2, 0},{ 2, 0},{ 2, 0},{ 4, 0},{-1, 0},{ 0, 0},{-1, 0},{-1, 0},
+/*a8*/	{ 1, 1},{ 1, 0},{ 3, 0},{ 2, 0},{-1,-1},{-1,-1},{-1,-1},{-1,-1},
+/*b0*/	{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
+/*b8*/	{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},
+/*c0*/	{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
+/*c8*/	{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},
+/*d0*/	{ 0, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
+/*d8*/	{ 1, 1},{ 1, 1},{-1,-1},{-1,-1},{ 0, 1},{ 0,-1},{-1,-1},{-1,-1},
+/*e0*/	{-1, 0},{ 4, 0},{ 0, 1},{-1, 0},{-1, 0},{ 4, 0},{-1, 0},{-1, 0},
+/*e8*/	{ 3,-1},{-1,-1},{ 0, 1},{-1,-1},{ 0,-1},{-1,-1},{-1,-1},{ 0, 0},
+/*f0*/	{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
+/*f8*/	{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},
+};
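+
+/* Illustrative sketch, not part of the driver: how the table above is
+ * consulted. pmu_queue_request() validates the number of bytes to send
+ * against the first column, and pmu_sr_intr() uses the second column to
+ * decide how many reply bytes to expect; -1 in either column means a
+ * length byte travels on the wire instead of a fixed count. The helper
+ * name below is hypothetical.
+ */
+#if 0
+static int pmu_cmd_is_fixed_size(unsigned char cmd)
+{
+	int nsend = pmu_data_len[cmd][0];	/* -1: length byte follows cmd */
+	int nrecv = pmu_data_len[cmd][1];	/* -1: PMU replies with a length byte */
+
+	return nsend >= 0 && nrecv >= 0;
+}
+#endif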
+
+static char *pbook_type[] = {
+	"Unknown PowerBook",
+	"PowerBook 2400/3400/3500(G3)",
+	"PowerBook G3 Series",
+	"1999 PowerBook G3",
+	"Core99"
+};
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+static struct backlight_controller pmu_backlight_controller = {
+	pmu_set_backlight_enable,
+	pmu_set_backlight_level
+};
+#endif /* CONFIG_PMAC_BACKLIGHT */
+
+int __init find_via_pmu(void)
+{
+	u64 taddr;
+	u32 *reg;
+
+	if (via != 0)
+		return 1;
+	vias = of_find_node_by_name(NULL, "via-pmu");
+	if (vias == NULL)
+		return 0;
+
+	reg = (u32 *)get_property(vias, "reg", NULL);
+	if (reg == NULL) {
+		printk(KERN_ERR "via-pmu: No \"reg\" property !\n");
+		goto fail;
+	}
+	taddr = of_translate_address(vias, reg);
+	if (taddr == OF_BAD_ADDR) {
+		printk(KERN_ERR "via-pmu: Can't translate address !\n");
+		goto fail;
+	}
+
+	spin_lock_init(&pmu_lock);
+
+	pmu_has_adb = 1;
+
+	pmu_intr_mask =	PMU_INT_PCEJECT |
+			PMU_INT_SNDBRT |
+			PMU_INT_ADB |
+			PMU_INT_TICK;
+	
+	if (vias->parent->name && ((strcmp(vias->parent->name, "ohare") == 0)
+	    || device_is_compatible(vias->parent, "ohare")))
+		pmu_kind = PMU_OHARE_BASED;
+	else if (device_is_compatible(vias->parent, "paddington"))
+		pmu_kind = PMU_PADDINGTON_BASED;
+	else if (device_is_compatible(vias->parent, "heathrow"))
+		pmu_kind = PMU_HEATHROW_BASED;
+	else if (device_is_compatible(vias->parent, "Keylargo")
+		 || device_is_compatible(vias->parent, "K2-Keylargo")) {
+		struct device_node *gpiop;
+		u64 gaddr = OF_BAD_ADDR;
+
+		pmu_kind = PMU_KEYLARGO_BASED;
+		pmu_has_adb = (find_type_devices("adb") != NULL);
+		pmu_intr_mask =	PMU_INT_PCEJECT |
+				PMU_INT_SNDBRT |
+				PMU_INT_ADB |
+				PMU_INT_TICK |
+				PMU_INT_ENVIRONMENT;
+		
+		gpiop = of_find_node_by_name(NULL, "gpio");
+		if (gpiop) {
+			reg = (u32 *)get_property(gpiop, "reg", NULL);
+			if (reg)
+				gaddr = of_translate_address(gpiop, reg);
+			if (gaddr != OF_BAD_ADDR)
+				gpio_reg = ioremap(gaddr, 0x10);
+		}
+		if (gpio_reg == NULL)
+			printk(KERN_ERR "via-pmu: Can't find GPIO reg !\n");
+	} else
+		pmu_kind = PMU_UNKNOWN;
+
+	via = ioremap(taddr, 0x2000);
+	if (via == NULL) {
+		printk(KERN_ERR "via-pmu: Can't map address !\n");
+		goto fail;
+	}
+	
+	out_8(&via[IER], IER_CLR | 0x7f);	/* disable all intrs */
+	out_8(&via[IFR], 0x7f);			/* clear IFR */
+
+	pmu_state = idle;
+
+	if (!init_pmu()) {
+		via = NULL;
+		return 0;
+	}
+
+	printk(KERN_INFO "PMU driver v%d initialized for %s, firmware: %02x\n",
+	       PMU_DRIVER_VERSION, pbook_type[pmu_kind], pmu_version);
+	       
+	sys_ctrler = SYS_CTRLER_PMU;
+	
+	return 1;
+ fail:
+	of_node_put(vias);
+	vias = NULL;
+	return 0;
+}
+
+#ifdef CONFIG_ADB
+static int pmu_probe(void)
+{
+	return vias == NULL ? -ENODEV : 0;
+}
+
+static int __init pmu_init(void)
+{
+	if (vias == NULL)
+		return -ENODEV;
+	return 0;
+}
+#endif /* CONFIG_ADB */
+
+/*
+ * We can't wait until pmu_init gets called, that happens too late.
+ * It happens after IDE and SCSI initialization, which can take a few
+ * seconds, and by that time the PMU could have given up on us and
+ * turned us off.
+ * Thus this is called with arch_initcall rather than device_initcall.
+ */
+static int __init via_pmu_start(void)
+{
+	if (vias == NULL)
+		return -ENODEV;
+
+	bright_req_1.complete = 1;
+	bright_req_2.complete = 1;
+	batt_req.complete = 1;
+
+#ifndef CONFIG_PPC_MERGE
+	if (pmu_kind == PMU_KEYLARGO_BASED)
+		openpic_set_irq_priority(vias->intrs[0].line,
+					 OPENPIC_PRIORITY_DEFAULT + 1);
+#endif
+
+	if (request_irq(vias->intrs[0].line, via_pmu_interrupt, 0, "VIA-PMU",
+			(void *)0)) {
+		printk(KERN_ERR "VIA-PMU: can't get irq %d\n",
+		       vias->intrs[0].line);
+		return -EAGAIN;
+	}
+
+	if (pmu_kind == PMU_KEYLARGO_BASED) {
+		gpio_node = of_find_node_by_name(NULL, "extint-gpio1");
+		if (gpio_node == NULL)
+			gpio_node = of_find_node_by_name(NULL,
+							 "pmu-interrupt");
+		if (gpio_node && gpio_node->n_intrs > 0)
+			gpio_irq = gpio_node->intrs[0].line;
+
+		if (gpio_irq != -1) {
+			if (request_irq(gpio_irq, gpio1_interrupt, 0,
+					"GPIO1 ADB", (void *)0))
+				printk(KERN_ERR "pmu: can't get irq %d"
+				       " (GPIO1)\n", gpio_irq);
+			else
+				gpio_irq_enabled = 1;
+		}
+	}
+
+	/* Enable interrupts */
+	out_8(&via[IER], IER_SET | SR_INT | CB1_INT);
+
+	pmu_fully_inited = 1;
+
+	/* Make sure the PMU settles down before continuing. This is _very_
+	 * important since the IDE probe may shut interrupts down for quite a
+	 * bit of time. If a PMU communication is pending while this happens,
+	 * the PMU may time out. Note that on Core99 machines, the PMU keeps
+	 * sending us environment messages; we should find a way to either fix
+	 * IDE or make it call pmu_suspend() before masking interrupts. This
+	 * can also happen while scrolling with some fbdevs.
+	 */
+	do {
+		pmu_poll();
+	} while (pmu_state != idle);
+
+	return 0;
+}
+
+arch_initcall(via_pmu_start);
+
+/*
+ * This has to be done after pci_init, which is a subsys_initcall.
+ */
+static int __init via_pmu_dev_init(void)
+{
+	if (vias == NULL)
+		return -ENODEV;
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+	/* Enable backlight */
+	register_backlight_controller(&pmu_backlight_controller, NULL, "pmu");
+#endif /* CONFIG_PMAC_BACKLIGHT */
+
+#ifdef CONFIG_PPC32
+  	if (machine_is_compatible("AAPL,3400/2400") ||
+  		machine_is_compatible("AAPL,3500")) {
+		int mb = pmac_call_feature(PMAC_FTR_GET_MB_INFO,
+			NULL, PMAC_MB_INFO_MODEL, 0);
+		pmu_battery_count = 1;
+		if (mb == PMAC_TYPE_COMET)
+			pmu_batteries[0].flags |= PMU_BATT_TYPE_COMET;
+		else
+			pmu_batteries[0].flags |= PMU_BATT_TYPE_HOOPER;
+	} else if (machine_is_compatible("AAPL,PowerBook1998") ||
+		machine_is_compatible("PowerBook1,1")) {
+		pmu_battery_count = 2;
+		pmu_batteries[0].flags |= PMU_BATT_TYPE_SMART;
+		pmu_batteries[1].flags |= PMU_BATT_TYPE_SMART;
+	} else {
+		struct device_node* prim = find_devices("power-mgt");
+		u32 *prim_info = NULL;
+		if (prim)
+			prim_info = (u32 *)get_property(prim, "prim-info", NULL);
+		if (prim_info) {
+			/* Other fields here are as yet unknown */
+			pmu_battery_count = (prim_info[6] >> 16) & 0xff;
+			pmu_batteries[0].flags |= PMU_BATT_TYPE_SMART;
+			if (pmu_battery_count > 1)
+				pmu_batteries[1].flags |= PMU_BATT_TYPE_SMART;
+		}
+	}
+#endif /* CONFIG_PPC32 */
+
+	/* Create /proc/pmu */
+	proc_pmu_root = proc_mkdir("pmu", NULL);
+	if (proc_pmu_root) {
+		long i;
+
+		for (i=0; i<pmu_battery_count; i++) {
+			char title[16];
+			sprintf(title, "battery_%ld", i);
+			proc_pmu_batt[i] = create_proc_read_entry(title, 0, proc_pmu_root,
+						proc_get_batt, (void *)i);
+		}
+
+		proc_pmu_info = create_proc_read_entry("info", 0, proc_pmu_root,
+					proc_get_info, NULL);
+		proc_pmu_irqstats = create_proc_read_entry("interrupts", 0, proc_pmu_root,
+					proc_get_irqstats, NULL);
+		proc_pmu_options = create_proc_entry("options", 0600, proc_pmu_root);
+		if (proc_pmu_options) {
+			proc_pmu_options->nlink = 1;
+			proc_pmu_options->read_proc = proc_read_options;
+			proc_pmu_options->write_proc = proc_write_options;
+		}
+	}
+	return 0;
+}
+
+device_initcall(via_pmu_dev_init);
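+
+/* Illustrative note, not part of the driver: once via_pmu_dev_init() has
+ * run, the proc tree created above looks like:
+ *
+ *	/proc/pmu/info		(proc_get_info)
+ *	/proc/pmu/interrupts	(proc_get_irqstats)
+ *	/proc/pmu/options	(0600, proc_read_options/proc_write_options)
+ *	/proc/pmu/battery_<n>	(proc_get_batt, one per detected battery)
+ */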
+
+static int
+init_pmu(void)
+{
+	int timeout;
+	struct adb_request req;
+
+	out_8(&via[B], via[B] | TREQ);			/* negate TREQ */
+	out_8(&via[DIRB], (via[DIRB] | TREQ) & ~TACK);	/* TACK in, TREQ out */
+
+	pmu_request(&req, NULL, 2, PMU_SET_INTR_MASK, pmu_intr_mask);
+	timeout =  100000;
+	while (!req.complete) {
+		if (--timeout < 0) {
+			printk(KERN_ERR "init_pmu: no response from PMU\n");
+			return 0;
+		}
+		udelay(10);
+		pmu_poll();
+	}
+
+	/* ack all pending interrupts */
+	timeout = 100000;
+	interrupt_data[0][0] = 1;
+	while (interrupt_data[0][0] || pmu_state != idle) {
+		if (--timeout < 0) {
+			printk(KERN_ERR "init_pmu: timed out acking intrs\n");
+			return 0;
+		}
+		if (pmu_state == idle)
+			adb_int_pending = 1;
+		via_pmu_interrupt(0, NULL, NULL);
+		udelay(10);
+	}
+
+	/* Tell PMU we are ready.  */
+	if (pmu_kind == PMU_KEYLARGO_BASED) {
+		pmu_request(&req, NULL, 2, PMU_SYSTEM_READY, 2);
+		while (!req.complete)
+			pmu_poll();
+	}
+
+	/* Read PMU version */
+	pmu_request(&req, NULL, 1, PMU_GET_VERSION);
+	pmu_wait_complete(&req);
+	if (req.reply_len > 0)
+		pmu_version = req.reply[0];
+	
+	/* Read server mode setting */
+	if (pmu_kind == PMU_KEYLARGO_BASED) {
+		pmu_request(&req, NULL, 2, PMU_POWER_EVENTS,
+			    PMU_PWR_GET_POWERUP_EVENTS);
+		pmu_wait_complete(&req);
+		if (req.reply_len == 2) {
+			if (req.reply[1] & PMU_PWR_WAKEUP_AC_INSERT)
+				option_server_mode = 1;
+			printk(KERN_INFO "via-pmu: Server Mode is %s\n",
+			       option_server_mode ? "enabled" : "disabled");
+		}
+	}
+	return 1;
+}
+
+int
+pmu_get_model(void)
+{
+	return pmu_kind;
+}
+
+static void pmu_set_server_mode(int server_mode)
+{
+	struct adb_request req;
+
+	if (pmu_kind != PMU_KEYLARGO_BASED)
+		return;
+
+	option_server_mode = server_mode;
+	pmu_request(&req, NULL, 2, PMU_POWER_EVENTS, PMU_PWR_GET_POWERUP_EVENTS);
+	pmu_wait_complete(&req);
+	if (req.reply_len < 2)
+		return;
+	if (server_mode)
+		pmu_request(&req, NULL, 4, PMU_POWER_EVENTS,
+			    PMU_PWR_SET_POWERUP_EVENTS,
+			    req.reply[0], PMU_PWR_WAKEUP_AC_INSERT); 
+	else
+		pmu_request(&req, NULL, 4, PMU_POWER_EVENTS,
+			    PMU_PWR_CLR_POWERUP_EVENTS,
+			    req.reply[0], PMU_PWR_WAKEUP_AC_INSERT); 
+	pmu_wait_complete(&req);
+}
+
+/* This new version of the code for 2400/3400/3500 powerbooks
+ * is inspired by the implementation in gkrellm-pmu
+ */
+static void
+done_battery_state_ohare(struct adb_request* req)
+{
+	/* format:
+	 *  [0]    :  flags
+	 *    0x01 :  AC indicator
+	 *    0x02 :  charging
+	 *    0x04 :  battery exist
+	 *    0x08 :  
+	 *    0x10 :  
+	 *    0x20 :  full charged
+	 *    0x40 :  pcharge reset
+	 *    0x80 :  battery exist
+	 *
+	 *  [1][2] :  battery voltage
+	 *  [3]    :  CPU temperature
+	 *  [4]    :  battery temperature
+	 *  [5]    :  current
+	 *  [6][7] :  pcharge
+	 *              --tkoba
+	 */
+	unsigned int bat_flags = PMU_BATT_TYPE_HOOPER;
+	long pcharge, charge, vb, vmax, lmax;
+	long vmax_charging, vmax_charged;
+	long amperage, voltage, time, max;
+	int mb = pmac_call_feature(PMAC_FTR_GET_MB_INFO,
+			NULL, PMAC_MB_INFO_MODEL, 0);
+
+	if (req->reply[0] & 0x01)
+		pmu_power_flags |= PMU_PWR_AC_PRESENT;
+	else
+		pmu_power_flags &= ~PMU_PWR_AC_PRESENT;
+	
+	if (mb == PMAC_TYPE_COMET) {
+		vmax_charged = 189;
+		vmax_charging = 213;
+		lmax = 6500;
+	} else {
+		vmax_charged = 330;
+		vmax_charging = 330;
+		lmax = 6500;
+	}
+	vmax = vmax_charged;
+
+	/* If battery installed */
+	if (req->reply[0] & 0x04) {
+		bat_flags |= PMU_BATT_PRESENT;
+		if (req->reply[0] & 0x02)
+			bat_flags |= PMU_BATT_CHARGING;
+		vb = (req->reply[1] << 8) | req->reply[2];
+		voltage = (vb * 265 + 72665) / 10;
+		amperage = req->reply[5];
+		if ((req->reply[0] & 0x01) == 0) {
+			if (amperage > 200)
+				vb += ((amperage - 200) * 15)/100;
+		} else if (req->reply[0] & 0x02) {
+			vb = (vb * 97) / 100;
+			vmax = vmax_charging;
+		}
+		charge = (100 * vb) / vmax;
+		if (req->reply[0] & 0x40) {
+			pcharge = (req->reply[6] << 8) + req->reply[7];
+			if (pcharge > lmax)
+				pcharge = lmax;
+			pcharge *= 100;
+			pcharge = 100 - pcharge / lmax;
+			if (pcharge < charge)
+				charge = pcharge;
+		}
+		if (amperage > 0)
+			time = (charge * 16440) / amperage;
+		else
+			time = 0;
+		max = 100;
+		amperage = -amperage;
+	} else
+		charge = max = amperage = voltage = time = 0;
+
+	pmu_batteries[pmu_cur_battery].flags = bat_flags;
+	pmu_batteries[pmu_cur_battery].charge = charge;
+	pmu_batteries[pmu_cur_battery].max_charge = max;
+	pmu_batteries[pmu_cur_battery].amperage = amperage;
+	pmu_batteries[pmu_cur_battery].voltage = voltage;
+	pmu_batteries[pmu_cur_battery].time_remaining = time;
+
+	clear_bit(0, &async_req_locks);
+}
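+
+/* Illustrative note, not part of the driver: with the constants above, a
+ * non-Comet machine (vmax_charged = 330) off AC with amperage <= 200 (so
+ * vb is not adjusted) that reads vb = 300 reports
+ * charge = (100 * 300) / 330 = 90 and
+ * voltage = (300 * 265 + 72665) / 10 = 15216.
+ */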
+
+static void
+done_battery_state_smart(struct adb_request* req)
+{
+	/* format:
+	 *  [0] : format of this structure (known: 3,4,5)
+	 *  [1] : flags
+	 *  
+	 *  format 3 & 4:
+	 *  
+	 *  [2] : charge
+	 *  [3] : max charge
+	 *  [4] : current
+	 *  [5] : voltage
+	 *  
+	 *  format 5:
+	 *  
+	 *  [2][3] : charge
+	 *  [4][5] : max charge
+	 *  [6][7] : current
+	 *  [8][9] : voltage
+	 */
+	 
+	unsigned int bat_flags = PMU_BATT_TYPE_SMART;
+	int amperage;
+	unsigned int capa, max, voltage;
+	
+	if (req->reply[1] & 0x01)
+		pmu_power_flags |= PMU_PWR_AC_PRESENT;
+	else
+		pmu_power_flags &= ~PMU_PWR_AC_PRESENT;
+
+	capa = max = amperage = voltage = 0;
+	
+	if (req->reply[1] & 0x04) {
+		bat_flags |= PMU_BATT_PRESENT;
+		switch(req->reply[0]) {
+			case 3:
+			case 4: capa = req->reply[2];
+				max = req->reply[3];
+				amperage = *((signed char *)&req->reply[4]);
+				voltage = req->reply[5];
+				break;
+			case 5: capa = (req->reply[2] << 8) | req->reply[3];
+				max = (req->reply[4] << 8) | req->reply[5];
+				amperage = *((signed short *)&req->reply[6]);
+				voltage = (req->reply[8] << 8) | req->reply[9];
+				break;
+			default:
+				printk(KERN_WARNING "pmu.c : unrecognized battery info, len: %d, %02x %02x %02x %02x\n",
+					req->reply_len, req->reply[0], req->reply[1], req->reply[2], req->reply[3]);
+				break;
+		}
+	}
+
+	if ((req->reply[1] & 0x01) && (amperage > 0))
+		bat_flags |= PMU_BATT_CHARGING;
+
+	pmu_batteries[pmu_cur_battery].flags = bat_flags;
+	pmu_batteries[pmu_cur_battery].charge = capa;
+	pmu_batteries[pmu_cur_battery].max_charge = max;
+	pmu_batteries[pmu_cur_battery].amperage = amperage;
+	pmu_batteries[pmu_cur_battery].voltage = voltage;
+	if (amperage) {
+		if ((req->reply[1] & 0x01) && (amperage > 0))
+			pmu_batteries[pmu_cur_battery].time_remaining
+				= ((max-capa) * 3600) / amperage;
+		else
+			pmu_batteries[pmu_cur_battery].time_remaining
+				= (capa * 3600) / (-amperage);
+	} else
+		pmu_batteries[pmu_cur_battery].time_remaining = 0;
+
+	pmu_cur_battery = (pmu_cur_battery + 1) % pmu_battery_count;
+
+	clear_bit(0, &async_req_locks);
+}
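+
+/* Illustrative note, not part of the driver: worked example of the
+ * time_remaining arithmetic above. Discharging (amperage = -1000) with
+ * capa = 2000 gives (2000 * 3600) / 1000 = 7200 seconds left; charging on
+ * AC (amperage = +1000) with capa = 2000 and max = 4000 gives
+ * ((4000 - 2000) * 3600) / 1000 = 7200 seconds to full.
+ */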
+
+static void
+query_battery_state(void)
+{
+	if (test_and_set_bit(0, &async_req_locks))
+		return;
+	if (pmu_kind == PMU_OHARE_BASED)
+		pmu_request(&batt_req, done_battery_state_ohare,
+			1, PMU_BATTERY_STATE);
+	else
+		pmu_request(&batt_req, done_battery_state_smart,
+			2, PMU_SMART_BATTERY_STATE, pmu_cur_battery+1);
+}
+
+static int
+proc_get_info(char *page, char **start, off_t off,
+		int count, int *eof, void *data)
+{
+	char* p = page;
+
+	p += sprintf(p, "PMU driver version     : %d\n", PMU_DRIVER_VERSION);
+	p += sprintf(p, "PMU firmware version   : %02x\n", pmu_version);
+	p += sprintf(p, "AC Power               : %d\n",
+		((pmu_power_flags & PMU_PWR_AC_PRESENT) != 0));
+	p += sprintf(p, "Battery count          : %d\n", pmu_battery_count);
+
+	return p - page;
+}
+
+static int
+proc_get_irqstats(char *page, char **start, off_t off,
+		  int count, int *eof, void *data)
+{
+	int i;
+	char* p = page;
+	static const char *irq_names[] = {
+		"Total CB1 triggered events",
+		"Total GPIO1 triggered events",
+		"PC-Card eject button",
+		"Sound/Brightness button",
+		"ADB message",
+		"Battery state change",
+		"Environment interrupt",
+		"Tick timer",
+		"Ghost interrupt (zero len)",
+		"Empty interrupt (empty mask)",
+		"Max irqs in a row"
+        };
+
+	for (i=0; i<11; i++) {
+		p += sprintf(p, " %2u: %10u (%s)\n",
+			     i, pmu_irq_stats[i], irq_names[i]);
+	}
+	return p - page;
+}
+
+static int
+proc_get_batt(char *page, char **start, off_t off,
+		int count, int *eof, void *data)
+{
+	long batnum = (long)data;
+	char *p = page;
+	
+	p += sprintf(p, "\n");
+	p += sprintf(p, "flags      : %08x\n",
+		pmu_batteries[batnum].flags);
+	p += sprintf(p, "charge     : %d\n",
+		pmu_batteries[batnum].charge);
+	p += sprintf(p, "max_charge : %d\n",
+		pmu_batteries[batnum].max_charge);
+	p += sprintf(p, "current    : %d\n",
+		pmu_batteries[batnum].amperage);
+	p += sprintf(p, "voltage    : %d\n",
+		pmu_batteries[batnum].voltage);
+	p += sprintf(p, "time rem.  : %d\n",
+		pmu_batteries[batnum].time_remaining);
+
+	return p - page;
+}
+
+static int
+proc_read_options(char *page, char **start, off_t off,
+			int count, int *eof, void *data)
+{
+	char *p = page;
+
+#if defined(CONFIG_PM) && defined(CONFIG_PPC32)
+	if (pmu_kind == PMU_KEYLARGO_BASED &&
+	    pmac_call_feature(PMAC_FTR_SLEEP_STATE,NULL,0,-1) >= 0)
+		p += sprintf(p, "lid_wakeup=%d\n", option_lid_wakeup);
+#endif
+	if (pmu_kind == PMU_KEYLARGO_BASED)
+		p += sprintf(p, "server_mode=%d\n", option_server_mode);
+
+	return p - page;
+}
+			
+static int
+proc_write_options(struct file *file, const char __user *buffer,
+			unsigned long count, void *data)
+{
+	char tmp[33];
+	char *label, *val;
+	unsigned long fcount = count;
+	
+	if (!count)
+		return -EINVAL;
+	if (count > 32)
+		count = 32;
+	if (copy_from_user(tmp, buffer, count))
+		return -EFAULT;
+	tmp[count] = 0;
+
+	label = tmp;
+	while(*label == ' ')
+		label++;
+	val = label;
+	while(*val && (*val != '=')) {
+		if (*val == ' ')
+			*val = 0;
+		val++;
+	}
+	if ((*val) == 0)
+		return -EINVAL;
+	*(val++) = 0;
+	while(*val == ' ')
+		val++;
+#if defined(CONFIG_PM) && defined(CONFIG_PPC32)
+	if (pmu_kind == PMU_KEYLARGO_BASED &&
+	    pmac_call_feature(PMAC_FTR_SLEEP_STATE,NULL,0,-1) >= 0)
+		if (!strcmp(label, "lid_wakeup"))
+			option_lid_wakeup = ((*val) == '1');
+#endif
+	if (pmu_kind == PMU_KEYLARGO_BASED && !strcmp(label, "server_mode")) {
+		int new_value;
+		new_value = ((*val) == '1');
+		if (new_value != option_server_mode)
+			pmu_set_server_mode(new_value);
+	}
+	return fcount;
+}
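+
+/* Illustrative note, not part of the driver: the parser above accepts a
+ * single "label=value" pair with optional spaces, so from userland one
+ * would write, e.g., "server_mode=1" or "lid_wakeup=0" to
+ * /proc/pmu/options; any value whose first character is not '1' is
+ * treated as 0.
+ */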
+
+#ifdef CONFIG_ADB
+/* Send an ADB command */
+static int
+pmu_send_request(struct adb_request *req, int sync)
+{
+	int i, ret;
+
+	if ((vias == NULL) || (!pmu_fully_inited)) {
+		req->complete = 1;
+		return -ENXIO;
+	}
+
+	ret = -EINVAL;
+
+	switch (req->data[0]) {
+	case PMU_PACKET:
+		for (i = 0; i < req->nbytes - 1; ++i)
+			req->data[i] = req->data[i+1];
+		--req->nbytes;
+		if (pmu_data_len[req->data[0]][1] != 0) {
+			req->reply[0] = ADB_RET_OK;
+			req->reply_len = 1;
+		} else
+			req->reply_len = 0;
+		ret = pmu_queue_request(req);
+		break;
+	case CUDA_PACKET:
+		switch (req->data[1]) {
+		case CUDA_GET_TIME:
+			if (req->nbytes != 2)
+				break;
+			req->data[0] = PMU_READ_RTC;
+			req->nbytes = 1;
+			req->reply_len = 3;
+			req->reply[0] = CUDA_PACKET;
+			req->reply[1] = 0;
+			req->reply[2] = CUDA_GET_TIME;
+			ret = pmu_queue_request(req);
+			break;
+		case CUDA_SET_TIME:
+			if (req->nbytes != 6)
+				break;
+			req->data[0] = PMU_SET_RTC;
+			req->nbytes = 5;
+			for (i = 1; i <= 4; ++i)
+				req->data[i] = req->data[i+1];
+			req->reply_len = 3;
+			req->reply[0] = CUDA_PACKET;
+			req->reply[1] = 0;
+			req->reply[2] = CUDA_SET_TIME;
+			ret = pmu_queue_request(req);
+			break;
+		}
+		break;
+	case ADB_PACKET:
+		if (!pmu_has_adb)
+			return -ENXIO;
+		for (i = req->nbytes - 1; i > 1; --i)
+			req->data[i+2] = req->data[i];
+		req->data[3] = req->nbytes - 2;
+		req->data[2] = pmu_adb_flags;
+		/*req->data[1] = req->data[1];*/
+		req->data[0] = PMU_ADB_CMD;
+		req->nbytes += 2;
+		req->reply_expected = 1;
+		req->reply_len = 0;
+		ret = pmu_queue_request(req);
+		break;
+	}
+	if (ret) {
+		req->complete = 1;
+		return ret;
+	}
+
+	if (sync)
+		while (!req->complete)
+			pmu_poll();
+
+	return 0;
+}
+
+/* Enable/disable autopolling */
+static int
+pmu_adb_autopoll(int devs)
+{
+	struct adb_request req;
+
+	if ((vias == NULL) || (!pmu_fully_inited) || !pmu_has_adb)
+		return -ENXIO;
+
+	if (devs) {
+		adb_dev_map = devs;
+		pmu_request(&req, NULL, 5, PMU_ADB_CMD, 0, 0x86,
+			    adb_dev_map >> 8, adb_dev_map);
+		pmu_adb_flags = 2;
+	} else {
+		pmu_request(&req, NULL, 1, PMU_ADB_POLL_OFF);
+		pmu_adb_flags = 0;
+	}
+	while (!req.complete)
+		pmu_poll();
+	return 0;
+}
+
+/* Reset the ADB bus */
+static int
+pmu_adb_reset_bus(void)
+{
+	struct adb_request req;
+	int save_autopoll = adb_dev_map;
+
+	if ((vias == NULL) || (!pmu_fully_inited) || !pmu_has_adb)
+		return -ENXIO;
+
+	/* anyone got a better idea?? */
+	pmu_adb_autopoll(0);
+
+	req.nbytes = 5;
+	req.done = NULL;
+	req.data[0] = PMU_ADB_CMD;
+	req.data[1] = 0;
+	req.data[2] = ADB_BUSRESET;
+	req.data[3] = 0;
+	req.data[4] = 0;
+	req.reply_len = 0;
+	req.reply_expected = 1;
+	if (pmu_queue_request(&req) != 0) {
+		printk(KERN_ERR "pmu_adb_reset_bus: pmu_queue_request failed\n");
+		return -EIO;
+	}
+	pmu_wait_complete(&req);
+
+	if (save_autopoll != 0)
+		pmu_adb_autopoll(save_autopoll);
+
+	return 0;
+}
+#endif /* CONFIG_ADB */
+
+/* Construct and send a pmu request */
+int
+pmu_request(struct adb_request *req, void (*done)(struct adb_request *),
+	    int nbytes, ...)
+{
+	va_list list;
+	int i;
+
+	if (vias == NULL)
+		return -ENXIO;
+
+	if (nbytes < 0 || nbytes > 32) {
+		printk(KERN_ERR "pmu_request: bad nbytes (%d)\n", nbytes);
+		req->complete = 1;
+		return -EINVAL;
+	}
+	req->nbytes = nbytes;
+	req->done = done;
+	va_start(list, nbytes);
+	for (i = 0; i < nbytes; ++i)
+		req->data[i] = va_arg(list, int);
+	va_end(list);
+	req->reply_len = 0;
+	req->reply_expected = 0;
+	return pmu_queue_request(req);
+}
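+
+/* Illustrative sketch, not part of the driver: typical synchronous use of
+ * pmu_request(), mirroring the pattern init_pmu() uses above to read the
+ * PMU version. The surrounding function is hypothetical.
+ */
+#if 0
+static int example_read_pmu_version(void)
+{
+	struct adb_request req;
+	int ret;
+
+	ret = pmu_request(&req, NULL, 1, PMU_GET_VERSION);
+	if (ret < 0)
+		return ret;
+	pmu_wait_complete(&req);	/* polls until req.complete is set */
+	return req.reply_len > 0 ? req.reply[0] : -EIO;
+}
+#endif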
+
+int
+pmu_queue_request(struct adb_request *req)
+{
+	unsigned long flags;
+	int nsend;
+
+	if (via == NULL) {
+		req->complete = 1;
+		return -ENXIO;
+	}
+	if (req->nbytes <= 0) {
+		req->complete = 1;
+		return 0;
+	}
+	nsend = pmu_data_len[req->data[0]][0];
+	if (nsend >= 0 && req->nbytes != nsend + 1) {
+		req->complete = 1;
+		return -EINVAL;
+	}
+
+	req->next = NULL;
+	req->sent = 0;
+	req->complete = 0;
+
+	spin_lock_irqsave(&pmu_lock, flags);
+	if (current_req != 0) {
+		last_req->next = req;
+		last_req = req;
+	} else {
+		current_req = req;
+		last_req = req;
+		if (pmu_state == idle)
+			pmu_start();
+	}
+	spin_unlock_irqrestore(&pmu_lock, flags);
+
+	return 0;
+}
+
+static inline void
+wait_for_ack(void)
+{
+	/* Slightly increased the delay; I had one occurrence of the message
+	 * below being reported.
+	 */
+	int timeout = 4000;
+	while ((in_8(&via[B]) & TACK) == 0) {
+		if (--timeout < 0) {
+			printk(KERN_ERR "PMU not responding (!ack)\n");
+			return;
+		}
+		udelay(10);
+	}
+}
+
+/* The new PMU seems to be very sensitive to these timings, so we make
+ * sure the posted PCI write is flushed immediately by reading back */
+static inline void
+send_byte(int x)
+{
+	volatile unsigned char __iomem *v = via;
+
+	out_8(&v[ACR], in_8(&v[ACR]) | SR_OUT | SR_EXT);
+	out_8(&v[SR], x);
+	out_8(&v[B], in_8(&v[B]) & ~TREQ);		/* assert TREQ */
+	(void)in_8(&v[B]);
+}
+
+static inline void
+recv_byte(void)
+{
+	volatile unsigned char __iomem *v = via;
+
+	out_8(&v[ACR], (in_8(&v[ACR]) & ~SR_OUT) | SR_EXT);
+	in_8(&v[SR]);		/* resets SR */
+	out_8(&v[B], in_8(&v[B]) & ~TREQ);
+	(void)in_8(&v[B]);
+}
+
+static inline void
+pmu_done(struct adb_request *req)
+{
+	void (*done)(struct adb_request *) = req->done;
+	mb();
+	req->complete = 1;
+	/* Here, we assume that if the request has a done callback, the
+	 * struct adb_request will survive until req->complete is set to 1
+	 */
+	if (done)
+		(*done)(req);
+}
+
+static void
+pmu_start(void)
+{
+	struct adb_request *req;
+
+	/* assert pmu_state == idle */
+	/* get the packet to send */
+	req = current_req;
+	if (req == 0 || pmu_state != idle
+	    || (/*req->reply_expected && */req_awaiting_reply))
+		return;
+
+	pmu_state = sending;
+	data_index = 1;
+	data_len = pmu_data_len[req->data[0]][0];
+
+	/* Sounds safer to make sure ACK is high before writing. This helped
+	 * kill a problem with ADB and some iBooks
+	 */
+	wait_for_ack();
+	/* set the shift register to shift out and send a byte */
+	send_byte(req->data[0]);
+}
+
+void
+pmu_poll(void)
+{
+	if (!via)
+		return;
+	if (disable_poll)
+		return;
+	via_pmu_interrupt(0, NULL, NULL);
+}
+
+void
+pmu_poll_adb(void)
+{
+	if (!via)
+		return;
+	if (disable_poll)
+		return;
+	/* Kicks ADB read when PMU is suspended */
+	adb_int_pending = 1;
+	do {
+		via_pmu_interrupt(0, NULL, NULL);
+	} while (pmu_suspended && (adb_int_pending || pmu_state != idle
+		|| req_awaiting_reply));
+}
+
+void
+pmu_wait_complete(struct adb_request *req)
+{
+	if (!via)
+		return;
+	while((pmu_state != idle && pmu_state != locked) || !req->complete)
+		via_pmu_interrupt(0, NULL, NULL);
+}
+
+/* This function loops until the PMU is idle and prevents it from
+ * answering ADB interrupts. pmu_request can still be called.
+ * This is done to avoid spurious shutdowns when we know we'll have
+ * interrupts switched off for a long time.
+ */
+void
+pmu_suspend(void)
+{
+	unsigned long flags;
+#ifdef SUSPEND_USES_PMU
+	struct adb_request req;
+#endif
+	if (!via)
+		return;
+	
+	spin_lock_irqsave(&pmu_lock, flags);
+	pmu_suspended++;
+	if (pmu_suspended > 1) {
+		spin_unlock_irqrestore(&pmu_lock, flags);
+		return;
+	}
+
+	do {
+		spin_unlock_irqrestore(&pmu_lock, flags);
+		if (req_awaiting_reply)
+			adb_int_pending = 1;
+		via_pmu_interrupt(0, NULL, NULL);
+		spin_lock_irqsave(&pmu_lock, flags);
+		if (!adb_int_pending && pmu_state == idle && !req_awaiting_reply) {
+#ifdef SUSPEND_USES_PMU
+			pmu_request(&req, NULL, 2, PMU_SET_INTR_MASK, 0);
+			spin_unlock_irqrestore(&pmu_lock, flags);
+			while(!req.complete)
+				pmu_poll();
+#else /* SUSPEND_USES_PMU */
+			if (gpio_irq >= 0)
+				disable_irq_nosync(gpio_irq);
+			out_8(&via[IER], CB1_INT | IER_CLR);
+			spin_unlock_irqrestore(&pmu_lock, flags);
+#endif /* SUSPEND_USES_PMU */
+			break;
+		}
+	} while (1);
+}
+
+void
+pmu_resume(void)
+{
+	unsigned long flags;
+#ifdef SUSPEND_USES_PMU
+	struct adb_request req;
+#endif
+
+	if (!via || (pmu_suspended < 1))
+		return;
+
+	spin_lock_irqsave(&pmu_lock, flags);
+	pmu_suspended--;
+	if (pmu_suspended > 0) {
+		spin_unlock_irqrestore(&pmu_lock, flags);
+		return;
+	}
+	adb_int_pending = 1;
+#ifdef SUSPEND_USES_PMU
+	pmu_request(&req, NULL, 2, PMU_SET_INTR_MASK, pmu_intr_mask);
+	spin_unlock_irqrestore(&pmu_lock, flags);
+	while(!req.complete)
+		pmu_poll();
+#else /* SUSPEND_USES_PMU */
+	if (gpio_irq >= 0)
+		enable_irq(gpio_irq);
+	out_8(&via[IER], CB1_INT | IER_SET);
+	spin_unlock_irqrestore(&pmu_lock, flags);
+	pmu_poll();
+#endif /* SUSPEND_USES_PMU */
+}
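+
+/* Illustrative sketch, not part of the driver: how code that must keep
+ * interrupts masked for a long time would bracket that window, per the
+ * comment above pmu_suspend(). do_long_work_with_irqs_masked() is
+ * hypothetical.
+ */
+#if 0
+static void example_quiet_pmu_window(void)
+{
+	pmu_suspend();			/* wait for idle, stop ADB answering */
+	do_long_work_with_irqs_masked();
+	pmu_resume();			/* re-enable and kick a poll */
+}
+#endif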
+
+/* Interrupt data could be the result data from an ADB cmd */
+static void
+pmu_handle_data(unsigned char *data, int len, struct pt_regs *regs)
+{
+	unsigned char ints, pirq;
+	int i = 0;
+
+	asleep = 0;
+	if (drop_interrupts || len < 1) {
+		adb_int_pending = 0;
+		pmu_irq_stats[8]++;
+		return;
+	}
+
+	/* Get PMU interrupt mask */
+	ints = data[0];
+
+	/* Record zero interrupts for stats */
+	if (ints == 0)
+		pmu_irq_stats[9]++;
+
+	/* Hack to deal with ADB autopoll flag */
+	if (ints & PMU_INT_ADB)
+		ints &= ~(PMU_INT_ADB_AUTO | PMU_INT_AUTO_SRQ_POLL);
+
+next:
+
+	if (ints == 0) {
+		if (i > pmu_irq_stats[10])
+			pmu_irq_stats[10] = i;
+		return;
+	}
+
+	for (pirq = 0; pirq < 8; pirq++)
+		if (ints & (1 << pirq))
+			break;
+	pmu_irq_stats[pirq]++;
+	i++;
+	ints &= ~(1 << pirq);
+
+	/* Note: for some reason, we get an interrupt with len=1,
+	 * data[0]==0 after each normal ADB interrupt, at least
+	 * on the Pismo. Still investigating...  --BenH
+	 */
+	if ((1 << pirq) & PMU_INT_ADB) {
+		if ((data[0] & PMU_INT_ADB_AUTO) == 0) {
+			struct adb_request *req = req_awaiting_reply;
+			if (req == 0) {
+				printk(KERN_ERR "PMU: extra ADB reply\n");
+				return;
+			}
+			req_awaiting_reply = NULL;
+			if (len <= 2)
+				req->reply_len = 0;
+			else {
+				memcpy(req->reply, data + 1, len - 1);
+				req->reply_len = len - 1;
+			}
+			pmu_done(req);
+		} else {
+			if (len == 4 && data[1] == 0x2c) {
+				extern int xmon_wants_key, xmon_adb_keycode;
+				if (xmon_wants_key) {
+					xmon_adb_keycode = data[2];
+					return;
+				}
+			}
+#ifdef CONFIG_ADB
+			/*
+			 * XXX On the [23]400 the PMU gives us an up
+			 * event for keycodes 0x74 or 0x75 when the PC
+			 * card eject buttons are released, so we
+			 * ignore those events.
+			 */
+			if (!(pmu_kind == PMU_OHARE_BASED && len == 4
+			      && data[1] == 0x2c && data[3] == 0xff
+			      && (data[2] & ~1) == 0xf4))
+				adb_input(data+1, len-1, regs, 1);
+#endif /* CONFIG_ADB */		
+		}
+	}
+	/* Sound/brightness button pressed */
+	else if ((1 << pirq) & PMU_INT_SNDBRT) {
+#ifdef CONFIG_PMAC_BACKLIGHT
+		if (len == 3)
+#ifdef CONFIG_INPUT_ADBHID
+			if (!disable_kernel_backlight)
+#endif /* CONFIG_INPUT_ADBHID */
+				set_backlight_level(data[1] >> 4);
+#endif /* CONFIG_PMAC_BACKLIGHT */
+	}
+	/* Tick interrupt */
+	else if ((1 << pirq) & PMU_INT_TICK) {
+		/* Environment or tick interrupt, query batteries */
+		if (pmu_battery_count) {
+			if ((--query_batt_timer) == 0) {
+				query_battery_state();
+				query_batt_timer = BATTERY_POLLING_COUNT;
+			}
+		}
+        }
+	else if ((1 << pirq) & PMU_INT_ENVIRONMENT) {
+		if (pmu_battery_count)
+			query_battery_state();
+		pmu_pass_intr(data, len);
+	} else {
+	       pmu_pass_intr(data, len);
+	}
+	goto next;
+}
+
+static struct adb_request*
+pmu_sr_intr(struct pt_regs *regs)
+{
+	struct adb_request *req;
+	int bite = 0;
+
+	if (via[B] & TREQ) {
+		printk(KERN_ERR "PMU: spurious SR intr (%x)\n", via[B]);
+		out_8(&via[IFR], SR_INT);
+		return NULL;
+	}
+	/* The ack may not yet be low when we get the interrupt */
+	while ((in_8(&via[B]) & TACK) != 0)
+			;
+
+	/* if reading grab the byte, and reset the interrupt */
+	if (pmu_state == reading || pmu_state == reading_intr)
+		bite = in_8(&via[SR]);
+
+	/* reset TREQ and wait for TACK to go high */
+	out_8(&via[B], in_8(&via[B]) | TREQ);
+	wait_for_ack();
+
+	switch (pmu_state) {
+	case sending:
+		req = current_req;
+		if (data_len < 0) {
+			data_len = req->nbytes - 1;
+			send_byte(data_len);
+			break;
+		}
+		if (data_index <= data_len) {
+			send_byte(req->data[data_index++]);
+			break;
+		}
+		req->sent = 1;
+		data_len = pmu_data_len[req->data[0]][1];
+		if (data_len == 0) {
+			pmu_state = idle;
+			current_req = req->next;
+			if (req->reply_expected)
+				req_awaiting_reply = req;
+			else
+				return req;
+		} else {
+			pmu_state = reading;
+			data_index = 0;
+			reply_ptr = req->reply + req->reply_len;
+			recv_byte();
+		}
+		break;
+
+	case intack:
+		data_index = 0;
+		data_len = -1;
+		pmu_state = reading_intr;
+		reply_ptr = interrupt_data[int_data_last];
+		recv_byte();
+		if (gpio_irq >= 0 && !gpio_irq_enabled) {
+			enable_irq(gpio_irq);
+			gpio_irq_enabled = 1;
+		}
+		break;
+
+	case reading:
+	case reading_intr:
+		if (data_len == -1) {
+			data_len = bite;
+			if (bite > 32)
+				printk(KERN_ERR "PMU: bad reply len %d\n", bite);
+		} else if (data_index < 32) {
+			reply_ptr[data_index++] = bite;
+		}
+		if (data_index < data_len) {
+			recv_byte();
+			break;
+		}
+
+		if (pmu_state == reading_intr) {
+			pmu_state = idle;
+			int_data_state[int_data_last] = int_data_ready;
+			interrupt_data_len[int_data_last] = data_len;
+		} else {
+			req = current_req;
+			/*
+			 * For PMU sleep and freq change requests, we lock the
+			 * PMU until it's explicitly unlocked. This avoids any
+			 * spurious event polling getting in.
+			 */
+			current_req = req->next;
+			req->reply_len += data_index;
+			if (req->data[0] == PMU_SLEEP || req->data[0] == PMU_CPU_SPEED)
+				pmu_state = locked;
+			else
+				pmu_state = idle;
+			return req;
+		}
+		break;
+
+	default:
+		printk(KERN_ERR "via_pmu_interrupt: unknown state %d?\n",
+		       pmu_state);
+	}
+	return NULL;
+}
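+
+/* Illustrative note, not part of the driver: the pmu_state transitions
+ * driven by pmu_start(), via_pmu_interrupt() and pmu_sr_intr() above:
+ *
+ *	idle -> sending -> idle            (no reply bytes expected)
+ *	idle -> sending -> reading -> idle (fixed or length-byte reply)
+ *	idle -> intack -> reading_intr -> idle
+ *	reading -> locked -> idle          (PMU_SLEEP/PMU_CPU_SPEED,
+ *	                                    released by pmu_unlock())
+ */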
+
+static irqreturn_t
+via_pmu_interrupt(int irq, void *arg, struct pt_regs *regs)
+{
+	unsigned long flags;
+	int intr;
+	int nloop = 0;
+	int int_data = -1;
+	struct adb_request *req = NULL;
+	int handled = 0;
+
+	/* This is a bit brutal, we can probably do better */
+	spin_lock_irqsave(&pmu_lock, flags);
+	++disable_poll;
+	
+	for (;;) {
+		intr = in_8(&via[IFR]) & (SR_INT | CB1_INT);
+		if (intr == 0)
+			break;
+		handled = 1;
+		if (++nloop > 1000) {
+			printk(KERN_DEBUG "PMU: stuck in intr loop, "
+			       "intr=%x, ier=%x pmu_state=%d\n",
+			       intr, in_8(&via[IER]), pmu_state);
+			break;
+		}
+		out_8(&via[IFR], intr);
+		if (intr & CB1_INT) {
+			adb_int_pending = 1;
+			pmu_irq_stats[0]++;
+		}
+		if (intr & SR_INT) {
+			req = pmu_sr_intr(regs);
+			if (req)
+				break;
+		}
+	}
+
+recheck:
+	if (pmu_state == idle) {
+		if (adb_int_pending) {
+			if (int_data_state[0] == int_data_empty)
+				int_data_last = 0;
+			else if (int_data_state[1] == int_data_empty)
+				int_data_last = 1;
+			else
+				goto no_free_slot;
+			pmu_state = intack;
+			int_data_state[int_data_last] = int_data_fill;
+			/* Sounds safer to make sure ACK is high before writing.
+			 * This helped kill a problem with ADB and some iBooks
+			 */
+			wait_for_ack();
+			send_byte(PMU_INT_ACK);
+			adb_int_pending = 0;
+		} else if (current_req)
+			pmu_start();
+	}
+no_free_slot:			
+	/* Mark the oldest buffer for flushing */
+	if (int_data_state[!int_data_last] == int_data_ready) {
+		int_data_state[!int_data_last] = int_data_flush;
+		int_data = !int_data_last;
+	} else if (int_data_state[int_data_last] == int_data_ready) {
+		int_data_state[int_data_last] = int_data_flush;
+		int_data = int_data_last;
+	}
+	--disable_poll;
+	spin_unlock_irqrestore(&pmu_lock, flags);
+
+	/* Deal with completed PMU requests outside of the lock */
+	if (req) {
+		pmu_done(req);
+		req = NULL;
+	}
+		
+	/* Deal with interrupt data outside of the lock */
+	if (int_data >= 0) {
+		pmu_handle_data(interrupt_data[int_data], interrupt_data_len[int_data], regs);
+		spin_lock_irqsave(&pmu_lock, flags);
+		++disable_poll;
+		int_data_state[int_data] = int_data_empty;
+		int_data = -1;
+		goto recheck;
+	}
+
+	return IRQ_RETVAL(handled);
+}
+
+void
+pmu_unlock(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pmu_lock, flags);
+	if (pmu_state == locked)
+		pmu_state = idle;
+	adb_int_pending = 1;
+	spin_unlock_irqrestore(&pmu_lock, flags);
+}
+
+
+static irqreturn_t
+gpio1_interrupt(int irq, void *arg, struct pt_regs *regs)
+{
+	unsigned long flags;
+
+	if ((in_8(gpio_reg + 0x9) & 0x02) == 0) {
+		spin_lock_irqsave(&pmu_lock, flags);
+		if (gpio_irq_enabled > 0) {
+			disable_irq_nosync(gpio_irq);
+			gpio_irq_enabled = 0;
+		}
+		pmu_irq_stats[1]++;
+		adb_int_pending = 1;
+		spin_unlock_irqrestore(&pmu_lock, flags);
+		via_pmu_interrupt(0, NULL, NULL);
+		return IRQ_HANDLED;
+	}
+	return IRQ_NONE;
+}
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+static int backlight_to_bright[] = {
+	0x7f, 0x46, 0x42, 0x3e, 0x3a, 0x36, 0x32, 0x2e,
+	0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e
+};
+ 
+static int
+pmu_set_backlight_enable(int on, int level, void* data)
+{
+	struct adb_request req;
+	
+	if (vias == NULL)
+		return -ENODEV;
+
+	if (on) {
+		pmu_request(&req, NULL, 2, PMU_BACKLIGHT_BRIGHT,
+			    backlight_to_bright[level]);
+		pmu_wait_complete(&req);
+	}
+	pmu_request(&req, NULL, 2, PMU_POWER_CTRL,
+		    PMU_POW_BACKLIGHT | (on ? PMU_POW_ON : PMU_POW_OFF));
+       	pmu_wait_complete(&req);
+
+	return 0;
+}
+
+static void
+pmu_bright_complete(struct adb_request *req)
+{
+	if (req == &bright_req_1)
+		clear_bit(1, &async_req_locks);
+	if (req == &bright_req_2)
+		clear_bit(2, &async_req_locks);
+}
+
+static int
+pmu_set_backlight_level(int level, void* data)
+{
+	if (vias == NULL)
+		return -ENODEV;
+
+	if (test_and_set_bit(1, &async_req_locks))
+		return -EAGAIN;
+	pmu_request(&bright_req_1, pmu_bright_complete, 2, PMU_BACKLIGHT_BRIGHT,
+		backlight_to_bright[level]);
+	if (test_and_set_bit(2, &async_req_locks))
+		return -EAGAIN;
+	pmu_request(&bright_req_2, pmu_bright_complete, 2, PMU_POWER_CTRL,
+		    PMU_POW_BACKLIGHT | (level > BACKLIGHT_OFF ?
+					 PMU_POW_ON : PMU_POW_OFF));
+
+	return 0;
+}
+#endif /* CONFIG_PMAC_BACKLIGHT */
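+
+/* Illustrative note, not part of the driver: the two brightness requests
+ * above are asynchronous. Bits 1 and 2 of async_req_locks act as busy
+ * flags: set by pmu_set_backlight_level() and cleared by
+ * pmu_bright_complete(), so a second level change returns -EAGAIN while
+ * one is still in flight.
+ */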
+
+void
+pmu_enable_irled(int on)
+{
+	struct adb_request req;
+
+	if (vias == NULL)
+		return ;
+	if (pmu_kind == PMU_KEYLARGO_BASED)
+		return ;
+
+	pmu_request(&req, NULL, 2, PMU_POWER_CTRL, PMU_POW_IRLED |
+	    (on ? PMU_POW_ON : PMU_POW_OFF));
+	pmu_wait_complete(&req);
+}
+
+void
+pmu_restart(void)
+{
+	struct adb_request req;
+
+	if (via == NULL)
+		return;
+
+	local_irq_disable();
+
+	drop_interrupts = 1;
+	
+	if (pmu_kind != PMU_KEYLARGO_BASED) {
+		pmu_request(&req, NULL, 2, PMU_SET_INTR_MASK, PMU_INT_ADB |
+						PMU_INT_TICK );
+		while(!req.complete)
+			pmu_poll();
+	}
+
+	pmu_request(&req, NULL, 1, PMU_RESET);
+	pmu_wait_complete(&req);
+	for (;;)
+		;
+}
+
+void
+pmu_shutdown(void)
+{
+	struct adb_request req;
+
+	if (via == NULL)
+		return;
+
+	local_irq_disable();
+
+	drop_interrupts = 1;
+
+	if (pmu_kind != PMU_KEYLARGO_BASED) {
+		pmu_request(&req, NULL, 2, PMU_SET_INTR_MASK, PMU_INT_ADB |
+						PMU_INT_TICK );
+		pmu_wait_complete(&req);
+	} else {
+		/* Disable server mode on shutdown or we'll just
+		 * wake up again
+		 */
+		pmu_set_server_mode(0);
+	}
+
+	pmu_request(&req, NULL, 5, PMU_SHUTDOWN,
+		    'M', 'A', 'T', 'T');
+	pmu_wait_complete(&req);
+	for (;;)
+		;
+}
+
+int
+pmu_present(void)
+{
+	return via != 0;
+}
+
+#ifdef CONFIG_PM
+
+static LIST_HEAD(sleep_notifiers);
+
+int
+pmu_register_sleep_notifier(struct pmu_sleep_notifier *n)
+{
+	struct list_head *list;
+	struct pmu_sleep_notifier *notifier;
+
+	for (list = sleep_notifiers.next; list != &sleep_notifiers;
+	     list = list->next) {
+		notifier = list_entry(list, struct pmu_sleep_notifier, list);
+		if (n->priority > notifier->priority)
+			break;
+	}
+	__list_add(&n->list, list->prev, list);
+	return 0;
+}
+EXPORT_SYMBOL(pmu_register_sleep_notifier);
+
+int
+pmu_unregister_sleep_notifier(struct pmu_sleep_notifier* n)
+{
+	if (n->list.next == 0)
+		return -ENOENT;
+	list_del(&n->list);
+	n->list.next = NULL;
+	return 0;
+}
+EXPORT_SYMBOL(pmu_unregister_sleep_notifier);
+#endif /* CONFIG_PM */
+
+#if defined(CONFIG_PM) && defined(CONFIG_PPC32)
+
+/* Sleep is broadcast last-to-first */
+static int
+broadcast_sleep(int when, int fallback)
+{
+	int ret = PBOOK_SLEEP_OK;
+	struct list_head *list;
+	struct pmu_sleep_notifier *notifier;
+
+	for (list = sleep_notifiers.prev; list != &sleep_notifiers;
+	     list = list->prev) {
+		notifier = list_entry(list, struct pmu_sleep_notifier, list);
+		ret = notifier->notifier_call(notifier, when);
+		if (ret != PBOOK_SLEEP_OK) {
+			printk(KERN_DEBUG "sleep %d rejected by %p (%p)\n",
+			       when, notifier, notifier->notifier_call);
+			for (; list != &sleep_notifiers; list = list->next) {
+				notifier = list_entry(list, struct pmu_sleep_notifier, list);
+				notifier->notifier_call(notifier, fallback);
+			}
+			return ret;
+		}
+	}
+	return ret;
+}
+
+/* Wake is broadcast first-to-last */
+static int
+broadcast_wake(void)
+{
+	int ret = PBOOK_SLEEP_OK;
+	struct list_head *list;
+	struct pmu_sleep_notifier *notifier;
+
+	for (list = sleep_notifiers.next; list != &sleep_notifiers;
+	     list = list->next) {
+		notifier = list_entry(list, struct pmu_sleep_notifier, list);
+		notifier->notifier_call(notifier, PBOOK_WAKE);
+	}
+	return ret;
+}
+
+/*
+ * This struct is used to store config register values for
+ * PCI devices which may get powered off when we sleep.
+ */
+static struct pci_save {
+#ifndef HACKED_PCI_SAVE
+	u16	command;
+	u16	cache_lat;
+	u16	intr;
+	u32	rom_address;
+#else
+	u32	config[16];
+#endif	
+} *pbook_pci_saves;
+static int pbook_npci_saves;
+
+static void
+pbook_alloc_pci_save(void)
+{
+	int npci;
+	struct pci_dev *pd = NULL;
+
+	npci = 0;
+	while ((pd = pci_find_device(PCI_ANY_ID, PCI_ANY_ID, pd)) != NULL) {
+		++npci;
+	}
+	if (npci == 0)
+		return;
+	pbook_pci_saves = (struct pci_save *)
+		kmalloc(npci * sizeof(struct pci_save), GFP_KERNEL);
+	pbook_npci_saves = npci;
+}
+
+static void
+pbook_free_pci_save(void)
+{
+	if (pbook_pci_saves == NULL)
+		return;
+	kfree(pbook_pci_saves);
+	pbook_pci_saves = NULL;
+	pbook_npci_saves = 0;
+}
+
+static void
+pbook_pci_save(void)
+{
+	struct pci_save *ps = pbook_pci_saves;
+	struct pci_dev *pd = NULL;
+	int npci = pbook_npci_saves;
+	
+	if (ps == NULL)
+		return;
+
+	while ((pd = pci_find_device(PCI_ANY_ID, PCI_ANY_ID, pd)) != NULL) {
+		if (npci-- == 0)
+			return;
+#ifndef HACKED_PCI_SAVE
+		pci_read_config_word(pd, PCI_COMMAND, &ps->command);
+		pci_read_config_word(pd, PCI_CACHE_LINE_SIZE, &ps->cache_lat);
+		pci_read_config_word(pd, PCI_INTERRUPT_LINE, &ps->intr);
+		pci_read_config_dword(pd, PCI_ROM_ADDRESS, &ps->rom_address);
+#else
+		int i;
+		for (i=1;i<16;i++)
+			pci_read_config_dword(pd, i<<4, &ps->config[i]);
+#endif
+		++ps;
+	}
+}
+
+/* For this to work, we must take care of a few things: If gmac was enabled
+ * during boot, it will be in the pci dev list. If it's disabled at this point
+ * (and it will probably be), then you can't access its config space.
+ */
+static void
+pbook_pci_restore(void)
+{
+	u16 cmd;
+	struct pci_save *ps = pbook_pci_saves - 1;
+	struct pci_dev *pd = NULL;
+	int npci = pbook_npci_saves;
+	int j;
+
+	while ((pd = pci_find_device(PCI_ANY_ID, PCI_ANY_ID, pd)) != NULL) {
+#ifdef HACKED_PCI_SAVE
+		int i;
+		if (npci-- == 0)
+			return;
+		ps++;
+		for (i=2;i<16;i++)
+			pci_write_config_dword(pd, i<<4, ps->config[i]);
+		pci_write_config_dword(pd, 4, ps->config[1]);
+#else
+		if (npci-- == 0)
+			return;
+		ps++;
+		if (ps->command == 0)
+			continue;
+		pci_read_config_word(pd, PCI_COMMAND, &cmd);
+		if ((ps->command & ~cmd) == 0)
+			continue;
+		switch (pd->hdr_type) {
+		case PCI_HEADER_TYPE_NORMAL:
+			for (j = 0; j < 6; ++j)
+				pci_write_config_dword(pd,
+					PCI_BASE_ADDRESS_0 + j*4,
+					pd->resource[j].start);
+			pci_write_config_dword(pd, PCI_ROM_ADDRESS,
+				ps->rom_address);
+			pci_write_config_word(pd, PCI_CACHE_LINE_SIZE,
+				ps->cache_lat);
+			pci_write_config_word(pd, PCI_INTERRUPT_LINE,
+				ps->intr);
+			pci_write_config_word(pd, PCI_COMMAND, ps->command);
+			break;
+		}
+#endif	
+	}
+}
+
+#ifdef DEBUG_SLEEP
+/* N.B. This doesn't work on the 3400 */
+void 
+pmu_blink(int n)
+{
+	struct adb_request req;
+
+	memset(&req, 0, sizeof(req));
+
+	for (; n > 0; --n) {
+		req.nbytes = 4;
+		req.done = NULL;
+		req.data[0] = 0xee;
+		req.data[1] = 4;
+		req.data[2] = 0;
+		req.data[3] = 1;
+		req.reply[0] = ADB_RET_OK;
+		req.reply_len = 1;
+		req.reply_expected = 0;
+		pmu_polled_request(&req);
+		mdelay(50);
+		req.nbytes = 4;
+		req.done = NULL;
+		req.data[0] = 0xee;
+		req.data[1] = 4;
+		req.data[2] = 0;
+		req.data[3] = 0;
+		req.reply[0] = ADB_RET_OK;
+		req.reply_len = 1;
+		req.reply_expected = 0;
+		pmu_polled_request(&req);
+		mdelay(50);
+	}
+	mdelay(50);
+}
+#endif
+
+/*
+ * Put the powerbook to sleep.
+ */
+ 
+static u32 save_via[8];
+
+static void
+save_via_state(void)
+{
+	save_via[0] = in_8(&via[ANH]);
+	save_via[1] = in_8(&via[DIRA]);
+	save_via[2] = in_8(&via[B]);
+	save_via[3] = in_8(&via[DIRB]);
+	save_via[4] = in_8(&via[PCR]);
+	save_via[5] = in_8(&via[ACR]);
+	save_via[6] = in_8(&via[T1CL]);
+	save_via[7] = in_8(&via[T1CH]);
+}
+static void
+restore_via_state(void)
+{
+	out_8(&via[ANH], save_via[0]);
+	out_8(&via[DIRA], save_via[1]);
+	out_8(&via[B], save_via[2]);
+	out_8(&via[DIRB], save_via[3]);
+	out_8(&via[PCR], save_via[4]);
+	out_8(&via[ACR], save_via[5]);
+	out_8(&via[T1CL], save_via[6]);
+	out_8(&via[T1CH], save_via[7]);
+	out_8(&via[IER], IER_CLR | 0x7f);	/* disable all intrs */
+	out_8(&via[IFR], 0x7f);				/* clear IFR */
+	out_8(&via[IER], IER_SET | SR_INT | CB1_INT);
+}
+
+static int
+pmac_suspend_devices(void)
+{
+	int ret;
+
+	pm_prepare_console();
+	
+	/* Notify old-style device drivers & userland */
+	ret = broadcast_sleep(PBOOK_SLEEP_REQUEST, PBOOK_SLEEP_REJECT);
+	if (ret != PBOOK_SLEEP_OK) {
+		printk(KERN_ERR "Sleep rejected by drivers\n");
+		return -EBUSY;
+	}
+
+	/* Sync the disks. */
+	/* XXX It would be nice to have some way to ensure that
+	 * nobody is dirtying any new buffers while we wait. That
+	 * could be achieved using the refrigerator for processes
+	 * that swsusp uses
+	 */
+	sys_sync();
+
+	/* Sleep can fail now. May not be very robust but useful for debugging */
+	ret = broadcast_sleep(PBOOK_SLEEP_NOW, PBOOK_WAKE);
+	if (ret != PBOOK_SLEEP_OK) {
+		printk(KERN_ERR "Driver sleep failed\n");
+		return -EBUSY;
+	}
+
+	/* Send suspend call to devices, hold the device core's dpm_sem */
+	ret = device_suspend(PMSG_SUSPEND);
+	if (ret) {
+		broadcast_wake();
+		printk(KERN_ERR "Driver sleep failed\n");
+		return -EBUSY;
+	}
+
+	/* Call platform functions marked "on sleep" */
+	pmac_pfunc_i2c_suspend();
+	pmac_pfunc_base_suspend();
+
+	/* Stop preemption */
+	preempt_disable();
+
+	/* Make sure the decrementer won't interrupt us */
+	asm volatile("mtdec %0" : : "r" (0x7fffffff));
+	/* Make sure any pending DEC interrupt occurring while we did
+	 * the above didn't re-enable the DEC */
+	mb();
+	asm volatile("mtdec %0" : : "r" (0x7fffffff));
+
+	/* We can now disable MSR_EE. This code of course works properly only
+	 * on UP machines... For SMP, if we ever implement sleep, we'll have to
+	 * stop the "other" CPUs way before we do all that stuff.
+	 */
+	local_irq_disable();
+
+	/* Broadcast the power-down irq.
+	 * This isn't that useful in most cases (only directly wired devices
+	 * can use it), but it will take care of sysdevs as well, so we exit
+	 * from here with local irqs disabled and the PIC off.
+	 */
+	ret = device_power_down(PMSG_SUSPEND);
+	if (ret) {
+		wakeup_decrementer();
+		local_irq_enable();
+		preempt_enable();
+		device_resume();
+		broadcast_wake();
+		printk(KERN_ERR "Driver powerdown failed\n");
+		return -EBUSY;
+	}
+
+	/* Wait for completion of async backlight requests */
+	while (!bright_req_1.complete || !bright_req_2.complete ||
+			!batt_req.complete)
+		pmu_poll();
+
+	/* Give up the lazy FPU & vec so we don't have to back them
+	 * up from the low-level code
+	 */
+	enable_kernel_fp();
+
+#ifdef CONFIG_ALTIVEC
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		enable_kernel_altivec();
+#endif /* CONFIG_ALTIVEC */
+
+	return 0;
+}
+
+static int
+pmac_wakeup_devices(void)
+{
+	mdelay(100);
+
+	/* Power back up system devices (including the PIC) */
+	device_power_up();
+
+	/* Force a poll of ADB interrupts */
+	adb_int_pending = 1;
+	via_pmu_interrupt(0, NULL, NULL);
+
+	/* Restart jiffies & scheduling */
+	wakeup_decrementer();
+
+	/* Re-enable local CPU interrupts */
+	local_irq_enable();
+	mdelay(10);
+	preempt_enable();
+
+	/* Call platform functions marked "on wake" */
+	pmac_pfunc_base_resume();
+	pmac_pfunc_i2c_resume();
+
+	/* Resume devices */
+	device_resume();
+
+	/* Notify old style drivers */
+	broadcast_wake();
+
+	pm_restore_console();
+
+	return 0;
+}
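+
+/* Illustrative note, not part of the driver: pmac_wakeup_devices() undoes
+ * pmac_suspend_devices() roughly in reverse: power system devices and the
+ * PIC back up, restart the decrementer, re-enable local irqs and
+ * preemption, run the "on wake" platform functions, resume devices, then
+ * notify the old-style sleep notifiers and restore the console.
+ */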
+
+#define	GRACKLE_PM	(1<<7)
+#define GRACKLE_DOZE	(1<<5)
+#define	GRACKLE_NAP	(1<<4)
+#define	GRACKLE_SLEEP	(1<<3)
+
+static int powerbook_sleep_grackle(void)
+{
+	unsigned long save_l2cr;
+	unsigned short pmcr1;
+	struct adb_request req;
+	int ret;
+	struct pci_dev *grackle;
+
+	grackle = pci_find_slot(0, 0);
+	if (!grackle)
+		return -ENODEV;
+
+	ret = pmac_suspend_devices();
+	if (ret) {
+		printk(KERN_ERR "Sleep rejected by devices\n");
+		return ret;
+	}
+	
+	/* Turn off various things. Darwin does some retry tests here... */
+	pmu_request(&req, NULL, 2, PMU_POWER_CTRL0, PMU_POW0_OFF|PMU_POW0_HARD_DRIVE);
+	pmu_wait_complete(&req);
+	pmu_request(&req, NULL, 2, PMU_POWER_CTRL,
+		PMU_POW_OFF|PMU_POW_BACKLIGHT|PMU_POW_IRLED|PMU_POW_MEDIABAY);
+	pmu_wait_complete(&req);
+
+	/* For 750, save backside cache setting and disable it */
+	save_l2cr = _get_L2CR();	/* (returns -1 if not available) */
+
+	if (!__fake_sleep) {
+		/* Ask the PMU to put us to sleep */
+		pmu_request(&req, NULL, 5, PMU_SLEEP, 'M', 'A', 'T', 'T');
+		pmu_wait_complete(&req);
+	}
+
+	/* The VIA state is not preserved across sleep, so save it ourselves */
+	save_via_state();
+	/* We shut down some HW */
+	pmac_call_feature(PMAC_FTR_SLEEP_STATE,NULL,0,1);
+
+	pci_read_config_word(grackle, 0x70, &pmcr1);
+	/* Apparently, MacOS uses NAP mode for Grackle ??? */
+	pmcr1 &= ~(GRACKLE_DOZE|GRACKLE_SLEEP); 
+	pmcr1 |= GRACKLE_PM|GRACKLE_NAP;
+	pci_write_config_word(grackle, 0x70, pmcr1);
+
+	/* Call low-level ASM sleep handler */
+	if (__fake_sleep)
+		mdelay(5000);
+	else
+		low_sleep_handler();
+
+	/* We're awake again, stop grackle PM */
+	pci_read_config_word(grackle, 0x70, &pmcr1);
+	pmcr1 &= ~(GRACKLE_PM|GRACKLE_DOZE|GRACKLE_SLEEP|GRACKLE_NAP); 
+	pci_write_config_word(grackle, 0x70, pmcr1);
+
+	/* Make sure the PMU is idle */
+	pmac_call_feature(PMAC_FTR_SLEEP_STATE,NULL,0,0);
+	restore_via_state();
+	
+	/* Restore L2 cache */
+	if (save_l2cr != 0xffffffff && (save_l2cr & L2CR_L2E) != 0)
+ 		_set_L2CR(save_l2cr);
+	
+	/* Restore userland MMU context */
+	set_context(current->active_mm->context, current->active_mm->pgd);
+
+	/* Power things up */
+	pmu_unlock();
+	pmu_request(&req, NULL, 2, PMU_SET_INTR_MASK, pmu_intr_mask);
+	pmu_wait_complete(&req);
+	pmu_request(&req, NULL, 2, PMU_POWER_CTRL0,
+			PMU_POW0_ON|PMU_POW0_HARD_DRIVE);
+	pmu_wait_complete(&req);
+	pmu_request(&req, NULL, 2, PMU_POWER_CTRL,
+			PMU_POW_ON|PMU_POW_BACKLIGHT|PMU_POW_CHARGER|PMU_POW_IRLED|PMU_POW_MEDIABAY);
+	pmu_wait_complete(&req);
+
+	pmac_wakeup_devices();
+
+	return 0;
+}
+
+static int
+powerbook_sleep_Core99(void)
+{
+	unsigned long save_l2cr;
+	unsigned long save_l3cr;
+	struct adb_request req;
+	int ret;
+	
+	if (pmac_call_feature(PMAC_FTR_SLEEP_STATE,NULL,0,-1) < 0) {
+		printk(KERN_ERR "Sleep mode not supported on this machine\n");
+		return -ENOSYS;
+	}
+
+	if (num_online_cpus() > 1 || cpu_is_offline(0))
+		return -EAGAIN;
+
+	ret = pmac_suspend_devices();
+	if (ret) {
+		printk(KERN_ERR "Sleep rejected by devices\n");
+		return ret;
+	}
+
+	/* Stop environment and ADB interrupts */
+	pmu_request(&req, NULL, 2, PMU_SET_INTR_MASK, 0);
+	pmu_wait_complete(&req);
+
+	/* Tell PMU what events will wake us up */
+	pmu_request(&req, NULL, 4, PMU_POWER_EVENTS, PMU_PWR_CLR_WAKEUP_EVENTS,
+		0xff, 0xff);
+	pmu_wait_complete(&req);
+	pmu_request(&req, NULL, 4, PMU_POWER_EVENTS, PMU_PWR_SET_WAKEUP_EVENTS,
+		0, PMU_PWR_WAKEUP_KEY |
+		(option_lid_wakeup ? PMU_PWR_WAKEUP_LID_OPEN : 0));
+	pmu_wait_complete(&req);
+
+	/* Save the state of the L2 and L3 caches */
+	save_l3cr = _get_L3CR();	/* (returns -1 if not available) */
+	save_l2cr = _get_L2CR();	/* (returns -1 if not available) */
+
+	if (!__fake_sleep) {
+		/* Ask the PMU to put us to sleep */
+		pmu_request(&req, NULL, 5, PMU_SLEEP, 'M', 'A', 'T', 'T');
+		pmu_wait_complete(&req);
+	}
+
+	/* The VIA is not expected to be restored correctly by the hardware */
+	save_via_state();
+
+	/* Shut down various ASICs. There's a chance that we can no longer
+	 * talk to the PMU after this, so I moved it to _after_ sending the
+	 * sleep command to it. This still needs to be checked.
+	 */
+	pmac_call_feature(PMAC_FTR_SLEEP_STATE, NULL, 0, 1);
+
+	/* Call low-level ASM sleep handler */
+	if (__fake_sleep)
+		mdelay(5000);
+	else
+		low_sleep_handler();
+
+	/* Restore Apple core ASICs state */
+	pmac_call_feature(PMAC_FTR_SLEEP_STATE, NULL, 0, 0);
+
+	/* Restore VIA */
+	restore_via_state();
+
+	/* tweak LPJ (loops_per_jiffy) before cpufreq is available again */
+	loops_per_jiffy *= 2;
+
+	/* Restore video */
+	pmac_call_early_video_resume();
+
+	/* Restore L2 cache */
+	if (save_l2cr != 0xffffffff && (save_l2cr & L2CR_L2E) != 0)
+		_set_L2CR(save_l2cr);
+	/* Restore L3 cache */
+	if (save_l3cr != 0xffffffff && (save_l3cr & L3CR_L3E) != 0)
+		_set_L3CR(save_l3cr);
+	
+	/* Restore userland MMU context */
+	set_context(current->active_mm->context, current->active_mm->pgd);
+
+	/* Tell PMU we are ready */
+	pmu_unlock();
+	pmu_request(&req, NULL, 2, PMU_SYSTEM_READY, 2);
+	pmu_wait_complete(&req);
+	pmu_request(&req, NULL, 2, PMU_SET_INTR_MASK, pmu_intr_mask);
+	pmu_wait_complete(&req);
+
+	/* Restore LPJ, cpufreq will adjust the cpu frequency */
+	loops_per_jiffy /= 2;
+
+	pmac_wakeup_devices();
+
+	return 0;
+}
+
+#define PB3400_MEM_CTRL		0xf8000000
+#define PB3400_MEM_CTRL_SLEEP	0x70
+
+static int
+powerbook_sleep_3400(void)
+{
+	int ret, i, x;
+	unsigned int hid0;
+	unsigned long p;
+	struct adb_request sleep_req;
+	void __iomem *mem_ctrl;
+	unsigned int __iomem *mem_ctrl_sleep;
+
+	/* first map in the memory controller registers */
+	mem_ctrl = ioremap(PB3400_MEM_CTRL, 0x100);
+	if (mem_ctrl == NULL) {
+		printk("powerbook_sleep_3400: ioremap failed\n");
+		return -ENOMEM;
+	}
+	mem_ctrl_sleep = mem_ctrl + PB3400_MEM_CTRL_SLEEP;
+
+	/* Allocate room for PCI save */
+	pbook_alloc_pci_save();
+
+	ret = pmac_suspend_devices();
+	if (ret) {
+		pbook_free_pci_save();
+		printk(KERN_ERR "Sleep rejected by devices\n");
+		return ret;
+	}
+
+	/* Save the state of PCI config space for some slots */
+	pbook_pci_save();
+
+	/* Set the memory controller to keep the memory refreshed
+	   while we're asleep */
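+	/* (Scanning downward, each candidate value is written to the
+	 * sleep register and a refresh count read back from bits 16-25;
+	 * we stop at the first value giving a count of at least 0x100.)
+	 */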
+	for (i = 0x403f; i >= 0x4000; --i) {
+		out_be32(mem_ctrl_sleep, i);
+		do {
+			x = (in_be32(mem_ctrl_sleep) >> 16) & 0x3ff;
+		} while (x == 0);
+		if (x >= 0x100)
+			break;
+	}
+
+	/* Ask the PMU to put us to sleep */
+	pmu_request(&sleep_req, NULL, 5, PMU_SLEEP, 'M', 'A', 'T', 'T');
+	while (!sleep_req.complete)
+		mb();
+
+	pmac_call_feature(PMAC_FTR_SLEEP_STATE,NULL,0,1);
+
+	/* displacement-flush the L2 cache - necessary? */
+	for (p = KERNELBASE; p < KERNELBASE + 0x100000; p += 0x1000)
+		i = *(volatile int *)p;
+	asleep = 1;
+
+	/* Put the CPU into sleep mode */
+	hid0 = mfspr(SPRN_HID0);
+	hid0 = (hid0 & ~(HID0_NAP | HID0_DOZE)) | HID0_SLEEP;
+	mtspr(SPRN_HID0, hid0);
+	mtmsr(mfmsr() | MSR_POW | MSR_EE);
+	udelay(10);
+
+	/* OK, we're awake again, start restoring things */
+	out_be32(mem_ctrl_sleep, 0x3f);
+	pmac_call_feature(PMAC_FTR_SLEEP_STATE,NULL,0,0);
+	pbook_pci_restore();
+	pmu_unlock();
+
+	/* wait for the PMU interrupt sequence to complete */
+	while (asleep)
+		mb();
+
+	pmac_wakeup_devices();
+	pbook_free_pci_save();
+	iounmap(mem_ctrl);
+
+	return 0;
+}
+
+#endif /* CONFIG_PM && CONFIG_PPC32 */
+
+/*
+ * Support for /dev/pmu device
+ */
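+/* A sketch of the userland view, inferred from the fops below rather
+ * than from any ABI document: read() returns one PMU event packet per
+ * call, poll() reports POLLIN when a packet is queued, and
+ * ioctl(PMU_IOC_SLEEP) puts the machine to sleep (CAP_SYS_ADMIN only,
+ * and only with CONFIG_PM && CONFIG_PPC32).
+ */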
+#define RB_SIZE		0x10
+struct pmu_private {
+	struct list_head list;
+	int	rb_get;
+	int	rb_put;
+	struct rb_entry {
+		unsigned short len;
+		unsigned char data[16];
+	}	rb_buf[RB_SIZE];
+	wait_queue_head_t wait;
+	spinlock_t lock;
+#if defined(CONFIG_INPUT_ADBHID) && defined(CONFIG_PMAC_BACKLIGHT)
+	int	backlight_locker;
+#endif /* defined(CONFIG_INPUT_ADBHID) && defined(CONFIG_PMAC_BACKLIGHT) */	
+};
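+
+/* rb_buf is a classic "one slot kept empty" ring buffer: the writer in
+ * pmu_pass_intr() only advances rb_put when doing so would not make it
+ * equal rb_get, so at most RB_SIZE - 1 events are queued and newer
+ * events are dropped when a reader falls behind.
+ */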
+
+static LIST_HEAD(all_pmu_pvt);
+static DEFINE_SPINLOCK(all_pvt_lock);
+
+static void
+pmu_pass_intr(unsigned char *data, int len)
+{
+	struct pmu_private *pp;
+	struct list_head *list;
+	int i;
+	unsigned long flags;
+
+	if (len > sizeof(pp->rb_buf[0].data))
+		len = sizeof(pp->rb_buf[0].data);
+	spin_lock_irqsave(&all_pvt_lock, flags);
+	for (list = &all_pmu_pvt; (list = list->next) != &all_pmu_pvt; ) {
+		pp = list_entry(list, struct pmu_private, list);
+		spin_lock(&pp->lock);
+		i = pp->rb_put + 1;
+		if (i >= RB_SIZE)
+			i = 0;
+		if (i != pp->rb_get) {
+			struct rb_entry *rp = &pp->rb_buf[pp->rb_put];
+			rp->len = len;
+			memcpy(rp->data, data, len);
+			pp->rb_put = i;
+			wake_up_interruptible(&pp->wait);
+		}
+		spin_unlock(&pp->lock);
+	}
+	spin_unlock_irqrestore(&all_pvt_lock, flags);
+}
+
+static int
+pmu_open(struct inode *inode, struct file *file)
+{
+	struct pmu_private *pp;
+	unsigned long flags;
+
+	pp = kmalloc(sizeof(struct pmu_private), GFP_KERNEL);
+	if (!pp)
+		return -ENOMEM;
+	pp->rb_get = pp->rb_put = 0;
+	spin_lock_init(&pp->lock);
+	init_waitqueue_head(&pp->wait);
+	spin_lock_irqsave(&all_pvt_lock, flags);
+#if defined(CONFIG_INPUT_ADBHID) && defined(CONFIG_PMAC_BACKLIGHT)
+	pp->backlight_locker = 0;
+#endif /* defined(CONFIG_INPUT_ADBHID) && defined(CONFIG_PMAC_BACKLIGHT) */	
+	list_add(&pp->list, &all_pmu_pvt);
+	spin_unlock_irqrestore(&all_pvt_lock, flags);
+	file->private_data = pp;
+	return 0;
+}
+
+static ssize_t 
+pmu_read(struct file *file, char __user *buf,
+			size_t count, loff_t *ppos)
+{
+	struct pmu_private *pp = file->private_data;
+	DECLARE_WAITQUEUE(wait, current);
+	unsigned long flags;
+	int ret = 0;
+
+	if (count < 1 || !pp)
+		return -EINVAL;
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+
+	spin_lock_irqsave(&pp->lock, flags);
+	add_wait_queue(&pp->wait, &wait);
+	current->state = TASK_INTERRUPTIBLE;
+
+	for (;;) {
+		ret = -EAGAIN;
+		if (pp->rb_get != pp->rb_put) {
+			int i = pp->rb_get;
+			struct rb_entry *rp = &pp->rb_buf[i];
+			ret = rp->len;
+			spin_unlock_irqrestore(&pp->lock, flags);
+			if (ret > count)
+				ret = count;
+			if (ret > 0 && copy_to_user(buf, rp->data, ret))
+				ret = -EFAULT;
+			if (++i >= RB_SIZE)
+				i = 0;
+			spin_lock_irqsave(&pp->lock, flags);
+			pp->rb_get = i;
+		}
+		if (ret >= 0)
+			break;
+		if (file->f_flags & O_NONBLOCK)
+			break;
+		ret = -ERESTARTSYS;
+		if (signal_pending(current))
+			break;
+		spin_unlock_irqrestore(&pp->lock, flags);
+		schedule();
+		spin_lock_irqsave(&pp->lock, flags);
+	}
+	current->state = TASK_RUNNING;
+	remove_wait_queue(&pp->wait, &wait);
+	spin_unlock_irqrestore(&pp->lock, flags);
+	
+	return ret;
+}
+
+static ssize_t
+pmu_write(struct file *file, const char __user *buf,
+			 size_t count, loff_t *ppos)
+{
+	return 0;
+}
+
+static unsigned int
+pmu_fpoll(struct file *filp, poll_table *wait)
+{
+	struct pmu_private *pp = filp->private_data;
+	unsigned int mask = 0;
+	unsigned long flags;
+	
+	if (!pp)
+		return 0;
+	poll_wait(filp, &pp->wait, wait);
+	spin_lock_irqsave(&pp->lock, flags);
+	if (pp->rb_get != pp->rb_put)
+		mask |= POLLIN;
+	spin_unlock_irqrestore(&pp->lock, flags);
+	return mask;
+}
+
+static int
+pmu_release(struct inode *inode, struct file *file)
+{
+	struct pmu_private *pp = file->private_data;
+	unsigned long flags;
+
+	lock_kernel();
+	if (pp) {
+		file->private_data = NULL;
+		spin_lock_irqsave(&all_pvt_lock, flags);
+		list_del(&pp->list);
+		spin_unlock_irqrestore(&all_pvt_lock, flags);
+#if defined(CONFIG_INPUT_ADBHID) && defined(CONFIG_PMAC_BACKLIGHT)
+		if (pp->backlight_locker) {
+			spin_lock_irqsave(&pmu_lock, flags);
+			disable_kernel_backlight--;
+			spin_unlock_irqrestore(&pmu_lock, flags);
+		}
+#endif /* defined(CONFIG_INPUT_ADBHID) && defined(CONFIG_PMAC_BACKLIGHT) */
+		kfree(pp);
+	}
+	unlock_kernel();
+	return 0;
+}
+
+static int
+pmu_ioctl(struct inode * inode, struct file *filp,
+		     u_int cmd, u_long arg)
+{
+	__u32 __user *argp = (__u32 __user *)arg;
+	int error = -EINVAL;
+
+	switch (cmd) {
+#if defined(CONFIG_PM) && defined(CONFIG_PPC32)
+	case PMU_IOC_SLEEP:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+		if (sleep_in_progress)
+			return -EBUSY;
+		sleep_in_progress = 1;
+		switch (pmu_kind) {
+		case PMU_OHARE_BASED:
+			error = powerbook_sleep_3400();
+			break;
+		case PMU_HEATHROW_BASED:
+		case PMU_PADDINGTON_BASED:
+			error = powerbook_sleep_grackle();
+			break;
+		case PMU_KEYLARGO_BASED:
+			error = powerbook_sleep_Core99();
+			break;
+		default:
+			error = -ENOSYS;
+		}
+		sleep_in_progress = 0;
+		break;
+	case PMU_IOC_CAN_SLEEP:
+		if (pmac_call_feature(PMAC_FTR_SLEEP_STATE,NULL,0,-1) < 0)
+			return put_user(0, argp);
+		else
+			return put_user(1, argp);
+#endif /* CONFIG_PM && CONFIG_PPC32 */
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+	/* Backlight should have its own device or go via
+	 * the fbdev
+	 */
+	case PMU_IOC_GET_BACKLIGHT:
+		if (sleep_in_progress)
+			return -EBUSY;
+		error = get_backlight_level();
+		if (error < 0)
+			return error;
+		return put_user(error, argp);
+	case PMU_IOC_SET_BACKLIGHT:
+	{
+		__u32 value;
+		if (sleep_in_progress)
+			return -EBUSY;
+		error = get_user(value, argp);
+		if (!error)
+			error = set_backlight_level(value);
+		break;
+	}
+#ifdef CONFIG_INPUT_ADBHID
+	case PMU_IOC_GRAB_BACKLIGHT: {
+		struct pmu_private *pp = filp->private_data;
+		unsigned long flags;
+
+		if (pp->backlight_locker)
+			return 0;
+		pp->backlight_locker = 1;
+		spin_lock_irqsave(&pmu_lock, flags);
+		disable_kernel_backlight++;
+		spin_unlock_irqrestore(&pmu_lock, flags);
+		return 0;
+	}
+#endif /* CONFIG_INPUT_ADBHID */
+#endif /* CONFIG_PMAC_BACKLIGHT */
+	case PMU_IOC_GET_MODEL:
+		return put_user(pmu_kind, argp);
+	case PMU_IOC_HAS_ADB:
+		return put_user(pmu_has_adb, argp);
+	}
+	return error;
+}
+
+static struct file_operations pmu_device_fops = {
+	.read		= pmu_read,
+	.write		= pmu_write,
+	.poll		= pmu_fpoll,
+	.ioctl		= pmu_ioctl,
+	.open		= pmu_open,
+	.release	= pmu_release,
+};
+
+static struct miscdevice pmu_device = {
+	PMU_MINOR, "pmu", &pmu_device_fops
+};
+
+static int pmu_device_init(void)
+{
+	if (!via)
+		return 0;
+	if (misc_register(&pmu_device) < 0)
+		printk(KERN_ERR "via-pmu: cannot register misc device.\n");
+	return 0;
+}
+device_initcall(pmu_device_init);
+
+
+#ifdef DEBUG_SLEEP
+static inline void 
+polled_handshake(volatile unsigned char __iomem *via)
+{
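+	/* Four-phase handshake: drop TREQ and wait for the PMU to pull
+	 * TACK low, then raise TREQ and wait for TACK to go high again.
+	 */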
+	via[B] &= ~TREQ; eieio();
+	while ((via[B] & TACK) != 0)
+		;
+	via[B] |= TREQ; eieio();
+	while ((via[B] & TACK) == 0)
+		;
+}
+
+static inline void 
+polled_send_byte(volatile unsigned char __iomem *via, int x)
+{
+	via[ACR] |= SR_OUT | SR_EXT; eieio();
+	via[SR] = x; eieio();
+	polled_handshake(via);
+}
+
+static inline int
+polled_recv_byte(volatile unsigned char __iomem *via)
+{
+	int x;
+
+	via[ACR] = (via[ACR] & ~SR_OUT) | SR_EXT; eieio();
+	x = via[SR]; eieio();
+	polled_handshake(via);
+	x = via[SR]; eieio();
+	return x;
+}
+
+int
+pmu_polled_request(struct adb_request *req)
+{
+	unsigned long flags;
+	int i, l, c;
+	volatile unsigned char __iomem *v = via;
+
+	req->complete = 1;
+	c = req->data[0];
+	l = pmu_data_len[c][0];
+	if (l >= 0 && req->nbytes != l + 1)
+		return -EINVAL;
+
+	local_irq_save(flags);
+	while (pmu_state != idle)
+		pmu_poll();
+
+	while ((via[B] & TACK) == 0)
+		;
+	polled_send_byte(v, c);
+	if (l < 0) {
+		l = req->nbytes - 1;
+		polled_send_byte(v, l);
+	}
+	for (i = 1; i <= l; ++i)
+		polled_send_byte(v, req->data[i]);
+
+	l = pmu_data_len[c][1];
+	if (l < 0)
+		l = polled_recv_byte(v);
+	for (i = 0; i < l; ++i)
+		req->reply[i + req->reply_len] = polled_recv_byte(v);
+
+	if (req->done)
+		(*req->done)(req);
+
+	local_irq_restore(flags);
+	return 0;
+}
+#endif /* DEBUG_SLEEP */
+
+
+/* FIXME: This is a temporary set of callbacks to enable us
+ * to do suspend-to-disk.
+ */
+
+#if defined(CONFIG_PM) && defined(CONFIG_PPC32)
+
+static int pmu_sys_suspended;
+
+static int pmu_sys_suspend(struct sys_device *sysdev, pm_message_t state)
+{
+	if (state.event != PM_EVENT_SUSPEND || pmu_sys_suspended)
+		return 0;
+
+	/* Suspend PMU event interrupts */
+	pmu_suspend();
+
+	pmu_sys_suspended = 1;
+	return 0;
+}
+
+static int pmu_sys_resume(struct sys_device *sysdev)
+{
+	struct adb_request req;
+
+	if (!pmu_sys_suspended)
+		return 0;
+
+	/* Tell PMU we are ready */
+	pmu_request(&req, NULL, 2, PMU_SYSTEM_READY, 2);
+	pmu_wait_complete(&req);
+
+	/* Resume PMU event interrupts */
+	pmu_resume();
+
+	pmu_sys_suspended = 0;
+
+	return 0;
+}
+
+#endif /* CONFIG_PM && CONFIG_PPC32 */
+
+static struct sysdev_class pmu_sysclass = {
+	set_kset_name("pmu"),
+};
+
+static struct sys_device device_pmu = {
+	.id		= 0,
+	.cls		= &pmu_sysclass,
+};
+
+static struct sysdev_driver driver_pmu = {
+#if defined(CONFIG_PM) && defined(CONFIG_PPC32)
+	.suspend	= &pmu_sys_suspend,
+	.resume		= &pmu_sys_resume,
+#endif /* CONFIG_PM && CONFIG_PPC32 */
+};
+
+static int __init init_pmu_sysfs(void)
+{
+	int rc;
+
+	rc = sysdev_class_register(&pmu_sysclass);
+	if (rc) {
+		printk(KERN_ERR "Failed registering PMU sys class\n");
+		return -ENODEV;
+	}
+	rc = sysdev_register(&device_pmu);
+	if (rc) {
+		printk(KERN_ERR "Failed registering PMU sys device\n");
+		return -ENODEV;
+	}
+	rc = sysdev_driver_register(&pmu_sysclass, &driver_pmu);
+	if (rc) {
+		printk(KERN_ERR "Failed registering PMU sys driver\n");
+		return -ENODEV;
+	}
+	return 0;
+}
+
+subsys_initcall(init_pmu_sysfs);
+
+EXPORT_SYMBOL(pmu_request);
+EXPORT_SYMBOL(pmu_queue_request);
+EXPORT_SYMBOL(pmu_poll);
+EXPORT_SYMBOL(pmu_poll_adb);
+EXPORT_SYMBOL(pmu_wait_complete);
+EXPORT_SYMBOL(pmu_suspend);
+EXPORT_SYMBOL(pmu_resume);
+EXPORT_SYMBOL(pmu_unlock);
+#if defined(CONFIG_PM) && defined(CONFIG_PPC32)
+EXPORT_SYMBOL(pmu_enable_irled);
+EXPORT_SYMBOL(pmu_battery_count);
+EXPORT_SYMBOL(pmu_batteries);
+EXPORT_SYMBOL(pmu_power_flags);
+#endif /* CONFIG_PM && CONFIG_PPC32 */
+
diff -urN oldtree/drivers/md/dm-crypt.c newtree/drivers/md/dm-crypt.c
--- oldtree/drivers/md/dm-crypt.c	2006-03-08 18:47:59.623855500 +0000
+++ newtree/drivers/md/dm-crypt.c	2006-03-08 15:22:33.133498000 +0000
@@ -915,7 +915,7 @@
 	if (!_crypt_io_pool)
 		return -ENOMEM;
 
-	_kcryptd_workqueue = create_workqueue("kcryptd");
+	_kcryptd_workqueue = create_nofreeze_workqueue("kcryptd");
 	if (!_kcryptd_workqueue) {
 		r = -ENOMEM;
 		DMERR(PFX "couldn't create kcryptd");
diff -urN oldtree/drivers/md/dm-crypt.c.orig newtree/drivers/md/dm-crypt.c.orig
--- oldtree/drivers/md/dm-crypt.c.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/drivers/md/dm-crypt.c.orig	2006-03-08 15:21:15.144624000 +0000
@@ -0,0 +1,956 @@
+/*
+ * Copyright (C) 2003 Christophe Saout <christophe@saout.de>
+ * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#include <linux/workqueue.h>
+#include <asm/atomic.h>
+#include <linux/scatterlist.h>
+#include <asm/page.h>
+
+#include "dm.h"
+
+#define PFX	"crypt: "
+
+/*
+ * per bio private data
+ */
+struct crypt_io {
+	struct dm_target *target;
+	struct bio *bio;
+	struct bio *first_clone;
+	struct work_struct work;
+	atomic_t pending;
+	int error;
+};
+
+/*
+ * context holding the current state of a multi-part conversion
+ */
+struct convert_context {
+	struct bio *bio_in;
+	struct bio *bio_out;
+	unsigned int offset_in;
+	unsigned int offset_out;
+	unsigned int idx_in;
+	unsigned int idx_out;
+	sector_t sector;
+	int write;
+};
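+
+/* crypt_convert() walks bio_in and bio_out in lockstep, one 512-byte
+ * sector at a time, because the IV is derived from ctx->sector and
+ * therefore changes for every sector.
+ */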
+
+struct crypt_config;
+
+struct crypt_iv_operations {
+	int (*ctr)(struct crypt_config *cc, struct dm_target *ti,
+	           const char *opts);
+	void (*dtr)(struct crypt_config *cc);
+	const char *(*status)(struct crypt_config *cc);
+	int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
+};
+
+/*
+ * Crypt: maps a linear range of a block device
+ * and encrypts / decrypts at the same time.
+ */
+struct crypt_config {
+	struct dm_dev *dev;
+	sector_t start;
+
+	/*
+	 * pool for per bio private data and
+	 * for encryption buffer pages
+	 */
+	mempool_t *io_pool;
+	mempool_t *page_pool;
+
+	/*
+	 * crypto related data
+	 */
+	struct crypt_iv_operations *iv_gen_ops;
+	char *iv_mode;
+	void *iv_gen_private;
+	sector_t iv_offset;
+	unsigned int iv_size;
+
+	struct crypto_tfm *tfm;
+	unsigned int key_size;
+	u8 key[0];
+};
+
+#define MIN_IOS        256
+#define MIN_POOL_PAGES 32
+#define MIN_BIO_PAGES  8
+
+static kmem_cache_t *_crypt_io_pool;
+
+/*
+ * Different IV generation algorithms:
+ *
+ * plain: the initial vector is the 32-bit little-endian version of the sector
+ *        number, padded with zeros if necessary.
+ *
+ * essiv: "encrypted sector|salt initial vector", the sector number is
+ *        encrypted with the bulk cipher using a salt as key. The salt
+ *        should be derived from the bulk cipher's key via hashing.
+ *
+ * plumb: unimplemented, see:
+ * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
+ */
+
+static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+{
+	memset(iv, 0, cc->iv_size);
+	*(u32 *)iv = cpu_to_le32(sector & 0xffffffff);
+
+	return 0;
+}
+
+static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
+	                      const char *opts)
+{
+	struct crypto_tfm *essiv_tfm;
+	struct crypto_tfm *hash_tfm;
+	struct scatterlist sg;
+	unsigned int saltsize;
+	u8 *salt;
+
+	if (opts == NULL) {
+		ti->error = PFX "Digest algorithm missing for ESSIV mode";
+		return -EINVAL;
+	}
+
+	/* Hash the cipher key with the given hash algorithm */
+	hash_tfm = crypto_alloc_tfm(opts, CRYPTO_TFM_REQ_MAY_SLEEP);
+	if (hash_tfm == NULL) {
+		ti->error = PFX "Error initializing ESSIV hash";
+		return -EINVAL;
+	}
+
+	if (crypto_tfm_alg_type(hash_tfm) != CRYPTO_ALG_TYPE_DIGEST) {
+		ti->error = PFX "Expected digest algorithm for ESSIV hash";
+		crypto_free_tfm(hash_tfm);
+		return -EINVAL;
+	}
+
+	saltsize = crypto_tfm_alg_digestsize(hash_tfm);
+	salt = kmalloc(saltsize, GFP_KERNEL);
+	if (salt == NULL) {
+		ti->error = PFX "Error kmallocing salt storage in ESSIV";
+		crypto_free_tfm(hash_tfm);
+		return -ENOMEM;
+	}
+
+	sg_set_buf(&sg, cc->key, cc->key_size);
+	crypto_digest_digest(hash_tfm, &sg, 1, salt);
+	crypto_free_tfm(hash_tfm);
+
+	/* Setup the essiv_tfm with the given salt */
+	essiv_tfm = crypto_alloc_tfm(crypto_tfm_alg_name(cc->tfm),
+	                             CRYPTO_TFM_MODE_ECB |
+	                             CRYPTO_TFM_REQ_MAY_SLEEP);
+	if (essiv_tfm == NULL) {
+		ti->error = PFX "Error allocating crypto tfm for ESSIV";
+		kfree(salt);
+		return -EINVAL;
+	}
+	if (crypto_tfm_alg_blocksize(essiv_tfm)
+	    != crypto_tfm_alg_ivsize(cc->tfm)) {
+		ti->error = PFX "Block size of ESSIV cipher does "
+			        "not match IV size of block cipher";
+		crypto_free_tfm(essiv_tfm);
+		kfree(salt);
+		return -EINVAL;
+	}
+	if (crypto_cipher_setkey(essiv_tfm, salt, saltsize) < 0) {
+		ti->error = PFX "Failed to set key for ESSIV cipher";
+		crypto_free_tfm(essiv_tfm);
+		kfree(salt);
+		return -EINVAL;
+	}
+	kfree(salt);
+
+	cc->iv_gen_private = (void *)essiv_tfm;
+	return 0;
+}
+
+static void crypt_iv_essiv_dtr(struct crypt_config *cc)
+{
+	crypto_free_tfm((struct crypto_tfm *)cc->iv_gen_private);
+	cc->iv_gen_private = NULL;
+}
+
+static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+{
+	struct scatterlist sg;
+
+	memset(iv, 0, cc->iv_size);
+	*(u64 *)iv = cpu_to_le64(sector);
+
+	sg_set_buf(&sg, iv, cc->iv_size);
+	crypto_cipher_encrypt((struct crypto_tfm *)cc->iv_gen_private,
+	                      &sg, &sg, cc->iv_size);
+
+	return 0;
+}
+
+static struct crypt_iv_operations crypt_iv_plain_ops = {
+	.generator = crypt_iv_plain_gen
+};
+
+static struct crypt_iv_operations crypt_iv_essiv_ops = {
+	.ctr       = crypt_iv_essiv_ctr,
+	.dtr       = crypt_iv_essiv_dtr,
+	.generator = crypt_iv_essiv_gen
+};
+
+
+static int
+crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out,
+                          struct scatterlist *in, unsigned int length,
+                          int write, sector_t sector)
+{
+	u8 iv[cc->iv_size];
+	int r;
+
+	if (cc->iv_gen_ops) {
+		r = cc->iv_gen_ops->generator(cc, iv, sector);
+		if (r < 0)
+			return r;
+
+		if (write)
+			r = crypto_cipher_encrypt_iv(cc->tfm, out, in, length, iv);
+		else
+			r = crypto_cipher_decrypt_iv(cc->tfm, out, in, length, iv);
+	} else {
+		if (write)
+			r = crypto_cipher_encrypt(cc->tfm, out, in, length);
+		else
+			r = crypto_cipher_decrypt(cc->tfm, out, in, length);
+	}
+
+	return r;
+}
+
+static void
+crypt_convert_init(struct crypt_config *cc, struct convert_context *ctx,
+                   struct bio *bio_out, struct bio *bio_in,
+                   sector_t sector, int write)
+{
+	ctx->bio_in = bio_in;
+	ctx->bio_out = bio_out;
+	ctx->offset_in = 0;
+	ctx->offset_out = 0;
+	ctx->idx_in = bio_in ? bio_in->bi_idx : 0;
+	ctx->idx_out = bio_out ? bio_out->bi_idx : 0;
+	ctx->sector = sector + cc->iv_offset;
+	ctx->write = write;
+}
+
+/*
+ * Encrypt / decrypt data from one bio to another one (can be the same one)
+ */
+static int crypt_convert(struct crypt_config *cc,
+                         struct convert_context *ctx)
+{
+	int r = 0;
+
+	while(ctx->idx_in < ctx->bio_in->bi_vcnt &&
+	      ctx->idx_out < ctx->bio_out->bi_vcnt) {
+		struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in);
+		struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out);
+		struct scatterlist sg_in = {
+			.page = bv_in->bv_page,
+			.offset = bv_in->bv_offset + ctx->offset_in,
+			.length = 1 << SECTOR_SHIFT
+		};
+		struct scatterlist sg_out = {
+			.page = bv_out->bv_page,
+			.offset = bv_out->bv_offset + ctx->offset_out,
+			.length = 1 << SECTOR_SHIFT
+		};
+
+		ctx->offset_in += sg_in.length;
+		if (ctx->offset_in >= bv_in->bv_len) {
+			ctx->offset_in = 0;
+			ctx->idx_in++;
+		}
+
+		ctx->offset_out += sg_out.length;
+		if (ctx->offset_out >= bv_out->bv_len) {
+			ctx->offset_out = 0;
+			ctx->idx_out++;
+		}
+
+		r = crypt_convert_scatterlist(cc, &sg_out, &sg_in, sg_in.length,
+		                              ctx->write, ctx->sector);
+		if (r < 0)
+			break;
+
+		ctx->sector++;
+	}
+
+	return r;
+}
+
+/*
+ * Generate a new unfragmented bio with the given size.
+ * This should never violate the device limitations.
+ * It may return a smaller bio when running out of pages.
+ */
+static struct bio *
+crypt_alloc_buffer(struct crypt_config *cc, unsigned int size,
+                   struct bio *base_bio, unsigned int *bio_vec_idx)
+{
+	struct bio *bio;
+	unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
+	unsigned int i;
+
+	/*
+	 * Use __GFP_NOMEMALLOC to tell the VM to act less aggressively and
+	 * to fail earlier.  This is not necessary but increases throughput.
+	 * FIXME: Is this really intelligent?
+	 */
+	if (base_bio)
+		bio = bio_clone(base_bio, GFP_NOIO|__GFP_NOMEMALLOC);
+	else
+		bio = bio_alloc(GFP_NOIO|__GFP_NOMEMALLOC, nr_iovecs);
+	if (!bio)
+		return NULL;
+
+	/* if the last bio was not complete, continue where that one ended */
+	bio->bi_idx = *bio_vec_idx;
+	bio->bi_vcnt = *bio_vec_idx;
+	bio->bi_size = 0;
+	bio->bi_flags &= ~(1 << BIO_SEG_VALID);
+
+	/* bio->bi_idx pages have already been allocated */
+	size -= bio->bi_idx * PAGE_SIZE;
+
+	for(i = bio->bi_idx; i < nr_iovecs; i++) {
+		struct bio_vec *bv = bio_iovec_idx(bio, i);
+
+		bv->bv_page = mempool_alloc(cc->page_pool, gfp_mask);
+		if (!bv->bv_page)
+			break;
+
+		/*
+		 * if additional pages cannot be allocated without waiting,
+		 * return a partially allocated bio, the caller will then try
+		 * to allocate additional bios while submitting this partial bio
+		 */
+		if ((i - bio->bi_idx) == (MIN_BIO_PAGES - 1))
+			gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
+
+		bv->bv_offset = 0;
+		if (size > PAGE_SIZE)
+			bv->bv_len = PAGE_SIZE;
+		else
+			bv->bv_len = size;
+
+		bio->bi_size += bv->bv_len;
+		bio->bi_vcnt++;
+		size -= bv->bv_len;
+	}
+
+	if (!bio->bi_size) {
+		bio_put(bio);
+		return NULL;
+	}
+
+	/*
+	 * Remember the last bio_vec allocated to be able
+	 * to correctly continue after the splitting.
+	 */
+	*bio_vec_idx = bio->bi_vcnt;
+
+	return bio;
+}
+
+static void crypt_free_buffer_pages(struct crypt_config *cc,
+                                    struct bio *bio, unsigned int bytes)
+{
+	unsigned int i, start, end;
+	struct bio_vec *bv;
+
+	/*
+	 * This is ugly, but Jens Axboe thinks that using bi_idx in the
+	 * endio function is too dangerous at the moment, so I calculate the
+	 * correct position using bi_vcnt and bi_size.
+	 * The bv_offset and bv_len fields might already be modified but we
+	 * know that we always allocated whole pages.
+	 * A fix to the bi_idx issue in the kernel is in the works, so
+	 * we will hopefully be able to revert to the cleaner solution soon.
+	 */
+	i = bio->bi_vcnt - 1;
+	bv = bio_iovec_idx(bio, i);
+	end = (i << PAGE_SHIFT) + (bv->bv_offset + bv->bv_len) - bio->bi_size;
+	start = end - bytes;
+
+	start >>= PAGE_SHIFT;
+	if (!bio->bi_size)
+		end = bio->bi_vcnt;
+	else
+		end >>= PAGE_SHIFT;
+
+	for(i = start; i < end; i++) {
+		bv = bio_iovec_idx(bio, i);
+		BUG_ON(!bv->bv_page);
+		mempool_free(bv->bv_page, cc->page_pool);
+		bv->bv_page = NULL;
+	}
+}
+
+/*
+ * One of the bios was finished. Check for completion of
+ * the whole request and correctly clean up the buffer.
+ */
+static void dec_pending(struct crypt_io *io, int error)
+{
+	struct crypt_config *cc = (struct crypt_config *) io->target->private;
+
+	if (error < 0)
+		io->error = error;
+
+	if (!atomic_dec_and_test(&io->pending))
+		return;
+
+	if (io->first_clone)
+		bio_put(io->first_clone);
+
+	bio_endio(io->bio, io->bio->bi_size, io->error);
+
+	mempool_free(io, cc->io_pool);
+}
+
+/*
+ * kcryptd:
+ *
+ * Needed because it would be very unwise to do decryption in an
+ * interrupt context, so bios returning from read requests get
+ * queued here.
+ */
+static struct workqueue_struct *_kcryptd_workqueue;
+
+static void kcryptd_do_work(void *data)
+{
+	struct crypt_io *io = (struct crypt_io *) data;
+	struct crypt_config *cc = (struct crypt_config *) io->target->private;
+	struct convert_context ctx;
+	int r;
+
+	crypt_convert_init(cc, &ctx, io->bio, io->bio,
+	                   io->bio->bi_sector - io->target->begin, 0);
+	r = crypt_convert(cc, &ctx);
+
+	dec_pending(io, r);
+}
+
+static void kcryptd_queue_io(struct crypt_io *io)
+{
+	INIT_WORK(&io->work, kcryptd_do_work, io);
+	queue_work(_kcryptd_workqueue, &io->work);
+}
+
+/*
+ * Decode key from its hex representation
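+ * (e.g. the 8-character string "deadbeef" yields the four bytes
+ * 0xde 0xad 0xbe 0xef; input that is not exactly 2*size hex digits
+ * is rejected with -EINVAL)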
+ */
+static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
+{
+	char buffer[3];
+	char *endp;
+	unsigned int i;
+
+	buffer[2] = '\0';
+
+	for(i = 0; i < size; i++) {
+		buffer[0] = *hex++;
+		buffer[1] = *hex++;
+
+		key[i] = (u8)simple_strtoul(buffer, &endp, 16);
+
+		if (endp != &buffer[2])
+			return -EINVAL;
+	}
+
+	if (*hex != '\0')
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Encode key into its hex representation
+ */
+static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
+{
+	unsigned int i;
+
+	for(i = 0; i < size; i++) {
+		sprintf(hex, "%02x", *key);
+		hex += 2;
+		key++;
+	}
+}
+
+/*
+ * Construct an encryption mapping:
+ * <cipher> <key> <iv_offset> <dev_path> <start>
+ */
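+/* For example (a hypothetical table line, values made up), a dmsetup
+ * table entry of the form
+ *
+ *	0 409600 crypt aes-cbc-essiv:sha256 <64 hex digits> 0 /dev/sda2 0
+ *
+ * maps 409600 sectors using AES-CBC with ESSIV/SHA-256, no extra IV
+ * offset, starting at sector 0 of /dev/sda2.
+ */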
+static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct crypt_config *cc;
+	struct crypto_tfm *tfm;
+	char *tmp;
+	char *cipher;
+	char *chainmode;
+	char *ivmode;
+	char *ivopts;
+	unsigned int crypto_flags;
+	unsigned int key_size;
+	unsigned long long tmpll;
+
+	if (argc != 5) {
+		ti->error = PFX "Not enough arguments";
+		return -EINVAL;
+	}
+
+	tmp = argv[0];
+	cipher = strsep(&tmp, "-");
+	chainmode = strsep(&tmp, "-");
+	ivopts = strsep(&tmp, "-");
+	ivmode = strsep(&ivopts, ":");
+
+	if (tmp)
+		DMWARN(PFX "Unexpected additional cipher options");
+
+	key_size = strlen(argv[1]) >> 1;
+
+	cc = kmalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL);
+	if (cc == NULL) {
+		ti->error =
+			PFX "Cannot allocate transparent encryption context";
+		return -ENOMEM;
+	}
+
+	cc->key_size = key_size;
+	if ((!key_size && strcmp(argv[1], "-") != 0) ||
+	    (key_size && crypt_decode_key(cc->key, argv[1], key_size) < 0)) {
+		ti->error = PFX "Error decoding key";
+		goto bad1;
+	}
+
+	/* Compatibility mode for old dm-crypt cipher strings */
+	if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) {
+		chainmode = "cbc";
+		ivmode = "plain";
+	}
+
+	/* Choose crypto_flags according to chainmode */
+	if (strcmp(chainmode, "cbc") == 0)
+		crypto_flags = CRYPTO_TFM_MODE_CBC;
+	else if (strcmp(chainmode, "ecb") == 0)
+		crypto_flags = CRYPTO_TFM_MODE_ECB;
+	else {
+		ti->error = PFX "Unknown chaining mode";
+		goto bad1;
+	}
+
+	if (crypto_flags != CRYPTO_TFM_MODE_ECB && !ivmode) {
+		ti->error = PFX "This chaining mode requires an IV mechanism";
+		goto bad1;
+	}
+
+	tfm = crypto_alloc_tfm(cipher, crypto_flags | CRYPTO_TFM_REQ_MAY_SLEEP);
+	if (!tfm) {
+		ti->error = PFX "Error allocating crypto tfm";
+		goto bad1;
+	}
+	if (crypto_tfm_alg_type(tfm) != CRYPTO_ALG_TYPE_CIPHER) {
+		ti->error = PFX "Expected cipher algorithm";
+		goto bad2;
+	}
+
+	cc->tfm = tfm;
+
+	/*
+	 * Choose ivmode. Valid modes: "plain", "essiv:<esshash>".
+	 * See comments at iv code
+	 */
+
+	if (ivmode == NULL)
+		cc->iv_gen_ops = NULL;
+	else if (strcmp(ivmode, "plain") == 0)
+		cc->iv_gen_ops = &crypt_iv_plain_ops;
+	else if (strcmp(ivmode, "essiv") == 0)
+		cc->iv_gen_ops = &crypt_iv_essiv_ops;
+	else {
+		ti->error = PFX "Invalid IV mode";
+		goto bad2;
+	}
+
+	if (cc->iv_gen_ops && cc->iv_gen_ops->ctr &&
+	    cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0)
+		goto bad2;
+
+	if (tfm->crt_cipher.cit_decrypt_iv && tfm->crt_cipher.cit_encrypt_iv)
+		/* at least a 64 bit sector number should fit in our buffer */
+		cc->iv_size = max(crypto_tfm_alg_ivsize(tfm),
+		                  (unsigned int)(sizeof(u64) / sizeof(u8)));
+	else {
+		cc->iv_size = 0;
+		if (cc->iv_gen_ops) {
+			DMWARN(PFX "Selected cipher does not support IVs");
+			if (cc->iv_gen_ops->dtr)
+				cc->iv_gen_ops->dtr(cc);
+			cc->iv_gen_ops = NULL;
+		}
+	}
+
+	cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool);
+	if (!cc->io_pool) {
+		ti->error = PFX "Cannot allocate crypt io mempool";
+		goto bad3;
+	}
+
+	cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
+	if (!cc->page_pool) {
+		ti->error = PFX "Cannot allocate page mempool";
+		goto bad4;
+	}
+
+	if (tfm->crt_cipher.cit_setkey(tfm, cc->key, key_size) < 0) {
+		ti->error = PFX "Error setting key";
+		goto bad5;
+	}
+
+	if (sscanf(argv[2], "%llu", &tmpll) != 1) {
+		ti->error = PFX "Invalid iv_offset sector";
+		goto bad5;
+	}
+	cc->iv_offset = tmpll;
+
+	if (sscanf(argv[4], "%llu", &tmpll) != 1) {
+		ti->error = PFX "Invalid device sector";
+		goto bad5;
+	}
+	cc->start = tmpll;
+
+	if (dm_get_device(ti, argv[3], cc->start, ti->len,
+	                  dm_table_get_mode(ti->table), &cc->dev)) {
+		ti->error = PFX "Device lookup failed";
+		goto bad5;
+	}
+
+	if (ivmode && cc->iv_gen_ops) {
+		if (ivopts)
+			*(ivopts - 1) = ':';
+		cc->iv_mode = kmalloc(strlen(ivmode) + 1, GFP_KERNEL);
+		if (!cc->iv_mode) {
+			ti->error = PFX "Error kmallocing iv_mode string";
+			goto bad5;
+		}
+		strcpy(cc->iv_mode, ivmode);
+	} else
+		cc->iv_mode = NULL;
+
+	ti->private = cc;
+	return 0;
+
+bad5:
+	mempool_destroy(cc->page_pool);
+bad4:
+	mempool_destroy(cc->io_pool);
+bad3:
+	if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
+		cc->iv_gen_ops->dtr(cc);
+bad2:
+	crypto_free_tfm(tfm);
+bad1:
+	/* Must zero key material before freeing */
+	memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8));
+	kfree(cc);
+	return -EINVAL;
+}
+
+static void crypt_dtr(struct dm_target *ti)
+{
+	struct crypt_config *cc = (struct crypt_config *) ti->private;
+
+	mempool_destroy(cc->page_pool);
+	mempool_destroy(cc->io_pool);
+
+	kfree(cc->iv_mode);
+	if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
+		cc->iv_gen_ops->dtr(cc);
+	crypto_free_tfm(cc->tfm);
+	dm_put_device(ti, cc->dev);
+
+	/* Must zero key material before freeing */
+	memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8));
+	kfree(cc);
+}
+
+static int crypt_endio(struct bio *bio, unsigned int done, int error)
+{
+	struct crypt_io *io = (struct crypt_io *) bio->bi_private;
+	struct crypt_config *cc = (struct crypt_config *) io->target->private;
+
+	if (bio_data_dir(bio) == WRITE) {
+		/*
+		 * free the processed pages, even if
+		 * it's only a partially completed write
+		 */
+		crypt_free_buffer_pages(cc, bio, done);
+	}
+
+	if (bio->bi_size)
+		return 1;
+
+	bio_put(bio);
+
+	/*
+	 * successful reads are decrypted by the worker thread
+	 */
+	if ((bio_data_dir(bio) == READ)
+	    && bio_flagged(bio, BIO_UPTODATE)) {
+		kcryptd_queue_io(io);
+		return 0;
+	}
+
+	dec_pending(io, error);
+	return error;
+}
+
+static inline struct bio *
+crypt_clone(struct crypt_config *cc, struct crypt_io *io, struct bio *bio,
+            sector_t sector, unsigned int *bvec_idx,
+            struct convert_context *ctx)
+{
+	struct bio *clone;
+
+	if (bio_data_dir(bio) == WRITE) {
+		clone = crypt_alloc_buffer(cc, bio->bi_size,
+                                 io->first_clone, bvec_idx);
+		if (clone) {
+			ctx->bio_out = clone;
+			if (crypt_convert(cc, ctx) < 0) {
+				crypt_free_buffer_pages(cc, clone,
+				                        clone->bi_size);
+				bio_put(clone);
+				return NULL;
+			}
+		}
+	} else {
+		/*
+		 * The block layer might modify the bvec array, so always
+		 * copy the required bvecs because we need the original
+		 * one in order to decrypt the whole bio data *afterwards*.
+		 */
+		clone = bio_alloc(GFP_NOIO, bio_segments(bio));
+		if (clone) {
+			clone->bi_idx = 0;
+			clone->bi_vcnt = bio_segments(bio);
+			clone->bi_size = bio->bi_size;
+			memcpy(clone->bi_io_vec, bio_iovec(bio),
+			       sizeof(struct bio_vec) * clone->bi_vcnt);
+		}
+	}
+
+	if (!clone)
+		return NULL;
+
+	clone->bi_private = io;
+	clone->bi_end_io = crypt_endio;
+	clone->bi_bdev = cc->dev->bdev;
+	clone->bi_sector = cc->start + sector;
+	clone->bi_rw = bio->bi_rw;
+
+	return clone;
+}
+
+static int crypt_map(struct dm_target *ti, struct bio *bio,
+		     union map_info *map_context)
+{
+	struct crypt_config *cc = (struct crypt_config *) ti->private;
+	struct crypt_io *io = mempool_alloc(cc->io_pool, GFP_NOIO);
+	struct convert_context ctx;
+	struct bio *clone;
+	unsigned int remaining = bio->bi_size;
+	sector_t sector = bio->bi_sector - ti->begin;
+	unsigned int bvec_idx = 0;
+
+	io->target = ti;
+	io->bio = bio;
+	io->first_clone = NULL;
+	io->error = 0;
+	atomic_set(&io->pending, 1); /* hold a reference */
+
+	if (bio_data_dir(bio) == WRITE)
+		crypt_convert_init(cc, &ctx, NULL, bio, sector, 1);
+
+	/*
+	 * The allocated buffers can be smaller than the whole bio,
+	 * so repeat the whole process until all the data can be handled.
+	 */
+	while (remaining) {
+		clone = crypt_clone(cc, io, bio, sector, &bvec_idx, &ctx);
+		if (!clone)
+			goto cleanup;
+
+		if (!io->first_clone) {
+			/*
+			 * hold a reference to the first clone, because it
+			 * holds the bio_vec array and that can't be freed
+			 * before all other clones are released
+			 */
+			bio_get(clone);
+			io->first_clone = clone;
+		}
+		atomic_inc(&io->pending);
+
+		remaining -= clone->bi_size;
+		sector += bio_sectors(clone);
+
+		generic_make_request(clone);
+
+		/* out of memory -> run queues */
+		if (remaining)
+			blk_congestion_wait(bio_data_dir(clone), HZ/100);
+	}
+
+	/* drop reference, clones could have returned before we reach this */
+	dec_pending(io, 0);
+	return 0;
+
+cleanup:
+	if (io->first_clone) {
+		dec_pending(io, -ENOMEM);
+		return 0;
+	}
+
+	/* if no bio has been dispatched yet, we can directly return the error */
+	mempool_free(io, cc->io_pool);
+	return -ENOMEM;
+}
+
+static int crypt_status(struct dm_target *ti, status_type_t type,
+			char *result, unsigned int maxlen)
+{
+	struct crypt_config *cc = (struct crypt_config *) ti->private;
+	const char *cipher;
+	const char *chainmode = NULL;
+	unsigned int sz = 0;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		result[0] = '\0';
+		break;
+
+	case STATUSTYPE_TABLE:
+		cipher = crypto_tfm_alg_name(cc->tfm);
+
+		switch(cc->tfm->crt_cipher.cit_mode) {
+		case CRYPTO_TFM_MODE_CBC:
+			chainmode = "cbc";
+			break;
+		case CRYPTO_TFM_MODE_ECB:
+			chainmode = "ecb";
+			break;
+		default:
+			BUG();
+		}
+
+		if (cc->iv_mode)
+			DMEMIT("%s-%s-%s ", cipher, chainmode, cc->iv_mode);
+		else
+			DMEMIT("%s-%s ", cipher, chainmode);
+
+		if (cc->key_size > 0) {
+			if ((maxlen - sz) < ((cc->key_size << 1) + 1))
+				return -ENOMEM;
+
+			crypt_encode_key(result + sz, cc->key, cc->key_size);
+			sz += cc->key_size << 1;
+		} else {
+			if (sz >= maxlen)
+				return -ENOMEM;
+			result[sz++] = '-';
+		}
+
+		DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset,
+				cc->dev->name, (unsigned long long)cc->start);
+		break;
+	}
+	return 0;
+}
+
+static struct target_type crypt_target = {
+	.name   = "crypt",
+	.version= {1, 1, 0},
+	.module = THIS_MODULE,
+	.ctr    = crypt_ctr,
+	.dtr    = crypt_dtr,
+	.map    = crypt_map,
+	.status = crypt_status,
+};
+
+static int __init dm_crypt_init(void)
+{
+	int r;
+
+	_crypt_io_pool = kmem_cache_create("dm-crypt_io",
+	                                   sizeof(struct crypt_io),
+	                                   0, 0, NULL, NULL);
+	if (!_crypt_io_pool)
+		return -ENOMEM;
+
+	_kcryptd_workqueue = create_workqueue("kcryptd");
+	if (!_kcryptd_workqueue) {
+		r = -ENOMEM;
+		DMERR(PFX "couldn't create kcryptd");
+		goto bad1;
+	}
+
+	r = dm_register_target(&crypt_target);
+	if (r < 0) {
+		DMERR(PFX "register failed %d", r);
+		goto bad2;
+	}
+
+	return 0;
+
+bad2:
+	destroy_workqueue(_kcryptd_workqueue);
+bad1:
+	kmem_cache_destroy(_crypt_io_pool);
+	return r;
+}
+
+static void __exit dm_crypt_exit(void)
+{
+	int r = dm_unregister_target(&crypt_target);
+
+	if (r < 0)
+		DMERR(PFX "unregister failed %d", r);
+
+	destroy_workqueue(_kcryptd_workqueue);
+	kmem_cache_destroy(_crypt_io_pool);
+}
+
+module_init(dm_crypt_init);
+module_exit(dm_crypt_exit);
+
+MODULE_AUTHOR("Christophe Saout <christophe@saout.de>");
+MODULE_DESCRIPTION(DM_NAME " target for transparent encryption / decryption");
+MODULE_LICENSE("GPL");
diff -urN oldtree/drivers/md/md.c newtree/drivers/md/md.c
--- oldtree/drivers/md/md.c	2006-03-08 18:47:59.631856000 +0000
+++ newtree/drivers/md/md.c	2006-03-08 15:22:33.141498500 +0000
@@ -41,7 +41,6 @@
 #include <linux/sysctl.h>
 #include <linux/devfs_fs_kernel.h>
 #include <linux/buffer_head.h> /* for invalidate_bdev */
-#include <linux/suspend.h>
 #include <linux/poll.h>
 #include <linux/mutex.h>
 
@@ -3976,7 +3975,8 @@
 	thread->run = run;
 	thread->mddev = mddev;
 	thread->timeout = MAX_SCHEDULE_TIMEOUT;
-	thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
+	thread->tsk = kthread_nofreeze_run(md_thread, thread,
+			name, mdname(thread->mddev));
 	if (IS_ERR(thread->tsk)) {
 		kfree(thread);
 		return NULL;
diff -urN oldtree/drivers/md/md.c.orig newtree/drivers/md/md.c.orig
--- oldtree/drivers/md/md.c.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/drivers/md/md.c.orig	2006-03-08 15:21:15.152624500 +0000
@@ -0,0 +1,5051 @@
+/*
+   md.c : Multiple Devices driver for Linux
+	  Copyright (C) 1998, 1999, 2000 Ingo Molnar
+
+     completely rewritten, based on the MD driver code from Marc Zyngier
+
+   Changes:
+
+   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
+   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
+   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
+   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
+   - kmod support by: Cyrus Durgin
+   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
+   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
+
+   - lots of fixes and improvements to the RAID1/RAID5 and generic
+     RAID code (such as request based resynchronization):
+
+     Neil Brown <neilb@cse.unsw.edu.au>.
+
+   - persistent bitmap code
+     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   You should have received a copy of the GNU General Public License
+   (for example /usr/src/linux/COPYING); if not, write to the Free
+   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/kthread.h>
+#include <linux/linkage.h>
+#include <linux/raid/md.h>
+#include <linux/raid/bitmap.h>
+#include <linux/sysctl.h>
+#include <linux/devfs_fs_kernel.h>
+#include <linux/buffer_head.h> /* for invalidate_bdev */
+#include <linux/suspend.h>
+#include <linux/poll.h>
+#include <linux/mutex.h>
+
+#include <linux/init.h>
+
+#include <linux/file.h>
+
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
+#endif
+
+#include <asm/unaligned.h>
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+
+/* 63 partitions with the alternate major number (mdp) */
+#define MdpMinorShift 6
+
+#define DEBUG 0
+#define dprintk(x...) ((void)(DEBUG && printk(x)))
+
+
+#ifndef MODULE
+static void autostart_arrays (int part);
+#endif
+
+static LIST_HEAD(pers_list);
+static DEFINE_SPINLOCK(pers_lock);
+
+/*
+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
+ * is 1000 KB/sec, so the extra system load does not show up that much.
+ * Increase it if you want to have more _guaranteed_ speed. Note that
+ * the RAID driver will use the maximum available bandwidth if the IO
+ * subsystem is idle. There is also an 'absolute maximum' reconstruction
+ * speed limit - in case reconstruction slows down your system despite
+ * idle IO detection.
+ *
+ * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
+ * or /sys/block/mdX/md/sync_speed_{min,max}
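+ * (e.g. "echo 100000 > /proc/sys/dev/raid/speed_limit_max")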
+ */
+
+static int sysctl_speed_limit_min = 1000;
+static int sysctl_speed_limit_max = 200000;
+static inline int speed_min(mddev_t *mddev)
+{
+	return mddev->sync_speed_min ?
+		mddev->sync_speed_min : sysctl_speed_limit_min;
+}
+
+static inline int speed_max(mddev_t *mddev)
+{
+	return mddev->sync_speed_max ?
+		mddev->sync_speed_max : sysctl_speed_limit_max;
+}
+
+static struct ctl_table_header *raid_table_header;
+
+static ctl_table raid_table[] = {
+	{
+		.ctl_name	= DEV_RAID_SPEED_LIMIT_MIN,
+		.procname	= "speed_limit_min",
+		.data		= &sysctl_speed_limit_min,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= DEV_RAID_SPEED_LIMIT_MAX,
+		.procname	= "speed_limit_max",
+		.data		= &sysctl_speed_limit_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table raid_dir_table[] = {
+	{
+		.ctl_name	= DEV_RAID,
+		.procname	= "raid",
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= raid_table,
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table raid_root_table[] = {
+	{
+		.ctl_name	= CTL_DEV,
+		.procname	= "dev",
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= raid_dir_table,
+	},
+	{ .ctl_name = 0 }
+};
+
+static struct block_device_operations md_fops;
+
+static int start_readonly;
+
+/*
+ * We have a system wide 'event count' that is incremented
+ * on any 'interesting' event, and readers of /proc/mdstat
+ * can use 'poll' or 'select' to find out when the event
+ * count increases.
+ *
+ * Events are:
+ *  start array, stop array, error, add device, remove device,
+ *  start build, activate spare
+ */
+static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
+static atomic_t md_event_count;
+static void md_new_event(mddev_t *mddev)
+{
+	atomic_inc(&md_event_count);
+	wake_up(&md_event_waiters);
+}
+
+/*
+ * Enables iteration over all existing md arrays;
+ * all_mddevs_lock protects this list.
+ */
+static LIST_HEAD(all_mddevs);
+static DEFINE_SPINLOCK(all_mddevs_lock);
+
+
+/*
+ * iterates through all used mddevs in the system.
+ * We take care to grab the all_mddevs_lock whenever navigating
+ * the list, and to always hold a refcount when unlocked.
+ * Any code which breaks out of this loop while owning
+ * a reference to the current mddev must mddev_put it.
+ */
+#define ITERATE_MDDEV(mddev,tmp)					\
+									\
+	for (({ spin_lock(&all_mddevs_lock); 				\
+		tmp = all_mddevs.next;					\
+		mddev = NULL;});					\
+	     ({ if (tmp != &all_mddevs)					\
+			mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
+		spin_unlock(&all_mddevs_lock);				\
+		if (mddev) mddev_put(mddev);				\
+		mddev = list_entry(tmp, mddev_t, all_mddevs);		\
+		tmp != &all_mddevs;});					\
+	     ({ spin_lock(&all_mddevs_lock);				\
+		tmp = tmp->next;})					\
+		)
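+
+/* Typical use (a sketch; do_something() is a placeholder, and a
+ * reference to each mddev is held while the body runs):
+ *
+ *	mddev_t *mddev;
+ *	struct list_head *tmp;
+ *
+ *	ITERATE_MDDEV(mddev, tmp)
+ *		do_something(mddev);
+ */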
+
+
+static int md_fail_request (request_queue_t *q, struct bio *bio)
+{
+	bio_io_error(bio, bio->bi_size);
+	return 0;
+}
+
+static inline mddev_t *mddev_get(mddev_t *mddev)
+{
+	atomic_inc(&mddev->active);
+	return mddev;
+}
+
+static void mddev_put(mddev_t *mddev)
+{
+	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
+		return;
+	if (!mddev->raid_disks && list_empty(&mddev->disks)) {
+		list_del(&mddev->all_mddevs);
+		blk_put_queue(mddev->queue);
+		kobject_unregister(&mddev->kobj);
+	}
+	spin_unlock(&all_mddevs_lock);
+}
+
+static mddev_t * mddev_find(dev_t unit)
+{
+	mddev_t *mddev, *new = NULL;
+
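+	/* Optimistic find-or-create: search under the lock first; if the
+	 * unit is missing, allocate outside the lock and retry, freeing
+	 * the new mddev if a racing caller inserted one meanwhile.
+	 */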
+ retry:
+	spin_lock(&all_mddevs_lock);
+	list_for_each_entry(mddev, &all_mddevs, all_mddevs)
+		if (mddev->unit == unit) {
+			mddev_get(mddev);
+			spin_unlock(&all_mddevs_lock);
+			kfree(new);
+			return mddev;
+		}
+
+	if (new) {
+		list_add(&new->all_mddevs, &all_mddevs);
+		spin_unlock(&all_mddevs_lock);
+		return new;
+	}
+	spin_unlock(&all_mddevs_lock);
+
+	new = kzalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+	new->unit = unit;
+	if (MAJOR(unit) == MD_MAJOR)
+		new->md_minor = MINOR(unit);
+	else
+		new->md_minor = MINOR(unit) >> MdpMinorShift;
+
+	init_MUTEX(&new->reconfig_sem);
+	INIT_LIST_HEAD(&new->disks);
+	INIT_LIST_HEAD(&new->all_mddevs);
+	init_timer(&new->safemode_timer);
+	atomic_set(&new->active, 1);
+	spin_lock_init(&new->write_lock);
+	init_waitqueue_head(&new->sb_wait);
+
+	new->queue = blk_alloc_queue(GFP_KERNEL);
+	if (!new->queue) {
+		kfree(new);
+		return NULL;
+	}
+	set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags);
+
+	blk_queue_make_request(new->queue, md_fail_request);
+
+	goto retry;
+}
+
+static inline int mddev_lock(mddev_t * mddev)
+{
+	return down_interruptible(&mddev->reconfig_sem);
+}
+
+static inline void mddev_lock_uninterruptible(mddev_t * mddev)
+{
+	down(&mddev->reconfig_sem);
+}
+
+static inline int mddev_trylock(mddev_t * mddev)
+{
+	return down_trylock(&mddev->reconfig_sem);
+}
+
+static inline void mddev_unlock(mddev_t * mddev)
+{
+	up(&mddev->reconfig_sem);
+
+	md_wakeup_thread(mddev->thread);
+}
+
+static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
+{
+	mdk_rdev_t * rdev;
+	struct list_head *tmp;
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->desc_nr == nr)
+			return rdev;
+	}
+	return NULL;
+}
+
+static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
+{
+	struct list_head *tmp;
+	mdk_rdev_t *rdev;
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->bdev->bd_dev == dev)
+			return rdev;
+	}
+	return NULL;
+}
+
+static struct mdk_personality *find_pers(int level, char *clevel)
+{
+	struct mdk_personality *pers;
+	list_for_each_entry(pers, &pers_list, list) {
+		if (level != LEVEL_NONE && pers->level == level)
+			return pers;
+		if (strcmp(pers->name, clevel)==0)
+			return pers;
+	}
+	return NULL;
+}
+
+static inline sector_t calc_dev_sboffset(struct block_device *bdev)
+{
+	sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+	return MD_NEW_SIZE_BLOCKS(size);
+}
+
+static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
+{
+	sector_t size;
+
+	size = rdev->sb_offset;
+
+	if (chunk_size)
+		size &= ~((sector_t)chunk_size/1024 - 1);
+	return size;
+}
+
+static int alloc_disk_sb(mdk_rdev_t * rdev)
+{
+	if (rdev->sb_page)
+		MD_BUG();
+
+	rdev->sb_page = alloc_page(GFP_KERNEL);
+	if (!rdev->sb_page) {
+		printk(KERN_ALERT "md: out of memory.\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void free_disk_sb(mdk_rdev_t * rdev)
+{
+	if (rdev->sb_page) {
+		put_page(rdev->sb_page);
+		rdev->sb_loaded = 0;
+		rdev->sb_page = NULL;
+		rdev->sb_offset = 0;
+		rdev->size = 0;
+	}
+}
+
+
+static int super_written(struct bio *bio, unsigned int bytes_done, int error)
+{
+	mdk_rdev_t *rdev = bio->bi_private;
+	mddev_t *mddev = rdev->mddev;
+	if (bio->bi_size)
+		return 1;
+
+	if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
+		md_error(mddev, rdev);
+
+	if (atomic_dec_and_test(&mddev->pending_writes))
+		wake_up(&mddev->sb_wait);
+	bio_put(bio);
+	return 0;
+}
+
+static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error)
+{
+	struct bio *bio2 = bio->bi_private;
+	mdk_rdev_t *rdev = bio2->bi_private;
+	mddev_t *mddev = rdev->mddev;
+	if (bio->bi_size)
+		return 1;
+
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
+	    error == -EOPNOTSUPP) {
+		unsigned long flags;
+		/* barriers don't appear to be supported :-( */
+		set_bit(BarriersNotsupp, &rdev->flags);
+		mddev->barriers_work = 0;
+		spin_lock_irqsave(&mddev->write_lock, flags);
+		bio2->bi_next = mddev->biolist;
+		mddev->biolist = bio2;
+		spin_unlock_irqrestore(&mddev->write_lock, flags);
+		wake_up(&mddev->sb_wait);
+		bio_put(bio);
+		return 0;
+	}
+	bio_put(bio2);
+	bio->bi_private = rdev;
+	return super_written(bio, bytes_done, error);
+}
+
+void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
+		   sector_t sector, int size, struct page *page)
+{
+	/* write first size bytes of page to sector of rdev
+	 * Increment mddev->pending_writes before returning
+	 * and decrement it on completion, waking up sb_wait
+	 * if zero is reached.
+	 * If an error occurred, call md_error
+	 *
+	 * As we might need to resubmit the request if BIO_RW_BARRIER
+	 * causes ENOTSUPP, we allocate a spare bio...
+	 */
+	struct bio *bio = bio_alloc(GFP_NOIO, 1);
+	int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);
+
+	bio->bi_bdev = rdev->bdev;
+	bio->bi_sector = sector;
+	bio_add_page(bio, page, size, 0);
+	bio->bi_private = rdev;
+	bio->bi_end_io = super_written;
+	bio->bi_rw = rw;
+
+	atomic_inc(&mddev->pending_writes);
+	if (!test_bit(BarriersNotsupp, &rdev->flags)) {
+		struct bio *rbio;
+		rw |= (1<<BIO_RW_BARRIER);
+		rbio = bio_clone(bio, GFP_NOIO);
+		rbio->bi_private = bio;
+		rbio->bi_end_io = super_written_barrier;
+		submit_bio(rw, rbio);
+	} else
+		submit_bio(rw, bio);
+}
+
+void md_super_wait(mddev_t *mddev)
+{
+	/* wait for all superblock writes that were scheduled to complete.
+	 * if any had to be retried (due to BARRIER problems), retry them
+	 */
+	DEFINE_WAIT(wq);
+	for(;;) {
+		prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
+		if (atomic_read(&mddev->pending_writes)==0)
+			break;
+		while (mddev->biolist) {
+			struct bio *bio;
+			spin_lock_irq(&mddev->write_lock);
+			bio = mddev->biolist;
+			mddev->biolist = bio->bi_next ;
+			bio->bi_next = NULL;
+			spin_unlock_irq(&mddev->write_lock);
+			submit_bio(bio->bi_rw, bio);
+		}
+		schedule();
+	}
+	finish_wait(&mddev->sb_wait, &wq);
+}
+
+static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
+{
+	if (bio->bi_size)
+		return 1;
+
+	complete((struct completion*)bio->bi_private);
+	return 0;
+}
+
+int sync_page_io(struct block_device *bdev, sector_t sector, int size,
+		   struct page *page, int rw)
+{
+	struct bio *bio = bio_alloc(GFP_NOIO, 1);
+	struct completion event;
+	int ret;
+
+	rw |= (1 << BIO_RW_SYNC);
+
+	bio->bi_bdev = bdev;
+	bio->bi_sector = sector;
+	bio_add_page(bio, page, size, 0);
+	init_completion(&event);
+	bio->bi_private = &event;
+	bio->bi_end_io = bi_complete;
+	submit_bio(rw, bio);
+	wait_for_completion(&event);
+
+	ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio_put(bio);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(sync_page_io);
+
+static int read_disk_sb(mdk_rdev_t * rdev, int size)
+{
+	char b[BDEVNAME_SIZE];
+	if (!rdev->sb_page) {
+		MD_BUG();
+		return -EINVAL;
+	}
+	if (rdev->sb_loaded)
+		return 0;
+
+
+	if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
+		goto fail;
+	rdev->sb_loaded = 1;
+	return 0;
+
+fail:
+	printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
+		bdevname(rdev->bdev,b));
+	return -EINVAL;
+}
+
+static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+{
+	if (	(sb1->set_uuid0 == sb2->set_uuid0) &&
+		(sb1->set_uuid1 == sb2->set_uuid1) &&
+		(sb1->set_uuid2 == sb2->set_uuid2) &&
+		(sb1->set_uuid3 == sb2->set_uuid3))
+
+		return 1;
+
+	return 0;
+}
+
+
+static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+{
+	int ret;
+	mdp_super_t *tmp1, *tmp2;
+
+	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
+	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
+
+	if (!tmp1 || !tmp2) {
+		ret = 0;
+		printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
+		goto abort;
+	}
+
+	*tmp1 = *sb1;
+	*tmp2 = *sb2;
+
+	/*
+	 * nr_disks is not constant
+	 */
+	tmp1->nr_disks = 0;
+	tmp2->nr_disks = 0;
+
+	if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
+		ret = 0;
+	else
+		ret = 1;
+
+abort:
+	kfree(tmp1);
+	kfree(tmp2);
+	return ret;
+}
+
+static unsigned int calc_sb_csum(mdp_super_t * sb)
+{
+	unsigned int disk_csum, csum;
+
+	disk_csum = sb->sb_csum;
+	sb->sb_csum = 0;
+	csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
+	sb->sb_csum = disk_csum;
+	return csum;
+}
+
+
+/*
+ * Handle superblock details.
+ * We want to be able to handle multiple superblock formats
+ * so we have a common interface to them all, and an array of
+ * different handlers.
+ * We rely on user-space to write the initial superblock, and support
+ * reading and updating of superblocks.
+ * Interface methods are:
+ *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
+ *      loads and validates a superblock on dev.
+ *      if refdev != NULL, compare superblocks on both devices
+ *    Return:
+ *      0 - dev has a superblock that is compatible with refdev
+ *      1 - dev has a superblock that is compatible and newer than refdev
+ *          so dev should be used as the refdev in future
+ *     -EINVAL superblock incompatible or invalid
+ *     -othererror e.g. -EIO
+ *
+ *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
+ *      Verify that dev is acceptable into mddev.
+ *       The first time, mddev->raid_disks will be 0, and data from
+ *       dev should be merged in.  Subsequent calls check that dev
+ *       is new enough.  Return 0 or -EINVAL
+ *
+ *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
+ *     Update the superblock for rdev with data in mddev
+ *     This does not write to disc.
+ *
+ */
+
+struct super_type  {
+	char 		*name;
+	struct module	*owner;
+	int		(*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
+	int		(*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+	void		(*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+};
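+
+/*
+ * Dispatch sketch: callers never invoke a format handler directly but
+ * index super_types[] by the array's major version, e.g.
+ *
+ *	err = super_types[mddev->major_version].
+ *		load_super(rdev, refdev, mddev->minor_version);
+ *
+ * analyze_sbs() and sync_sbs() below use this idiom.
+ */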
+
+/*
+ * load_super for 0.90.0 
+ */
+static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
+{
+	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
+	mdp_super_t *sb;
+	int ret;
+	sector_t sb_offset;
+
+	/*
+	 * Calculate the position of the superblock,
+	 * it's at the end of the disk.
+	 *
+	 * It also happens to be a multiple of 4Kb.
+	 */
+	sb_offset = calc_dev_sboffset(rdev->bdev);
+	rdev->sb_offset = sb_offset;
+
+	ret = read_disk_sb(rdev, MD_SB_BYTES);
+	if (ret) return ret;
+
+	ret = -EINVAL;
+
+	bdevname(rdev->bdev, b);
+	sb = (mdp_super_t*)page_address(rdev->sb_page);
+
+	if (sb->md_magic != MD_SB_MAGIC) {
+		printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
+		       b);
+		goto abort;
+	}
+
+	if (sb->major_version != 0 ||
+	    sb->minor_version != 90) {
+		printk(KERN_WARNING "Bad version number %d.%d on %s\n",
+			sb->major_version, sb->minor_version,
+			b);
+		goto abort;
+	}
+
+	if (sb->raid_disks <= 0)
+		goto abort;
+
+	if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) {
+		printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
+			b);
+		goto abort;
+	}
+
+	rdev->preferred_minor = sb->md_minor;
+	rdev->data_offset = 0;
+	rdev->sb_size = MD_SB_BYTES;
+
+	if (sb->level == LEVEL_MULTIPATH)
+		rdev->desc_nr = -1;
+	else
+		rdev->desc_nr = sb->this_disk.number;
+
+	if (refdev == 0)
+		ret = 1;
+	else {
+		__u64 ev1, ev2;
+		mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
+		if (!uuid_equal(refsb, sb)) {
+			printk(KERN_WARNING "md: %s has different UUID to %s\n",
+				b, bdevname(refdev->bdev,b2));
+			goto abort;
+		}
+		if (!sb_equal(refsb, sb)) {
+			printk(KERN_WARNING "md: %s has same UUID"
+			       " but different superblock to %s\n",
+			       b, bdevname(refdev->bdev, b2));
+			goto abort;
+		}
+		ev1 = md_event(sb);
+		ev2 = md_event(refsb);
+		if (ev1 > ev2)
+			ret = 1;
+		else 
+			ret = 0;
+	}
+	rdev->size = calc_dev_size(rdev, sb->chunk_size);
+
+	if (rdev->size < sb->size && sb->level > 1)
+		/* "this cannot possibly happen" ... */
+		ret = -EINVAL;
+
+ abort:
+	return ret;
+}
+
+/*
+ * validate_super for 0.90.0
+ */
+static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	mdp_disk_t *desc;
+	mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
+
+	rdev->raid_disk = -1;
+	rdev->flags = 0;
+	if (mddev->raid_disks == 0) {
+		mddev->major_version = 0;
+		mddev->minor_version = sb->minor_version;
+		mddev->patch_version = sb->patch_version;
+		mddev->persistent = ! sb->not_persistent;
+		mddev->chunk_size = sb->chunk_size;
+		mddev->ctime = sb->ctime;
+		mddev->utime = sb->utime;
+		mddev->level = sb->level;
+		mddev->clevel[0] = 0;
+		mddev->layout = sb->layout;
+		mddev->raid_disks = sb->raid_disks;
+		mddev->size = sb->size;
+		mddev->events = md_event(sb);
+		mddev->bitmap_offset = 0;
+		mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
+
+		if (sb->state & (1<<MD_SB_CLEAN))
+			mddev->recovery_cp = MaxSector;
+		else {
+			if (sb->events_hi == sb->cp_events_hi && 
+				sb->events_lo == sb->cp_events_lo) {
+				mddev->recovery_cp = sb->recovery_cp;
+			} else
+				mddev->recovery_cp = 0;
+		}
+
+		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
+		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
+		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
+		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
+
+		mddev->max_disks = MD_SB_DISKS;
+
+		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
+		    mddev->bitmap_file == NULL) {
+			if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
+			    && mddev->level != 10) {
+				/* FIXME use a better test */
+				printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
+				return -EINVAL;
+			}
+			mddev->bitmap_offset = mddev->default_bitmap_offset;
+		}
+
+	} else if (mddev->pers == NULL) {
+		/* Insist on good event counter while assembling */
+		__u64 ev1 = md_event(sb);
+		++ev1;
+		if (ev1 < mddev->events) 
+			return -EINVAL;
+	} else if (mddev->bitmap) {
+		/* if adding to array with a bitmap, then we can accept an
+		 * older device ... but not too old.
+		 */
+		__u64 ev1 = md_event(sb);
+		if (ev1 < mddev->bitmap->events_cleared)
+			return 0;
+	} else /* just a hot-add of a new device, leave raid_disk at -1 */
+		return 0;
+
+	if (mddev->level != LEVEL_MULTIPATH) {
+		desc = sb->disks + rdev->desc_nr;
+
+		if (desc->state & (1<<MD_DISK_FAULTY))
+			set_bit(Faulty, &rdev->flags);
+		else if (desc->state & (1<<MD_DISK_SYNC) &&
+			 desc->raid_disk < mddev->raid_disks) {
+			set_bit(In_sync, &rdev->flags);
+			rdev->raid_disk = desc->raid_disk;
+		}
+		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
+			set_bit(WriteMostly, &rdev->flags);
+	} else /* MULTIPATH are always insync */
+		set_bit(In_sync, &rdev->flags);
+	return 0;
+}
+
+/*
+ * sync_super for 0.90.0
+ */
+static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	mdp_super_t *sb;
+	struct list_head *tmp;
+	mdk_rdev_t *rdev2;
+	int next_spare = mddev->raid_disks;
+
+
+	/* make rdev->sb match mddev data..
+	 *
+	 * 1/ zero out disks
+	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
+	 * 3/ any empty disks < next_spare become removed
+	 *
+	 * disks[0] gets initialised to REMOVED because
+	 * we cannot be sure from other fields if it has
+	 * been initialised or not.
+	 */
+	int i;
+	int active=0, working=0,failed=0,spare=0,nr_disks=0;
+
+	rdev->sb_size = MD_SB_BYTES;
+
+	sb = (mdp_super_t*)page_address(rdev->sb_page);
+
+	memset(sb, 0, sizeof(*sb));
+
+	sb->md_magic = MD_SB_MAGIC;
+	sb->major_version = mddev->major_version;
+	sb->minor_version = mddev->minor_version;
+	sb->patch_version = mddev->patch_version;
+	sb->gvalid_words  = 0; /* ignored */
+	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
+	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
+	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
+	memcpy(&sb->set_uuid3, mddev->uuid+12,4);
+
+	sb->ctime = mddev->ctime;
+	sb->level = mddev->level;
+	sb->size  = mddev->size;
+	sb->raid_disks = mddev->raid_disks;
+	sb->md_minor = mddev->md_minor;
+	sb->not_persistent = !mddev->persistent;
+	sb->utime = mddev->utime;
+	sb->state = 0;
+	sb->events_hi = (mddev->events>>32);
+	sb->events_lo = (u32)mddev->events;
+
+	if (mddev->in_sync)
+	{
+		sb->recovery_cp = mddev->recovery_cp;
+		sb->cp_events_hi = (mddev->events>>32);
+		sb->cp_events_lo = (u32)mddev->events;
+		if (mddev->recovery_cp == MaxSector)
+			sb->state = (1<< MD_SB_CLEAN);
+	} else
+		sb->recovery_cp = 0;
+
+	sb->layout = mddev->layout;
+	sb->chunk_size = mddev->chunk_size;
+
+	if (mddev->bitmap && mddev->bitmap_file == NULL)
+		sb->state |= (1<<MD_SB_BITMAP_PRESENT);
+
+	sb->disks[0].state = (1<<MD_DISK_REMOVED);
+	ITERATE_RDEV(mddev,rdev2,tmp) {
+		mdp_disk_t *d;
+		int desc_nr;
+		if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
+		    && !test_bit(Faulty, &rdev2->flags))
+			desc_nr = rdev2->raid_disk;
+		else
+			desc_nr = next_spare++;
+		rdev2->desc_nr = desc_nr;
+		d = &sb->disks[rdev2->desc_nr];
+		nr_disks++;
+		d->number = rdev2->desc_nr;
+		d->major = MAJOR(rdev2->bdev->bd_dev);
+		d->minor = MINOR(rdev2->bdev->bd_dev);
+		if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
+		    && !test_bit(Faulty, &rdev2->flags))
+			d->raid_disk = rdev2->raid_disk;
+		else
+			d->raid_disk = rdev2->desc_nr; /* compatibility */
+		if (test_bit(Faulty, &rdev2->flags)) {
+			d->state = (1<<MD_DISK_FAULTY);
+			failed++;
+		} else if (test_bit(In_sync, &rdev2->flags)) {
+			d->state = (1<<MD_DISK_ACTIVE);
+			d->state |= (1<<MD_DISK_SYNC);
+			active++;
+			working++;
+		} else {
+			d->state = 0;
+			spare++;
+			working++;
+		}
+		if (test_bit(WriteMostly, &rdev2->flags))
+			d->state |= (1<<MD_DISK_WRITEMOSTLY);
+	}
+	/* now set the "removed" and "faulty" bits on any missing devices */
+	for (i=0 ; i < mddev->raid_disks ; i++) {
+		mdp_disk_t *d = &sb->disks[i];
+		if (d->state == 0 && d->number == 0) {
+			d->number = i;
+			d->raid_disk = i;
+			d->state = (1<<MD_DISK_REMOVED);
+			d->state |= (1<<MD_DISK_FAULTY);
+			failed++;
+		}
+	}
+	sb->nr_disks = nr_disks;
+	sb->active_disks = active;
+	sb->working_disks = working;
+	sb->failed_disks = failed;
+	sb->spare_disks = spare;
+
+	sb->this_disk = sb->disks[rdev->desc_nr];
+	sb->sb_csum = calc_sb_csum(sb);
+}
+
+/*
+ * version 1 superblock
+ */
+
+static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
+{
+	unsigned int disk_csum, csum;
+	unsigned long long newcsum;
+	int size = 256 + le32_to_cpu(sb->max_dev)*2;
+	unsigned int *isuper = (unsigned int*)sb;
+	int i;
+
+	disk_csum = sb->sb_csum;
+	sb->sb_csum = 0;
+	newcsum = 0;
+	for (i=0; size>=4; size -= 4 )
+		newcsum += le32_to_cpu(*isuper++);
+
+	if (size == 2)
+		newcsum += le16_to_cpu(*(unsigned short*) isuper);
+
+	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
+	sb->sb_csum = disk_csum;
+	return cpu_to_le32(csum);
+}
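+
+/*
+ * A worked example of the fold above (sketch): if the little-endian
+ * words sum to newcsum = 0x123456789, then
+ *	csum = 0x23456789 + 0x1 = 0x2345678a
+ * i.e. the carry out of the low 32 bits is added back in
+ * (end-around carry, as in internet-style checksums).
+ */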
+
+static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
+{
+	struct mdp_superblock_1 *sb;
+	int ret;
+	sector_t sb_offset;
+	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
+	int bmask;
+
+	/*
+	 * Calculate the position of the superblock.
+	 * It is always aligned to a 4K boundary and
+	 * depending on minor_version, it can be:
+	 * 0: At least 8K, but less than 12K, from end of device
+	 * 1: At start of device
+	 * 2: 4K from start of device.
+	 */
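+	/*
+	 * Worked example for minor_version 0 (sketch): on a 10000-sector
+	 * device, sb_offset = 10000 - 16 = 9984 sectors, rounded down to
+	 * a multiple of 8 sectors (already aligned here), then /2 gives
+	 * 4992K, i.e. 8K from the end of the device.
+	 */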
+	switch(minor_version) {
+	case 0:
+		sb_offset = rdev->bdev->bd_inode->i_size >> 9;
+		sb_offset -= 8*2;
+		sb_offset &= ~(sector_t)(4*2-1);
+		/* convert from sectors to K */
+		sb_offset /= 2;
+		break;
+	case 1:
+		sb_offset = 0;
+		break;
+	case 2:
+		sb_offset = 4;
+		break;
+	default:
+		return -EINVAL;
+	}
+	rdev->sb_offset = sb_offset;
+
+	/* superblock is rarely larger than 1K, but it can be larger,
+	 * and it is safe to read 4k, so we do that
+	 */
+	ret = read_disk_sb(rdev, 4096);
+	if (ret) return ret;
+
+	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
+	    sb->major_version != cpu_to_le32(1) ||
+	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
+	    le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
+	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
+		return -EINVAL;
+
+	if (calc_sb_1_csum(sb) != sb->sb_csum) {
+		printk("md: invalid superblock checksum on %s\n",
+			bdevname(rdev->bdev,b));
+		return -EINVAL;
+	}
+	if (le64_to_cpu(sb->data_size) < 10) {
+		printk("md: data_size too small on %s\n",
+		       bdevname(rdev->bdev,b));
+		return -EINVAL;
+	}
+	rdev->preferred_minor = 0xffff;
+	rdev->data_offset = le64_to_cpu(sb->data_offset);
+	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
+
+	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
+	bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
+	if (rdev->sb_size & bmask)
+		rdev->sb_size = (rdev->sb_size | bmask)+1;
+
+	if (refdev == 0)
+		ret = 1;
+	else {
+		__u64 ev1, ev2;
+		struct mdp_superblock_1 *refsb = 
+			(struct mdp_superblock_1*)page_address(refdev->sb_page);
+
+		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
+		    sb->level != refsb->level ||
+		    sb->layout != refsb->layout ||
+		    sb->chunksize != refsb->chunksize) {
+			printk(KERN_WARNING "md: %s has strangely different"
+				" superblock to %s\n",
+				bdevname(rdev->bdev,b),
+				bdevname(refdev->bdev,b2));
+			return -EINVAL;
+		}
+		ev1 = le64_to_cpu(sb->events);
+		ev2 = le64_to_cpu(refsb->events);
+
+		if (ev1 > ev2)
+			ret = 1;
+		else
+			ret = 0;
+	}
+	if (minor_version) 
+		rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
+	else
+		rdev->size = rdev->sb_offset;
+	if (rdev->size < le64_to_cpu(sb->data_size)/2)
+		return -EINVAL;
+	rdev->size = le64_to_cpu(sb->data_size)/2;
+	if (le32_to_cpu(sb->chunksize))
+		rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
+
+	if (le32_to_cpu(sb->size) > rdev->size*2)
+		return -EINVAL;
+	return ret;
+}
+
+static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+	rdev->raid_disk = -1;
+	rdev->flags = 0;
+	if (mddev->raid_disks == 0) {
+		mddev->major_version = 1;
+		mddev->patch_version = 0;
+		mddev->persistent = 1;
+		mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
+		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
+		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
+		mddev->level = le32_to_cpu(sb->level);
+		mddev->clevel[0] = 0;
+		mddev->layout = le32_to_cpu(sb->layout);
+		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
+		mddev->size = le64_to_cpu(sb->size)/2;
+		mddev->events = le64_to_cpu(sb->events);
+		mddev->bitmap_offset = 0;
+		mddev->default_bitmap_offset = 1024 >> 9;
+		
+		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
+		memcpy(mddev->uuid, sb->set_uuid, 16);
+
+		mddev->max_disks =  (4096-256)/2;
+
+		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
+		    mddev->bitmap_file == NULL ) {
+			if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
+			    && mddev->level != 10) {
+				printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
+				return -EINVAL;
+			}
+			mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
+		}
+	} else if (mddev->pers == NULL) {
+		/* Insist of good event counter while assembling */
+		__u64 ev1 = le64_to_cpu(sb->events);
+		++ev1;
+		if (ev1 < mddev->events)
+			return -EINVAL;
+	} else if (mddev->bitmap) {
+		/* If adding to array with a bitmap, then we can accept an
+		 * older device, but not too old.
+		 */
+		__u64 ev1 = le64_to_cpu(sb->events);
+		if (ev1 < mddev->bitmap->events_cleared)
+			return 0;
+	} else /* just a hot-add of a new device, leave raid_disk at -1 */
+		return 0;
+
+	if (mddev->level != LEVEL_MULTIPATH) {
+		int role;
+		rdev->desc_nr = le32_to_cpu(sb->dev_number);
+		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+		switch(role) {
+		case 0xffff: /* spare */
+			break;
+		case 0xfffe: /* faulty */
+			set_bit(Faulty, &rdev->flags);
+			break;
+		default:
+			set_bit(In_sync, &rdev->flags);
+			rdev->raid_disk = role;
+			break;
+		}
+		if (sb->devflags & WriteMostly1)
+			set_bit(WriteMostly, &rdev->flags);
+	} else /* MULTIPATH are always insync */
+		set_bit(In_sync, &rdev->flags);
+
+	return 0;
+}
+
+static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	struct mdp_superblock_1 *sb;
+	struct list_head *tmp;
+	mdk_rdev_t *rdev2;
+	int max_dev, i;
+	/* make rdev->sb match mddev and rdev data. */
+
+	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+	sb->feature_map = 0;
+	sb->pad0 = 0;
+	memset(sb->pad1, 0, sizeof(sb->pad1));
+	memset(sb->pad2, 0, sizeof(sb->pad2));
+	memset(sb->pad3, 0, sizeof(sb->pad3));
+
+	sb->utime = cpu_to_le64((__u64)mddev->utime);
+	sb->events = cpu_to_le64(mddev->events);
+	if (mddev->in_sync)
+		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
+	else
+		sb->resync_offset = cpu_to_le64(0);
+
+	sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors);
+
+	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
+	sb->size = cpu_to_le64(mddev->size<<1);
+
+	if (mddev->bitmap && mddev->bitmap_file == NULL) {
+		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
+		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
+	}
+
+	max_dev = 0;
+	ITERATE_RDEV(mddev,rdev2,tmp)
+		if (rdev2->desc_nr+1 > max_dev)
+			max_dev = rdev2->desc_nr+1;
+	
+	sb->max_dev = cpu_to_le32(max_dev);
+	for (i=0; i<max_dev;i++)
+		sb->dev_roles[i] = cpu_to_le16(0xfffe);
+	
+	ITERATE_RDEV(mddev,rdev2,tmp) {
+		i = rdev2->desc_nr;
+		if (test_bit(Faulty, &rdev2->flags))
+			sb->dev_roles[i] = cpu_to_le16(0xfffe);
+		else if (test_bit(In_sync, &rdev2->flags))
+			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
+		else
+			sb->dev_roles[i] = cpu_to_le16(0xffff);
+	}
+
+	sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
+	sb->sb_csum = calc_sb_1_csum(sb);
+}
+
+
+static struct super_type super_types[] = {
+	[0] = {
+		.name	= "0.90.0",
+		.owner	= THIS_MODULE,
+		.load_super	= super_90_load,
+		.validate_super	= super_90_validate,
+		.sync_super	= super_90_sync,
+	},
+	[1] = {
+		.name	= "md-1",
+		.owner	= THIS_MODULE,
+		.load_super	= super_1_load,
+		.validate_super	= super_1_validate,
+		.sync_super	= super_1_sync,
+	},
+};
+	
+static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
+{
+	struct list_head *tmp;
+	mdk_rdev_t *rdev;
+
+	ITERATE_RDEV(mddev,rdev,tmp)
+		if (rdev->bdev->bd_contains == dev->bdev->bd_contains)
+			return rdev;
+
+	return NULL;
+}
+
+static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
+{
+	struct list_head *tmp;
+	mdk_rdev_t *rdev;
+
+	ITERATE_RDEV(mddev1,rdev,tmp)
+		if (match_dev_unit(mddev2, rdev))
+			return 1;
+
+	return 0;
+}
+
+static LIST_HEAD(pending_raid_disks);
+
+static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
+{
+	mdk_rdev_t *same_pdev;
+	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
+	struct kobject *ko;
+	char *s;
+
+	if (rdev->mddev) {
+		MD_BUG();
+		return -EINVAL;
+	}
+	/* make sure rdev->size exceeds mddev->size */
+	if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
+		if (mddev->pers)
+			/* Cannot change size, so fail */
+			return -ENOSPC;
+		else
+			mddev->size = rdev->size;
+	}
+	same_pdev = match_dev_unit(mddev, rdev);
+	if (same_pdev)
+		printk(KERN_WARNING
+			"%s: WARNING: %s appears to be on the same physical"
+	 		" disk as %s. True\n     protection against single-disk"
+			" failure might be compromised.\n",
+			mdname(mddev), bdevname(rdev->bdev,b),
+			bdevname(same_pdev->bdev,b2));
+
+	/* Verify rdev->desc_nr is unique.
+	 * If it is -1, assign a free number, else
+	 * check number is not in use
+	 */
+	if (rdev->desc_nr < 0) {
+		int choice = 0;
+		if (mddev->pers) choice = mddev->raid_disks;
+		while (find_rdev_nr(mddev, choice))
+			choice++;
+		rdev->desc_nr = choice;
+	} else {
+		if (find_rdev_nr(mddev, rdev->desc_nr))
+			return -EBUSY;
+	}
+	bdevname(rdev->bdev,b);
+	if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0)
+		return -ENOMEM;
+	while ( (s=strchr(rdev->kobj.k_name, '/')) != NULL)
+		*s = '!';
+			
+	list_add(&rdev->same_set, &mddev->disks);
+	rdev->mddev = mddev;
+	printk(KERN_INFO "md: bind<%s>\n", b);
+
+	rdev->kobj.parent = &mddev->kobj;
+	kobject_add(&rdev->kobj);
+
+	if (rdev->bdev->bd_part)
+		ko = &rdev->bdev->bd_part->kobj;
+	else
+		ko = &rdev->bdev->bd_disk->kobj;
+	sysfs_create_link(&rdev->kobj, ko, "block");
+	bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk);
+	return 0;
+}
+
+static void unbind_rdev_from_array(mdk_rdev_t * rdev)
+{
+	char b[BDEVNAME_SIZE];
+	if (!rdev->mddev) {
+		MD_BUG();
+		return;
+	}
+	bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
+	list_del_init(&rdev->same_set);
+	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
+	rdev->mddev = NULL;
+	sysfs_remove_link(&rdev->kobj, "block");
+	kobject_del(&rdev->kobj);
+}
+
+/*
+ * prevent the device from being mounted, repartitioned or
+ * otherwise reused by a RAID array (or any other kernel
+ * subsystem), by bd_claiming the device.
+ */
+static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
+{
+	int err = 0;
+	struct block_device *bdev;
+	char b[BDEVNAME_SIZE];
+
+	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+	if (IS_ERR(bdev)) {
+		printk(KERN_ERR "md: could not open %s.\n",
+			__bdevname(dev, b));
+		return PTR_ERR(bdev);
+	}
+	err = bd_claim(bdev, rdev);
+	if (err) {
+		printk(KERN_ERR "md: could not bd_claim %s.\n",
+			bdevname(bdev, b));
+		blkdev_put(bdev);
+		return err;
+	}
+	rdev->bdev = bdev;
+	return err;
+}
+
+static void unlock_rdev(mdk_rdev_t *rdev)
+{
+	struct block_device *bdev = rdev->bdev;
+	rdev->bdev = NULL;
+	if (!bdev)
+		MD_BUG();
+	bd_release(bdev);
+	blkdev_put(bdev);
+}
+
+void md_autodetect_dev(dev_t dev);
+
+static void export_rdev(mdk_rdev_t * rdev)
+{
+	char b[BDEVNAME_SIZE];
+	printk(KERN_INFO "md: export_rdev(%s)\n",
+		bdevname(rdev->bdev,b));
+	if (rdev->mddev)
+		MD_BUG();
+	free_disk_sb(rdev);
+	list_del_init(&rdev->same_set);
+#ifndef MODULE
+	md_autodetect_dev(rdev->bdev->bd_dev);
+#endif
+	unlock_rdev(rdev);
+	kobject_put(&rdev->kobj);
+}
+
+static void kick_rdev_from_array(mdk_rdev_t * rdev)
+{
+	unbind_rdev_from_array(rdev);
+	export_rdev(rdev);
+}
+
+static void export_array(mddev_t *mddev)
+{
+	struct list_head *tmp;
+	mdk_rdev_t *rdev;
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (!rdev->mddev) {
+			MD_BUG();
+			continue;
+		}
+		kick_rdev_from_array(rdev);
+	}
+	if (!list_empty(&mddev->disks))
+		MD_BUG();
+	mddev->raid_disks = 0;
+	mddev->major_version = 0;
+}
+
+static void print_desc(mdp_disk_t *desc)
+{
+	printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
+		desc->major,desc->minor,desc->raid_disk,desc->state);
+}
+
+static void print_sb(mdp_super_t *sb)
+{
+	int i;
+
+	printk(KERN_INFO 
+		"md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
+		sb->major_version, sb->minor_version, sb->patch_version,
+		sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
+		sb->ctime);
+	printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
+		sb->level, sb->size, sb->nr_disks, sb->raid_disks,
+		sb->md_minor, sb->layout, sb->chunk_size);
+	printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d"
+		" FD:%d SD:%d CSUM:%08x E:%08lx\n",
+		sb->utime, sb->state, sb->active_disks, sb->working_disks,
+		sb->failed_disks, sb->spare_disks,
+		sb->sb_csum, (unsigned long)sb->events_lo);
+
+	printk(KERN_INFO);
+	for (i = 0; i < MD_SB_DISKS; i++) {
+		mdp_disk_t *desc;
+
+		desc = sb->disks + i;
+		if (desc->number || desc->major || desc->minor ||
+		    desc->raid_disk || (desc->state && (desc->state != 4))) {
+			printk("     D %2d: ", i);
+			print_desc(desc);
+		}
+	}
+	printk(KERN_INFO "md:     THIS: ");
+	print_desc(&sb->this_disk);
+
+}
+
+static void print_rdev(mdk_rdev_t *rdev)
+{
+	char b[BDEVNAME_SIZE];
+	printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
+		bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
+	        test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
+	        rdev->desc_nr);
+	if (rdev->sb_loaded) {
+		printk(KERN_INFO "md: rdev superblock:\n");
+		print_sb((mdp_super_t*)page_address(rdev->sb_page));
+	} else
+		printk(KERN_INFO "md: no rdev superblock!\n");
+}
+
+void md_print_devices(void)
+{
+	struct list_head *tmp, *tmp2;
+	mdk_rdev_t *rdev;
+	mddev_t *mddev;
+	char b[BDEVNAME_SIZE];
+
+	printk("\n");
+	printk("md:	**********************************\n");
+	printk("md:	* <COMPLETE RAID STATE PRINTOUT> *\n");
+	printk("md:	**********************************\n");
+	ITERATE_MDDEV(mddev,tmp) {
+
+		if (mddev->bitmap)
+			bitmap_print_sb(mddev->bitmap);
+		else
+			printk("%s: ", mdname(mddev));
+		ITERATE_RDEV(mddev,rdev,tmp2)
+			printk("<%s>", bdevname(rdev->bdev,b));
+		printk("\n");
+
+		ITERATE_RDEV(mddev,rdev,tmp2)
+			print_rdev(rdev);
+	}
+	printk("md:	**********************************\n");
+	printk("\n");
+}
+
+
+static void sync_sbs(mddev_t * mddev)
+{
+	mdk_rdev_t *rdev;
+	struct list_head *tmp;
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		super_types[mddev->major_version].
+			sync_super(mddev, rdev);
+		rdev->sb_loaded = 1;
+	}
+}
+
+static void md_update_sb(mddev_t * mddev)
+{
+	int err;
+	struct list_head *tmp;
+	mdk_rdev_t *rdev;
+	int sync_req;
+
+repeat:
+	spin_lock_irq(&mddev->write_lock);
+	sync_req = mddev->in_sync;
+	mddev->utime = get_seconds();
+	mddev->events ++;
+
+	if (!mddev->events) {
+		/*
+		 * oops, this 64-bit counter should never wrap.
+		 * Either the year is around ~1 trillion A.D. (assuming
+		 * 1 reboot per second), or we have a bug:
+		 */
+		MD_BUG();
+		mddev->events --;
+	}
+	mddev->sb_dirty = 2;
+	sync_sbs(mddev);
+
+	/*
+	 * do not write anything to disk if using
+	 * nonpersistent superblocks
+	 */
+	if (!mddev->persistent) {
+		mddev->sb_dirty = 0;
+		spin_unlock_irq(&mddev->write_lock);
+		wake_up(&mddev->sb_wait);
+		return;
+	}
+	spin_unlock_irq(&mddev->write_lock);
+
+	dprintk(KERN_INFO 
+		"md: updating %s RAID superblock on device (in sync %d)\n",
+		mdname(mddev),mddev->in_sync);
+
+	err = bitmap_update_sb(mddev->bitmap);
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		char b[BDEVNAME_SIZE];
+		dprintk(KERN_INFO "md: ");
+		if (test_bit(Faulty, &rdev->flags))
+			dprintk("(skipping faulty ");
+
+		dprintk("%s ", bdevname(rdev->bdev,b));
+		if (!test_bit(Faulty, &rdev->flags)) {
+			md_super_write(mddev,rdev,
+				       rdev->sb_offset<<1, rdev->sb_size,
+				       rdev->sb_page);
+			dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
+				bdevname(rdev->bdev,b),
+				(unsigned long long)rdev->sb_offset);
+
+		} else
+			dprintk(")\n");
+		if (mddev->level == LEVEL_MULTIPATH)
+			/* only need to write one superblock... */
+			break;
+	}
+	md_super_wait(mddev);
+	/* if there was a failure, sb_dirty was set to 1, and we re-write super */
+
+	spin_lock_irq(&mddev->write_lock);
+	if (mddev->in_sync != sync_req || mddev->sb_dirty == 1) {
+		/* have to write it out again */
+		spin_unlock_irq(&mddev->write_lock);
+		goto repeat;
+	}
+	mddev->sb_dirty = 0;
+	spin_unlock_irq(&mddev->write_lock);
+	wake_up(&mddev->sb_wait);
+
+}
+
+/* words written to sysfs files may, or may not, be \n terminated.
+ * We want to accept either case. For this we use cmd_match.
+ */
+static int cmd_match(const char *cmd, const char *str)
+{
+	/* See if cmd, written into a sysfs file, matches
+	 * str.  They must either be the same, or cmd can
+	 * have a trailing newline
+	 */
+	while (*cmd && *str && *cmd == *str) {
+		cmd++;
+		str++;
+	}
+	if (*cmd == '\n')
+		cmd++;
+	if (*str || *cmd)
+		return 0;
+	return 1;
+}
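+
+/*
+ * For example:
+ *	cmd_match("check\n", "check") == 1
+ *	cmd_match("check",   "check") == 1
+ *	cmd_match("checks",  "check") == 0
+ */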
+
+struct rdev_sysfs_entry {
+	struct attribute attr;
+	ssize_t (*show)(mdk_rdev_t *, char *);
+	ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
+};
+
+static ssize_t
+state_show(mdk_rdev_t *rdev, char *page)
+{
+	char *sep = "";
+	int len=0;
+
+	if (test_bit(Faulty, &rdev->flags)) {
+		len+= sprintf(page+len, "%sfaulty",sep);
+		sep = ",";
+	}
+	if (test_bit(In_sync, &rdev->flags)) {
+		len += sprintf(page+len, "%sin_sync",sep);
+		sep = ",";
+	}
+	if (!test_bit(Faulty, &rdev->flags) &&
+	    !test_bit(In_sync, &rdev->flags)) {
+		len += sprintf(page+len, "%sspare", sep);
+		sep = ",";
+	}
+	return len+sprintf(page+len, "\n");
+}
+
+static struct rdev_sysfs_entry
+rdev_state = __ATTR_RO(state);
+
+static ssize_t
+super_show(mdk_rdev_t *rdev, char *page)
+{
+	if (rdev->sb_loaded && rdev->sb_size) {
+		memcpy(page, page_address(rdev->sb_page), rdev->sb_size);
+		return rdev->sb_size;
+	} else
+		return 0;
+}
+static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super);
+
+static ssize_t
+errors_show(mdk_rdev_t *rdev, char *page)
+{
+	return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
+}
+
+static ssize_t
+errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+	char *e;
+	unsigned long n = simple_strtoul(buf, &e, 10);
+	if (*buf && (*e == 0 || *e == '\n')) {
+		atomic_set(&rdev->corrected_errors, n);
+		return len;
+	}
+	return -EINVAL;
+}
+static struct rdev_sysfs_entry rdev_errors =
+__ATTR(errors, 0644, errors_show, errors_store);
+
+static ssize_t
+slot_show(mdk_rdev_t *rdev, char *page)
+{
+	if (rdev->raid_disk < 0)
+		return sprintf(page, "none\n");
+	else
+		return sprintf(page, "%d\n", rdev->raid_disk);
+}
+
+static ssize_t
+slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+	char *e;
+	int slot = simple_strtoul(buf, &e, 10);
+	if (strncmp(buf, "none", 4)==0)
+		slot = -1;
+	else if (e==buf || (*e && *e!= '\n'))
+		return -EINVAL;
+	if (rdev->mddev->pers)
+		/* Cannot set slot in active array (yet) */
+		return -EBUSY;
+	if (slot >= rdev->mddev->raid_disks)
+		return -ENOSPC;
+	rdev->raid_disk = slot;
+	/* assume it is working */
+	rdev->flags = 0;
+	set_bit(In_sync, &rdev->flags);
+	return len;
+}
+
+
+static struct rdev_sysfs_entry rdev_slot =
+__ATTR(slot, 0644, slot_show, slot_store);
+
+static ssize_t
+offset_show(mdk_rdev_t *rdev, char *page)
+{
+	return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
+}
+
+static ssize_t
+offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+	char *e;
+	unsigned long long offset = simple_strtoull(buf, &e, 10);
+	if (e==buf || (*e && *e != '\n'))
+		return -EINVAL;
+	if (rdev->mddev->pers)
+		return -EBUSY;
+	rdev->data_offset = offset;
+	return len;
+}
+
+static struct rdev_sysfs_entry rdev_offset =
+__ATTR(offset, 0644, offset_show, offset_store);
+
+static ssize_t
+rdev_size_show(mdk_rdev_t *rdev, char *page)
+{
+	return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
+}
+
+static ssize_t
+rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+	char *e;
+	unsigned long long size = simple_strtoull(buf, &e, 10);
+	if (e==buf || (*e && *e != '\n'))
+		return -EINVAL;
+	if (rdev->mddev->pers)
+		return -EBUSY;
+	rdev->size = size;
+	if (size < rdev->mddev->size || rdev->mddev->size == 0)
+		rdev->mddev->size = size;
+	return len;
+}
+
+static struct rdev_sysfs_entry rdev_size =
+__ATTR(size, 0644, rdev_size_show, rdev_size_store);
+
+static struct attribute *rdev_default_attrs[] = {
+	&rdev_state.attr,
+	&rdev_super.attr,
+	&rdev_errors.attr,
+	&rdev_slot.attr,
+	&rdev_offset.attr,
+	&rdev_size.attr,
+	NULL,
+};
+static ssize_t
+rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
+	mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
+
+	if (!entry->show)
+		return -EIO;
+	return entry->show(rdev, page);
+}
+
+static ssize_t
+rdev_attr_store(struct kobject *kobj, struct attribute *attr,
+	      const char *page, size_t length)
+{
+	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
+	mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
+
+	if (!entry->store)
+		return -EIO;
+	return entry->store(rdev, page, length);
+}
+
+static void rdev_free(struct kobject *ko)
+{
+	mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
+	kfree(rdev);
+}
+static struct sysfs_ops rdev_sysfs_ops = {
+	.show		= rdev_attr_show,
+	.store		= rdev_attr_store,
+};
+static struct kobj_type rdev_ktype = {
+	.release	= rdev_free,
+	.sysfs_ops	= &rdev_sysfs_ops,
+	.default_attrs	= rdev_default_attrs,
+};
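+
+/*
+ * Sketch of the plumbing above: a read of
+ * /sys/block/mdX/md/dev-YYY/state lands in rdev_attr_show(), which
+ * recovers the mdk_rdev_t and the rdev_sysfs_entry via container_of()
+ * and calls the entry's ->show() method, here state_show().
+ */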
+
+/*
+ * Import a device. If 'super_format' >= 0, then sanity check the superblock
+ *
+ * mark the device faulty if:
+ *
+ *   - the device is nonexistent (zero size)
+ *   - the device has no valid superblock
+ *
+ * a faulty rdev _never_ has rdev->sb set.
+ */
+static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
+{
+	char b[BDEVNAME_SIZE];
+	int err;
+	mdk_rdev_t *rdev;
+	sector_t size;
+
+	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
+	if (!rdev) {
+		printk(KERN_ERR "md: could not alloc mem for new device!\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	if ((err = alloc_disk_sb(rdev)))
+		goto abort_free;
+
+	err = lock_rdev(rdev, newdev);
+	if (err)
+		goto abort_free;
+
+	rdev->kobj.parent = NULL;
+	rdev->kobj.ktype = &rdev_ktype;
+	kobject_init(&rdev->kobj);
+
+	rdev->desc_nr = -1;
+	rdev->flags = 0;
+	rdev->data_offset = 0;
+	atomic_set(&rdev->nr_pending, 0);
+	atomic_set(&rdev->read_errors, 0);
+	atomic_set(&rdev->corrected_errors, 0);
+
+	size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+	if (!size) {
+		printk(KERN_WARNING 
+			"md: %s has zero or unknown size, marking faulty!\n",
+			bdevname(rdev->bdev,b));
+		err = -EINVAL;
+		goto abort_free;
+	}
+
+	if (super_format >= 0) {
+		err = super_types[super_format].
+			load_super(rdev, NULL, super_minor);
+		if (err == -EINVAL) {
+			printk(KERN_WARNING 
+				"md: %s has invalid sb, not importing!\n",
+				bdevname(rdev->bdev,b));
+			goto abort_free;
+		}
+		if (err < 0) {
+			printk(KERN_WARNING 
+				"md: could not read %s's sb, not importing!\n",
+				bdevname(rdev->bdev,b));
+			goto abort_free;
+		}
+	}
+	INIT_LIST_HEAD(&rdev->same_set);
+
+	return rdev;
+
+abort_free:
+	if (rdev->sb_page) {
+		if (rdev->bdev)
+			unlock_rdev(rdev);
+		free_disk_sb(rdev);
+	}
+	kfree(rdev);
+	return ERR_PTR(err);
+}
+
+/*
+ * Check a full RAID array for plausibility
+ */
+
+
+static void analyze_sbs(mddev_t * mddev)
+{
+	int i;
+	struct list_head *tmp;
+	mdk_rdev_t *rdev, *freshest;
+	char b[BDEVNAME_SIZE];
+
+	freshest = NULL;
+	ITERATE_RDEV(mddev,rdev,tmp)
+		switch (super_types[mddev->major_version].
+			load_super(rdev, freshest, mddev->minor_version)) {
+		case 1:
+			freshest = rdev;
+			break;
+		case 0:
+			break;
+		default:
+			printk( KERN_ERR \
+				"md: fatal superblock inconsistency in %s"
+				" -- removing from array\n", 
+				bdevname(rdev->bdev,b));
+			kick_rdev_from_array(rdev);
+		}
+
+	super_types[mddev->major_version].
+		validate_super(mddev, freshest);
+
+	i = 0;
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev != freshest)
+			if (super_types[mddev->major_version].
+			    validate_super(mddev, rdev)) {
+				printk(KERN_WARNING "md: kicking non-fresh %s"
+					" from array!\n",
+					bdevname(rdev->bdev,b));
+				kick_rdev_from_array(rdev);
+				continue;
+			}
+		if (mddev->level == LEVEL_MULTIPATH) {
+			rdev->desc_nr = i++;
+			rdev->raid_disk = rdev->desc_nr;
+			set_bit(In_sync, &rdev->flags);
+		}
+	}
+
+	if (mddev->recovery_cp != MaxSector &&
+	    mddev->level >= 1)
+		printk(KERN_ERR "md: %s: raid array is not clean"
+		       " -- starting background reconstruction\n",
+		       mdname(mddev));
+
+}
+
+static ssize_t
+level_show(mddev_t *mddev, char *page)
+{
+	struct mdk_personality *p = mddev->pers;
+	if (p)
+		return sprintf(page, "%s\n", p->name);
+	else if (mddev->clevel[0])
+		return sprintf(page, "%s\n", mddev->clevel);
+	else if (mddev->level != LEVEL_NONE)
+		return sprintf(page, "%d\n", mddev->level);
+	else
+		return 0;
+}
+
+static ssize_t
+level_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	int rv = len;
+	if (mddev->pers)
+		return -EBUSY;
+	if (len == 0)
+		return 0;
+	if (len >= sizeof(mddev->clevel))
+		return -ENOSPC;
+	strncpy(mddev->clevel, buf, len);
+	if (mddev->clevel[len-1] == '\n')
+		len--;
+	mddev->clevel[len] = 0;
+	mddev->level = LEVEL_NONE;
+	return rv;
+}
+
+static struct md_sysfs_entry md_level =
+__ATTR(level, 0644, level_show, level_store);
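+
+/*
+ * Usage sketch (device name hypothetical): naming the personality for a
+ * not-yet-started array, so do_md_run() can request the right module:
+ *
+ *	echo raid5 > /sys/block/md0/md/level
+ */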
+
+static ssize_t
+raid_disks_show(mddev_t *mddev, char *page)
+{
+	if (mddev->raid_disks == 0)
+		return 0;
+	return sprintf(page, "%d\n", mddev->raid_disks);
+}
+
+static int update_raid_disks(mddev_t *mddev, int raid_disks);
+
+static ssize_t
+raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	/* can only set raid_disks if array is not yet active */
+	char *e;
+	int rv = 0;
+	unsigned long n = simple_strtoul(buf, &e, 10);
+
+	if (!*buf || (*e && *e != '\n'))
+		return -EINVAL;
+
+	if (mddev->pers)
+		rv = update_raid_disks(mddev, n);
+	else
+		mddev->raid_disks = n;
+	return rv ? rv : len;
+}
+static struct md_sysfs_entry md_raid_disks =
+__ATTR(raid_disks, 0644, raid_disks_show, raid_disks_store);
+
+static ssize_t
+chunk_size_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%d\n", mddev->chunk_size);
+}
+
+static ssize_t
+chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	/* can only set chunk_size if array is not yet active */
+	char *e;
+	unsigned long n = simple_strtoul(buf, &e, 10);
+
+	if (mddev->pers)
+		return -EBUSY;
+	if (!*buf || (*e && *e != '\n'))
+		return -EINVAL;
+
+	mddev->chunk_size = n;
+	return len;
+}
+static struct md_sysfs_entry md_chunk_size =
+__ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);
+
+static ssize_t
+null_show(mddev_t *mddev, char *page)
+{
+	return -EINVAL;
+}
+
+static ssize_t
+new_dev_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	/* buf must be "%d:%d" (with an optional trailing \n), giving major and minor numbers */
+	/* The new device is added to the array.
+	 * If the array has a persistent superblock, we read the
+	 * superblock to initialise info and check validity.
+	 * Otherwise, only checking done is that in bind_rdev_to_array,
+	 * which mainly checks size.
+	 */
+	char *e;
+	int major = simple_strtoul(buf, &e, 10);
+	int minor;
+	dev_t dev;
+	mdk_rdev_t *rdev;
+	int err;
+
+	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
+		return -EINVAL;
+	minor = simple_strtoul(e+1, &e, 10);
+	if (*e && *e != '\n')
+		return -EINVAL;
+	dev = MKDEV(major, minor);
+	if (major != MAJOR(dev) ||
+	    minor != MINOR(dev))
+		return -EOVERFLOW;
+
+	if (mddev->persistent) {
+		rdev = md_import_device(dev, mddev->major_version,
+					mddev->minor_version);
+		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
+			mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
+						       mdk_rdev_t, same_set);
+			err = super_types[mddev->major_version]
+				.load_super(rdev, rdev0, mddev->minor_version);
+			if (err < 0)
+				goto out;
+		}
+	} else
+		rdev = md_import_device(dev, -1, -1);
+
+	if (IS_ERR(rdev))
+		return PTR_ERR(rdev);
+	err = bind_rdev_to_array(rdev, mddev);
+ out:
+	if (err)
+		export_rdev(rdev);
+	return err ? err : len;
+}
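+
+/*
+ * Usage sketch (the 8:16 major:minor pair is only an example): adding a
+ * component device to an array by its device numbers:
+ *
+ *	echo 8:16 > /sys/block/md0/md/new_dev
+ */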
+
+static struct md_sysfs_entry md_new_device =
+__ATTR(new_dev, 0200, null_show, new_dev_store);
+
+static ssize_t
+size_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
+}
+
+static int update_size(mddev_t *mddev, unsigned long size);
+
+static ssize_t
+size_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	/* If array is inactive, we can reduce the component size, but
+	 * not increase it (except from 0).
+	 * If array is active, we can try an on-line resize
+	 */
+	char *e;
+	int err = 0;
+	unsigned long long size = simple_strtoull(buf, &e, 10);
+	if (!*buf || *buf == '\n' ||
+	    (*e && *e != '\n'))
+		return -EINVAL;
+
+	if (mddev->pers) {
+		err = update_size(mddev, size);
+		md_update_sb(mddev);
+	} else {
+		if (mddev->size == 0 ||
+		    mddev->size > size)
+			mddev->size = size;
+		else
+			err = -ENOSPC;
+	}
+	return err ? err : len;
+}
+
+static struct md_sysfs_entry md_size =
+__ATTR(component_size, 0644, size_show, size_store);
+
+
+/* Metadata version.
+ * This is either 'none' for arrays with externally managed metadata,
+ * or N.M for internally known formats
+ */
+static ssize_t
+metadata_show(mddev_t *mddev, char *page)
+{
+	if (mddev->persistent)
+		return sprintf(page, "%d.%d\n",
+			       mddev->major_version, mddev->minor_version);
+	else
+		return sprintf(page, "none\n");
+}
+
+static ssize_t
+metadata_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	int major, minor;
+	char *e;
+	if (!list_empty(&mddev->disks))
+		return -EBUSY;
+
+	if (cmd_match(buf, "none")) {
+		mddev->persistent = 0;
+		mddev->major_version = 0;
+		mddev->minor_version = 90;
+		return len;
+	}
+	major = simple_strtoul(buf, &e, 10);
+	if (e==buf || *e != '.')
+		return -EINVAL;
+	buf = e+1;
+	minor = simple_strtoul(buf, &e, 10);
+	if (e==buf || *e != '\n')
+		return -EINVAL;
+	if (major >= sizeof(super_types)/sizeof(super_types[0]) ||
+	    super_types[major].name == NULL)
+		return -ENOENT;
+	mddev->major_version = major;
+	mddev->minor_version = minor;
+	mddev->persistent = 1;
+	return len;
+}
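+
+/*
+ * Usage sketch: choosing internally managed 0.90 metadata, or declaring
+ * the metadata externally managed, for an array with no disks yet:
+ *
+ *	echo 0.90 > /sys/block/md0/md/metadata_version
+ *	echo none > /sys/block/md0/md/metadata_version
+ */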
+
+static struct md_sysfs_entry md_metadata =
+__ATTR(metadata_version, 0644, metadata_show, metadata_store);
+
+static ssize_t
+action_show(mddev_t *mddev, char *page)
+{
+	char *type = "idle";
+	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+	    test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) {
+		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+			if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+				type = "resync";
+			else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+				type = "check";
+			else
+				type = "repair";
+		} else
+			type = "recover";
+	}
+	return sprintf(page, "%s\n", type);
+}
+
+static ssize_t
+action_store(mddev_t *mddev, const char *page, size_t len)
+{
+	if (!mddev->pers || !mddev->pers->sync_request)
+		return -EINVAL;
+
+	if (cmd_match(page, "idle")) {
+		if (mddev->sync_thread) {
+			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+			md_unregister_thread(mddev->sync_thread);
+			mddev->sync_thread = NULL;
+			mddev->recovery = 0;
+		}
+	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
+		return -EBUSY;
+	else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	else {
+		if (cmd_match(page, "check"))
+			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+		else if (!cmd_match(page, "repair"))
+			return -EINVAL;
+		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+	}
+	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	md_wakeup_thread(mddev->thread);
+	return len;
+}
+
+static ssize_t
+mismatch_cnt_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%llu\n",
+		       (unsigned long long) mddev->resync_mismatches);
+}
+
+static struct md_sysfs_entry
+md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
+
+
+static struct md_sysfs_entry
+md_mismatches = __ATTR_RO(mismatch_cnt);
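+
+/*
+ * Usage sketch: starting a read-only consistency check and reading back
+ * the mismatch count afterwards:
+ *
+ *	echo check > /sys/block/md0/md/sync_action
+ *	cat /sys/block/md0/md/mismatch_cnt
+ */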
+
+static ssize_t
+sync_min_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%d (%s)\n", speed_min(mddev),
+		       mddev->sync_speed_min ? "local": "system");
+}
+
+static ssize_t
+sync_min_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	int min;
+	char *e;
+	if (strncmp(buf, "system", 6)==0) {
+		mddev->sync_speed_min = 0;
+		return len;
+	}
+	min = simple_strtoul(buf, &e, 10);
+	if (buf == e || (*e && *e != '\n') || min <= 0)
+		return -EINVAL;
+	mddev->sync_speed_min = min;
+	return len;
+}
+
+static struct md_sysfs_entry md_sync_min =
+__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
+
+static ssize_t
+sync_max_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%d (%s)\n", speed_max(mddev),
+		       mddev->sync_speed_max ? "local": "system");
+}
+
+static ssize_t
+sync_max_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	int max;
+	char *e;
+	if (strncmp(buf, "system", 6)==0) {
+		mddev->sync_speed_max = 0;
+		return len;
+	}
+	max = simple_strtoul(buf, &e, 10);
+	if (buf == e || (*e && *e != '\n') || max <= 0)
+		return -EINVAL;
+	mddev->sync_speed_max = max;
+	return len;
+}
+
+static struct md_sysfs_entry md_sync_max =
+__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
+
+
+static ssize_t
+sync_speed_show(mddev_t *mddev, char *page)
+{
+	unsigned long resync, dt, db;
+	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
+	dt = ((jiffies - mddev->resync_mark) / HZ);
+	if (!dt) dt++;
+	db = resync - (mddev->resync_mark_cnt);
+	return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
+}
+
+static struct md_sysfs_entry
+md_sync_speed = __ATTR_RO(sync_speed);
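+
+/*
+ * Worked example of the arithmetic in sync_speed_show (sketch): with
+ * db = 20480 sectors resynced over dt = 10 seconds, the file reports
+ * 20480 / 10 / 2 = 1024 K/sec (two 512-byte sectors per K).
+ */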
+
+static ssize_t
+sync_completed_show(mddev_t *mddev, char *page)
+{
+	unsigned long max_blocks, resync;
+
+	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+		max_blocks = mddev->resync_max_sectors;
+	else
+		max_blocks = mddev->size << 1;
+
+	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
+	return sprintf(page, "%lu / %lu\n", resync, max_blocks);
+}
+
+static struct md_sysfs_entry
+md_sync_completed = __ATTR_RO(sync_completed);
+
+static struct attribute *md_default_attrs[] = {
+	&md_level.attr,
+	&md_raid_disks.attr,
+	&md_chunk_size.attr,
+	&md_size.attr,
+	&md_metadata.attr,
+	&md_new_device.attr,
+	NULL,
+};
+
+static struct attribute *md_redundancy_attrs[] = {
+	&md_scan_mode.attr,
+	&md_mismatches.attr,
+	&md_sync_min.attr,
+	&md_sync_max.attr,
+	&md_sync_speed.attr,
+	&md_sync_completed.attr,
+	NULL,
+};
+static struct attribute_group md_redundancy_group = {
+	.name = NULL,
+	.attrs = md_redundancy_attrs,
+};
+
+
+static ssize_t
+md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
+	mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
+	ssize_t rv;
+
+	if (!entry->show)
+		return -EIO;
+	mddev_lock(mddev);
+	rv = entry->show(mddev, page);
+	mddev_unlock(mddev);
+	return rv;
+}
+
+static ssize_t
+md_attr_store(struct kobject *kobj, struct attribute *attr,
+	      const char *page, size_t length)
+{
+	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
+	mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
+	ssize_t rv;
+
+	if (!entry->store)
+		return -EIO;
+	mddev_lock(mddev);
+	rv = entry->store(mddev, page, length);
+	mddev_unlock(mddev);
+	return rv;
+}
+
+static void md_free(struct kobject *ko)
+{
+	mddev_t *mddev = container_of(ko, mddev_t, kobj);
+	kfree(mddev);
+}
+
+static struct sysfs_ops md_sysfs_ops = {
+	.show	= md_attr_show,
+	.store	= md_attr_store,
+};
+static struct kobj_type md_ktype = {
+	.release	= md_free,
+	.sysfs_ops	= &md_sysfs_ops,
+	.default_attrs	= md_default_attrs,
+};
+
+int mdp_major = 0;
+
+static struct kobject *md_probe(dev_t dev, int *part, void *data)
+{
+	static DEFINE_MUTEX(disks_mutex);
+	mddev_t *mddev = mddev_find(dev);
+	struct gendisk *disk;
+	int partitioned = (MAJOR(dev) != MD_MAJOR);
+	int shift = partitioned ? MdpMinorShift : 0;
+	int unit = MINOR(dev) >> shift;
+
+	if (!mddev)
+		return NULL;
+
+	mutex_lock(&disks_mutex);
+	if (mddev->gendisk) {
+		mutex_unlock(&disks_mutex);
+		mddev_put(mddev);
+		return NULL;
+	}
+	disk = alloc_disk(1 << shift);
+	if (!disk) {
+		mutex_unlock(&disks_mutex);
+		mddev_put(mddev);
+		return NULL;
+	}
+	disk->major = MAJOR(dev);
+	disk->first_minor = unit << shift;
+	if (partitioned) {
+		sprintf(disk->disk_name, "md_d%d", unit);
+		sprintf(disk->devfs_name, "md/d%d", unit);
+	} else {
+		sprintf(disk->disk_name, "md%d", unit);
+		sprintf(disk->devfs_name, "md/%d", unit);
+	}
+	disk->fops = &md_fops;
+	disk->private_data = mddev;
+	disk->queue = mddev->queue;
+	add_disk(disk);
+	mddev->gendisk = disk;
+	mutex_unlock(&disks_mutex);
+	mddev->kobj.parent = &disk->kobj;
+	mddev->kobj.k_name = NULL;
+	snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md");
+	mddev->kobj.ktype = &md_ktype;
+	kobject_register(&mddev->kobj);
+	return NULL;
+}
+
+void md_wakeup_thread(mdk_thread_t *thread);
+
+static void md_safemode_timeout(unsigned long data)
+{
+	mddev_t *mddev = (mddev_t *) data;
+
+	mddev->safemode = 1;
+	md_wakeup_thread(mddev->thread);
+}
+
+static int start_dirty_degraded;
+
+static int do_md_run(mddev_t * mddev)
+{
+	int err;
+	int chunk_size;
+	struct list_head *tmp;
+	mdk_rdev_t *rdev;
+	struct gendisk *disk;
+	struct mdk_personality *pers;
+	char b[BDEVNAME_SIZE];
+
+	if (list_empty(&mddev->disks))
+		/* cannot run an array with no devices.. */
+		return -EINVAL;
+
+	if (mddev->pers)
+		return -EBUSY;
+
+	/*
+	 * Analyze all RAID superblock(s)
+	 */
+	if (!mddev->raid_disks)
+		analyze_sbs(mddev);
+
+	chunk_size = mddev->chunk_size;
+
+	if (chunk_size) {
+		if (chunk_size > MAX_CHUNK_SIZE) {
+			printk(KERN_ERR "too big chunk_size: %d > %d\n",
+				chunk_size, MAX_CHUNK_SIZE);
+			return -EINVAL;
+		}
+		/*
+		 * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
+		 */
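+		/*
+		 * ffz(~n) is the index of the lowest set bit of n, so
+		 * (1 << ffz(~n)) == n exactly when n is a power of two:
+		 * e.g. n = 65536 passes, n = 98304 (0x18000) fails.
+		 */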
+		if ( (1 << ffz(~chunk_size)) != chunk_size) {
+			printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size);
+			return -EINVAL;
+		}
+		if (chunk_size < PAGE_SIZE) {
+			printk(KERN_ERR "too small chunk_size: %d < %ld\n",
+				chunk_size, PAGE_SIZE);
+			return -EINVAL;
+		}
+
+		/* devices must have minimum size of one chunk */
+		ITERATE_RDEV(mddev,rdev,tmp) {
+			if (test_bit(Faulty, &rdev->flags))
+				continue;
+			if (rdev->size < chunk_size / 1024) {
+				printk(KERN_WARNING
+					"md: Dev %s smaller than chunk_size:"
+					" %lluk < %dk\n",
+					bdevname(rdev->bdev,b),
+					(unsigned long long)rdev->size,
+					chunk_size / 1024);
+				return -EINVAL;
+			}
+		}
+	}
+
+#ifdef CONFIG_KMOD
+	if (mddev->level != LEVEL_NONE)
+		request_module("md-level-%d", mddev->level);
+	else if (mddev->clevel[0])
+		request_module("md-%s", mddev->clevel);
+#endif
+
+	/*
+	 * Drop all container device buffers, from now on
+	 * the only valid external interface is through the md
+	 * device.
+	 */
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (test_bit(Faulty, &rdev->flags))
+			continue;
+		sync_blockdev(rdev->bdev);
+		invalidate_bdev(rdev->bdev, 0);
+	}
+
+	md_probe(mddev->unit, NULL, NULL);
+	disk = mddev->gendisk;
+	if (!disk)
+		return -ENOMEM;
+
+	spin_lock(&pers_lock);
+	pers = find_pers(mddev->level, mddev->clevel);
+	if (!pers || !try_module_get(pers->owner)) {
+		spin_unlock(&pers_lock);
+		if (mddev->level != LEVEL_NONE)
+			printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
+			       mddev->level);
+		else
+			printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
+			       mddev->clevel);
+		return -EINVAL;
+	}
+	mddev->pers = pers;
+	spin_unlock(&pers_lock);
+	mddev->level = pers->level;
+	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+
+	mddev->recovery = 0;
+	mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
+	mddev->barriers_work = 1;
+	mddev->ok_start_degraded = start_dirty_degraded;
+
+	if (start_readonly)
+		mddev->ro = 2; /* read-only, but switch on first write */
+
+	err = mddev->pers->run(mddev);
+	if (!err && mddev->pers->sync_request) {
+		err = bitmap_create(mddev);
+		if (err) {
+			printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
+			       mdname(mddev), err);
+			mddev->pers->stop(mddev);
+		}
+	}
+	if (err) {
+		printk(KERN_ERR "md: pers->run() failed ...\n");
+		module_put(mddev->pers->owner);
+		mddev->pers = NULL;
+		bitmap_destroy(mddev);
+		return err;
+	}
+	if (mddev->pers->sync_request)
+		sysfs_create_group(&mddev->kobj, &md_redundancy_group);
+	else if (mddev->ro == 2) /* auto-readonly not meaningful */
+		mddev->ro = 0;
+
+ 	atomic_set(&mddev->writes_pending,0);
+	mddev->safemode = 0;
+	mddev->safemode_timer.function = md_safemode_timeout;
+	mddev->safemode_timer.data = (unsigned long) mddev;
+	mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */
+	mddev->in_sync = 1;
+
+	ITERATE_RDEV(mddev,rdev,tmp)
+		if (rdev->raid_disk >= 0) {
+			char nm[20];
+			sprintf(nm, "rd%d", rdev->raid_disk);
+			sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+		}
+	
+	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	md_wakeup_thread(mddev->thread);
+	
+	if (mddev->sb_dirty)
+		md_update_sb(mddev);
+
+	set_capacity(disk, mddev->array_size<<1);
+
+	/* If we call blk_queue_make_request here, it will
+	 * re-initialise max_sectors etc which may have been
+	 * refined inside -> run.  So just set the bits we need to set.
+	 * Most initialisation happened when we called
+	 * blk_queue_make_request(..., md_fail_request)
+	 * earlier.
+	 */
+	mddev->queue->queuedata = mddev;
+	mddev->queue->make_request_fn = mddev->pers->make_request;
+
+	mddev->changed = 1;
+	md_new_event(mddev);
+	return 0;
+}
+
+static int restart_array(mddev_t *mddev)
+{
+	struct gendisk *disk = mddev->gendisk;
+	int err;
+
+	/*
+	 * Complain if it has no devices
+	 */
+	err = -ENXIO;
+	if (list_empty(&mddev->disks))
+		goto out;
+
+	if (mddev->pers) {
+		err = -EBUSY;
+		if (!mddev->ro)
+			goto out;
+
+		mddev->safemode = 0;
+		mddev->ro = 0;
+		set_disk_ro(disk, 0);
+
+		printk(KERN_INFO "md: %s switched to read-write mode.\n",
+			mdname(mddev));
+		/*
+		 * Kick recovery or resync if necessary
+		 */
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+		md_wakeup_thread(mddev->thread);
+		err = 0;
+	} else {
+		printk(KERN_ERR "md: %s has no personality assigned.\n",
+			mdname(mddev));
+		err = -EINVAL;
+	}
+
+out:
+	return err;
+}
+
+static int do_md_stop(mddev_t * mddev, int ro)
+{
+	int err = 0;
+	struct gendisk *disk = mddev->gendisk;
+
+	if (mddev->pers) {
+		if (atomic_read(&mddev->active)>2) {
+			printk("md: %s still in use.\n",mdname(mddev));
+			return -EBUSY;
+		}
+
+		if (mddev->sync_thread) {
+			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+			md_unregister_thread(mddev->sync_thread);
+			mddev->sync_thread = NULL;
+		}
+
+		del_timer_sync(&mddev->safemode_timer);
+
+		invalidate_partition(disk, 0);
+
+		if (ro) {
+			err  = -ENXIO;
+			if (mddev->ro==1)
+				goto out;
+			mddev->ro = 1;
+		} else {
+			bitmap_flush(mddev);
+			md_super_wait(mddev);
+			if (mddev->ro)
+				set_disk_ro(disk, 0);
+			blk_queue_make_request(mddev->queue, md_fail_request);
+			mddev->pers->stop(mddev);
+			if (mddev->pers->sync_request)
+				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
+
+			module_put(mddev->pers->owner);
+			mddev->pers = NULL;
+			if (mddev->ro)
+				mddev->ro = 0;
+		}
+		if (!mddev->in_sync) {
+			/* mark array as shutdown cleanly */
+			mddev->in_sync = 1;
+			md_update_sb(mddev);
+		}
+		if (ro)
+			set_disk_ro(disk, 1);
+	}
+
+	/*
+	 * Free resources if final stop
+	 */
+	if (!ro) {
+		mdk_rdev_t *rdev;
+		struct list_head *tmp;
+		struct gendisk *disk;
+		printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
+
+		bitmap_destroy(mddev);
+		if (mddev->bitmap_file) {
+			atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1);
+			fput(mddev->bitmap_file);
+			mddev->bitmap_file = NULL;
+		}
+		mddev->bitmap_offset = 0;
+
+		ITERATE_RDEV(mddev,rdev,tmp)
+			if (rdev->raid_disk >= 0) {
+				char nm[20];
+				sprintf(nm, "rd%d", rdev->raid_disk);
+				sysfs_remove_link(&mddev->kobj, nm);
+			}
+
+		export_array(mddev);
+
+		mddev->array_size = 0;
+		disk = mddev->gendisk;
+		if (disk)
+			set_capacity(disk, 0);
+		mddev->changed = 1;
+	} else
+		printk(KERN_INFO "md: %s switched to read-only mode.\n",
+			mdname(mddev));
+	err = 0;
+	md_new_event(mddev);
+out:
+	return err;
+}
+
+static void autorun_array(mddev_t *mddev)
+{
+	mdk_rdev_t *rdev;
+	struct list_head *tmp;
+	int err;
+
+	if (list_empty(&mddev->disks))
+		return;
+
+	printk(KERN_INFO "md: running: ");
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		char b[BDEVNAME_SIZE];
+		printk("<%s>", bdevname(rdev->bdev,b));
+	}
+	printk("\n");
+
+	err = do_md_run (mddev);
+	if (err) {
+		printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
+		do_md_stop (mddev, 0);
+	}
+}
+
+/*
+ * let's try to run arrays based on all disks that have arrived
+ * until now. (those are in pending_raid_disks)
+ *
+ * the method: pick the first pending disk, collect all disks with
+ * the same UUID, remove all from the pending list and put them into
+ * the 'same_array' list. Then order this list based on superblock
+ * update time (freshest comes first), kick out 'old' disks and
+ * compare superblocks. If everything's fine then run it.
+ *
+ * If "unit" is allocated, then bump its reference count
+ */
+static void autorun_devices(int part)
+{
+	struct list_head candidates;
+	struct list_head *tmp;
+	mdk_rdev_t *rdev0, *rdev;
+	mddev_t *mddev;
+	char b[BDEVNAME_SIZE];
+
+	printk(KERN_INFO "md: autorun ...\n");
+	while (!list_empty(&pending_raid_disks)) {
+		dev_t dev;
+		rdev0 = list_entry(pending_raid_disks.next,
+					 mdk_rdev_t, same_set);
+
+		printk(KERN_INFO "md: considering %s ...\n",
+			bdevname(rdev0->bdev,b));
+		INIT_LIST_HEAD(&candidates);
+		ITERATE_RDEV_PENDING(rdev,tmp)
+			if (super_90_load(rdev, rdev0, 0) >= 0) {
+				printk(KERN_INFO "md:  adding %s ...\n",
+					bdevname(rdev->bdev,b));
+				list_move(&rdev->same_set, &candidates);
+			}
+		/*
+		 * now we have a set of devices, with all of them having
+		 * mostly sane superblocks. It's time to allocate the
+		 * mddev.
+		 */
+		if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) {
+			printk(KERN_INFO "md: unit number in %s is bad: %d\n",
+			       bdevname(rdev0->bdev, b), rdev0->preferred_minor);
+			break;
+		}
+		if (part)
+			dev = MKDEV(mdp_major,
+				    rdev0->preferred_minor << MdpMinorShift);
+		else
+			dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
+
+		md_probe(dev, NULL, NULL);
+		mddev = mddev_find(dev);
+		if (!mddev) {
+			printk(KERN_ERR 
+				"md: cannot allocate memory for md drive.\n");
+			break;
+		}
+		if (mddev_lock(mddev)) 
+			printk(KERN_WARNING "md: %s locked, cannot run\n",
+			       mdname(mddev));
+		else if (mddev->raid_disks || mddev->major_version
+			 || !list_empty(&mddev->disks)) {
+			printk(KERN_WARNING 
+				"md: %s already running, cannot run %s\n",
+				mdname(mddev), bdevname(rdev0->bdev,b));
+			mddev_unlock(mddev);
+		} else {
+			printk(KERN_INFO "md: created %s\n", mdname(mddev));
+			ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
+				list_del_init(&rdev->same_set);
+				if (bind_rdev_to_array(rdev, mddev))
+					export_rdev(rdev);
+			}
+			autorun_array(mddev);
+			mddev_unlock(mddev);
+		}
+		/* on success, candidates will be empty, on error
+		 * it won't...
+		 */
+		ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
+			export_rdev(rdev);
+		mddev_put(mddev);
+	}
+	printk(KERN_INFO "md: ... autorun DONE.\n");
+}
+
+/*
+ * import RAID devices based on one partition
+ * if possible, the array gets run as well.
+ */
+
+static int autostart_array(dev_t startdev)
+{
+	char b[BDEVNAME_SIZE];
+	int err = -EINVAL, i;
+	mdp_super_t *sb = NULL;
+	mdk_rdev_t *start_rdev = NULL, *rdev;
+
+	start_rdev = md_import_device(startdev, 0, 0);
+	if (IS_ERR(start_rdev))
+		return err;
+
+
+	/* NOTE: this can only work for 0.90.0 superblocks */
+	sb = (mdp_super_t*)page_address(start_rdev->sb_page);
+	if (sb->major_version != 0 ||
+	    sb->minor_version != 90 ) {
+		printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
+		export_rdev(start_rdev);
+		return err;
+	}
+
+	if (test_bit(Faulty, &start_rdev->flags)) {
+		printk(KERN_WARNING 
+			"md: can not autostart based on faulty %s!\n",
+			bdevname(start_rdev->bdev,b));
+		export_rdev(start_rdev);
+		return err;
+	}
+	list_add(&start_rdev->same_set, &pending_raid_disks);
+
+	for (i = 0; i < MD_SB_DISKS; i++) {
+		mdp_disk_t *desc = sb->disks + i;
+		dev_t dev = MKDEV(desc->major, desc->minor);
+
+		if (!dev)
+			continue;
+		if (dev == startdev)
+			continue;
+		if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor)
+			continue;
+		rdev = md_import_device(dev, 0, 0);
+		if (IS_ERR(rdev))
+			continue;
+
+		list_add(&rdev->same_set, &pending_raid_disks);
+	}
+
+	/*
+	 * TODO: possibly propagate return codes from autorun_devices()
+	 */
+	autorun_devices(0);
+	return 0;
+
+}
+
+
+static int get_version(void __user * arg)
+{
+	mdu_version_t ver;
+
+	ver.major = MD_MAJOR_VERSION;
+	ver.minor = MD_MINOR_VERSION;
+	ver.patchlevel = MD_PATCHLEVEL_VERSION;
+
+	if (copy_to_user(arg, &ver, sizeof(ver)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int get_array_info(mddev_t * mddev, void __user * arg)
+{
+	mdu_array_info_t info;
+	int nr,working,active,failed,spare;
+	mdk_rdev_t *rdev;
+	struct list_head *tmp;
+
+	nr=working=active=failed=spare=0;
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		nr++;
+		if (test_bit(Faulty, &rdev->flags))
+			failed++;
+		else {
+			working++;
+			if (test_bit(In_sync, &rdev->flags))
+				active++;	
+			else
+				spare++;
+		}
+	}
+
+	info.major_version = mddev->major_version;
+	info.minor_version = mddev->minor_version;
+	info.patch_version = MD_PATCHLEVEL_VERSION;
+	info.ctime         = mddev->ctime;
+	info.level         = mddev->level;
+	info.size          = mddev->size;
+	if (info.size != mddev->size) /* overflow */
+		info.size = -1;
+	info.nr_disks      = nr;
+	info.raid_disks    = mddev->raid_disks;
+	info.md_minor      = mddev->md_minor;
+	info.not_persistent= !mddev->persistent;
+
+	info.utime         = mddev->utime;
+	info.state         = 0;
+	if (mddev->in_sync)
+		info.state = (1<<MD_SB_CLEAN);
+	if (mddev->bitmap && mddev->bitmap_offset)
+		info.state |= (1<<MD_SB_BITMAP_PRESENT);
+	info.active_disks  = active;
+	info.working_disks = working;
+	info.failed_disks  = failed;
+	info.spare_disks   = spare;
+
+	info.layout        = mddev->layout;
+	info.chunk_size    = mddev->chunk_size;
+
+	if (copy_to_user(arg, &info, sizeof(info)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int get_bitmap_file(mddev_t * mddev, void __user * arg)
+{
+	mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
+	char *ptr, *buf = NULL;
+	int err = -ENOMEM;
+
+	file = kmalloc(sizeof(*file), GFP_KERNEL);
+	if (!file)
+		goto out;
+
+	/* bitmap disabled, zero the first byte and copy out */
+	if (!mddev->bitmap || !mddev->bitmap->file) {
+		file->pathname[0] = '\0';
+		goto copy_out;
+	}
+
+	buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
+	if (!buf)
+		goto out;
+
+	ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname));
+	if (!ptr)
+		goto out;
+
+	strcpy(file->pathname, ptr);
+
+copy_out:
+	err = 0;
+	if (copy_to_user(arg, file, sizeof(*file)))
+		err = -EFAULT;
+out:
+	kfree(buf);
+	kfree(file);
+	return err;
+}
+
+static int get_disk_info(mddev_t * mddev, void __user * arg)
+{
+	mdu_disk_info_t info;
+	unsigned int nr;
+	mdk_rdev_t *rdev;
+
+	if (copy_from_user(&info, arg, sizeof(info)))
+		return -EFAULT;
+
+	nr = info.number;
+
+	rdev = find_rdev_nr(mddev, nr);
+	if (rdev) {
+		info.major = MAJOR(rdev->bdev->bd_dev);
+		info.minor = MINOR(rdev->bdev->bd_dev);
+		info.raid_disk = rdev->raid_disk;
+		info.state = 0;
+		if (test_bit(Faulty, &rdev->flags))
+			info.state |= (1<<MD_DISK_FAULTY);
+		else if (test_bit(In_sync, &rdev->flags)) {
+			info.state |= (1<<MD_DISK_ACTIVE);
+			info.state |= (1<<MD_DISK_SYNC);
+		}
+		if (test_bit(WriteMostly, &rdev->flags))
+			info.state |= (1<<MD_DISK_WRITEMOSTLY);
+	} else {
+		info.major = info.minor = 0;
+		info.raid_disk = -1;
+		info.state = (1<<MD_DISK_REMOVED);
+	}
+
+	if (copy_to_user(arg, &info, sizeof(info)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
+{
+	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
+	mdk_rdev_t *rdev;
+	dev_t dev = MKDEV(info->major,info->minor);
+
+	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
+		return -EOVERFLOW;
+
+	if (!mddev->raid_disks) {
+		int err;
+		/* expecting a device which has a superblock */
+		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
+		if (IS_ERR(rdev)) {
+			printk(KERN_WARNING 
+				"md: md_import_device returned %ld\n",
+				PTR_ERR(rdev));
+			return PTR_ERR(rdev);
+		}
+		if (!list_empty(&mddev->disks)) {
+			mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
+							mdk_rdev_t, same_set);
+			int err = super_types[mddev->major_version]
+				.load_super(rdev, rdev0, mddev->minor_version);
+			if (err < 0) {
+				printk(KERN_WARNING 
+					"md: %s has different UUID to %s\n",
+					bdevname(rdev->bdev,b), 
+					bdevname(rdev0->bdev,b2));
+				export_rdev(rdev);
+				return -EINVAL;
+			}
+		}
+		err = bind_rdev_to_array(rdev, mddev);
+		if (err)
+			export_rdev(rdev);
+		return err;
+	}
+
+	/*
+	 * add_new_disk can be used once the array is assembled
+	 * to add "hot spares".  They must already have a superblock
+	 * written
+	 */
+	if (mddev->pers) {
+		int err;
+		if (!mddev->pers->hot_add_disk) {
+			printk(KERN_WARNING 
+				"%s: personality does not support diskops!\n",
+			       mdname(mddev));
+			return -EINVAL;
+		}
+		if (mddev->persistent)
+			rdev = md_import_device(dev, mddev->major_version,
+						mddev->minor_version);
+		else
+			rdev = md_import_device(dev, -1, -1);
+		if (IS_ERR(rdev)) {
+			printk(KERN_WARNING 
+				"md: md_import_device returned %ld\n",
+				PTR_ERR(rdev));
+			return PTR_ERR(rdev);
+		}
+		/* set save_raid_disk if appropriate */
+		if (!mddev->persistent) {
+			if (info->state & (1<<MD_DISK_SYNC)  &&
+			    info->raid_disk < mddev->raid_disks)
+				rdev->raid_disk = info->raid_disk;
+			else
+				rdev->raid_disk = -1;
+		} else
+			super_types[mddev->major_version].
+				validate_super(mddev, rdev);
+		rdev->saved_raid_disk = rdev->raid_disk;
+
+		clear_bit(In_sync, &rdev->flags); /* just to be sure */
+		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
+			set_bit(WriteMostly, &rdev->flags);
+
+		rdev->raid_disk = -1;
+		err = bind_rdev_to_array(rdev, mddev);
+		if (err)
+			export_rdev(rdev);
+
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+		md_wakeup_thread(mddev->thread);
+		return err;
+	}
+
+	/* otherwise, add_new_disk is only allowed
+	 * for major_version==0 superblocks
+	 */
+	if (mddev->major_version != 0) {
+		printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
+		       mdname(mddev));
+		return -EINVAL;
+	}
+
+	if (!(info->state & (1<<MD_DISK_FAULTY))) {
+		int err;
+		rdev = md_import_device (dev, -1, 0);
+		if (IS_ERR(rdev)) {
+			printk(KERN_WARNING 
+				"md: error, md_import_device() returned %ld\n",
+				PTR_ERR(rdev));
+			return PTR_ERR(rdev);
+		}
+		rdev->desc_nr = info->number;
+		if (info->raid_disk < mddev->raid_disks)
+			rdev->raid_disk = info->raid_disk;
+		else
+			rdev->raid_disk = -1;
+
+		rdev->flags = 0;
+
+		if (rdev->raid_disk < mddev->raid_disks)
+			if (info->state & (1<<MD_DISK_SYNC))
+				set_bit(In_sync, &rdev->flags);
+
+		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
+			set_bit(WriteMostly, &rdev->flags);
+
+		if (!mddev->persistent) {
+			printk(KERN_INFO "md: nonpersistent superblock ...\n");
+			rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+		} else 
+			rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+		rdev->size = calc_dev_size(rdev, mddev->chunk_size);
+
+		err = bind_rdev_to_array(rdev, mddev);
+		if (err) {
+			export_rdev(rdev);
+			return err;
+		}
+	}
+
+	return 0;
+}
+
+static int hot_remove_disk(mddev_t * mddev, dev_t dev)
+{
+	char b[BDEVNAME_SIZE];
+	mdk_rdev_t *rdev;
+
+	if (!mddev->pers)
+		return -ENODEV;
+
+	rdev = find_rdev(mddev, dev);
+	if (!rdev)
+		return -ENXIO;
+
+	if (rdev->raid_disk >= 0)
+		goto busy;
+
+	kick_rdev_from_array(rdev);
+	md_update_sb(mddev);
+	md_new_event(mddev);
+
+	return 0;
+busy:
+	printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n",
+		bdevname(rdev->bdev,b), mdname(mddev));
+	return -EBUSY;
+}
+
+static int hot_add_disk(mddev_t * mddev, dev_t dev)
+{
+	char b[BDEVNAME_SIZE];
+	int err;
+	unsigned int size;
+	mdk_rdev_t *rdev;
+
+	if (!mddev->pers)
+		return -ENODEV;
+
+	if (mddev->major_version != 0) {
+		printk(KERN_WARNING "%s: HOT_ADD may only be used with"
+			" version-0 superblocks.\n",
+			mdname(mddev));
+		return -EINVAL;
+	}
+	if (!mddev->pers->hot_add_disk) {
+		printk(KERN_WARNING 
+			"%s: personality does not support diskops!\n",
+			mdname(mddev));
+		return -EINVAL;
+	}
+
+	rdev = md_import_device (dev, -1, 0);
+	if (IS_ERR(rdev)) {
+		printk(KERN_WARNING 
+			"md: error, md_import_device() returned %ld\n",
+			PTR_ERR(rdev));
+		return -EINVAL;
+	}
+
+	if (mddev->persistent)
+		rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+	else
+		rdev->sb_offset =
+			rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+
+	size = calc_dev_size(rdev, mddev->chunk_size);
+	rdev->size = size;
+
+	if (test_bit(Faulty, &rdev->flags)) {
+		printk(KERN_WARNING 
+			"md: can not hot-add faulty %s disk to %s!\n",
+			bdevname(rdev->bdev,b), mdname(mddev));
+		err = -EINVAL;
+		goto abort_export;
+	}
+	clear_bit(In_sync, &rdev->flags);
+	rdev->desc_nr = -1;
+	err = bind_rdev_to_array(rdev, mddev);
+	if (err)
+		goto abort_export;
+
+	/*
+	 * The rest should better be atomic, we can have disk failures
+	 * noticed in interrupt contexts ...
+	 */
+
+	if (rdev->desc_nr == mddev->max_disks) {
+		printk(KERN_WARNING "%s: can not hot-add to full array!\n",
+			mdname(mddev));
+		err = -EBUSY;
+		goto abort_unbind_export;
+	}
+
+	rdev->raid_disk = -1;
+
+	md_update_sb(mddev);
+
+	/*
+	 * Kick recovery, maybe this spare has to be added to the
+	 * array immediately.
+	 */
+	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	md_wakeup_thread(mddev->thread);
+	md_new_event(mddev);
+	return 0;
+
+abort_unbind_export:
+	unbind_rdev_from_array(rdev);
+
+abort_export:
+	export_rdev(rdev);
+	return err;
+}
+
+/* similar to deny_write_access, but accounts for our holding a reference
+ * to the file ourselves */
+static int deny_bitmap_write_access(struct file * file)
+{
+	struct inode *inode = file->f_mapping->host;
+
+	spin_lock(&inode->i_lock);
+	if (atomic_read(&inode->i_writecount) > 1) {
+		spin_unlock(&inode->i_lock);
+		return -ETXTBSY;
+	}
+	atomic_set(&inode->i_writecount, -1);
+	spin_unlock(&inode->i_lock);
+
+	return 0;
+}
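+/*
+ * Note (a fact about the VFS, not part of this patch): setting
+ * i_writecount to -1 mirrors what deny_write_access() does for
+ * executables; any later get_write_access() on the inode fails with
+ * -ETXTBSY until the count is restored.
+ */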
+
+static int set_bitmap_file(mddev_t *mddev, int fd)
+{
+	int err;
+
+	if (mddev->pers) {
+		if (!mddev->pers->quiesce)
+			return -EBUSY;
+		if (mddev->recovery || mddev->sync_thread)
+			return -EBUSY;
+		/* we should be able to change the bitmap.. */
+	}
+
+
+	if (fd >= 0) {
+		if (mddev->bitmap)
+			return -EEXIST; /* cannot add when bitmap is present */
+		mddev->bitmap_file = fget(fd);
+
+		if (mddev->bitmap_file == NULL) {
+			printk(KERN_ERR "%s: error: failed to get bitmap file\n",
+			       mdname(mddev));
+			return -EBADF;
+		}
+
+		err = deny_bitmap_write_access(mddev->bitmap_file);
+		if (err) {
+			printk(KERN_ERR "%s: error: bitmap file is already in use\n",
+			       mdname(mddev));
+			fput(mddev->bitmap_file);
+			mddev->bitmap_file = NULL;
+			return err;
+		}
+		mddev->bitmap_offset = 0; /* file overrides offset */
+	} else if (mddev->bitmap == NULL)
+		return -ENOENT; /* cannot remove what isn't there */
+	err = 0;
+	if (mddev->pers) {
+		mddev->pers->quiesce(mddev, 1);
+		if (fd >= 0)
+			err = bitmap_create(mddev);
+		if (fd < 0 || err)
+			bitmap_destroy(mddev);
+		mddev->pers->quiesce(mddev, 0);
+	} else if (fd < 0) {
+		if (mddev->bitmap_file)
+			fput(mddev->bitmap_file);
+		mddev->bitmap_file = NULL;
+	}
+
+	return err;
+}
+
+/*
+ * set_array_info is used in two different ways
+ * The original usage is when creating a new array.
+ * In this usage, raid_disks is > 0 and it together with
+ *  level, size, not_persistent, layout, chunksize determine the
+ *  shape of the array.
+ *  This will always create an array with a type-0.90.0 superblock.
+ * The newer usage is when assembling an array.
+ *  In this case raid_disks will be 0, and the major_version field is
+ *  used to determine which style super-blocks are to be found on the devices.
+ *  The minor and patch _version numbers are also kept in case the
+ *  super_block handler wishes to interpret them.
+ */
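+/*
+ * Illustrative sketch (hypothetical values, not taken from this patch):
+ * to create a 3-disk RAID5, userspace would pass raid_disks=3, level=5,
+ * size=0 (use the maximum), chunk_size=65536 and not_persistent=0;
+ * to merely assemble an existing array it would pass raid_disks=0
+ * with, say, major_version=0 and minor_version=90.
+ */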
+static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
+{
+
+	if (info->raid_disks == 0) {
+		/* just setting version number for superblock loading */
+		if (info->major_version < 0 ||
+		    info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
+		    super_types[info->major_version].name == NULL) {
+			/* maybe try to auto-load a module? */
+			printk(KERN_INFO 
+				"md: superblock version %d not known\n",
+				info->major_version);
+			return -EINVAL;
+		}
+		mddev->major_version = info->major_version;
+		mddev->minor_version = info->minor_version;
+		mddev->patch_version = info->patch_version;
+		return 0;
+	}
+	mddev->major_version = MD_MAJOR_VERSION;
+	mddev->minor_version = MD_MINOR_VERSION;
+	mddev->patch_version = MD_PATCHLEVEL_VERSION;
+	mddev->ctime         = get_seconds();
+
+	mddev->level         = info->level;
+	mddev->clevel[0]     = 0;
+	mddev->size          = info->size;
+	mddev->raid_disks    = info->raid_disks;
+	/* don't set md_minor, it is determined by which /dev/md* was
+	 * opened
+	 */
+	if (info->state & (1<<MD_SB_CLEAN))
+		mddev->recovery_cp = MaxSector;
+	else
+		mddev->recovery_cp = 0;
+	mddev->persistent    = ! info->not_persistent;
+
+	mddev->layout        = info->layout;
+	mddev->chunk_size    = info->chunk_size;
+
+	mddev->max_disks     = MD_SB_DISKS;
+
+	mddev->sb_dirty      = 1;
+
+	mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
+	mddev->bitmap_offset = 0;
+
+	/*
+	 * Generate a 128 bit UUID
+	 */
+	get_random_bytes(mddev->uuid, 16);
+
+	return 0;
+}
+
+static int update_size(mddev_t *mddev, unsigned long size)
+{
+	mdk_rdev_t * rdev;
+	int rv;
+	struct list_head *tmp;
+
+	if (mddev->pers->resize == NULL)
+		return -EINVAL;
+	/* The "size" is the amount of each device that is used.
+	 * This can only make sense for arrays with redundancy.
+	 * linear and raid0 always use whatever space is available
+	 * We can only consider changing the size if no resync
+	 * or reconstruction is happening, and if the new size
+	 * is acceptable. It must fit before the sb_offset or,
+	 * if that is <data_offset, it must fit before the
+	 * size of each device.
+	 * If size is zero, we find the largest size that fits.
+	 */
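+	/*
+	 * Illustrative example (hypothetical numbers): with size == 0 and
+	 * a first member offering 400000 free sectors, size becomes
+	 * 400000/2 = 200000 (KB); any later member with fewer than 400000
+	 * sectors available then makes the loop return -ENOSPC.
+	 */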
+	if (mddev->sync_thread)
+		return -EBUSY;
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		sector_t avail;
+		int fit = (size == 0);
+		if (rdev->sb_offset > rdev->data_offset)
+			avail = (rdev->sb_offset*2) - rdev->data_offset;
+		else
+			avail = get_capacity(rdev->bdev->bd_disk)
+				- rdev->data_offset;
+		if (fit && (size == 0 || size > avail/2))
+			size = avail/2;
+		if (avail < ((sector_t)size << 1))
+			return -ENOSPC;
+	}
+	rv = mddev->pers->resize(mddev, (sector_t)size *2);
+	if (!rv) {
+		struct block_device *bdev;
+
+		bdev = bdget_disk(mddev->gendisk, 0);
+		if (bdev) {
+			mutex_lock(&bdev->bd_inode->i_mutex);
+			i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10);
+			mutex_unlock(&bdev->bd_inode->i_mutex);
+			bdput(bdev);
+		}
+	}
+	return rv;
+}
+
+static int update_raid_disks(mddev_t *mddev, int raid_disks)
+{
+	int rv;
+	/* change the number of raid disks */
+	if (mddev->pers->reshape == NULL)
+		return -EINVAL;
+	if (raid_disks <= 0 ||
+	    raid_disks >= mddev->max_disks)
+		return -EINVAL;
+	if (mddev->sync_thread)
+		return -EBUSY;
+	rv = mddev->pers->reshape(mddev, raid_disks);
+	return rv;
+}
+
+
+/*
+ * update_array_info is used to change the configuration of an
+ * on-line array.
+ * The version, ctime, level, size, raid_disks, not_persistent, layout
+ * and chunk_size fields in the info are checked against the array.
+ * Any differences that cannot be handled will cause an error.
+ * Normally, only one change can be managed at a time.
+ */
+static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
+{
+	int rv = 0;
+	int cnt = 0;
+	int state = 0;
+
+	/* calculate expected state, ignoring low bits */
+	if (mddev->bitmap && mddev->bitmap_offset)
+		state |= (1 << MD_SB_BITMAP_PRESENT);
+
+	if (mddev->major_version != info->major_version ||
+	    mddev->minor_version != info->minor_version ||
+/*	    mddev->patch_version != info->patch_version || */
+	    mddev->ctime         != info->ctime         ||
+	    mddev->level         != info->level         ||
+/*	    mddev->layout        != info->layout        || */
+	    !mddev->persistent	 != info->not_persistent||
+	    mddev->chunk_size    != info->chunk_size    ||
+	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
+	    ((state^info->state) & 0xfffffe00)
+		)
+		return -EINVAL;
+	/* Check there is only one change */
+	if (info->size >= 0 && mddev->size != info->size) cnt++;
+	if (mddev->raid_disks != info->raid_disks) cnt++;
+	if (mddev->layout != info->layout) cnt++;
+	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++;
+	if (cnt == 0) return 0;
+	if (cnt > 1) return -EINVAL;
+
+	if (mddev->layout != info->layout) {
+		/* Change layout
+		 * we don't need to do anything at the md level, the
+		 * personality will take care of it all.
+		 */
+		if (mddev->pers->reconfig == NULL)
+			return -EINVAL;
+		else
+			return mddev->pers->reconfig(mddev, info->layout, -1);
+	}
+	if (info->size >= 0 && mddev->size != info->size)
+		rv = update_size(mddev, info->size);
+
+	if (mddev->raid_disks    != info->raid_disks)
+		rv = update_raid_disks(mddev, info->raid_disks);
+
+	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
+		if (mddev->pers->quiesce == NULL)
+			return -EINVAL;
+		if (mddev->recovery || mddev->sync_thread)
+			return -EBUSY;
+		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
+			/* add the bitmap */
+			if (mddev->bitmap)
+				return -EEXIST;
+			if (mddev->default_bitmap_offset == 0)
+				return -EINVAL;
+			mddev->bitmap_offset = mddev->default_bitmap_offset;
+			mddev->pers->quiesce(mddev, 1);
+			rv = bitmap_create(mddev);
+			if (rv)
+				bitmap_destroy(mddev);
+			mddev->pers->quiesce(mddev, 0);
+		} else {
+			/* remove the bitmap */
+			if (!mddev->bitmap)
+				return -ENOENT;
+			if (mddev->bitmap->file)
+				return -EINVAL;
+			mddev->pers->quiesce(mddev, 1);
+			bitmap_destroy(mddev);
+			mddev->pers->quiesce(mddev, 0);
+			mddev->bitmap_offset = 0;
+		}
+	}
+	md_update_sb(mddev);
+	return rv;
+}
+
+static int set_disk_faulty(mddev_t *mddev, dev_t dev)
+{
+	mdk_rdev_t *rdev;
+
+	if (mddev->pers == NULL)
+		return -ENODEV;
+
+	rdev = find_rdev(mddev, dev);
+	if (!rdev)
+		return -ENODEV;
+
+	md_error(mddev, rdev);
+	return 0;
+}
+
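+/*
+ * Worked example (illustrative numbers only): an array of 8388608
+ * sectors (4GiB) reports heads=2, sectors=4 and
+ * cylinders = 8388608 / 8 = 1048576, so C*H*S reproduces the real
+ * capacity while keeping the fake heads and sectors trivially small.
+ */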
+static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	mddev_t *mddev = bdev->bd_disk->private_data;
+
+	geo->heads = 2;
+	geo->sectors = 4;
+	geo->cylinders = get_capacity(mddev->gendisk) / 8;
+	return 0;
+}
+
+static int md_ioctl(struct inode *inode, struct file *file,
+			unsigned int cmd, unsigned long arg)
+{
+	int err = 0;
+	void __user *argp = (void __user *)arg;
+	mddev_t *mddev = NULL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	/*
+	 * Commands dealing with the RAID driver but not any
+	 * particular array:
+	 */
+	switch (cmd)
+	{
+		case RAID_VERSION:
+			err = get_version(argp);
+			goto done;
+
+		case PRINT_RAID_DEBUG:
+			err = 0;
+			md_print_devices();
+			goto done;
+
+#ifndef MODULE
+		case RAID_AUTORUN:
+			err = 0;
+			autostart_arrays(arg);
+			goto done;
+#endif
+		default:;
+	}
+
+	/*
+	 * Commands creating/starting a new array:
+	 */
+
+	mddev = inode->i_bdev->bd_disk->private_data;
+
+	if (!mddev) {
+		BUG();
+		goto abort;
+	}
+
+
+	if (cmd == START_ARRAY) {
+		/* START_ARRAY doesn't need to lock the array as autostart_array
+		 * does the locking, and it could even be a different array
+		 */
+		static int cnt = 3;
+		if (cnt > 0 ) {
+			printk(KERN_WARNING
+			       "md: %s(pid %d) used deprecated START_ARRAY ioctl. "
+			       "This will not be supported beyond July 2006\n",
+			       current->comm, current->pid);
+			cnt--;
+		}
+		err = autostart_array(new_decode_dev(arg));
+		if (err) {
+			printk(KERN_WARNING "md: autostart failed!\n");
+			goto abort;
+		}
+		goto done;
+	}
+
+	err = mddev_lock(mddev);
+	if (err) {
+		printk(KERN_INFO 
+			"md: ioctl lock interrupted, reason %d, cmd %d\n",
+			err, cmd);
+		goto abort;
+	}
+
+	switch (cmd)
+	{
+		case SET_ARRAY_INFO:
+			{
+				mdu_array_info_t info;
+				if (!arg)
+					memset(&info, 0, sizeof(info));
+				else if (copy_from_user(&info, argp, sizeof(info))) {
+					err = -EFAULT;
+					goto abort_unlock;
+				}
+				if (mddev->pers) {
+					err = update_array_info(mddev, &info);
+					if (err) {
+						printk(KERN_WARNING "md: couldn't update"
+						       " array info. %d\n", err);
+						goto abort_unlock;
+					}
+					goto done_unlock;
+				}
+				if (!list_empty(&mddev->disks)) {
+					printk(KERN_WARNING
+					       "md: array %s already has disks!\n",
+					       mdname(mddev));
+					err = -EBUSY;
+					goto abort_unlock;
+				}
+				if (mddev->raid_disks) {
+					printk(KERN_WARNING
+					       "md: array %s already initialised!\n",
+					       mdname(mddev));
+					err = -EBUSY;
+					goto abort_unlock;
+				}
+				err = set_array_info(mddev, &info);
+				if (err) {
+					printk(KERN_WARNING "md: couldn't set"
+					       " array info. %d\n", err);
+					goto abort_unlock;
+				}
+			}
+			goto done_unlock;
+
+		default:;
+	}
+
+	/*
+	 * Commands querying/configuring an existing array:
+	 */
+	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
+	 * RUN_ARRAY, and SET_BITMAP_FILE are allowed */
+	if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
+			&& cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) {
+		err = -ENODEV;
+		goto abort_unlock;
+	}
+
+	/*
+	 * Commands even a read-only array can execute:
+	 */
+	switch (cmd)
+	{
+		case GET_ARRAY_INFO:
+			err = get_array_info(mddev, argp);
+			goto done_unlock;
+
+		case GET_BITMAP_FILE:
+			err = get_bitmap_file(mddev, argp);
+			goto done_unlock;
+
+		case GET_DISK_INFO:
+			err = get_disk_info(mddev, argp);
+			goto done_unlock;
+
+		case RESTART_ARRAY_RW:
+			err = restart_array(mddev);
+			goto done_unlock;
+
+		case STOP_ARRAY:
+			err = do_md_stop (mddev, 0);
+			goto done_unlock;
+
+		case STOP_ARRAY_RO:
+			err = do_md_stop (mddev, 1);
+			goto done_unlock;
+
+	/*
+	 * We have a problem here: there is no easy way to give a CHS
+	 * virtual geometry. We currently pretend that we have 2 heads and
+	 * 4 sectors (with a BIG number of cylinders...). This drives
+	 * dosfs just mad... ;-)
+	 */
+	}
+
+	/*
+	 * The remaining ioctls are changing the state of the
+	 * superblock, so we do not allow them on read-only arrays.
+	 * However non-MD ioctls (e.g. get-size) will still come through
+	 * here and hit the 'default' below, so only disallow
+	 * 'md' ioctls, and switch to rw mode if started auto-readonly.
+	 */
+	if (_IOC_TYPE(cmd) == MD_MAJOR &&
+	    mddev->ro && mddev->pers) {
+		if (mddev->ro == 2) {
+			mddev->ro = 0;
+			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+			md_wakeup_thread(mddev->thread);
+		} else {
+			err = -EROFS;
+			goto abort_unlock;
+		}
+	}
+
+	switch (cmd)
+	{
+		case ADD_NEW_DISK:
+		{
+			mdu_disk_info_t info;
+			if (copy_from_user(&info, argp, sizeof(info)))
+				err = -EFAULT;
+			else
+				err = add_new_disk(mddev, &info);
+			goto done_unlock;
+		}
+
+		case HOT_REMOVE_DISK:
+			err = hot_remove_disk(mddev, new_decode_dev(arg));
+			goto done_unlock;
+
+		case HOT_ADD_DISK:
+			err = hot_add_disk(mddev, new_decode_dev(arg));
+			goto done_unlock;
+
+		case SET_DISK_FAULTY:
+			err = set_disk_faulty(mddev, new_decode_dev(arg));
+			goto done_unlock;
+
+		case RUN_ARRAY:
+			err = do_md_run (mddev);
+			goto done_unlock;
+
+		case SET_BITMAP_FILE:
+			err = set_bitmap_file(mddev, (int)arg);
+			goto done_unlock;
+
+		default:
+			if (_IOC_TYPE(cmd) == MD_MAJOR)
+				printk(KERN_WARNING "md: %s(pid %d) used"
+					" obsolete MD ioctl, upgrade your"
+					" software to use new ictls.\n",
+					current->comm, current->pid);
+			err = -EINVAL;
+			goto abort_unlock;
+	}
+
+done_unlock:
+abort_unlock:
+	mddev_unlock(mddev);
+
+	return err;
+done:
+	if (err)
+		MD_BUG();
+abort:
+	return err;
+}
+
+static int md_open(struct inode *inode, struct file *file)
+{
+	/*
+	 * Succeed if we can lock the mddev, which confirms that
+	 * it isn't being stopped right now.
+	 */
+	mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
+	int err;
+
+	if ((err = mddev_lock(mddev)))
+		goto out;
+
+	err = 0;
+	mddev_get(mddev);
+	mddev_unlock(mddev);
+
+	check_disk_change(inode->i_bdev);
+ out:
+	return err;
+}
+
+static int md_release(struct inode *inode, struct file * file)
+{
+ 	mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
+
+	if (!mddev)
+		BUG();
+	mddev_put(mddev);
+
+	return 0;
+}
+
+static int md_media_changed(struct gendisk *disk)
+{
+	mddev_t *mddev = disk->private_data;
+
+	return mddev->changed;
+}
+
+static int md_revalidate(struct gendisk *disk)
+{
+	mddev_t *mddev = disk->private_data;
+
+	mddev->changed = 0;
+	return 0;
+}
+static struct block_device_operations md_fops =
+{
+	.owner		= THIS_MODULE,
+	.open		= md_open,
+	.release	= md_release,
+	.ioctl		= md_ioctl,
+	.getgeo		= md_getgeo,
+	.media_changed	= md_media_changed,
+	.revalidate_disk= md_revalidate,
+};
+
+static int md_thread(void * arg)
+{
+	mdk_thread_t *thread = arg;
+
+	/*
+	 * md_thread is a 'system-thread', its priority should be very
+	 * high. We avoid resource deadlocks individually in each
+	 * raid personality. (RAID5 does preallocation) We also use RR and
+	 * the very same RT priority as kswapd, thus we will never get
+	 * into a priority inversion deadlock.
+	 *
+	 * we definitely have to have equal or higher priority than
+	 * bdflush, otherwise bdflush will deadlock if there are too
+	 * many dirty RAID5 blocks.
+	 */
+
+	allow_signal(SIGKILL);
+	while (!kthread_should_stop()) {
+
+		/* We need to wait INTERRUPTIBLE so that
+		 * we don't add to the load-average.
+		 * That means we need to be sure no signals are
+		 * pending
+		 */
+		if (signal_pending(current))
+			flush_signals(current);
+
+		wait_event_interruptible_timeout
+			(thread->wqueue,
+			 test_bit(THREAD_WAKEUP, &thread->flags)
+			 || kthread_should_stop(),
+			 thread->timeout);
+		try_to_freeze();
+
+		clear_bit(THREAD_WAKEUP, &thread->flags);
+
+		thread->run(thread->mddev);
+	}
+
+	return 0;
+}
+
+void md_wakeup_thread(mdk_thread_t *thread)
+{
+	if (thread) {
+		dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
+		set_bit(THREAD_WAKEUP, &thread->flags);
+		wake_up(&thread->wqueue);
+	}
+}
+
+mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
+				 const char *name)
+{
+	mdk_thread_t *thread;
+
+	thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
+	if (!thread)
+		return NULL;
+
+	init_waitqueue_head(&thread->wqueue);
+
+	thread->run = run;
+	thread->mddev = mddev;
+	thread->timeout = MAX_SCHEDULE_TIMEOUT;
+	thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
+	if (IS_ERR(thread->tsk)) {
+		kfree(thread);
+		return NULL;
+	}
+	return thread;
+}
+
+void md_unregister_thread(mdk_thread_t *thread)
+{
+	dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
+
+	kthread_stop(thread->tsk);
+	kfree(thread);
+}
+
+void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	if (!mddev) {
+		MD_BUG();
+		return;
+	}
+
+	if (!rdev || test_bit(Faulty, &rdev->flags))
+		return;
+/*
+	dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
+		mdname(mddev),
+		MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
+		__builtin_return_address(0),__builtin_return_address(1),
+		__builtin_return_address(2),__builtin_return_address(3));
+*/
+	if (!mddev->pers->error_handler)
+		return;
+	mddev->pers->error_handler(mddev,rdev);
+	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	md_wakeup_thread(mddev->thread);
+	md_new_event(mddev);
+}
+
+/* seq_file implementation /proc/mdstat */
+
+static void status_unused(struct seq_file *seq)
+{
+	int i = 0;
+	mdk_rdev_t *rdev;
+	struct list_head *tmp;
+
+	seq_printf(seq, "unused devices: ");
+
+	ITERATE_RDEV_PENDING(rdev,tmp) {
+		char b[BDEVNAME_SIZE];
+		i++;
+		seq_printf(seq, "%s ",
+			      bdevname(rdev->bdev,b));
+	}
+	if (!i)
+		seq_printf(seq, "<none>");
+
+	seq_printf(seq, "\n");
+}
+
+
+static void status_resync(struct seq_file *seq, mddev_t * mddev)
+{
+	unsigned long max_blocks, resync, res, dt, db, rt;
+
+	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
+
+	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+		max_blocks = mddev->resync_max_sectors >> 1;
+	else
+		max_blocks = mddev->size;
+
+	/*
+	 * Should not happen.
+	 */
+	if (!max_blocks) {
+		MD_BUG();
+		return;
+	}
+	res = (resync/1024)*1000/(max_blocks/1024 + 1);
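+	/*
+	 * Illustrative example: resync = 250000 and max_blocks = 1000000
+	 * give res = 244*1000/977 = 249, i.e. 24.9%, and x = 249/50 = 4
+	 * '=' characters in the 20-slot progress bar drawn below.
+	 */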
+	{
+		int i, x = res/50, y = 20-x;
+		seq_printf(seq, "[");
+		for (i = 0; i < x; i++)
+			seq_printf(seq, "=");
+		seq_printf(seq, ">");
+		for (i = 0; i < y; i++)
+			seq_printf(seq, ".");
+		seq_printf(seq, "] ");
+	}
+	seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)",
+		      (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
+		       "resync" : "recovery"),
+		      res/10, res % 10, resync, max_blocks);
+
+	/*
+	 * We do not want to overflow, so the order of operands and
+	 * the * 100 / 100 trick are important. We do a +1 to be
+	 * safe against division by zero. We only estimate anyway.
+	 *
+	 * dt: time from mark until now
+	 * db: blocks written from mark until now
+	 * rt: remaining time
+	 */
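+	/*
+	 * Worked example (illustrative numbers only): dt = 30s,
+	 * db = 10000 blocks and 1000000 blocks still to go give
+	 * rt = (30 * (1000000 / 101)) / 100 = 2970s, printed below
+	 * as finish=49.5min.
+	 */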
+	dt = ((jiffies - mddev->resync_mark) / HZ);
+	if (!dt) dt++;
+	db = resync - (mddev->resync_mark_cnt/2);
+	rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
+
+	seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
+
+	seq_printf(seq, " speed=%ldK/sec", db/dt);
+}
+
+static void *md_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct list_head *tmp;
+	loff_t l = *pos;
+	mddev_t *mddev;
+
+	if (l >= 0x10000)
+		return NULL;
+	if (!l--)
+		/* header */
+		return (void*)1;
+
+	spin_lock(&all_mddevs_lock);
+	list_for_each(tmp,&all_mddevs)
+		if (!l--) {
+			mddev = list_entry(tmp, mddev_t, all_mddevs);
+			mddev_get(mddev);
+			spin_unlock(&all_mddevs_lock);
+			return mddev;
+		}
+	spin_unlock(&all_mddevs_lock);
+	if (!l--)
+		return (void*)2; /* tail */
+	return NULL;
+}
+
+static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct list_head *tmp;
+	mddev_t *next_mddev, *mddev = v;
+	
+	++*pos;
+	if (v == (void*)2)
+		return NULL;
+
+	spin_lock(&all_mddevs_lock);
+	if (v == (void*)1)
+		tmp = all_mddevs.next;
+	else
+		tmp = mddev->all_mddevs.next;
+	if (tmp != &all_mddevs)
+		next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
+	else {
+		next_mddev = (void*)2;
+		*pos = 0x10000;
+	}		
+	spin_unlock(&all_mddevs_lock);
+
+	if (v != (void*)1)
+		mddev_put(mddev);
+	return next_mddev;
+
+}
+
+static void md_seq_stop(struct seq_file *seq, void *v)
+{
+	mddev_t *mddev = v;
+
+	if (mddev && v != (void*)1 && v != (void*)2)
+		mddev_put(mddev);
+}
+
+struct mdstat_info {
+	int event;
+};
+
+static int md_seq_show(struct seq_file *seq, void *v)
+{
+	mddev_t *mddev = v;
+	sector_t size;
+	struct list_head *tmp2;
+	mdk_rdev_t *rdev;
+	struct mdstat_info *mi = seq->private;
+	struct bitmap *bitmap;
+
+	if (v == (void*)1) {
+		struct mdk_personality *pers;
+		seq_printf(seq, "Personalities : ");
+		spin_lock(&pers_lock);
+		list_for_each_entry(pers, &pers_list, list)
+			seq_printf(seq, "[%s] ", pers->name);
+
+		spin_unlock(&pers_lock);
+		seq_printf(seq, "\n");
+		mi->event = atomic_read(&md_event_count);
+		return 0;
+	}
+	if (v == (void*)2) {
+		status_unused(seq);
+		return 0;
+	}
+
+	if (mddev_lock(mddev)!=0) 
+		return -EINTR;
+	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
+		seq_printf(seq, "%s : %sactive", mdname(mddev),
+						mddev->pers ? "" : "in");
+		if (mddev->pers) {
+			if (mddev->ro==1)
+				seq_printf(seq, " (read-only)");
+			if (mddev->ro==2)
+				seq_printf(seq, "(auto-read-only)");
+			seq_printf(seq, " %s", mddev->pers->name);
+		}
+
+		size = 0;
+		ITERATE_RDEV(mddev,rdev,tmp2) {
+			char b[BDEVNAME_SIZE];
+			seq_printf(seq, " %s[%d]",
+				bdevname(rdev->bdev,b), rdev->desc_nr);
+			if (test_bit(WriteMostly, &rdev->flags))
+				seq_printf(seq, "(W)");
+			if (test_bit(Faulty, &rdev->flags)) {
+				seq_printf(seq, "(F)");
+				continue;
+			} else if (rdev->raid_disk < 0)
+				seq_printf(seq, "(S)"); /* spare */
+			size += rdev->size;
+		}
+
+		if (!list_empty(&mddev->disks)) {
+			if (mddev->pers)
+				seq_printf(seq, "\n      %llu blocks",
+					(unsigned long long)mddev->array_size);
+			else
+				seq_printf(seq, "\n      %llu blocks",
+					(unsigned long long)size);
+		}
+		if (mddev->persistent) {
+			if (mddev->major_version != 0 ||
+			    mddev->minor_version != 90) {
+				seq_printf(seq," super %d.%d",
+					   mddev->major_version,
+					   mddev->minor_version);
+			}
+		} else
+			seq_printf(seq, " super non-persistent");
+
+		if (mddev->pers) {
+			mddev->pers->status (seq, mddev);
+	 		seq_printf(seq, "\n      ");
+			if (mddev->pers->sync_request) {
+				if (mddev->curr_resync > 2) {
+					status_resync (seq, mddev);
+					seq_printf(seq, "\n      ");
+				} else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
+					seq_printf(seq, "\tresync=DELAYED\n      ");
+				else if (mddev->recovery_cp < MaxSector)
+					seq_printf(seq, "\tresync=PENDING\n      ");
+			}
+		} else
+			seq_printf(seq, "\n       ");
+
+		if ((bitmap = mddev->bitmap)) {
+			unsigned long chunk_kb;
+			unsigned long flags;
+			spin_lock_irqsave(&bitmap->lock, flags);
+			chunk_kb = bitmap->chunksize >> 10;
+			seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
+				"%lu%s chunk",
+				bitmap->pages - bitmap->missing_pages,
+				bitmap->pages,
+				(bitmap->pages - bitmap->missing_pages)
+					<< (PAGE_SHIFT - 10),
+				chunk_kb ? chunk_kb : bitmap->chunksize,
+				chunk_kb ? "KB" : "B");
+			if (bitmap->file) {
+				seq_printf(seq, ", file: ");
+				seq_path(seq, bitmap->file->f_vfsmnt,
+					 bitmap->file->f_dentry," \t\n");
+			}
+
+			seq_printf(seq, "\n");
+			spin_unlock_irqrestore(&bitmap->lock, flags);
+		}
+
+		seq_printf(seq, "\n");
+	}
+	mddev_unlock(mddev);
+	
+	return 0;
+}
+
+static struct seq_operations md_seq_ops = {
+	.start  = md_seq_start,
+	.next   = md_seq_next,
+	.stop   = md_seq_stop,
+	.show   = md_seq_show,
+};
+
+static int md_seq_open(struct inode *inode, struct file *file)
+{
+	int error;
+	struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
+	if (mi == NULL)
+		return -ENOMEM;
+
+	error = seq_open(file, &md_seq_ops);
+	if (error)
+		kfree(mi);
+	else {
+		struct seq_file *p = file->private_data;
+		p->private = mi;
+		mi->event = atomic_read(&md_event_count);
+	}
+	return error;
+}
+
+static int md_seq_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+	struct mdstat_info *mi = m->private;
+	m->private = NULL;
+	kfree(mi);
+	return seq_release(inode, file);
+}
+
+static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
+{
+	struct seq_file *m = filp->private_data;
+	struct mdstat_info *mi = m->private;
+	int mask;
+
+	poll_wait(filp, &md_event_waiters, wait);
+
+	/* always allow read */
+	mask = POLLIN | POLLRDNORM;
+
+	if (mi->event != atomic_read(&md_event_count))
+		mask |= POLLERR | POLLPRI;
+	return mask;
+}
+
+static struct file_operations md_seq_fops = {
+	.open           = md_seq_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release	= md_seq_release,
+	.poll		= mdstat_poll,
+};
+
+int register_md_personality(struct mdk_personality *p)
+{
+	spin_lock(&pers_lock);
+	list_add_tail(&p->list, &pers_list);
+	printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
+	spin_unlock(&pers_lock);
+	return 0;
+}
+
+int unregister_md_personality(struct mdk_personality *p)
+{
+	printk(KERN_INFO "md: %s personality unregistered\n", p->name);
+	spin_lock(&pers_lock);
+	list_del_init(&p->list);
+	spin_unlock(&pers_lock);
+	return 0;
+}
+
+static int is_mddev_idle(mddev_t *mddev)
+{
+	mdk_rdev_t * rdev;
+	struct list_head *tmp;
+	int idle;
+	unsigned long curr_events;
+
+	idle = 1;
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
+		curr_events = disk_stat_read(disk, sectors[0]) + 
+				disk_stat_read(disk, sectors[1]) - 
+				atomic_read(&disk->sync_io);
+		/* The difference between curr_events and last_events
+		 * will be affected by any new non-sync IO (making
+		 * curr_events bigger) and any difference in the amount of
+		 * in-flight syncio (making curr_events bigger or smaller)
+		 * The amount in-flight is currently limited to
+		 * 32*64K in raid1/10 and 256*PAGE_SIZE in raid5/6
+		 * which is at most 4096 sectors.
+		 * These numbers are fairly fragile and should be made
+		 * more robust, probably by enforcing the
+		 * 'window size' that md_do_sync sort-of uses.
+		 *
+		 * Note: the following is an unsigned comparison.
+		 */
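+		/*
+		 * Illustrative example: a burst of 5000 sectors of new
+		 * non-sync IO gives 5000 + 4096 = 9096 > 8192, so the
+		 * array is counted busy, while pure sync-IO drift of up
+		 * to +/-4096 sectors stays inside the window.
+		 */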
+		if ((curr_events - rdev->last_events + 4096) > 8192) {
+			rdev->last_events = curr_events;
+			idle = 0;
+		}
+	}
+	return idle;
+}
+
+void md_done_sync(mddev_t *mddev, int blocks, int ok)
+{
+	/* another "blocks" (512byte) blocks have been synced */
+	atomic_sub(blocks, &mddev->recovery_active);
+	wake_up(&mddev->recovery_wait);
+	if (!ok) {
+		set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+		md_wakeup_thread(mddev->thread);
+		/* stop recovery, signal do_sync ... */
+	}
+}
+
+
+/* md_write_start(mddev, bi)
+ * If we need to update some array metadata (e.g. 'active' flag
+ * in superblock) before writing, schedule a superblock update
+ * and wait for it to complete.
+ */
+void md_write_start(mddev_t *mddev, struct bio *bi)
+{
+	if (bio_data_dir(bi) != WRITE)
+		return;
+
+	BUG_ON(mddev->ro == 1);
+	if (mddev->ro == 2) {
+		/* need to switch to read/write */
+		mddev->ro = 0;
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+		md_wakeup_thread(mddev->thread);
+	}
+	atomic_inc(&mddev->writes_pending);
+	if (mddev->in_sync) {
+		spin_lock_irq(&mddev->write_lock);
+		if (mddev->in_sync) {
+			mddev->in_sync = 0;
+			mddev->sb_dirty = 1;
+			md_wakeup_thread(mddev->thread);
+		}
+		spin_unlock_irq(&mddev->write_lock);
+	}
+	wait_event(mddev->sb_wait, mddev->sb_dirty==0);
+}
+
+void md_write_end(mddev_t *mddev)
+{
+	if (atomic_dec_and_test(&mddev->writes_pending)) {
+		if (mddev->safemode == 2)
+			md_wakeup_thread(mddev->thread);
+		else
+			mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
+	}
+}
+
+static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+
+#define SYNC_MARKS	10
+#define	SYNC_MARK_STEP	(3*HZ)
+static void md_do_sync(mddev_t *mddev)
+{
+	mddev_t *mddev2;
+	unsigned int currspeed = 0,
+		 window;
+	sector_t max_sectors,j, io_sectors;
+	unsigned long mark[SYNC_MARKS];
+	sector_t mark_cnt[SYNC_MARKS];
+	int last_mark,m;
+	struct list_head *tmp;
+	sector_t last_check;
+	int skipped = 0;
+
+	/* just in case thread restarts... */
+	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
+		return;
+
+	/* we overload curr_resync somewhat here.
+	 * 0 == not engaged in resync at all
+	 * 2 == checking that there is no conflict with another sync
+	 * 1 == like 2, but have yielded to allow conflicting resync to
+	 *		commence
+	 * other == active in resync - this many blocks
+	 *
+	 * Before starting a resync we must have set curr_resync to
+	 * 2, and then checked that every "conflicting" array has curr_resync
+	 * less than ours.  When we find one that is the same or higher
+	 * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
+	 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
+	 * This will mean we have to start checking from the beginning again.
+	 *
+	 */
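+	/*
+	 * Illustrative example: if md0 and md1 share a physical drive and
+	 * both reach curr_resync == 2, the mddev at the lower address
+	 * drops to 1 and waits on resync_wait so the other can resync
+	 * first, restarting its own conflict check from scratch later.
+	 */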
+
+	do {
+		mddev->curr_resync = 2;
+
+	try_again:
+		if (kthread_should_stop()) {
+			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+			goto skip;
+		}
+		ITERATE_MDDEV(mddev2,tmp) {
+			if (mddev2 == mddev)
+				continue;
+			if (mddev2->curr_resync && 
+			    match_mddev_units(mddev,mddev2)) {
+				DEFINE_WAIT(wq);
+				if (mddev < mddev2 && mddev->curr_resync == 2) {
+					/* arbitrarily yield */
+					mddev->curr_resync = 1;
+					wake_up(&resync_wait);
+				}
+				if (mddev > mddev2 && mddev->curr_resync == 1)
+					/* no need to wait here, we can wait the next
+					 * time 'round when curr_resync == 2
+					 */
+					continue;
+				prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE);
+				if (!kthread_should_stop() &&
+				    mddev2->curr_resync >= mddev->curr_resync) {
+					printk(KERN_INFO "md: delaying resync of %s"
+					       " until %s has finished resync (they"
+					       " share one or more physical units)\n",
+					       mdname(mddev), mdname(mddev2));
+					mddev_put(mddev2);
+					schedule();
+					finish_wait(&resync_wait, &wq);
+					goto try_again;
+				}
+				finish_wait(&resync_wait, &wq);
+			}
+		}
+	} while (mddev->curr_resync < 2);
+
+	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+		/* resync follows the size requested by the personality,
+		 * which defaults to physical size, but can be virtual size
+		 */
+		max_sectors = mddev->resync_max_sectors;
+		mddev->resync_mismatches = 0;
+	} else
+		/* recovery follows the physical size of devices */
+		max_sectors = mddev->size << 1;
+
+	printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
+	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
+		" %d KB/sec/disc.\n", speed_min(mddev));
+	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
+	       "(but not more than %d KB/sec) for reconstruction.\n",
+	       speed_max(mddev));
+
+	is_mddev_idle(mddev); /* this also initializes IO event counters */
+	/* we don't use the checkpoint if there's a bitmap */
+	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap
+	    && ! test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+		j = mddev->recovery_cp;
+	else
+		j = 0;
+	io_sectors = 0;
+	for (m = 0; m < SYNC_MARKS; m++) {
+		mark[m] = jiffies;
+		mark_cnt[m] = io_sectors;
+	}
+	last_mark = 0;
+	mddev->resync_mark = mark[last_mark];
+	mddev->resync_mark_cnt = mark_cnt[last_mark];
+
+	/*
+	 * Tune reconstruction:
+	 */
+	window = 32*(PAGE_SIZE/512);
+	printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
+		window/2,(unsigned long long) max_sectors/2);
+
+	atomic_set(&mddev->recovery_active, 0);
+	init_waitqueue_head(&mddev->recovery_wait);
+	last_check = 0;
+
+	if (j>2) {
+		printk(KERN_INFO 
+			"md: resuming recovery of %s from checkpoint.\n",
+			mdname(mddev));
+		mddev->curr_resync = j;
+	}
+
+	while (j < max_sectors) {
+		sector_t sectors;
+
+		skipped = 0;
+		sectors = mddev->pers->sync_request(mddev, j, &skipped,
+					    currspeed < speed_min(mddev));
+		if (sectors == 0) {
+			set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+			goto out;
+		}
+
+		if (!skipped) { /* actual IO requested */
+			io_sectors += sectors;
+			atomic_add(sectors, &mddev->recovery_active);
+		}
+
+		j += sectors;
+		if (j>1) mddev->curr_resync = j;
+		if (last_check == 0)
+			/* this is the earliest that the rebuild will be
+			 * visible in /proc/mdstat
+			 */
+			md_new_event(mddev);
+
+		if (last_check + window > io_sectors || j == max_sectors)
+			continue;
+
+		last_check = io_sectors;
+
+		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
+		    test_bit(MD_RECOVERY_ERR, &mddev->recovery))
+			break;
+
+	repeat:
+		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
+			/* step marks */
+			int next = (last_mark+1) % SYNC_MARKS;
+
+			mddev->resync_mark = mark[next];
+			mddev->resync_mark_cnt = mark_cnt[next];
+			mark[next] = jiffies;
+			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
+			last_mark = next;
+		}
+
+
+		if (kthread_should_stop()) {
+			/*
+			 * got a signal, exit.
+			 */
+			printk(KERN_INFO 
+				"md: md_do_sync() got signal ... exiting\n");
+			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+			goto out;
+		}
+
+		/*
+		 * this loop exits only when we are slower than
+		 * the 'hard' speed limit, or the system was IO-idle for
+		 * a jiffy.
+		 * the system might be non-idle CPU-wise, but we only care
+		 * about not overloading the IO subsystem. (things like an
+		 * e2fsck being done on the RAID array should execute fast)
+		 */
+		mddev->queue->unplug_fn(mddev->queue);
+		cond_resched();
+
+		currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
+			/((jiffies-mddev->resync_mark)/HZ +1) +1;
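+		/*
+		 * Illustrative example: 20480 sectors since the last mark
+		 * over 5 elapsed seconds give (20480/2)/(5+1)+1 = 1707
+		 * KB/sec, compared against speed_min/speed_max below.
+		 */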
+
+		if (currspeed > speed_min(mddev)) {
+			if ((currspeed > speed_max(mddev)) ||
+					!is_mddev_idle(mddev)) {
+				msleep(500);
+				goto repeat;
+			}
+		}
+	}
+	printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev));
+	/*
+	 * this also signals 'finished resyncing' to md_stop
+	 */
+ out:
+	mddev->queue->unplug_fn(mddev->queue);
+
+	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
+
+	/* tell personality that we are finished */
+	mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
+
+	if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
+	    mddev->curr_resync > 2 &&
+	    mddev->curr_resync >= mddev->recovery_cp) {
+		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+			printk(KERN_INFO 
+				"md: checkpointing recovery of %s.\n",
+				mdname(mddev));
+			mddev->recovery_cp = mddev->curr_resync;
+		} else
+			mddev->recovery_cp = MaxSector;
+	}
+
+ skip:
+	mddev->curr_resync = 0;
+	wake_up(&resync_wait);
+	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
+	md_wakeup_thread(mddev->thread);
+}
+
+
+/*
+ * This routine is regularly called by all per-raid-array threads to
+ * deal with generic issues like resync and super-block update.
+ * Raid personalities that don't have a thread (linear/raid0) do not
+ * need this as they never do any recovery or update the superblock.
+ *
+ * It does not do any resync itself, but rather "forks" off other threads
+ * to do that as needed.
+ * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
+ * "->recovery" and create a thread at ->sync_thread.
+ * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
+ * and wakes up this thread which will reap the thread and finish up.
+ * This thread also removes any faulty devices (with nr_pending == 0).
+ *
+ * The overall approach is:
+ *  1/ if the superblock needs updating, update it.
+ *  2/ If a recovery thread is running, don't do anything else.
+ *  3/ If recovery has finished, clean up, possibly marking spares active.
+ *  4/ If there are any faulty devices, remove them.
+ *  5/ If array is degraded, try to add spare devices
+ *  6/ If array has spares or is not in-sync, start a resync thread.
+ */
+void md_check_recovery(mddev_t *mddev)
+{
+	mdk_rdev_t *rdev;
+	struct list_head *rtmp;
+
+
+	if (mddev->bitmap)
+		bitmap_daemon_work(mddev->bitmap);
+
+	if (mddev->ro)
+		return;
+
+	if (signal_pending(current)) {
+		if (mddev->pers->sync_request) {
+			printk(KERN_INFO "md: %s in immediate safe mode\n",
+			       mdname(mddev));
+			mddev->safemode = 2;
+		}
+		flush_signals(current);
+	}
+
+	if ( ! (
+		mddev->sb_dirty ||
+		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
+		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
+		(mddev->safemode == 1) ||
+		(mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
+		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
+		))
+		return;
+
+	if (mddev_trylock(mddev)==0) {
+		int spares =0;
+
+		spin_lock_irq(&mddev->write_lock);
+		if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
+		    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
+			mddev->in_sync = 1;
+			mddev->sb_dirty = 1;
+		}
+		if (mddev->safemode == 1)
+			mddev->safemode = 0;
+		spin_unlock_irq(&mddev->write_lock);
+
+		if (mddev->sb_dirty)
+			md_update_sb(mddev);
+
+
+		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
+		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
+			/* resync/recovery still happening */
+			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+			goto unlock;
+		}
+		if (mddev->sync_thread) {
+			/* resync has finished, collect result */
+			md_unregister_thread(mddev->sync_thread);
+			mddev->sync_thread = NULL;
+			if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
+			    !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+				/* success...*/
+				/* activate any spares */
+				mddev->pers->spare_active(mddev);
+			}
+			md_update_sb(mddev);
+
+			/* if array is no longer degraded, then any saved_raid_disk
+			 * information must be scrapped
+			 */
+			if (!mddev->degraded)
+				ITERATE_RDEV(mddev,rdev,rtmp)
+					rdev->saved_raid_disk = -1;
+
+			mddev->recovery = 0;
+			/* flag recovery needed just to double check */
+			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+			md_new_event(mddev);
+			goto unlock;
+		}
+		/* Clear some bits that don't mean anything, but
+		 * might be left set
+		 */
+		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+		clear_bit(MD_RECOVERY_ERR, &mddev->recovery);
+		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
+		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
+
+		/* no recovery is running.
+		 * remove any failed drives, then
+		 * add spares if possible.
+		 * Spares are also removed and re-added, to allow
+		 * the personality to fail the re-add.
+		 */
+		ITERATE_RDEV(mddev,rdev,rtmp)
+			if (rdev->raid_disk >= 0 &&
+			    (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) &&
+			    atomic_read(&rdev->nr_pending)==0) {
+				if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) {
+					char nm[20];
+					sprintf(nm,"rd%d", rdev->raid_disk);
+					sysfs_remove_link(&mddev->kobj, nm);
+					rdev->raid_disk = -1;
+				}
+			}
+
+		if (mddev->degraded) {
+			ITERATE_RDEV(mddev,rdev,rtmp)
+				if (rdev->raid_disk < 0
+				    && !test_bit(Faulty, &rdev->flags)) {
+					if (mddev->pers->hot_add_disk(mddev,rdev)) {
+						char nm[20];
+						sprintf(nm, "rd%d", rdev->raid_disk);
+						sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+						spares++;
+						md_new_event(mddev);
+					} else
+						break;
+				}
+		}
+
+		if (spares) {
+			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+		} else if (mddev->recovery_cp < MaxSector) {
+			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+			/* nothing to be done ... */
+			goto unlock;
+
+		if (mddev->pers->sync_request) {
+			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+			if (spares && mddev->bitmap && ! mddev->bitmap->file) {
+				/* We are adding a device or devices to an array
+				 * which has the bitmap stored on all devices.
+				 * So make sure all bitmap pages get written
+				 */
+				bitmap_write_all(mddev->bitmap);
+			}
+			mddev->sync_thread = md_register_thread(md_do_sync,
+								mddev,
+								"%s_resync");
+			if (!mddev->sync_thread) {
+				printk(KERN_ERR "%s: could not start resync"
+					" thread...\n", 
+					mdname(mddev));
+				/* leave the spares where they are, it shouldn't hurt */
+				mddev->recovery = 0;
+			} else
+				md_wakeup_thread(mddev->sync_thread);
+			md_new_event(mddev);
+		}
+	unlock:
+		mddev_unlock(mddev);
+	}
+}
+
+static int md_notify_reboot(struct notifier_block *this,
+			    unsigned long code, void *x)
+{
+	struct list_head *tmp;
+	mddev_t *mddev;
+
+	if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
+
+		printk(KERN_INFO "md: stopping all md devices.\n");
+
+		ITERATE_MDDEV(mddev,tmp)
+			if (mddev_trylock(mddev)==0)
+				do_md_stop (mddev, 1);
+		/*
+		 * certain more exotic SCSI devices are known to be
+		 * volatile wrt too early system reboots. While the
+		 * right place to handle this issue is the given
+		 * driver, we do want to have a safe RAID driver ...
+		 */
+		mdelay(1000*1);
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block md_notifier = {
+	.notifier_call	= md_notify_reboot,
+	.next		= NULL,
+	.priority	= INT_MAX, /* before any real devices */
+};
+
+static void md_geninit(void)
+{
+	struct proc_dir_entry *p;
+
+	dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
+
+	p = create_proc_entry("mdstat", S_IRUGO, NULL);
+	if (p)
+		p->proc_fops = &md_seq_fops;
+}
+
+static int __init md_init(void)
+{
+	int minor;
+
+	printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
+			" MD_SB_DISKS=%d\n",
+			MD_MAJOR_VERSION, MD_MINOR_VERSION,
+			MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
+	printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI,
+			BITMAP_MINOR);
+
+	if (register_blkdev(MAJOR_NR, "md"))
+		return -1;
+	if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
+		unregister_blkdev(MAJOR_NR, "md");
+		return -1;
+	}
+	devfs_mk_dir("md");
+	blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
+				md_probe, NULL, NULL);
+	blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE,
+			    md_probe, NULL, NULL);
+
+	for (minor=0; minor < MAX_MD_DEVS; ++minor)
+		devfs_mk_bdev(MKDEV(MAJOR_NR, minor),
+				S_IFBLK|S_IRUSR|S_IWUSR,
+				"md/%d", minor);
+
+	for (minor=0; minor < MAX_MD_DEVS; ++minor)
+		devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift),
+			      S_IFBLK|S_IRUSR|S_IWUSR,
+			      "md/mdp%d", minor);
+
+
+	register_reboot_notifier(&md_notifier);
+	raid_table_header = register_sysctl_table(raid_root_table, 1);
+
+	md_geninit();
+	return (0);
+}
+
+
+#ifndef MODULE
+
+/*
+ * Searches all registered partitions for autorun RAID arrays
+ * at boot time.
+ */
+static dev_t detected_devices[128];
+static int dev_cnt;
+
+void md_autodetect_dev(dev_t dev)
+{
+	if (dev_cnt >= 0 && dev_cnt < 127)
+		detected_devices[dev_cnt++] = dev;
+}
+
+
+static void autostart_arrays(int part)
+{
+	mdk_rdev_t *rdev;
+	int i;
+
+	printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
+
+	for (i = 0; i < dev_cnt; i++) {
+		dev_t dev = detected_devices[i];
+
+		rdev = md_import_device(dev,0, 0);
+		if (IS_ERR(rdev))
+			continue;
+
+		if (test_bit(Faulty, &rdev->flags)) {
+			MD_BUG();
+			continue;
+		}
+		list_add(&rdev->same_set, &pending_raid_disks);
+	}
+	dev_cnt = 0;
+
+	autorun_devices(part);
+}
+
+#endif
+
+static __exit void md_exit(void)
+{
+	mddev_t *mddev;
+	struct list_head *tmp;
+	int i;
+	blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
+	blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift);
+	for (i=0; i < MAX_MD_DEVS; i++)
+		devfs_remove("md/%d", i);
+	for (i=0; i < MAX_MD_DEVS; i++)
+		devfs_remove("md/d%d", i);
+
+	devfs_remove("md");
+
+	unregister_blkdev(MAJOR_NR,"md");
+	unregister_blkdev(mdp_major, "mdp");
+	unregister_reboot_notifier(&md_notifier);
+	unregister_sysctl_table(raid_table_header);
+	remove_proc_entry("mdstat", NULL);
+	ITERATE_MDDEV(mddev,tmp) {
+		struct gendisk *disk = mddev->gendisk;
+		if (!disk)
+			continue;
+		export_array(mddev);
+		del_gendisk(disk);
+		put_disk(disk);
+		mddev->gendisk = NULL;
+		mddev_put(mddev);
+	}
+}
+
+module_init(md_init)
+module_exit(md_exit)
+
+static int get_ro(char *buffer, struct kernel_param *kp)
+{
+	return sprintf(buffer, "%d", start_readonly);
+}
+static int set_ro(const char *val, struct kernel_param *kp)
+{
+	char *e;
+	int num = simple_strtoul(val, &e, 10);
+	if (*val && (*e == '\0' || *e == '\n')) {
+		start_readonly = num;
+		return 0;
+	}
+	return -EINVAL;
+}
+
+module_param_call(start_ro, set_ro, get_ro, NULL, 0600);
+module_param(start_dirty_degraded, int, 0644);
+
+
+EXPORT_SYMBOL(register_md_personality);
+EXPORT_SYMBOL(unregister_md_personality);
+EXPORT_SYMBOL(md_error);
+EXPORT_SYMBOL(md_done_sync);
+EXPORT_SYMBOL(md_write_start);
+EXPORT_SYMBOL(md_write_end);
+EXPORT_SYMBOL(md_register_thread);
+EXPORT_SYMBOL(md_unregister_thread);
+EXPORT_SYMBOL(md_wakeup_thread);
+EXPORT_SYMBOL(md_print_devices);
+EXPORT_SYMBOL(md_check_recovery);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("md");
+MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
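
start_ro and start_dirty_degraded above are ordinary module parameters; module_param_call() additionally routes start_ro through get_ro()/set_ro(), so its 0600 sysfs node accepts runtime changes with strict integer parsing (set_ro() tolerates a trailing newline, which is what echo produces). Illustrative usage, assuming the module is built as md-mod as in contemporary trees:

	modprobe md-mod start_ro=1
	echo 0 > /sys/module/md_mod/parameters/start_ro
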
diff -urN oldtree/drivers/net/irda/sir_kthread.c newtree/drivers/net/irda/sir_kthread.c
--- oldtree/drivers/net/irda/sir_kthread.c	2006-03-08 18:47:13.420968000 +0000
+++ newtree/drivers/net/irda/sir_kthread.c	2006-03-08 15:22:33.145498750 +0000
@@ -112,6 +112,7 @@
 	DECLARE_WAITQUEUE(wait, current);
 
 	daemonize("kIrDAd");
+	current->flags |= PF_NOFREEZE;
 
 	irda_rq_queue.thread = current;
 
@@ -134,9 +135,6 @@
 			__set_task_state(current, TASK_RUNNING);
 		remove_wait_queue(&irda_rq_queue.kick, &wait);
 
-		/* make swsusp happy with our thread */
-		try_to_freeze();
-
 		run_irda_queue();
 	}
 
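
This hunk trades a per-iteration freezer check for a one-time opt-out: instead of parking in try_to_freeze() whenever tasks are frozen, kIrDAd marks itself PF_NOFREEZE at startup and keeps running across the whole suspend cycle, since helper daemons like this one may be needed while Suspend2 writes the image. A minimal sketch of the resulting thread shape on a 2.6-era kernel; my_quit, my_wait, my_work_pending() and my_do_work() are illustrative placeholders, not symbols from sir_kthread.c:

	#include <linux/sched.h>
	#include <linux/wait.h>

	static int my_quit;			/* set by the module exit path */
	static DECLARE_WAIT_QUEUE_HEAD(my_wait);
	static int my_work_pending(void);	/* driver-specific, assumed */
	static void my_do_work(void);		/* driver-specific, assumed */

	static int my_daemon(void *unused)
	{
		DECLARE_WAITQUEUE(wait, current);

		daemonize("kmydaemon");
		current->flags |= PF_NOFREEZE;	/* opt out of the freezer once */

		while (!my_quit) {
			add_wait_queue(&my_wait, &wait);
			set_current_state(TASK_INTERRUPTIBLE);
			if (!my_work_pending())
				schedule();
			__set_task_state(current, TASK_RUNNING);
			remove_wait_queue(&my_wait, &wait);

			/* no per-iteration try_to_freeze() is needed any more */
			my_do_work();
		}
		return 0;
	}
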
diff -urN oldtree/drivers/scsi/hosts.c newtree/drivers/scsi/hosts.c
--- oldtree/drivers/scsi/hosts.c	2006-03-08 18:48:01.123949250 +0000
+++ newtree/drivers/scsi/hosts.c	2006-03-08 15:22:33.149499000 +0000
@@ -227,7 +227,7 @@
 	if (shost->transportt->create_work_queue) {
 		snprintf(shost->work_q_name, KOBJ_NAME_LEN, "scsi_wq_%d",
 			shost->host_no);
-		shost->work_q = create_singlethread_workqueue(
+		shost->work_q = create_nofreeze_singlethread_workqueue(
 					shost->work_q_name);
 		if (!shost->work_q)
 			goto out_free_shost_data;
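
create_nofreeze_singlethread_workqueue() is not a mainline 2.6.16 interface; it is presumably defined elsewhere in this patch as a create_singlethread_workqueue() whose worker thread carries PF_NOFREEZE, so SCSI host work keeps running while Suspend2 writes the image through the SCSI stack. The intent can be approximated with mainline calls alone by queueing one work item that tags its own worker; this is a sketch of the idea, not the patch's actual definition:

	#include <linux/sched.h>
	#include <linux/workqueue.h>

	static void mark_nofreeze(void *unused)
	{
		current->flags |= PF_NOFREEZE;	/* runs on the worker thread */
	}

	static struct workqueue_struct *create_nofreeze_wq(const char *name)
	{
		struct workqueue_struct *wq = create_singlethread_workqueue(name);
		DECLARE_WORK(tag, mark_nofreeze, NULL);

		if (wq) {
			queue_work(wq, &tag);
			flush_workqueue(wq);	/* flag set before real work is queued */
		}
		return wq;
	}
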
diff -urN oldtree/drivers/scsi/lpfc/lpfc_init.c newtree/drivers/scsi/lpfc/lpfc_init.c
--- oldtree/drivers/scsi/lpfc/lpfc_init.c	2006-03-08 18:48:01.167952000 +0000
+++ newtree/drivers/scsi/lpfc/lpfc_init.c	2006-03-08 15:22:33.153499250 +0000
@@ -1602,7 +1602,7 @@
 	phba->work_ha_mask |= (HA_RXMASK << (LPFC_ELS_RING * 4));
 
 	/* Startup the kernel thread for this host adapter. */
-	phba->worker_thread = kthread_run(lpfc_do_work, phba,
+	phba->worker_thread = kthread_nofreeze_run(lpfc_do_work, phba,
 				       "lpfc_worker_%d", phba->brd_no);
 	if (IS_ERR(phba->worker_thread)) {
 		error = PTR_ERR(phba->worker_thread);
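
kthread_nofreeze_run() is likewise assumed to come from elsewhere in this patch. Mainline kthread_run() is just kthread_create() followed by wake_up_process(), so a plausible nofreeze variant marks the new task before it first runs; a sketch under that assumption, not the verbatim helper:

	#include <linux/kthread.h>
	#include <linux/sched.h>

	#define kthread_nofreeze_run(threadfn, data, namefmt, ...)	\
	({								\
		struct task_struct *__k = kthread_create(threadfn,	\
				data, namefmt, ## __VA_ARGS__);		\
		if (!IS_ERR(__k)) {					\
			/* still asleep: safe to tag before wakeup */	\
			__k->flags |= PF_NOFREEZE;			\
			wake_up_process(__k);				\
		}							\
		__k;							\
	})
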
diff -urN oldtree/drivers/scsi/lpfc/lpfc_init.c.orig newtree/drivers/scsi/lpfc/lpfc_init.c.orig
--- oldtree/drivers/scsi/lpfc/lpfc_init.c.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/drivers/scsi/lpfc/lpfc_init.c.orig	2006-03-08 15:21:16.292695750 +0000
@@ -0,0 +1,1889 @@
+/*******************************************************************
+ * This file is part of the Emulex Linux Device Driver for         *
+ * Fibre Channel Host Bus Adapters.                                *
+ * Copyright (C) 2004-2006 Emulex.  All rights reserved.           *
+ * EMULEX and SLI are trademarks of Emulex.                        *
+ * www.emulex.com                                                  *
+ * Portions Copyright (C) 2004-2005 Christoph Hellwig              *
+ *                                                                 *
+ * This program is free software; you can redistribute it and/or   *
+ * modify it under the terms of version 2 of the GNU General       *
+ * Public License as published by the Free Software Foundation.    *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND          *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY,  *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE      *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID.  See the GNU General Public License for  *
+ * more details, a copy of which can be found in the file COPYING  *
+ * included with this package.                                     *
+ *******************************************************************/
+
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include <linux/idr.h>
+#include <linux/interrupt.h>
+#include <linux/kthread.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+
+#include <scsi/scsi.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_transport_fc.h>
+
+#include "lpfc_hw.h"
+#include "lpfc_sli.h"
+#include "lpfc_disc.h"
+#include "lpfc_scsi.h"
+#include "lpfc.h"
+#include "lpfc_logmsg.h"
+#include "lpfc_crtn.h"
+#include "lpfc_version.h"
+
+static int lpfc_parse_vpd(struct lpfc_hba *, uint8_t *, int);
+static void lpfc_get_hba_model_desc(struct lpfc_hba *, uint8_t *, uint8_t *);
+static int lpfc_post_rcv_buf(struct lpfc_hba *);
+
+static struct scsi_transport_template *lpfc_transport_template = NULL;
+static DEFINE_IDR(lpfc_hba_index);
+
+/************************************************************************/
+/*                                                                      */
+/*    lpfc_config_port_prep                                             */
+/*    This routine will do LPFC initialization prior to the             */
+/*    CONFIG_PORT mailbox command. This will be initialized             */
+/*    as a SLI layer callback routine.                                  */
+/*    This routine returns 0 on success or -ERESTART if it wants        */
+/*    the SLI layer to reset the HBA and try again. Any                 */
+/*    other return value indicates an error.                            */
+/*                                                                      */
+/************************************************************************/
+int
+lpfc_config_port_prep(struct lpfc_hba * phba)
+{
+	lpfc_vpd_t *vp = &phba->vpd;
+	int i = 0, rc;
+	LPFC_MBOXQ_t *pmb;
+	MAILBOX_t *mb;
+	char *lpfc_vpd_data = NULL;
+	uint16_t offset = 0;
+	static char licensed[56] =
+		    "key unlock for use with gnu public licensed code only\0";
+
+	pmb = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL);
+	if (!pmb) {
+		phba->hba_state = LPFC_HBA_ERROR;
+		return -ENOMEM;
+	}
+
+	mb = &pmb->mb;
+	phba->hba_state = LPFC_INIT_MBX_CMDS;
+
+	if (lpfc_is_LC_HBA(phba->pcidev->device)) {
+		uint32_t *ptext = (uint32_t *) licensed;
+
+		for (i = 0; i < 56; i += sizeof (uint32_t), ptext++)
+			*ptext = cpu_to_be32(*ptext);
+
+		lpfc_read_nv(phba, pmb);
+		memset((char*)mb->un.varRDnvp.rsvd3, 0,
+			sizeof (mb->un.varRDnvp.rsvd3));
+		memcpy((char*)mb->un.varRDnvp.rsvd3, licensed,
+			 sizeof (licensed));
+
+		rc = lpfc_sli_issue_mbox(phba, pmb, MBX_POLL);
+
+		if (rc != MBX_SUCCESS) {
+			lpfc_printf_log(phba,
+					KERN_ERR,
+					LOG_MBOX,
+					"%d:0324 Config Port initialization "
+					"error, mbxCmd x%x READ_NVPARM, "
+					"mbxStatus x%x\n",
+					phba->brd_no,
+					mb->mbxCommand, mb->mbxStatus);
+			mempool_free(pmb, phba->mbox_mem_pool);
+			return -ERESTART;
+		}
+		memcpy(phba->wwnn, (char *)mb->un.varRDnvp.nodename,
+		       sizeof (mb->un.varRDnvp.nodename));
+	}
+
+	/* Setup and issue mailbox READ REV command */
+	lpfc_read_rev(phba, pmb);
+	rc = lpfc_sli_issue_mbox(phba, pmb, MBX_POLL);
+	if (rc != MBX_SUCCESS) {
+		lpfc_printf_log(phba,
+				KERN_ERR,
+				LOG_INIT,
+				"%d:0439 Adapter failed to init, mbxCmd x%x "
+				"READ_REV, mbxStatus x%x\n",
+				phba->brd_no,
+				mb->mbxCommand, mb->mbxStatus);
+		mempool_free( pmb, phba->mbox_mem_pool);
+		return -ERESTART;
+	}
+
+	/*
+	 * The value of rr must be 1 since the driver set the cv field to 1.
+	 * This setting requires the FW to set all revision fields.
+	 */
+	if (mb->un.varRdRev.rr == 0) {
+		vp->rev.rBit = 0;
+		lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
+				"%d:0440 Adapter failed to init, READ_REV has "
+				"missing revision information.\n",
+				phba->brd_no);
+		mempool_free(pmb, phba->mbox_mem_pool);
+		return -ERESTART;
+	}
+
+	/* Save information as VPD data */
+	vp->rev.rBit = 1;
+	vp->rev.sli1FwRev = mb->un.varRdRev.sli1FwRev;
+	memcpy(vp->rev.sli1FwName, (char*) mb->un.varRdRev.sli1FwName, 16);
+	vp->rev.sli2FwRev = mb->un.varRdRev.sli2FwRev;
+	memcpy(vp->rev.sli2FwName, (char *) mb->un.varRdRev.sli2FwName, 16);
+	vp->rev.biuRev = mb->un.varRdRev.biuRev;
+	vp->rev.smRev = mb->un.varRdRev.smRev;
+	vp->rev.smFwRev = mb->un.varRdRev.un.smFwRev;
+	vp->rev.endecRev = mb->un.varRdRev.endecRev;
+	vp->rev.fcphHigh = mb->un.varRdRev.fcphHigh;
+	vp->rev.fcphLow = mb->un.varRdRev.fcphLow;
+	vp->rev.feaLevelHigh = mb->un.varRdRev.feaLevelHigh;
+	vp->rev.feaLevelLow = mb->un.varRdRev.feaLevelLow;
+	vp->rev.postKernRev = mb->un.varRdRev.postKernRev;
+	vp->rev.opFwRev = mb->un.varRdRev.opFwRev;
+
+	if (lpfc_is_LC_HBA(phba->pcidev->device))
+		memcpy(phba->RandomData, (char *)&mb->un.varWords[24],
+						sizeof (phba->RandomData));
+
+	/* Get adapter VPD information */
+	pmb->context2 = kmalloc(DMP_RSP_SIZE, GFP_KERNEL);
+	if (!pmb->context2)
+		goto out_free_mbox;
+	lpfc_vpd_data = kmalloc(DMP_VPD_SIZE, GFP_KERNEL);
+	if (!lpfc_vpd_data)
+		goto out_free_context2;
+
+	do {
+		lpfc_dump_mem(phba, pmb, offset);
+		rc = lpfc_sli_issue_mbox(phba, pmb, MBX_POLL);
+
+		if (rc != MBX_SUCCESS) {
+			lpfc_printf_log(phba, KERN_INFO, LOG_INIT,
+					"%d:0441 VPD not present on adapter, "
+					"mbxCmd x%x DUMP VPD, mbxStatus x%x\n",
+					phba->brd_no,
+					mb->mbxCommand, mb->mbxStatus);
+			mb->un.varDmp.word_cnt = 0;
+		}
+		if (mb->un.varDmp.word_cnt > DMP_VPD_SIZE - offset)
+			mb->un.varDmp.word_cnt = DMP_VPD_SIZE - offset;
+		lpfc_sli_pcimem_bcopy(pmb->context2, lpfc_vpd_data + offset,
+							mb->un.varDmp.word_cnt);
+		offset += mb->un.varDmp.word_cnt;
+	} while (mb->un.varDmp.word_cnt && offset < DMP_VPD_SIZE);
+	lpfc_parse_vpd(phba, lpfc_vpd_data, offset);
+
+	kfree(lpfc_vpd_data);
+out_free_context2:
+	kfree(pmb->context2);
+out_free_mbox:
+	mempool_free(pmb, phba->mbox_mem_pool);
+	return 0;
+}
+
+/************************************************************************/
+/*                                                                      */
+/*    lpfc_config_port_post                                             */
+/*    This routine will do LPFC initialization after the                */
+/*    CONFIG_PORT mailbox command. This will be initialized             */
+/*    as a SLI layer callback routine.                                  */
+/*    This routine returns 0 on success. Any other return value         */
+/*    indicates an error.                                               */
+/*                                                                      */
+/************************************************************************/
+int
+lpfc_config_port_post(struct lpfc_hba * phba)
+{
+	LPFC_MBOXQ_t *pmb;
+	MAILBOX_t *mb;
+	struct lpfc_dmabuf *mp;
+	struct lpfc_sli *psli = &phba->sli;
+	uint32_t status, timeout;
+	int i, j, rc;
+
+	pmb = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL);
+	if (!pmb) {
+		phba->hba_state = LPFC_HBA_ERROR;
+		return -ENOMEM;
+	}
+	mb = &pmb->mb;
+
+	lpfc_config_link(phba, pmb);
+	rc = lpfc_sli_issue_mbox(phba, pmb, MBX_POLL);
+	if (rc != MBX_SUCCESS) {
+		lpfc_printf_log(phba,
+				KERN_ERR,
+				LOG_INIT,
+				"%d:0447 Adapter failed init, mbxCmd x%x "
+				"CONFIG_LINK mbxStatus x%x\n",
+				phba->brd_no,
+				mb->mbxCommand, mb->mbxStatus);
+		phba->hba_state = LPFC_HBA_ERROR;
+		mempool_free( pmb, phba->mbox_mem_pool);
+		return -EIO;
+	}
+
+	/* Get login parameters for NID.  */
+	lpfc_read_sparam(phba, pmb);
+	if (lpfc_sli_issue_mbox(phba, pmb, MBX_POLL) != MBX_SUCCESS) {
+		lpfc_printf_log(phba,
+				KERN_ERR,
+				LOG_INIT,
+				"%d:0448 Adapter failed init, mbxCmd x%x "
+				"READ_SPARM mbxStatus x%x\n",
+				phba->brd_no,
+				mb->mbxCommand, mb->mbxStatus);
+		phba->hba_state = LPFC_HBA_ERROR;
+		mp = (struct lpfc_dmabuf *) pmb->context1;
+		mempool_free( pmb, phba->mbox_mem_pool);
+		lpfc_mbuf_free(phba, mp->virt, mp->phys);
+		kfree(mp);
+		return -EIO;
+	}
+
+	mp = (struct lpfc_dmabuf *) pmb->context1;
+
+	memcpy(&phba->fc_sparam, mp->virt, sizeof (struct serv_parm));
+	lpfc_mbuf_free(phba, mp->virt, mp->phys);
+	kfree(mp);
+	pmb->context1 = NULL;
+
+	memcpy(&phba->fc_nodename, &phba->fc_sparam.nodeName,
+	       sizeof (struct lpfc_name));
+	memcpy(&phba->fc_portname, &phba->fc_sparam.portName,
+	       sizeof (struct lpfc_name));
+	/* If no serial number in VPD data, use low 6 bytes of WWNN */
+	/* This should be consolidated into parse_vpd ? - mr */
+	if (phba->SerialNumber[0] == 0) {
+		uint8_t *outptr;
+
+		outptr = &phba->fc_nodename.u.s.IEEE[0];
+		for (i = 0; i < 12; i++) {
+			status = *outptr++;
+			j = ((status & 0xf0) >> 4);
+			if (j <= 9)
+				phba->SerialNumber[i] =
+				    (char)((uint8_t) 0x30 + (uint8_t) j);
+			else
+				phba->SerialNumber[i] =
+				    (char)((uint8_t) 0x61 + (uint8_t) (j - 10));
+			i++;
+			j = (status & 0xf);
+			if (j <= 9)
+				phba->SerialNumber[i] =
+				    (char)((uint8_t) 0x30 + (uint8_t) j);
+			else
+				phba->SerialNumber[i] =
+				    (char)((uint8_t) 0x61 + (uint8_t) (j - 10));
+		}
+	}
+
+	/* This should turn on DELAYED ABTS for ELS timeouts */
+	lpfc_set_slim(phba, pmb, 0x052198, 0x1);
+	if (lpfc_sli_issue_mbox(phba, pmb, MBX_POLL) != MBX_SUCCESS) {
+		phba->hba_state = LPFC_HBA_ERROR;
+		mempool_free( pmb, phba->mbox_mem_pool);
+		return -EIO;
+	}
+
+
+	lpfc_read_config(phba, pmb);
+	if (lpfc_sli_issue_mbox(phba, pmb, MBX_POLL) != MBX_SUCCESS) {
+		lpfc_printf_log(phba,
+				KERN_ERR,
+				LOG_INIT,
+				"%d:0453 Adapter failed to init, mbxCmd x%x "
+				"READ_CONFIG, mbxStatus x%x\n",
+				phba->brd_no,
+				mb->mbxCommand, mb->mbxStatus);
+		phba->hba_state = LPFC_HBA_ERROR;
+		mempool_free( pmb, phba->mbox_mem_pool);
+		return -EIO;
+	}
+
+	/* Reset the DFT_HBA_Q_DEPTH to the max xri  */
+	if (phba->cfg_hba_queue_depth > (mb->un.varRdConfig.max_xri+1))
+		phba->cfg_hba_queue_depth =
+			mb->un.varRdConfig.max_xri + 1;
+
+	phba->lmt = mb->un.varRdConfig.lmt;
+
+	/* Get the default values for Model Name and Description */
+	lpfc_get_hba_model_desc(phba, phba->ModelName, phba->ModelDesc);
+
+	if ((phba->cfg_link_speed > LINK_SPEED_10G)
+	    || ((phba->cfg_link_speed == LINK_SPEED_1G)
+		&& !(phba->lmt & LMT_1Gb))
+	    || ((phba->cfg_link_speed == LINK_SPEED_2G)
+		&& !(phba->lmt & LMT_2Gb))
+	    || ((phba->cfg_link_speed == LINK_SPEED_4G)
+		&& !(phba->lmt & LMT_4Gb))
+	    || ((phba->cfg_link_speed == LINK_SPEED_8G)
+		&& !(phba->lmt & LMT_8Gb))
+	    || ((phba->cfg_link_speed == LINK_SPEED_10G)
+		&& !(phba->lmt & LMT_10Gb))) {
+		/* Reset link speed to auto */
+		lpfc_printf_log(phba,
+			KERN_WARNING,
+			LOG_LINK_EVENT,
+			"%d:1302 Invalid speed for this board: "
+			"Reset link speed to auto: x%x\n",
+			phba->brd_no,
+			phba->cfg_link_speed);
+			phba->cfg_link_speed = LINK_SPEED_AUTO;
+	}
+
+	phba->hba_state = LPFC_LINK_DOWN;
+
+	/* Only process IOCBs on ring 0 till hba_state is READY */
+	if (psli->ring[psli->ip_ring].cmdringaddr)
+		psli->ring[psli->ip_ring].flag |= LPFC_STOP_IOCB_EVENT;
+	if (psli->ring[psli->fcp_ring].cmdringaddr)
+		psli->ring[psli->fcp_ring].flag |= LPFC_STOP_IOCB_EVENT;
+	if (psli->ring[psli->next_ring].cmdringaddr)
+		psli->ring[psli->next_ring].flag |= LPFC_STOP_IOCB_EVENT;
+
+	/* Post receive buffers for desired rings */
+	lpfc_post_rcv_buf(phba);
+
+	/* Enable appropriate host interrupts */
+	spin_lock_irq(phba->host->host_lock);
+	status = readl(phba->HCregaddr);
+	status |= HC_MBINT_ENA | HC_ERINT_ENA | HC_LAINT_ENA;
+	if (psli->num_rings > 0)
+		status |= HC_R0INT_ENA;
+	if (psli->num_rings > 1)
+		status |= HC_R1INT_ENA;
+	if (psli->num_rings > 2)
+		status |= HC_R2INT_ENA;
+	if (psli->num_rings > 3)
+		status |= HC_R3INT_ENA;
+
+	if ((phba->cfg_poll & ENABLE_FCP_RING_POLLING) &&
+	    (phba->cfg_poll & DISABLE_FCP_RING_INT))
+		status &= ~(HC_R0INT_ENA << LPFC_FCP_RING);
+
+	writel(status, phba->HCregaddr);
+	readl(phba->HCregaddr); /* flush */
+	spin_unlock_irq(phba->host->host_lock);
+
+	/*
+	 * Setup the ring 0 (els)  timeout handler
+	 */
+	timeout = phba->fc_ratov << 1;
+	phba->els_tmofunc.expires = jiffies + HZ * timeout;
+	add_timer(&phba->els_tmofunc);
+
+	lpfc_init_link(phba, pmb, phba->cfg_topology, phba->cfg_link_speed);
+	pmb->mbox_cmpl = lpfc_sli_def_mbox_cmpl;
+	if (lpfc_sli_issue_mbox(phba, pmb, MBX_NOWAIT) != MBX_SUCCESS) {
+		lpfc_printf_log(phba,
+				KERN_ERR,
+				LOG_INIT,
+				"%d:0454 Adapter failed to init, mbxCmd x%x "
+				"INIT_LINK, mbxStatus x%x\n",
+				phba->brd_no,
+				mb->mbxCommand, mb->mbxStatus);
+
+		/* Clear all interrupt enable conditions */
+		writel(0, phba->HCregaddr);
+		readl(phba->HCregaddr); /* flush */
+		/* Clear all pending interrupts */
+		writel(0xffffffff, phba->HAregaddr);
+		readl(phba->HAregaddr); /* flush */
+
+		phba->hba_state = LPFC_HBA_ERROR;
+		mempool_free(pmb, phba->mbox_mem_pool);
+		return -EIO;
+	}
+	/* MBOX buffer will be freed in mbox compl */
+
+	i = 0;
+	while ((phba->hba_state != LPFC_HBA_READY) ||
+	       (phba->num_disc_nodes) || (phba->fc_prli_sent) ||
+	       ((phba->fc_map_cnt == 0) && (i<2)) ||
+	       (psli->sli_flag & LPFC_SLI_MBOX_ACTIVE)) {
+		/* Check every second for 30 retries. */
+		i++;
+		if (i > 30) {
+			break;
+		}
+		if ((i >= 15) && (phba->hba_state <= LPFC_LINK_DOWN)) {
+			/* The link is down.  Set linkdown timeout */
+			break;
+		}
+
+		/* Delay for 1 second to give discovery time to complete. */
+		msleep(1000);
+
+	}
+
+	/* Since num_disc_nodes keys off of PLOGI, delay a bit to let
+	 * any potential PRLIs to flush thru the SLI sub-system.
+	 */
+	msleep(50);
+
+	return (0);
+}
+
+/************************************************************************/
+/*                                                                      */
+/*    lpfc_hba_down_prep                                                */
+/*    This routine will do LPFC uninitialization before the             */
+/*    HBA is reset when bringing down the SLI Layer. This will be       */
+/*    initialized as a SLI layer callback routine.                      */
+/*    This routine returns 0 on success. Any other return value         */
+/*    indicates an error.                                               */
+/*                                                                      */
+/************************************************************************/
+int
+lpfc_hba_down_prep(struct lpfc_hba * phba)
+{
+	/* Disable interrupts */
+	writel(0, phba->HCregaddr);
+	readl(phba->HCregaddr); /* flush */
+
+	/* Cleanup potential discovery resources */
+	lpfc_els_flush_rscn(phba);
+	lpfc_els_flush_cmd(phba);
+	lpfc_disc_flush_list(phba);
+
+	/* Disable SLI2 since we disabled interrupts */
+	phba->sli.sli_flag &= ~LPFC_SLI2_ACTIVE;
+	return (0);
+}
+
+/************************************************************************/
+/*                                                                      */
+/*    lpfc_hba_down_post                                                */
+/*    This routine will do uninitialization after the HBA is reset      */
+/*    when bringing down the SLI Layer.                                 */
+/*    This routine returns 0 on success. Any other return value         */
+/*    indicates an error.                                               */
+/*                                                                      */
+/************************************************************************/
+int
+lpfc_hba_down_post(struct lpfc_hba * phba)
+{
+	struct lpfc_sli *psli = &phba->sli;
+	struct lpfc_sli_ring *pring;
+	struct lpfc_dmabuf *mp, *next_mp;
+	int i;
+
+	/* Cleanup preposted buffers on the ELS ring */
+	pring = &psli->ring[LPFC_ELS_RING];
+	list_for_each_entry_safe(mp, next_mp, &pring->postbufq, list) {
+		list_del(&mp->list);
+		pring->postbufq_cnt--;
+		lpfc_mbuf_free(phba, mp->virt, mp->phys);
+		kfree(mp);
+	}
+
+	for (i = 0; i < psli->num_rings; i++) {
+		pring = &psli->ring[i];
+		lpfc_sli_abort_iocb_ring(phba, pring);
+	}
+
+	return 0;
+}
+
+/************************************************************************/
+/*                                                                      */
+/*    lpfc_handle_eratt                                                 */
+/*    This routine will handle processing a Host Attention              */
+/*    Error Status event. This will be initialized                      */
+/*    as a SLI layer callback routine.                                  */
+/*                                                                      */
+/************************************************************************/
+void
+lpfc_handle_eratt(struct lpfc_hba * phba)
+{
+	struct lpfc_sli *psli = &phba->sli;
+	struct lpfc_sli_ring  *pring;
+
+	if (phba->work_hs & HS_FFER6) {
+		/* Re-establishing Link */
+		lpfc_printf_log(phba, KERN_INFO, LOG_LINK_EVENT,
+				"%d:1301 Re-establishing Link "
+				"Data: x%x x%x x%x\n",
+				phba->brd_no, phba->work_hs,
+				phba->work_status[0], phba->work_status[1]);
+		spin_lock_irq(phba->host->host_lock);
+		phba->fc_flag |= FC_ESTABLISH_LINK;
+		spin_unlock_irq(phba->host->host_lock);
+
+		/*
+		* Firmware stops when it triggled erratt with HS_FFER6.
+		* That could cause the I/Os dropped by the firmware.
+		* Error iocb (I/O) on txcmplq and let the SCSI layer
+		* retry it after re-establishing link.
+		*/
+		pring = &psli->ring[psli->fcp_ring];
+		lpfc_sli_abort_iocb_ring(phba, pring);
+
+
+		/*
+		 * There was a firmware error.  Take the hba offline and then
+		 * attempt to restart it.
+		 */
+		lpfc_offline(phba);
+		lpfc_sli_brdrestart(phba);
+		if (lpfc_online(phba) == 0) {	/* Initialize the HBA */
+			mod_timer(&phba->fc_estabtmo, jiffies + HZ * 60);
+			return;
+		}
+	} else {
+		/* The if clause above forces this code path when the status
+		 * failure is a value other than FFER6.  Do not call the offline
+		 *  twice. This is the adapter hardware error path.
+		 */
+		lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
+				"%d:0457 Adapter Hardware Error "
+				"Data: x%x x%x x%x\n",
+				phba->brd_no, phba->work_hs,
+				phba->work_status[0], phba->work_status[1]);
+
+		lpfc_offline(phba);
+		phba->hba_state = LPFC_HBA_ERROR;
+		lpfc_hba_down_post(phba);
+	}
+}
+
+/************************************************************************/
+/*                                                                      */
+/*    lpfc_handle_latt                                                  */
+/*    This routine will handle processing a Host Attention              */
+/*    Link Status event. This will be initialized                       */
+/*    as a SLI layer callback routine.                                  */
+/*                                                                      */
+/************************************************************************/
+void
+lpfc_handle_latt(struct lpfc_hba * phba)
+{
+	struct lpfc_sli *psli = &phba->sli;
+	LPFC_MBOXQ_t *pmb;
+	volatile uint32_t control;
+	struct lpfc_dmabuf *mp;
+	int rc = -ENOMEM;
+
+	pmb = (LPFC_MBOXQ_t *)mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL);
+	if (!pmb)
+		goto lpfc_handle_latt_err_exit;
+
+	mp = kmalloc(sizeof(struct lpfc_dmabuf), GFP_KERNEL);
+	if (!mp)
+		goto lpfc_handle_latt_free_pmb;
+
+	mp->virt = lpfc_mbuf_alloc(phba, 0, &mp->phys);
+	if (!mp->virt)
+		goto lpfc_handle_latt_free_mp;
+
+	rc = -EIO;
+
+	/* Cleanup any outstanding ELS commands */
+	lpfc_els_flush_cmd(phba);
+
+	psli->slistat.link_event++;
+	lpfc_read_la(phba, pmb, mp);
+	pmb->mbox_cmpl = lpfc_mbx_cmpl_read_la;
+	rc = lpfc_sli_issue_mbox (phba, pmb, (MBX_NOWAIT | MBX_STOP_IOCB));
+	if (rc == MBX_NOT_FINISHED)
+		goto lpfc_handle_latt_free_mp;
+
+	/* Clear Link Attention in HA REG */
+	spin_lock_irq(phba->host->host_lock);
+	writel(HA_LATT, phba->HAregaddr);
+	readl(phba->HAregaddr); /* flush */
+	spin_unlock_irq(phba->host->host_lock);
+
+	return;
+
+lpfc_handle_latt_free_mp:
+	kfree(mp);
+lpfc_handle_latt_free_pmb:
+	kfree(pmb);
+lpfc_handle_latt_err_exit:
+	/* Enable Link attention interrupts */
+	spin_lock_irq(phba->host->host_lock);
+	psli->sli_flag |= LPFC_PROCESS_LA;
+	control = readl(phba->HCregaddr);
+	control |= HC_LAINT_ENA;
+	writel(control, phba->HCregaddr);
+	readl(phba->HCregaddr); /* flush */
+
+	/* Clear Link Attention in HA REG */
+	writel(HA_LATT, phba->HAregaddr);
+	readl(phba->HAregaddr); /* flush */
+	spin_unlock_irq(phba->host->host_lock);
+	lpfc_linkdown(phba);
+	phba->hba_state = LPFC_HBA_ERROR;
+
+	/* The other case is an error from issue_mbox */
+	if (rc == -ENOMEM)
+		lpfc_printf_log(phba,
+				KERN_WARNING,
+				LOG_MBOX,
+			        "%d:0300 READ_LA: no buffers\n",
+				phba->brd_no);
+
+	return;
+}
+
+/************************************************************************/
+/*                                                                      */
+/*   lpfc_parse_vpd                                                     */
+/*   This routine will parse the VPD data                               */
+/*                                                                      */
+/************************************************************************/
+static int
+lpfc_parse_vpd(struct lpfc_hba * phba, uint8_t * vpd, int len)
+{
+	uint8_t lenlo, lenhi;
+	uint32_t Length;
+	int i, j;
+	int finished = 0;
+	int index = 0;
+
+	if (!vpd)
+		return 0;
+
+	/* Vital Product */
+	lpfc_printf_log(phba,
+			KERN_INFO,
+			LOG_INIT,
+			"%d:0455 Vital Product Data: x%x x%x x%x x%x\n",
+			phba->brd_no,
+			(uint32_t) vpd[0], (uint32_t) vpd[1], (uint32_t) vpd[2],
+			(uint32_t) vpd[3]);
+	while (!finished && (index < (len - 4))) {
+		switch (vpd[index]) {
+		case 0x82:
+		case 0x91:
+			index += 1;
+			lenlo = vpd[index];
+			index += 1;
+			lenhi = vpd[index];
+			index += 1;
+			i = ((((unsigned short)lenhi) << 8) + lenlo);
+			index += i;
+			break;
+		case 0x90:
+			index += 1;
+			lenlo = vpd[index];
+			index += 1;
+			lenhi = vpd[index];
+			index += 1;
+			Length = ((((unsigned short)lenhi) << 8) + lenlo);
+			if (Length > len - index)
+				Length = len - index;
+			while (Length > 0) {
+			/* Look for Serial Number */
+			if ((vpd[index] == 'S') && (vpd[index+1] == 'N')) {
+				index += 2;
+				i = vpd[index];
+				index += 1;
+				j = 0;
+				Length -= (3+i);
+				while(i--) {
+					phba->SerialNumber[j++] = vpd[index++];
+					if (j == 31)
+						break;
+				}
+				phba->SerialNumber[j] = 0;
+				continue;
+			}
+			else if ((vpd[index] == 'V') && (vpd[index+1] == '1')) {
+				phba->vpd_flag |= VPD_MODEL_DESC;
+				index += 2;
+				i = vpd[index];
+				index += 1;
+				j = 0;
+				Length -= (3+i);
+				while(i--) {
+					phba->ModelDesc[j++] = vpd[index++];
+					if (j == 255)
+						break;
+				}
+				phba->ModelDesc[j] = 0;
+				continue;
+			}
+			else if ((vpd[index] == 'V') && (vpd[index+1] == '2')) {
+				phba->vpd_flag |= VPD_MODEL_NAME;
+				index += 2;
+				i = vpd[index];
+				index += 1;
+				j = 0;
+				Length -= (3+i);
+				while(i--) {
+					phba->ModelName[j++] = vpd[index++];
+					if (j == 79)
+						break;
+				}
+				phba->ModelName[j] = 0;
+				continue;
+			}
+			else if ((vpd[index] == 'V') && (vpd[index+1] == '3')) {
+				phba->vpd_flag |= VPD_PROGRAM_TYPE;
+				index += 2;
+				i = vpd[index];
+				index += 1;
+				j = 0;
+				Length -= (3+i);
+				while(i--) {
+					phba->ProgramType[j++] = vpd[index++];
+					if (j == 255)
+						break;
+				}
+				phba->ProgramType[j] = 0;
+				continue;
+			}
+			else if ((vpd[index] == 'V') && (vpd[index+1] == '4')) {
+				phba->vpd_flag |= VPD_PORT;
+				index += 2;
+				i = vpd[index];
+				index += 1;
+				j = 0;
+				Length -= (3+i);
+				while(i--) {
+				phba->Port[j++] = vpd[index++];
+				if (j == 19)
+					break;
+				}
+				phba->Port[j] = 0;
+				continue;
+			}
+			else {
+				index += 2;
+				i = vpd[index];
+				index += 1;
+				index += i;
+				Length -= (3 + i);
+			}
+		}
+		finished = 0;
+		break;
+		case 0x78:
+			finished = 1;
+			break;
+		default:
+			index ++;
+			break;
+		}
+	}
+
+	return(1);
+}
+
+static void
+lpfc_get_hba_model_desc(struct lpfc_hba * phba, uint8_t * mdp, uint8_t * descp)
+{
+	lpfc_vpd_t *vp;
+	uint16_t dev_id = phba->pcidev->device;
+	uint16_t dev_subid = phba->pcidev->subsystem_device;
+	uint8_t hdrtype;
+	int max_speed;
+	char * ports;
+	struct {
+		char * name;
+		int    max_speed;
+		char * ports;
+		char * bus;
+	} m;
+
+	pci_read_config_byte(phba->pcidev, PCI_HEADER_TYPE, &hdrtype);
+	ports = (hdrtype == 0x80) ? "2-port " : "";
+	if (mdp && mdp[0] != '\0'
+		&& descp && descp[0] != '\0')
+		return;
+
+	if (phba->lmt & LMT_10Gb)
+		max_speed = 10;
+	else if (phba->lmt & LMT_8Gb)
+		max_speed = 8;
+	else if (phba->lmt & LMT_4Gb)
+		max_speed = 4;
+	else if (phba->lmt & LMT_2Gb)
+		max_speed = 2;
+	else
+		max_speed = 1;
+
+	vp = &phba->vpd;
+
+	switch (dev_id) {
+	case PCI_DEVICE_ID_FIREFLY:
+		m = (typeof(m)){"LP6000", max_speed, "", "PCI"};
+		break;
+	case PCI_DEVICE_ID_SUPERFLY:
+		if (vp->rev.biuRev >= 1 && vp->rev.biuRev <= 3)
+			m = (typeof(m)){"LP7000", max_speed, "", "PCI"};
+		else
+			m = (typeof(m)){"LP7000E", max_speed, "", "PCI"};
+		break;
+	case PCI_DEVICE_ID_DRAGONFLY:
+		m = (typeof(m)){"LP8000", max_speed, "", "PCI"};
+		break;
+	case PCI_DEVICE_ID_CENTAUR:
+		if (FC_JEDEC_ID(vp->rev.biuRev) == CENTAUR_2G_JEDEC_ID)
+			m = (typeof(m)){"LP9002", max_speed, "", "PCI"};
+		else
+			m = (typeof(m)){"LP9000", max_speed, "", "PCI"};
+		break;
+	case PCI_DEVICE_ID_RFLY:
+		m = (typeof(m)){"LP952", max_speed, "", "PCI"};
+		break;
+	case PCI_DEVICE_ID_PEGASUS:
+		m = (typeof(m)){"LP9802", max_speed, "", "PCI-X"};
+		break;
+	case PCI_DEVICE_ID_THOR:
+		if (hdrtype == 0x80)
+			m = (typeof(m)){"LP10000DC",
+					max_speed, ports, "PCI-X"};
+		else
+			m = (typeof(m)){"LP10000",
+					max_speed, ports, "PCI-X"};
+		break;
+	case PCI_DEVICE_ID_VIPER:
+		m = (typeof(m)){"LPX1000", max_speed, "", "PCI-X"};
+		break;
+	case PCI_DEVICE_ID_PFLY:
+		m = (typeof(m)){"LP982", max_speed, "", "PCI-X"};
+		break;
+	case PCI_DEVICE_ID_TFLY:
+		if (hdrtype == 0x80)
+			m = (typeof(m)){"LP1050DC", max_speed, ports, "PCI-X"};
+		else
+			m = (typeof(m)){"LP1050", max_speed, ports, "PCI-X"};
+		break;
+	case PCI_DEVICE_ID_HELIOS:
+		if (hdrtype == 0x80)
+			m = (typeof(m)){"LP11002", max_speed, ports, "PCI-X2"};
+		else
+			m = (typeof(m)){"LP11000", max_speed, ports, "PCI-X2"};
+		break;
+	case PCI_DEVICE_ID_HELIOS_SCSP:
+		m = (typeof(m)){"LP11000-SP", max_speed, ports, "PCI-X2"};
+		break;
+	case PCI_DEVICE_ID_HELIOS_DCSP:
+		m = (typeof(m)){"LP11002-SP", max_speed, ports, "PCI-X2"};
+		break;
+	case PCI_DEVICE_ID_NEPTUNE:
+		if (hdrtype == 0x80)
+			m = (typeof(m)){"LPe1002", max_speed, ports, "PCIe"};
+		else
+			m = (typeof(m)){"LPe1000", max_speed, ports, "PCIe"};
+		break;
+	case PCI_DEVICE_ID_NEPTUNE_SCSP:
+		m = (typeof(m)){"LPe1000-SP", max_speed, ports, "PCIe"};
+		break;
+	case PCI_DEVICE_ID_NEPTUNE_DCSP:
+		m = (typeof(m)){"LPe1002-SP", max_speed, ports, "PCIe"};
+		break;
+	case PCI_DEVICE_ID_BMID:
+		m = (typeof(m)){"LP1150", max_speed, ports, "PCI-X2"};
+		break;
+	case PCI_DEVICE_ID_BSMB:
+		m = (typeof(m)){"LP111", max_speed, ports, "PCI-X2"};
+		break;
+	case PCI_DEVICE_ID_ZEPHYR:
+		if (hdrtype == 0x80)
+			m = (typeof(m)){"LPe11002", max_speed, ports, "PCIe"};
+		else
+			m = (typeof(m)){"LPe11000", max_speed, ports, "PCIe"};
+		break;
+	case PCI_DEVICE_ID_ZEPHYR_SCSP:
+		m = (typeof(m)){"LPe11000", max_speed, ports, "PCIe"};
+		break;
+	case PCI_DEVICE_ID_ZEPHYR_DCSP:
+		m = (typeof(m)){"LPe11002-SP", max_speed, ports, "PCIe"};
+		break;
+	case PCI_DEVICE_ID_ZMID:
+		m = (typeof(m)){"LPe1150", max_speed, ports, "PCIe"};
+		break;
+	case PCI_DEVICE_ID_ZSMB:
+		m = (typeof(m)){"LPe111", max_speed, ports, "PCIe"};
+		break;
+	case PCI_DEVICE_ID_LP101:
+		m = (typeof(m)){"LP101", max_speed, ports, "PCI-X"};
+		break;
+	case PCI_DEVICE_ID_LP10000S:
+		m = (typeof(m)){"LP10000-S", max_speed, ports, "PCI"};
+		break;
+	case PCI_DEVICE_ID_LP11000S:
+	case PCI_DEVICE_ID_LPE11000S:
+		switch (dev_subid) {
+		case PCI_SUBSYSTEM_ID_LP11000S:
+			m = (typeof(m)){"LP11000-S", max_speed,
+					ports, "PCI-X2"};
+			break;
+		case PCI_SUBSYSTEM_ID_LP11002S:
+			m = (typeof(m)){"LP11002-S", max_speed,
+					ports, "PCI-X2"};
+			break;
+		case PCI_SUBSYSTEM_ID_LPE11000S:
+			m = (typeof(m)){"LPe11000-S", max_speed,
+					ports, "PCIe"};
+			break;
+		case PCI_SUBSYSTEM_ID_LPE11002S:
+			m = (typeof(m)){"LPe11002-S", max_speed,
+					ports, "PCIe"};
+			break;
+		case PCI_SUBSYSTEM_ID_LPE11010S:
+			m = (typeof(m)){"LPe11010-S", max_speed,
+					"10-port ", "PCIe"};
+			break;
+		default:
+			m = (typeof(m)){ 0 };
+			break;
+		}
+		break;
+	default:
+		m = (typeof(m)){ 0 };
+		break;
+	}
+
+	if (mdp && mdp[0] == '\0')
+		snprintf(mdp, 79,"%s", m.name);
+	if (descp && descp[0] == '\0')
+		snprintf(descp, 255,
+			 "Emulex %s %dGb %s%s Fibre Channel Adapter",
+			 m.name, m.max_speed, m.ports, m.bus);
+}
+
+/**************************************************/
+/*   lpfc_post_buffer                             */
+/*                                                */
+/*   This routine will post count buffers to the  */
+/*   ring with the QUE_RING_BUF_CN command. This  */
+/*   allows 3 buffers / command to be posted.     */
+/*   Returns the number of buffers NOT posted.    */
+/**************************************************/
+int
+lpfc_post_buffer(struct lpfc_hba * phba, struct lpfc_sli_ring * pring, int cnt,
+		 int type)
+{
+	IOCB_t *icmd;
+	struct lpfc_iocbq *iocb;
+	struct lpfc_dmabuf *mp1, *mp2;
+
+	cnt += pring->missbufcnt;
+
+	/* While there are buffers to post */
+	while (cnt > 0) {
+		/* Allocate buffer for  command iocb */
+		spin_lock_irq(phba->host->host_lock);
+		iocb = lpfc_sli_get_iocbq(phba);
+		spin_unlock_irq(phba->host->host_lock);
+		if (iocb == NULL) {
+			pring->missbufcnt = cnt;
+			return cnt;
+		}
+		icmd = &iocb->iocb;
+
+		/* 2 buffers can be posted per command */
+		/* Allocate buffer to post */
+		mp1 = kmalloc(sizeof (struct lpfc_dmabuf), GFP_KERNEL);
+		if (mp1)
+		    mp1->virt = lpfc_mbuf_alloc(phba, MEM_PRI,
+						&mp1->phys);
+		if (mp1 == 0 || mp1->virt == 0) {
+			kfree(mp1);
+			spin_lock_irq(phba->host->host_lock);
+			lpfc_sli_release_iocbq(phba, iocb);
+			spin_unlock_irq(phba->host->host_lock);
+			pring->missbufcnt = cnt;
+			return cnt;
+		}
+
+		INIT_LIST_HEAD(&mp1->list);
+		/* Allocate buffer to post */
+		if (cnt > 1) {
+			mp2 = kmalloc(sizeof (struct lpfc_dmabuf), GFP_KERNEL);
+			if (mp2)
+				mp2->virt = lpfc_mbuf_alloc(phba, MEM_PRI,
+							    &mp2->phys);
+			if (mp2 == 0 || mp2->virt == 0) {
+				kfree(mp2);
+				lpfc_mbuf_free(phba, mp1->virt, mp1->phys);
+				kfree(mp1);
+				spin_lock_irq(phba->host->host_lock);
+				lpfc_sli_release_iocbq(phba, iocb);
+				spin_unlock_irq(phba->host->host_lock);
+				pring->missbufcnt = cnt;
+				return cnt;
+			}
+
+			INIT_LIST_HEAD(&mp2->list);
+		} else {
+			mp2 = NULL;
+		}
+
+		icmd->un.cont64[0].addrHigh = putPaddrHigh(mp1->phys);
+		icmd->un.cont64[0].addrLow = putPaddrLow(mp1->phys);
+		icmd->un.cont64[0].tus.f.bdeSize = FCELSSIZE;
+		icmd->ulpBdeCount = 1;
+		cnt--;
+		if (mp2) {
+			icmd->un.cont64[1].addrHigh = putPaddrHigh(mp2->phys);
+			icmd->un.cont64[1].addrLow = putPaddrLow(mp2->phys);
+			icmd->un.cont64[1].tus.f.bdeSize = FCELSSIZE;
+			cnt--;
+			icmd->ulpBdeCount = 2;
+		}
+
+		icmd->ulpCommand = CMD_QUE_RING_BUF64_CN;
+		icmd->ulpLe = 1;
+
+		spin_lock_irq(phba->host->host_lock);
+		if (lpfc_sli_issue_iocb(phba, pring, iocb, 0) == IOCB_ERROR) {
+			lpfc_mbuf_free(phba, mp1->virt, mp1->phys);
+			kfree(mp1);
+			cnt++;
+			if (mp2) {
+				lpfc_mbuf_free(phba, mp2->virt, mp2->phys);
+				kfree(mp2);
+				cnt++;
+			}
+			lpfc_sli_release_iocbq(phba, iocb);
+			pring->missbufcnt = cnt;
+			spin_unlock_irq(phba->host->host_lock);
+			return cnt;
+		}
+		spin_unlock_irq(phba->host->host_lock);
+		lpfc_sli_ringpostbuf_put(phba, pring, mp1);
+		if (mp2) {
+			lpfc_sli_ringpostbuf_put(phba, pring, mp2);
+		}
+	}
+	pring->missbufcnt = 0;
+	return 0;
+}
+
+/************************************************************************/
+/*                                                                      */
+/*   lpfc_post_rcv_buf                                                  */
+/*   This routine post initial rcv buffers to the configured rings      */
+/*                                                                      */
+/************************************************************************/
+static int
+lpfc_post_rcv_buf(struct lpfc_hba * phba)
+{
+	struct lpfc_sli *psli = &phba->sli;
+
+	/* Ring 0, ELS / CT buffers */
+	lpfc_post_buffer(phba, &psli->ring[LPFC_ELS_RING], LPFC_BUF_RING0, 1);
+	/* Ring 2 - FCP no buffers needed */
+
+	return 0;
+}
+
+#define S(N,V) (((V)<<(N))|((V)>>(32-(N))))
+
+/************************************************************************/
+/*                                                                      */
+/*   lpfc_sha_init                                                      */
+/*                                                                      */
+/************************************************************************/
+static void
+lpfc_sha_init(uint32_t * HashResultPointer)
+{
+	HashResultPointer[0] = 0x67452301;
+	HashResultPointer[1] = 0xEFCDAB89;
+	HashResultPointer[2] = 0x98BADCFE;
+	HashResultPointer[3] = 0x10325476;
+	HashResultPointer[4] = 0xC3D2E1F0;
+}
+
+/************************************************************************/
+/*                                                                      */
+/*   lpfc_sha_iterate                                                   */
+/*                                                                      */
+/************************************************************************/
+static void
+lpfc_sha_iterate(uint32_t * HashResultPointer, uint32_t * HashWorkingPointer)
+{
+	int t;
+	uint32_t TEMP;
+	uint32_t A, B, C, D, E;
+	t = 16;
+	do {
+		HashWorkingPointer[t] =
+		    S(1,
+		      HashWorkingPointer[t - 3] ^ HashWorkingPointer[t -
+								     8] ^
+		      HashWorkingPointer[t - 14] ^ HashWorkingPointer[t - 16]);
+	} while (++t <= 79);
+	t = 0;
+	A = HashResultPointer[0];
+	B = HashResultPointer[1];
+	C = HashResultPointer[2];
+	D = HashResultPointer[3];
+	E = HashResultPointer[4];
+
+	do {
+		if (t < 20) {
+			TEMP = ((B & C) | ((~B) & D)) + 0x5A827999;
+		} else if (t < 40) {
+			TEMP = (B ^ C ^ D) + 0x6ED9EBA1;
+		} else if (t < 60) {
+			TEMP = ((B & C) | (B & D) | (C & D)) + 0x8F1BBCDC;
+		} else {
+			TEMP = (B ^ C ^ D) + 0xCA62C1D6;
+		}
+		TEMP += S(5, A) + E + HashWorkingPointer[t];
+		E = D;
+		D = C;
+		C = S(30, B);
+		B = A;
+		A = TEMP;
+	} while (++t <= 79);
+
+	HashResultPointer[0] += A;
+	HashResultPointer[1] += B;
+	HashResultPointer[2] += C;
+	HashResultPointer[3] += D;
+	HashResultPointer[4] += E;
+
+}
+
+/************************************************************************/
+/*                                                                      */
+/*   lpfc_challenge_key                                                 */
+/*                                                                      */
+/************************************************************************/
+static void
+lpfc_challenge_key(uint32_t * RandomChallenge, uint32_t * HashWorking)
+{
+	*HashWorking = (*RandomChallenge ^ *HashWorking);
+}
+
+/************************************************************************/
+/*                                                                      */
+/*   lpfc_hba_init                                                      */
+/*                                                                      */
+/************************************************************************/
+void
+lpfc_hba_init(struct lpfc_hba *phba, uint32_t *hbainit)
+{
+	int t;
+	uint32_t *HashWorking;
+	uint32_t *pwwnn = phba->wwnn;
+
+	HashWorking = kmalloc(80 * sizeof(uint32_t), GFP_KERNEL);
+	if (!HashWorking)
+		return;
+
+	memset(HashWorking, 0, (80 * sizeof(uint32_t)));
+	HashWorking[0] = HashWorking[78] = *pwwnn++;
+	HashWorking[1] = HashWorking[79] = *pwwnn;
+
+	for (t = 0; t < 7; t++)
+		lpfc_challenge_key(phba->RandomData + t, HashWorking + t);
+
+	lpfc_sha_init(hbainit);
+	lpfc_sha_iterate(hbainit, HashWorking);
+	kfree(HashWorking);
+}
+
+static void
+lpfc_cleanup(struct lpfc_hba * phba, uint32_t save_bind)
+{
+	struct lpfc_nodelist *ndlp, *next_ndlp;
+
+	/* clean up phba - lpfc specific */
+	lpfc_can_disctmo(phba);
+	list_for_each_entry_safe(ndlp, next_ndlp, &phba->fc_nlpunmap_list,
+				nlp_listp) {
+		lpfc_nlp_remove(phba, ndlp);
+	}
+
+	list_for_each_entry_safe(ndlp, next_ndlp, &phba->fc_nlpmap_list,
+				 nlp_listp) {
+		lpfc_nlp_remove(phba, ndlp);
+	}
+
+	list_for_each_entry_safe(ndlp, next_ndlp, &phba->fc_unused_list,
+				nlp_listp) {
+		lpfc_nlp_list(phba, ndlp, NLP_NO_LIST);
+	}
+
+	list_for_each_entry_safe(ndlp, next_ndlp, &phba->fc_plogi_list,
+				nlp_listp) {
+		lpfc_nlp_remove(phba, ndlp);
+	}
+
+	list_for_each_entry_safe(ndlp, next_ndlp, &phba->fc_adisc_list,
+				nlp_listp) {
+		lpfc_nlp_remove(phba, ndlp);
+	}
+
+	list_for_each_entry_safe(ndlp, next_ndlp, &phba->fc_reglogin_list,
+				nlp_listp) {
+		lpfc_nlp_remove(phba, ndlp);
+	}
+
+	list_for_each_entry_safe(ndlp, next_ndlp, &phba->fc_prli_list,
+				nlp_listp) {
+		lpfc_nlp_remove(phba, ndlp);
+	}
+
+	list_for_each_entry_safe(ndlp, next_ndlp, &phba->fc_npr_list,
+				nlp_listp) {
+		lpfc_nlp_remove(phba, ndlp);
+	}
+
+	INIT_LIST_HEAD(&phba->fc_nlpmap_list);
+	INIT_LIST_HEAD(&phba->fc_nlpunmap_list);
+	INIT_LIST_HEAD(&phba->fc_unused_list);
+	INIT_LIST_HEAD(&phba->fc_plogi_list);
+	INIT_LIST_HEAD(&phba->fc_adisc_list);
+	INIT_LIST_HEAD(&phba->fc_reglogin_list);
+	INIT_LIST_HEAD(&phba->fc_prli_list);
+	INIT_LIST_HEAD(&phba->fc_npr_list);
+
+	phba->fc_map_cnt   = 0;
+	phba->fc_unmap_cnt = 0;
+	phba->fc_plogi_cnt = 0;
+	phba->fc_adisc_cnt = 0;
+	phba->fc_reglogin_cnt = 0;
+	phba->fc_prli_cnt  = 0;
+	phba->fc_npr_cnt   = 0;
+	phba->fc_unused_cnt= 0;
+	return;
+}
+
+static void
+lpfc_establish_link_tmo(unsigned long ptr)
+{
+	struct lpfc_hba *phba = (struct lpfc_hba *)ptr;
+	unsigned long iflag;
+
+
+	/* Re-establishing Link, timer expired */
+	lpfc_printf_log(phba, KERN_ERR, LOG_LINK_EVENT,
+			"%d:1300 Re-establishing Link, timer expired "
+			"Data: x%x x%x\n",
+			phba->brd_no, phba->fc_flag, phba->hba_state);
+	spin_lock_irqsave(phba->host->host_lock, iflag);
+	phba->fc_flag &= ~FC_ESTABLISH_LINK;
+	spin_unlock_irqrestore(phba->host->host_lock, iflag);
+}
+
+static int
+lpfc_stop_timer(struct lpfc_hba * phba)
+{
+	struct lpfc_sli *psli = &phba->sli;
+
+	/* Instead of a timer, this has been converted to a
+	 * deferred procedding list.
+	 */
+	while (!list_empty(&phba->freebufList)) {
+
+		struct lpfc_dmabuf *mp = NULL;
+
+		list_remove_head((&phba->freebufList), mp,
+				 struct lpfc_dmabuf, list);
+		if (mp) {
+			lpfc_mbuf_free(phba, mp->virt, mp->phys);
+			kfree(mp);
+		}
+	}
+
+	del_timer_sync(&phba->fcp_poll_timer);
+	del_timer_sync(&phba->fc_estabtmo);
+	del_timer_sync(&phba->fc_disctmo);
+	del_timer_sync(&phba->fc_fdmitmo);
+	del_timer_sync(&phba->els_tmofunc);
+	psli = &phba->sli;
+	del_timer_sync(&psli->mbox_tmo);
+	return(1);
+}
+
+int
+lpfc_online(struct lpfc_hba * phba)
+{
+	if (!phba)
+		return 0;
+
+	if (!(phba->fc_flag & FC_OFFLINE_MODE))
+		return 0;
+
+	lpfc_printf_log(phba,
+		       KERN_WARNING,
+		       LOG_INIT,
+		       "%d:0458 Bring Adapter online\n",
+		       phba->brd_no);
+
+	if (!lpfc_sli_queue_setup(phba))
+		return 1;
+
+	if (lpfc_sli_hba_setup(phba))	/* Initialize the HBA */
+		return 1;
+
+	spin_lock_irq(phba->host->host_lock);
+	phba->fc_flag &= ~FC_OFFLINE_MODE;
+	spin_unlock_irq(phba->host->host_lock);
+
+	return 0;
+}
+
+int
+lpfc_offline(struct lpfc_hba * phba)
+{
+	struct lpfc_sli_ring *pring;
+	struct lpfc_sli *psli;
+	unsigned long iflag;
+	int i = 0;
+
+	if (!phba)
+		return 0;
+
+	if (phba->fc_flag & FC_OFFLINE_MODE)
+		return 0;
+
+	psli = &phba->sli;
+	pring = &psli->ring[psli->fcp_ring];
+
+	lpfc_linkdown(phba);
+
+	/* The linkdown event takes 30 seconds to timeout. */
+	while (pring->txcmplq_cnt) {
+		mdelay(10);
+		if (i++ > 3000)
+			break;
+	}
+
+	/* stop all timers associated with this hba */
+	lpfc_stop_timer(phba);
+	phba->work_hba_events = 0;
+
+	lpfc_printf_log(phba,
+		       KERN_WARNING,
+		       LOG_INIT,
+		       "%d:0460 Bring Adapter offline\n",
+		       phba->brd_no);
+
+	/* Bring down the SLI Layer and cleanup.  The HBA is offline
+	   now.  */
+	lpfc_sli_hba_down(phba);
+	lpfc_cleanup(phba, 1);
+	spin_lock_irqsave(phba->host->host_lock, iflag);
+	phba->fc_flag |= FC_OFFLINE_MODE;
+	spin_unlock_irqrestore(phba->host->host_lock, iflag);
+	return 0;
+}
+
+/******************************************************************************
+* Function name: lpfc_scsi_free
+*
+* Description: Called from lpfc_pci_remove_one free internal driver resources
+*
+******************************************************************************/
+static int
+lpfc_scsi_free(struct lpfc_hba * phba)
+{
+	struct lpfc_scsi_buf *sb, *sb_next;
+	struct lpfc_iocbq *io, *io_next;
+
+	spin_lock_irq(phba->host->host_lock);
+	/* Release all the lpfc_scsi_bufs maintained by this host. */
+	list_for_each_entry_safe(sb, sb_next, &phba->lpfc_scsi_buf_list, list) {
+		list_del(&sb->list);
+		pci_pool_free(phba->lpfc_scsi_dma_buf_pool, sb->data,
+								sb->dma_handle);
+		kfree(sb);
+		phba->total_scsi_bufs--;
+	}
+
+	/* Release all the lpfc_iocbq entries maintained by this host. */
+	list_for_each_entry_safe(io, io_next, &phba->lpfc_iocb_list, list) {
+		list_del(&io->list);
+		kfree(io);
+		phba->total_iocbq_bufs--;
+	}
+
+	spin_unlock_irq(phba->host->host_lock);
+
+	return 0;
+}
+
+
+static int __devinit
+lpfc_pci_probe_one(struct pci_dev *pdev, const struct pci_device_id *pid)
+{
+	struct Scsi_Host *host;
+	struct lpfc_hba  *phba;
+	struct lpfc_sli  *psli;
+	struct lpfc_iocbq *iocbq_entry = NULL, *iocbq_next = NULL;
+	unsigned long bar0map_len, bar2map_len;
+	int error = -ENODEV, retval;
+	int i;
+	uint16_t iotag;
+
+	if (pci_enable_device(pdev))
+		goto out;
+	if (pci_request_regions(pdev, LPFC_DRIVER_NAME))
+		goto out_disable_device;
+
+	host = scsi_host_alloc(&lpfc_template, sizeof (struct lpfc_hba));
+	if (!host)
+		goto out_release_regions;
+
+	phba = (struct lpfc_hba*)host->hostdata;
+	memset(phba, 0, sizeof (struct lpfc_hba));
+	phba->host = host;
+
+	phba->fc_flag |= FC_LOADING;
+	phba->pcidev = pdev;
+
+	/* Assign an unused board number */
+	if (!idr_pre_get(&lpfc_hba_index, GFP_KERNEL))
+		goto out_put_host;
+
+	error = idr_get_new(&lpfc_hba_index, NULL, &phba->brd_no);
+	if (error)
+		goto out_put_host;
+
+	host->unique_id = phba->brd_no;
+	init_MUTEX(&phba->hba_can_block);
+	INIT_LIST_HEAD(&phba->ctrspbuflist);
+	INIT_LIST_HEAD(&phba->rnidrspbuflist);
+	INIT_LIST_HEAD(&phba->freebufList);
+
+	/* Initialize timers used by driver */
+	init_timer(&phba->fc_estabtmo);
+	phba->fc_estabtmo.function = lpfc_establish_link_tmo;
+	phba->fc_estabtmo.data = (unsigned long)phba;
+	init_timer(&phba->fc_disctmo);
+	phba->fc_disctmo.function = lpfc_disc_timeout;
+	phba->fc_disctmo.data = (unsigned long)phba;
+
+	init_timer(&phba->fc_fdmitmo);
+	phba->fc_fdmitmo.function = lpfc_fdmi_tmo;
+	phba->fc_fdmitmo.data = (unsigned long)phba;
+	init_timer(&phba->els_tmofunc);
+	phba->els_tmofunc.function = lpfc_els_timeout;
+	phba->els_tmofunc.data = (unsigned long)phba;
+	psli = &phba->sli;
+	init_timer(&psli->mbox_tmo);
+	psli->mbox_tmo.function = lpfc_mbox_timeout;
+	psli->mbox_tmo.data = (unsigned long)phba;
+
+	init_timer(&phba->fcp_poll_timer);
+	phba->fcp_poll_timer.function = lpfc_poll_timeout;
+	phba->fcp_poll_timer.data = (unsigned long)phba;
+
+	/*
+	 * Get all the module params for configuring this host and then
+	 * establish the host parameters.
+	 */
+	lpfc_get_cfgparam(phba);
+
+	host->max_id = LPFC_MAX_TARGET;
+	host->max_lun = phba->cfg_max_luns;
+	host->this_id = -1;
+
+	/* Initialize all internally managed lists. */
+	INIT_LIST_HEAD(&phba->fc_nlpmap_list);
+	INIT_LIST_HEAD(&phba->fc_nlpunmap_list);
+	INIT_LIST_HEAD(&phba->fc_unused_list);
+	INIT_LIST_HEAD(&phba->fc_plogi_list);
+	INIT_LIST_HEAD(&phba->fc_adisc_list);
+	INIT_LIST_HEAD(&phba->fc_reglogin_list);
+	INIT_LIST_HEAD(&phba->fc_prli_list);
+	INIT_LIST_HEAD(&phba->fc_npr_list);
+
+
+	pci_set_master(pdev);
+	retval = pci_set_mwi(pdev);
+	if (retval)
+		dev_printk(KERN_WARNING, &pdev->dev,
+			   "Warning: pci_set_mwi returned %d\n", retval);
+
+	if (pci_set_dma_mask(phba->pcidev, DMA_64BIT_MASK) != 0)
+		if (pci_set_dma_mask(phba->pcidev, DMA_32BIT_MASK) != 0)
+			goto out_idr_remove;
+
+	/*
+	 * Get the bus address of Bar0 and Bar2 and the number of bytes
+	 * required by each mapping.
+	 */
+	phba->pci_bar0_map = pci_resource_start(phba->pcidev, 0);
+	bar0map_len        = pci_resource_len(phba->pcidev, 0);
+
+	phba->pci_bar2_map = pci_resource_start(phba->pcidev, 2);
+	bar2map_len        = pci_resource_len(phba->pcidev, 2);
+
+	/* Map HBA SLIM to a kernel virtual address. */
+	phba->slim_memmap_p      = ioremap(phba->pci_bar0_map, bar0map_len);
+	if (!phba->slim_memmap_p) {
+		error = -ENODEV;
+		dev_printk(KERN_ERR, &pdev->dev,
+			   "ioremap failed for SLIM memory.\n");
+		goto out_idr_remove;
+	}
+
+	/* Map HBA Control Registers to a kernel virtual address. */
+	phba->ctrl_regs_memmap_p = ioremap(phba->pci_bar2_map, bar2map_len);
+	if (!phba->ctrl_regs_memmap_p) {
+		error = -ENODEV;
+		dev_printk(KERN_ERR, &pdev->dev,
+			   "ioremap failed for HBA control registers.\n");
+		goto out_iounmap_slim;
+	}
+
+	/* Allocate memory for SLI-2 structures */
+	phba->slim2p = dma_alloc_coherent(&phba->pcidev->dev, SLI2_SLIM_SIZE,
+					  &phba->slim2p_mapping, GFP_KERNEL);
+	if (!phba->slim2p)
+		goto out_iounmap;
+
+	memset(phba->slim2p, 0, SLI2_SLIM_SIZE);
+
+	/* Initialize the SLI Layer to run with lpfc HBAs. */
+	lpfc_sli_setup(phba);
+	lpfc_sli_queue_setup(phba);
+
+	error = lpfc_mem_alloc(phba);
+	if (error)
+		goto out_free_slim;
+
+	/* Initialize and populate the iocb list per host.  */
+	INIT_LIST_HEAD(&phba->lpfc_iocb_list);
+	for (i = 0; i < LPFC_IOCB_LIST_CNT; i++) {
+		iocbq_entry = kmalloc(sizeof(struct lpfc_iocbq), GFP_KERNEL);
+		if (iocbq_entry == NULL) {
+			printk(KERN_ERR "%s: only allocated %d iocbs of "
+				"expected %d count. Unloading driver.\n",
+				__FUNCTION__, i, LPFC_IOCB_LIST_CNT);
+			error = -ENOMEM;
+			goto out_free_iocbq;
+		}
+
+		memset(iocbq_entry, 0, sizeof(struct lpfc_iocbq));
+		iotag = lpfc_sli_next_iotag(phba, iocbq_entry);
+		if (iotag == 0) {
+			kfree (iocbq_entry);
+			printk(KERN_ERR "%s: failed to allocate IOTAG. "
+			       "Unloading driver.\n",
+				__FUNCTION__);
+			error = -ENOMEM;
+			goto out_free_iocbq;
+		}
+		spin_lock_irq(phba->host->host_lock);
+		list_add(&iocbq_entry->list, &phba->lpfc_iocb_list);
+		phba->total_iocbq_bufs++;
+		spin_unlock_irq(phba->host->host_lock);
+	}
+
+	/* Initialize HBA structure */
+	phba->fc_edtov = FF_DEF_EDTOV;
+	phba->fc_ratov = FF_DEF_RATOV;
+	phba->fc_altov = FF_DEF_ALTOV;
+	phba->fc_arbtov = FF_DEF_ARBTOV;
+
+	INIT_LIST_HEAD(&phba->work_list);
+	phba->work_ha_mask = (HA_ERATT|HA_MBATT|HA_LATT);
+	phba->work_ha_mask |= (HA_RXMASK << (LPFC_ELS_RING * 4));
+
+	/* Startup the kernel thread for this host adapter. */
+	phba->worker_thread = kthread_run(lpfc_do_work, phba,
+				       "lpfc_worker_%d", phba->brd_no);
+	if (IS_ERR(phba->worker_thread)) {
+		error = PTR_ERR(phba->worker_thread);
+		goto out_free_iocbq;
+	}
+
+	/* We can rely on a queue depth attribute only after SLI HBA setup */
+	host->can_queue = phba->cfg_hba_queue_depth - 10;
+
+	/* Tell the midlayer we support 16 byte commands */
+	host->max_cmd_len = 16;
+
+	/* Initialize the list of scsi buffers used by driver for scsi IO. */
+	spin_lock_init(&phba->scsi_buf_list_lock);
+	INIT_LIST_HEAD(&phba->lpfc_scsi_buf_list);
+
+	host->transportt = lpfc_transport_template;
+	pci_set_drvdata(pdev, host);
+	error = scsi_add_host(host, &pdev->dev);
+	if (error)
+		goto out_kthread_stop;
+
+	error = lpfc_alloc_sysfs_attr(phba);
+	if (error)
+		goto out_kthread_stop;
+
+	error =	request_irq(phba->pcidev->irq, lpfc_intr_handler, SA_SHIRQ,
+							LPFC_DRIVER_NAME, phba);
+	if (error) {
+		lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
+			"%d:0451 Enable interrupt handler failed\n",
+			phba->brd_no);
+		goto out_free_sysfs_attr;
+	}
+	phba->MBslimaddr = phba->slim_memmap_p;
+	phba->HAregaddr = phba->ctrl_regs_memmap_p + HA_REG_OFFSET;
+	phba->CAregaddr = phba->ctrl_regs_memmap_p + CA_REG_OFFSET;
+	phba->HSregaddr = phba->ctrl_regs_memmap_p + HS_REG_OFFSET;
+	phba->HCregaddr = phba->ctrl_regs_memmap_p + HC_REG_OFFSET;
+
+	error = lpfc_sli_hba_setup(phba);
+	if (error)
+		goto out_free_irq;
+
+	if (phba->cfg_poll & DISABLE_FCP_RING_INT) {
+		spin_lock_irq(phba->host->host_lock);
+		lpfc_poll_start_timer(phba);
+		spin_unlock_irq(phba->host->host_lock);
+	}
+
+	/*
+	 * set fixed host attributes
+	 * Must done after lpfc_sli_hba_setup()
+	 */
+
+	fc_host_node_name(host) = wwn_to_u64(phba->fc_nodename.u.wwn);
+	fc_host_port_name(host) = wwn_to_u64(phba->fc_portname.u.wwn);
+	fc_host_supported_classes(host) = FC_COS_CLASS3;
+
+	memset(fc_host_supported_fc4s(host), 0,
+		sizeof(fc_host_supported_fc4s(host)));
+	fc_host_supported_fc4s(host)[2] = 1;
+	fc_host_supported_fc4s(host)[7] = 1;
+
+	lpfc_get_hba_sym_node_name(phba, fc_host_symbolic_name(host));
+
+	fc_host_supported_speeds(host) = 0;
+	if (phba->lmt & LMT_10Gb)
+		fc_host_supported_speeds(host) |= FC_PORTSPEED_10GBIT;
+	if (phba->lmt & LMT_4Gb)
+		fc_host_supported_speeds(host) |= FC_PORTSPEED_4GBIT;
+	if (phba->lmt & LMT_2Gb)
+		fc_host_supported_speeds(host) |= FC_PORTSPEED_2GBIT;
+	if (phba->lmt & LMT_1Gb)
+		fc_host_supported_speeds(host) |= FC_PORTSPEED_1GBIT;
+
+	fc_host_maxframe_size(host) =
+		((((uint32_t) phba->fc_sparam.cmn.bbRcvSizeMsb & 0x0F) << 8) |
+		 (uint32_t) phba->fc_sparam.cmn.bbRcvSizeLsb);
+
+	/* This value is also unchanging */
+	memset(fc_host_active_fc4s(host), 0,
+		sizeof(fc_host_active_fc4s(host)));
+	fc_host_active_fc4s(host)[2] = 1;
+	fc_host_active_fc4s(host)[7] = 1;
+
+	spin_lock_irq(phba->host->host_lock);
+	phba->fc_flag &= ~FC_LOADING;
+	spin_unlock_irq(phba->host->host_lock);
+	return 0;
+
+out_free_irq:
+	lpfc_stop_timer(phba);
+	phba->work_hba_events = 0;
+	free_irq(phba->pcidev->irq, phba);
+out_free_sysfs_attr:
+	lpfc_free_sysfs_attr(phba);
+out_kthread_stop:
+	kthread_stop(phba->worker_thread);
+out_free_iocbq:
+	list_for_each_entry_safe(iocbq_entry, iocbq_next,
+						&phba->lpfc_iocb_list, list) {
+		spin_lock_irq(phba->host->host_lock);
+		kfree(iocbq_entry);
+		phba->total_iocbq_bufs--;
+		spin_unlock_irq(phba->host->host_lock);
+	}
+	lpfc_mem_free(phba);
+out_free_slim:
+	dma_free_coherent(&pdev->dev, SLI2_SLIM_SIZE, phba->slim2p,
+							phba->slim2p_mapping);
+out_iounmap:
+	iounmap(phba->ctrl_regs_memmap_p);
+out_iounmap_slim:
+	iounmap(phba->slim_memmap_p);
+out_idr_remove:
+	idr_remove(&lpfc_hba_index, phba->brd_no);
+out_put_host:
+	scsi_host_put(host);
+out_release_regions:
+	pci_release_regions(pdev);
+out_disable_device:
+	pci_disable_device(pdev);
+out:
+	return error;
+}
+
+static void __devexit
+lpfc_pci_remove_one(struct pci_dev *pdev)
+{
+	struct Scsi_Host   *host = pci_get_drvdata(pdev);
+	struct lpfc_hba    *phba = (struct lpfc_hba *)host->hostdata;
+	unsigned long iflag;
+
+	lpfc_free_sysfs_attr(phba);
+
+	spin_lock_irqsave(phba->host->host_lock, iflag);
+	phba->fc_flag |= FC_UNLOADING;
+
+	spin_unlock_irqrestore(phba->host->host_lock, iflag);
+
+	fc_remove_host(phba->host);
+	scsi_remove_host(phba->host);
+
+	kthread_stop(phba->worker_thread);
+
+	/*
+	 * Bring down the SLI Layer. This step disable all interrupts,
+	 * clears the rings, discards all mailbox commands, and resets
+	 * the HBA.
+	 */
+	lpfc_sli_hba_down(phba);
+	lpfc_sli_brdrestart(phba);
+
+	/* Release the irq reservation */
+	free_irq(phba->pcidev->irq, phba);
+
+	lpfc_cleanup(phba, 0);
+	lpfc_stop_timer(phba);
+	phba->work_hba_events = 0;
+
+	/*
+	 * Call scsi_free before mem_free since scsi bufs are released to their
+	 * corresponding pools here.
+	 */
+	lpfc_scsi_free(phba);
+	lpfc_mem_free(phba);
+
+	/* Free resources associated with SLI2 interface */
+	dma_free_coherent(&pdev->dev, SLI2_SLIM_SIZE,
+			  phba->slim2p, phba->slim2p_mapping);
+
+	/* unmap adapter SLIM and Control Registers */
+	iounmap(phba->ctrl_regs_memmap_p);
+	iounmap(phba->slim_memmap_p);
+
+	pci_release_regions(phba->pcidev);
+	pci_disable_device(phba->pcidev);
+
+	idr_remove(&lpfc_hba_index, phba->brd_no);
+	scsi_host_put(phba->host);
+
+	pci_set_drvdata(pdev, NULL);
+}
+
+static struct pci_device_id lpfc_id_table[] = {
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_VIPER,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_FIREFLY,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_THOR,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_PEGASUS,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_CENTAUR,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_DRAGONFLY,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_SUPERFLY,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_RFLY,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_PFLY,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_NEPTUNE,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_NEPTUNE_SCSP,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_NEPTUNE_DCSP,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_HELIOS,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_HELIOS_SCSP,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_HELIOS_DCSP,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_BMID,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_BSMB,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_ZEPHYR,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_ZEPHYR_SCSP,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_ZEPHYR_DCSP,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_ZMID,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_ZSMB,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_TFLY,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_LP101,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_LP10000S,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_LP11000S,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{PCI_VENDOR_ID_EMULEX, PCI_DEVICE_ID_LPE11000S,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{ 0 }
+};
+
+MODULE_DEVICE_TABLE(pci, lpfc_id_table);
+
+static struct pci_driver lpfc_driver = {
+	.name		= LPFC_DRIVER_NAME,
+	.id_table	= lpfc_id_table,
+	.probe		= lpfc_pci_probe_one,
+	.remove		= __devexit_p(lpfc_pci_remove_one),
+};
+
+static int __init
+lpfc_init(void)
+{
+	int error = 0;
+
+	printk(LPFC_MODULE_DESC "\n");
+	printk(LPFC_COPYRIGHT "\n");
+
+	lpfc_transport_template =
+				fc_attach_transport(&lpfc_transport_functions);
+	if (!lpfc_transport_template)
+		return -ENOMEM;
+	error = pci_register_driver(&lpfc_driver);
+	if (error)
+		fc_release_transport(lpfc_transport_template);
+
+	return error;
+}
+
+static void __exit
+lpfc_exit(void)
+{
+	pci_unregister_driver(&lpfc_driver);
+	fc_release_transport(lpfc_transport_template);
+}
+
+module_init(lpfc_init);
+module_exit(lpfc_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION(LPFC_MODULE_DESC);
+MODULE_AUTHOR("Emulex Corporation - tech.support@emulex.com");
+MODULE_VERSION("0:" LPFC_DRIVER_VERSION);
diff -urN oldtree/drivers/usb/net/pegasus.c newtree/drivers/usb/net/pegasus.c
--- oldtree/drivers/usb/net/pegasus.c	2006-03-08 18:48:01.415967500 +0000
+++ newtree/drivers/usb/net/pegasus.c	2006-03-08 15:22:33.157499500 +0000
@@ -1452,7 +1452,7 @@
 	pr_info("%s: %s, " DRIVER_DESC "\n", driver_name, DRIVER_VERSION);
 	if (devid)
 		parse_id(devid);
-	pegasus_workqueue = create_singlethread_workqueue("pegasus");
+	pegasus_workqueue = create_nofreeze_singlethread_workqueue("pegasus");
 	if (!pegasus_workqueue)
 		return -ENOMEM;
 	return usb_register(&pegasus_driver);
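
The hunk above moves the pegasus driver's carrier-check workqueue to a
"nofreeze" variant. create_nofreeze_singlethread_workqueue() is defined
elsewhere in this patch series (its definition does not appear in this
section); assuming it behaves like create_singlethread_workqueue() with
the worker exempted from the suspend freezer, the effect is what a plain
kernel thread gets by setting PF_NOFREEZE on itself:

	/* Hedged sketch (circa 2.6.16): a worker that the suspend freezer
	 * skips, which is presumably what the nofreeze workqueue variant
	 * arranges for its internal thread. */
	static int nofreeze_worker(void *data)
	{
		current->flags |= PF_NOFREEZE;	/* do not freeze this thread */

		while (!kthread_should_stop())
			schedule_timeout_interruptible(HZ);
		return 0;
	}
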
diff -urN oldtree/include/asm-arm/hw_irq.h newtree/include/asm-arm/hw_irq.h
--- oldtree/include/asm-arm/hw_irq.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/include/asm-arm/hw_irq.h	2006-03-08 15:22:33.161499750 +0000
@@ -0,0 +1,4 @@
+#ifndef __ASM_ARM_HW_IRQ_H
+#define __ASM_ARM_HW_IRQ_H
+#include <asm/hardirq.h>
+#endif
diff -urN oldtree/include/asm-arm/suspend2.h newtree/include/asm-arm/suspend2.h
--- oldtree/include/asm-arm/suspend2.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/include/asm-arm/suspend2.h	2006-03-08 15:22:33.161499750 +0000
@@ -0,0 +1,136 @@
+#ifndef _ASMARM_SUSPEND_H
+#define _ASMARM_SUSPEND_H
+/*
+ * Based on code
+ * Copyright 2005  Sony Corporation
+ * Copyright 2003-2004 Nigel Cunningham <ncunningham@linuxmail.org>
+ * Copyright 2001-2002 Pavel Machek <pavel@suse.cz>
+ * Copyright 2001 Patrick Mochel <mochel@osdl.org>
+ */
+
+/* image of the saved processor state */
+struct suspend2_saved_context {
+	/* general registers */
+	__u32 r[15];
+
+	/* coprocessor 15 registers */
+/*	__u32 ID_code;    read only reg */
+/*	__u32 cache_type; read only reg */
+/*	__u32 TCM_stat;   read only reg */
+	__u32 CR;
+	__u32 TTBR;
+	__u32 DACR;
+	__u32 D_FSR;
+	__u32 I_FSR;
+	__u32 FAR;
+/*	__u32 COR;        write only reg */
+/*	__u32 TLBOR;      write only reg */
+	__u32 D_CLR;
+	__u32 I_CLR;
+	__u32 D_TCMRR;
+	__u32 I_TCMRR;
+	__u32 TLBLR;
+	__u32 FCSE;
+	__u32 CID;
+} __attribute__((packed));
+typedef struct suspend2_saved_context suspend2_saved_context_t;
+
+/* temporary storage */
+extern struct suspend2_saved_context suspend2_saved_context;
+
+static inline void suspend2_arch_save_processor_context(void)
+{
+	/* save general registers */
+	asm volatile ("stmia %0, {r4-r14}"
+		      :: "r" (suspend2_saved_context.r));
+	/* save coprocessor 15 registers */
+	asm volatile ("mrc p15, 0, %0, c1, c0, 0"
+		      : "=r" (suspend2_saved_context.CR));
+	asm volatile ("mrc p15, 0, %0, c3, c0, 0"
+		      : "=r" (suspend2_saved_context.DACR));
+	asm volatile ("mrc p15, 0, %0, c5, c0, 0"
+		      : "=r" (suspend2_saved_context.D_FSR));
+	asm volatile ("mrc p15, 0, %0, c5, c0, 1"
+		      : "=r" (suspend2_saved_context.I_FSR));
+	asm volatile ("mrc p15, 0, %0, c6, c0, 0"
+		      : "=r" (suspend2_saved_context.FAR));
+	asm volatile ("mrc p15, 0, %0, c9, c0, 0"
+		      : "=r" (suspend2_saved_context.D_CLR));
+	asm volatile ("mrc p15, 0, %0, c9, c0, 1"
+		      : "=r" (suspend2_saved_context.I_CLR));
+	asm volatile ("mrc p15, 0, %0, c9, c1, 0"
+		      : "=r" (suspend2_saved_context.D_TCMRR));
+	asm volatile ("mrc p15, 0, %0, c9, c1, 1"
+		      : "=r" (suspend2_saved_context.I_TCMRR));
+	asm volatile ("mrc p15, 0, %0, c10, c0, 0"
+		      : "=r" (suspend2_saved_context.TLBLR));
+	asm volatile ("mrc p15, 0, %0, c13, c0, 0"
+		      : "=r" (suspend2_saved_context.FCSE));
+	asm volatile ("mrc p15, 0, %0, c13, c0, 1"
+		      : "=r" (suspend2_saved_context.CID));
+	asm volatile ("mrc p15, 0, %0, c2, c0, 0"
+		      : "=r" (suspend2_saved_context.TTBR));
+}
+
+static inline void suspend2_arch_restore_processor_context(void)
+{
+	/* restore coprocessor 15 registers */
+	asm volatile ("mcr p15, 0, %0, c2, c0, 0"
+		      :: "r" (suspend2_saved_context.TTBR));
+	asm volatile ("mcr p15, 0, %0, c13, c0, 1"
+		      :: "r" (suspend2_saved_context.CID));
+	asm volatile ("mcr p15, 0, %0, c13, c0, 0"
+		      :: "r" (suspend2_saved_context.FCSE));
+	asm volatile ("mcr p15, 0, %0, c10, c0, 0"
+		      :: "r" (suspend2_saved_context.TLBLR));
+	asm volatile ("mcr p15, 0, %0, c9, c1, 1"
+		      :: "r" (suspend2_saved_context.I_TCMRR));
+	asm volatile ("mcr p15, 0, %0, c9, c1, 0"
+		      :: "r" (suspend2_saved_context.D_TCMRR));
+	asm volatile ("mcr p15, 0, %0, c9, c0, 1"
+		      :: "r" (suspend2_saved_context.I_CLR));
+	asm volatile ("mcr p15, 0, %0, c9, c0, 0"
+		      :: "r" (suspend2_saved_context.D_CLR));
+	asm volatile ("mcr p15, 0, %0, c6, c0, 0"
+		      :: "r" (suspend2_saved_context.FAR));
+	asm volatile ("mcr p15, 0, %0, c5, c0, 1"
+		      :: "r" (suspend2_saved_context.I_FSR));
+	asm volatile ("mcr p15, 0, %0, c5, c0, 0"
+		      :: "r" (suspend2_saved_context.D_FSR));
+	asm volatile ("mcr p15, 0, %0, c3, c0, 0"
+		      :: "r" (suspend2_saved_context.DACR));
+	asm volatile ("mcr p15, 0, %0, c1, c0, 0"
+		      :: "r" (suspend2_saved_context.CR));
+
+	/* restore general registers */
+	asm volatile ("ldmia r3, {r4-r14}" : "=m" (suspend2_saved_context.r));
+}
+
+static inline void save_context(void)
+{
+}
+
+static inline void restore_context(void)
+{
+}
+
+static inline void suspend2_arch_pre_copy(void)
+{
+}
+
+static inline void suspend2_arch_post_copy(void)
+{
+}
+
+static inline void suspend2_arch_pre_copyback(void)
+{
+}
+
+static inline void suspend2_arch_post_copyback(void)
+{
+}
+
+static inline void suspend2_arch_flush_caches(void)
+{
+}
+#endif
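
suspend2_arch_save_processor_context() above snapshots the CP15 system
control registers one "mrc" at a time and writes them back with "mcr" in
reverse order on resume; the general registers travel via stmia/ldmia.
As a standalone sketch, the accessor pattern for a single CP15 register
(here c1, the control register, mirroring the CR field above) looks like:

	/* Sketch of the CP15 access pattern used in the header above. */
	static inline __u32 cp15_read_control(void)
	{
		__u32 val;

		asm volatile ("mrc p15, 0, %0, c1, c0, 0" : "=r" (val));
		return val;
	}

	static inline void cp15_write_control(__u32 val)
	{
		asm volatile ("mcr p15, 0, %0, c1, c0, 0" : : "r" (val));
	}
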
diff -urN oldtree/include/asm-i386/mach-default/mach_time.h newtree/include/asm-i386/mach-default/mach_time.h
--- oldtree/include/asm-i386/mach-default/mach_time.h	2006-01-03 03:21:10.000000000 +0000
+++ newtree/include/asm-i386/mach-default/mach_time.h	2006-03-08 15:22:33.165500000 +0000
@@ -79,24 +79,19 @@
 	return retval;
 }
 
-static inline unsigned long mach_get_cmos_time(void)
+/* __get_cmos_time
+ *
+ * Separated out from mach_get_cmos_time so that we can
+ * quickly get the cmos time when we don't care about
+ * whether the second has just started.
+ *
+ * Used from suspend and resume sysdev calls.
+ */
+static inline unsigned long __get_cmos_time(void)
 {
 	unsigned int year, mon, day, hour, min, sec;
-	int i;
 
-	/* The Linux interpretation of the CMOS clock register contents:
-	 * When the Update-In-Progress (UIP) flag goes from 1 to 0, the
-	 * RTC registers show the second which has precisely just started.
-	 * Let's hope other operating systems interpret the RTC the same way.
-	 */
-	/* read RTC exactly on falling edge of update flag */
-	for (i = 0 ; i < 1000000 ; i++)	/* may take up to 1 second... */
-		if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)
-			break;
-	for (i = 0 ; i < 1000000 ; i++)	/* must try at least 2.228 ms */
-		if (!(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP))
-			break;
-	do { /* Isn't this overkill ? UIP above should guarantee consistency */
+	do {
 		sec = CMOS_READ(RTC_SECONDS);
 		min = CMOS_READ(RTC_MINUTES);
 		hour = CMOS_READ(RTC_HOURS);
@@ -104,6 +99,7 @@
 		mon = CMOS_READ(RTC_MONTH);
 		year = CMOS_READ(RTC_YEAR);
 	} while (sec != CMOS_READ(RTC_SECONDS));
+
 	if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
 	  {
 	    BCD_TO_BIN(sec);
@@ -119,4 +115,24 @@
 	return mktime(year, mon, day, hour, min, sec);
 }
 
+static inline unsigned long mach_get_cmos_time(void)
+{
+	int i;
+
+	/* The Linux interpretation of the CMOS clock register contents:
+	 * When the Update-In-Progress (UIP) flag goes from 1 to 0, the
+	 * RTC registers show the second which has precisely just started.
+	 * Let's hope other operating systems interpret the RTC the same way.
+	 */
+	/* read RTC exactly on falling edge of update flag */
+	for (i = 0 ; i < 1000000 ; i++)	/* may take up to 1 second... */
+		if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)
+			break;
+	for (i = 0 ; i < 1000000 ; i++)	/* must try at least 2.228 ms */
+		if (!(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP))
+			break;
+
+	return __get_cmos_time();
+}
+
 #endif /* !_MACH_TIME_H */
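
Splitting __get_cmos_time() out of mach_get_cmos_time() lets suspend and
resume code read the RTC immediately: the caller skips the two UIP polling
loops, which can busy-wait for up to a second to hit an exact second
boundary, and accepts a timestamp that may be up to one second stale.
A hedged usage sketch (the real sysdev caller lives elsewhere in this
patch and is not shown here):

	/* Hypothetical resume-path caller: an approximate timestamp taken
	 * quickly matters more than second-edge accuracy. */
	static unsigned long resume_read_rtc(void)
	{
		return __get_cmos_time();	/* no up-to-1s UIP wait */
	}
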
diff -urN oldtree/include/asm-i386/suspend2.h newtree/include/asm-i386/suspend2.h
--- oldtree/include/asm-i386/suspend2.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/include/asm-i386/suspend2.h	2006-03-08 16:42:44.166168750 +0000
@@ -0,0 +1,287 @@
+ /*
+  * Copyright 2003-2005 Nigel Cunningham <nigel@suspend2.net>
+  * Based on code
+  * Copyright 2001-2002 Pavel Machek <pavel@suse.cz>
+  * Based on code
+  * Copyright 2001 Patrick Mochel <mochel@osdl.org>
+  */
+#include <linux/irq.h>
+#include <asm/desc.h>
+#include <asm/i387.h>
+#include <asm/tlbflush.h>
+#include <asm/processor.h>
+
+/* image of the saved processor states */
+struct suspend2_saved_context {
+	u32 eax, ebx, ecx, edx;
+	u32 esp, ebp, esi, edi;
+	u16 es, fs, gs, ss;
+	u32 cr0, cr2, cr3, cr4;
+	u16 gdt_pad;
+	u16 gdt_limit;
+	u32 gdt_base;
+	u16 idt_pad;
+	u16 idt_limit;
+	u32 idt_base;
+	u16 ldt;
+	u16 tss;
+	u32 tr;
+	u32 safety;
+	u32 return_address;
+	u32 eflags;
+} __attribute__((packed));
+typedef struct suspend2_saved_context suspend2_saved_context_t;
+
+/* temporary storage */
+extern struct suspend2_saved_context suspend2_saved_context;
+
+/*
+ * save_processor_context
+ * 
+ * Save the state of the processor before we go to sleep.
+ *
+ * return_stack is the value of the stack pointer (%esp) as the caller sees it.
+ * No good way was found to obtain it from here (we don't want to make _too_
+ * many assumptions about the layout of the stack this far down). Also, the
+ * handy little __builtin_frame_address(level), where level > 0, is blatantly
+ * buggy - it returns the value stored at the proper stack location, not the
+ * location itself, as it should (as of gcc 2.91.66).
+ *
+ * Note that the context and timing of this function are pretty critical.
+ * With a minimal amount of things going on in the caller and in here, gcc
+ * does a good job of being just a dumb compiler.  Watch the assembly output
+ * if anything changes, though, and make sure everything is going in the
+ * right place.
+ */
+static inline void suspend2_arch_save_processor_context(void)
+{
+	kernel_fpu_begin();
+
+	/*
+	 * descriptor tables
+	 */
+	asm volatile ("sgdt (%0)" : "=m" (suspend2_saved_context.gdt_limit));
+	asm volatile ("sidt (%0)" : "=m" (suspend2_saved_context.idt_limit));
+	asm volatile ("sldt (%0)" : "=m" (suspend2_saved_context.ldt));
+	asm volatile ("str (%0)"  : "=m" (suspend2_saved_context.tr));
+
+	/*
+	 * save the general registers.
+	 * note that gcc has constructs to specify output of certain registers,
+	 * but they're not used here, because it assumes that you want to modify
+	 * those registers, so it tries to be smart and save them beforehand.
+	 * It's really not necessary, and kinda fishy (check the assembly output),
+	 * so it's avoided. 
+	 */
+	asm volatile ("movl %%esp, (%0)" : "=m" (suspend2_saved_context.esp));
+	asm volatile ("movl %%eax, (%0)" : "=m" (suspend2_saved_context.eax));
+	asm volatile ("movl %%ebx, (%0)" : "=m" (suspend2_saved_context.ebx));
+	asm volatile ("movl %%ecx, (%0)" : "=m" (suspend2_saved_context.ecx));
+	asm volatile ("movl %%edx, (%0)" : "=m" (suspend2_saved_context.edx));
+	asm volatile ("movl %%ebp, (%0)" : "=m" (suspend2_saved_context.ebp));
+	asm volatile ("movl %%esi, (%0)" : "=m" (suspend2_saved_context.esi));
+	asm volatile ("movl %%edi, (%0)" : "=m" (suspend2_saved_context.edi));
+
+	/*
+	 * segment registers
+	 */
+	asm volatile ("movw %%es, %0" : "=r" (suspend2_saved_context.es));
+	asm volatile ("movw %%fs, %0" : "=r" (suspend2_saved_context.fs));
+	asm volatile ("movw %%gs, %0" : "=r" (suspend2_saved_context.gs));
+	asm volatile ("movw %%ss, %0" : "=r" (suspend2_saved_context.ss));
+
+	/*
+	 * control registers 
+	 */
+	asm volatile ("movl %%cr0, %0" : "=r" (suspend2_saved_context.cr0));
+	asm volatile ("movl %%cr2, %0" : "=r" (suspend2_saved_context.cr2));
+	asm volatile ("movl %%cr3, %0" : "=r" (suspend2_saved_context.cr3));
+	asm volatile ("movl %%cr4, %0" : "=r" (suspend2_saved_context.cr4));
+
+	/*
+	 * eflags
+	 */
+	asm volatile ("pushfl ; popl (%0)" : "=m" (suspend2_saved_context.eflags));
+}
+
+static void fix_processor_context(void)
+{
+	struct tss_struct *t = &per_cpu(init_tss,0);
+
+	/* This just modifies memory; it should not be necessary. But... it is
+	 * necessary, because 386 hardware has the concept of a busy TSS or
+	 * some similar stupidity. */
+	set_tss_desc(0,t);
+
+	load_TR_desc();
+
+	load_LDT(&current->active_mm->context);	/* This does lldt */
+
+	/*
+	 * Now maybe reload the debug registers
+	 */
+	if (current->thread.debugreg[7]){
+		set_debugreg(&current->thread.debugreg[0], 0);
+		set_debugreg(&current->thread.debugreg[1], 1);
+		set_debugreg(&current->thread.debugreg[2], 2);
+		set_debugreg(&current->thread.debugreg[3], 3);
+		/* no 4 and 5 */
+		set_debugreg(&current->thread.debugreg[6], 6);
+		set_debugreg(&current->thread.debugreg[7], 7);
+	}
+
+}
+
+static void do_fpu_end(void)
+{
+	/* restore FPU regs if necessary */
+	/* Do it out of line so that gcc does not move cr0 load to some stupid
+	 * place */
+	kernel_fpu_end();
+}
+
+#if defined(CONFIG_SUSPEND2) || defined(CONFIG_SMP)
+static unsigned long c_loops_per_jiffy_ref __nosavedata;
+#endif
+
+#ifdef CONFIG_SUSPEND2
+#ifndef CONFIG_SMP
+extern unsigned long loops_per_jiffy;
+static volatile unsigned long cpu_khz_ref __nosavedata = 0;
+#endif
+
+static inline void suspend2_arch_pre_copy(void) { }
+static inline void suspend2_arch_post_copy(void) { }
+
+static inline void suspend2_arch_pre_copyback(void)
+{
+	/* We want to run from swsusp_pg_dir, since swsusp_pg_dir is stored at
+	 * a constant place in memory.
+	 */
+
+	 __asm__( "movl %%ecx,%%cr3\n" ::"c"(__pa(swsusp_pg_dir)));
+	
+	c_loops_per_jiffy_ref =
+		current_cpu_data.loops_per_jiffy;
+#ifndef CONFIG_SMP
+	cpu_khz_ref = cpu_khz;
+	c_loops_per_jiffy_ref = loops_per_jiffy;
+#endif
+
+}
+
+/*
+ * restore_processor_context
+ * 
+ * Restore the processor context as it was before we went to sleep
+ * - descriptor tables
+ * - control registers
+ * - segment registers
+ * - flags
+ * 
+ * Note that it is critical that this function is declared inline.
+ * It was separated out from restore_state to make that function
+ * a little clearer, but it needs to be inlined because we won't have a
+ * stack when we get here (so we can't push a return address).
+ */
+static inline void suspend2_arch_restore_processor_context(void)
+{
+	/*
+	 * first restore %ds, so we can access our data properly
+	 */
+	asm volatile (".align 4");
+	asm volatile ("movw %0, %%ds" :: "r" ((u16)__KERNEL_DS));
+
+
+	/*
+	 * control registers
+	 */
+	asm volatile ("movl %0, %%cr4" :: "r" (suspend2_saved_context.cr4));
+	asm volatile ("movl %0, %%cr3" :: "r" (suspend2_saved_context.cr3));
+	asm volatile ("movl %0, %%cr2" :: "r" (suspend2_saved_context.cr2));
+	asm volatile ("movl %0, %%cr0" :: "r" (suspend2_saved_context.cr0));
+
+	/*
+	 * segment registers
+	 */
+	asm volatile ("movw %0, %%es" :: "r" (suspend2_saved_context.es));
+	asm volatile ("movw %0, %%fs" :: "r" (suspend2_saved_context.fs));
+	asm volatile ("movw %0, %%gs" :: "r" (suspend2_saved_context.gs));
+	asm volatile ("movw %0, %%ss" :: "r" (suspend2_saved_context.ss));
+
+	/*
+	 * the other general registers
+	 *
+	 * note that even though gcc has constructs to specify memory
+	 * input into certain registers, it will try to be too smart
+	 * and save them at the beginning of the function.  This is
+	 * especially bad since we don't have a stack set up when we
+	 * enter, and we want to preserve the values on exit. So, we set
+	 * them manually.
+	 */
+	asm volatile ("movl %0, %%esp" :: "m" (suspend2_saved_context.esp));
+	asm volatile ("movl %0, %%ebp" :: "m" (suspend2_saved_context.ebp));
+	asm volatile ("movl %0, %%eax" :: "m" (suspend2_saved_context.eax));
+	asm volatile ("movl %0, %%ebx" :: "m" (suspend2_saved_context.ebx));
+	asm volatile ("movl %0, %%ecx" :: "m" (suspend2_saved_context.ecx));
+	asm volatile ("movl %0, %%edx" :: "m" (suspend2_saved_context.edx));
+	asm volatile ("movl %0, %%esi" :: "m" (suspend2_saved_context.esi));
+	asm volatile ("movl %0, %%edi" :: "m" (suspend2_saved_context.edi));
+
+	/*
+	 * now restore the descriptor tables to their proper values
+	 * ltr is done in fix_processor_context().
+	 */
+
+	asm volatile ("lgdt (%0)" :: "m" (suspend2_saved_context.gdt_limit));
+	asm volatile ("lidt (%0)" :: "m" (suspend2_saved_context.idt_limit));
+	asm volatile ("lldt (%0)" :: "m" (suspend2_saved_context.ldt));
+
+	/* tell gcc that we clobbered all the registers... 
+	 * otherwise it might keep some addresses there.
+	 * Unfortunately gcc 4 thinks it's smart and will
+	 * error out if we tell it we're clobbering ebp as
+	 * well. So we have to lie. 
+	 */
+	asm volatile ("" : : : "esp", "eax", "ebx", "ecx", "edx", "esi", "edi");
+	
+	if (boot_cpu_has(X86_FEATURE_SEP))
+		enable_sep_cpu();
+
+	fix_processor_context();
+
+	/*
+	 * the flags
+	 */
+	asm volatile ("pushl %0 ; popfl" :: "m" (suspend2_saved_context.eflags));
+
+	do_fpu_end();
+
+	mtrr_ap_init();
+	mcheck_init(&boot_cpu_data);
+}
+	
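+/* Rough shape of the resume path these arch hooks slot into (a sketch
+ * of the ordering, not the exact call chain):
+ *
+ *	suspend2_arch_pre_copyback();	// switch to safe page tables
+ *	copy_image_back();		// overwrite memory with the image
+ *	suspend2_arch_restore_processor_context();  // inlined: no stack yet
+ *	suspend2_arch_post_copyback();	// recalibration fixups
+ *
+ * copy_image_back() is a stand-in name for the copy step.
+ */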
+static inline void suspend2_arch_flush_caches(void)
+{
+#ifdef CONFIG_SMP
+	cpu_clear(0, per_cpu(cpu_tlbstate,
+				0).active_mm->cpu_vm_mask);
+#endif
+	wbinvd();
+	__flush_tlb_all();
+	
+}
+
+static inline void suspend2_arch_post_copyback(void)
+{
+	BUG_ON(!irqs_disabled());
+
+	current_cpu_data.loops_per_jiffy =
+		c_loops_per_jiffy_ref;
+#ifndef CONFIG_SMP
+	loops_per_jiffy = c_loops_per_jiffy_ref;
+	cpu_khz = cpu_khz_ref;
+#endif
+}
+
+#endif
diff -urN oldtree/include/asm-ppc/cpu_context.h newtree/include/asm-ppc/cpu_context.h
--- oldtree/include/asm-ppc/cpu_context.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/include/asm-ppc/cpu_context.h	2006-03-08 15:22:33.169500250 +0000
@@ -0,0 +1,110 @@
+/*
+ * Written by Hu Gang (hugang@soulinfo.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/adb.h>
+#include <linux/pmu.h>
+#include <asm/mmu_context.h>
+
+/* image of the saved processor states */
+struct saved_context {
+	u32 lr, cr, sp, r2;
+	u32 r[20]; /* r12 - r31 */
+	u32 sprg[4];
+	u32 msr, sdr1, tb1, tb2;
+} __attribute__((packed));
+
+static inline void __save_processor_state(struct saved_context *s)
+{
+	/*asm volatile ("mflr 0; stw 0,%0" : "=m" (s->lr));*/
+	asm volatile ("mfcr 0; stw 0,%0" : "=m" (s->cr));
+	asm volatile ("stw 1,%0" : "=m" (s->sp));
+	asm volatile ("stw 2,%0" : "=m" (s->r2));
+	asm volatile ("stmw 12,%0" : "=m" (s->r));
+
+	/* Save MSR & SDR1 */
+	asm volatile ("mfmsr 4; stw 4,%0" : "=m" (s->msr));
+	asm volatile ("mfsdr1 4; stw 4,%0": "=m" (s->sdr1));
+
+	/* Get a stable timebase and save it */
+	asm volatile ("1:\n"
+		      "mftbu 4;stw 4,%0\n"
+		      "mftb  5;stw 5,%1\n"
+		      "mftbu 3\n"
+		      "cmpw 3,4;\n"
+			  "bne 1b" : 
+			  "=m" (s->tb1),
+			  "=m" (s->tb2));
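+	/* (The loop above re-reads TBU and retries if it ticked over
+	 * while TBL was being read, so the two halves are consistent.) */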
+		
+	/* Save SPRGs */
+	asm volatile ("mfsprg 4,0; stw 4,%0 " : "=m" (s->sprg[0]));
+	asm volatile ("mfsprg 4,1; stw 4,%0 " : "=m" (s->sprg[1]));
+	asm volatile ("mfsprg 4,2; stw 4,%0 " : "=m" (s->sprg[2]));
+	asm volatile ("mfsprg 4,3; stw 4,%0 " : "=m" (s->sprg[3]));
+}
+
+static inline void __restore_processor_state(struct saved_context *s)
+{
+	/* Restore SDR1 */
+	asm volatile ("lwz 4,%0; mtsdr1 4" :: "m" (s->sdr1));
+	/* asm volatile ("lwz 3,%0" : "=m" (saved_context.msr)); */
+
+	asm volatile ("lwz 4,%0; mtsprg 0,4" :: "m" (s->sprg[0]));
+	asm volatile ("lwz 4,%0; mtsprg 1,4" :: "m" (s->sprg[1]));
+	asm volatile ("lwz 4,%0; mtsprg 2,4" :: "m" (s->sprg[2]));
+	asm volatile ("lwz 4,%0; mtsprg 3,4" :: "m" (s->sprg[3]));
+
+	/* Restore TB: zero TBL first so TBU cannot wrap while we load */
+	asm volatile ("li 3,0; mttbl 3\n"
+		      "lwz 3,%0; lwz 4,%1\n"
+		      "mttbu 3; mttbl 4" ::
+		      "m" (s->tb1),
+		      "m" (s->tb2));
+
+	/* Restore the callee-saved registers */
+	asm volatile ("lmw 12,%0" :: "m" (s->r));
+	asm volatile ("lwz 2,%0" :: "m" (s->r2));
+	asm volatile ("lwz 1,%0" :: "m" (s->sp));
+	asm volatile ("lwz 0,%0; mtcr 0" :: "m" (s->cr));
+	
+	/* Tell gcc that we clobbered all the registers, otherwise it might
+	 * keep some addresses there. lmw above rewrote r12 too, so include it. */
+	asm volatile ("" ::: "r12", "r13", "r14", "r15", "r16", "r17", "r18",
+		      "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r26",
+		      "r27", "r28", "r29", "r30", "r31");
+	/*asm volatile ("lwz 0,%0; mtlr 0" : "=m" (s->lr));*/
+}
+
+static inline void save_context(void)
+{
+#ifdef CONFIG_ADB_PMU
+	printk("pmu suspend\n");
+	pmu_suspend();
+#endif
+}
+
+extern void enable_kernel_altivec(void);
+
+static inline void restore_context(void)
+{
+	printk("set context: <%p>\n", current);
+	set_context(current->active_mm->context,
+			current->active_mm->pgd);
+
+#ifdef CONFIG_ADB_PMU
+	printk("pmu_resume\n");
+	pmu_resume();
+#endif
+
+#ifdef CONFIG_ALTIVEC
+	if (cur_cpu_spec->cpu_features & CPU_FTR_ALTIVEC) {
+		printk("enable altivec\n");
+		enable_kernel_altivec();
+	}
+#endif
+	printk("enable fp\n");
+	enable_kernel_fp();
+}
diff -urN oldtree/include/asm-ppc/suspend2.h newtree/include/asm-ppc/suspend2.h
--- oldtree/include/asm-ppc/suspend2.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/include/asm-ppc/suspend2.h	2006-03-08 15:22:33.173500500 +0000
@@ -0,0 +1,47 @@
+/*
+ * Written by Hu Gang (hugang@soulinfo.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "asm/cpu_context.h"
+
+typedef struct saved_context suspend2_saved_context_t;
+
+extern struct saved_context suspend2_saved_context;
+
+static inline void suspend2_arch_save_processor_context(void)
+{
+	__save_processor_state(&suspend2_saved_context);
+}
+
+static inline void suspend2_arch_restore_processor_context(void)
+{
+	__restore_processor_state(&suspend2_saved_context);
+
+	restore_context();
+}
+
+static inline void suspend2_arch_pre_copy(void)
+{
+}
+
+static inline void suspend2_arch_post_copy(void)
+{
+}
+
+static inline void suspend2_arch_pre_copyback(void)
+{
+	save_context();
+}
+
+static inline void suspend2_arch_post_copyback(void)
+{
+}
+
+static inline void suspend2_arch_flush_caches(void)
+{
+}
diff -urN oldtree/include/asm-x86_64/page.h newtree/include/asm-x86_64/page.h
--- oldtree/include/asm-x86_64/page.h	2006-03-08 18:48:02.172014750 +0000
+++ newtree/include/asm-x86_64/page.h	2006-03-08 15:22:33.173500500 +0000
@@ -105,6 +105,8 @@
 
 #include <asm/bug.h>
 
+extern int page_is_ram(unsigned long pagenr);
+
 #endif /* __ASSEMBLY__ */
 
 #define PAGE_OFFSET		((unsigned long)__PAGE_OFFSET)
diff -urN oldtree/include/asm-x86_64/suspend.h newtree/include/asm-x86_64/suspend.h
--- oldtree/include/asm-x86_64/suspend.h	2006-03-08 18:48:02.176015000 +0000
+++ newtree/include/asm-x86_64/suspend.h	2006-03-08 15:22:33.177500750 +0000
@@ -41,8 +41,6 @@
 #define loaddebug(thread,register) \
 	set_debugreg((thread)->debugreg##register, register)
 
-extern void fix_processor_context(void);
-
 #ifdef CONFIG_ACPI_SLEEP
 extern unsigned long saved_eip;
 extern unsigned long saved_esp;
diff -urN oldtree/include/asm-x86_64/suspend2.h newtree/include/asm-x86_64/suspend2.h
--- oldtree/include/asm-x86_64/suspend2.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/include/asm-x86_64/suspend2.h	2006-03-08 15:22:33.181501000 +0000
@@ -0,0 +1,427 @@
+ /*
+  * Copyright 2005 Nigel Cunningham <nigel@suspend2.net>
+  * Based on code
+  * Copyright 2001-2002 Pavel Machek <pavel@suse.cz>
+  * Based on code
+  * Copyright 2001 Patrick Mochel <mochel@osdl.org>
+  */
+
+#include <linux/irq.h>
+#include <asm/desc.h>
+#include <asm/i387.h>
+#include <asm/tlbflush.h>
+#include <asm/proto.h>
+#include <asm/page.h>
+
+static pgd_t *temp_level4_pgt;
+extern int suspend2_mapping_prepare(void);
+
+/* image of the saved processor states */
+struct suspend2_saved_context {
+	unsigned long eax, ebx, ecx, edx;
+	unsigned long esp, ebp, esi, edi;
+	unsigned long r8, r9, r10, r11;
+	unsigned long r12, r13, r14, r15;
+
+	u16 ds, es, fs, gs, ss;
+	unsigned long gs_base, gs_kernel_base, fs_base;
+	unsigned long cr0, cr2, cr3, cr4, cr8;
+	u16 gdt_pad;
+	u16 gdt_limit;
+	unsigned long gdt_base;
+	u16 idt_pad;
+	u16 idt_limit;
+	unsigned long idt_base;
+	u16 ldt;
+	u16 tss;
+	unsigned long tr;
+	unsigned long safety;
+	unsigned long return_address;
+	unsigned long eflags;
+} __attribute__((packed));
+
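+/* Note: "unsigned long" is 64 bits here, so the e-named fields above
+ * hold full 64-bit register values (%rsp is saved into .esp and so on
+ * below); the 32-bit names appear to be carried over from the i386
+ * version of this structure.
+ */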
+typedef struct suspend2_saved_context suspend2_saved_context_t;
+
+/* temporary storage */
+extern struct suspend2_saved_context suspend2_saved_context;
+
+static inline void suspend2_arch_flush_caches(void)
+{
+#ifdef CONFIG_SMP
+	clear_bit(0, &read_pda(active_mm)->cpu_vm_mask);
+#endif
+	wbinvd();
+	__flush_tlb_all();
+	
+}
+
+/*
+ * save_processor_context
+ *
+ * Save the state of the processor before we go to sleep.
+ *
+ * return_stack is the value of the stack pointer (%rsp) as the caller sees it.
+ * A good way could not be found to obtain it from here (we don't want to make
+ * _too_ many assumptions about the layout of the stack this far down). Also,
+ * the handy little __builtin_frame_address(level), where level > 0, is
+ * blatantly buggy - it returns the value of the stack at the proper location,
+ * not the location itself, like it should (as of gcc 2.91.66).
+ *
+ * Note that the context and timing of this function are pretty critical.
+ * With a minimal amount of things going on in the caller and in here, gcc
+ * does a good job of being just a dumb compiler.  Watch the assembly output
+ * if anything changes, though, and make sure everything is going in the right
+ * place.
+ */
+static inline void suspend2_arch_save_processor_context(void)
+{
+	kernel_fpu_begin();
+
+	/*
+	 * descriptor tables
+	 */
+	asm volatile ("sgdt %0" : "=m" (suspend2_saved_context.gdt_limit));
+	asm volatile ("sidt %0" : "=m" (suspend2_saved_context.idt_limit));
+	asm volatile ("str %0"  : "=m" (suspend2_saved_context.tr));
+
+	/*
+	 * segment registers
+	 */
+	asm volatile ("movw %%ds, %0" : "=r" (suspend2_saved_context.ds));
+	asm volatile ("movw %%es, %0" : "=r" (suspend2_saved_context.es));
+	asm volatile ("movw %%fs, %0" : "=r" (suspend2_saved_context.fs));
+	asm volatile ("movw %%gs, %0" : "=r" (suspend2_saved_context.gs));
+	asm volatile ("movw %%ss, %0" : "=r" (suspend2_saved_context.ss));
+
+	rdmsrl(MSR_FS_BASE, suspend2_saved_context.fs_base);
+	rdmsrl(MSR_GS_BASE, suspend2_saved_context.gs_base);
+	rdmsrl(MSR_KERNEL_GS_BASE, suspend2_saved_context.gs_kernel_base);
+
+	/*
+	 * control registers 
+	 */
+	asm volatile ("movq %%cr0, %0" : "=r" (suspend2_saved_context.cr0));
+	asm volatile ("movq %%cr2, %0" : "=r" (suspend2_saved_context.cr2));
+	asm volatile ("movq %%cr3, %0" : "=r" (suspend2_saved_context.cr3));
+	asm volatile ("movq %%cr4, %0" : "=r" (suspend2_saved_context.cr4));
+	asm volatile ("movq %%cr8, %0" : "=r" (suspend2_saved_context.cr8));
+	
+	/*
+	 * save the general registers.
+	 * note that gcc has constructs to specify output of certain registers,
+	 * but they're not used here, because it assumes that you want to modify
+	 * those registers, so it tries to be smart and save them beforehand.
+	 * It's really not necessary, and kinda fishy (check the assembly output),
+	 * so it's avoided. 
+	 */
+
+	asm volatile ("movq %%rsp, %0" : "=m" (suspend2_saved_context.esp));
+
+	asm volatile ("movq %%rax, %0" : "=m" (suspend2_saved_context.eax));
+	asm volatile ("movq %%rbx, %0" : "=m" (suspend2_saved_context.ebx));
+	asm volatile ("movq %%rcx, %0" : "=m" (suspend2_saved_context.ecx));
+	asm volatile ("movq %%rdx, %0" : "=m" (suspend2_saved_context.edx));
+	asm volatile ("movq %%rbp, %0" : "=m" (suspend2_saved_context.ebp));
+	asm volatile ("movq %%rsi, %0" : "=m" (suspend2_saved_context.esi));
+	asm volatile ("movq %%rdi, %0" : "=m" (suspend2_saved_context.edi));
+	asm volatile ("movq %%r8, %0" : "=m" (suspend2_saved_context.r8));
+	asm volatile ("movq %%r9, %0" : "=m" (suspend2_saved_context.r9));
+	asm volatile ("movq %%r10, %0" : "=m" (suspend2_saved_context.r10));
+	asm volatile ("movq %%r11, %0" : "=m" (suspend2_saved_context.r11));
+	asm volatile ("movq %%r12, %0" : "=m" (suspend2_saved_context.r12));
+	asm volatile ("movq %%r13, %0" : "=m" (suspend2_saved_context.r13));
+	asm volatile ("movq %%r14, %0" : "=m" (suspend2_saved_context.r14));
+	asm volatile ("movq %%r15, %0" : "=m" (suspend2_saved_context.r15));
+
+	/*
+	 * eflags
+	 */
+	asm volatile ("pushfq ; popq %0" : "=m" (suspend2_saved_context.eflags));
+
+}
+
+static void fix_processor_context(void)
+{
+	struct tss_struct *t = &per_cpu(init_tss, 0);
+
+	/* This just modifies memory; it should not be necessary. But... it is
+	 * necessary, because 386 hardware has a concept of a busy TSS or some
+	 * similar stupidity. */
+	set_tss_desc(0, t);
+	cpu_gdt(0)[GDT_ENTRY_TSS].type = 9;
+
+	syscall_init();				/* This sets MSR_*STAR and related */
+	load_TR_desc();
+	load_LDT(&current->active_mm->context);	/* This does lldt */
+
+	/*
+	 * Now maybe reload the debug registers
+	 */
+	if (current->thread.debugreg7) {
+		loaddebug(&current->thread, 0);
+		loaddebug(&current->thread, 1);
+		loaddebug(&current->thread, 2);
+		loaddebug(&current->thread, 3);
+		/* no 4 and 5 */
+		loaddebug(&current->thread, 6);
+		loaddebug(&current->thread, 7);
+	}
+}
+
+static void do_fpu_end(void)
+{
+	/* Restore FPU regs if necessary.
+	 * Do it out of line so that gcc does not move the cr0 load to some
+	 * stupid place. */
+	kernel_fpu_end();
+	mxcsr_feature_mask_init();
+}
+
+/*
+ * restore_processor_context
+ * 
+ * Restore the processor context as it was before we went to sleep
+ * - descriptor tables
+ * - control registers
+ * - segment registers
+ * - flags
+ * 
+ * Note that it is critical that this function is declared inline.
+ * It was separated out from restore_state to make that function
+ * a little clearer, but it needs to be inlined because we won't have a
+ * stack when we get here (so we can't push a return address).
+ */
+static inline void restore_processor_context(void)
+{
+	/* 
+	 * Credit for this goes to the swsusp code. Restoring the
+	 * CPU context is the one thing we still do in the same
+	 * way, and swsusp did it right first.
+	 *
+	 * 0xffffffff80000000UL is __START_KERNEL_map.
+	 */
+
+	__asm__ __volatile__(
+		"leaq	init_level4_pgt(%rip), %rax;		\n"
+		"subq	$0xffffffff80000000, %rax;		\n"
+		"movq	%rax, %cr3;				\n"
+		"movq	mmu_cr4_features(%rip), %rax;		\n"
+		"movq	%rax, %rdx;				\n"
+		"andq	$~(1<<7), %rdx;  # PGE			\n"
+		"movq	%rdx, %cr4;  # turn off PGE		\n"
+		"movq	%cr3, %rcx;  # flush TLB		\n"
+		"movq	%rcx, %cr3;				\n"
+		"movq	%rax, %cr4;  # turn PGE back on;	\n"
+
+		"movl	$24, %eax;				\n"
+		"movl	%eax, %ds				\n");
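+	/* ($24 above is 0x18, i.e. __KERNEL_DS in this kernel's GDT.) */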
+	/*
+	 * the other general registers
+	 *
+	 * note that even though gcc has constructs to specify memory
+	 * input into certain registers, it will try to be too smart
+	 * and save them at the beginning of the function.  This is
+	 * especially bad since we don't have a stack set up when we
+	 * enter, and we want to preserve the values on exit. So, we set
+	 * them manually.
+	 */
+	asm volatile ("movq %0, %%rsp" :: "m" (suspend2_saved_context.esp));
+	asm volatile ("movq %0, %%rbp" :: "m" (suspend2_saved_context.ebp));
+	asm volatile ("movq %0, %%rbx" :: "m" (suspend2_saved_context.ebx));
+	asm volatile ("movq %0, %%rcx" :: "m" (suspend2_saved_context.ecx));
+	asm volatile ("movq %0, %%rdx" :: "m" (suspend2_saved_context.edx));
+	asm volatile ("movq %0, %%rsi" :: "m" (suspend2_saved_context.esi));
+	asm volatile ("movq %0, %%rdi" :: "m" (suspend2_saved_context.edi));
+	asm volatile ("movq %0, %%r8" :: "m" (suspend2_saved_context.r8));
+	asm volatile ("movq %0, %%r9" :: "m" (suspend2_saved_context.r9));
+	asm volatile ("movq %0, %%r10" :: "m" (suspend2_saved_context.r10));
+	asm volatile ("movq %0, %%r11" :: "m" (suspend2_saved_context.r11));
+	asm volatile ("movq %0, %%r12" :: "m" (suspend2_saved_context.r12));
+	asm volatile ("movq %0, %%r13" :: "m" (suspend2_saved_context.r13));
+	asm volatile ("movq %0, %%r14" :: "m" (suspend2_saved_context.r14));
+	asm volatile ("movq %0, %%r15" :: "m" (suspend2_saved_context.r15));
+
+	/*
+	 * the flags
+	 */
+	asm volatile ("pushq %0 ; popfq" :: "m" (suspend2_saved_context.eflags));
+	
+	asm volatile ("xorq	%rax, %rax");
+
+	/*
+	 * control registers
+	 */
+	asm volatile ("movq %0, %%cr8" :: "r" (suspend2_saved_context.cr8));
+	asm volatile ("movq %0, %%cr4" :: "r" (suspend2_saved_context.cr4));
+	asm volatile ("movq %0, %%cr3" :: "r" (suspend2_saved_context.cr3));
+	asm volatile ("movq %0, %%cr2" :: "r" (suspend2_saved_context.cr2));
+	asm volatile ("movq %0, %%cr0" :: "r" (suspend2_saved_context.cr0));
+
+	/*
+	 * now restore the descriptor tables to their proper values
+	 * ltr is done in fix_processor_context().
+	 */
+
+	asm volatile ("lgdt %0" :: "m" (suspend2_saved_context.gdt_limit));
+	asm volatile ("lidt %0" :: "m" (suspend2_saved_context.idt_limit));
+
+	/*
+	 * segment registers
+	 */
+	asm volatile ("movw %0, %%ds" :: "r" (suspend2_saved_context.ds));
+	asm volatile ("movw %0, %%es" :: "r" (suspend2_saved_context.es));
+	asm volatile ("movw %0, %%fs" :: "r" (suspend2_saved_context.fs));
+	load_gs_index(suspend2_saved_context.gs);
+	asm volatile ("movw %0, %%ss" :: "r" (suspend2_saved_context.ss));
+
+	wrmsrl(MSR_FS_BASE, suspend2_saved_context.fs_base);
+	wrmsrl(MSR_GS_BASE, suspend2_saved_context.gs_base);
+	wrmsrl(MSR_KERNEL_GS_BASE, suspend2_saved_context.gs_kernel_base);
+
+	/* tell gcc that we clobbered all the registers... 
+	 * otherwise it might keep some addresses there. */
+	asm volatile ("" : : : "rsp", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15");
+
+	fix_processor_context();
+
+	do_fpu_end();
+	
+	suspend2_arch_flush_caches();
+
+	mtrr_ap_init();
+	mcheck_init(&boot_cpu_data);
+}
+
+#if defined(CONFIG_SUSPEND2) || defined(CONFIG_SMP)
+extern unsigned char * my_saved_context __nosavedata;
+static unsigned long c_loops_per_jiffy_ref[NR_CPUS] __nosavedata;
+#endif
+
+#ifdef CONFIG_SUSPEND2
+#ifndef CONFIG_SMP
+extern unsigned long loops_per_jiffy;
+static volatile unsigned long cpu_khz_ref __nosavedata = 0;
+#endif
+
+/* 
+ * APIC support: These routines save the APIC
+ * configuration for the CPU on which they are
+ * being executed
+ */
+extern void suspend_apic_save_state(void);
+extern void suspend_apic_reload_state(void);
+
+static inline void suspend2_arch_pre_copy(void)
+{
+}
+
+static inline void suspend2_arch_post_copy(void)
+{
+}
+
+/* Based on the version from swsusp */
+static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+{
+	long i, j;
+
+	i = pud_index(address);
+	pud = pud + i;
+	for (; i < PTRS_PER_PUD; pud++, i++) {
+		unsigned long paddr;
+		pmd_t *pmd;
+
+		paddr = address + i*PUD_SIZE;
+		if (paddr >= end)
+			break;
+
+		pmd = (pmd_t *)suspend_get_nonconflicting_pages(0);
+		if (!pmd)
+			return -ENOMEM;
+		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+		for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
+			unsigned long pe;
+
+			if (paddr >= end)
+				break;
+			pe = _PAGE_NX | _PAGE_PSE | _KERNPG_TABLE | paddr;
+			pe &= __supported_pte_mask;
+			set_pmd(pmd, __pmd(pe));
+		}
+	}
+	return 0;
+}
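+/* Size sanity check (x86_64: PUD_SIZE = 1G, PMD_SIZE = 2M,
+ * PTRS_PER_PMD = 512): each pud slot filled above covers 1G through a
+ * single pmd page of 512 2M _PAGE_PSE entries, so mapping e.g. 4G of
+ * RAM costs four pmd pages plus the pud page passed in - all taken
+ * from "nonconflicting" pages that the image copy-back will not touch.
+ */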
+
+static int set_up_temporary_mappings_suspend2(void)
+{
+	unsigned long start, end, next;
+	int error;
+
+	temp_level4_pgt = (pgd_t *)suspend_get_nonconflicting_pages(0);
+	if (!temp_level4_pgt)
+		return -ENOMEM;
+
+	/* It is safe to reuse the original kernel mapping */
+	set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map),
+		init_level4_pgt[pgd_index(__START_KERNEL_map)]);
+
+	/* Set up the direct mapping from scratch */
+	start = (unsigned long)pfn_to_kaddr(0);
+	end = (unsigned long)pfn_to_kaddr(end_pfn);
+
+	for (; start < end; start = next) {
+		pud_t *pud = (pud_t *)suspend_get_nonconflicting_pages(0);
+		if (!pud)
+			return -ENOMEM;
+		next = start + PGDIR_SIZE;
+		if (next > end)
+			next = end;
+		if ((error = res_phys_pud_init(pud, __pa(start), __pa(next))))
+			return error;
+		set_pgd(temp_level4_pgt + pgd_index(start),
+			mk_kernel_pgd(__pa(pud)));
+	}
+	return 0;
+}
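+/* The result: the kernel-text mapping is shared with init_level4_pgt
+ * (safe, since the boot kernel and the image kernel map the kernel
+ * text identically), while the direct mapping is rebuilt from scratch,
+ * one pud page per PGDIR_SIZE chunk, by res_phys_pud_init() above.
+ */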
+
+static inline void suspend2_arch_pre_copyback(void)
+{
+	/* We want to run from temp_level4_pgt, which is built out of
+	 * nonconflicting pages and so sits at a safe, constant place in
+	 * memory while the image is copied back.
+	 */
+
+	set_up_temporary_mappings_suspend2();
+
+	/* 0xffff810000000000 is __PAGE_OFFSET: convert temp_level4_pgt's
+	 * virtual address to a physical one before loading %cr3. Done as
+	 * a single asm block so gcc cannot schedule anything in between,
+	 * with the scratch registers declared as clobbered. */
+	asm volatile ("movq	$0xffff810000000000, %%rdx\n\t"
+		      "movq	temp_level4_pgt(%%rip), %%rax\n\t"
+		      "subq	%%rdx, %%rax\n\t"
+		      "movq	%%rax, %%cr3" ::: "rax", "rdx");
+
+	wbinvd();
+	__flush_tlb_all();
+	
+	c_loops_per_jiffy_ref[0] =
+		current_cpu_data.loops_per_jiffy;
+#ifndef CONFIG_SMP
+	cpu_khz_ref = cpu_khz;
+	c_loops_per_jiffy_ref[0] = loops_per_jiffy;
+#endif
+	
+}
+
+static inline void suspend2_arch_restore_processor_context(void)
+{
+	restore_processor_context();
+}
+	
+static inline void suspend2_arch_post_copyback(void)
+{
+	/* Get other CPUs to restore their contexts and flush their TLBs. */
+	clear_suspend_state(SUSPEND_FREEZE_SMP);
+	
+	BUG_ON(!irqs_disabled());
+
+	current_cpu_data.loops_per_jiffy =
+		c_loops_per_jiffy_ref[0];
+#ifndef CONFIG_SMP
+	loops_per_jiffy = c_loops_per_jiffy_ref[0];
+	cpu_khz = cpu_khz_ref;
+#endif
+}
+
+#endif
diff -urN oldtree/include/linux/bio.h newtree/include/linux/bio.h
--- oldtree/include/linux/bio.h	2006-03-08 18:47:15.729112250 +0000
+++ newtree/include/linux/bio.h	2006-03-08 15:22:33.185501250 +0000
@@ -124,6 +124,7 @@
 #define BIO_BOUNCED	5	/* bio is a bounce bio */
 #define BIO_USER_MAPPED 6	/* contains user pages */
 #define BIO_EOPNOTSUPP	7	/* not supported */
+#define BIO_SUSPEND2	8	/* Suspend2 bio - for corruption checking */
 #define bio_flagged(bio, flag)	((bio)->bi_flags & (1 << (flag)))
 
 /*
diff -urN oldtree/include/linux/dyn_pageflags.h newtree/include/linux/dyn_pageflags.h
--- oldtree/include/linux/dyn_pageflags.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/include/linux/dyn_pageflags.h	2006-03-08 15:22:33.185501250 +0000
@@ -0,0 +1,66 @@
+/*
+ * include/linux/dyn_pageflags.h
+ *
+ * Copyright (C) 2004-2006 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * This file is released under the GPLv2.
+ *
+ * It implements support for dynamically allocated bitmaps that are
+ * used for temporary or infrequently used pageflags, in lieu of
+ * bits in the struct page flags entry.
+ */
+
+#ifndef DYN_PAGEFLAGS_H
+#define DYN_PAGEFLAGS_H
+
+#include <linux/mm.h>
+
+typedef unsigned long *** dyn_pageflags_t;
+
+#define BITNUMBER(page) (page_to_pfn(page))
+
+#if BITS_PER_LONG == 32
+#define UL_SHIFT 5
+#elif BITS_PER_LONG == 64
+#define UL_SHIFT 6
+#else
+#error Bits per long not 32 or 64?
+#endif
+
+#define BIT_NUM_MASK (sizeof(unsigned long) * 8 - 1)
+#define PAGE_NUM_MASK (~((1 << (PAGE_SHIFT + 3)) - 1))
+#define UL_NUM_MASK (~(BIT_NUM_MASK | PAGE_NUM_MASK))
+
+#define BITS_PER_PAGE (PAGE_SIZE << 3)
+#define PAGENUMBER(zone_offset) (zone_offset >> (PAGE_SHIFT + 3))
+#define PAGEINDEX(zone_offset) ((zone_offset & UL_NUM_MASK) >> UL_SHIFT)
+#define PAGEBIT(zone_offset) (zone_offset & BIT_NUM_MASK)
+
+#define PAGE_UL_PTR(bitmap, zone_num, zone_pfn) \
+       ((bitmap[zone_num][PAGENUMBER(zone_pfn)])+PAGEINDEX(zone_pfn))
+
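+/* Worked example, assuming 64-bit longs and 4K pages (so BITS_PER_PAGE
+ * is 32768 and UL_SHIFT is 6): for zone_offset 100000,
+ *
+ *	PAGENUMBER(100000) = 100000 >> 15 = 3	(4th bitmap page)
+ *	PAGEINDEX(100000)  = 26			(27th long on that page)
+ *	PAGEBIT(100000)    = 32			(33rd bit of that long)
+ *
+ * since 100000 == 3 * 32768 + 26 * 64 + 32.
+ */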
+/* With the above macros defined, you can do...
+
+#define PagePageset1(page) (test_dynpageflag(&pageset1_map, page))
+#define SetPagePageset1(page) (set_dynpageflag(&pageset1_map, page))
+#define ClearPagePageset1(page) (clear_dynpageflag(&pageset1_map, page))
+*/
+
+#define BITMAP_FOR_EACH_SET(bitmap, counter) \
+	for (counter = get_next_bit_on(bitmap, -1); counter < max_pfn; \
+		counter = get_next_bit_on(bitmap, counter))
+
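+/* Iteration sketch (pageset1_map as in the comment above; handle_one()
+ * is a hypothetical helper):
+ *
+ *	int pfn;
+ *
+ *	BITMAP_FOR_EACH_SET(pageset1_map, pfn)
+ *		handle_one(pfn_to_page(pfn));
+ */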
+extern void clear_dyn_pageflags(dyn_pageflags_t pagemap);
+extern int allocate_dyn_pageflags(dyn_pageflags_t *pagemap);
+extern void free_dyn_pageflags(dyn_pageflags_t *pagemap);
+extern int dyn_pageflags_pages_per_bitmap(void);
+extern int get_next_bit_on(dyn_pageflags_t bitmap, int counter);
+extern unsigned long *dyn_pageflags_ul_ptr(dyn_pageflags_t *bitmap,
+		struct page *pg);
+
+extern int test_dynpageflag(dyn_pageflags_t *bitmap, struct page *page);
+extern void set_dynpageflag(dyn_pageflags_t *bitmap, struct page *page);
+extern void clear_dynpageflag(dyn_pageflags_t *bitmap, struct page *page);
+#endif
diff -urN oldtree/include/linux/freezer.h newtree/include/linux/freezer.h
--- oldtree/include/linux/freezer.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/include/linux/freezer.h	2006-03-08 15:22:33.189501500 +0000
@@ -0,0 +1,28 @@
+/* Freezer declarations */
+
+#define FREEZER_ON 0
+#define ABORT_FREEZING 1
+
+#define FREEZER_KERNEL_THREADS 0
+#define FREEZER_ALL_THREADS 1
+
+#ifdef CONFIG_PM
+extern unsigned long freezer_state;
+
+#define test_freezer_state(bit) test_bit(bit, &freezer_state)
+#define set_freezer_state(bit) set_bit(bit, &freezer_state)
+#define clear_freezer_state(bit) clear_bit(bit, &freezer_state)
+
+#define freezer_is_on() (test_freezer_state(FREEZER_ON))
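+/* Polling sketch for a kernel thread, assuming FREEZER_ON is set for
+ * the duration of a freeze (do_work() is a placeholder):
+ *
+ *	while (!kthread_should_stop()) {
+ *		if (freezer_is_on())
+ *			try_to_freeze();
+ *		do_work();
+ *	}
+ */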
+
+struct notifier_block;
+extern void do_freeze_process(struct notifier_block *nl);
+
+#else
+
+#define test_freezer_state(bit) (0)
+#define set_freezer_state(bit) do { } while (0)
+#define clear_freezer_state(bit) do { } while (0)
+
+#define freezer_is_on() (0)
+
+#endif
diff -urN oldtree/include/linux/kernel.h newtree/include/linux/kernel.h
--- oldtree/include/linux/kernel.h	2006-03-08 18:48:02.308023250 +0000
+++ newtree/include/linux/kernel.h	2006-03-08 15:22:33.193501750 +0000
@@ -108,6 +108,8 @@
 	__attribute__ ((format (printf, 2, 0)));
 extern int snprintf(char * buf, size_t size, const char * fmt, ...)
 	__attribute__ ((format (printf, 3, 4)));
+extern int snprintf_used(char *buffer, int buffer_size,
+		const char *fmt, ...)
+	__attribute__ ((format (printf, 3, 4)));
 extern int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
 	__attribute__ ((format (printf, 3, 0)));
 extern int scnprintf(char * buf, size_t size, const char * fmt, ...)
diff -urN oldtree/include/linux/kthread.h newtree/include/linux/kthread.h
--- oldtree/include/linux/kthread.h	2006-01-03 03:21:10.000000000 +0000
+++ newtree/include/linux/kthread.h	2006-03-08 15:22:33.193501750 +0000
@@ -23,10 +23,20 @@
  *
  * Returns a task_struct or ERR_PTR(-ENOMEM).
  */
+struct task_struct *__kthread_create(int (*threadfn)(void *data),
+				   void *data,
+				   unsigned long freezer_flags,
+				   const char namefmt[],
+				   va_list *args);
+
 struct task_struct *kthread_create(int (*threadfn)(void *data),
 				   void *data,
 				   const char namefmt[], ...);
 
+struct task_struct *kthread_nofreeze_create(int (*threadfn)(void *data),
+				   void *data,
+				   const char namefmt[], ...);
+
 /**
  * kthread_run: create and wake a thread.
  * @threadfn: the function to run until signal_pending(current).
@@ -35,14 +45,15 @@
  *
  * Description: Convenient wrapper for kthread_create() followed by
  * wake_up_process().  Returns the kthread, or ERR_PTR(-ENOMEM). */
-#define kthread_run(threadfn, data, namefmt, ...)			   \
-({									   \
-	struct task_struct *__k						   \
-		= kthread_create(threadfn, data, namefmt, ## __VA_ARGS__); \
-	if (!IS_ERR(__k))						   \
-		wake_up_process(__k);					   \
-	__k;								   \
-})
+
+extern struct task_struct *kthread_run(int (*threadfn)(void *data),
+					void *data,
+					const char namefmt[], ...);
+
+extern struct task_struct *kthread_nofreeze_run(int (*threadfn)(void *data),
+					void *data,
+					const char namefmt[], ...);
+
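+/* Usage sketch, assuming the natural semantics of the new variant: a
+ * thread that must keep servicing I/O while other tasks are frozen:
+ *
+ *	struct task_struct *tsk =
+ *		kthread_nofreeze_run(io_thread_fn, NULL, "suspend2_io");
+ *	if (IS_ERR(tsk))
+ *		return PTR_ERR(tsk);
+ *
+ * io_thread_fn and the thread name are illustrative only.
+ */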
 
 /**
  * kthread_bind: bind a just-created kthread to a cpu.
diff -urN oldtree/include/linux/netlink.h newtree/include/linux/netlink.h
--- oldtree/include/linux/netlink.h	2006-03-08 18:48:02.328024500 +0000
+++ newtree/include/linux/netlink.h	2006-03-08 15:22:33.197502000 +0000
@@ -21,6 +21,8 @@
 #define NETLINK_DNRTMSG		14	/* DECnet routing messages */
 #define NETLINK_KOBJECT_UEVENT	15	/* Kernel messages to userspace */
 #define NETLINK_GENERIC		16
+#define NETLINK_SUSPEND2_USERUI	17	/* For suspend2's userui */
+#define NETLINK_SUSPEND2_USM	18	/* For suspend2's userspace storage manager */
 
 #define MAX_LINKS 32		
 
diff -urN oldtree/include/linux/sched.h newtree/include/linux/sched.h
--- oldtree/include/linux/sched.h	2006-03-08 18:48:02.348025750 +0000
+++ newtree/include/linux/sched.h	2006-03-08 15:22:33.201502250 +0000
@@ -1445,7 +1445,7 @@
 
 extern void refrigerator(void);
 extern int freeze_processes(void);
-extern void thaw_processes(void);
+extern void thaw_processes(int which_threads);
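+/* which_threads is presumably one of FREEZER_KERNEL_THREADS /
+ * FREEZER_ALL_THREADS from linux/freezer.h, added by this patch. */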
 
 static inline int try_to_freeze(void)
 {
@@ -1464,7 +1464,7 @@
 
 static inline void refrigerator(void) {}
 static inline int freeze_processes(void) { BUG(); return 0; }
-static inline void thaw_processes(void) {}
+static inline void thaw_processes(int which_threads) {}
 
 static inline int try_to_freeze(void) { return 0; }
 
diff -urN oldtree/include/linux/sched.h.orig newtree/include/linux/sched.h.orig
--- oldtree/include/linux/sched.h.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/include/linux/sched.h.orig	2006-03-08 15:21:19.080870000 +0000
@@ -0,0 +1,1474 @@
+#ifndef _LINUX_SCHED_H
+#define _LINUX_SCHED_H
+
+#include <asm/param.h>	/* for HZ */
+
+#include <linux/config.h>
+#include <linux/capability.h>
+#include <linux/threads.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/timex.h>
+#include <linux/jiffies.h>
+#include <linux/rbtree.h>
+#include <linux/thread_info.h>
+#include <linux/cpumask.h>
+#include <linux/errno.h>
+#include <linux/nodemask.h>
+
+#include <asm/system.h>
+#include <asm/semaphore.h>
+#include <asm/page.h>
+#include <asm/ptrace.h>
+#include <asm/mmu.h>
+#include <asm/cputime.h>
+
+#include <linux/smp.h>
+#include <linux/sem.h>
+#include <linux/signal.h>
+#include <linux/securebits.h>
+#include <linux/fs_struct.h>
+#include <linux/compiler.h>
+#include <linux/completion.h>
+#include <linux/pid.h>
+#include <linux/percpu.h>
+#include <linux/topology.h>
+#include <linux/seccomp.h>
+#include <linux/rcupdate.h>
+#include <linux/futex.h>
+
+#include <linux/auxvec.h>	/* For AT_VECTOR_SIZE */
+
+struct exec_domain;
+struct bio;
+
+/*
+ * cloning flags:
+ */
+#define CSIGNAL		0x000000ff	/* signal mask to be sent at exit */
+#define CLONE_VM	0x00000100	/* set if VM shared between processes */
+#define CLONE_FS	0x00000200	/* set if fs info shared between processes */
+#define CLONE_FILES	0x00000400	/* set if open files shared between processes */
+#define CLONE_SIGHAND	0x00000800	/* set if signal handlers and blocked signals shared */
+#define CLONE_PTRACE	0x00002000	/* set if we want to let tracing continue on the child too */
+#define CLONE_VFORK	0x00004000	/* set if the parent wants the child to wake it up on mm_release */
+#define CLONE_PARENT	0x00008000	/* set if we want to have the same parent as the cloner */
+#define CLONE_THREAD	0x00010000	/* Same thread group? */
+#define CLONE_NEWNS	0x00020000	/* New namespace group? */
+#define CLONE_SYSVSEM	0x00040000	/* share system V SEM_UNDO semantics */
+#define CLONE_SETTLS	0x00080000	/* create a new TLS for the child */
+#define CLONE_PARENT_SETTID	0x00100000	/* set the TID in the parent */
+#define CLONE_CHILD_CLEARTID	0x00200000	/* clear the TID in the child */
+#define CLONE_DETACHED		0x00400000	/* Unused, ignored */
+#define CLONE_UNTRACED		0x00800000	/* set if the tracing process can't force CLONE_PTRACE on this clone */
+#define CLONE_CHILD_SETTID	0x01000000	/* set the TID in the child */
+#define CLONE_STOPPED		0x02000000	/* Start in stopped state */
+
+/*
+ * List of flags we want to share for kernel threads,
+ * if only because they are not used by them anyway.
+ */
+#define CLONE_KERNEL	(CLONE_FS | CLONE_FILES | CLONE_SIGHAND)
+
+/*
+ * These are the constant used to fake the fixed-point load-average
+ * counting. Some notes:
+ *  - 11 bit fractions expand to 22 bits by the multiplies: this gives
+ *    a load-average precision of 10 bits integer + 11 bits fractional
+ *  - if you want to count load-averages more often, you need more
+ *    precision, or rounding will get you. With 2-second counting freq,
+ *    the EXP_n values would be 1981, 2034 and 2043 if still using only
+ *    11 bit fractions.
+ */
+extern unsigned long avenrun[];		/* Load averages */
+
+#define FSHIFT		11		/* nr of bits of precision */
+#define FIXED_1		(1<<FSHIFT)	/* 1.0 as fixed-point */
+#define LOAD_FREQ	(5*HZ)		/* 5 sec intervals */
+#define EXP_1		1884		/* 1/exp(5sec/1min) as fixed-point */
+#define EXP_5		2014		/* 1/exp(5sec/5min) */
+#define EXP_15		2037		/* 1/exp(5sec/15min) */
+
+#define CALC_LOAD(load,exp,n) \
+	load *= exp; \
+	load += n*(FIXED_1-exp); \
+	load >>= FSHIFT;
+
+extern unsigned long total_forks;
+extern int nr_threads;
+extern int last_pid;
+DECLARE_PER_CPU(unsigned long, process_counts);
+extern int nr_processes(void);
+extern unsigned long nr_running(void);
+extern unsigned long nr_uninterruptible(void);
+extern unsigned long nr_iowait(void);
+
+#include <linux/time.h>
+#include <linux/param.h>
+#include <linux/resource.h>
+#include <linux/timer.h>
+#include <linux/hrtimer.h>
+
+#include <asm/processor.h>
+
+/*
+ * Task state bitmask. NOTE! These bits are also
+ * encoded in fs/proc/array.c: get_task_state().
+ *
+ * We have two separate sets of flags: task->state
+ * is about runnability, while task->exit_state are
+ * about the task exiting. Confusing, but this way
+ * modifying one set can't modify the other one by
+ * mistake.
+ */
+#define TASK_RUNNING		0
+#define TASK_INTERRUPTIBLE	1
+#define TASK_UNINTERRUPTIBLE	2
+#define TASK_STOPPED		4
+#define TASK_TRACED		8
+/* in tsk->exit_state */
+#define EXIT_ZOMBIE		16
+#define EXIT_DEAD		32
+/* in tsk->state again */
+#define TASK_NONINTERACTIVE	64
+
+#define __set_task_state(tsk, state_value)		\
+	do { (tsk)->state = (state_value); } while (0)
+#define set_task_state(tsk, state_value)		\
+	set_mb((tsk)->state, (state_value))
+
+/*
+ * set_current_state() includes a barrier so that the write of current->state
+ * is correctly serialised wrt the caller's subsequent test of whether to
+ * actually sleep:
+ *
+ *	set_current_state(TASK_UNINTERRUPTIBLE);
+ *	if (do_i_need_to_sleep())
+ *		schedule();
+ *
+ * If the caller does not need such serialisation then use __set_current_state()
+ */
+#define __set_current_state(state_value)			\
+	do { current->state = (state_value); } while (0)
+#define set_current_state(state_value)		\
+	set_mb(current->state, (state_value))
+
+/* Task command name length */
+#define TASK_COMM_LEN 16
+
+/*
+ * Scheduling policies
+ */
+#define SCHED_NORMAL		0
+#define SCHED_FIFO		1
+#define SCHED_RR		2
+#define SCHED_BATCH		3
+
+struct sched_param {
+	int sched_priority;
+};
+
+#ifdef __KERNEL__
+
+#include <linux/spinlock.h>
+
+/*
+ * This serializes "schedule()" and also protects
+ * the run-queue from deletions/modifications (but
+ * _adding_ to the beginning of the run-queue has
+ * a separate lock).
+ */
+extern rwlock_t tasklist_lock;
+extern spinlock_t mmlist_lock;
+
+typedef struct task_struct task_t;
+
+extern void sched_init(void);
+extern void sched_init_smp(void);
+extern void init_idle(task_t *idle, int cpu);
+
+extern cpumask_t nohz_cpu_mask;
+
+extern void show_state(void);
+extern void show_regs(struct pt_regs *);
+extern void smp_show_regs(struct pt_regs *, void *);
+
+/*
+ * TASK is a pointer to the task whose backtrace we want to see (or NULL for current
+ * task), SP is the stack pointer of the first frame that should be shown in the back
+ * trace (or NULL if the entire call-chain of the task should be shown).
+ */
+extern void show_stack(struct task_struct *task, unsigned long *sp);
+
+void io_schedule(void);
+long io_schedule_timeout(long timeout);
+
+extern void cpu_init (void);
+extern void trap_init(void);
+extern void update_process_times(int user);
+extern void scheduler_tick(void);
+
+#ifdef CONFIG_DETECT_SOFTLOCKUP
+extern void softlockup_tick(void);
+extern void spawn_softlockup_task(void);
+extern void touch_softlockup_watchdog(void);
+#else
+static inline void softlockup_tick(void)
+{
+}
+static inline void spawn_softlockup_task(void)
+{
+}
+static inline void touch_softlockup_watchdog(void)
+{
+}
+#endif
+
+
+/* Attach to any functions which should be ignored in wchan output. */
+#define __sched		__attribute__((__section__(".sched.text")))
+/* Is this address in the __sched functions? */
+extern int in_sched_functions(unsigned long addr);
+
+#define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
+extern signed long FASTCALL(schedule_timeout(signed long timeout));
+extern signed long schedule_timeout_interruptible(signed long timeout);
+extern signed long schedule_timeout_uninterruptible(signed long timeout);
+asmlinkage void schedule(void);
+
+struct namespace;
+
+/* Maximum number of active map areas.. This is a random (large) number */
+#define DEFAULT_MAX_MAP_COUNT	65536
+
+extern int sysctl_max_map_count;
+
+#include <linux/aio.h>
+
+extern unsigned long
+arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
+		       unsigned long, unsigned long);
+extern unsigned long
+arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
+			  unsigned long len, unsigned long pgoff,
+			  unsigned long flags);
+extern void arch_unmap_area(struct mm_struct *, unsigned long);
+extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
+
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+/*
+ * The mm counters are not protected by its page_table_lock,
+ * so must be incremented atomically.
+ */
+#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
+#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
+#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
+#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
+#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
+typedef atomic_long_t mm_counter_t;
+
+#else  /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+/*
+ * The mm counters are protected by its page_table_lock,
+ * so can be incremented directly.
+ */
+#define set_mm_counter(mm, member, value) (mm)->_##member = (value)
+#define get_mm_counter(mm, member) ((mm)->_##member)
+#define add_mm_counter(mm, member, value) (mm)->_##member += (value)
+#define inc_mm_counter(mm, member) (mm)->_##member++
+#define dec_mm_counter(mm, member) (mm)->_##member--
+typedef unsigned long mm_counter_t;
+
+#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+
+#define get_mm_rss(mm)					\
+	(get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
+#define update_hiwater_rss(mm)	do {			\
+	unsigned long _rss = get_mm_rss(mm);		\
+	if ((mm)->hiwater_rss < _rss)			\
+		(mm)->hiwater_rss = _rss;		\
+} while (0)
+#define update_hiwater_vm(mm)	do {			\
+	if ((mm)->hiwater_vm < (mm)->total_vm)		\
+		(mm)->hiwater_vm = (mm)->total_vm;	\
+} while (0)
+
+struct mm_struct {
+	struct vm_area_struct * mmap;		/* list of VMAs */
+	struct rb_root mm_rb;
+	struct vm_area_struct * mmap_cache;	/* last find_vma result */
+	unsigned long (*get_unmapped_area) (struct file *filp,
+				unsigned long addr, unsigned long len,
+				unsigned long pgoff, unsigned long flags);
+	void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
+	unsigned long mmap_base;		/* base of mmap area */
+	unsigned long task_size;		/* size of task vm space */
+	unsigned long cached_hole_size;         /* if non-zero, the largest hole below free_area_cache */
+	unsigned long free_area_cache;		/* first hole of size cached_hole_size or larger */
+	pgd_t * pgd;
+	atomic_t mm_users;			/* How many users with user space? */
+	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
+	int map_count;				/* number of VMAs */
+	struct rw_semaphore mmap_sem;
+	spinlock_t page_table_lock;		/* Protects page tables and some counters */
+
+	struct list_head mmlist;		/* List of maybe swapped mm's.  These are globally strung
+						 * together off init_mm.mmlist, and are protected
+						 * by mmlist_lock
+						 */
+
+	/* Special counters, in some configurations protected by the
+	 * page_table_lock, in other configurations by being atomic.
+	 */
+	mm_counter_t _file_rss;
+	mm_counter_t _anon_rss;
+
+	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
+	unsigned long hiwater_vm;	/* High-water virtual memory usage */
+
+	unsigned long total_vm, locked_vm, shared_vm, exec_vm;
+	unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
+	unsigned long start_code, end_code, start_data, end_data;
+	unsigned long start_brk, brk, start_stack;
+	unsigned long arg_start, arg_end, env_start, env_end;
+
+	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
+
+	unsigned dumpable:2;
+	cpumask_t cpu_vm_mask;
+
+	/* Architecture-specific MM context */
+	mm_context_t context;
+
+	/* Token based thrashing protection. */
+	unsigned long swap_token_time;
+	char recent_pagein;
+
+	/* coredumping support */
+	int core_waiters;
+	struct completion *core_startup_done, core_done;
+
+	/* aio bits */
+	rwlock_t		ioctx_list_lock;
+	struct kioctx		*ioctx_list;
+};
+
+struct sighand_struct {
+	atomic_t		count;
+	struct k_sigaction	action[_NSIG];
+	spinlock_t		siglock;
+};
+
+/*
+ * NOTE! "signal_struct" does not have its own
+ * locking, because a shared signal_struct always
+ * implies a shared sighand_struct, so locking
+ * sighand_struct is always a proper superset of
+ * the locking of signal_struct.
+ */
+struct signal_struct {
+	atomic_t		count;
+	atomic_t		live;
+
+	wait_queue_head_t	wait_chldexit;	/* for wait4() */
+
+	/* current thread group signal load-balancing target: */
+	task_t			*curr_target;
+
+	/* shared signal handling: */
+	struct sigpending	shared_pending;
+
+	/* thread group exit support */
+	int			group_exit_code;
+	/* overloaded:
+	 * - notify group_exit_task when ->count is equal to notify_count
+	 * - everyone except group_exit_task is stopped during signal delivery
+	 *   of fatal signals, group_exit_task processes the signal.
+	 */
+	struct task_struct	*group_exit_task;
+	int			notify_count;
+
+	/* thread group stop support, overloads group_exit_code too */
+	int			group_stop_count;
+	unsigned int		flags; /* see SIGNAL_* flags below */
+
+	/* POSIX.1b Interval Timers */
+	struct list_head posix_timers;
+
+	/* ITIMER_REAL timer for the process */
+	struct hrtimer real_timer;
+	ktime_t it_real_incr;
+
+	/* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */
+	cputime_t it_prof_expires, it_virt_expires;
+	cputime_t it_prof_incr, it_virt_incr;
+
+	/* job control IDs */
+	pid_t pgrp;
+	pid_t tty_old_pgrp;
+	pid_t session;
+	/* boolean value for session group leader */
+	int leader;
+
+	struct tty_struct *tty; /* NULL if no tty */
+
+	/*
+	 * Cumulative resource counters for dead threads in the group,
+	 * and for reaped dead child processes forked by this group.
+	 * Live threads maintain their own counters and add to these
+	 * in __exit_signal, except for the group leader.
+	 */
+	cputime_t utime, stime, cutime, cstime;
+	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
+	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
+
+	/*
+	 * Cumulative ns of scheduled CPU time for dead threads in the
+	 * group, not including a zombie group leader.  (This only differs
+	 * from jiffies_to_ns(utime + stime) if sched_clock uses something
+	 * other than jiffies.)
+	 */
+	unsigned long long sched_time;
+
+	/*
+	 * We don't bother to synchronize most readers of this at all,
+	 * because there is no reader checking a limit that actually needs
+	 * to get both rlim_cur and rlim_max atomically, and either one
+	 * alone is a single word that can safely be read normally.
+	 * getrlimit/setrlimit use task_lock(current->group_leader) to
+	 * protect this instead of the siglock, because they really
+	 * have no need to disable irqs.
+	 */
+	struct rlimit rlim[RLIM_NLIMITS];
+
+	struct list_head cpu_timers[3];
+
+	/* keep the process-shared keyrings here so that they do the right
+	 * thing in threads created with CLONE_THREAD */
+#ifdef CONFIG_KEYS
+	struct key *session_keyring;	/* keyring inherited over fork */
+	struct key *process_keyring;	/* keyring private to this process */
+#endif
+};
+
+/* Context switch must be unlocked if interrupts are to be enabled */
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+# define __ARCH_WANT_UNLOCKED_CTXSW
+#endif
+
+/*
+ * Bits in flags field of signal_struct.
+ */
+#define SIGNAL_STOP_STOPPED	0x00000001 /* job control stop in effect */
+#define SIGNAL_STOP_DEQUEUED	0x00000002 /* stop signal dequeued */
+#define SIGNAL_STOP_CONTINUED	0x00000004 /* SIGCONT since WCONTINUED reap */
+#define SIGNAL_GROUP_EXIT	0x00000008 /* group exit in progress */
+
+
+/*
+ * Priority of a process goes from 0..MAX_PRIO-1, valid RT
+ * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
+ * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
+ * values are inverted: lower p->prio value means higher priority.
+ *
+ * The MAX_USER_RT_PRIO value allows the actual maximum
+ * RT priority to be separate from the value exported to
+ * user-space.  This allows kernel threads to set their
+ * priority to a value higher than any user task. Note:
+ * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
+ */
+
+#define MAX_USER_RT_PRIO	100
+#define MAX_RT_PRIO		MAX_USER_RT_PRIO
+
+#define MAX_PRIO		(MAX_RT_PRIO + 40)
+
+#define rt_task(p)		(unlikely((p)->prio < MAX_RT_PRIO))
+
+/*
+ * Some day this will be a full-fledged user tracking system...
+ */
+struct user_struct {
+	atomic_t __count;	/* reference count */
+	atomic_t processes;	/* How many processes does this user have? */
+	atomic_t files;		/* How many open files does this user have? */
+	atomic_t sigpending;	/* How many pending signals does this user have? */
+#ifdef CONFIG_INOTIFY
+	atomic_t inotify_watches; /* How many inotify watches does this user have? */
+	atomic_t inotify_devs;	/* How many inotify devs does this user have opened? */
+#endif
+	/* protected by mq_lock	*/
+	unsigned long mq_bytes;	/* How many bytes can be allocated to mqueue? */
+	unsigned long locked_shm; /* How many pages of mlocked shm ? */
+
+#ifdef CONFIG_KEYS
+	struct key *uid_keyring;	/* UID specific keyring */
+	struct key *session_keyring;	/* UID's default session keyring */
+#endif
+
+	/* Hash table maintenance information */
+	struct list_head uidhash_list;
+	uid_t uid;
+};
+
+extern struct user_struct *find_user(uid_t);
+
+extern struct user_struct root_user;
+#define INIT_USER (&root_user)
+
+typedef struct prio_array prio_array_t;
+struct backing_dev_info;
+struct reclaim_state;
+
+#ifdef CONFIG_SCHEDSTATS
+struct sched_info {
+	/* cumulative counters */
+	unsigned long	cpu_time,	/* time spent on the cpu */
+			run_delay,	/* time spent waiting on a runqueue */
+			pcnt;		/* # of timeslices run on this cpu */
+
+	/* timestamps */
+	unsigned long	last_arrival,	/* when we last ran on a cpu */
+			last_queued;	/* when we were last queued to run */
+};
+
+extern struct file_operations proc_schedstat_operations;
+#endif
+
+enum idle_type
+{
+	SCHED_IDLE,
+	NOT_IDLE,
+	NEWLY_IDLE,
+	MAX_IDLE_TYPES
+};
+
+/*
+ * sched-domains (multiprocessor balancing) declarations:
+ */
+#ifdef CONFIG_SMP
+#define SCHED_LOAD_SCALE	128UL	/* increase resolution of load */
+
+#define SD_LOAD_BALANCE		1	/* Do load balancing on this domain. */
+#define SD_BALANCE_NEWIDLE	2	/* Balance when about to become idle */
+#define SD_BALANCE_EXEC		4	/* Balance on exec */
+#define SD_BALANCE_FORK		8	/* Balance on fork, clone */
+#define SD_WAKE_IDLE		16	/* Wake to idle CPU on task wakeup */
+#define SD_WAKE_AFFINE		32	/* Wake task to waking CPU */
+#define SD_WAKE_BALANCE		64	/* Perform balancing at task wakeup */
+#define SD_SHARE_CPUPOWER	128	/* Domain members share cpu power */
+
+struct sched_group {
+	struct sched_group *next;	/* Must be a circular list */
+	cpumask_t cpumask;
+
+	/*
+	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
+	 * single CPU. This is read only (except for setup, hotplug CPU).
+	 */
+	unsigned long cpu_power;
+};
+
+struct sched_domain {
+	/* These fields must be setup */
+	struct sched_domain *parent;	/* top domain must be null terminated */
+	struct sched_group *groups;	/* the balancing groups of the domain */
+	cpumask_t span;			/* span of all CPUs in this domain */
+	unsigned long min_interval;	/* Minimum balance interval ms */
+	unsigned long max_interval;	/* Maximum balance interval ms */
+	unsigned int busy_factor;	/* less balancing by factor if busy */
+	unsigned int imbalance_pct;	/* No balance until over watermark */
+	unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
+	unsigned int cache_nice_tries;	/* Leave cache hot tasks for # tries */
+	unsigned int per_cpu_gain;	/* CPU % gained by adding domain cpus */
+	unsigned int busy_idx;
+	unsigned int idle_idx;
+	unsigned int newidle_idx;
+	unsigned int wake_idx;
+	unsigned int forkexec_idx;
+	int flags;			/* See SD_* */
+
+	/* Runtime fields. */
+	unsigned long last_balance;	/* init to jiffies. units in jiffies */
+	unsigned int balance_interval;	/* initialise to 1. units in ms. */
+	unsigned int nr_balance_failed; /* initialise to 0 */
+
+#ifdef CONFIG_SCHEDSTATS
+	/* load_balance() stats */
+	unsigned long lb_cnt[MAX_IDLE_TYPES];
+	unsigned long lb_failed[MAX_IDLE_TYPES];
+	unsigned long lb_balanced[MAX_IDLE_TYPES];
+	unsigned long lb_imbalance[MAX_IDLE_TYPES];
+	unsigned long lb_gained[MAX_IDLE_TYPES];
+	unsigned long lb_hot_gained[MAX_IDLE_TYPES];
+	unsigned long lb_nobusyg[MAX_IDLE_TYPES];
+	unsigned long lb_nobusyq[MAX_IDLE_TYPES];
+
+	/* Active load balancing */
+	unsigned long alb_cnt;
+	unsigned long alb_failed;
+	unsigned long alb_pushed;
+
+	/* SD_BALANCE_EXEC stats */
+	unsigned long sbe_cnt;
+	unsigned long sbe_balanced;
+	unsigned long sbe_pushed;
+
+	/* SD_BALANCE_FORK stats */
+	unsigned long sbf_cnt;
+	unsigned long sbf_balanced;
+	unsigned long sbf_pushed;
+
+	/* try_to_wake_up() stats */
+	unsigned long ttwu_wake_remote;
+	unsigned long ttwu_move_affine;
+	unsigned long ttwu_move_balance;
+#endif
+};
+
+extern void partition_sched_domains(cpumask_t *partition1,
+				    cpumask_t *partition2);
+
+/*
+ * Maximum cache size the migration-costs auto-tuning code will
+ * search from:
+ */
+extern unsigned int max_cache_size;
+
+#endif	/* CONFIG_SMP */
+
+
+struct io_context;			/* See blkdev.h */
+void exit_io_context(void);
+struct cpuset;
+
+#define NGROUPS_SMALL		32
+#define NGROUPS_PER_BLOCK	((int)(PAGE_SIZE / sizeof(gid_t)))
+struct group_info {
+	int ngroups;
+	atomic_t usage;
+	gid_t small_block[NGROUPS_SMALL];
+	int nblocks;
+	gid_t *blocks[0];
+};
+
+/*
+ * get_group_info() must be called with the owning task locked (via task_lock())
+ * when task != current.  This is because the vast majority of callers are
+ * looking at current->group_info, which cannot be changed except by the
+ * current task.  Changing current->group_info requires the task lock, too.
+ */
+#define get_group_info(group_info) do { \
+	atomic_inc(&(group_info)->usage); \
+} while (0)
+
+#define put_group_info(group_info) do { \
+	if (atomic_dec_and_test(&(group_info)->usage)) \
+		groups_free(group_info); \
+} while (0)
+
+extern struct group_info *groups_alloc(int gidsetsize);
+extern void groups_free(struct group_info *group_info);
+extern int set_current_groups(struct group_info *group_info);
+extern int groups_search(struct group_info *group_info, gid_t grp);
+/* access the groups "array" with this macro */
+#define GROUP_AT(gi, i) \
+    ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK])
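+
+/*
+ * Illustrative sketch (not part of the original patch): walking another
+ * task's supplementary groups with GROUP_AT.  Per the rule above, the
+ * task is locked while the group_info pointer is snapshotted and
+ * pinned; the helper name show_groups() is hypothetical.
+ *
+ *	static void show_groups(struct task_struct *p)
+ *	{
+ *		struct group_info *gi;
+ *		int i;
+ *
+ *		task_lock(p);
+ *		gi = p->group_info;
+ *		get_group_info(gi);
+ *		task_unlock(p);
+ *		for (i = 0; i < gi->ngroups; i++)
+ *			printk("gid %u\n", (unsigned) GROUP_AT(gi, i));
+ *		put_group_info(gi);
+ *	}
+ */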
+
+#ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
+extern void prefetch_stack(struct task_struct*);
+#else
+static inline void prefetch_stack(struct task_struct *t) { }
+#endif
+
+struct audit_context;		/* See audit.c */
+struct mempolicy;
+
+enum sleep_type {
+	SLEEP_NORMAL,
+	SLEEP_NONINTERACTIVE,
+	SLEEP_INTERACTIVE,
+	SLEEP_INTERRUPTED,
+};
+
+struct task_struct {
+	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
+	struct thread_info *thread_info;
+	atomic_t usage;
+	unsigned long flags;	/* per process flags, defined below */
+	unsigned long ptrace;
+
+	int lock_depth;		/* BKL lock depth */
+
+#ifdef CONFIG_SMP
+#ifdef __ARCH_WANT_UNLOCKED_CTXSW
+	int oncpu;
+#endif
+	int load_weight;	/* for load balancing purposes */
+#endif
+	int prio, static_prio;
+	struct list_head run_list;
+	prio_array_t *array;
+
+	unsigned short ioprio;
+	unsigned int btrace_seq;
+
+	unsigned long sleep_avg;
+	unsigned long long timestamp, last_ran;
+	unsigned long long sched_time; /* sched_clock time spent running */
+	enum sleep_type sleep_type;
+
+	unsigned long policy;
+	cpumask_t cpus_allowed;
+	unsigned int time_slice, first_time_slice;
+
+#ifdef CONFIG_SCHEDSTATS
+	struct sched_info sched_info;
+#endif
+
+	seccomp_t seccomp;
+
+	struct list_head tasks;
+	/*
+	 * ptrace_list/ptrace_children forms the list of my children
+	 * that were stolen by a ptracer.
+	 */
+	struct list_head ptrace_children;
+	struct list_head ptrace_list;
+
+	struct mm_struct *mm, *active_mm;
+
+/* task state */
+	struct linux_binfmt *binfmt;
+	long exit_state;
+	int exit_code, exit_signal;
+	int pdeath_signal;  /*  The signal sent when the parent dies  */
+	/* ??? */
+	unsigned long personality;
+	unsigned did_exec:1;
+	pid_t pid;
+	pid_t tgid;
+	/* 
+	 * pointers to (original) parent process, youngest child, younger sibling,
+	 * older sibling, respectively.  (p->father can be replaced with 
+	 * p->parent->pid)
+	 */
+	struct task_struct *real_parent; /* real parent process (when being debugged) */
+	struct task_struct *parent;	/* parent process */
+	/*
+	 * children/sibling forms the list of my children plus the
+	 * tasks I'm ptracing.
+	 */
+	struct list_head children;	/* list of my children */
+	struct list_head sibling;	/* linkage in my parent's children list */
+	struct task_struct *group_leader;	/* threadgroup leader */
+
+	/* PID/PID hash table linkage. */
+	struct pid pids[PIDTYPE_MAX];
+
+	struct completion *vfork_done;		/* for vfork() */
+	int __user *set_child_tid;		/* CLONE_CHILD_SETTID */
+	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
+
+	unsigned long rt_priority;
+	cputime_t utime, stime;
+	unsigned long nvcsw, nivcsw; /* context switch counts */
+	struct timespec start_time;
+/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
+	unsigned long min_flt, maj_flt;
+
+  	cputime_t it_prof_expires, it_virt_expires;
+	unsigned long long it_sched_expires;
+	struct list_head cpu_timers[3];
+
+/* process credentials */
+	uid_t uid,euid,suid,fsuid;
+	gid_t gid,egid,sgid,fsgid;
+	struct group_info *group_info;
+	kernel_cap_t   cap_effective, cap_inheritable, cap_permitted;
+	unsigned keep_capabilities:1;
+	struct user_struct *user;
+#ifdef CONFIG_KEYS
+	struct key *request_key_auth;	/* assumed request_key authority */
+	struct key *thread_keyring;	/* keyring private to this thread */
+	unsigned char jit_keyring;	/* default keyring to attach requested keys to */
+#endif
+	int oomkilladj; /* OOM kill score adjustment (bit shift). */
+	char comm[TASK_COMM_LEN]; /* executable name excluding path
+				     - access with [gs]et_task_comm (which lock
+				       it with task_lock())
+				     - initialized normally by flush_old_exec */
+/* file system info */
+	int link_count, total_link_count;
+/* ipc stuff */
+	struct sysv_sem sysvsem;
+/* CPU-specific state of this task */
+	struct thread_struct thread;
+/* filesystem information */
+	struct fs_struct *fs;
+/* open file information */
+	struct files_struct *files;
+/* namespace */
+	struct namespace *namespace;
+/* signal handlers */
+	struct signal_struct *signal;
+	struct sighand_struct *sighand;
+
+	sigset_t blocked, real_blocked;
+	sigset_t saved_sigmask;		/* To be restored with TIF_RESTORE_SIGMASK */
+	struct sigpending pending;
+
+	unsigned long sas_ss_sp;
+	size_t sas_ss_size;
+	int (*notifier)(void *priv);
+	void *notifier_data;
+	sigset_t *notifier_mask;
+	
+	void *security;
+	struct audit_context *audit_context;
+
+/* Thread group tracking */
+   	u32 parent_exec_id;
+   	u32 self_exec_id;
+/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
+	spinlock_t alloc_lock;
+
+#ifdef CONFIG_DEBUG_MUTEXES
+	/* mutex deadlock detection */
+	struct mutex_waiter *blocked_on;
+#endif
+
+/* journalling filesystem info */
+	void *journal_info;
+
+/* stacked block device info */
+	struct bio *bio_list, **bio_tail;
+
+/* VM state */
+	struct reclaim_state *reclaim_state;
+
+	struct backing_dev_info *backing_dev_info;
+
+	struct io_context *io_context;
+
+	unsigned long ptrace_message;
+	siginfo_t *last_siginfo; /* For ptrace use.  */
+/*
+ * current io wait handle: wait queue entry to use for io waits.
+ * If this thread is processing aio, this points at the waitqueue
+ * inside the currently handled kiocb. It may be NULL (i.e. default
+ * to a stack-based synchronous wait) if it's doing sync IO.
+ */
+	wait_queue_t *io_wait;
+/* i/o counters (bytes read/written, #syscalls) */
+	u64 rchar, wchar, syscr, syscw;
+#if defined(CONFIG_BSD_PROCESS_ACCT)
+	u64 acct_rss_mem1;	/* accumulated rss usage */
+	u64 acct_vm_mem1;	/* accumulated virtual memory usage */
+	clock_t acct_stimexpd;	/* clock_t-converted stime since last update */
+#endif
+#ifdef CONFIG_NUMA
+  	struct mempolicy *mempolicy;
+	short il_next;
+#endif
+#ifdef CONFIG_CPUSETS
+	struct cpuset *cpuset;
+	nodemask_t mems_allowed;
+	int cpuset_mems_generation;
+	int cpuset_mem_spread_rotor;
+#endif
+	struct robust_list_head __user *robust_list;
+#ifdef CONFIG_COMPAT
+	struct compat_robust_list_head __user *compat_robust_list;
+#endif
+
+	atomic_t fs_excl;	/* holding fs exclusive resources */
+	struct rcu_head rcu;
+};
+
+static inline pid_t process_group(struct task_struct *tsk)
+{
+	return tsk->signal->pgrp;
+}
+
+/**
+ * pid_alive - check that a task structure is not stale
+ * @p: Task structure to be checked.
+ *
+ * Test if a process is not yet dead (at most zombie state).
+ * If pid_alive fails, then pointers within the task structure
+ * can be stale and must not be dereferenced.
+ */
+static inline int pid_alive(struct task_struct *p)
+{
+	return p->pids[PIDTYPE_PID].nr != 0;
+}
+
+extern void free_task(struct task_struct *tsk);
+extern void __put_task_struct(struct task_struct *tsk);
+#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
+
+extern void __put_task_struct_cb(struct rcu_head *rhp);
+
+static inline void put_task_struct(struct task_struct *t)
+{
+	if (atomic_dec_and_test(&t->usage))
+		call_rcu(&t->rcu, __put_task_struct_cb);
+}
+
+/*
+ * Per process flags
+ */
+#define PF_ALIGNWARN	0x00000001	/* Print alignment warning msgs */
					/* Not implemented yet, only for 486 */
+#define PF_STARTING	0x00000002	/* being created */
+#define PF_EXITING	0x00000004	/* getting shut down */
+#define PF_DEAD		0x00000008	/* Dead */
+#define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
+#define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
+#define PF_DUMPCORE	0x00000200	/* dumped core */
+#define PF_SIGNALED	0x00000400	/* killed by a signal */
+#define PF_MEMALLOC	0x00000800	/* Allocating memory */
+#define PF_FLUSHER	0x00001000	/* responsible for disk writeback */
+#define PF_USED_MATH	0x00002000	/* if unset the fpu must be initialized before use */
+#define PF_FREEZE	0x00004000	/* this task is being frozen for suspend now */
+#define PF_NOFREEZE	0x00008000	/* this thread should not be frozen */
+#define PF_FROZEN	0x00010000	/* frozen for system suspend */
+#define PF_FSTRANS	0x00020000	/* inside a filesystem transaction */
+#define PF_KSWAPD	0x00040000	/* I am kswapd */
+#define PF_SWAPOFF	0x00080000	/* I am in swapoff */
+#define PF_LESS_THROTTLE 0x00100000	/* Throttle me less: I clean memory */
+#define PF_SYNCWRITE	0x00200000	/* I am doing a sync write */
+#define PF_BORROWED_MM	0x00400000	/* I am a kthread doing use_mm */
+#define PF_RANDOMIZE	0x00800000	/* randomize virtual address space */
+#define PF_SWAPWRITE	0x01000000	/* Allowed to write to swap */
+#define PF_SPREAD_PAGE	0x04000000	/* Spread page cache over cpuset */
+#define PF_SPREAD_SLAB	0x08000000	/* Spread some slab caches over cpuset */
+#define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
+
+/*
+ * Only the _current_ task can read/write to tsk->flags, but other
+ * tasks can access tsk->flags in readonly mode for example
+ * with tsk_used_math (like during threaded core dumping).
+ * There is however an exception to this rule during ptrace
+ * or during fork: the ptracer task is allowed to write to the
+ * child->flags of its traced child (same goes for fork, the parent
+ * can write to the child->flags), because we're guaranteed the
+ * child is not running and in turn not changing child->flags
+ * at the same time the parent does it.
+ */
+#define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0)
+#define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0)
+#define clear_used_math() clear_stopped_child_used_math(current)
+#define set_used_math() set_stopped_child_used_math(current)
+#define conditional_stopped_child_used_math(condition, child) \
+	do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)
+#define conditional_used_math(condition) \
+	conditional_stopped_child_used_math(condition, current)
+#define copy_to_stopped_child_used_math(child) \
+	do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)
+/* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
+#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
+#define used_math() tsk_used_math(current)
+
+#ifdef CONFIG_SMP
+extern int set_cpus_allowed(task_t *p, cpumask_t new_mask);
+#else
+static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask)
+{
+	if (!cpu_isset(0, new_mask))
+		return -EINVAL;
+	return 0;
+}
+#endif
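+
+/*
+ * Illustrative sketch (not part of the original patch): pinning the
+ * current task to a single CPU, e.g. for per-CPU calibration work.
+ *
+ *	cpumask_t mask = cpumask_of_cpu(cpu);
+ *
+ *	if (set_cpus_allowed(current, mask) < 0)
+ *		return -EINVAL;		// CPU no longer available
+ */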
+
+extern unsigned long long sched_clock(void);
+extern unsigned long long current_sched_time(const task_t *current_task);
+
+/* sched_exec is called by processes performing an exec */
+#ifdef CONFIG_SMP
+extern void sched_exec(void);
+#else
+#define sched_exec()   {}
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+extern void idle_task_exit(void);
+#else
+static inline void idle_task_exit(void) {}
+#endif
+
+extern void sched_idle_next(void);
+extern void set_user_nice(task_t *p, long nice);
+extern int task_prio(const task_t *p);
+extern int task_nice(const task_t *p);
+extern int can_nice(const task_t *p, const int nice);
+extern int task_curr(const task_t *p);
+extern int idle_cpu(int cpu);
+extern int sched_setscheduler(struct task_struct *, int, struct sched_param *);
+extern task_t *idle_task(int cpu);
+extern task_t *curr_task(int cpu);
+extern void set_curr_task(int cpu, task_t *p);
+
+void yield(void);
+
+/*
+ * The default (Linux) execution domain.
+ */
+extern struct exec_domain	default_exec_domain;
+
+union thread_union {
+	struct thread_info thread_info;
+	unsigned long stack[THREAD_SIZE/sizeof(long)];
+};
+
+#ifndef __HAVE_ARCH_KSTACK_END
+static inline int kstack_end(void *addr)
+{
+	/* Reliable end of stack detection:
+	 * Some APM BIOS versions misalign the stack.
+	 */
+	return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*)));
+}
+#endif
+
+extern union thread_union init_thread_union;
+extern struct task_struct init_task;
+
+extern struct   mm_struct init_mm;
+
+#define find_task_by_pid(nr)	find_task_by_pid_type(PIDTYPE_PID, nr)
+extern struct task_struct *find_task_by_pid_type(int type, int pid);
+extern void set_special_pids(pid_t session, pid_t pgrp);
+extern void __set_special_pids(pid_t session, pid_t pgrp);
+
+/* per-UID process charging. */
+extern struct user_struct * alloc_uid(uid_t);
+static inline struct user_struct *get_uid(struct user_struct *u)
+{
+	atomic_inc(&u->__count);
+	return u;
+}
+extern void free_uid(struct user_struct *);
+extern void switch_uid(struct user_struct *);
+
+#include <asm/current.h>
+
+extern void do_timer(struct pt_regs *);
+
+extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state));
+extern int FASTCALL(wake_up_process(struct task_struct * tsk));
+extern void FASTCALL(wake_up_new_task(struct task_struct * tsk,
+						unsigned long clone_flags));
+#ifdef CONFIG_SMP
+ extern void kick_process(struct task_struct *tsk);
+#else
+ static inline void kick_process(struct task_struct *tsk) { }
+#endif
+extern void FASTCALL(sched_fork(task_t * p, int clone_flags));
+extern void FASTCALL(sched_exit(task_t * p));
+
+extern int in_group_p(gid_t);
+extern int in_egroup_p(gid_t);
+
+extern void proc_caches_init(void);
+extern void flush_signals(struct task_struct *);
+extern void flush_signal_handlers(struct task_struct *, int force_default);
+extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
+
+static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&tsk->sighand->siglock, flags);
+	ret = dequeue_signal(tsk, mask, info);
+	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+
+	return ret;
+}	
+
+extern void block_all_signals(int (*notifier)(void *priv), void *priv,
+			      sigset_t *mask);
+extern void unblock_all_signals(void);
+extern void release_task(struct task_struct * p);
+extern int send_sig_info(int, struct siginfo *, struct task_struct *);
+extern int send_group_sig_info(int, struct siginfo *, struct task_struct *);
+extern int force_sigsegv(int, struct task_struct *);
+extern int force_sig_info(int, struct siginfo *, struct task_struct *);
+extern int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp);
+extern int kill_pg_info(int, struct siginfo *, pid_t);
+extern int kill_proc_info(int, struct siginfo *, pid_t);
+extern int kill_proc_info_as_uid(int, struct siginfo *, pid_t, uid_t, uid_t);
+extern void do_notify_parent(struct task_struct *, int);
+extern void force_sig(int, struct task_struct *);
+extern void force_sig_specific(int, struct task_struct *);
+extern int send_sig(int, struct task_struct *, int);
+extern void zap_other_threads(struct task_struct *p);
+extern int kill_pg(pid_t, int, int);
+extern int kill_proc(pid_t, int, int);
+extern struct sigqueue *sigqueue_alloc(void);
+extern void sigqueue_free(struct sigqueue *);
+extern int send_sigqueue(int, struct sigqueue *,  struct task_struct *);
+extern int send_group_sigqueue(int, struct sigqueue *,  struct task_struct *);
+extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
+extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long);
+
+/* These can be the second arg to send_sig_info/send_group_sig_info.  */
+#define SEND_SIG_NOINFO ((struct siginfo *) 0)
+#define SEND_SIG_PRIV	((struct siginfo *) 1)
+#define SEND_SIG_FORCED	((struct siginfo *) 2)
+
+static inline int is_si_special(const struct siginfo *info)
+{
+	return info <= SEND_SIG_FORCED;
+}
+
+/* True if we are on the alternate signal stack.  */
+
+static inline int on_sig_stack(unsigned long sp)
+{
+	return (sp - current->sas_ss_sp < current->sas_ss_size);
+}
+
+static inline int sas_ss_flags(unsigned long sp)
+{
+	return (current->sas_ss_size == 0 ? SS_DISABLE
+		: on_sig_stack(sp) ? SS_ONSTACK : 0);
+}
+
+/*
+ * Routines for handling mm_structs
+ */
+extern struct mm_struct * mm_alloc(void);
+
+/* mmdrop drops the mm and the page tables */
+extern void FASTCALL(__mmdrop(struct mm_struct *));
+static inline void mmdrop(struct mm_struct * mm)
+{
+	if (atomic_dec_and_test(&mm->mm_count))
+		__mmdrop(mm);
+}
+
+/* mmput gets rid of the mappings and all user-space */
+extern void mmput(struct mm_struct *);
+/* Grab a reference to a task's mm, if it is not already going away */
+extern struct mm_struct *get_task_mm(struct task_struct *task);
+/* Remove the current tasks stale references to the old mm_struct */
+extern void mm_release(struct task_struct *, struct mm_struct *);
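+
+/*
+ * Illustrative sketch (not part of the original patch): pinning another
+ * task's address space.  get_task_mm() takes an mm_users reference (it
+ * fails for kernel threads and exiting tasks); the matching mmput()
+ * drops it.  mm_count, by contrast, only keeps the mm_struct and page
+ * tables alive and pairs with mmdrop().
+ *
+ *	mm = get_task_mm(task);
+ *	if (mm) {
+ *		down_read(&mm->mmap_sem);
+ *		// ... inspect mm->total_vm, walk the VMA list, etc. ...
+ *		up_read(&mm->mmap_sem);
+ *		mmput(mm);
+ *	}
+ */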
+
+extern int  copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
+extern void flush_thread(void);
+extern void exit_thread(void);
+
+extern void exit_files(struct task_struct *);
+extern void __cleanup_signal(struct signal_struct *);
+extern void cleanup_sighand(struct task_struct *);
+extern void exit_itimers(struct signal_struct *);
+
+extern NORET_TYPE void do_group_exit(int);
+
+extern void daemonize(const char *, ...);
+extern int allow_signal(int);
+extern int disallow_signal(int);
+extern task_t *child_reaper;
+
+extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *);
+extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
+task_t *fork_idle(int);
+
+extern void set_task_comm(struct task_struct *tsk, char *from);
+extern void get_task_comm(char *to, struct task_struct *tsk);
+
+#ifdef CONFIG_SMP
+extern void wait_task_inactive(task_t * p);
+#else
+#define wait_task_inactive(p)	do { } while (0)
+#endif
+
+#define remove_parent(p)	list_del_init(&(p)->sibling)
+#define add_parent(p)		list_add_tail(&(p)->sibling,&(p)->parent->children)
+
+#define next_task(p)	list_entry((p)->tasks.next, struct task_struct, tasks)
+#define prev_task(p)	list_entry((p)->tasks.prev, struct task_struct, tasks)
+
+#define for_each_process(p) \
+	for (p = &init_task ; (p = next_task(p)) != &init_task ; )
+
+/*
+ * Careful: do_each_thread/while_each_thread is a double loop so
+ *          'break' will not work as expected - use goto instead.
+ */
+#define do_each_thread(g, t) \
+	for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
+
+#define while_each_thread(g, t) \
+	while ((t = next_thread(t)) != g)
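+
+/*
+ * Illustrative sketch (not part of the original patch): visiting every
+ * thread in the system under tasklist_lock.  Since this is a nested
+ * loop, leave it with goto rather than break, as warned above.
+ *
+ *	struct task_struct *g, *t;
+ *
+ *	read_lock(&tasklist_lock);
+ *	do_each_thread(g, t) {
+ *		if (t->mm == mm)
+ *			goto found;
+ *	} while_each_thread(g, t);
+ * found:
+ *	read_unlock(&tasklist_lock);
+ */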
+
+extern task_t * FASTCALL(next_thread(const task_t *p));
+
+#define thread_group_leader(p)	(p->pid == p->tgid)
+
+static inline int thread_group_empty(task_t *p)
+{
+	return list_empty(&p->pids[PIDTYPE_TGID].pid_list);
+}
+
+#define delay_group_leader(p) \
+		(thread_group_leader(p) && !thread_group_empty(p))
+
+/*
+ * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring
+ * subscriptions and synchronises with wait4().  Also used in procfs.  Also
+ * pins the final release of task.io_context.  Also protects ->cpuset.
+ *
+ * Nests both inside and outside of read_lock(&tasklist_lock).
+ * It must not be nested with write_lock_irq(&tasklist_lock),
+ * neither inside nor outside.
+ */
+static inline void task_lock(struct task_struct *p)
+{
+	spin_lock(&p->alloc_lock);
+}
+
+static inline void task_unlock(struct task_struct *p)
+{
+	spin_unlock(&p->alloc_lock);
+}
+
+extern struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
+							unsigned long *flags);
+
+static inline void unlock_task_sighand(struct task_struct *tsk,
+						unsigned long *flags)
+{
+	spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
+}
+
+#ifndef __HAVE_THREAD_FUNCTIONS
+
+#define task_thread_info(task) (task)->thread_info
+#define task_stack_page(task) ((void*)((task)->thread_info))
+
+static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
+{
+	*task_thread_info(p) = *task_thread_info(org);
+	task_thread_info(p)->task = p;
+}
+
+static inline unsigned long *end_of_stack(struct task_struct *p)
+{
+	return (unsigned long *)(p->thread_info + 1);
+}
+
+#endif
+
+/* set thread flags in other task's structures
+ * - see asm/thread_info.h for TIF_xxxx flags available
+ */
+static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
+{
+	set_ti_thread_flag(task_thread_info(tsk), flag);
+}
+
+static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
+{
+	clear_ti_thread_flag(task_thread_info(tsk), flag);
+}
+
+static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
+{
+	return test_and_set_ti_thread_flag(task_thread_info(tsk), flag);
+}
+
+static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
+{
+	return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag);
+}
+
+static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
+{
+	return test_ti_thread_flag(task_thread_info(tsk), flag);
+}
+
+static inline void set_tsk_need_resched(struct task_struct *tsk)
+{
+	set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
+}
+
+static inline void clear_tsk_need_resched(struct task_struct *tsk)
+{
+	clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
+}
+
+static inline int signal_pending(struct task_struct *p)
+{
+	return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
+}
+  
+static inline int need_resched(void)
+{
+	return unlikely(test_thread_flag(TIF_NEED_RESCHED));
+}
+
+/*
+ * cond_resched() and cond_resched_lock(): latency reduction via
+ * explicit rescheduling in places that are safe. The return
+ * value indicates whether a reschedule was done in fact.
+ * cond_resched_lock() will drop the spinlock before scheduling,
+ * cond_resched_softirq() will enable bhs before scheduling.
+ */
+extern int cond_resched(void);
+extern int cond_resched_lock(spinlock_t * lock);
+extern int cond_resched_softirq(void);
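+
+/*
+ * Illustrative sketch (not part of the original patch): breaking up a
+ * long-running loop so other tasks can run on non-preemptible kernels
+ * (process_page() is a hypothetical helper).
+ *
+ *	for (i = 0; i < nr_pages; i++) {
+ *		process_page(i);
+ *		cond_resched();
+ *	}
+ */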
+
+/*
+ * Does a critical section need to be broken due to another
+ * task waiting?:
+ */
+#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
+# define need_lockbreak(lock) ((lock)->break_lock)
+#else
+# define need_lockbreak(lock) 0
+#endif
+
+/*
+ * Does a critical section need to be broken due to another
+ * task waiting or preemption being signalled:
+ */
+static inline int lock_need_resched(spinlock_t *lock)
+{
+	if (need_lockbreak(lock) || need_resched())
+		return 1;
+	return 0;
+}
+
+/* Reevaluate whether the task has signals pending delivery.
+   This is required every time the blocked sigset_t changes.
+   Callers must hold sighand->siglock.  */
+
+extern FASTCALL(void recalc_sigpending_tsk(struct task_struct *t));
+extern void recalc_sigpending(void);
+
+extern void signal_wake_up(struct task_struct *t, int resume_stopped);
+
+/*
+ * Wrappers for p->thread_info->cpu access. No-op on UP.
+ */
+#ifdef CONFIG_SMP
+
+static inline unsigned int task_cpu(const struct task_struct *p)
+{
+	return task_thread_info(p)->cpu;
+}
+
+static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+	task_thread_info(p)->cpu = cpu;
+}
+
+#else
+
+static inline unsigned int task_cpu(const struct task_struct *p)
+{
+	return 0;
+}
+
+static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+}
+
+#endif /* CONFIG_SMP */
+
+#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
+extern void arch_pick_mmap_layout(struct mm_struct *mm);
+#else
+static inline void arch_pick_mmap_layout(struct mm_struct *mm)
+{
+	mm->mmap_base = TASK_UNMAPPED_BASE;
+	mm->get_unmapped_area = arch_get_unmapped_area;
+	mm->unmap_area = arch_unmap_area;
+}
+#endif
+
+extern long sched_setaffinity(pid_t pid, cpumask_t new_mask);
+extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
+
+extern void normalize_rt_tasks(void);
+
+#ifdef CONFIG_PM
+/*
+ * Check if a process has been frozen
+ */
+static inline int frozen(struct task_struct *p)
+{
+	return p->flags & PF_FROZEN;
+}
+
+/*
+ * Check if there is a request to freeze a process
+ */
+static inline int freezing(struct task_struct *p)
+{
+	return p->flags & PF_FREEZE;
+}
+
+/*
+ * Request that a process be frozen
+ * FIXME: SMP problem. We may not modify another process's flags!
+ */
+static inline void freeze(struct task_struct *p)
+{
+	p->flags |= PF_FREEZE;
+}
+
+/*
+ * Wake up a frozen process
+ */
+static inline int thaw_process(struct task_struct *p)
+{
+	if (frozen(p)) {
+		p->flags &= ~PF_FROZEN;
+		wake_up_process(p);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Freezing is complete; mark the process as frozen
+ */
+static inline void frozen_process(struct task_struct *p)
+{
+	p->flags = (p->flags & ~PF_FREEZE) | PF_FROZEN;
+}
+
+extern void refrigerator(void);
+extern int freeze_processes(void);
+extern void thaw_processes(void);
+
+static inline int try_to_freeze(void)
+{
+	if (freezing(current)) {
+		refrigerator();
+		return 1;
+	} else
+		return 0;
+}
+#else
+static inline int frozen(struct task_struct *p) { return 0; }
+static inline int freezing(struct task_struct *p) { return 0; }
+static inline void freeze(struct task_struct *p) { BUG(); }
+static inline int thaw_process(struct task_struct *p) { return 1; }
+static inline void frozen_process(struct task_struct *p) { BUG(); }
+
+static inline void refrigerator(void) {}
+static inline int freeze_processes(void) { BUG(); return 0; }
+static inline void thaw_processes(void) {}
+
+static inline int try_to_freeze(void) { return 0; }
+
+#endif /* CONFIG_PM */
+#endif /* __KERNEL__ */
+
+#endif
diff -urN oldtree/include/linux/suspend.h newtree/include/linux/suspend.h
--- oldtree/include/linux/suspend.h	2006-03-08 18:47:15.977127750 +0000
+++ newtree/include/linux/suspend.h	2006-03-08 15:22:33.205502500 +0000
@@ -9,6 +9,7 @@
 #include <linux/config.h>
 #include <linux/init.h>
 #include <linux/pm.h>
+#include <linux/suspend2.h>
 
 /* page backup entry */
 typedef struct pbe {
@@ -46,6 +47,8 @@
 #if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
 extern int pm_prepare_console(void);
 extern void pm_restore_console(void);
+extern int freeze_processes(void);
+extern void thaw_processes(int which_threads);
 #else
 static inline int pm_prepare_console(void) { return 0; }
 static inline void pm_restore_console(void) {}
@@ -56,8 +59,12 @@
 	printk("Warning: fake suspend called\n");
 	return -EPERM;
 }
+static inline int freeze_processes(void) { return 0; }
+static inline void thaw_processes(int which_threads) { }
 #endif /* CONFIG_PM */
 
+extern char resume2_file[256];
+
 #ifdef CONFIG_SUSPEND_SMP
 extern void disable_nonboot_cpus(void);
 extern void enable_nonboot_cpus(void);
@@ -69,8 +76,6 @@
 void save_processor_state(void);
 void restore_processor_state(void);
 struct saved_context;
-void __save_processor_state(struct saved_context *ctxt);
-void __restore_processor_state(struct saved_context *ctxt);
 unsigned long get_safe_page(gfp_t gfp_mask);
 
 /*
diff -urN oldtree/include/linux/suspend2.h newtree/include/linux/suspend2.h
--- oldtree/include/linux/suspend2.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/include/linux/suspend2.h	2006-03-08 15:22:33.209502750 +0000
@@ -0,0 +1,231 @@
+#ifndef _LINUX_SUSPEND2_H
+#define _LINUX_SUSPEND2_H
+
+#include <linux/dyn_pageflags.h>
+#include <linux/init.h>
+#ifdef CONFIG_ACPI
+#include <acpi/acpi_bus.h>
+#include <acpi/acpi_drivers.h>
+#endif
+
+/* arch/i386/mm/init.c */
+extern char __nosave_begin, __nosave_end;
+
+extern char __nosavedata swsusp_pg_dir[PAGE_SIZE]
+                  __attribute__ ((aligned (PAGE_SIZE)));
+
+#define SECTOR_SIZE 512
+
+/* kernel/power/process.c */
+
+/* kernel/power/main.c */
+extern unsigned long suspend_result;
+
+/* kernel/power/process.c */
+extern unsigned long suspend_debug_state;
+
+/* arch/i386/power/suspend2.c */
+extern unsigned long suspend_action;
+extern int suspend_io_time[2][2];
+
+extern dyn_pageflags_t pageset1_map;
+extern dyn_pageflags_t	pageset1_copy_map;
+
+#ifdef CONFIG_PM_DEBUG
+#define test_debug_state(bit) (test_bit(bit, &suspend_debug_state))
+#else
+#define test_debug_state(bit) (0)
+#endif
+
+#define test_result_state(bit) (test_bit(bit, &suspend_result))
+
+/* 
+ * First status register - this is suspend's return code.
+ *
+ * All the rest are in kernel/power/suspend2_common.h
+ */
+#define SUSPEND_ABORTED			0
+
+/* Second status register - ditto */
+#define SUSPEND_RETRY_RESUME		0
+
+/* Debug sections  - if debugging compiled in */
+enum {
+	SUSPEND_ANY_SECTION,
+	SUSPEND_FREEZER,
+	SUSPEND_EAT_MEMORY,
+	SUSPEND_PAGESETS,
+	SUSPEND_IO,
+	SUSPEND_BMAP,
+	SUSPEND_HEADER,
+	SUSPEND_WRITER,
+	SUSPEND_MEMORY,
+	SUSPEND_EXTENTS,
+	SUSPEND_SPINLOCKS,
+	SUSPEND_MEM_POOL,
+	SUSPEND_RANGE_PARANOIA,
+	SUSPEND_NOSAVE,
+	SUSPEND_INTEGRITY
+};
+
+/* debugging levels. */
+#define SUSPEND_STATUS		0
+#define SUSPEND_ERROR		2
+#define SUSPEND_LOW	 	3
+#define SUSPEND_MEDIUM	 	4
+#define SUSPEND_HIGH	  	5
+#define SUSPEND_VERBOSE		6
+
+/* second status register */
+enum {
+	SUSPEND_REBOOT,
+	SUSPEND_PAUSE,
+	SUSPEND_SLOW,
+	SUSPEND_NOPAGESET2,
+	SUSPEND_LOGALL,
+	SUSPEND_CAN_CANCEL,
+	SUSPEND_KEEP_IMAGE,
+	SUSPEND_FREEZER_TEST,
+	SUSPEND_FREEZER_TEST_SHOWALL,
+	SUSPEND_SINGLESTEP,
+	SUSPEND_PAUSE_NEAR_PAGESET_END,
+	SUSPEND_USE_ACPI_S4,
+	SUSPEND_TEST_FILTER_SPEED,
+	SUSPEND_FREEZE_TIMERS,
+	SUSPEND_DISABLE_SYSDEV_SUPPORT,
+	SUSPEND_VGA_POST,
+	SUSPEND_TEST_BIO,
+	SUSPEND_NO_PAGESET2,
+};
+
+#ifdef CONFIG_SUSPEND2
+#define test_action_state(bit) (test_bit(bit, &suspend_action))
+#define set_action_state(bit) (test_and_set_bit(bit, &suspend_action))
+#define clear_action_state(bit) (test_and_clear_bit(bit, &suspend_action))
+#else
+#define test_action_state(bit) (0)
+#endif
+
+extern void __suspend_message(unsigned long section, unsigned long level, int log_normally,
+		const char *fmt, ...);
+
+#ifdef CONFIG_PM_DEBUG
+#define suspend_message(sn, lev, log, fmt, a...) \
+do { \
+	if (test_debug_state(sn)) \
+		__suspend_message(sn, lev, log, fmt, ##a); \
+} while(0)
+#else /* CONFIG_PM_DEBUG */
+#define suspend_message(sn, lev, log, fmt, a...) \
+do { \
+	if (lev == 0) \
+		__suspend_message(sn, lev, log, fmt, ##a); \
+} while(0)
+#endif /* CONFIG_PM_DEBUG */
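+
+/*
+ * Illustrative sketch (not part of the original patch): a typical call.
+ * With PM debugging compiled in, this logs only when the SUSPEND_IO
+ * debug section is enabled; otherwise only level-0 messages get
+ * through.  The done/total counters are hypothetical.
+ *
+ *	suspend_message(SUSPEND_IO, SUSPEND_VERBOSE, 1,
+ *			"Wrote %d of %d pages.\n", done, total);
+ */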
+  
+/* Suspend 2 */
+
+enum {
+	SUSPEND_DISABLED,
+	SUSPEND_RUNNING,
+	SUSPEND_RESUME_DEVICE_OK,
+	SUSPEND_NORESUME_SPECIFIED,
+	SUSPEND_COMMANDLINE_ERROR,
+	SUSPEND_IGNORE_IMAGE,
+	SUSPEND_SANITY_CHECK_PROMPT,
+	SUSPEND_FREEZER_ON,
+	SUSPEND_BLOCK_PAGE_ALLOCATIONS,
+	SUSPEND_USE_MEMORY_POOL,
+	SUSPEND_STAGE2_CONTINUE,
+	SUSPEND_FREEZE_SMP,
+	SUSPEND_PAGESET2_NOT_LOADED,
+	SUSPEND_CONTINUE_REQ,
+	SUSPEND_RESUMED_BEFORE,
+	SUSPEND_RUNNING_INITRD,
+	SUSPEND_RESUME_NOT_DONE,
+	SUSPEND_BOOT_TIME,
+	SUSPEND_NOW_RESUMING,
+	SUSPEND_SLAB_ALLOC_FALLBACK,
+	SUSPEND_IGNORE_LOGLEVEL,
+	SUSPEND_TIMER_FREEZER_ON,
+	SUSPEND_ACT_USED,
+	SUSPEND_DBG_USED,
+	SUSPEND_LVL_USED,
+	SUSPEND_TRYING_TO_RESUME,
+	SUSPEND_FORK_COPYBACK_THREAD,
+	SUSPEND_TRY_RESUME_RD,
+	SUSPEND_IGNORE_ROOTFS,
+};
+
+#define test_and_set_suspend_state(bit) \
+	(test_and_set_bit(bit, &software_suspend_state))
+
+#define get_suspend_state() 		(software_suspend_state)
+#define restore_suspend_state(saved_state) \
+	do { software_suspend_state = saved_state; } while(0)
+	
+/* --------------------------------------------------------------------- */
+#ifdef CONFIG_SUSPEND2
+
+/* Used in init dir files */
+extern unsigned long software_suspend_state;
+
+extern void suspend2_try_resume(void);
+extern int suspend_early_boot_message 
+	(int can_erase_image, int default_answer, char *warning_reason, ...);
+extern void suspend_handle_keypress(unsigned int keycode, int source);
+extern unsigned long suspend_update_status (unsigned long value, unsigned long maximum,
+		const char *fmt, ...);
+extern void suspend_prepare_status (int clearbar, const char *fmt, ...);
+
+#define test_suspend_state(bit) \
+	(test_bit(bit, &software_suspend_state))
+
+#define clear_suspend_state(bit) \
+	(clear_bit(bit, &software_suspend_state))
+
+#define set_suspend_state(bit) \
+	(set_bit(bit, &software_suspend_state))
+
+extern inline void suspend_copyback_low(void);
+extern inline void suspend_copyback_high(void);
+
+extern void suspend2_try_suspend(void);
+
+/* --------------------------------------------------------------------- */
+#else
+/* --------------------------------------------------------------------- */
+
+#define software_suspend_state		(0)
+#define clear_suspend_state(bit)	do { } while (0)
+#define test_suspend_state(bit) 	(0)
+#define set_suspend_state(bit)		do { } while(0)
+
+#define suspend2_try_resume()			do { } while(0)
+static inline int suspend_early_boot_message(int a, int b, char *c, ...)	{ return 0; }
+#define suspend_handle_keypress(a, b) 		do { } while(0)
+static inline unsigned long suspend_update_status(unsigned long value, unsigned long maximum,
+		const char *fmt, ...)
+{
+	return maximum;
+}
+#define suspend_prepare_status(a, ...)  do { } while(0)
+
+#endif /* CONFIG_SUSPEND2 */
+
+#if defined(CONFIG_SUSPEND2) && defined(CONFIG_ACPI)
+static inline int may_try_suspend2(u32 state)
+{
+	if (state == ACPI_STATE_S4) {
+		suspend2_try_suspend();
+		return 1;
+	}
+	return 0;
+}
+#else
+static inline int may_try_suspend2(u32 state)
+{
+	return 0;
+}
+#endif
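+
+/*
+ * Illustrative sketch (not part of the original patch): the intended
+ * call site in an ACPI sleep path, letting Suspend2 claim S4 before
+ * the generic swsusp path runs (acpi_state is hypothetical here).
+ *
+ *	if (may_try_suspend2(acpi_state))
+ *		return 0;	// Suspend2 handled the S4 request
+ *	// ... otherwise fall through to the normal path ...
+ */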
+#endif /* _LINUX_SUSPEND2_H */
diff -urN oldtree/include/linux/workqueue.h newtree/include/linux/workqueue.h
--- oldtree/include/linux/workqueue.h	2006-03-08 18:48:02.368027000 +0000
+++ newtree/include/linux/workqueue.h	2006-03-08 15:22:33.213503000 +0000
@@ -55,9 +55,12 @@
 	} while (0)
 
 extern struct workqueue_struct *__create_workqueue(const char *name,
-						    int singlethread);
-#define create_workqueue(name) __create_workqueue((name), 0)
-#define create_singlethread_workqueue(name) __create_workqueue((name), 1)
+						    int singlethread,
+						    unsigned long freezer_flag);
+#define create_workqueue(name) __create_workqueue((name), 0, 0)
+#define create_nofreeze_workqueue(name) __create_workqueue((name), 0, PF_NOFREEZE)
+#define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0)
+#define create_nofreeze_singlethread_workqueue(name) __create_workqueue((name), 1, PF_NOFREEZE)
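+
+/*
+ * Illustrative sketch (not part of the original patch): a driver whose
+ * worker must keep running across the suspend cycle picks a nofreeze
+ * variant; everything else keeps the default, freezable behaviour
+ * ("mywq" and my_work are hypothetical).
+ *
+ *	wq = create_nofreeze_singlethread_workqueue("mywq");
+ *	if (wq)
+ *		queue_work(wq, &my_work);
+ */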
 
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
diff -urN oldtree/include/linux/workqueue.h.orig newtree/include/linux/workqueue.h.orig
--- oldtree/include/linux/workqueue.h.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/include/linux/workqueue.h.orig	2006-03-08 15:21:19.104871500 +0000
@@ -0,0 +1,99 @@
+/*
+ * workqueue.h --- work queue handling for Linux.
+ */
+
+#ifndef _LINUX_WORKQUEUE_H
+#define _LINUX_WORKQUEUE_H
+
+#include <linux/timer.h>
+#include <linux/linkage.h>
+#include <linux/bitops.h>
+
+struct workqueue_struct;
+
+struct work_struct {
+	unsigned long pending;
+	struct list_head entry;
+	void (*func)(void *);
+	void *data;
+	void *wq_data;
+	struct timer_list timer;
+};
+
+struct execute_work {
+	struct work_struct work;
+};
+
+#define __WORK_INITIALIZER(n, f, d) {				\
+        .entry	= { &(n).entry, &(n).entry },			\
+	.func = (f),						\
+	.data = (d),						\
+	.timer = TIMER_INITIALIZER(NULL, 0, 0),			\
+	}
+
+#define DECLARE_WORK(n, f, d)					\
+	struct work_struct n = __WORK_INITIALIZER(n, f, d)
+
+/*
+ * initialize a work-struct's func and data pointers:
+ */
+#define PREPARE_WORK(_work, _func, _data)			\
+	do {							\
+		(_work)->func = _func;				\
+		(_work)->data = _data;				\
+	} while (0)
+
+/*
+ * initialize all of a work-struct:
+ */
+#define INIT_WORK(_work, _func, _data)				\
+	do {							\
+		INIT_LIST_HEAD(&(_work)->entry);		\
+		(_work)->pending = 0;				\
+		PREPARE_WORK((_work), (_func), (_data));	\
+		init_timer(&(_work)->timer);			\
+	} while (0)
+
+extern struct workqueue_struct *__create_workqueue(const char *name,
+						    int singlethread);
+#define create_workqueue(name) __create_workqueue((name), 0)
+#define create_singlethread_workqueue(name) __create_workqueue((name), 1)
+
+extern void destroy_workqueue(struct workqueue_struct *wq);
+
+extern int FASTCALL(queue_work(struct workqueue_struct *wq, struct work_struct *work));
+extern int FASTCALL(queue_delayed_work(struct workqueue_struct *wq, struct work_struct *work, unsigned long delay));
+extern void FASTCALL(flush_workqueue(struct workqueue_struct *wq));
+
+extern int FASTCALL(schedule_work(struct work_struct *work));
+extern int FASTCALL(schedule_delayed_work(struct work_struct *work, unsigned long delay));
+
+extern int schedule_delayed_work_on(int cpu, struct work_struct *work, unsigned long delay);
+extern int schedule_on_each_cpu(void (*func)(void *info), void *info);
+extern void flush_scheduled_work(void);
+extern int current_is_keventd(void);
+extern int keventd_up(void);
+
+extern void init_workqueues(void);
+void cancel_rearming_delayed_work(struct work_struct *work);
+void cancel_rearming_delayed_workqueue(struct workqueue_struct *,
+				       struct work_struct *);
+int execute_in_process_context(void (*fn)(void *), void *,
+			       struct execute_work *);
+
+/*
+ * Kill off a pending schedule_delayed_work().  Note that the work callback
+ * function may still be running on return from cancel_delayed_work().  Run
+ * flush_scheduled_work() to wait on it.
+ */
+static inline int cancel_delayed_work(struct work_struct *work)
+{
+	int ret;
+
+	ret = del_timer_sync(&work->timer);
+	if (ret)
+		clear_bit(0, &work->pending);
+	return ret;
+}
+
+#endif
diff -urN oldtree/init/do_mounts.c newtree/init/do_mounts.c
--- oldtree/init/do_mounts.c	2006-01-03 03:21:10.000000000 +0000
+++ newtree/init/do_mounts.c	2006-03-08 15:22:33.213503000 +0000
@@ -139,11 +139,16 @@
 	char s[32];
 	char *p;
 	dev_t res = 0;
-	int part;
+	int part, mount_result;
 
 #ifdef CONFIG_SYSFS
 	int mkdir_err = sys_mkdir("/sys", 0700);
-	if (sys_mount("sysfs", "/sys", "sysfs", 0, NULL) < 0)
+	/*
+	 * When changing the resume2 parameter for Software Suspend, sysfs
+	 * may already be mounted.
+	 */
+	mount_result = sys_mount("sysfs", "/sys", "sysfs", 0, NULL);
+	if (mount_result < 0 && mount_result != -EBUSY)
 		goto out;
 #endif
 
@@ -195,7 +200,8 @@
 	res = try_name(s, part);
 done:
 #ifdef CONFIG_SYSFS
-	sys_umount("/sys", 0);
+	if (mount_result >= 0)
+		sys_umount("/sys", 0);
 out:
 	if (!mkdir_err)
 		sys_rmdir("/sys");
@@ -412,9 +418,25 @@
 
 	is_floppy = MAJOR(ROOT_DEV) == FLOPPY_MAJOR;
 
+	/* Suspend2:
+	 * By this point, suspend_early_init has been called to initialise our
+	 * proc interface. If modules are built in, they have registered (all
+	 * of the above via late_initcalls).
+	 *
+	 * We have not yet looked to see whether an image exists, however. If
+	 * we have an initrd, the user is expected to have set it up to
+	 * echo > /proc/suspend2/do_resume and thus initiate any resume. If
+	 * they don't do that, we do it immediately after the initrd finishes
+	 * (major issues arise if they mount filesystems read-write from the
+	 * initrd! - they are warned). If there's no usable initrd, we do our
+	 * check next.
+	 */
 	if (initrd_load())
 		goto out;
 
+	if (test_suspend_state(SUSPEND_RESUME_NOT_DONE))
+		suspend2_try_resume();
+	
 	if (is_floppy && rd_doload && rd_load_disk(0))
 		ROOT_DEV = Root_RAM0;
 
diff -urN oldtree/init/do_mounts_initrd.c newtree/init/do_mounts_initrd.c
--- oldtree/init/do_mounts_initrd.c	2006-03-08 18:48:02.928062000 +0000
+++ newtree/init/do_mounts_initrd.c	2006-03-08 15:22:33.217503250 +0000
@@ -7,6 +7,7 @@
 #include <linux/romfs_fs.h>
 #include <linux/initrd.h>
 #include <linux/sched.h>
+#include <linux/suspend.h>
 
 #include "do_mounts.h"
 
@@ -59,10 +60,16 @@
 	current->flags |= PF_NOFREEZE;
 	pid = kernel_thread(do_linuxrc, "/linuxrc", SIGCHLD);
 	if (pid > 0) {
-		while (pid != sys_wait4(-1, NULL, 0, NULL))
+		while (pid != sys_wait4(-1, NULL, 0, NULL)) {
 			yield();
+			try_to_freeze();
+		}
 	}
 
+	if (test_suspend_state(SUSPEND_RESUME_NOT_DONE))
+		printk(KERN_ERR "Suspend2: Initrd lacks echo > /proc/suspend2/do_resume.\n");
+	clear_suspend_state(SUSPEND_BOOT_TIME);
+
 	/* move initrd to rootfs' /old */
 	sys_fchdir(old_fd);
 	sys_mount("/", ".", NULL, MS_MOVE, NULL);
diff -urN oldtree/init/do_mounts_initrd.c.orig newtree/init/do_mounts_initrd.c.orig
--- oldtree/init/do_mounts_initrd.c.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/init/do_mounts_initrd.c.orig	2006-03-08 15:21:19.148874250 +0000
@@ -0,0 +1,125 @@
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/minix_fs.h>
+#include <linux/ext2_fs.h>
+#include <linux/romfs_fs.h>
+#include <linux/initrd.h>
+#include <linux/sched.h>
+
+#include "do_mounts.h"
+
+unsigned long initrd_start, initrd_end;
+int initrd_below_start_ok;
+unsigned int real_root_dev;	/* do_proc_dointvec cannot handle kdev_t */
+static int __initdata old_fd, root_fd;
+static int __initdata mount_initrd = 1;
+
+static int __init no_initrd(char *str)
+{
+	mount_initrd = 0;
+	return 1;
+}
+
+__setup("noinitrd", no_initrd);
+
+static int __init do_linuxrc(void * shell)
+{
+	static char *argv[] = { "linuxrc", NULL, };
+	extern char * envp_init[];
+
+	sys_close(old_fd);sys_close(root_fd);
+	sys_close(0);sys_close(1);sys_close(2);
+	sys_setsid();
+	(void) sys_open("/dev/console",O_RDWR,0);
+	(void) sys_dup(0);
+	(void) sys_dup(0);
+	return execve(shell, argv, envp_init);
+}
+
+static void __init handle_initrd(void)
+{
+	int error;
+	int pid;
+
+	real_root_dev = new_encode_dev(ROOT_DEV);
+	create_dev("/dev/root.old", Root_RAM0, NULL);
+	/* mount initrd on rootfs' /root */
+	mount_block_root("/dev/root.old", root_mountflags & ~MS_RDONLY);
+	sys_mkdir("/old", 0700);
+	root_fd = sys_open("/", 0, 0);
+	old_fd = sys_open("/old", 0, 0);
+	/* move initrd over / and chdir/chroot in initrd root */
+	sys_chdir("/root");
+	sys_mount(".", "/", NULL, MS_MOVE, NULL);
+	sys_chroot(".");
+	mount_devfs_fs ();
+
+	current->flags |= PF_NOFREEZE;
+	pid = kernel_thread(do_linuxrc, "/linuxrc", SIGCHLD);
+	if (pid > 0) {
+		while (pid != sys_wait4(-1, NULL, 0, NULL))
+			yield();
+	}
+
+	/* move initrd to rootfs' /old */
+	sys_fchdir(old_fd);
+	sys_mount("/", ".", NULL, MS_MOVE, NULL);
+	/* switch root and cwd back to / of rootfs */
+	sys_fchdir(root_fd);
+	sys_chroot(".");
+	sys_close(old_fd);
+	sys_close(root_fd);
+	umount_devfs("/old/dev");
+
+	if (new_decode_dev(real_root_dev) == Root_RAM0) {
+		sys_chdir("/old");
+		return;
+	}
+
+	ROOT_DEV = new_decode_dev(real_root_dev);
+	mount_root();
+
+	printk(KERN_NOTICE "Trying to move old root to /initrd ... ");
+	error = sys_mount("/old", "/root/initrd", NULL, MS_MOVE, NULL);
+	if (!error)
+		printk("okay\n");
+	else {
+		int fd = sys_open("/dev/root.old", O_RDWR, 0);
+		if (error == -ENOENT)
+			printk("/initrd does not exist. Ignored.\n");
+		else
+			printk("failed\n");
+		printk(KERN_NOTICE "Unmounting old root\n");
+		sys_umount("/old", MNT_DETACH);
+		printk(KERN_NOTICE "Trying to free ramdisk memory ... ");
+		if (fd < 0) {
+			error = fd;
+		} else {
+			error = sys_ioctl(fd, BLKFLSBUF, 0);
+			sys_close(fd);
+		}
+		printk(!error ? "okay\n" : "failed\n");
+	}
+}
+
+int __init initrd_load(void)
+{
+	if (mount_initrd) {
+		create_dev("/dev/ram", Root_RAM0, NULL);
+		/*
+		 * Load the initrd data into /dev/ram0. Execute it as initrd
+		 * unless /dev/ram0 is supposed to be our actual root device,
+		 * in that case the ram disk is just set up here, and gets
+		 * mounted in the normal path.
+		 */
+		if (rd_load_image("/initrd.image") && ROOT_DEV != Root_RAM0) {
+			sys_unlink("/initrd.image");
+			handle_initrd();
+			return 1;
+		}
+	}
+	sys_unlink("/initrd.image");
+	return 0;
+}
diff -urN oldtree/init/main.c newtree/init/main.c
--- oldtree/init/main.c	2006-03-08 18:48:02.928062000 +0000
+++ newtree/init/main.c	2006-03-08 15:22:33.221503500 +0000
@@ -726,7 +726,9 @@
 
 	/*
 	 * check if there is an early userspace init.  If yes, let it do all
-	 * the work
+	 * the work. For suspend2, we assume that it will do the right thing
+	 * with regard to trying to resume at the right place. When that
+	 * happens, the BOOT_TIME flag will be cleared.
 	 */
 
 	if (!ramdisk_execute_command)
diff -urN oldtree/init/main.c.orig newtree/init/main.c.orig
--- oldtree/init/main.c.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/init/main.c.orig	2006-03-08 15:21:19.148874250 +0000
@@ -0,0 +1,780 @@
+/*
+ *  linux/init/main.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  GK 2/5/95  -  Changed to support mounting root fs via NFS
+ *  Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96
+ *  Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96
+ *  Simplified starting of init:  Michael A. Griffith <grif@acm.org> 
+ */
+
+#define __KERNEL_SYSCALLS__
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/devfs_fs_kernel.h>
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/delay.h>
+#include <linux/utsname.h>
+#include <linux/ioport.h>
+#include <linux/init.h>
+#include <linux/smp_lock.h>
+#include <linux/initrd.h>
+#include <linux/hdreg.h>
+#include <linux/bootmem.h>
+#include <linux/tty.h>
+#include <linux/gfp.h>
+#include <linux/percpu.h>
+#include <linux/kmod.h>
+#include <linux/kernel_stat.h>
+#include <linux/security.h>
+#include <linux/workqueue.h>
+#include <linux/profile.h>
+#include <linux/rcupdate.h>
+#include <linux/moduleparam.h>
+#include <linux/kallsyms.h>
+#include <linux/writeback.h>
+#include <linux/timeofday.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/efi.h>
+#include <linux/unistd.h>
+#include <linux/rmap.h>
+#include <linux/mempolicy.h>
+#include <linux/key.h>
+
+#include <asm/io.h>
+#include <asm/bugs.h>
+#include <asm/setup.h>
+#include <asm/sections.h>
+#include <asm/cacheflush.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/smp.h>
+#endif
+
+/*
+ * This is one of the first .c files built. Error out early if we have compiler
+ * trouble.
+ *
+ * Versions of gcc older than that listed below may actually compile and link
+ * okay, but the end product can have subtle run time bugs.  To avoid associated
+ * bogus bug reports, we flatly refuse to compile with a gcc that is known to be
+ * too old from the very beginning.
+ */
+#if (__GNUC__ < 3) || (__GNUC__ == 3 && __GNUC_MINOR__ < 2)
+#error Sorry, your GCC is too old. It builds incorrect kernels.
+#endif
+
+static int init(void *);
+
+extern void init_IRQ(void);
+extern void fork_init(unsigned long);
+extern void mca_init(void);
+extern void sbus_init(void);
+extern void sysctl_init(void);
+extern void signals_init(void);
+extern void buffer_init(void);
+extern void pidhash_init(void);
+extern void pidmap_init(void);
+extern void prio_tree_init(void);
+extern void radix_tree_init(void);
+extern void free_initmem(void);
+extern void populate_rootfs(void);
+extern void driver_init(void);
+extern void prepare_namespace(void);
+#ifdef	CONFIG_ACPI
+extern void acpi_early_init(void);
+#else
+static inline void acpi_early_init(void) { }
+#endif
+#ifndef CONFIG_DEBUG_RODATA
+static inline void mark_rodata_ro(void) { }
+#endif
+
+#ifdef CONFIG_TC
+extern void tc_init(void);
+#endif
+
+enum system_states system_state;
+EXPORT_SYMBOL(system_state);
+
+/*
+ * Boot command-line arguments
+ */
+#define MAX_INIT_ARGS CONFIG_INIT_ENV_ARG_LIMIT
+#define MAX_INIT_ENVS CONFIG_INIT_ENV_ARG_LIMIT
+
+extern void time_init(void);
+/* Default late time init is NULL. archs can override this later. */
+void (*late_time_init)(void);
+extern void softirq_init(void);
+
+/* Untouched command line (eg. for /proc) saved by arch-specific code. */
+char saved_command_line[COMMAND_LINE_SIZE];
+
+static char *execute_command;
+static char *ramdisk_execute_command;
+
+/* Setup configured maximum number of CPUs to activate */
+static unsigned int max_cpus = NR_CPUS;
+
+/*
+ * Setup routine for controlling SMP activation
+ *
+ * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
+ * activation entirely (the MPS table probe still happens, though).
+ *
+ * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
+ * greater than 0, limits the maximum number of CPUs activated in
+ * SMP mode to <NUM>.
+ */
+static int __init nosmp(char *str)
+{
+	max_cpus = 0;
+	return 1;
+}
+
+__setup("nosmp", nosmp);
+
+static int __init maxcpus(char *str)
+{
+	get_option(&str, &max_cpus);
+	return 1;
+}
+
+__setup("maxcpus=", maxcpus);
+
+#ifdef CONFIG_LIMIT_CPUS
+unsigned int limit_cpus=NR_CPUS;
+
+static int __init
+set_limit_cpus(char *str)
+{
+   int ncpus;
+   get_option (&str, &ncpus);
+   limit_cpus = ncpus;
+   printk (KERN_INFO "Limiting cpus present count to %d\n", ncpus);
+   return 1;
+}
+
+__setup("limit_cpus=", set_limit_cpus);
+#endif
+
+
+static char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, };
+char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, };
+static const char *panic_later, *panic_param;
+
+extern struct obs_kernel_param __setup_start[], __setup_end[];
+
+static int __init obsolete_checksetup(char *line)
+{
+	struct obs_kernel_param *p;
+
+	p = __setup_start;
+	do {
+		int n = strlen(p->str);
+		if (!strncmp(line, p->str, n)) {
+			if (p->early) {
+				/* Already done in parse_early_param?  (Needs
+				 * exact match on param part) */
+				if (line[n] == '\0' || line[n] == '=')
+					return 1;
+			} else if (!p->setup_func) {
+				printk(KERN_WARNING "Parameter %s is obsolete,"
+				       " ignored\n", p->str);
+				return 1;
+			} else if (p->setup_func(line + n))
+				return 1;
+		}
+		p++;
+	} while (p < __setup_end);
+	return 0;
+}
+
+/*
+ * This should be approx 2 Bo*oMips to start (note initial shift), and will
+ * still work even if initially too large, it will just take slightly longer
+ */
+unsigned long loops_per_jiffy = (1<<12);
+
+EXPORT_SYMBOL(loops_per_jiffy);
+
+static int __init debug_kernel(char *str)
+{
+	if (*str)
+		return 0;
+	console_loglevel = 10;
+	return 1;
+}
+
+static int __init quiet_kernel(char *str)
+{
+	if (*str)
+		return 0;
+	console_loglevel = 4;
+	return 1;
+}
+
+__setup("debug", debug_kernel);
+__setup("quiet", quiet_kernel);
+
+static int __init loglevel(char *str)
+{
+	get_option(&str, &console_loglevel);
+	return 1;
+}
+
+__setup("loglevel=", loglevel);
+
+/*
+ * Unknown boot options get handed to init, unless they look like
+ * failed parameters
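+ *
+ * For example (illustrative), an unrecognised "single" ends up in
+ * init's argv[], while an unrecognised "FOO=bar" ends up in init's
+ * environment.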
+ */
+static int __init unknown_bootoption(char *param, char *val)
+{
+	/* Change NUL term back to "=", to make "param" the whole string. */
+	if (val) {
+		/* param=val or param="val"? */
+		if (val == param+strlen(param)+1)
+			val[-1] = '=';
+		else if (val == param+strlen(param)+2) {
+			val[-2] = '=';
+			memmove(val-1, val, strlen(val)+1);
+			val--;
+		} else
+			BUG();
+	}
+
+	/* Handle obsolete-style parameters */
+	if (obsolete_checksetup(param))
+		return 0;
+
+	/*
+	 * Preemptive maintenance for "why didn't my misspelled command
+	 * line work?"
+	 */
+	if (strchr(param, '.') && (!val || strchr(param, '.') < val)) {
+		printk(KERN_ERR "Unknown boot option `%s': ignoring\n", param);
+		return 0;
+	}
+
+	if (panic_later)
+		return 0;
+
+	if (val) {
+		/* Environment option */
+		unsigned int i;
+		for (i = 0; envp_init[i]; i++) {
+			if (i == MAX_INIT_ENVS) {
+				panic_later = "Too many boot env vars at `%s'";
+				panic_param = param;
+			}
+			if (!strncmp(param, envp_init[i], val - param))
+				break;
+		}
+		envp_init[i] = param;
+	} else {
+		/* Command line option */
+		unsigned int i;
+		for (i = 0; argv_init[i]; i++) {
+			if (i == MAX_INIT_ARGS) {
+				panic_later = "Too many boot init vars at `%s'";
+				panic_param = param;
+			}
+		}
+		argv_init[i] = param;
+	}
+	return 0;
+}
+
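+/*
+ * "init=" chooses the first userspace program to run.  For example,
+ * booting with init=/bin/sh drops straight into a shell instead of
+ * trying the defaults listed at the bottom of init() below.
+ */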
+static int __init init_setup(char *str)
+{
+	unsigned int i;
+
+	execute_command = str;
+	/*
+	 * In case LILO boots us with the default command line, it
+	 * prepends "auto" to the whole cmdline, which makes the shell
+	 * think it should execute a script with that name.
+	 * So we ignore all arguments entered _before_ init=... [MJ]
+	 */
+	for (i = 1; i < MAX_INIT_ARGS; i++)
+		argv_init[i] = NULL;
+	return 1;
+}
+__setup("init=", init_setup);
+
+static int __init rdinit_setup(char *str)
+{
+	unsigned int i;
+
+	ramdisk_execute_command = str;
+	/* See "auto" comment in init_setup */
+	for (i = 1; i < MAX_INIT_ARGS; i++)
+		argv_init[i] = NULL;
+	return 1;
+}
+__setup("rdinit=", rdinit_setup);
+
+extern void setup_arch(char **);
+
+#ifndef CONFIG_SMP
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static void __init smp_init(void)
+{
+	APIC_init_uniprocessor();
+}
+#else
+#define smp_init()	do { } while (0)
+#endif
+
+static inline void setup_per_cpu_areas(void) { }
+static inline void smp_prepare_cpus(unsigned int maxcpus) { }
+
+#else
+
+#ifdef __GENERIC_PER_CPU
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+
+EXPORT_SYMBOL(__per_cpu_offset);
+
+static void __init setup_per_cpu_areas(void)
+{
+	unsigned long size, i;
+	char *ptr;
+	unsigned long nr_possible_cpus = num_possible_cpus();
+
+	/* Copy section for each CPU (we discard the original) */
+	size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
+#ifdef CONFIG_MODULES
+	if (size < PERCPU_ENOUGH_ROOM)
+		size = PERCPU_ENOUGH_ROOM;
+#endif
+	ptr = alloc_bootmem(size * nr_possible_cpus);
+
+	for_each_cpu(i) {
+		__per_cpu_offset[i] = ptr - __per_cpu_start;
+		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+		ptr += size;
+	}
+}
+#endif /* !__GENERIC_PER_CPU */
+
+/* Called by boot processor to activate the rest. */
+static void __init smp_init(void)
+{
+	unsigned int i;
+
+#ifdef CONFIG_LIMIT_CPUS
+   extern __init void limit_cpu_present_map(void);
+   limit_cpu_present_map();
+#endif
+
+	/* FIXME: This should be done in userspace --RR */
+	for_each_present_cpu(i) {
+		if (num_online_cpus() >= max_cpus)
+			break;
+		if (!cpu_online(i))
+			cpu_up(i);
+	}
+
+	/* Any cleanup work */
+	printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
+	smp_cpus_done(max_cpus);
+#if 0
+	/* Get other processors into their bootup holding patterns. */
+
+	smp_commence();
+#endif
+}
+
+#endif
+
+/*
+ * We need to finalize in a non-__init function or else race conditions
+ * between the root thread and the init thread may cause start_kernel to
+ * be reaped by free_initmem before the root thread has proceeded to
+ * cpu_idle.
+ *
+ * gcc-3.4 accidentally inlines this function, so use noinline.
+ */
+
+static void noinline rest_init(void)
+	__releases(kernel_lock)
+{
+	kernel_thread(init, NULL, CLONE_FS | CLONE_SIGHAND);
+	numa_default_policy();
+	unlock_kernel();
+
+	/*
+	 * The boot idle thread must execute schedule()
+	 * at least once to get things moving:
+	 */
+	preempt_enable_no_resched();
+	schedule();
+	preempt_disable();
+
+	/* Call into cpu_idle with preempt disabled */
+	cpu_idle();
+} 
+
+/* Check for early params. */
+static int __init do_early_param(char *param, char *val)
+{
+	struct obs_kernel_param *p;
+
+	for (p = __setup_start; p < __setup_end; p++) {
+		if (p->early && strcmp(param, p->str) == 0) {
+			if (p->setup_func(val) != 0)
+				printk(KERN_WARNING
+				       "Malformed early option '%s'\n", param);
+		}
+	}
+	/* We accept everything at this stage. */
+	return 0;
+}
+
+/* Arch code calls this early on, or if not, just before other parsing. */
+void __init parse_early_param(void)
+{
+	static __initdata int done = 0;
+	static __initdata char tmp_cmdline[COMMAND_LINE_SIZE];
+
+	if (done)
+		return;
+
+	/* All fall through to do_early_param. */
+	strlcpy(tmp_cmdline, saved_command_line, COMMAND_LINE_SIZE);
+	parse_args("early options", tmp_cmdline, NULL, 0, do_early_param);
+	done = 1;
+}
+
+/*
+ *	Activate the first processor.
+ */
+
+static void __init boot_cpu_init(void)
+{
+	int cpu = smp_processor_id();
+	/* Mark the boot cpu "present", "online" etc for SMP and UP case */
+	cpu_set(cpu, cpu_online_map);
+	cpu_set(cpu, cpu_present_map);
+	cpu_set(cpu, cpu_possible_map);
+}
+
+asmlinkage void __init start_kernel(void)
+{
+	char * command_line;
+	extern struct kernel_param __start___param[], __stop___param[];
+/*
+ * Interrupts are still disabled. Do necessary setups, then
+ * enable them
+ */
+	lock_kernel();
+	boot_cpu_init();
+	page_address_init();
+	printk(KERN_NOTICE);
+	printk(linux_banner);
+	setup_arch(&command_line);
+	setup_per_cpu_areas();
+	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
+
+	/*
+	 * Set up the scheduler prior starting any interrupts (such as the
+	 * timer interrupt). Full topology setup happens at smp_init()
+	 * time - but meanwhile we still have a functioning scheduler.
+	 */
+	sched_init();
+	/*
+	 * Disable preemption - early bootup scheduling is extremely
+	 * fragile until we cpu_idle() for the first time.
+	 */
+	preempt_disable();
+	build_all_zonelists();
+	page_alloc_init();
+	printk(KERN_NOTICE "Kernel command line: %s\n", saved_command_line);
+	parse_early_param();
+	parse_args("Booting kernel", command_line, __start___param,
+		   __stop___param - __start___param,
+		   &unknown_bootoption);
+	sort_main_extable();
+	trap_init();
+	rcu_init();
+	init_IRQ();
+	pidhash_init();
+	init_timers();
+	hrtimers_init();
+	softirq_init();
+	timeofday_init();
+	time_init();
+
+	/*
+	 * HACK ALERT! This is early. We're enabling the console before
+	 * we've done PCI setups etc, and console_init() must be aware of
+	 * this. But we do want output early, in case something goes wrong.
+	 */
+	console_init();
+	if (panic_later)
+		panic(panic_later, panic_param);
+	profile_init();
+	local_irq_enable();
+#ifdef CONFIG_BLK_DEV_INITRD
+	if (initrd_start && !initrd_below_start_ok &&
+			initrd_start < min_low_pfn << PAGE_SHIFT) {
+		printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - "
+		    "disabling it.\n",initrd_start,min_low_pfn << PAGE_SHIFT);
+		initrd_start = 0;
+	}
+#endif
+	vfs_caches_init_early();
+	cpuset_init_early();
+	mem_init();
+	kmem_cache_init();
+	setup_per_cpu_pageset();
+	numa_policy_init();
+	if (late_time_init)
+		late_time_init();
+	calibrate_delay();
+	pidmap_init();
+	pgtable_cache_init();
+	prio_tree_init();
+	anon_vma_init();
+#ifdef CONFIG_X86
+	if (efi_enabled)
+		efi_enter_virtual_mode();
+#endif
+	fork_init(num_physpages);
+	proc_caches_init();
+	buffer_init();
+	unnamed_dev_init();
+	key_init();
+	security_init();
+	vfs_caches_init(num_physpages);
+	radix_tree_init();
+	signals_init();
+	/* rootfs populating might need page-writeback */
+	page_writeback_init();
+#ifdef CONFIG_PROC_FS
+	proc_root_init();
+#endif
+	cpuset_init();
+
+	check_bugs();
+
+	acpi_early_init(); /* before LAPIC and SMP init */
+
+	/* Do the rest non-__init'ed, we're now alive */
+	rest_init();
+}
+
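+/*
+ * Booting with "initcall_debug" makes do_initcalls() below announce
+ * each initcall before running it, e.g.
+ * "Calling initcall 0xc04f1234: pci_init()" (address and name here
+ * are illustrative).
+ */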
+static int __initdata initcall_debug;
+
+static int __init initcall_debug_setup(char *str)
+{
+	initcall_debug = 1;
+	return 1;
+}
+__setup("initcall_debug", initcall_debug_setup);
+
+struct task_struct *child_reaper = &init_task;
+
+extern initcall_t __initcall_start[], __initcall_end[];
+
+static void __init do_initcalls(void)
+{
+	initcall_t *call;
+	int count = preempt_count();
+
+	for (call = __initcall_start; call < __initcall_end; call++) {
+		char *msg = NULL;
+		char msgbuf[40];
+		int result;
+
+		if (initcall_debug) {
+			printk(KERN_DEBUG "Calling initcall 0x%p", *call);
+			print_fn_descriptor_symbol(": %s()",
+					(unsigned long) *call);
+			printk("\n");
+		}
+
+		result = (*call)();
+
+		if (result && (result != -ENODEV || initcall_debug)) {
+			sprintf(msgbuf, "error code %d", result);
+			msg = msgbuf;
+		}
+		if (preempt_count() != count) {
+			msg = "preemption imbalance";
+			preempt_count() = count;
+		}
+		if (irqs_disabled()) {
+			msg = "disabled interrupts";
+			local_irq_enable();
+		}
+		if (msg) {
+			printk(KERN_WARNING "initcall at 0x%p", *call);
+			print_fn_descriptor_symbol(": %s()",
+					(unsigned long) *call);
+			printk(": returned with %s\n", msg);
+		}
+	}
+
+	/* Make sure there is no pending stuff from the initcall sequence */
+	flush_scheduled_work();
+}
+
+/*
+ * Ok, the machine is now initialized. None of the devices
+ * have been touched yet, but the CPU subsystem is up and
+ * running, and memory and process management works.
+ *
+ * Now we can finally start doing some real work..
+ */
+static void __init do_basic_setup(void)
+{
+	/* drivers will send hotplug events */
+	init_workqueues();
+	usermodehelper_init();
+	driver_init();
+
+#ifdef CONFIG_SYSCTL
+	sysctl_init();
+#endif
+
+	do_initcalls();
+}
+
+static void do_pre_smp_initcalls(void)
+{
+	extern int spawn_ksoftirqd(void);
+#ifdef CONFIG_SMP
+	extern int migration_init(void);
+
+	migration_init();
+#endif
+	spawn_ksoftirqd();
+	spawn_softlockup_task();
+}
+
+static void run_init_process(char *init_filename)
+{
+	argv_init[0] = init_filename;
+	execve(init_filename, argv_init, envp_init);
+}
+
+static inline void fixup_cpu_present_map(void)
+{
+#ifdef CONFIG_SMP
+	int i;
+
+	/*
+	 * If arch is not hotplug ready and did not populate
+	 * cpu_present_map, just make cpu_present_map same as cpu_possible_map
+	 * so that other cpu bringup code, e.g. smp_init(), functions as normal.
+	 */
+	if (cpus_empty(cpu_present_map)) {
+		for_each_cpu(i) {
+			cpu_set(i, cpu_present_map);
+		}
+	}
+#endif
+}
+
+static int init(void * unused)
+{
+	lock_kernel();
+	/*
+	 * init can run on any cpu.
+	 */
+	set_cpus_allowed(current, CPU_MASK_ALL);
+	/*
+	 * Tell the world that we're going to be the grim
+	 * reaper of innocent orphaned children.
+	 *
+	 * We don't want people to have to make incorrect
+	 * assumptions about where in the task array this
+	 * can be found.
+	 */
+	child_reaper = current;
+
+	smp_prepare_cpus(max_cpus);
+
+	do_pre_smp_initcalls();
+
+	fixup_cpu_present_map();
+	smp_init();
+	sched_init_smp();
+
+	cpuset_init_smp();
+
+	/*
+	 * Do this before initcalls, because some drivers want to access
+	 * firmware files.
+	 */
+	populate_rootfs();
+
+	do_basic_setup();
+
+	/*
+	 * check if there is an early userspace init.  If yes, let it do all
+	 * the work
+	 */
+
+	if (!ramdisk_execute_command)
+		ramdisk_execute_command = "/init";
+
+	if (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) {
+		ramdisk_execute_command = NULL;
+		prepare_namespace();
+	}
+
+	/*
+	 * Ok, we have completed the initial bootup, and
+	 * we're essentially up and running. Get rid of the
+	 * initmem segments and start the user-mode stuff..
+	 */
+	free_initmem();
+	unlock_kernel();
+	mark_rodata_ro();
+	system_state = SYSTEM_RUNNING;
+	numa_default_policy();
+
+	if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
+		printk(KERN_WARNING "Warning: unable to open an initial console.\n");
+
+	(void) sys_dup(0);
+	(void) sys_dup(0);
+
+	if (ramdisk_execute_command) {
+		run_init_process(ramdisk_execute_command);
+		printk(KERN_WARNING "Failed to execute %s\n",
+				ramdisk_execute_command);
+	}
+
+	/*
+	 * We try each of these until one succeeds.
+	 *
+	 * The Bourne shell can be used instead of init if we are 
+	 * trying to recover a really broken machine.
+	 */
+	if (execute_command) {
+		run_init_process(execute_command);
+		printk(KERN_WARNING "Failed to execute %s.  Attempting "
+					"defaults...\n", execute_command);
+	}
+	run_init_process("/sbin/init");
+	run_init_process("/etc/init");
+	run_init_process("/bin/init");
+	run_init_process("/bin/sh");
+
+	panic("No init found.  Try passing init= option to kernel.");
+}
diff -urN oldtree/kernel/audit.c newtree/kernel/audit.c
--- oldtree/kernel/audit.c	2006-03-08 18:48:02.936062500 +0000
+++ newtree/kernel/audit.c	2006-03-08 15:22:33.225503750 +0000
@@ -294,6 +294,9 @@
 			}
 		} else {
 			DECLARE_WAITQUEUE(wait, current);
+
+			try_to_freeze();
+
 			set_current_state(TASK_INTERRUPTIBLE);
 			add_wait_queue(&kauditd_wait, &wait);
 
diff -urN oldtree/kernel/audit.c.orig newtree/kernel/audit.c.orig
--- oldtree/kernel/audit.c.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/audit.c.orig	2006-03-08 15:21:19.156874750 +0000
@@ -0,0 +1,996 @@
+/* audit.c -- Auditing support
+ * Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
+ * System-call specific features have moved to auditsc.c
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Written by Rickard E. (Rik) Faith <faith@redhat.com>
+ *
+ * Goals: 1) Integrate fully with SELinux.
+ *	  2) Minimal run-time overhead:
+ *	     a) Minimal when syscall auditing is disabled (audit_enable=0).
+ *	     b) Small when syscall auditing is enabled and no audit record
+ *		is generated (defer as much work as possible to record
+ *		generation time):
+ *		i) context is allocated,
+ *		ii) names from getname are stored without a copy, and
+ *		iii) inode information stored from path_lookup.
+ *	  3) Ability to disable syscall auditing at boot time (audit=0).
+ *	  4) Usable by other parts of the kernel (if audit_log* is called,
+ *	     then a syscall record will be generated automatically for the
+ *	     current syscall).
+ *	  5) Netlink interface to user-space.
+ *	  6) Support low-overhead kernel-based filtering to minimize the
+ *	     information that must be passed to user-space.
+ *
+ * Example user-space utilities: http://people.redhat.com/sgrubb/audit/
+ */
+
+#include <linux/init.h>
+#include <asm/types.h>
+#include <asm/atomic.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/kthread.h>
+
+#include <linux/audit.h>
+
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+
+/* No auditing will take place until audit_initialized != 0.
+ * (Initialization happens after skb_init is called.) */
+static int	audit_initialized;
+
+/* No syscall auditing will take place unless audit_enabled != 0. */
+int		audit_enabled;
+
+/* Default state when kernel boots without any parameters. */
+static int	audit_default;
+
+/* If auditing cannot proceed, audit_failure selects what happens. */
+static int	audit_failure = AUDIT_FAIL_PRINTK;
+
+/* If audit records are to be written to the netlink socket, audit_pid
+ * contains the (non-zero) pid. */
+int		audit_pid;
+
+/* If audit_rate_limit is non-zero, limit the rate of sending audit records
+ * to that number per second.  This prevents DoS attacks, but results in
+ * audit records being dropped. */
+static int	audit_rate_limit;
+
+/* Number of outstanding audit_buffers allowed. */
+static int	audit_backlog_limit = 64;
+static int	audit_backlog_wait_time = 60 * HZ;
+static int	audit_backlog_wait_overflow = 0;
+
+/* The identity of the user shutting down the audit system. */
+uid_t		audit_sig_uid = -1;
+pid_t		audit_sig_pid = -1;
+
+/* Records can be lost in several ways:
+   0) [suppressed in audit_alloc]
+   1) out of memory in audit_log_start [kmalloc of struct audit_buffer]
+   2) out of memory in audit_log_move [alloc_skb]
+   3) suppressed due to audit_rate_limit
+   4) suppressed due to audit_backlog_limit
+*/
+static atomic_t    audit_lost = ATOMIC_INIT(0);
+
+/* The netlink socket. */
+static struct sock *audit_sock;
+
+/* The audit_freelist is a list of pre-allocated audit buffers (if more
+ * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
+ * being placed on the freelist). */
+static DEFINE_SPINLOCK(audit_freelist_lock);
+static int	   audit_freelist_count;
+static LIST_HEAD(audit_freelist);
+
+static struct sk_buff_head audit_skb_queue;
+static struct task_struct *kauditd_task;
+static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
+static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
+
+/* The netlink socket is only to be read by 1 CPU, which lets us assume
+ * that list additions and deletions never happen simultaneously in
+ * auditsc.c */
+DEFINE_MUTEX(audit_netlink_mutex);
+
+/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
+ * audit records.  Since printk uses a 1024 byte buffer, this buffer
+ * should be at least that large. */
+#define AUDIT_BUFSIZ 1024
+
+/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the
+ * audit_freelist.  Doing so eliminates many kmalloc/kfree calls. */
+#define AUDIT_MAXFREE  (2*NR_CPUS)
+
+/* The audit_buffer is used when formatting an audit record.  The caller
+ * locks briefly to get the record off the freelist or to allocate the
+ * buffer, and locks briefly to send the buffer to the netlink layer or
+ * to place it on a transmit queue.  Multiple audit_buffers can be in
+ * use simultaneously. */
+struct audit_buffer {
+	struct list_head     list;
+	struct sk_buff       *skb;	/* formatted skb ready to send */
+	struct audit_context *ctx;	/* NULL or associated context */
+	gfp_t		     gfp_mask;
+};
+
+static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
+{
+	struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data;
+	nlh->nlmsg_pid = pid;
+}
+
+void audit_panic(const char *message)
+{
+	switch (audit_failure)
+	{
+	case AUDIT_FAIL_SILENT:
+		break;
+	case AUDIT_FAIL_PRINTK:
+		printk(KERN_ERR "audit: %s\n", message);
+		break;
+	case AUDIT_FAIL_PANIC:
+		panic("audit: %s\n", message);
+		break;
+	}
+}
+
+static inline int audit_rate_check(void)
+{
+	static unsigned long	last_check = 0;
+	static int		messages   = 0;
+	static DEFINE_SPINLOCK(lock);
+	unsigned long		flags;
+	unsigned long		now;
+	unsigned long		elapsed;
+	int			retval	   = 0;
+
+	if (!audit_rate_limit) return 1;
+
+	spin_lock_irqsave(&lock, flags);
+	if (++messages < audit_rate_limit) {
+		retval = 1;
+	} else {
+		now     = jiffies;
+		elapsed = now - last_check;
+		if (elapsed > HZ) {
+			last_check = now;
+			messages   = 0;
+			retval     = 1;
+		}
+	}
+	spin_unlock_irqrestore(&lock, flags);
+
+	return retval;
+}
+
+/**
+ * audit_log_lost - conditionally log lost audit message event
+ * @message: the message stating reason for lost audit message
+ *
+ * Emit a message even when audit_rate_check is throttling (though no
+ * more than one per second in that case).
+ * Always increment the lost messages counter.
+*/
+void audit_log_lost(const char *message)
+{
+	static unsigned long	last_msg = 0;
+	static DEFINE_SPINLOCK(lock);
+	unsigned long		flags;
+	unsigned long		now;
+	int			print;
+
+	atomic_inc(&audit_lost);
+
+	print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit);
+
+	if (!print) {
+		spin_lock_irqsave(&lock, flags);
+		now = jiffies;
+		if (now - last_msg > HZ) {
+			print = 1;
+			last_msg = now;
+		}
+		spin_unlock_irqrestore(&lock, flags);
+	}
+
+	if (print) {
+		printk(KERN_WARNING
+		       "audit: audit_lost=%d audit_rate_limit=%d audit_backlog_limit=%d\n",
+		       atomic_read(&audit_lost),
+		       audit_rate_limit,
+		       audit_backlog_limit);
+		audit_panic(message);
+	}
+}
+
+static int audit_set_rate_limit(int limit, uid_t loginuid)
+{
+	int old		 = audit_rate_limit;
+	audit_rate_limit = limit;
+	audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 
+			"audit_rate_limit=%d old=%d by auid=%u",
+			audit_rate_limit, old, loginuid);
+	return old;
+}
+
+static int audit_set_backlog_limit(int limit, uid_t loginuid)
+{
+	int old		 = audit_backlog_limit;
+	audit_backlog_limit = limit;
+	audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+			"audit_backlog_limit=%d old=%d by auid=%u",
+			audit_backlog_limit, old, loginuid);
+	return old;
+}
+
+static int audit_set_enabled(int state, uid_t loginuid)
+{
+	int old		 = audit_enabled;
+	if (state != 0 && state != 1)
+		return -EINVAL;
+	audit_enabled = state;
+	audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+			"audit_enabled=%d old=%d by auid=%u",
+			audit_enabled, old, loginuid);
+	return old;
+}
+
+static int audit_set_failure(int state, uid_t loginuid)
+{
+	int old		 = audit_failure;
+	if (state != AUDIT_FAIL_SILENT
+	    && state != AUDIT_FAIL_PRINTK
+	    && state != AUDIT_FAIL_PANIC)
+		return -EINVAL;
+	audit_failure = state;
+	audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+			"audit_failure=%d old=%d by auid=%u",
+			audit_failure, old, loginuid);
+	return old;
+}
+
+static int kauditd_thread(void *dummy)
+{
+	struct sk_buff *skb;
+
+	while (1) {
+		skb = skb_dequeue(&audit_skb_queue);
+		wake_up(&audit_backlog_wait);
+		if (skb) {
+			if (audit_pid) {
+				int err = netlink_unicast(audit_sock, skb, audit_pid, 0);
+				if (err < 0) {
+					BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
+					printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
+					audit_pid = 0;
+				}
+			} else {
+				printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
+				kfree_skb(skb);
+			}
+		} else {
+			DECLARE_WAITQUEUE(wait, current);
+			set_current_state(TASK_INTERRUPTIBLE);
+			add_wait_queue(&kauditd_wait, &wait);
+
+			if (!skb_queue_len(&audit_skb_queue)) {
+				try_to_freeze();
+				schedule();
+			}
+
+			__set_current_state(TASK_RUNNING);
+			remove_wait_queue(&kauditd_wait, &wait);
+		}
+	}
+	return 0;
+}
+
+/**
+ * audit_send_reply - send an audit reply message via netlink
+ * @pid: process id to send reply to
+ * @seq: sequence number
+ * @type: audit message type
+ * @done: done (last) flag
+ * @multi: multi-part message flag
+ * @payload: payload data
+ * @size: payload size
+ *
+ * Allocates an skb, builds the netlink message, and sends it to the pid.
+ * No failure notifications.
+ */
+void audit_send_reply(int pid, int seq, int type, int done, int multi,
+		      void *payload, int size)
+{
+	struct sk_buff	*skb;
+	struct nlmsghdr	*nlh;
+	int		len = NLMSG_SPACE(size);
+	void		*data;
+	int		flags = multi ? NLM_F_MULTI : 0;
+	int		t     = done  ? NLMSG_DONE  : type;
+
+	skb = alloc_skb(len, GFP_KERNEL);
+	if (!skb)
+		return;
+
+	nlh		 = NLMSG_PUT(skb, pid, seq, t, size);
+	nlh->nlmsg_flags = flags;
+	data		 = NLMSG_DATA(nlh);
+	memcpy(data, payload, size);
+
+	/* Ignore failure. It'll only happen if the sender goes away,
+	   because our timeout is set to infinite. */
+	netlink_unicast(audit_sock, skb, pid, 0);
+	return;
+
+nlmsg_failure:			/* Used by NLMSG_PUT */
+	if (skb)
+		kfree_skb(skb);
+}
+
+/*
+ * Check for appropriate CAP_AUDIT_ capabilities on incoming audit
+ * control messages.
+ */
+static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
+{
+	int err = 0;
+
+	switch (msg_type) {
+	case AUDIT_GET:
+	case AUDIT_LIST:
+	case AUDIT_LIST_RULES:
+	case AUDIT_SET:
+	case AUDIT_ADD:
+	case AUDIT_ADD_RULE:
+	case AUDIT_DEL:
+	case AUDIT_DEL_RULE:
+	case AUDIT_SIGNAL_INFO:
+		if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL))
+			err = -EPERM;
+		break;
+	case AUDIT_USER:
+	case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
+	case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
+		if (!cap_raised(eff_cap, CAP_AUDIT_WRITE))
+			err = -EPERM;
+		break;
+	default:  /* bad msg */
+		err = -EINVAL;
+	}
+
+	return err;
+}
+
+static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	u32			uid, pid, seq;
+	void			*data;
+	struct audit_status	*status_get, status_set;
+	int			err;
+	struct audit_buffer	*ab;
+	u16			msg_type = nlh->nlmsg_type;
+	uid_t			loginuid; /* loginuid of sender */
+	struct audit_sig_info   sig_data;
+
+	err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type);
+	if (err)
+		return err;
+
+	/* As soon as there's any sign of userspace auditd,
+	 * start kauditd to talk to it */
+	if (!kauditd_task)
+		kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
+	if (IS_ERR(kauditd_task)) {
+		err = PTR_ERR(kauditd_task);
+		kauditd_task = NULL;
+		return err;
+	}
+
+	pid  = NETLINK_CREDS(skb)->pid;
+	uid  = NETLINK_CREDS(skb)->uid;
+	loginuid = NETLINK_CB(skb).loginuid;
+	seq  = nlh->nlmsg_seq;
+	data = NLMSG_DATA(nlh);
+
+	switch (msg_type) {
+	case AUDIT_GET:
+		status_set.enabled	 = audit_enabled;
+		status_set.failure	 = audit_failure;
+		status_set.pid		 = audit_pid;
+		status_set.rate_limit	 = audit_rate_limit;
+		status_set.backlog_limit = audit_backlog_limit;
+		status_set.lost		 = atomic_read(&audit_lost);
+		status_set.backlog	 = skb_queue_len(&audit_skb_queue);
+		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0,
+				 &status_set, sizeof(status_set));
+		break;
+	case AUDIT_SET:
+		if (nlh->nlmsg_len < sizeof(struct audit_status))
+			return -EINVAL;
+		status_get   = (struct audit_status *)data;
+		if (status_get->mask & AUDIT_STATUS_ENABLED) {
+			err = audit_set_enabled(status_get->enabled, loginuid);
+			if (err < 0) return err;
+		}
+		if (status_get->mask & AUDIT_STATUS_FAILURE) {
+			err = audit_set_failure(status_get->failure, loginuid);
+			if (err < 0) return err;
+		}
+		if (status_get->mask & AUDIT_STATUS_PID) {
+			int old   = audit_pid;
+			audit_pid = status_get->pid;
+			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+				"audit_pid=%d old=%d by auid=%u",
+				  audit_pid, old, loginuid);
+		}
+		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
+			audit_set_rate_limit(status_get->rate_limit, loginuid);
+		if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
+			audit_set_backlog_limit(status_get->backlog_limit,
+							loginuid);
+		break;
+	case AUDIT_USER:
+	case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
+	case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
+		if (!audit_enabled && msg_type != AUDIT_USER_AVC)
+			return 0;
+
+		err = audit_filter_user(&NETLINK_CB(skb), msg_type);
+		if (err == 1) {
+			err = 0;
+			ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
+			if (ab) {
+				audit_log_format(ab,
+						 "user pid=%d uid=%u auid=%u msg='%.1024s'",
+						 pid, uid, loginuid, (char *)data);
+				audit_set_pid(ab, pid);
+				audit_log_end(ab);
+			}
+		}
+		break;
+	case AUDIT_ADD:
+	case AUDIT_DEL:
+		if (nlmsg_len(nlh) < sizeof(struct audit_rule))
+			return -EINVAL;
+		/* fallthrough */
+	case AUDIT_LIST:
+		err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
+					   uid, seq, data, nlmsg_len(nlh),
+					   loginuid);
+		break;
+	case AUDIT_ADD_RULE:
+	case AUDIT_DEL_RULE:
+		if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
+			return -EINVAL;
+		/* fallthrough */
+	case AUDIT_LIST_RULES:
+		err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
+					   uid, seq, data, nlmsg_len(nlh),
+					   loginuid);
+		break;
+	case AUDIT_SIGNAL_INFO:
+		sig_data.uid = audit_sig_uid;
+		sig_data.pid = audit_sig_pid;
+		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 
+				0, 0, &sig_data, sizeof(sig_data));
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err < 0 ? err : 0;
+}
+
+/*
+ * Get message from skb (based on rtnetlink_rcv_skb).  Each message is
+ * processed by audit_receive_msg.  Malformed skbs with wrong length are
+ * discarded silently.
+ */
+static void audit_receive_skb(struct sk_buff *skb)
+{
+	int		err;
+	struct nlmsghdr	*nlh;
+	u32		rlen;
+
+	while (skb->len >= NLMSG_SPACE(0)) {
+		nlh = (struct nlmsghdr *)skb->data;
+		if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+			return;
+		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (rlen > skb->len)
+			rlen = skb->len;
+		if ((err = audit_receive_msg(skb, nlh))) {
+			netlink_ack(skb, nlh, err);
+		} else if (nlh->nlmsg_flags & NLM_F_ACK)
+			netlink_ack(skb, nlh, 0);
+		skb_pull(skb, rlen);
+	}
+}
+
+/* Receive messages from netlink socket. */
+static void audit_receive(struct sock *sk, int length)
+{
+	struct sk_buff  *skb;
+	unsigned int qlen;
+
+	mutex_lock(&audit_netlink_mutex);
+
+	for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
+		skb = skb_dequeue(&sk->sk_receive_queue);
+		audit_receive_skb(skb);
+		kfree_skb(skb);
+	}
+	mutex_unlock(&audit_netlink_mutex);
+}
+
+
+/* Initialize audit support at boot time. */
+static int __init audit_init(void)
+{
+	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
+	       audit_default ? "enabled" : "disabled");
+	audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive,
+					   THIS_MODULE);
+	if (!audit_sock)
+		audit_panic("cannot initialize netlink socket");
+
+	audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+	skb_queue_head_init(&audit_skb_queue);
+	audit_initialized = 1;
+	audit_enabled = audit_default;
+	audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
+	return 0;
+}
+__initcall(audit_init);
+
+/* Process kernel command-line parameter at boot time.  audit=0 or audit=1. */
+static int __init audit_enable(char *str)
+{
+	audit_default = !!simple_strtol(str, NULL, 0);
+	printk(KERN_INFO "audit: %s%s\n",
+	       audit_default ? "enabled" : "disabled",
+	       audit_initialized ? "" : " (after initialization)");
+	if (audit_initialized)
+		audit_enabled = audit_default;
+	return 0;
+}
+
+__setup("audit=", audit_enable);
+
+static void audit_buffer_free(struct audit_buffer *ab)
+{
+	unsigned long flags;
+
+	if (!ab)
+		return;
+
+	if (ab->skb)
+		kfree_skb(ab->skb);
+
+	spin_lock_irqsave(&audit_freelist_lock, flags);
+	if (++audit_freelist_count > AUDIT_MAXFREE)
+		kfree(ab);
+	else
+		list_add(&ab->list, &audit_freelist);
+	spin_unlock_irqrestore(&audit_freelist_lock, flags);
+}
+
+static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
+						gfp_t gfp_mask, int type)
+{
+	unsigned long flags;
+	struct audit_buffer *ab = NULL;
+	struct nlmsghdr *nlh;
+
+	spin_lock_irqsave(&audit_freelist_lock, flags);
+	if (!list_empty(&audit_freelist)) {
+		ab = list_entry(audit_freelist.next,
+				struct audit_buffer, list);
+		list_del(&ab->list);
+		--audit_freelist_count;
+	}
+	spin_unlock_irqrestore(&audit_freelist_lock, flags);
+
+	if (!ab) {
+		ab = kmalloc(sizeof(*ab), gfp_mask);
+		if (!ab)
+			goto err;
+	}
+
+	ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask);
+	if (!ab->skb)
+		goto err;
+
+	ab->ctx = ctx;
+	ab->gfp_mask = gfp_mask;
+	nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0));
+	nlh->nlmsg_type = type;
+	nlh->nlmsg_flags = 0;
+	nlh->nlmsg_pid = 0;
+	nlh->nlmsg_seq = 0;
+	return ab;
+err:
+	audit_buffer_free(ab);
+	return NULL;
+}
+
+/**
+ * audit_serial - compute a serial number for the audit record
+ *
+ * Compute a serial number for the audit record.  Audit records are
+ * written to user-space as soon as they are generated, so a complete
+ * audit record may be written in several pieces.  The timestamp of the
+ * record and this serial number are used by the user-space tools to
+ * determine which pieces belong to the same audit record.  The
+ * (timestamp,serial) tuple is unique for each syscall and is live from
+ * syscall entry to syscall exit.
+ *
+ * NOTE: Another possibility is to store the formatted records off the
+ * audit context (for those records that have a context), and emit them
+ * all at syscall exit.  However, this could delay the reporting of
+ * significant errors until syscall exit (or never, if the system
+ * halts).
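+ *
+ * As an illustration, the (timestamp,serial) pair appears in the record
+ * prefix that audit_log_start() emits, e.g. "audit(1143146400.123:456): "
+ * (the values shown here are made up).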
+ */
+unsigned int audit_serial(void)
+{
+	static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED;
+	static unsigned int serial = 0;
+
+	unsigned long flags;
+	unsigned int ret;
+
+	spin_lock_irqsave(&serial_lock, flags);
+	do {
+		ret = ++serial;
+	} while (unlikely(!ret));
+	spin_unlock_irqrestore(&serial_lock, flags);
+
+	return ret;
+}
+
+static inline void audit_get_stamp(struct audit_context *ctx, 
+				   struct timespec *t, unsigned int *serial)
+{
+	if (ctx)
+		auditsc_get_stamp(ctx, t, serial);
+	else {
+		*t = CURRENT_TIME;
+		*serial = audit_serial();
+	}
+}
+
+/**
+ * audit_log_start - obtain an audit buffer
+ * @ctx: audit_context (may be NULL)
+ * @gfp_mask: type of allocation
+ * @type: audit message type
+ *
+ * Returns audit_buffer pointer on success or NULL on error.
+ *
+ * Obtain an audit buffer.  This routine does locking to obtain the
+ * audit buffer, but then no locking is required for calls to
+ * audit_log_*format.  If the task (ctx) is a task that is currently in a
+ * syscall, then the syscall is marked as auditable and an audit record
+ * will be written at syscall exit.  If there is no associated task, then
+ * task context (ctx) should be NULL.
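+ *
+ * A typical call sequence looks like this (illustrative):
+ *
+ *	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_USER);
+ *	if (ab) {
+ *		audit_log_format(ab, "pid=%d", pid);
+ *		audit_log_end(ab);
+ *	}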
+ */
+struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
+				     int type)
+{
+	struct audit_buffer	*ab	= NULL;
+	struct timespec		t;
+	unsigned int		serial;
+	int reserve;
+	unsigned long timeout_start = jiffies;
+
+	if (!audit_initialized)
+		return NULL;
+
+	if (unlikely(audit_filter_type(type)))
+		return NULL;
+
+	if (gfp_mask & __GFP_WAIT)
+		reserve = 0;
+	else
+		reserve = 5; /* Allow atomic callers to go up to five 
+				entries over the normal backlog limit */
+
+	while (audit_backlog_limit
+	       && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
+		if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time
+		    && time_before(jiffies, timeout_start + audit_backlog_wait_time)) {
+
+			/* Wait for auditd to drain the queue a little */
+			DECLARE_WAITQUEUE(wait, current);
+			set_current_state(TASK_INTERRUPTIBLE);
+			add_wait_queue(&audit_backlog_wait, &wait);
+
+			if (audit_backlog_limit &&
+			    skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
+				schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies);
+
+			__set_current_state(TASK_RUNNING);
+			remove_wait_queue(&audit_backlog_wait, &wait);
+			continue;
+		}
+		if (audit_rate_check())
+			printk(KERN_WARNING
+			       "audit: audit_backlog=%d > "
+			       "audit_backlog_limit=%d\n",
+			       skb_queue_len(&audit_skb_queue),
+			       audit_backlog_limit);
+		audit_log_lost("backlog limit exceeded");
+		audit_backlog_wait_time = audit_backlog_wait_overflow;
+		wake_up(&audit_backlog_wait);
+		return NULL;
+	}
+
+	ab = audit_buffer_alloc(ctx, gfp_mask, type);
+	if (!ab) {
+		audit_log_lost("out of memory in audit_log_start");
+		return NULL;
+	}
+
+	audit_get_stamp(ab->ctx, &t, &serial);
+
+	audit_log_format(ab, "audit(%lu.%03lu:%u): ",
+			 t.tv_sec, t.tv_nsec/1000000, serial);
+	return ab;
+}
+
+/**
+ * audit_expand - expand skb in the audit buffer
+ * @ab: audit_buffer
+ * @extra: space to add at tail of the skb
+ *
+ * Returns 0 (no space) on failed expansion, or available space if
+ * successful.
+ */
+static inline int audit_expand(struct audit_buffer *ab, int extra)
+{
+	struct sk_buff *skb = ab->skb;
+	int ret = pskb_expand_head(skb, skb_headroom(skb), extra,
+				   ab->gfp_mask);
+	if (ret < 0) {
+		audit_log_lost("out of memory in audit_expand");
+		return 0;
+	}
+	return skb_tailroom(skb);
+}
+
+/*
+ * Format an audit message into the audit buffer.  If there isn't enough
+ * room in the audit buffer, more room will be allocated and vsnprintf
+ * will be called a second time.  Currently, we assume that a printk
+ * can't format a message larger than 1024 bytes, so we don't either.
+ */
+static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
+			      va_list args)
+{
+	int len, avail;
+	struct sk_buff *skb;
+	va_list args2;
+
+	if (!ab)
+		return;
+
+	BUG_ON(!ab->skb);
+	skb = ab->skb;
+	avail = skb_tailroom(skb);
+	if (avail == 0) {
+		avail = audit_expand(ab, AUDIT_BUFSIZ);
+		if (!avail)
+			goto out;
+	}
+	va_copy(args2, args);
+	len = vsnprintf(skb->tail, avail, fmt, args);
+	if (len >= avail) {
+		/* The printk buffer is 1024 bytes long, so if we get
+		 * here and AUDIT_BUFSIZ is at least 1024, then we can
+		 * log everything that printk could have logged. */
+		avail = audit_expand(ab,
+			max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
+		if (!avail)
+			goto out;
+		len = vsnprintf(skb->tail, avail, fmt, args2);
+	}
+	if (len > 0)
+		skb_put(skb, len);
+out:
+	return;
+}
+
+/**
+ * audit_log_format - format a message into the audit buffer.
+ * @ab: audit_buffer
+ * @fmt: format string
+ * @...: optional parameters matching @fmt string
+ *
+ * All the work is done in audit_log_vformat.
+ */
+void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
+{
+	va_list args;
+
+	if (!ab)
+		return;
+	va_start(args, fmt);
+	audit_log_vformat(ab, fmt, args);
+	va_end(args);
+}
+
+/**
+ * audit_log_hex - convert a buffer to hex and append it to the audit skb
+ * @ab: the audit_buffer
+ * @buf: buffer to convert to hex
+ * @len: length of @buf to be converted
+ *
+ * No return value; failure to expand is silently ignored.
+ *
+ * This function will take the passed buf and convert it into a string of
+ * ascii hex digits. The new string is placed onto the skb.
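+ *
+ * For example, the three bytes { 0xde, 0xad, 0xbe } are appended as the
+ * six characters "DEADBE".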
+ */
+void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
+		size_t len)
+{
+	int i, avail, new_len;
+	unsigned char *ptr;
+	struct sk_buff *skb;
+	static const unsigned char *hex = "0123456789ABCDEF";
+
+	BUG_ON(!ab->skb);
+	skb = ab->skb;
+	avail = skb_tailroom(skb);
+	new_len = len<<1;
+	if (new_len >= avail) {
+		/* Round the buffer request up to the next multiple of AUDIT_BUFSIZ */
+		new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1);
+		avail = audit_expand(ab, new_len);
+		if (!avail)
+			return;
+	}
+
+	ptr = skb->tail;
+	for (i=0; i<len; i++) {
+		*ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */
+		*ptr++ = hex[buf[i] & 0x0F];	  /* Lower nibble */
+	}
+	*ptr = 0;
+	skb_put(skb, len << 1); /* new string is twice the old string */
+}
+
+/**
+ * audit_log_untrustedstring - log a string that may contain random characters
+ * @ab: audit_buffer
+ * @string: string to be logged
+ *
+ * This code will escape a string that is passed to it if the string
+ * contains a control character, unprintable character, double quote mark,
+ * or a space. Unescaped strings will start and end with a double quote mark.
+ * Strings that are escaped are printed in hex (2 digits per char).
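+ *
+ * For example, "hello" is logged as "hello" (in double quotes), while
+ * "hello world" contains a space and is therefore logged as the hex
+ * string 68656C6C6F20776F726C64.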
+ */
+void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
+{
+	const unsigned char *p = string;
+
+	while (*p) {
+		if (*p == '"' || *p < 0x21 || *p > 0x7f) {
+			audit_log_hex(ab, string, strlen(string));
+			return;
+		}
+		p++;
+	}
+	audit_log_format(ab, "\"%s\"", string);
+}
+
+/* This is a helper-function to print the escaped d_path */
+void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
+		      struct dentry *dentry, struct vfsmount *vfsmnt)
+{
+	char *p, *path;
+
+	if (prefix)
+		audit_log_format(ab, " %s", prefix);
+
+	/* We will allow 11 bytes for ' (deleted)' to be appended */
+	path = kmalloc(PATH_MAX+11, ab->gfp_mask);
+	if (!path) {
+		audit_log_format(ab, "<no memory>");
+		return;
+	}
+	p = d_path(dentry, vfsmnt, path, PATH_MAX+11);
+	if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
+		/* FIXME: can we save some information here? */
+		audit_log_format(ab, "<too long>");
+	} else 
+		audit_log_untrustedstring(ab, p);
+	kfree(path);
+}
+
+/**
+ * audit_log_end - end one audit record
+ * @ab: the audit_buffer
+ *
+ * The netlink_* functions cannot be called inside an irq context, so
+ * the audit buffer is placed on a queue and a tasklet is scheduled to
+ * remove them from the queue outside the irq context.  May be called in
+ * any context.
+ */
+void audit_log_end(struct audit_buffer *ab)
+{
+	if (!ab)
+		return;
+	if (!audit_rate_check()) {
+		audit_log_lost("rate limit exceeded");
+	} else {
+		if (audit_pid) {
+			struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data;
+			nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
+			skb_queue_tail(&audit_skb_queue, ab->skb);
+			ab->skb = NULL;
+			wake_up_interruptible(&kauditd_wait);
+		} else {
+			printk(KERN_NOTICE "%s\n", ab->skb->data + NLMSG_SPACE(0));
+		}
+	}
+	audit_buffer_free(ab);
+}
+
+/**
+ * audit_log - Log an audit record
+ * @ctx: audit context
+ * @gfp_mask: type of allocation
+ * @type: audit message type
+ * @fmt: format string to use
+ * @...: variable parameters matching the format string
+ *
+ * This is a convenience function that calls audit_log_start,
+ * audit_log_vformat, and audit_log_end.  It may be called
+ * in any context.
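+ *
+ * Example (illustrative):
+ *
+ *	audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");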
+ */
+void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, 
+	       const char *fmt, ...)
+{
+	struct audit_buffer *ab;
+	va_list args;
+
+	ab = audit_log_start(ctx, gfp_mask, type);
+	if (ab) {
+		va_start(args, fmt);
+		audit_log_vformat(ab, fmt, args);
+		va_end(args);
+		audit_log_end(ab);
+	}
+}
diff -urN oldtree/kernel/fork.c newtree/kernel/fork.c
--- oldtree/kernel/fork.c	2006-03-08 18:48:02.944063000 +0000
+++ newtree/kernel/fork.c	2006-03-08 17:20:46.200787000 +0000
@@ -35,6 +35,7 @@
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
+#include <linux/suspend.h>
 #include <linux/futex.h>
 #include <linux/rcupdate.h>
 #include <linux/ptrace.h>
@@ -167,7 +168,8 @@
 	if (!tsk)
 		return NULL;
 
-	ti = alloc_thread_info(tsk);
+	ti = alloc_thread_info(tsk);
+
 	if (!ti) {
 		free_task_struct(tsk);
 		return NULL;
diff -urN oldtree/kernel/fork.c.orig newtree/kernel/fork.c.orig
--- oldtree/kernel/fork.c.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/fork.c.orig	2006-03-08 15:21:19.212878250 +0000
@@ -0,0 +1,1652 @@
+/*
+ *  linux/kernel/fork.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+/*
+ *  'fork.c' contains the help-routines for the 'fork' system call
+ * (see also entry.S and others).
+ * Fork is rather simple, once you get the hang of it, but the memory
+ * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
+ */
+
+#include <linux/config.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/unistd.h>
+#include <linux/smp_lock.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/completion.h>
+#include <linux/namespace.h>
+#include <linux/personality.h>
+#include <linux/mempolicy.h>
+#include <linux/sem.h>
+#include <linux/file.h>
+#include <linux/key.h>
+#include <linux/binfmts.h>
+#include <linux/mman.h>
+#include <linux/fs.h>
+#include <linux/capability.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/security.h>
+#include <linux/swap.h>
+#include <linux/syscalls.h>
+#include <linux/jiffies.h>
+#include <linux/futex.h>
+#include <linux/rcupdate.h>
+#include <linux/ptrace.h>
+#include <linux/mount.h>
+#include <linux/audit.h>
+#include <linux/profile.h>
+#include <linux/rmap.h>
+#include <linux/acct.h>
+#include <linux/cn_proc.h>
+
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/uaccess.h>
+#include <asm/mmu_context.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Protected counters by write_lock_irq(&tasklist_lock)
+ */
+unsigned long total_forks;	/* Handle normal Linux uptimes. */
+int nr_threads; 		/* The idle threads do not count.. */
+
+int max_threads;		/* tunable limit on nr_threads */
+
+DEFINE_PER_CPU(unsigned long, process_counts) = 0;
+
+ __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
+
+EXPORT_SYMBOL(tasklist_lock);
+
+int nr_processes(void)
+{
+	int cpu;
+	int total = 0;
+
+	for_each_online_cpu(cpu)
+		total += per_cpu(process_counts, cpu);
+
+	return total;
+}
+
+#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+# define alloc_task_struct()	kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
+# define free_task_struct(tsk)	kmem_cache_free(task_struct_cachep, (tsk))
+static kmem_cache_t *task_struct_cachep;
+#endif
+
+/* SLAB cache for signal_struct structures (tsk->signal) */
+static kmem_cache_t *signal_cachep;
+
+/* SLAB cache for sighand_struct structures (tsk->sighand) */
+kmem_cache_t *sighand_cachep;
+
+/* SLAB cache for files_struct structures (tsk->files) */
+kmem_cache_t *files_cachep;
+
+/* SLAB cache for fs_struct structures (tsk->fs) */
+kmem_cache_t *fs_cachep;
+
+/* SLAB cache for vm_area_struct structures */
+kmem_cache_t *vm_area_cachep;
+
+/* SLAB cache for mm_struct structures (tsk->mm) */
+static kmem_cache_t *mm_cachep;
+
+void free_task(struct task_struct *tsk)
+{
+	free_thread_info(tsk->thread_info);
+	free_task_struct(tsk);
+}
+EXPORT_SYMBOL(free_task);
+
+void __put_task_struct(struct task_struct *tsk)
+{
+	WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
+	WARN_ON(atomic_read(&tsk->usage));
+	WARN_ON(tsk == current);
+
+	if (unlikely(tsk->audit_context))
+		audit_free(tsk);
+	security_task_free(tsk);
+	free_uid(tsk->user);
+	put_group_info(tsk->group_info);
+
+	if (!profile_handoff_task(tsk))
+		free_task(tsk);
+}
+
+void __init fork_init(unsigned long mempages)
+{
+#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+#ifndef ARCH_MIN_TASKALIGN
+#define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
+#endif
+	/* create a slab on which task_structs can be allocated */
+	task_struct_cachep =
+		kmem_cache_create("task_struct", sizeof(struct task_struct),
+			ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL);
+#endif
+
+	/*
+	 * The default maximum number of threads is set to a safe
+	 * value: the thread structures can take up at most half
+	 * of memory.
+	 */
+	max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);
+
+	/*
+	 * we need to allow at least 20 threads to boot a system
+	 */
+	if(max_threads < 20)
+		max_threads = 20;
+
+	init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
+	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
+	init_task.signal->rlim[RLIMIT_SIGPENDING] =
+		init_task.signal->rlim[RLIMIT_NPROC];
+}
+
+static struct task_struct *dup_task_struct(struct task_struct *orig)
+{
+	int type;
+	struct task_struct *tsk;
+	struct thread_info *ti;
+
+	prepare_to_copy(orig);
+
+	tsk = alloc_task_struct();
+	if (!tsk)
+		return NULL;
+
+	ti = alloc_thread_info(tsk);
+	if (!ti) {
+		free_task_struct(tsk);
+		return NULL;
+	}
+
+	*tsk = *orig;
+	tsk->thread_info = ti;
+	setup_thread_stack(tsk, orig);
+
+	/* One for us, one for whoever does the "release_task()" (usually parent) */
+	atomic_set(&tsk->usage,2);
+	atomic_set(&tsk->fs_excl, 0);
+	tsk->btrace_seq = 0;
+	/* Initially there are no weak references to this task */
+	for (type = 0; type < PIDTYPE_MAX; type++) {
+		tsk->pids[type].nr = 0;
+		tsk->pids[type].tref = NULL;
+	}
+	return tsk;
+}
+
+#ifdef CONFIG_MMU
+static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+	struct vm_area_struct *mpnt, *tmp, **pprev;
+	struct rb_node **rb_link, *rb_parent;
+	int retval;
+	unsigned long charge;
+	struct mempolicy *pol;
+
+	down_write(&oldmm->mmap_sem);
+	flush_cache_mm(oldmm);
+	down_write(&mm->mmap_sem);
+
+	mm->locked_vm = 0;
+	mm->mmap = NULL;
+	mm->mmap_cache = NULL;
+	mm->free_area_cache = oldmm->mmap_base;
+	mm->cached_hole_size = ~0UL;
+	mm->map_count = 0;
+	cpus_clear(mm->cpu_vm_mask);
+	mm->mm_rb = RB_ROOT;
+	rb_link = &mm->mm_rb.rb_node;
+	rb_parent = NULL;
+	pprev = &mm->mmap;
+
+	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
+		struct file *file;
+
+		if (mpnt->vm_flags & VM_DONTCOPY) {
+			long pages = vma_pages(mpnt);
+			mm->total_vm -= pages;
+			vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
+								-pages);
+			continue;
+		}
+		charge = 0;
+		if (mpnt->vm_flags & VM_ACCOUNT) {
+			unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
+			if (security_vm_enough_memory(len))
+				goto fail_nomem;
+			charge = len;
+		}
+		tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+		if (!tmp)
+			goto fail_nomem;
+		*tmp = *mpnt;
+		pol = mpol_copy(vma_policy(mpnt));
+		retval = PTR_ERR(pol);
+		if (IS_ERR(pol))
+			goto fail_nomem_policy;
+		vma_set_policy(tmp, pol);
+		tmp->vm_flags &= ~VM_LOCKED;
+		tmp->vm_mm = mm;
+		tmp->vm_next = NULL;
+		anon_vma_link(tmp);
+		file = tmp->vm_file;
+		if (file) {
+			struct inode *inode = file->f_dentry->d_inode;
+			get_file(file);
+			if (tmp->vm_flags & VM_DENYWRITE)
+				atomic_dec(&inode->i_writecount);
+
+			/* insert tmp into the share list, just after mpnt */
+			spin_lock(&file->f_mapping->i_mmap_lock);
+			tmp->vm_truncate_count = mpnt->vm_truncate_count;
+			flush_dcache_mmap_lock(file->f_mapping);
+			vma_prio_tree_add(tmp, mpnt);
+			flush_dcache_mmap_unlock(file->f_mapping);
+			spin_unlock(&file->f_mapping->i_mmap_lock);
+		}
+
+		/*
+		 * Link in the new vma and copy the page table entries.
+		 */
+		*pprev = tmp;
+		pprev = &tmp->vm_next;
+
+		__vma_link_rb(mm, tmp, rb_link, rb_parent);
+		rb_link = &tmp->vm_rb.rb_right;
+		rb_parent = &tmp->vm_rb;
+
+		mm->map_count++;
+		retval = copy_page_range(mm, oldmm, mpnt);
+
+		if (tmp->vm_ops && tmp->vm_ops->open)
+			tmp->vm_ops->open(tmp);
+
+		if (retval)
+			goto out;
+	}
+	retval = 0;
+out:
+	up_write(&mm->mmap_sem);
+	flush_tlb_mm(oldmm);
+	up_write(&oldmm->mmap_sem);
+	return retval;
+fail_nomem_policy:
+	kmem_cache_free(vm_area_cachep, tmp);
+fail_nomem:
+	retval = -ENOMEM;
+	vm_unacct_memory(charge);
+	goto out;
+}
+
+static inline int mm_alloc_pgd(struct mm_struct * mm)
+{
+	mm->pgd = pgd_alloc(mm);
+	if (unlikely(!mm->pgd))
+		return -ENOMEM;
+	return 0;
+}
+
+static inline void mm_free_pgd(struct mm_struct * mm)
+{
+	pgd_free(mm->pgd);
+}
+#else
+#define dup_mmap(mm, oldmm)	(0)
+#define mm_alloc_pgd(mm)	(0)
+#define mm_free_pgd(mm)
+#endif /* CONFIG_MMU */
+
+ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
+
+#define allocate_mm()	(kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
+#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
+
+#include <linux/init_task.h>
+
+static struct mm_struct * mm_init(struct mm_struct * mm)
+{
+	atomic_set(&mm->mm_users, 1);
+	atomic_set(&mm->mm_count, 1);
+	init_rwsem(&mm->mmap_sem);
+	INIT_LIST_HEAD(&mm->mmlist);
+	mm->core_waiters = 0;
+	mm->nr_ptes = 0;
+	set_mm_counter(mm, file_rss, 0);
+	set_mm_counter(mm, anon_rss, 0);
+	spin_lock_init(&mm->page_table_lock);
+	rwlock_init(&mm->ioctx_list_lock);
+	mm->ioctx_list = NULL;
+	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	mm->cached_hole_size = ~0UL;
+
+	if (likely(!mm_alloc_pgd(mm))) {
+		mm->def_flags = 0;
+		return mm;
+	}
+	free_mm(mm);
+	return NULL;
+}
+
+/*
+ * Allocate and initialize an mm_struct.
+ */
+struct mm_struct * mm_alloc(void)
+{
+	struct mm_struct * mm;
+
+	mm = allocate_mm();
+	if (mm) {
+		memset(mm, 0, sizeof(*mm));
+		mm = mm_init(mm);
+	}
+	return mm;
+}
+
+/*
+ * Called when the last reference to the mm
+ * is dropped: either by a lazy thread or by
+ * mmput. Free the page directory and the mm.
+ */
+void fastcall __mmdrop(struct mm_struct *mm)
+{
+	BUG_ON(mm == &init_mm);
+	mm_free_pgd(mm);
+	destroy_context(mm);
+	free_mm(mm);
+}
+
+/*
+ * Decrement the use count and release all resources for an mm.
+ */
+void mmput(struct mm_struct *mm)
+{
+	if (atomic_dec_and_test(&mm->mm_users)) {
+		exit_aio(mm);
+		exit_mmap(mm);
+		if (!list_empty(&mm->mmlist)) {
+			spin_lock(&mmlist_lock);
+			list_del(&mm->mmlist);
+			spin_unlock(&mmlist_lock);
+		}
+		put_swap_token(mm);
+		mmdrop(mm);
+	}
+}
+EXPORT_SYMBOL_GPL(mmput);
+
+/**
+ * get_task_mm - acquire a reference to the task's mm
+ *
+ * Returns %NULL if the task has no mm.  Checks that PF_BORROWED_MM is
+ * not set (that flag means this kernel workthread has transiently
+ * adopted a user mm with use_mm, to do its AIO); if it is clear, takes
+ * a reference to the mm, bumping up the use count.  The caller must
+ * release the mm via mmput() after use.  Typically used by /proc and
+ * ptrace.
+ */
+struct mm_struct *get_task_mm(struct task_struct *task)
+{
+	struct mm_struct *mm;
+
+	task_lock(task);
+	mm = task->mm;
+	if (mm) {
+		if (task->flags & PF_BORROWED_MM)
+			mm = NULL;
+		else
+			atomic_inc(&mm->mm_users);
+	}
+	task_unlock(task);
+	return mm;
+}
+EXPORT_SYMBOL_GPL(get_task_mm);
+
+/* Please note the differences between mmput and mm_release.
+ * mmput is called whenever we stop holding onto a mm_struct,
+ * whether on error or success.
+ *
+ * mm_release is called after a mm_struct has been removed
+ * from the current process.
+ *
+ * This difference is important for error handling, when we
+ * only half set up a mm_struct for a new process and need to restore
+ * the old one.  Because we mmput the new mm_struct before
+ * restoring the old one. . .
+ * Eric Biederman 10 January 1998
+ */
+void mm_release(struct task_struct *tsk, struct mm_struct *mm)
+{
+	struct completion *vfork_done = tsk->vfork_done;
+
+	/* Get rid of any cached register state */
+	deactivate_mm(tsk, mm);
+
+	/* notify parent sleeping on vfork() */
+	if (vfork_done) {
+		tsk->vfork_done = NULL;
+		complete(vfork_done);
+	}
+	if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) {
+		u32 __user * tidptr = tsk->clear_child_tid;
+		tsk->clear_child_tid = NULL;
+
+		/*
+		 * We don't check the error code - if userspace has
+		 * not set up a proper pointer then tough luck.
+		 */
+		put_user(0, tidptr);
+		sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
+	}
+}
+
+/*
+ * Allocate a new mm structure and copy contents from the
+ * mm structure of the passed in task structure.
+ */
+static struct mm_struct *dup_mm(struct task_struct *tsk)
+{
+	struct mm_struct *mm, *oldmm = current->mm;
+	int err;
+
+	if (!oldmm)
+		return NULL;
+
+	mm = allocate_mm();
+	if (!mm)
+		goto fail_nomem;
+
+	memcpy(mm, oldmm, sizeof(*mm));
+
+	if (!mm_init(mm))
+		goto fail_nomem;
+
+	if (init_new_context(tsk, mm))
+		goto fail_nocontext;
+
+	err = dup_mmap(mm, oldmm);
+	if (err)
+		goto free_pt;
+
+	mm->hiwater_rss = get_mm_rss(mm);
+	mm->hiwater_vm = mm->total_vm;
+
+	return mm;
+
+free_pt:
+	mmput(mm);
+
+fail_nomem:
+	return NULL;
+
+fail_nocontext:
+	/*
+	 * If init_new_context() failed, we cannot use mmput() to free the mm
+	 * because it calls destroy_context()
+	 */
+	mm_free_pgd(mm);
+	free_mm(mm);
+	return NULL;
+}
+
+static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
+{
+	struct mm_struct * mm, *oldmm;
+	int retval;
+
+	tsk->min_flt = tsk->maj_flt = 0;
+	tsk->nvcsw = tsk->nivcsw = 0;
+
+	tsk->mm = NULL;
+	tsk->active_mm = NULL;
+
+	/*
+	 * Are we cloning a kernel thread?
+	 *
+	 * We need to steal an active VM for that.
+	 */
+	oldmm = current->mm;
+	if (!oldmm)
+		return 0;
+
+	if (clone_flags & CLONE_VM) {
+		atomic_inc(&oldmm->mm_users);
+		mm = oldmm;
+		goto good_mm;
+	}
+
+	retval = -ENOMEM;
+	mm = dup_mm(tsk);
+	if (!mm)
+		goto fail_nomem;
+
+good_mm:
+	tsk->mm = mm;
+	tsk->active_mm = mm;
+	return 0;
+
+fail_nomem:
+	return retval;
+}
+
+static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
+{
+	struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
+	/* We don't need to lock fs - think why ;-) */
+	if (fs) {
+		atomic_set(&fs->count, 1);
+		rwlock_init(&fs->lock);
+		fs->umask = old->umask;
+		read_lock(&old->lock);
+		fs->rootmnt = mntget(old->rootmnt);
+		fs->root = dget(old->root);
+		fs->pwdmnt = mntget(old->pwdmnt);
+		fs->pwd = dget(old->pwd);
+		if (old->altroot) {
+			fs->altrootmnt = mntget(old->altrootmnt);
+			fs->altroot = dget(old->altroot);
+		} else {
+			fs->altrootmnt = NULL;
+			fs->altroot = NULL;
+		}
+		read_unlock(&old->lock);
+	}
+	return fs;
+}
+
+struct fs_struct *copy_fs_struct(struct fs_struct *old)
+{
+	return __copy_fs_struct(old);
+}
+
+EXPORT_SYMBOL_GPL(copy_fs_struct);
+
+static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
+{
+	if (clone_flags & CLONE_FS) {
+		atomic_inc(&current->fs->count);
+		return 0;
+	}
+	tsk->fs = __copy_fs_struct(current->fs);
+	if (!tsk->fs)
+		return -ENOMEM;
+	return 0;
+}
+
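+/*
+ * Scan the open-fd bitmap backwards, one word at a time, for the last
+ * word with a bit set. Note that the result is rounded up to a whole
+ * word (a multiple of 8 * sizeof(long) fds), not the exact number of
+ * open fds.
+ */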
+static int count_open_files(struct fdtable *fdt)
+{
+	int size = fdt->max_fdset;
+	int i;
+
+	/* Find the last open fd */
+	for (i = size/(8*sizeof(long)); i > 0; ) {
+		if (fdt->open_fds->fds_bits[--i])
+			break;
+	}
+	i = (i+1) * 8 * sizeof(long);
+	return i;
+}
+
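+/*
+ * Allocate a files_struct and point its fdtable at the default-sized
+ * arrays embedded in the structure itself; dup_fd() grows it via
+ * expand_files() later if that turns out to be too small.
+ */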
+static struct files_struct *alloc_files(void)
+{
+	struct files_struct *newf;
+	struct fdtable *fdt;
+
+	newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
+	if (!newf)
+		goto out;
+
+	atomic_set(&newf->count, 1);
+
+	spin_lock_init(&newf->file_lock);
+	newf->next_fd = 0;
+	fdt = &newf->fdtab;
+	fdt->max_fds = NR_OPEN_DEFAULT;
+	fdt->max_fdset = EMBEDDED_FD_SET_SIZE;
+	fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
+	fdt->open_fds = (fd_set *)&newf->open_fds_init;
+	fdt->fd = &newf->fd_array[0];
+	INIT_RCU_HEAD(&fdt->rcu);
+	fdt->free_files = NULL;
+	fdt->next = NULL;
+	rcu_assign_pointer(newf->fdt, fdt);
+out:
+	return newf;
+}
+
+/*
+ * Allocate a new files structure and copy contents from the
+ * passed in files structure.
+ */
+static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
+{
+	struct files_struct *newf;
+	struct file **old_fds, **new_fds;
+	int open_files, size, i, expand;
+	struct fdtable *old_fdt, *new_fdt;
+
+	newf = alloc_files();
+	if (!newf)
+		goto out;
+
+	spin_lock(&oldf->file_lock);
+	old_fdt = files_fdtable(oldf);
+	new_fdt = files_fdtable(newf);
+	size = old_fdt->max_fdset;
+	open_files = count_open_files(old_fdt);
+	expand = 0;
+
+	/*
+	 * Check whether we need to allocate a larger fd array or fd set.
+	 * Note: we're not a clone task, so the open count won't change.
+	 */
+	if (open_files > new_fdt->max_fdset) {
+		new_fdt->max_fdset = 0;
+		expand = 1;
+	}
+	if (open_files > new_fdt->max_fds) {
+		new_fdt->max_fds = 0;
+		expand = 1;
+	}
+
+	/* if the old fdset gets grown now, we'll only copy up to "size" fds */
+	if (expand) {
+		spin_unlock(&oldf->file_lock);
+		spin_lock(&newf->file_lock);
+		*errorp = expand_files(newf, open_files-1);
+		spin_unlock(&newf->file_lock);
+		if (*errorp < 0)
+			goto out_release;
+		new_fdt = files_fdtable(newf);
+		/*
+		 * Reacquire the oldf lock and a pointer to its fd table:
+		 * it may have grown a new, bigger fd table in the meantime,
+		 * and we need the latest pointer.
+		 */
+		spin_lock(&oldf->file_lock);
+		old_fdt = files_fdtable(oldf);
+	}
+
+	old_fds = old_fdt->fd;
+	new_fds = new_fdt->fd;
+
+	memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8);
+	memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8);
+
+	for (i = open_files; i != 0; i--) {
+		struct file *f = *old_fds++;
+		if (f) {
+			get_file(f);
+		} else {
+			/*
+			 * The fd may be claimed in the fd bitmap but not yet
+			 * instantiated in the files array if a sibling thread
+			 * is partway through open().  So make sure that this
+			 * fd is available to the new process.
+			 */
+			FD_CLR(open_files - i, new_fdt->open_fds);
+		}
+		rcu_assign_pointer(*new_fds++, f);
+	}
+	spin_unlock(&oldf->file_lock);
+
+	/* compute the remainder to be cleared */
+	size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
+
+	/* This is long word aligned thus could use an optimized version */
+	memset(new_fds, 0, size);
+
+	if (new_fdt->max_fdset > open_files) {
+		int left = (new_fdt->max_fdset-open_files)/8;
+		int start = open_files / (8 * sizeof(unsigned long));
+
+		memset(&new_fdt->open_fds->fds_bits[start], 0, left);
+		memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
+	}
+
+out:
+	return newf;
+
+out_release:
+	free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
+	free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
+	free_fd_array(new_fdt->fd, new_fdt->max_fds);
+	kmem_cache_free(files_cachep, newf);
+	goto out;
+}
+
+static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
+{
+	struct files_struct *oldf, *newf;
+	int error = 0;
+
+	/*
+	 * A background process may not have any files ...
+	 */
+	oldf = current->files;
+	if (!oldf)
+		goto out;
+
+	if (clone_flags & CLONE_FILES) {
+		atomic_inc(&oldf->count);
+		goto out;
+	}
+
+	/*
+	 * Note: we may be using current for both targets (See exec.c)
+	 * This works because we cache current->files (old) as oldf. Don't
+	 * break this.
+	 */
+	tsk->files = NULL;
+	error = -ENOMEM;
+	newf = dup_fd(oldf, &error);
+	if (!newf)
+		goto out;
+
+	tsk->files = newf;
+	error = 0;
+out:
+	return error;
+}
+
+/*
+ *	Helper to unshare the files of the current task.
+ *	We don't want to expose copy_files internals to
+ *	the exec layer of the kernel.
+ */
+
+int unshare_files(void)
+{
+	struct files_struct *files  = current->files;
+	int rc;
+
+	BUG_ON(!files);
+
+	/* This can race, but the race merely causes us to make a copy
+	   we don't need and then drop it. */
+	if (atomic_read(&files->count) == 1) {
+		atomic_inc(&files->count);
+		return 0;
+	}
+	rc = copy_files(0, current);
+	if (rc)
+		current->files = files;
+	return rc;
+}
+
+EXPORT_SYMBOL(unshare_files);
+
+static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
+{
+	struct sighand_struct *sig;
+
+	if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) {
+		atomic_inc(&current->sighand->count);
+		return 0;
+	}
+	sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
+	rcu_assign_pointer(tsk->sighand, sig);
+	if (!sig)
+		return -ENOMEM;
+	atomic_set(&sig->count, 1);
+	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
+	return 0;
+}
+
+void cleanup_sighand(struct task_struct *tsk)
+{
+	struct sighand_struct * sighand = tsk->sighand;
+
+	/* Ok, we're done with the signal handlers */
+	tsk->sighand = NULL;
+	if (atomic_dec_and_test(&sighand->count))
+		kmem_cache_free(sighand_cachep, sighand);
+}
+
+static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
+{
+	struct signal_struct *sig;
+	int ret;
+
+	if (clone_flags & CLONE_THREAD) {
+		atomic_inc(&current->signal->count);
+		atomic_inc(&current->signal->live);
+		return 0;
+	}
+	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
+	tsk->signal = sig;
+	if (!sig)
+		return -ENOMEM;
+
+	ret = copy_thread_group_keys(tsk);
+	if (ret < 0) {
+		kmem_cache_free(signal_cachep, sig);
+		return ret;
+	}
+
+	atomic_set(&sig->count, 1);
+	atomic_set(&sig->live, 1);
+	init_waitqueue_head(&sig->wait_chldexit);
+	sig->flags = 0;
+	sig->group_exit_code = 0;
+	sig->group_exit_task = NULL;
+	sig->group_stop_count = 0;
+	sig->curr_target = NULL;
+	init_sigpending(&sig->shared_pending);
+	INIT_LIST_HEAD(&sig->posix_timers);
+
+	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL);
+	sig->it_real_incr.tv64 = 0;
+	sig->real_timer.function = it_real_fn;
+	sig->real_timer.data = tsk;
+
+	sig->it_virt_expires = cputime_zero;
+	sig->it_virt_incr = cputime_zero;
+	sig->it_prof_expires = cputime_zero;
+	sig->it_prof_incr = cputime_zero;
+
+	sig->leader = 0;	/* session leadership doesn't inherit */
+	sig->tty_old_pgrp = 0;
+
+	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
+	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
+	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
+	sig->sched_time = 0;
+	INIT_LIST_HEAD(&sig->cpu_timers[0]);
+	INIT_LIST_HEAD(&sig->cpu_timers[1]);
+	INIT_LIST_HEAD(&sig->cpu_timers[2]);
+
+	task_lock(current->group_leader);
+	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
+	task_unlock(current->group_leader);
+
+	if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
+		/*
+		 * New sole thread in the process gets an expiry time
+		 * of the whole CPU time limit.
+		 */
+		tsk->it_prof_expires =
+			secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
+	}
+
+	return 0;
+}
+
+void __cleanup_signal(struct signal_struct *sig)
+{
+	exit_thread_group_keys(sig);
+	kmem_cache_free(signal_cachep, sig);
+}
+
+static inline void cleanup_signal(struct task_struct *tsk)
+{
+	struct signal_struct *sig = tsk->signal;
+
+	atomic_dec(&sig->live);
+
+	if (atomic_dec_and_test(&sig->count))
+		__cleanup_signal(sig);
+}
+
+static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
+{
+	unsigned long new_flags = p->flags;
+
+	new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE);
+	new_flags |= PF_FORKNOEXEC;
+	if (!(clone_flags & CLONE_PTRACE))
+		p->ptrace = 0;
+	p->flags = new_flags;
+}
+
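+/*
+ * Record where to clear the child tid on exit (see the clear_child_tid
+ * handling in mm_release() above). Returns the caller's pid.
+ */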
+asmlinkage long sys_set_tid_address(int __user *tidptr)
+{
+	current->clear_child_tid = tidptr;
+
+	return current->pid;
+}
+
+/*
+ * This creates a new process as a copy of the old one,
+ * but does not actually start it yet.
+ *
+ * It copies the registers, and all the appropriate
+ * parts of the process environment (as per the clone
+ * flags). The actual kick-off is left to the caller.
+ */
+static task_t *copy_process(unsigned long clone_flags,
+				 unsigned long stack_start,
+				 struct pt_regs *regs,
+				 unsigned long stack_size,
+				 int __user *parent_tidptr,
+				 int __user *child_tidptr,
+				 int pid)
+{
+	int retval;
+	struct task_struct *p = NULL;
+
+	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * Thread groups must share signals as well, and detached threads
+	 * can only be started up within the thread group.
+	 */
+	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * Shared signal handlers imply shared VM. By way of the above,
+	 * thread groups also imply shared VM. Blocking this case allows
+	 * for various simplifications in other code.
+	 */
+	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
+		return ERR_PTR(-EINVAL);
+
+	retval = security_task_create(clone_flags);
+	if (retval)
+		goto fork_out;
+
+	retval = -ENOMEM;
+	p = dup_task_struct(current);
+	if (!p)
+		goto fork_out;
+
+	retval = -EAGAIN;
+	if (atomic_read(&p->user->processes) >=
+			p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
+		if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
+				p->user != &root_user)
+			goto bad_fork_free;
+	}
+
+	atomic_inc(&p->user->__count);
+	atomic_inc(&p->user->processes);
+	get_group_info(p->group_info);
+
+	/*
+	 * If multiple threads are within copy_process(), then this check
+	 * triggers too late. This doesn't hurt, the check is only there
+	 * to stop root fork bombs.
+	 */
+	if (nr_threads >= max_threads)
+		goto bad_fork_cleanup_count;
+
+	if (!try_module_get(task_thread_info(p)->exec_domain->module))
+		goto bad_fork_cleanup_count;
+
+	if (p->binfmt && !try_module_get(p->binfmt->module))
+		goto bad_fork_cleanup_put_domain;
+
+	p->did_exec = 0;
+	copy_flags(clone_flags, p);
+	p->pid = pid;
+	retval = -EFAULT;
+	if (clone_flags & CLONE_PARENT_SETTID)
+		if (put_user(p->pid, parent_tidptr))
+			goto bad_fork_cleanup;
+
+	INIT_LIST_HEAD(&p->children);
+	INIT_LIST_HEAD(&p->sibling);
+	p->vfork_done = NULL;
+	spin_lock_init(&p->alloc_lock);
+
+	clear_tsk_thread_flag(p, TIF_SIGPENDING);
+	init_sigpending(&p->pending);
+
+	p->utime = cputime_zero;
+	p->stime = cputime_zero;
+ 	p->sched_time = 0;
+	p->rchar = 0;		/* I/O counter: bytes read */
+	p->wchar = 0;		/* I/O counter: bytes written */
+	p->syscr = 0;		/* I/O counter: read syscalls */
+	p->syscw = 0;		/* I/O counter: write syscalls */
+	acct_clear_integrals(p);
+
+ 	p->it_virt_expires = cputime_zero;
+	p->it_prof_expires = cputime_zero;
+ 	p->it_sched_expires = 0;
+ 	INIT_LIST_HEAD(&p->cpu_timers[0]);
+ 	INIT_LIST_HEAD(&p->cpu_timers[1]);
+ 	INIT_LIST_HEAD(&p->cpu_timers[2]);
+
+	p->lock_depth = -1;		/* -1 = no lock */
+	do_posix_clock_monotonic_gettime(&p->start_time);
+	p->security = NULL;
+	p->io_context = NULL;
+	p->io_wait = NULL;
+	p->audit_context = NULL;
+	cpuset_fork(p);
+#ifdef CONFIG_NUMA
+ 	p->mempolicy = mpol_copy(p->mempolicy);
+ 	if (IS_ERR(p->mempolicy)) {
+ 		retval = PTR_ERR(p->mempolicy);
+ 		p->mempolicy = NULL;
+ 		goto bad_fork_cleanup_cpuset;
+ 	}
+	mpol_fix_fork_child_flag(p);
+#endif
+
+#ifdef CONFIG_DEBUG_MUTEXES
+	p->blocked_on = NULL; /* not blocked yet */
+#endif
+
+	p->tgid = p->pid;
+	if (clone_flags & CLONE_THREAD)
+		p->tgid = current->tgid;
+
+	if ((retval = security_task_alloc(p)))
+		goto bad_fork_cleanup_policy;
+	if ((retval = audit_alloc(p)))
+		goto bad_fork_cleanup_security;
+	/* copy all the process information */
+	if ((retval = copy_semundo(clone_flags, p)))
+		goto bad_fork_cleanup_audit;
+	if ((retval = copy_files(clone_flags, p)))
+		goto bad_fork_cleanup_semundo;
+	if ((retval = copy_fs(clone_flags, p)))
+		goto bad_fork_cleanup_files;
+	if ((retval = copy_sighand(clone_flags, p)))
+		goto bad_fork_cleanup_fs;
+	if ((retval = copy_signal(clone_flags, p)))
+		goto bad_fork_cleanup_sighand;
+	if ((retval = copy_mm(clone_flags, p)))
+		goto bad_fork_cleanup_signal;
+	if ((retval = copy_keys(clone_flags, p)))
+		goto bad_fork_cleanup_mm;
+	if ((retval = copy_namespace(clone_flags, p)))
+		goto bad_fork_cleanup_keys;
+	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
+	if (retval)
+		goto bad_fork_cleanup_namespace;
+
+	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
+	/*
+	 * Clear TID on mm_release()?
+	 */
+	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
+
+	/*
+	 * Syscall tracing should be turned off in the child regardless
+	 * of CLONE_PTRACE.
+	 */
+	clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
+#ifdef TIF_SYSCALL_EMU
+	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
+#endif
+
+	/* Our parent's execution domain becomes our current domain.
+	   These must match for thread signalling to apply. */
+
+	p->parent_exec_id = p->self_exec_id;
+
+	/* ok, now we should be set up.. */
+	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
+	p->pdeath_signal = 0;
+	p->exit_state = 0;
+
+	/*
+	 * Ok, make it visible to the rest of the system.
+	 * We don't wake it up yet.
+	 */
+	p->group_leader = p;
+	INIT_LIST_HEAD(&p->ptrace_children);
+	INIT_LIST_HEAD(&p->ptrace_list);
+
+	/* Perform scheduler related setup. Assign this task to a CPU. */
+	sched_fork(p, clone_flags);
+
+	/* Need tasklist lock for parent etc handling! */
+	write_lock_irq(&tasklist_lock);
+
+	/*
+	 * The task hasn't been attached yet, so its cpus_allowed mask will
+	 * not be changed, nor will its assigned CPU.
+	 *
+	 * The cpus_allowed mask of the parent may have changed since it was
+	 * first copied - so re-copy it here, then check the child's CPU to
+	 * ensure it is on a valid CPU (and if not, just force it back to the
+	 * parent's CPU). This avoids a lot of nasty races.
+	 */
+	p->cpus_allowed = current->cpus_allowed;
+	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
+			!cpu_online(task_cpu(p))))
+		set_task_cpu(p, smp_processor_id());
+
+	/*
+	 * Check for pending SIGKILL! The new thread should not be allowed
+	 * to slip out of an OOM kill. (or normal SIGKILL.)
+	 */
+	if (sigismember(&current->pending.signal, SIGKILL)) {
+		write_unlock_irq(&tasklist_lock);
+		retval = -EINTR;
+		goto bad_fork_cleanup_namespace;
+	}
+
+	/* CLONE_PARENT re-uses the old parent */
+	if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
+		p->real_parent = current->real_parent;
+	else
+		p->real_parent = current;
+	p->parent = p->real_parent;
+
+	spin_lock(&current->sighand->siglock);
+	if (clone_flags & CLONE_THREAD) {
+		/*
+		 * Important: if an exit-all has been started then
+		 * do not create this new thread - the whole thread
+		 * group is supposed to exit anyway.
+		 */
+		if (current->signal->flags & SIGNAL_GROUP_EXIT) {
+			spin_unlock(&current->sighand->siglock);
+			write_unlock_irq(&tasklist_lock);
+			retval = -EAGAIN;
+			goto bad_fork_cleanup_namespace;
+		}
+		p->group_leader = current->group_leader;
+
+		if (current->signal->group_stop_count > 0) {
+			/*
+			 * There is an all-stop in progress for the group.
+			 * We ourselves will stop as soon as we check signals.
+			 * Make the new thread part of that group stop too.
+			 */
+			current->signal->group_stop_count++;
+			set_tsk_thread_flag(p, TIF_SIGPENDING);
+		}
+
+		if (!cputime_eq(current->signal->it_virt_expires,
+				cputime_zero) ||
+		    !cputime_eq(current->signal->it_prof_expires,
+				cputime_zero) ||
+		    current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
+		    !list_empty(&current->signal->cpu_timers[0]) ||
+		    !list_empty(&current->signal->cpu_timers[1]) ||
+		    !list_empty(&current->signal->cpu_timers[2])) {
+			/*
+			 * Have child wake up on its first tick to check
+			 * for process CPU timers.
+			 */
+			p->it_prof_expires = jiffies_to_cputime(1);
+		}
+	}
+
+	/*
+	 * inherit ioprio
+	 */
+	p->ioprio = current->ioprio;
+
+	if (likely(p->pid)) {
+		add_parent(p);
+		if (unlikely(p->ptrace & PT_PTRACED))
+			__ptrace_link(p, current->parent);
+
+		if (thread_group_leader(p)) {
+			if (unlikely(p->pid == 1)) {
+				p->signal->tty = NULL;
+				p->signal->leader = 1;
+				p->signal->pgrp = 1;
+				p->signal->session = 1;
+			} else {
+				p->signal->tty = current->signal->tty;
+				p->signal->pgrp = process_group(current);
+				p->signal->session = current->signal->session;
+			}
+			attach_pid(p, PIDTYPE_PGID, process_group(p));
+			attach_pid(p, PIDTYPE_SID, p->signal->session);
+
+			list_add_tail(&p->tasks, &init_task.tasks);
+			__get_cpu_var(process_counts)++;
+		}
+		attach_pid(p, PIDTYPE_TGID, p->tgid);
+		attach_pid(p, PIDTYPE_PID, p->pid);
+		nr_threads++;
+	}
+
+	total_forks++;
+	spin_unlock(&current->sighand->siglock);
+	write_unlock_irq(&tasklist_lock);
+	proc_fork_connector(p);
+	return p;
+
+bad_fork_cleanup_namespace:
+	exit_namespace(p);
+bad_fork_cleanup_keys:
+	exit_keys(p);
+bad_fork_cleanup_mm:
+	if (p->mm)
+		mmput(p->mm);
+bad_fork_cleanup_signal:
+	cleanup_signal(p);
+bad_fork_cleanup_sighand:
+	cleanup_sighand(p);
+bad_fork_cleanup_fs:
+	exit_fs(p); /* blocking */
+bad_fork_cleanup_files:
+	exit_files(p); /* blocking */
+bad_fork_cleanup_semundo:
+	exit_sem(p);
+bad_fork_cleanup_audit:
+	audit_free(p);
+bad_fork_cleanup_security:
+	security_task_free(p);
+bad_fork_cleanup_policy:
+#ifdef CONFIG_NUMA
+	mpol_free(p->mempolicy);
+bad_fork_cleanup_cpuset:
+#endif
+	cpuset_exit(p);
+bad_fork_cleanup:
+	if (p->binfmt)
+		module_put(p->binfmt->module);
+bad_fork_cleanup_put_domain:
+	module_put(task_thread_info(p)->exec_domain->module);
+bad_fork_cleanup_count:
+	put_group_info(p->group_info);
+	atomic_dec(&p->user->processes);
+	free_uid(p->user);
+bad_fork_free:
+	free_task(p);
+fork_out:
+	return ERR_PTR(retval);
+}
+
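+/*
+ * Default (weak) implementation: architectures needing a non-trivial
+ * register state for their idle threads can override this.
+ */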
+struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
+{
+	memset(regs, 0, sizeof(struct pt_regs));
+	return regs;
+}
+
+task_t * __devinit fork_idle(int cpu)
+{
+	task_t *task;
+	struct pt_regs regs;
+
+	task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0);
+	if (!task)
+		return ERR_PTR(-ENOMEM);
+	init_idle(task, cpu);
+
+	return task;
+}
+
+static inline int fork_traceflag (unsigned clone_flags)
+{
+	if (clone_flags & CLONE_UNTRACED)
+		return 0;
+	else if (clone_flags & CLONE_VFORK) {
+		if (current->ptrace & PT_TRACE_VFORK)
+			return PTRACE_EVENT_VFORK;
+	} else if ((clone_flags & CSIGNAL) != SIGCHLD) {
+		if (current->ptrace & PT_TRACE_CLONE)
+			return PTRACE_EVENT_CLONE;
+	} else if (current->ptrace & PT_TRACE_FORK)
+		return PTRACE_EVENT_FORK;
+
+	return 0;
+}
+
+/*
+ *  Ok, this is the main fork-routine.
+ *
+ * It copies the process, and if successful kick-starts
+ * it and waits for it to finish using the VM if required.
+ */
+long do_fork(unsigned long clone_flags,
+	      unsigned long stack_start,
+	      struct pt_regs *regs,
+	      unsigned long stack_size,
+	      int __user *parent_tidptr,
+	      int __user *child_tidptr)
+{
+	struct task_struct *p;
+	int trace = 0;
+	long pid = alloc_pidmap();
+
+	if (pid < 0)
+		return -EAGAIN;
+	if (unlikely(current->ptrace)) {
+		trace = fork_traceflag (clone_flags);
+		if (trace)
+			clone_flags |= CLONE_PTRACE;
+	}
+
+	p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
+	/*
+	 * Do this prior to waking up the new thread - the thread pointer
+	 * might become invalid after that point, if the thread exits quickly.
+	 */
+	if (!IS_ERR(p)) {
+		struct completion vfork;
+
+		if (clone_flags & CLONE_VFORK) {
+			p->vfork_done = &vfork;
+			init_completion(&vfork);
+		}
+
+		if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
+			/*
+			 * We'll start up with an immediate SIGSTOP.
+			 */
+			sigaddset(&p->pending.signal, SIGSTOP);
+			set_tsk_thread_flag(p, TIF_SIGPENDING);
+		}
+
+		if (!(clone_flags & CLONE_STOPPED))
+			wake_up_new_task(p, clone_flags);
+		else
+			p->state = TASK_STOPPED;
+
+		if (unlikely (trace)) {
+			current->ptrace_message = pid;
+			ptrace_notify ((trace << 8) | SIGTRAP);
+		}
+
+		if (clone_flags & CLONE_VFORK) {
+			wait_for_completion(&vfork);
+			if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
+				ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
+		}
+	} else {
+		free_pidmap(pid);
+		pid = PTR_ERR(p);
+	}
+	return pid;
+}
+
+#ifndef ARCH_MIN_MMSTRUCT_ALIGN
+#define ARCH_MIN_MMSTRUCT_ALIGN 0
+#endif
+
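+/*
+ * Slab constructor for sighand_struct: initialise the siglock once,
+ * when the object is first constructed, rather than on every
+ * allocation (the cache is created SLAB_DESTROY_BY_RCU below).
+ */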
+static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
+{
+	struct sighand_struct *sighand = data;
+
+	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
+					SLAB_CTOR_CONSTRUCTOR)
+		spin_lock_init(&sighand->siglock);
+}
+
+void __init proc_caches_init(void)
+{
+	sighand_cachep = kmem_cache_create("sighand_cache",
+			sizeof(struct sighand_struct), 0,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
+			sighand_ctor, NULL);
+	signal_cachep = kmem_cache_create("signal_cache",
+			sizeof(struct signal_struct), 0,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+	files_cachep = kmem_cache_create("files_cache", 
+			sizeof(struct files_struct), 0,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+	fs_cachep = kmem_cache_create("fs_cache", 
+			sizeof(struct fs_struct), 0,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+	vm_area_cachep = kmem_cache_create("vm_area_struct",
+			sizeof(struct vm_area_struct), 0,
+			SLAB_PANIC, NULL, NULL);
+	mm_cachep = kmem_cache_create("mm_struct",
+			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+}
+
+
+/*
+ * Check constraints on flags passed to the unshare system call and
+ * force unsharing of additional process context as appropriate.
+ */
+static inline void check_unshare_flags(unsigned long *flags_ptr)
+{
+	/*
+	 * If unsharing a thread from a thread group, must also
+	 * unshare vm.
+	 */
+	if (*flags_ptr & CLONE_THREAD)
+		*flags_ptr |= CLONE_VM;
+
+	/*
+	 * If unsharing vm, must also unshare signal handlers.
+	 */
+	if (*flags_ptr & CLONE_VM)
+		*flags_ptr |= CLONE_SIGHAND;
+
+	/*
+	 * If unsharing signal handlers and the task was created
+	 * using CLONE_THREAD, then we must also unshare the thread group.
+	 */
+	if ((*flags_ptr & CLONE_SIGHAND) &&
+	    (atomic_read(&current->signal->count) > 1))
+		*flags_ptr |= CLONE_THREAD;
+
+	/*
+	 * If unsharing namespace, must also unshare filesystem information.
+	 */
+	if (*flags_ptr & CLONE_NEWNS)
+		*flags_ptr |= CLONE_FS;
+}
+
+/*
+ * Unsharing of tasks created with CLONE_THREAD is not supported yet
+ */
+static int unshare_thread(unsigned long unshare_flags)
+{
+	if (unshare_flags & CLONE_THREAD)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Unshare the filesystem structure if it is being shared
+ */
+static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
+{
+	struct fs_struct *fs = current->fs;
+
+	if ((unshare_flags & CLONE_FS) &&
+	    (fs && atomic_read(&fs->count) > 1)) {
+		*new_fsp = __copy_fs_struct(current->fs);
+		if (!*new_fsp)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Unshare the namespace structure if it is being shared
+ */
+static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs)
+{
+	struct namespace *ns = current->namespace;
+
+	if ((unshare_flags & CLONE_NEWNS) &&
+	    (ns && atomic_read(&ns->count) > 1)) {
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		*new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs);
+		if (!*new_nsp)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Unsharing of sighand for tasks created with CLONE_SIGHAND is not
+ * supported yet
+ */
+static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
+{
+	struct sighand_struct *sigh = current->sighand;
+
+	if ((unshare_flags & CLONE_SIGHAND) &&
+	    (sigh && atomic_read(&sigh->count) > 1))
+		return -EINVAL;
+	else
+		return 0;
+}
+
+/*
+ * Unshare vm if it is being shared
+ */
+static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
+{
+	struct mm_struct *mm = current->mm;
+
+	if ((unshare_flags & CLONE_VM) &&
+	    (mm && atomic_read(&mm->mm_users) > 1)) {
+		*new_mmp = dup_mm(current);
+		if (!*new_mmp)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Unshare file descriptor table if it is being shared
+ */
+static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
+{
+	struct files_struct *fd = current->files;
+	int error = 0;
+
+	if ((unshare_flags & CLONE_FILES) &&
+	    (fd && atomic_read(&fd->count) > 1)) {
+		*new_fdp = dup_fd(fd, &error);
+		if (!*new_fdp)
+			return error;
+	}
+
+	return 0;
+}
+
+/*
+ * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not
+ * supported yet
+ */
+static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp)
+{
+	if (unshare_flags & CLONE_SYSVSEM)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * unshare allows a process to 'unshare' part of the process
+ * context which was originally shared using clone.  copy_*
+ * functions used by do_fork() cannot be used here directly
+ * because they modify an inactive task_struct that is being
+ * constructed. Here we are modifying the current, active,
+ * task_struct.
+ */
+asmlinkage long sys_unshare(unsigned long unshare_flags)
+{
+	int err = 0;
+	struct fs_struct *fs, *new_fs = NULL;
+	struct namespace *ns, *new_ns = NULL;
+	struct sighand_struct *sigh, *new_sigh = NULL;
+	struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
+	struct files_struct *fd, *new_fd = NULL;
+	struct sem_undo_list *new_ulist = NULL;
+
+	check_unshare_flags(&unshare_flags);
+
+	if ((err = unshare_thread(unshare_flags)))
+		goto bad_unshare_out;
+	if ((err = unshare_fs(unshare_flags, &new_fs)))
+		goto bad_unshare_cleanup_thread;
+	if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs)))
+		goto bad_unshare_cleanup_fs;
+	if ((err = unshare_sighand(unshare_flags, &new_sigh)))
+		goto bad_unshare_cleanup_ns;
+	if ((err = unshare_vm(unshare_flags, &new_mm)))
+		goto bad_unshare_cleanup_sigh;
+	if ((err = unshare_fd(unshare_flags, &new_fd)))
+		goto bad_unshare_cleanup_vm;
+	if ((err = unshare_semundo(unshare_flags, &new_ulist)))
+		goto bad_unshare_cleanup_fd;
+
+	if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) {
+
+		task_lock(current);
+
+		if (new_fs) {
+			fs = current->fs;
+			current->fs = new_fs;
+			new_fs = fs;
+		}
+
+		if (new_ns) {
+			ns = current->namespace;
+			current->namespace = new_ns;
+			new_ns = ns;
+		}
+
+		if (new_sigh) {
+			sigh = current->sighand;
+			current->sighand = new_sigh;
+			new_sigh = sigh;
+		}
+
+		if (new_mm) {
+			mm = current->mm;
+			active_mm = current->active_mm;
+			current->mm = new_mm;
+			current->active_mm = new_mm;
+			activate_mm(active_mm, new_mm);
+			new_mm = mm;
+		}
+
+		if (new_fd) {
+			fd = current->files;
+			current->files = new_fd;
+			new_fd = fd;
+		}
+
+		task_unlock(current);
+	}
+
+bad_unshare_cleanup_fd:
+	if (new_fd)
+		put_files_struct(new_fd);
+
+bad_unshare_cleanup_vm:
+	if (new_mm)
+		mmput(new_mm);
+
+bad_unshare_cleanup_sigh:
+	if (new_sigh)
+		if (atomic_dec_and_test(&new_sigh->count))
+			kmem_cache_free(sighand_cachep, new_sigh);
+
+bad_unshare_cleanup_ns:
+	if (new_ns)
+		put_namespace(new_ns);
+
+bad_unshare_cleanup_fs:
+	if (new_fs)
+		put_fs_struct(new_fs);
+
+bad_unshare_cleanup_thread:
+bad_unshare_out:
+	return err;
+}
diff -urN oldtree/kernel/kmod.c newtree/kernel/kmod.c
--- oldtree/kernel/kmod.c	2006-03-08 18:48:02.948063250 +0000
+++ newtree/kernel/kmod.c	2006-03-08 15:22:33.233504250 +0000
@@ -36,6 +36,7 @@
 #include <linux/mount.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/freezer.h>
 #include <asm/uaccess.h>
 
 extern int max_threads;
@@ -249,6 +250,9 @@
 	if (!khelper_wq)
 		return -EBUSY;
 
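+	/* Don't spawn usermode helpers while the freezer is active;
+	 * returning 0 rather than an error presumably keeps callers
+	 * happy across a suspend cycle. */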
+	if (freezer_is_on())
+		return 0;
+
 	if (path[0] == '\0')
 		return 0;
 
diff -urN oldtree/kernel/kthread.c newtree/kernel/kthread.c
--- oldtree/kernel/kthread.c	2006-03-08 18:48:02.948063250 +0000
+++ newtree/kernel/kthread.c	2006-03-08 15:22:33.241504750 +0000
@@ -26,6 +26,7 @@
 	/* Information passed to kthread() from keventd. */
 	int (*threadfn)(void *data);
 	void *data;
+	unsigned long freezer_flags;
 	struct completion started;
 
 	/* Result passed back to kthread_create() from keventd. */
@@ -87,6 +88,10 @@
 	/* By default we can run anywhere, unlike keventd. */
 	set_cpus_allowed(current, CPU_MASK_ALL);
 
+	/* Set our freezer flags */
+	current->flags &= ~PF_NOFREEZE;
+	current->flags |= (create->freezer_flags & PF_NOFREEZE);
+
 	/* OK, tell user we're spawned, wait for stop or wakeup */
 	__set_current_state(TASK_INTERRUPTIBLE);
 	complete(&create->started);
@@ -120,16 +125,18 @@
 	complete(&create->done);
 }
 
-struct task_struct *kthread_create(int (*threadfn)(void *data),
+struct task_struct *__kthread_create(int (*threadfn)(void *data),
 				   void *data,
+				   unsigned long freezer_flags,
 				   const char namefmt[],
-				   ...)
+				   va_list * args)
 {
 	struct kthread_create_info create;
 	DECLARE_WORK(work, keventd_create_kthread, &create);
 
 	create.threadfn = threadfn;
 	create.data = data;
+	create.freezer_flags = freezer_flags;
 	init_completion(&create.started);
 	init_completion(&create.done);
 
@@ -142,18 +149,89 @@
 		queue_work(helper_wq, &work);
 		wait_for_completion(&create.done);
 	}
-	if (!IS_ERR(create.result)) {
-		va_list args;
-		va_start(args, namefmt);
+	if (!IS_ERR(create.result))
 		vsnprintf(create.result->comm, sizeof(create.result->comm),
-			  namefmt, args);
-		va_end(args);
-	}
+			  namefmt, *args);
 
 	return create.result;
 }
+
+struct task_struct *kthread_create(int (*threadfn)(void *data),
+				   void *data,
+				   const char namefmt[], ...)
+{
+	struct task_struct * result;
+
+	va_list args;
+	va_start(args, namefmt);
+	result = __kthread_create(threadfn, data, 0, namefmt, &args);
+	va_end(args);
+	return result;
+}
+
 EXPORT_SYMBOL(kthread_create);
 
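+/*
+ * kthread_nofreeze_create: as kthread_create(), but the new thread is
+ * created with PF_NOFREEZE set, so the freezer leaves it running while
+ * other tasks are frozen.
+ */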
+struct task_struct *kthread_nofreeze_create(int (*threadfn)(void *data),
+				   void *data,
+				   const char namefmt[], ...)
+{
+	struct task_struct * result;
+
+	va_list args;
+	va_start(args, namefmt);
+	result = __kthread_create(threadfn, data, PF_NOFREEZE, namefmt, &args);
+	va_end(args);
+	return result;
+}
+
+EXPORT_SYMBOL(kthread_nofreeze_create);
+
+/**
+ * kthread_run: create and wake a thread.
+ * @threadfn: the function to run until signal_pending(current).
+ * @data: data ptr for @threadfn.
+ * @namefmt: printf-style name for the thread.
+ *
+ * Description: Convenient wrapper for kthread_create() followed by
+ * wake_up_process().  Returns the kthread, or ERR_PTR(-ENOMEM).
+ **/
+struct task_struct * kthread_run(int (*threadfn)(void *data),
+		void *data,
+		const char namefmt[], ...)
+{
+	struct task_struct *__k;
+	va_list args;
+
+	va_start(args, namefmt);
+	__k = __kthread_create(threadfn, data, 0, namefmt, &args);
+	va_end(args);
+
+	if (!IS_ERR(__k))
+		wake_up_process(__k);
+
+	return __k;
+}
+
+EXPORT_SYMBOL(kthread_run);
+
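+/*
+ * kthread_nofreeze_run: as kthread_run(), but the thread is created
+ * with PF_NOFREEZE set (see kthread_nofreeze_create() above).
+ */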
+struct task_struct * kthread_nofreeze_run(int (*threadfn)(void *data),
+		void *data,
+		const char namefmt[], ...)
+{
+	struct task_struct *__k;
+	va_list args;
+
+	va_start(args, namefmt);
+	__k = __kthread_create(threadfn, data, PF_NOFREEZE, namefmt, &args);
+	va_end(args);
+
+	if (!IS_ERR(__k))
+		wake_up_process(__k);
+
+	return __k;
+}
+EXPORT_SYMBOL(kthread_nofreeze_run);
+
 void kthread_bind(struct task_struct *k, unsigned int cpu)
 {
 	BUG_ON(k->state != TASK_INTERRUPTIBLE);
diff -urN oldtree/kernel/power/Kconfig newtree/kernel/power/Kconfig
--- oldtree/kernel/power/Kconfig	2006-03-08 18:48:02.956063750 +0000
+++ newtree/kernel/power/Kconfig	2006-03-08 15:22:33.245505000 +0000
@@ -98,3 +98,76 @@
 	bool
 	depends on HOTPLUG_CPU && X86 && PM
 	default y
+
+config SUSPEND_DEBUG_PAGEALLOC
+	bool
+	depends on DEBUG_PAGEALLOC && (SOFTWARE_SUSPEND || SUSPEND2)
+	default y
+
+config SUSPEND2_CRYPTO
+	bool
+	depends on SUSPEND2 && CRYPTO
+	default y
+
+menuconfig SUSPEND2
+	bool "Suspend2"
+	select DYN_PAGEFLAGS
+	depends on PM
+	select HOTPLUG_CPU if SMP
+	---help---
+	  Suspend2 is the 'new and improved' suspend support.
+	  
+	  See the Suspend2 home page (suspend2.net)
+	  for FAQs, HOWTOs and other documentation.
+
+	comment 'Image Storage (you need at least one writer)'
+		depends on SUSPEND2
+	
+	config SUSPEND2_FILEWRITER
+		bool '  File Writer'
+		depends on SUSPEND2
+		---help---
+		  This option enables support for storing an image in an
+		  ordinary file. This should work, but it is still being
+		  tested.
+
+	config SUSPEND2_SWAPWRITER
+		bool '  Swap Writer'
+		depends on SUSPEND2
+		select SWAP
+		---help---
+		  This option enables support for storing an image in your
+		  swap space.
+
+	comment 'General Options'
+		depends on SUSPEND2
+
+	config SUSPEND2_DEFAULT_RESUME2
+		string '  Default resume device name'
+		depends on SUSPEND2
+		---help---
+		  You normally need to add a resume2= parameter to your lilo.conf
+		  or equivalent. With this option properly set, the kernel has a
+		  default value to fall back on. No damage will be done if the
+		  value is invalid.
+
+	config SUSPEND2_CHECKSUMMING
+		bool '  Checksum images - developer option (SLOW!)'
+		depends on PM_DEBUG && SUSPEND2
+		---help---
+		  This option implements checksumming of images. It is intended
+		  as a development tool, not for general use.
+
+	config SUSPEND2_KEEP_IMAGE
+		bool '  Allow Keep Image Mode'
+		depends on SUSPEND2
+		---help---
+		  This option allows you to keep an image and reuse it. It is intended
+		  __ONLY__ for use with systems where all filesystems are mounted read-
+		  only (kiosks, for example). To use it, compile this option in and boot
+		  normally. Set the KEEP_IMAGE flag in /proc/suspend2 and suspend.
+		  When you resume, the image will not be removed. You will be unable to turn
+		  off swap partitions (assuming you are using the swap writer), but future
+		  suspends simply do a power-down. The image can be updated using the
+		  kernel command line parameter suspend_act= to turn off the keep image
+		  bit. Keep image mode is a little less user friendly on purpose - it
+		  should not be used without thought!
diff -urN oldtree/kernel/power/Makefile newtree/kernel/power/Makefile
--- oldtree/kernel/power/Makefile	2006-03-08 18:48:02.956063750 +0000
+++ newtree/kernel/power/Makefile	2006-03-08 15:31:56.812725750 +0000
@@ -5,8 +5,35 @@
 
 obj-y				:= main.o process.o console.o
 obj-$(CONFIG_PM_LEGACY)		+= pm.o
-obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o disk.o snapshot.o swap.o user.o
 
 obj-$(CONFIG_SUSPEND_SMP)	+= smp.o
 
 obj-$(CONFIG_MAGIC_SYSRQ)	+= poweroff.o
+
+CFLAGS_atomic_copy.o := -O0
+
+# Order is important for compression and encryption - we
+# compress before encrypting.
+
+suspend_core-objs := io.o pagedir.o prepare_image.o \
+              extent.o suspend.o modules.o \
+              pageflags.o ui.o proc.o \
+              power_off.o atomic_copy.o debug_pagealloc.o \
+              netlink.o
+
+ifeq ($(CONFIG_NET),y)
+suspend_core-objs += storage.o
+endif
+obj-$(CONFIG_SUSPEND2)                        += suspend_core.o
+obj-$(CONFIG_SUSPEND2_CRYPTO)         += compression.o encryption.o
+
+obj-$(CONFIG_SUSPEND2_SWAPWRITER)     += suspend_block_io.o suspend_swap.o
+obj-$(CONFIG_SUSPEND2_FILEWRITER)     += suspend_block_io.o suspend_file.o
+
+obj-$(CONFIG_SUSPEND2_CHECKSUMMING)   += suspend_checksums.o
+
+obj-$(CONFIG_SOFTWARE_SUSPEND)        += swsusp.o disk.o snapshot.o
diff -urN oldtree/kernel/power/atomic_copy.c newtree/kernel/power/atomic_copy.c
--- oldtree/kernel/power/atomic_copy.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/atomic_copy.c	2006-03-08 15:22:33.253505500 +0000
@@ -0,0 +1,473 @@
+/*
+ * kernel/power/atomic_copy.c
+ *
+ * Routines for making the atomic copy of pageset1 at suspend time, and
+ * for copying the original kernel back over the loaded image at resume.
+ */
+
+#include <linux/suspend.h>
+#include <linux/highmem.h>
+#include <linux/kthread.h>
+#include <asm/setup.h>
+#include <asm/param.h>
+#include <asm/thread_info.h>
+#include "suspend2_common.h"
+#include "io.h"
+#include "power_off.h"
+#include "version.h"
+#include "ui.h"
+#include "modules.h"
+#include "atomic_copy.h"
+#include "suspend2.h"
+#include "checksum.h"
+#include "pageflags.h"
+#include "debug_pagealloc.h"
+#include "storage.h"
+
+#include <asm/suspend2.h>
+
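+/*
+ * These are marked __nosavedata so that they survive the atomic copy:
+ * they are not overwritten when the original kernel image is copied
+ * back at resume time.
+ */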
+static volatile int state1 __nosavedata = 0;
+static volatile int state2 __nosavedata = 0;
+static volatile int state3 __nosavedata = 0;
+static volatile int io_speed_save[2][2] __nosavedata;
+
+static dyn_pageflags_t __nosavedata origmap;
+static dyn_pageflags_t __nosavedata copymap;
+static unsigned long __nosavedata origoffset;
+static unsigned long __nosavedata copyoffset;
+static int __nosavedata loop;
+static __nosavedata int o_zone_num, c_zone_num;
+static __nosavedata int is_resuming;
+
+__nosavedata char resume_commandline[COMMAND_LINE_SIZE];
+
+static atomic_t atomic_copy_hold;
+static atomic_t restore_thread_ready;
+
+suspend2_saved_context_t suspend2_saved_context;	/* temporary storage */
+
+struct zone_data {
+	unsigned long start_pfn;
+	unsigned long end_pfn;
+	int is_highmem;
+};
+
+static __nosavedata struct zone_data *zone_nosave;
+static __nosavedata int num_zones;
+
+/*
+ * Zone information might be overwritten during the copy back,
+ * so we copy the fields we need to a non-conflicting page and
+ * use it.
+ */
+static void init_nosave_zone_table(void)
+{
+	struct zone *zone;
+	
+	zone_nosave = (struct zone_data *) suspend_get_nonconflicting_pages(0);
+
+	BUG_ON(!zone_nosave);
+
+	for_each_zone(zone) {
+		if (zone->spanned_pages) {
+			zone_nosave[num_zones].start_pfn = zone->zone_start_pfn;
+			zone_nosave[num_zones].end_pfn = zone->zone_start_pfn +
+				zone->spanned_pages - 1;
+			zone_nosave[num_zones].is_highmem = is_highmem(zone);
+		}
+		num_zones++;
+	}
+}
+
+/*
+ * For Suspend2, where this all has to be inlined: return the pfn of the
+ * next bit set in the bitmap after 'counter' (or of the first set bit,
+ * if counter is -1), walking the zones recorded in zone_nosave and
+ * skipping whole words of zero bits. Returns max_pfn when the bitmap
+ * is exhausted.
+ */
+static inline unsigned long __get_next_bit_on(dyn_pageflags_t bitmap, int *zone_num, long counter)
+{
+	unsigned long *ul_ptr = NULL;
+	int reset_ul_ptr = 1;
+	BUG_ON(counter == max_pfn);
+
+	if (counter == -1) {
+		*zone_num = 0;
+
+		/* 
+		 * Test the end because the start can validly
+		 * be zero.
+		 */
+		while (!zone_nosave[*zone_num].end_pfn)
+			(*zone_num)++;
+		counter = zone_nosave[*zone_num].start_pfn - 1;
+	}
+
+	do {
+		counter++;
+		if (counter > zone_nosave[*zone_num].end_pfn) {
+			(*zone_num)++;
+			while (!zone_nosave[*zone_num].end_pfn && *zone_num < num_zones)
+				(*zone_num)++;
+			
+			if (*zone_num == num_zones)
+				return max_pfn;
+			counter = zone_nosave[*zone_num].start_pfn;
+			reset_ul_ptr = 1;
+		} else
+			if (!(counter & BIT_NUM_MASK))
+				reset_ul_ptr = 1;
+		if (reset_ul_ptr) {
+			reset_ul_ptr = 0;
+			ul_ptr = PAGE_UL_PTR(bitmap, *zone_num,
+					(counter - zone_nosave[*zone_num].start_pfn));
+			if (!*ul_ptr) {
+				counter += BIT_NUM_MASK - 1;
+				continue;
+			}
+		}
+	} while((counter < max_pfn) && !test_bit(PAGEBIT(counter), ul_ptr));
+	return counter;
+}
+
+/**
+ * copyback_prepare
+ * Functionality   : Preparatory steps for copying the original kernel back.
+ * Called From     : do_suspend2_lowlevel
+ **/
+
+static void copyback_prepare(void)
+{
+	int loop;
+
+	state1 = suspend_action;
+	state2 = suspend_debug_state;
+	state3 = console_loglevel;
+	for (loop = 0; loop < 4; loop++)
+		io_speed_save[loop/2][loop%2] = 
+			suspend_io_time[loop/2][loop%2];
+
+	init_nosave_zone_table();
+	
+	memcpy(resume_commandline, saved_command_line, COMMAND_LINE_SIZE);
+
+	suspend_map_atomic_copy_pages();
+
+	suspend_deactivate_storage(1);
+
+	/* Arch specific preparation */
+	suspend2_arch_pre_copyback();
+
+	device_suspend(PMSG_FREEZE);
+	local_irq_disable(); /* irqs might have been re-enabled on us by buggy drivers */
+
+	device_power_down(PMSG_FREEZE);
+	
+	barrier();
+	mb();
+}
+
+/*
+ * copyback_post
+ * Functionality   : Steps taken after copying back the original kernel at
+ *                   resume.
+ * Key Assumptions : Will be able to read back secondary pagedir (if 
+ *                   applicable).
+ * Called From     : do_suspend2_lowlevel
+ */
+
+static void copyback_post(void)
+{
+	int loop;
+
+	/* Arch specific code */
+	suspend2_arch_post_copyback();
+
+	suspend_action = state1;
+	suspend_debug_state = state2;
+	console_loglevel = state3;
+
+	for (loop = 0; loop < 4; loop++)
+		suspend_io_time[loop/2][loop%2] =
+			io_speed_save[loop/2][loop%2];
+
+	set_suspend_state(SUSPEND_NOW_RESUMING);
+	set_suspend_state(SUSPEND_PAGESET2_NOT_LOADED);
+
+	suspend_unmap_atomic_copy_pages();
+
+	local_irq_disable();
+	device_power_up();
+	local_irq_enable();
+
+	device_resume();
+
+	if (pm_ops && pm_ops->finish && suspend_powerdown_method > 3)
+		pm_ops->finish(suspend_powerdown_method);
+
+	if (suspend_activate_storage(1))
+		panic("Failed to reactivate our storage.");
+
+	userui_redraw();
+
+	check_shift_keys(1, "About to reload secondary pagedir.");
+
+	read_pageset2(0);
+	clear_suspend_state(SUSPEND_PAGESET2_NOT_LOADED);
+	
+	suspend_prepare_status(DONT_CLEAR_BAR, "Cleaning up...");
+}
+
+
+/*
+ * suspend_pre_copy
+ * Functionality   : Steps taken prior to saving CPU state and the image
+ *                   itself.
+ * Called From     : do_suspend2_lowlevel
+ */
+
+static void suspend_pre_copy(void)
+{
+	suspend2_arch_pre_copy();
+
+	device_suspend(PMSG_FREEZE);
+
+	mb();
+	barrier();
+
+	local_irq_disable();
+
+	device_power_down(PMSG_FREEZE);
+}
+
+/*
+ * suspend_post_copy
+ * Functionality   : Steps taken after saving CPU state to save the
+ *                   image and powerdown/reboot or recover on failure.
+ * Key Assumptions : save_image returns zero on success; otherwise we need to
+ *                   clean up and exit. The state on exiting this routine 
+ *                   should be essentially the same as if we have suspended,
+ *                   resumed and reached the end of copyback_post.
+ * Called From     : do_suspend2_lowlevel
+ */
+extern void suspend_power_down(void);
+
+static void suspend_post_copy(void)
+{
+	suspend2_arch_post_copy();
+
+	if (!save_image_part1()) {
+		int temp_result;
+
+		suspend_power_down();
+
+		temp_result = read_pageset2(1);
+
+		/* If that failed, we're sunk. Panic! */
+		if (temp_result)
+			panic("Attempt to reload pagedir 2 failed. Try rebooting.");
+	}
+
+	if (!test_result_state(SUSPEND_ABORT_REQUESTED) &&
+	    !test_action_state(SUSPEND_TEST_FILTER_SPEED) &&
+	    !test_action_state(SUSPEND_TEST_BIO) &&
+	    suspend_powerdown_method != PM_SUSPEND_MEM)
+		printk(KERN_EMERG name_suspend
+			"Suspend failed, trying to recover...\n");
+	barrier();
+	mb();
+}
+
+/*
+ * copyback_low
+ *
+ * Copy every lowmem page of the original kernel (pageset1) back from
+ * its atomic-copy location, walking the original and copy bitmaps in
+ * lockstep. Highmem pages are handled afterwards by copyback_high().
+ */
+
+static inline void copyback_low(void)
+{
+	unsigned long *origpage;
+	unsigned long *copypage;
+
+	o_zone_num = 0;
+	c_zone_num = 0;
+
+	origmap = pageset1_map;
+	copymap = pageset1_copy_map;
+
+	origoffset = __get_next_bit_on(origmap, &o_zone_num, -1);
+	copyoffset = __get_next_bit_on(copymap, &c_zone_num, -1);
+	
+	while (origoffset < max_pfn) {
+		if (!zone_nosave[o_zone_num].is_highmem) {
+			origpage = (unsigned long *) __va(origoffset << PAGE_SHIFT);
+			copypage = (unsigned long *) __va(copyoffset << PAGE_SHIFT);
+		
+			loop = (PAGE_SIZE / sizeof(unsigned long)) - 1;
+		
+			while (loop >= 0) {
+				*(origpage + loop) = *(copypage + loop);
+				loop--;
+			}
+		}
+		
+		origoffset = __get_next_bit_on(origmap, &o_zone_num, origoffset);
+		copyoffset = __get_next_bit_on(copymap, &c_zone_num, copyoffset);
+	}
+}
+
+/*
+ * copyback_high
+ *
+ * Copy the remaining (highmem) pages of the original kernel back,
+ * mapping each destination page with kmap_atomic. Runs after the
+ * saved processor context has been restored.
+ */
+static void copyback_high(void)
+{
+	unsigned long *origpage;
+	unsigned long *copypage;
+
+	origoffset = get_next_bit_on(origmap, -1);
+	copyoffset = get_next_bit_on(copymap, -1);
+
+	while (origoffset < max_pfn) {
+		if (PageHighMem(pfn_to_page(origoffset))) {
+			origpage = (unsigned long *) kmap_atomic(pfn_to_page(origoffset), KM_USER1);
+			copypage = (unsigned long *) __va(copyoffset << PAGE_SHIFT);
+
+			memcpy(origpage, copypage, PAGE_SIZE);
+
+			kunmap_atomic(origpage, KM_USER1);
+		}
+		
+		origoffset = get_next_bit_on(origmap, origoffset);
+		copyoffset = get_next_bit_on(copymap, copyoffset);
+	}
+}
+
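+/*
+ * do_suspend2_lowlevel
+ *
+ * The core of the atomic copy. At suspend time (resume == 0), quiesce
+ * devices, save the processor context, and write out the image. At
+ * resume time (resume == 1), copy the original kernel back over the
+ * loaded image and restore the saved processor context.
+ */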
+void do_suspend2_lowlevel(int resume)
+{
+	is_resuming = resume;
+
+	if (resume) {
+		copyback_prepare();
+
+		suspend2_arch_save_processor_context();
+
+		copyback_low();
+
+		suspend2_arch_restore_processor_context();
+	} else {
+		suspend_pre_copy();
+
+		suspend2_arch_save_processor_context();
+	}
+	
+	if (is_resuming) {
+		suspend2_arch_flush_caches();
+
+		/* Now we are running with our old stack, and with registers copied
+		 * from suspend time. Let's copy back those remaining highmem pages. */
+		copyback_high();
+		suspend2_arch_flush_caches();
+
+		touch_softlockup_watchdog();
+		
+		suspend_checksum_print_differences();
+
+		copyback_post();
+		
+	} else {
+		suspend_post_copy();		/* If everything goes okay, this function does not return */
+	}
+}
+
+/* suspend_copy_pageset1
+ *
+ * Description:	Make the atomic copy of pageset1. We can't use copy_page (as we
+ * 		once did) because we can't be sure what side effects it has. On
+ * 		my old Duron, with 3DNOW, kernel_fpu_begin increments preempt
+ * 		count, making our preempt count at resume time 4 instead of 3.
+ * 		
+ * 		We don't want to call kmap_atomic unconditionally because it has
+ * 		the side effect of incrementing the preempt count, which will
+ * 		leave it one too high post resume (the page containing the
+ * 		preempt count will be copied after it's incremented - which is
+ * 		essentially the same problem).
+ */
+
+void suspend_copy_pageset1(void)
+{
+	unsigned long i, source_index, dest_index;
+
+	source_index = get_next_bit_on(pageset1_map, -1);
+	dest_index = get_next_bit_on(pageset1_copy_map, -1);
+
+	for (i = 0; i < pagedir1.pageset_size; i++) {
+		unsigned long *origvirt, *copyvirt;
+		struct page *origpage;
+		int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1;
+
+		origpage = pfn_to_page(source_index);
+		
+		copyvirt = (unsigned long *) page_address(pfn_to_page(dest_index));
+
+	       	if (PageHighMem(origpage))
+			origvirt = kmap_atomic(origpage, KM_USER1);
+		else
+			origvirt = page_address(origpage);
+
+		while (loop >= 0) {
+			*(copyvirt + loop) = *(origvirt + loop);
+			loop--;
+		}
+		
+		if (PageHighMem(origpage))
+			kunmap_atomic(origvirt, KM_USER1);
+		
+		source_index = get_next_bit_on(pageset1_map, source_index);
+		dest_index = get_next_bit_on(pageset1_copy_map, dest_index);
+	}
+}
+
+int __suspend_atomic_restore(void *data)
+{
+	struct page *my_thread_info = virt_to_page(current->thread_info);
+
+	BUG_ON(PagePageset1(my_thread_info));
+	BUG_ON(THREAD_SIZE > PAGE_SIZE && PagePageset1(++my_thread_info));
+
+	atomic_set(&restore_thread_ready, 1);
+
+	while (atomic_read(&atomic_copy_hold))
+		yield();
+
+	suspend_prepare_status(DONT_CLEAR_BAR,	"Copying original kernel back");
+	
+	/* 
+	 * If you're hitting this BUG_ON, you have a process that's
+	 * not freezing which is started prior to this.
+	 */
+	BUG_ON(freeze_processes());
+
+	do_suspend2_lowlevel(1);
+
+	printk(KERN_EMERG "Returned from do_suspend2_lowlevel when resuming?!\n");
+	BUG();
+	
+	return 0;
+}
+
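+/*
+ * suspend_atomic_restore
+ *
+ * Hand the copy-back over to a freshly created "kcopyback" thread,
+ * whose stack is verified (in __suspend_atomic_restore above) not to
+ * lie in pageset1, so it won't be overwritten by the copy. Once the
+ * new thread signals readiness we release it, then loop in
+ * try_to_freeze() so this task can itself be frozen before the copy
+ * back begins.
+ */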
+void suspend_atomic_restore(void)
+{
+	struct task_struct *work_thread;
+
+	disable_nonboot_cpus();
+
+	yield();
+
+	set_suspend_state(SUSPEND_FORK_COPYBACK_THREAD);
+	BUG_ON(atomic_read(&restore_thread_ready));
+
+	atomic_set(&atomic_copy_hold, 1);
+
+	/* Now start the new thread */
+	work_thread = kthread_run(__suspend_atomic_restore, 0, "kcopyback");
+	BUG_ON(IS_ERR(work_thread));
+
+	while (!atomic_read(&restore_thread_ready))
+		yield();
+
+	atomic_set(&atomic_copy_hold, 0);
+
+	while (1) {
+		try_to_freeze();
+		yield();
+	}
+}
diff -urN oldtree/kernel/power/atomic_copy.h newtree/kernel/power/atomic_copy.h
--- oldtree/kernel/power/atomic_copy.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/atomic_copy.h	2006-03-08 15:22:33.253505500 +0000
@@ -0,0 +1,4 @@
+extern void move_stack_to_nonconflicing_area(void);
+extern int save_image_part1(void);
+extern void suspend_atomic_restore(void);
+
diff -urN oldtree/kernel/power/block_io.h newtree/kernel/power/block_io.h
--- oldtree/kernel/power/block_io.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/block_io.h	2006-03-08 15:22:33.253505500 +0000
@@ -0,0 +1,76 @@
+/*
+ * block_io.h
+ *
+ * Copyright 2004-2005 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * Distributed under GPLv2.
+ *
+ * This file contains declarations for functions exported from
+ * block_io.c, which contains low level io functions.
+ */
+
+#include <linux/buffer_head.h>
+#include "extent.h"
+
+/*
+ * submit_params
+ *
+ * The structure we use for tracking submitted I/O.
+ */
+struct submit_params {
+	swp_entry_t swap_address;
+	struct page *page;
+	struct block_device *dev;
+	sector_t block[MAX_BUF_PER_PAGE];
+	int readahead_index;
+	struct submit_params *next;
+	int printme;
+};
+
+struct suspend_bdev_info {
+	struct block_device *bdev;
+	dev_t dev_t;
+	int bmap_shift;
+	int blocks_per_page;
+};
+
+/* 
+ * Our exported interface so the swapwriter and filewriter don't
+ * need these functions duplicated.
+ */
+struct suspend_bio_ops {
+	int (*submit_io) (int rw, 
+		struct submit_params *submit_info, int syncio);
+	int (*bdev_page_io) (int rw, struct block_device *bdev, long pos,
+			struct page *page);
+	int (*rw_page) (int rw, struct page *page, int readahead_index,
+			int sync);
+	void (*wait_on_readahead) (int readahead_index);
+	void (*check_io_stats) (void);
+	void (*reset_io_stats) (void);
+	void (*finish_all_io) (void);
+	int (*prepare_readahead) (int index);
+	void (*cleanup_readahead) (int index);
+	struct page ** readahead_pages;
+	int (*readahead_ready) (int readahead_index);
+	int *need_extra_next;
+	int (*forward_one_page) (void);
+	void (*set_devinfo) (struct suspend_bdev_info *info);
+	int (*read_init) (int stream_number);
+	int (*read_chunk) (struct page *buffer_page, int sync);
+	int (*read_cleanup) (void);
+	int (*write_init) (int stream_number);
+	int (*write_chunk) (struct page *buffer_page);
+	int (*write_cleanup) (void);
+	int (*read_header_chunk) (char *buffer, int buffer_size);
+	int (*write_header_chunk) (char *buffer, int buffer_size);
+	int (*write_header_chunk_finish) (void);
+};
+
+extern struct suspend_bio_ops suspend_bio_ops;
+
+extern char *suspend_writer_buffer;
+extern int suspend_writer_buffer_posn;
+extern int suspend_read_fd;
+extern struct extent_iterate_saved_state suspend_writer_posn_save[3];
+extern struct extent_iterate_state suspend_writer_posn;
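+
+/*
+ * A minimal usage sketch, assuming a hypothetical writer module wants
+ * to read a single page synchronously from its signature block (the
+ * bdev and block number here are made up for illustration):
+ *
+ *	static int example_read_sig_page(struct block_device *bdev,
+ *					 long block, struct page *page)
+ *	{
+ *		return suspend_bio_ops.bdev_page_io(READ, bdev, block, page);
+ *	}
+ */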
diff -urN oldtree/kernel/power/checksum.h newtree/kernel/power/checksum.h
--- oldtree/kernel/power/checksum.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/checksum.h	2006-03-08 15:22:33.257505750 +0000
@@ -0,0 +1,11 @@
+#ifdef CONFIG_SUSPEND2_CHECKSUMS
+extern void suspend_verify_checksums(void);
+extern void suspend_checksum_calculate_checksums(void);
+extern void suspend_checksum_print_differences(void);
+extern int suspend_allocate_checksum_pages(void);
+#else
+static inline void suspend_verify_checksums(void) { }
+static inline void suspend_checksum_calculate_checksums(void) { }
+static inline void suspend_checksum_print_differences(void) { }
+static inline int suspend_allocate_checksum_pages(void) { return 0; }
+#endif
diff -urN oldtree/kernel/power/compression.c newtree/kernel/power/compression.c
--- oldtree/kernel/power/compression.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/compression.c	2006-03-08 15:22:33.257505750 +0000
@@ -0,0 +1,638 @@
+/*
+ * kernel/power/suspend_core/compression.c
+ *
+ * Copyright (C) 2003-2005 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * This file is released under the GPLv2.
+ *
+ * This file contains data compression routines for suspend,
+ * using cryptoapi compression transforms.
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/module.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <linux/crypto.h>
+
+#include "suspend2.h"
+#include "modules.h"
+#include "proc.h"
+#include "suspend2_common.h"
+#include "io.h"
+
+#define S2C_WRITE 0
+#define S2C_READ 1
+
+static int suspend_expected_compression = 0;
+
+static struct suspend_module_ops suspend_compression_ops;
+static struct suspend_module_ops *next_driver;
+
+static char suspend_compressor_name[32];
+static struct crypto_tfm *suspend_compressor_transform;
+
+static u8 *local_buffer = NULL;
+static u8 *page_buffer = NULL;
+static unsigned int bufofs;
+
+static int position = 0;
+
+/* ---- Local buffer management ---- */
+
+/* allocate_local_buffer
+ *
+ * Description:	Allocates a page of memory for buffering output.
+ * Returns:	Int: Zero if successful, -ENOMEM otherwise.
+ */
+
+static int allocate_local_buffer(void)
+{
+	if (!local_buffer) {
+		local_buffer = (u8 *) get_zeroed_page(GFP_ATOMIC);
+	
+		if (!local_buffer) {
+			printk(KERN_ERR
+				"Failed to allocate the local buffer for "
+				"suspend2 compression driver.\n");
+			return -ENOMEM;
+		}
+	}
+
+	if (!page_buffer) {
+		page_buffer = (u8 *) get_zeroed_page(GFP_ATOMIC);
+	
+		if (!page_buffer) {
+			printk(KERN_ERR
+				"Failed to allocate the page buffer for "
+				"suspend2 compression driver.\n");
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+/* free_local_buffer
+ *
+ * Description:	Frees memory allocated for buffering output.
+ */
+
+static inline void free_local_buffer(void)
+{
+	if (local_buffer)
+		free_page((unsigned long) local_buffer);
+
+	local_buffer = NULL;
+
+	if (page_buffer)
+		free_page((unsigned long) page_buffer);
+
+	page_buffer = NULL;
+}
+
+/* suspend_crypto_cleanup
+ *
+ * Description:	Frees memory allocated for our labours.
+ */
+
+static void suspend_crypto_cleanup(void)
+{
+	if (suspend_compressor_transform) {
+		crypto_free_tfm(suspend_compressor_transform);
+		suspend_compressor_transform = NULL;
+	}
+}
+
+/* suspend_compress_crypto_prepare
+ *
+ * Description:	Prepare to do some work by allocating buffers and transforms.
+ * Returns:	Int: Zero if successful, -ENOMEM otherwise.
+ */
+
+static int suspend_compress_crypto_prepare(int mode)
+{
+	if (!*suspend_compressor_name) {
+		printk("Suspend2: Compression enabled but no compressor name set.\n");
+		return 1;
+	}
+
+	if (!(suspend_compressor_transform = crypto_alloc_tfm(suspend_compressor_name, 0))) {
+		printk("Suspend2: Failed to initialise the %s compression transform.\n",
+				suspend_compressor_name);
+		return 1;
+	}
+
+	return 0;
+}
+
+/* ---- Exported functions ---- */
+
+/* write_init()
+ *
+ * Description:	Allocate buffers and prepare to compress data.
+ * Arguments:	Stream_number:	Ignored.
+ * Returns:	Zero on success, -ENOMEM if unable to allocate buffers.
+ */
+
+static int suspend_compress_write_init(int stream_number)
+{
+	int result;
+	
+	next_driver = suspend_get_next_filter(&suspend_compression_ops);
+
+	if (!next_driver) {
+		printk("Compression Driver: Argh! No one wants my output!\n");
+		return -ECHILD;
+	}
+
+	if ((result = suspend_compress_crypto_prepare(S2C_WRITE))) {
+		return result;
+	}
+	
+	if ((result = allocate_local_buffer()))
+		return result;
+
+	/* Only reset the stats if starting to write an image */
+	if (stream_number == 2)
+		bytes_in = bytes_out = 0;
+	
+	bufofs = 0;
+
+	position = 0;
+
+	return 0;
+}
+
+/* suspend_compress_write()
+ *
+ * Description:	Helper function for write_chunk. Write the compressed data.
+ * Arguments:	u8*:		Output buffer to be written.
+ * 		unsigned int:	Length of buffer.
+ * Return:	int:		Result to be passed back to caller.
+ */
+
+static int suspend_compress_write (u8 *buffer, unsigned int len)
+{
+	int ret;
+
+	bytes_out += len;
+
+	while (len + bufofs > PAGE_SIZE) {
+		unsigned int chunk = PAGE_SIZE - bufofs;
+		memcpy (local_buffer + bufofs, buffer, chunk);
+		buffer += chunk;
+		len -= chunk;
+		bufofs = 0;
+		if ((ret = next_driver->ops.filter.write_chunk(virt_to_page(local_buffer))) < 0)
+			return ret;
+	}
+	memcpy (local_buffer + bufofs, buffer, len);
+	bufofs += len;
+	return 0;
+}
+
+/* suspend_compress_write_chunk()
+ *
+ * Description:	Compress a page of data, buffering output and passing on
+ * 		filled pages to the next module in the pipeline.
+ * Arguments:	Buffer_page:	Pointer to a buffer of size PAGE_SIZE, 
+ * 				containing data to be compressed.
+ * Returns:	0 on success. Otherwise the error is that returned by later
+ * 		modules, -ECHILD if we have a broken pipeline or -EIO if
+ * 		the compressor errs.
+ */
+
+static int suspend_compress_write_chunk(struct page *buffer_page)
+{
+	int ret; 
+	unsigned int len;
+	u16 len_written;
+	char *buffer_start;
+	
+	if (!suspend_compressor_transform)
+		return next_driver->ops.filter.write_chunk(buffer_page);
+
+	buffer_start = kmap(buffer_page);
+
+	bytes_in += PAGE_SIZE;
+
+	len = PAGE_SIZE;
+
+	ret = crypto_comp_compress(suspend_compressor_transform,
+			buffer_start, PAGE_SIZE,
+			page_buffer, &len);
+	
+	if (ret) {
+		printk("Compression failed.\n");
+		goto failure;
+	}
+	
+	len_written = (u16) len;
+		
+	if ((ret = suspend_compress_write((u8 *)&len_written, 2)) >= 0) {
+		if ((ret = suspend_compress_write((u8 *) &position, sizeof(position)))) {
+			ret = -EIO;
+			goto failure;
+		}
+		if (len < PAGE_SIZE) { // some compression
+			position += len;
+			ret = suspend_compress_write(page_buffer, len);
+		} else {
+			ret = suspend_compress_write(buffer_start, PAGE_SIZE);
+			position += PAGE_SIZE;
+		}
+	}
+	position += 2 + sizeof(int);
+
+
+failure:
+	kunmap(buffer_page);
+	return ret;
+}
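+
+/*
+ * For reference, each record emitted by suspend_compress_write_chunk()
+ * has the following layout in the output stream:
+ *
+ *	u16	length of the payload (PAGE_SIZE means stored uncompressed)
+ *	int	running byte offset of this record (consistency check)
+ *	u8[]	payload ('length' bytes, or PAGE_SIZE if uncompressed)
+ *
+ * suspend_compress_read_chunk() below depends on exactly this framing.
+ */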
+
+/* write_cleanup()
+ *
+ * Description: Write unflushed data and free workspace.
+ * Returns:	Result of writing last page.
+ */
+
+static int suspend_compress_write_cleanup(void)
+{
+	int ret = 0;
+	
+	if (suspend_compressor_transform)
+		ret = next_driver->ops.filter.write_chunk(virt_to_page(local_buffer));
+
+	suspend_crypto_cleanup();
+	free_local_buffer();
+
+	return ret;
+}
+
+/* read_init()
+ *
+ * Description:	Prepare to read a new stream of data.
+ * Arguments:	int: Section of image about to be read.
+ * Returns:	int: Zero on success, error number otherwise.
+ */
+
+static int suspend_compress_read_init(int stream_number)
+{
+	int result;
+
+	next_driver = suspend_get_next_filter(&suspend_compression_ops);
+
+	if (!next_driver) {
+		printk("Compression Driver: Argh! No one wants "
+				"to feed me data!");
+		return -ECHILD;
+	}
+	
+	if ((result = suspend_compress_crypto_prepare(S2C_READ)))
+		return result;
+	
+	if ((result = allocate_local_buffer()))
+		return result;
+
+	bufofs = PAGE_SIZE;
+
+	position = 0;
+
+	return 0;
+}
+
+/* suspend_compress_read()
+ *
+ * Description:	Read data into compression buffer.
+ * Arguments:	u8 *:		Address of the buffer.
+ * 		unsigned int:	Length
+ * Returns:	int:		Result of reading the image chunk.
+ */
+
+static int suspend_compress_read (u8 *buffer, unsigned int len)
+{
+	int ret;
+
+	while (len + bufofs > PAGE_SIZE) {
+		unsigned int chunk = PAGE_SIZE - bufofs;
+		memcpy(buffer, local_buffer + bufofs, chunk);
+		buffer += chunk;
+		len -= chunk;
+		bufofs = 0;
+		if ((ret = next_driver->ops.filter.read_chunk(
+				virt_to_page(local_buffer), SUSPEND_SYNC)) < 0) {
+			return ret;
+		}
+	}
+	memcpy (buffer, local_buffer + bufofs, len);
+	bufofs += len;
+	return 0;
+}
+
+/* suspend_compress_read_chunk()
+ *
+ * Description:	Retrieve data from later modules and decompress it until the
+ * 		input buffer is filled.
+ * Arguments:	Buffer_start: 	Pointer to a buffer of size PAGE_SIZE.
+ * 		Sync:		Whether the previous module (or core) wants its
+ * 				data synchronously.
+ * Returns:	Zero if successful. Error condition from me or from downstream
+ * 		on failure.
+ */
+
+static int suspend_compress_read_chunk(struct page *buffer_page, int sync)
+{
+	int ret, position_saved; 
+	unsigned int len;
+	u16 len_written;
+	char *buffer_start;
+
+	if (!suspend_compressor_transform)
+		return next_driver->ops.filter.read_chunk(buffer_page, SUSPEND_ASYNC);
+
+	/* 
+	 * All our reads must be synchronous - we can't decompress
+	 * data that hasn't been read yet.
+	 */
+
+	buffer_start = kmap(buffer_page);
+
+	if ((ret = suspend_compress_read ((u8 *)&len_written, 2)) >= 0) {
+		len = (unsigned int) len_written;
+		ret = suspend_compress_read((u8 *) &position_saved, sizeof(position_saved));
+		if (ret)
+			goto out;
+
+		if (position != position_saved) {
+			printk("Position saved (%d) != position I'm at now (%d).\n",
+					position_saved, position);
+			BUG();
+		}
+		if (len >= PAGE_SIZE) { // uncompressed
+			ret = suspend_compress_read(buffer_start, PAGE_SIZE);
+			if (ret)
+				goto out;
+
+			position += PAGE_SIZE;
+		} else { // compressed
+			if ((ret = suspend_compress_read(page_buffer, len)) >= 0) {
+				unsigned int outlen = PAGE_SIZE;
+				/* Important note.
+				 *
+				 * For Deflate, decompression return values may represent
+				 * errors. Deflate complains when everything is alright, so
+				 * we ignore the errors unless the number of output bytes is
+				 * not PAGE_SIZE.
+				 */
+				crypto_comp_decompress(suspend_compressor_transform, 
+						page_buffer, len,
+						buffer_start, &outlen);
+				if (outlen != PAGE_SIZE) {
+					printk("Decompression yielded %u bytes instead of %lu.\n", outlen, PAGE_SIZE);
+					ret = -EIO;
+				} else
+					ret = 0;
+			}
+			position += len;
+		}
+		position += 2 + sizeof(int);
+	} else
+		printk("Compress_read returned %d.\n", ret);
+out:
+	kunmap(buffer_page);
+	return ret;
+}
+
+/* read_cleanup()
+ *
+ * Description:	Clean up after reading part or all of a stream of data.
+ * Returns:	int: Always zero. Never fails.
+ */
+
+static int suspend_compress_read_cleanup(void)
+{ 
+	suspend_crypto_cleanup();
+	free_local_buffer();
+	return 0;
+}
+
+/* suspend_compress_print_debug_stats
+ *
+ * Description:	Print information to be recorded for debugging purposes into a
+ * 		buffer.
+ * Arguments:	buffer: Pointer to a buffer into which the debug info will be
+ * 			printed.
+ * 		size:	Size of the buffer.
+ * Returns:	Number of characters written to the buffer.
+ */
+
+static int suspend_compress_print_debug_stats(char *buffer, int size)
+{
+	int pages_in = bytes_in >> PAGE_SHIFT, 
+		pages_out = bytes_out >> PAGE_SHIFT;
+	int len;
+	
+	/* Output the compression ratio achieved. */
+	len = snprintf_used(buffer, size, "- Compressor %s enabled.\n",
+			suspend_compressor_name);
+	if (pages_in)
+		len += snprintf_used(buffer + len, size - len,
+		  "  Compressed %lu bytes into %lu (%d percent compression).\n",
+		  bytes_in, bytes_out, (pages_in - pages_out) * 100 / pages_in);
+	return len;
+}
+
+/* compression_memory_needed
+ *
+ * Description:	Tell the caller how much memory we need to operate during
+ * 		suspend/resume.
+ * Returns:	Unsigned long. Maximum number of bytes of memory required for
+ * 		operation.
+ */
+
+static unsigned long suspend_compress_memory_needed(void)
+{
+	return PAGE_SIZE;
+}
+
+static unsigned long suspend_compress_storage_needed(void)
+{
+	return 2 * sizeof(unsigned long) + sizeof(int);
+}
+
+/* suspend_compress_save_config_info
+ *
+ * Description:	Save information needed when reloading the image at resume time.
+ * Arguments:	Buffer:		Pointer to a buffer of size PAGE_SIZE.
+ * Returns:	Number of bytes used for saving our data.
+ */
+
+static int suspend_compress_save_config_info(char *buffer)
+{
+	int namelen = strlen(suspend_compressor_name) + 1;
+	int total_len;
+	
+	*((unsigned long *) buffer) = bytes_in;
+	*((unsigned long *) (buffer + 1 * sizeof(unsigned long))) = bytes_out;
+	*((unsigned long *) (buffer + 2 * sizeof(unsigned long))) = suspend_expected_compression;
+	*((unsigned long *) (buffer + 3 * sizeof(unsigned long))) = namelen;
+	strncpy(buffer + 4 * sizeof(unsigned long), suspend_compressor_name, namelen);
+	total_len = 4 * sizeof(unsigned long) + namelen;
+	return total_len;
+}
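+
+/*
+ * Layout of the config_info buffer written above and re-read by
+ * suspend_compress_load_config_info() below:
+ *
+ *	unsigned long	bytes_in
+ *	unsigned long	bytes_out
+ *	unsigned long	suspend_expected_compression
+ *	unsigned long	namelen (strlen of the name plus trailing NUL)
+ *	char[namelen]	suspend_compressor_name
+ */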
+
+/* suspend_compress_load_config_info
+ *
+ * Description:	Reload information needed for decompressing the image at 
+ * 		resume time.
+ * Arguments:	Buffer:		Pointer to the start of the data.
+ *		Size:		Number of bytes that were saved.
+ */
+
+static void suspend_compress_load_config_info(char *buffer, int size)
+{
+	int namelen;
+	
+	bytes_in = *((unsigned long *) buffer);
+	bytes_out = *((unsigned long *) (buffer + 1 * sizeof(unsigned long)));
+	suspend_expected_compression = *((unsigned long *) (buffer + 2 * sizeof(unsigned long)));
+	namelen = *((unsigned long *) (buffer + 3 * sizeof(unsigned long)));
+	strncpy(suspend_compressor_name, buffer + 4 * sizeof(unsigned long), namelen);
+	return;
+}
+
+/* suspend_expected_compression_ratio
+ * 
+ * Description:	Returns the expected ratio between data passed into this module
+ * 		and the amount of data output when writing.
+ * Returns:	100 if the module is disabled. Otherwise the value set by the
+ * 		user via our proc entry.
+ */
+
+int suspend_expected_compression_ratio(void)
+{
+	if (suspend_compression_ops.disabled)
+		return 100;
+	else
+		return 100 - suspend_expected_compression;
+}
+
+static void suspend_compressor_disable_if_empty(void)
+{
+	suspend_compression_ops.disabled = !(*suspend_compressor_name);
+}
+
+static int suspend_compress_initialise(int starting_cycle)
+{
+	if (starting_cycle)
+		suspend_compressor_disable_if_empty();
+
+	return 0;
+}
+/*
+ * data for our proc entries.
+ */
+
+static struct suspend_proc_data proc_params[] = {
+	{
+		.filename			= "expected_compression",
+		.permissions			= PROC_RW,
+		.type				= SUSPEND_PROC_DATA_INTEGER,
+		.data = {
+			.integer = {
+				.variable	= &suspend_expected_compression,
+				.minimum	= 0,
+				.maximum	= 99,
+			}
+		}
+	},
+
+	{
+		.filename			= "disable_compression",
+		.permissions			= PROC_RW,
+		.type				= SUSPEND_PROC_DATA_INTEGER,
+		.data = {
+			.integer = {
+				.variable	= &suspend_compression_ops.disabled,
+				.minimum	= 0,
+				.maximum	= 1,
+			}
+		}
+	},
+
+	{
+		.filename			= "compressor",
+		.permissions			= PROC_RW,
+		.type				= SUSPEND_PROC_DATA_STRING,
+		.data = {
+			.string = {
+				.variable	= suspend_compressor_name,
+				.max_length	= 31,
+			}
+		},
+		.write_proc			= &suspend_compressor_disable_if_empty,
+	}
+};
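+
+/*
+ * These entries are registered below via suspend_register_procfile().
+ * Assuming they land under the Suspend2 proc root set up by the proc
+ * code (exact path defined there), typical usage would look like:
+ *
+ *	echo deflate > .../compressor
+ *	echo 40 > .../expected_compression
+ *	echo 1 > .../disable_compression
+ */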
+
+/*
+ * Ops structure.
+ */
+
+static struct suspend_module_ops suspend_compression_ops = {
+	.type			= FILTER_PLUGIN,
+	.name			= "Suspend2 Compressor",
+	.module			= THIS_MODULE,
+	.memory_needed 		= suspend_compress_memory_needed,
+	.print_debug_info	= suspend_compress_print_debug_stats,
+	.save_config_info	= suspend_compress_save_config_info,
+	.load_config_info	= suspend_compress_load_config_info,
+	.storage_needed		= suspend_compress_storage_needed,
+	
+	.initialise		= suspend_compress_initialise,
+	
+	.write_init		= suspend_compress_write_init,
+	.write_cleanup		= suspend_compress_write_cleanup,
+	.read_init		= suspend_compress_read_init,
+	.read_cleanup		= suspend_compress_read_cleanup,
+
+	.ops = {
+		.filter = {
+			.write_chunk		= suspend_compress_write_chunk,
+			.read_chunk		= suspend_compress_read_chunk,
+		}
+	}
+};
+
+/* ---- Registration ---- */
+
+static __init int suspend_compress_load(void)
+{
+	int result;
+	int i, numfiles = sizeof(proc_params) / sizeof(struct suspend_proc_data);
+
+	printk("Suspend2 Compression Driver loading.\n");
+	if (!(result = suspend_register_module(&suspend_compression_ops))) {
+		for (i = 0; i < numfiles; i++)
+			suspend_register_procfile(&proc_params[i]);
+	} else
+		printk("Suspend2 Compression Driver unable to register!\n");
+	return result;
+}
+
+#ifdef MODULE
+static __exit void suspend_compress_unload(void)
+{
+	int i, numfiles = sizeof(proc_params) / sizeof(struct suspend_proc_data);
+
+	printk("Suspend2 Compression Driver unloading.\n");
+	for (i = 0; i < numfiles; i++)
+		suspend_unregister_procfile(&proc_params[i]);
+	suspend_unregister_module(&suspend_compression_ops);
+}
+
+
+module_init(suspend_compress_load);
+module_exit(suspend_compress_unload);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Nigel Cunningham");
+MODULE_DESCRIPTION("Compression Support for Suspend2");
+#else
+late_initcall(suspend_compress_load);
+#endif
diff -urN oldtree/kernel/power/debug_pagealloc.c newtree/kernel/power/debug_pagealloc.c
--- oldtree/kernel/power/debug_pagealloc.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/debug_pagealloc.c	2006-03-08 15:22:33.261506000 +0000
@@ -0,0 +1,111 @@
+#include <linux/mm.h>
+#ifdef CONFIG_DEBUG_PAGEALLOC
+#include <linux/page-flags.h>
+#include <asm/pgtable.h>
+
+#include "pageflags.h"
+#include "suspend2.h"
+#include "pagedir.h"
+
+
+extern pte_t *lookup_address(unsigned long address);
+
+extern void kernel_map_pages(struct page *page, int numpages, int enable);
+
+static int page_is_kernel_mapped(struct page *page)
+{
+	pte_t *kpte; 
+	unsigned long address;
+
+	if (PageHighMem(page))
+		return 0;
+
+	address = (unsigned long)page_address(page);
+	
+	kpte = lookup_address(address);
+	if (!kpte)
+		return 0;
+
+	if (pte_same(*kpte, mk_pte(page, PAGE_KERNEL)))
+		return 1;
+
+	return 0;
+}
+
+/* Returns whether the page was already in the requested state */
+int suspend_map_kernel_page(struct page *page, int enable)
+{
+	int is_already_mapped = page_is_kernel_mapped(page);
+
+	if (enable == is_already_mapped)
+		return 1;
+
+	kernel_map_pages(page, 1, enable);
+
+	return 0;
+}
+
+/*
+ * suspend_map_atomic_copy_pages
+ *
+ * When DEBUG_PAGEALLOC is enabled, we need to map the pages before
+ * an atomic copy.
+ */
+void suspend_map_atomic_copy_pages(void)
+{
+	int i = 0, source_index = -1, dest_index = -1;
+
+	for (i = 0; i < pagedir1.pageset_size; i++) {
+		int orig_was_mapped = 1, copy_was_mapped = 1;
+		struct page *origpage, *copypage;
+
+		source_index = get_next_bit_on(pageset1_map, source_index);
+		dest_index = get_next_bit_on(pageset1_copy_map, dest_index);
+
+		origpage = pfn_to_page(source_index);
+		copypage = pfn_to_page(dest_index);
+		
+		if (!PageHighMem(origpage)) {
+			orig_was_mapped = suspend_map_kernel_page(origpage, 1);
+			if ((!orig_was_mapped) &&
+			    (!test_suspend_state(SUSPEND_NOW_RESUMING)))
+				SetPageUnmap(origpage);
+		}
+
+		copy_was_mapped = suspend_map_kernel_page(copypage, 1);
+		if ((!copy_was_mapped) &&
+		    (!test_suspend_state(SUSPEND_NOW_RESUMING)))
+			SetPageUnmap(copypage);
+	}
+}
+
+/*
+ * suspend_unmap_atomic_copy_pages
+ *
+ * We also need to unmap pages when DEBUG_PAGEALLOC is enabled.
+ */
+void suspend_unmap_atomic_copy_pages(void)
+{
+	int i;
+	struct zone *zone;
+
+	for_each_zone(zone) {
+		if (!zone->present_pages)
+			continue;
+		for (i = 0; i < zone->spanned_pages; i++) {
+			struct page *page = pfn_to_page(zone->zone_start_pfn + i);
+			if (PageUnmap(page))
+				suspend_map_kernel_page(page, 0);
+		}
+	}
+}
+#else
+void suspend_map_atomic_copy_pages(void) { }
+
+void suspend_unmap_atomic_copy_pages(void) { }
+
+int suspend_map_kernel_page(struct page *page, int enable)
+{
+	return 1;
+}
+#endif
diff -urN oldtree/kernel/power/debug_pagealloc.h newtree/kernel/power/debug_pagealloc.h
--- oldtree/kernel/power/debug_pagealloc.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/debug_pagealloc.h	2006-03-08 15:22:33.261506000 +0000
@@ -0,0 +1,3 @@
+extern void suspend_map_atomic_copy_pages(void);
+extern void suspend_unmap_atomic_copy_pages(void);
+extern int suspend_map_kernel_page(struct page *page, int enable);
diff -urN oldtree/kernel/power/disk.c newtree/kernel/power/disk.c
--- oldtree/kernel/power/disk.c	2006-03-08 18:48:02.956063750 +0000
+++ newtree/kernel/power/disk.c	2006-03-08 18:19:18.744307250 +0000
@@ -19,6 +19,8 @@
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/pm.h>
+#include <linux/freezer.h>
+
 
 #include "power.h"
 
@@ -84,7 +86,7 @@
 	if (!(error = swsusp_shrink_memory()))
 		return 0;
 thaw:
-	thaw_processes();
+	thaw_processes(FREEZER_ALL_THREADS);
 	enable_nonboot_cpus();
 	pm_restore_console();
 	return error;
@@ -93,7 +95,7 @@
 static void unprepare_processes(void)
 {
 	platform_finish();
-	thaw_processes();
+	thaw_processes(FREEZER_ALL_THREADS);
 	enable_nonboot_cpus();
 	pm_restore_console();
 }
@@ -130,7 +132,6 @@
 	if (in_suspend) {
 		device_resume();
 		pr_debug("PM: writing image.\n");
-		error = swsusp_write();
 		if (!error)
 			power_down(pm_disk_mode);
 		else {
@@ -189,22 +190,9 @@
 
 	pr_debug("PM: Checking swsusp image.\n");
 
-	if ((error = swsusp_check()))
-		goto Done;
-
-	pr_debug("PM: Preparing processes for restore.\n");
-
-	if ((error = prepare_processes())) {
-		swsusp_close();
-		goto Done;
-	}
 
 	pr_debug("PM: Reading swsusp image.\n");
 
-	if ((error = swsusp_read())) {
-		swsusp_free();
-		goto Thaw;
-	}
 
 	pr_debug("PM: Preparing devices for restore.\n");
 
--- oldtree/kernel/power/encryption.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/encryption.c	2006-03-08 15:22:33.273506750 +0000
@@ -0,0 +1,597 @@
+/*
+ * kernel/power/suspend_core/encryption.c
+ *
+ * Copyright (C) 2003-2005 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * This file is released under the GPLv2.
+ *
+ * This file contains data encryption routines for suspend,
+ * using cryptoapi transforms.
+ *
+ * ToDo:
+ * - Apply min/max_keysize the cipher changes.
+ * - Test.
+ */
+
+#include <linux/suspend.h>
+#include <linux/module.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <linux/crypto.h>
+#include <asm/scatterlist.h>
+
+#include "suspend2.h"
+#include "modules.h"
+#include "proc.h"
+#include "suspend2_common.h"
+#include "io.h"
+
+#define S2C_WRITE 0
+#define S2C_READ 1
+
+static struct suspend_module_ops suspend_encryption_ops;
+static struct suspend_module_ops *next_driver;
+
+static char suspend_encryptor_name[32];
+static struct crypto_tfm *suspend_encryptor_transform;
+static char suspend_encryptor_key[256];
+static int suspend_key_len;
+static char suspend_encryptor_iv[256];
+static int suspend_encryptor_mode;
+static int suspend_encryptor_save_key_and_iv;
+
+static u8 *page_buffer = NULL;
+static unsigned int bufofs;
+
+static struct scatterlist suspend_crypt_sg[PAGE_SIZE/8];
+
+/* ---- Local buffer management ---- */
+
+/* allocate_local_buffer
+ *
+ * Description:	Allocates a page of memory for buffering output.
+ * Returns:	Int: Zero if successful, -ENOMEM otherwise.
+ */
+
+static int allocate_local_buffer(void)
+{
+	if (!page_buffer) {
+		int i;
+		
+		page_buffer = (u8 *) get_zeroed_page(GFP_ATOMIC);
+	
+		if (!page_buffer) {
+			printk(KERN_ERR
+				"Failed to allocate the page buffer for "
+				"suspend2 encryption driver.\n");
+			return -ENOMEM;
+		}
+
+		for (i = 0; i < (PAGE_SIZE / suspend_key_len); i++) {
+			suspend_crypt_sg[i].page = virt_to_page(page_buffer);
+			suspend_crypt_sg[i].offset = suspend_key_len * i;
+			suspend_crypt_sg[i].length = suspend_key_len;
+		}
+	}
+
+	return 0;
+}
+
+/* free_local_buffer
+ *
+ * Description:	Frees memory allocated for buffering output.
+ */
+
+static void free_local_buffer(void)
+{
+	if (page_buffer)
+		free_page((unsigned long) page_buffer);
+
+	page_buffer = NULL;
+}
+
+/* suspend_crypto_cleanup
+ *
+ * Description:	Frees memory allocated for our labours.
+ */
+
+static void suspend_crypto_cleanup(void)
+{
+	if (suspend_encryptor_transform) {
+		crypto_free_tfm(suspend_encryptor_transform);
+		suspend_encryptor_transform = NULL;
+	}
+}
+
+/* suspend_encrypt_crypto_prepare
+ *
+ * Description:	Prepare to do some work by allocating buffers and transforms.
+ * Returns:	Int: Zero if successful, -ENOMEM otherwise.
+ */
+
+static int suspend_encrypt_crypto_prepare(int mode)
+{
+	if (!*suspend_encryptor_name) {
+		printk("Suspend2: Encryptor enabled but no name set.\n");
+		return 1;
+	}
+
+	if (!(suspend_encryptor_transform = crypto_alloc_tfm(suspend_encryptor_name,
+					1 << suspend_encryptor_mode))) {
+		printk("Suspend2: Failed to initialise the encryption transform (%s, mode %d).\n",
+				suspend_encryptor_name, suspend_encryptor_mode);
+		return 1;
+	}
+
+	if (mode)
+		bufofs = PAGE_SIZE;
+	else
+		bufofs = 0;
+
+	suspend_key_len = strlen(suspend_encryptor_key);
+
+	if (crypto_cipher_setkey(suspend_encryptor_transform, suspend_encryptor_key, 
+				suspend_key_len)) {
+		printk("%d is an invalid key length for cipher %s.\n",
+					suspend_key_len,
+					suspend_encryptor_name);
+		return 1;
+	}
+	
+	if (!mode) {
+		crypto_cipher_set_iv(suspend_encryptor_transform,
+				suspend_encryptor_iv,
+				crypto_tfm_alg_ivsize(suspend_encryptor_transform));
+	}
+		
+	return 0;
+}
+
+/* ---- Exported functions ---- */
+
+/* write_init()
+ *
+ * Description:	Allocate buffers and prepare to encrypt data.
+ * Arguments:	Stream_number:	Ignored.
+ * Returns:	Zero on success, -ENOMEM if unable to allocate buffers.
+ */
+
+static int suspend_encrypt_write_init(int stream_number)
+{
+	int result;
+	
+	next_driver = suspend_get_next_filter(&suspend_encryption_ops);
+
+	if (!next_driver) {
+		printk("Encryption Driver: Argh! No one wants my output!\n");
+		return -ECHILD;
+	}
+
+	if ((result = suspend_encrypt_crypto_prepare(S2C_WRITE))) {
+		set_result_state(SUSPEND_ENCRYPTION_SETUP_FAILED);
+		suspend_crypto_cleanup();
+		return result;
+	}
+	
+	if ((result = allocate_local_buffer()))
+		return result;
+
+	/* Only reset the stats if starting to write an image */
+	if (stream_number == 2)
+		bytes_in = bytes_out = 0;
+	
+	bufofs = 0;
+
+	return 0;
+}
+
+/* suspend_encrypt_write_chunk()
+ *
+ * Description:	Encrypt a page of data, buffering output and passing on
+ * 		filled pages to the next module in the pipeline.
+ * Arguments:	Buffer_page:	Pointer to a buffer of size PAGE_SIZE, 
+ * 				containing data to be encrypted.
+ * Returns:	0 on success. Otherwise the error is that returned by later
+ * 		modules, -ECHILD if we have a broken pipeline or -EIO if
+ * 		the cipher errs.
+ */
+
+static int suspend_encrypt_write_chunk(struct page *buffer_page)
+{
+	int ret;
+	char *buffer_start;
+	
+	if (!suspend_encryptor_transform)
+		return next_driver->ops.filter.write_chunk(buffer_page);
+
+	buffer_start = kmap(buffer_page);
+	memcpy(page_buffer, buffer_start, PAGE_SIZE);
+	kunmap(buffer_page);
+	
+	bytes_in += PAGE_SIZE;
+
+	ret = crypto_cipher_encrypt(suspend_encryptor_transform,
+			suspend_crypt_sg, suspend_crypt_sg, PAGE_SIZE);
+	
+	if (ret) {
+		printk("Encryption failed.\n");
+		return -EIO;
+	}
+
+	ret = next_driver->ops.filter.write_chunk(virt_to_page(page_buffer));
+
+	return ret;
+}
+
+/* write_cleanup()
+ *
+ * Description: Write unflushed data and free workspace.
+ * Returns:	Result of writing last page.
+ */
+
+static int suspend_encrypt_write_cleanup(void)
+{
+	suspend_crypto_cleanup();
+	free_local_buffer();
+
+	return 0;
+}
+
+/* read_init()
+ *
+ * Description:	Prepare to read a new stream of data.
+ * Arguments:	int: Section of image about to be read.
+ * Returns:	int: Zero on success, error number otherwise.
+ */
+
+static int suspend_encrypt_read_init(int stream_number)
+{
+	int result;
+
+	next_driver = suspend_get_next_filter(&suspend_encryption_ops);
+
+	if (!next_driver) {
+		printk("Encryption Driver: Argh! No one wants "
+				"to feed me data!");
+		return -ECHILD;
+	}
+	
+	if ((result = suspend_encrypt_crypto_prepare(S2C_READ))) {
+		set_result_state(SUSPEND_ENCRYPTION_SETUP_FAILED);
+		suspend_crypto_cleanup();
+		return result;
+	}
+	
+	if ((result = allocate_local_buffer()))
+		return result;
+
+	bufofs = PAGE_SIZE;
+
+	return 0;
+}
+
+/* suspend_encrypt_read_chunk()
+ *
+ * Description:	Retrieve data from later modules and decrypt it until the
+ * 		input buffer is filled.
+ * Arguments:	Buffer_start: 	Pointer to a buffer of size PAGE_SIZE.
+ * 		Sync:		Whether the previous module (or core) wants its
+ * 				data synchronously.
+ * Returns:	Zero if successful. Error condition from me or from downstream
+ * 		on failure.
+ */
+
+static int suspend_encrypt_read_chunk(struct page *buffer_page, int sync)
+{
+	int ret; 
+	char *buffer_start;
+
+	if (!suspend_encryptor_transform)
+		return next_driver->ops.filter.read_chunk(buffer_page, sync);
+
+	/* 
+	 * All our reads must be synchronous - we can't decrypt
+	 * data that hasn't been read yet.
+	 */
+
+	if ((ret = next_driver->ops.filter.read_chunk(
+			virt_to_page(page_buffer), SUSPEND_SYNC)) < 0) {
+		printk("Failed to read an encrypted block.\n");
+		return ret;
+	}
+
+	ret = crypto_cipher_decrypt(suspend_encryptor_transform,
+			suspend_crypt_sg, suspend_crypt_sg, PAGE_SIZE);
+
+	if (ret)
+		printk("Decrypt function returned %d.\n", ret);
+
+	buffer_start = kmap(buffer_page);
+	memcpy(buffer_start, page_buffer, PAGE_SIZE);
+	kunmap(buffer_page);
+	return ret;
+}
+
+/* read_cleanup()
+ *
+ * Description:	Clean up after reading part or all of a stream of data.
+ * Returns:	int: Always zero. Never fails.
+ */
+
+static int suspend_encrypt_read_cleanup(void)
+{
+	suspend_crypto_cleanup();
+	free_local_buffer();
+	return 0;
+}
+
+/* suspend_encrypt_print_debug_stats
+ *
+ * Description:	Print information to be recorded for debugging purposes into a
+ * 		buffer.
+ * Arguments:	buffer: Pointer to a buffer into which the debug info will be
+ * 			printed.
+ * 		size:	Size of the buffer.
+ * Returns:	Number of characters written to the buffer.
+ */
+
+static int suspend_encrypt_print_debug_stats(char *buffer, int size)
+{
+	int len;
+	
+	len = snprintf_used(buffer, size, "- Encryptor %s enabled.\n",
+			suspend_encryptor_name);
+	return len;
+}
+
+/* encryption_memory_needed
+ *
+ * Description:	Tell the caller how much memory we need to operate during
+ * 		suspend/resume.
+ * Returns:	Unsigned long. Maximum number of bytes of memory required for
+ * 		operation.
+ */
+
+static unsigned long suspend_encrypt_memory_needed(void)
+{
+	return PAGE_SIZE;
+}
+
+static unsigned long suspend_encrypt_storage_needed(void)
+{
+	return 2 * sizeof(unsigned long) + sizeof(int);
+}
+
+/* suspend_encrypt_save_config_info
+ *
+ * Description:	Save information needed when reloading the image at resume time.
+ * Arguments:	Buffer:		Pointer to a buffer of size PAGE_SIZE.
+ * Returns:	Number of bytes used for saving our data.
+ */
+
+static int suspend_encrypt_save_config_info(char *buffer)
+{
+	int buf_offset, str_size;
+
+	str_size = strlen(suspend_encryptor_name);
+	*buffer = (char) str_size;
+	strncpy(buffer + 1, suspend_encryptor_name, str_size + 1);
+	buf_offset = str_size + 2;
+
+	*(buffer + buf_offset) = (char) suspend_encryptor_mode;
+	buf_offset++;
+
+	*(buffer + buf_offset) = (char) suspend_encryptor_save_key_and_iv;
+	buf_offset++;
+
+	if (suspend_encryptor_save_key_and_iv) {
+		
+		str_size = strlen(suspend_encryptor_key);
+		*(buffer + buf_offset) = (char) str_size;
+		strncpy(buffer + buf_offset + 1, suspend_encryptor_key, str_size + 1);
+
+		buf_offset += str_size + 2;
+
+		str_size = strlen(suspend_encryptor_iv);
+		*(buffer + buf_offset) = (char) str_size;
+		strncpy(buffer + buf_offset + 1, suspend_encryptor_iv, str_size + 1);
+
+		buf_offset += str_size + 2;
+	}
+
+	return buf_offset;
+}
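+
+/*
+ * Layout of the buffer written above (each length byte is a strlen
+ * that excludes the trailing NUL, though the NUL itself is copied):
+ *
+ *	char	strlen(name),	char[] name + NUL
+ *	char	mode
+ *	char	save_key_and_iv flag
+ *	...and only if that flag is set:
+ *	char	strlen(key),	char[] key + NUL
+ *	char	strlen(iv),	char[] iv + NUL
+ */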
+
+/* suspend_encrypt_load_config_info
+ *
+ * Description:	Reload information needed for decrypting the image at
+ * 		resume time.
+ * Arguments:	Buffer:		Pointer to the start of the data.
+ *		Size:		Number of bytes that were saved.
+ */
+
+static void suspend_encrypt_load_config_info(char *buffer, int size)
+{
+	int buf_offset, str_size;
+
+	str_size = (int) *buffer;
+	strncpy(suspend_encryptor_name, buffer + 1, str_size + 1);
+	buf_offset = str_size + 2;
+	
+	suspend_encryptor_mode = (int) *(buffer + buf_offset);
+	buf_offset++;
+
+	suspend_encryptor_save_key_and_iv = (int) *(buffer + buf_offset);
+	buf_offset++;
+
+	if (suspend_encryptor_save_key_and_iv) {
+		str_size = (int) *(buffer + buf_offset);
+		strncpy(suspend_encryptor_key, buffer + buf_offset + 1, str_size + 1);
+
+		buf_offset += str_size + 2;
+
+		str_size = (int) *(buffer + buf_offset);
+		strncpy(suspend_encryptor_iv, buffer + buf_offset + 1, str_size + 1);
+
+		buf_offset += str_size + 2;
+	} else {
+		*suspend_encryptor_key = 0;
+		*suspend_encryptor_iv = 0;
+	}
+	
+	if (buf_offset != size) {
+		printk("Suspend Encryptor config info size mismatch (%d != %d): settings ignored.\n",
+				buf_offset, size);
+		*suspend_encryptor_key = 0;
+		*suspend_encryptor_iv = 0;
+	}
+	return;
+}
+
+static void suspend_encryptor_disable_if_empty(void)
+{
+	suspend_encryption_ops.disabled = !(*suspend_encryptor_name);
+}
+
+static int suspend_encrypt_initialise(int starting_cycle)
+{
+	if (starting_cycle)
+		suspend_encryptor_disable_if_empty();
+
+	return 0;
+}
+/*
+ * data for our proc entries.
+ */
+
+static struct suspend_proc_data proc_params[] = {
+	{
+		.filename			= "encryptor",
+		.permissions			= PROC_RW,
+		.type				= SUSPEND_PROC_DATA_STRING,
+		.data = {
+			.string = {
+				.variable	= suspend_encryptor_name,
+				.max_length	= 31,
+			}
+		},
+		.write_proc			= suspend_encryptor_disable_if_empty,
+	},
+
+	{
+		.filename			= "encryption_mode",
+		.permissions			= PROC_RW,
+		.type				= SUSPEND_PROC_DATA_INTEGER,
+		.data = {
+			.integer = {
+				.variable	= &suspend_encryptor_mode,
+				.minimum	= 0,
+				.maximum	= 3,
+			}
+		}
+	},
+
+	{
+		.filename			= "encryption_save_key_and_iv",
+		.permissions			= PROC_RW,
+		.type				= SUSPEND_PROC_DATA_INTEGER,
+		.data = {
+			.integer = {
+				.variable	= &suspend_encryptor_save_key_and_iv,
+				.minimum	= 0,
+				.maximum	= 1,
+			}
+		}
+	},
+
+	{
+		.filename			= "encryption_key",
+		.permissions			= PROC_RW,
+		.type				= SUSPEND_PROC_DATA_STRING,
+		.data = {
+			.string = {
+				.variable	= suspend_encryptor_key,
+				.max_length	= 255,
+			}
+		}
+	},
+
+	{
+		.filename			= "encryption_iv",
+		.permissions			= PROC_RW,
+		.type				= SUSPEND_PROC_DATA_STRING,
+		.data = {
+			.string = {
+				.variable	= suspend_encryptor_iv,
+				.max_length	= 255,
+			}
+		}
+	},
+
+	{
+		.filename			= "disable_encryption",
+		.permissions			= PROC_RW,
+		.type				= SUSPEND_PROC_DATA_INTEGER,
+		.data = {
+			.integer = {
+				.variable	= &suspend_encryption_ops.disabled,
+				.minimum	= 0,
+				.maximum	= 1,
+			}
+		}
+	},
+	
+};
+
+/*
+ * Ops structure.
+ */
+
+static struct suspend_module_ops suspend_encryption_ops = {
+	.type			= FILTER_PLUGIN,
+	.name			= "Encryptor",
+	.module			= THIS_MODULE,
+	.memory_needed 		= suspend_encrypt_memory_needed,
+	.print_debug_info	= suspend_encrypt_print_debug_stats,
+	.save_config_info	= suspend_encrypt_save_config_info,
+	.load_config_info	= suspend_encrypt_load_config_info,
+	.storage_needed		= suspend_encrypt_storage_needed,
+	
+	.initialise		= suspend_encrypt_initialise,
+	
+	.write_init		= suspend_encrypt_write_init,
+	.write_cleanup		= suspend_encrypt_write_cleanup,
+	.read_init		= suspend_encrypt_read_init,
+	.read_cleanup		= suspend_encrypt_read_cleanup,
+
+	.ops = {
+		.filter = {
+			.write_chunk		= suspend_encrypt_write_chunk,
+			.read_chunk		= suspend_encrypt_read_chunk,
+		}
+	}
+};
+
+/* ---- Registration ---- */
+
+static __init int suspend_encrypt_load(void)
+{
+	int result;
+	int i, numfiles = sizeof(proc_params) / sizeof(struct suspend_proc_data);
+
+	printk("Suspend2 Encryption Driver loading.\n");
+	if (!(result = suspend_register_module(&suspend_encryption_ops))) {
+		for (i = 0; i < numfiles; i++)
+			suspend_register_procfile(&proc_params[i]);
+	} else
+		printk("Suspend2 Encryption Driver unable to register!\n");
+	return result;
+}
+
+late_initcall(suspend_encrypt_load);
diff -urN oldtree/kernel/power/extent.c newtree/kernel/power/extent.c
--- oldtree/kernel/power/extent.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/extent.c	2006-03-08 15:22:33.277507000 +0000
@@ -0,0 +1,247 @@
+/* kernel/power/extent.c
+ * 
+ * (C) 2003-2005 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * Distributed under GPLv2.
+ * 
+ * These functions encapsulate the manipulation of storage metadata. For
+ * pageflags, we use dynamically allocated bitmaps.
+ */
+
+#include <linux/module.h>
+#include <linux/suspend.h>
+#include "modules.h"
+#include "extent.h"
+#include "ui.h"
+
+int extents_allocated = 0;
+
+/* get_extent
+ *
+ * Returns a free extent. May fail, returning NULL instead.
+ */
+
+static struct extent *get_extent(void)
+{
+	struct extent *result;
+	
+	if (!(result = kmalloc(sizeof(struct extent), GFP_ATOMIC)))
+		return NULL;
+
+	extents_allocated++;
+	result->minimum = result->maximum = 0;
+	result->next = NULL;
+	return result;
+}
+
+/* put_extent.
+ *
+ * Frees an extent. Assumes unlinking is done by the caller.
+ */
+void put_extent(struct extent *extent)
+{
+	BUG_ON(!extent);
+
+	kfree(extent);
+	extents_allocated--;
+}
+
+/* put_extent_chain.
+ *
+ * Frees a whole chain of extents.
+ */
+void put_extent_chain(struct extent_chain *chain)
+{
+	struct extent *this;
+
+	this = chain->first;
+
+	while (this) {
+		struct extent *next = this->next;
+		kfree(this);
+		chain->frees++;
+		extents_allocated--;
+		this = next;
+	}
+	
+	BUG_ON(chain->frees != chain->allocs);
+	chain->first = chain->last = NULL;
+	chain->size = chain->allocs = chain->frees = 0;
+}
+
+/* append_extent_to_extent_chain
+ *
+ * Used where we know a extent is to be added to the end of the list
+ * and does not need merging with the current last extent.
+ */
+
+int append_extent_to_extent_chain(struct extent_chain *chain, 
+		unsigned long minimum, unsigned long maximum)
+{
+	struct extent *newextent = NULL;
+
+	newextent = get_extent();
+	if (!newextent) {
+		printk("Error unable to append a new extent to the chain.\n");
+		return 2;
+	}
+
+	chain->allocs++;
+	chain->size += maximum - minimum + 1;
+	newextent->minimum = minimum;
+	newextent->maximum = maximum;
+	newextent->next = NULL;
+
+	if (chain->last) {
+		chain->last->next = newextent;
+		chain->last = newextent;
+	} else 
+		chain->last = chain->first = newextent;
+
+	return 0;
+}
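+
+/*
+ * A minimal usage sketch (hypothetical caller): record that pages
+ * 10-14 and page 20 are in use.
+ *
+ *	struct extent_chain chain = { };
+ *
+ *	if (append_extent_to_extent_chain(&chain, 10, 14) ||
+ *	    append_extent_to_extent_chain(&chain, 20, 20))
+ *		return 2;
+ *
+ * chain.size is then 6, and put_extent_chain(&chain) releases it.
+ */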
+
+/* serialise_extent_chain
+ *
+ * Write a chain in the image.
+ */
+int serialise_extent_chain(struct extent_chain *chain)
+{
+	struct extent *this;
+	int ret;
+	
+	if ((ret = suspend_active_writer->ops.writer.write_header_chunk((char *) chain,
+		sizeof(struct extent_chain) - 2 * sizeof(struct extent *))))
+		return ret;
+
+	this = chain->first;
+	while (this) {
+		if ((ret = suspend_active_writer->ops.writer.write_header_chunk((char *) this,
+				2 * sizeof(unsigned long))))
+			return ret;
+		this = this->next;
+	}
+	return ret;
+}
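+
+/*
+ * The serialised form produced above is, in order: the extent_chain
+ * structure minus its two struct extent pointers, followed by one
+ * (minimum, maximum) pair of unsigned longs per extent in the chain.
+ * load_extent_chain() below reads this back, recovering the extent
+ * count from allocs - frees.
+ */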
+
+/* load_extent_chain
+ *
+ * Read back a chain saved in the image.
+ */
+int load_extent_chain(struct extent_chain *chain)
+{
+	struct extent *this, *last = NULL;
+	int i, ret;
+
+	if ((ret = suspend_active_writer->ops.writer.read_header_chunk((char *) chain,
+		sizeof(struct extent_chain) - 2 * sizeof(struct extent *))) < 0)
+		return ret;
+
+	for (i = 0; i < (chain->allocs - chain->frees); i++) {
+		this = kmalloc(sizeof(struct extent), GFP_ATOMIC);
+		BUG_ON(!this); /* Shouldn't run out of memory trying this! */
+		this->next = NULL;
+		if ((ret = suspend_active_writer->ops.writer.read_header_chunk((char *) this,
+				2 * sizeof(unsigned long))) < 0)
+			return ret;
+		if (last)
+			last->next = this;
+		else
+			chain->first = this;
+		last = this;
+	}
+	chain->last = last;
+	return 0;
+}
+
+/* extent_state_next
+ *
+ * Given a state, progress to the next valid entry. We may begin in an
+ * invalid state, as we do when invoked from extent_state_goto_start below.
+ */
+unsigned long extent_state_next(struct extent_iterate_state *state)
+{
+	if (state->current_chain > state->num_chains)
+		return 0;
+
+	if (state->current_extent)
+		GET_EXTENT_NEXT(state->current_extent, state->current_offset);
+
+	while (!state->current_extent) {
+		int chain_num = ++(state->current_chain);
+
+		if (chain_num > state->num_chains)
+			return 0;
+
+		state->current_extent = (state->chains + chain_num)->first;
+
+		if (!state->current_extent)
+			continue;
+
+		state->current_offset = state->current_extent->minimum;
+	}
+
+	return state->current_offset;
+}
+
+/* extent_state_goto_start
+ *
+ * Find the first valid value in a group of chains.
+ */
+void extent_state_goto_start(struct extent_iterate_state *state)
+{
+	state->current_chain = -1;
+	state->current_extent = NULL;
+	state->current_offset = 0;
+}
+
+/* extent_state_save
+ *
+ * Given a state and a struct extent_iterate_saved_state, save the
+ * current position in a format that can be used with relocated chains
+ * (at resume time).
+ */
+
+void extent_state_save(struct extent_iterate_state *state,
+		struct extent_iterate_saved_state *saved_state)
+{
+	struct extent *extent;
+
+	saved_state->chain_num = state->current_chain;
+	saved_state->extent_num = 0;
+	saved_state->offset = state->current_offset;
+
+	if (saved_state->chain_num == -1)
+		return;
+	
+	extent = (state->chains + state->current_chain)->first;
+
+	while (extent != state->current_extent) {
+		saved_state->extent_num++;
+		extent = extent->next;
+	}
+}
+
+/* extent_state_restore
+ *
+ * Restore the position saved by extent_state_save.
+ */
+
+void extent_state_restore(struct extent_iterate_state *state,
+		struct extent_iterate_saved_state *saved_state)
+{
+	int posn = saved_state->extent_num;
+
+	if (saved_state->chain_num == -1) {
+		extent_state_goto_start(state);
+		return;
+	}
+
+	state->current_chain = saved_state->chain_num;
+	state->current_extent = (state->chains + state->current_chain)->first;
+	state->current_offset = saved_state->offset;
+
+	while (posn--)
+		state->current_extent = state->current_extent->next;
+}
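+
+/*
+ * Illustrative use of the iterate-state helpers (hypothetical caller,
+ * with 'state' already pointing at its chains):
+ *
+ *	struct extent_iterate_saved_state saved;
+ *	unsigned long value;
+ *
+ *	extent_state_goto_start(&state);
+ *	value = extent_state_next(&state);
+ *	extent_state_save(&state, &saved);
+ *	(do some work, possibly reloading the chains)
+ *	extent_state_restore(&state, &saved);
+ *
+ * The saved form records chain and extent ordinals rather than
+ * pointers, which is what lets it survive the chains being rebuilt
+ * at resume time.
+ */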
diff -urN oldtree/kernel/power/extent.h newtree/kernel/power/extent.h
--- oldtree/kernel/power/extent.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/extent.h	2006-03-08 15:22:33.277507000 +0000
@@ -0,0 +1,105 @@
+/*
+ * kernel/power/extent.h
+ *
+ * Copyright (C) 2004-2005 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * This file is released under the GPLv2.
+ *
+ * It contains declarations related to extents. Extents are
+ * suspend's method of storing some of the metadata for the image.
+ * See extent.c for more info.
+ *
+ */
+
+#ifndef EXTENT_H
+#define EXTENT_H
+struct extent_chain {
+	int size; /* size of the chain, i.e. sum of (max - min + 1) */
+	int allocs;
+	int frees;
+	int debug;
+	char *name;
+	struct extent *first;
+	struct extent *last;
+};
+
+/*
+ * We rely on extents not fitting evenly into a page.
+ * The last four bytes are used to store the number
+ * of the page, to make saving & reloading pages simpler.
+ */
+struct extent {
+	unsigned long minimum;
+	unsigned long maximum;
+	struct extent *next;
+};
+
+struct extent_iterate_state {
+	struct extent_chain *chains;
+	int num_chains;
+	int current_chain;
+	struct extent *current_extent;
+	unsigned long current_offset;
+};
+
+struct extent_iterate_saved_state {
+	int chain_num;
+	int extent_num;
+	unsigned long offset;
+};
+
+#define extent_state_eof(state) ((state)->num_chains < (state)->current_chain)
+
+#define extent_for_each(extent_chain, extentpointer, value) \
+if ((extent_chain)->first) \
+	for ((extentpointer) = (extent_chain)->first, (value) = \
+			(extentpointer)->minimum; \
+	     ((extentpointer) && ((extentpointer)->next || (value) <= \
+				 (extentpointer)->maximum)); \
+	     (((value) == (extentpointer)->maximum) ? \
+		((extentpointer) = (extentpointer)->next, (value) = \
+		 ((extentpointer) ? (extentpointer)->minimum : 0)) : \
+			(value)++))
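+
+/*
+ * Example (hypothetical caller): walk every value covered by a chain.
+ *
+ *	struct extent *extentpointer;
+ *	unsigned long value;
+ *
+ *	extent_for_each(&chain, extentpointer, value)
+ *		printk("%lu\n", value);
+ */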
+
+/*
+ * When using compression and expected_compression > 0,
+ * we allocate fewer swap entries, so GET_EXTENT_NEXT can
+ * validly run out of data to return.
+ */
+#define GET_EXTENT_NEXT(currentextent, currentval) \
+{ \
+	if (currentextent) { \
+		if ((currentval) == (currentextent)->maximum) { \
+			if ((currentextent)->next) { \
+				(currentextent) = (currentextent)->next; \
+				(currentval) = (currentextent)->minimum; \
+			} else { \
+				(currentextent) = NULL; \
+				(currentval) = 0; \
+			} \
+		} else \
+			currentval++; \
+	} \
+}
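+
+/*
+ * Sketch of a GET_EXTENT_NEXT consumer (hypothetical names): start with
+ * the chain's first extent and its minimum, then advance one value at a
+ * time until the extent pointer goes NULL:
+ *
+ *	struct extent *cur = chain->first;
+ *	unsigned long val = cur ? cur->minimum : 0;
+ *
+ *	while (cur) {
+ *		consume(val);
+ *		GET_EXTENT_NEXT(cur, val);
+ *	}
+ */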
+
+extern int extents_allocated;
+void put_extent(struct extent *extent);
+void put_extent_chain(struct extent_chain *chain);
+int append_extent_to_extent_chain(struct extent_chain *chain, 
+		unsigned long minimum, unsigned long maximum);
+int serialise_extent_chain(struct extent_chain *chain);
+int load_extent_chain(struct extent_chain *chain);
+
+/* swap_entry_to_extent_val & extent_val_to_swap_entry: 
+ * We are putting offset in the low bits so consecutive swap entries
+ * make consecutive extent values */
+#define swap_entry_to_extent_val(swp_entry) (swp_entry.val)
+#define extent_val_to_swap_entry(val) (swp_entry_t) { (val) }
+
+void extent_state_save(struct extent_iterate_state *state,
+		struct extent_iterate_saved_state *saved_state);
+void extent_state_restore(struct extent_iterate_state *state,
+		struct extent_iterate_saved_state *saved_state);
+void extent_state_goto_start(struct extent_iterate_state *state);
+unsigned long extent_state_next(struct extent_iterate_state *state);
+#endif
diff -urN oldtree/kernel/power/io.c newtree/kernel/power/io.c
--- oldtree/kernel/power/io.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/io.c	2006-03-08 15:22:33.281507250 +0000
@@ -0,0 +1,1026 @@
+/*
+ * kernel/power/io.c
+ *
+ * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
+ * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
+ * Copyright (C) 2002-2005 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * This file is released under the GPLv2.
+ *
+ * It contains high level IO routines for suspending.
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/version.h>
+#include <linux/utsname.h>
+#include <linux/mount.h>
+#include <linux/suspend2.h>
+
+#include "version.h"
+#include "modules.h"
+#include "pageflags.h"
+#include "io.h"
+#include "ui.h"
+#include "suspend2_common.h"
+#include "suspend2.h"
+#include "debug_pagealloc.h"
+#include "storage.h"
+
+/* attempt_to_parse_resume_device
+ *
+ * Can we suspend, using the current resume2= parameter?
+ */
+int attempt_to_parse_resume_device(void)
+{
+	struct list_head *writer;
+	struct suspend_module_ops *this_writer;
+	int result, returning = 0;
+
+	if (suspend_activate_storage(0))
+		return 0;
+
+	suspend_active_writer = NULL;
+	clear_suspend_state(SUSPEND_RESUME_DEVICE_OK);
+	set_suspend_state(SUSPEND_DISABLED);
+	clear_result_state(SUSPEND_ABORTED);
+
+	if (!suspend_num_writers) {
+		printk(name_suspend "No writers have been registered. Suspending will be disabled.\n");
+		goto cleanup;
+	}
+	
+	if (!resume2_file[0]) {
+		printk(name_suspend "Resume2 parameter is empty. Suspending will be disabled.\n");
+		goto cleanup;
+	}
+
+	list_for_each(writer, &suspend_writers) {
+		this_writer = list_entry(writer, struct suspend_module_ops,
+				ops.writer.writer_list);
+
+		/* 
+		 * Not sure why you'd want to disable a writer, but
+		 * we should honour the flag if we're providing it
+		 */
+		if (this_writer->disabled) {
+			printk(name_suspend
+					"Writer '%s' is disabled. Ignoring it.\n",
+					this_writer->name);
+			continue;
+		}
+
+		result = this_writer->ops.writer.parse_sig_location(
+				resume2_file, (suspend_num_writers == 1));
+
+		switch (result) {
+			case -EINVAL:
+				/* 
+				 * For this writer, but not a valid 
+				 * configuration. Error already printed.
+				 */
+
+				goto cleanup;
+
+			case 0:
+				/*
+				 * For this writer and valid.
+				 */
+
+				suspend_active_writer = this_writer;
+
+				set_suspend_state(SUSPEND_RESUME_DEVICE_OK);
+				clear_suspend_state(SUSPEND_DISABLED);
+				printk(name_suspend "Suspending enabled.\n");
+
+				returning = 1;
+				goto cleanup;
+		}
+	}
+	printk(name_suspend "No matching enabled writer found. Suspending disabled.\n");
+cleanup:
+	suspend_deactivate_storage(0);
+	return returning;
+}
+
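+/* attempt_to_parse_resume_device2
+ *
+ * As above, but bracketed by suspend_prepare_usm() and
+ * suspend_cleanup_usm() calls.
+ */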
+void attempt_to_parse_resume_device2(void)
+{
+	suspend_prepare_usm();
+	attempt_to_parse_resume_device();
+	suspend_cleanup_usm();
+}
+
+/* noresume_reset_modules
+ *
+ * Description:	When we read the start of an image, modules (and especially the
+ * 		active writer) might need to reset data structures if we decide
+ * 		to invalidate the image rather than resuming from it.
+ */
+
+static void noresume_reset_modules(void)
+{
+	struct suspend_module_ops *this_filter;
+	
+	list_for_each_entry(this_filter, &suspend_filters, ops.filter.filter_list) {
+		if (this_filter->ops.filter.noresume_reset)
+			this_filter->ops.filter.noresume_reset();
+	}
+
+	if (suspend_active_writer && suspend_active_writer->ops.writer.noresume_reset)
+		suspend_active_writer->ops.writer.noresume_reset();
+}
+
+/* fill_suspend_header()
+ * 
+ * Description:	Fill the suspend header structure.
+ * Arguments:	struct suspend_header: Header data structure to be filled.
+ */
+
+static void fill_suspend_header(struct suspend_header *sh)
+{
+	int i;
+	
+	memset((char *)sh, 0, sizeof(*sh));
+
+	sh->version_code = LINUX_VERSION_CODE;
+	sh->num_physpages = num_physpages;
+	sh->orig_mem_free = suspend_orig_mem_free;
+	strncpy(sh->machine, system_utsname.machine, 65);
+	strncpy(sh->version, system_utsname.version, 65);
+	sh->page_size = PAGE_SIZE;
+	sh->pagedir = pagedir1;
+	sh->pageset_2_size = pagedir2.pageset_size;
+	sh->param0 = suspend_result;
+	sh->param1 = suspend_action;
+	sh->param2 = suspend_debug_state;
+	sh->param3 = console_loglevel;
+	sh->root_fs = current->fs->rootmnt->mnt_sb->s_dev;
+	for (i = 0; i < 4; i++)
+		sh->io_time[i/2][i%2] =
+		       suspend_io_time[i/2][i%2];
+}
+
+/*
+ * rw_init_modules
+ *
+ * Iterate over modules, preparing the ones that will be used to read or write
+ * data.
+ */
+static int rw_init_modules(int write, int which)
+{
+	struct suspend_module_ops *this_module;
+	/* Initialise page transformers */
+	list_for_each_entry(this_module, &suspend_filters,
+			ops.filter.filter_list) {
+		if (this_module->disabled)
+			continue;
+		if ((write && this_module->write_init &&
+			this_module->write_init(which)) ||
+		    (!write && this_module->read_init &&
+		     	this_module->read_init(which))) {
+			abort_suspend("Failed to initialise the %s filter.",
+				this_module->name);
+			return 1;
+		}
+	}
+
+	/* Initialise writer */
+	if ((write && suspend_active_writer->write_init(which)) ||
+	    (!write && suspend_active_writer->read_init(which))) {
+		abort_suspend("Failed to initialise the writer."); 
+		if (!write)
+			suspend_active_writer->ops.writer.invalidate_image();
+		return 1;
+	}
+
+	/* Initialise other modules */
+	list_for_each_entry(this_module, &suspend_modules, module_list) {
+		if (this_module->disabled)
+			continue;
+		if ((this_module->type == FILTER_PLUGIN) ||
+		    (this_module->type == WRITER_PLUGIN))
+			continue;
+		if ((write && this_module->write_init &&
+			this_module->write_init(which)) ||
+		    (!write && this_module->read_init &&
+			this_module->read_init(which))) {
+			set_result_state(SUSPEND_ABORTED);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * rw_cleanup_modules
+ *
+ * Cleanup components after reading or writing a set of pages.
+ * Only the writer may fail.
+ */
+static int rw_cleanup_modules(int write)
+{
+	struct suspend_module_ops *this_module;
+	int result = 0;
+
+	/* Cleanup other modules */
+	list_for_each_entry(this_module, &suspend_modules, module_list) {
+		if (this_module->disabled)
+			continue;
+		if ((this_module->type == FILTER_PLUGIN) ||
+		    (this_module->type == WRITER_PLUGIN))
+			continue;
+		if (write) {
+			if (this_module->write_cleanup)
+				result |= this_module->write_cleanup();
+		} else
+			if (this_module->read_cleanup)
+				result |= this_module->read_cleanup();
+	}
+
+	/* Flush data and cleanup */
+	list_for_each_entry(this_module, &suspend_filters,
+			ops.filter.filter_list) {
+		if (this_module->disabled)
+			continue;
+		if (write) {
+			if (this_module->write_cleanup)
+				result |= this_module->write_cleanup();
+		} else
+			if (this_module->read_cleanup)
+				result |= this_module->read_cleanup();
+	}
+
+	if (write)
+		result |= suspend_active_writer->write_cleanup();
+	else
+		result |= suspend_active_writer->read_cleanup();
+
+	return result;
+}
+
+/*
+ * do_rw_loop
+ *
+ * The main I/O loop for reading or writing pages.
+ */
+static int do_rw_loop(int write, int finish_at, dyn_pageflags_t *pageflags,
+		int base, int barmax)
+{
+	int current_page_index = -1, pc, step = 1, nextupdate = 0, i;
+	int result;
+	struct suspend_module_ops *first_filter = suspend_get_next_filter(NULL);
+
+	current_page_index = get_next_bit_on(*pageflags, -1);
+
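+	/* pc marks the next 20% boundary for console progress output. */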
+	pc = finish_at / 5;
+
+	/* Read the pages */
+	for (i = 0; i < finish_at; i++) {
+		int was_mapped = 0;
+		struct page *page = pfn_to_page(current_page_index);
+
+		/* Status */
+		if ((i+base) >= nextupdate)
+			nextupdate = suspend_update_status(i+base, barmax,
+				" %d/%d MB ", MB(base+i+1), MB(barmax));
+
+		if ((i + 1) == pc) {
+			printk("%d%%...", 20 * step);
+			step++;
+			pc = finish_at * step / 5;
+		}
+		
+		was_mapped = suspend_map_kernel_page(page, 1);
+		if (write)
+			result = first_filter->ops.filter.write_chunk(page);
+		else
+			result = first_filter->ops.filter.read_chunk(page,
+					SUSPEND_ASYNC);
+		if (!was_mapped)
+			suspend_map_kernel_page(page, 0);
+
+		if (result) {
+			if (write) {
+				printk("Write chunk returned %d.\n", result);
+				abort_suspend("Failed to write a chunk of the "
+					"image.");
+				return result;
+			} else
+				panic("Failed to read chunk %d/%d of the image. (%d)",
+					i, finish_at, result);
+		}
+
+		/* Interactivity*/
+		check_shift_keys(0, NULL);
+
+		if (test_result_state(SUSPEND_ABORTED) && write)
+			return 1;
+
+		/* Prepare next */
+		current_page_index = get_next_bit_on(*pageflags,
+				current_page_index);
+	}
+
+	printk("done.\n");
+
+	suspend_update_status(base + finish_at, barmax, " %d/%d MB ",
+			MB(base + finish_at), MB(barmax));
+	return 0;
+}
+
+/* write_pageset()
+ *
+ * Description:	Write a pageset to disk.
+ * Arguments:	pagedir:	Pointer to the pagedir to be saved.
+ * 		whichtowrite:	Controls what debugging output is printed.
+ * Returns:	Zero on success or -1 on failure.
+ */
+
+int write_pageset(struct pagedir *pagedir, int whichtowrite)
+{
+	int finish_at, base = 0, start_time, end_time;
+	int barmax = pagedir1.pageset_size + pagedir2.pageset_size;
+	long error = 0;
+	dyn_pageflags_t *pageflags;
+
+	/* 
+	 * Even if there is nothing to read or write, the writer
+	 * may need the init/cleanup for its housekeeping (eg.
+	 * pageset1 may start where pageset2 ends when writing).
+	 */
+	finish_at = pagedir->pageset_size;
+
+	if (whichtowrite == 1) {
+		suspend_prepare_status(DONT_CLEAR_BAR,
+				"Writing kernel & process data...");
+		base = pagedir2.pageset_size;
+		if (test_action_state(SUSPEND_TEST_FILTER_SPEED) ||
+		    test_action_state(SUSPEND_TEST_BIO))
+			pageflags = &pageset1_map;
+		else
+			pageflags = &pageset1_copy_map;
+	} else {
+		suspend_prepare_status(CLEAR_BAR, "Writing caches...");
+		pageflags = &pageset2_map;
+		bytes_in = bytes_out = 0;
+	}	
+	
+	start_time = jiffies;
+
+	if (!rw_init_modules(1, whichtowrite))
+		error = do_rw_loop(1, finish_at, pageflags, base, barmax);
+
+	if (rw_cleanup_modules(1)) {
+		abort_suspend("Failed to cleanup after writing.");
+		error = 1;
+	}
+
+	/* Statistics */
+	end_time = jiffies;
+	
+	if ((end_time - start_time) && (!test_result_state(SUSPEND_ABORTED))) {
+		suspend_io_time[0][0] += finish_at;
+		suspend_io_time[0][1] += (end_time - start_time);
+	}
+
+	return error;
+}
+
+/* read_pageset()
+ *
+ * Description:	Read a pageset from disk.
+ * Arguments:	pagedir:	Pointer to the pagedir to be loaded.
+ * 		whichtoread:	Controls what debugging output is printed.
+ * 		overwrittenpagesonly: Whether to read the whole pageset or
+ * 		only part.
+ * Returns:	Zero on success or -1 on failure.
+ */
+
+static int read_pageset(struct pagedir *pagedir, int whichtoread,
+		int overwrittenpagesonly)
+{
+	int result = 0, base = 0, start_time, end_time;
+	int finish_at = pagedir->pageset_size;
+	int barmax = pagedir1.pageset_size + pagedir2.pageset_size;
+	dyn_pageflags_t *pageflags;
+
+	if (whichtoread == 1) {
+		suspend_prepare_status(CLEAR_BAR,
+				"Reading kernel & process data...");
+		pageflags = &pageset1_copy_map;
+	} else {
+		suspend_prepare_status(DONT_CLEAR_BAR, "Reading caches...");
+		if (overwrittenpagesonly)
+			barmax = finish_at = min(pagedir1.pageset_size, 
+						 pagedir2.pageset_size);
+		else {
+			base = pagedir1.pageset_size;
+		}
+		pageflags = &pageset2_map;
+	}	
+	
+	start_time = jiffies;
+
+	if (rw_init_modules(0, whichtoread)) {
+		suspend_active_writer->ops.writer.invalidate_image();
+		result = 1;
+	} else
+		result = do_rw_loop(0, finish_at, pageflags, base, barmax);
+
+	if (rw_cleanup_modules(0)) {
+		abort_suspend("Failed to cleanup after reading.");
+		result = 1;
+	}
+
+	/* Statistics */
+	end_time = jiffies;
+
+	if ((end_time - start_time) && (!test_result_state(SUSPEND_ABORTED))) {
+		suspend_io_time[1][0] += finish_at;
+		suspend_io_time[1][1] += (end_time - start_time);
+	}
+
+	return result;
+}
+
+/* write_module_configs()
+ *
+ * Description:	Store the configuration for each module in the image header.
+ * Returns:	Int: Zero on success, Error value otherwise.
+ */
+static int write_module_configs(void)
+{
+	struct suspend_module_ops *this_module;
+	char *buffer = (char *) get_zeroed_page(GFP_ATOMIC);
+	int len, index = 1;
+	struct suspend_module_header suspend_module_header;
+
+	if (!buffer) {
+		printk("Failed to allocate a buffer for saving "
+				"module configuration info.\n");
+		return -ENOMEM;
+	}
+		
+	/* 
+	 * We have to know which data goes with which module, so we
+	 * write at least a length of zero for every module. Note that
+	 * we are also assuming every module's config data takes
+	 * <= PAGE_SIZE.
+	 */
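+
+	/*
+	 * Resulting header stream (one entry per module, in registration
+	 * order, as written below):
+	 *
+	 *   [suspend_module_header][int len][len bytes of config data]...
+	 *
+	 * terminated by a suspend_module_header with name[0] == '\0'.
+	 */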
+
+	/* For each module (in registration order) */
+	list_for_each_entry(this_module, &suspend_modules, module_list) {
+
+		/* Get the data from the module */
+		len = 0;
+		if (this_module->save_config_info)
+			len = this_module->save_config_info(buffer);
+
+		/* Save the details of the module */
+		suspend_module_header.disabled = this_module->disabled;
+		suspend_module_header.type = this_module->type;
+		suspend_module_header.index = index++;
+		strncpy(suspend_module_header.name, this_module->name, 
+					sizeof(suspend_module_header.name));
+		suspend_active_writer->ops.writer.write_header_chunk(
+				(char *) &suspend_module_header,
+				sizeof(suspend_module_header));
+
+		/* Save the size of the data and any data returned */
+		suspend_active_writer->ops.writer.write_header_chunk((char *) &len,
+				sizeof(int));
+		if (len)
+			suspend_active_writer->ops.writer.write_header_chunk(
+					buffer, len);
+	}
+
+	/* Write a blank header to terminate the list */
+	suspend_module_header.name[0] = '\0';
+	suspend_active_writer->ops.writer.write_header_chunk(
+			(char *) &suspend_module_header,
+			sizeof(suspend_module_header));
+
+	free_page((unsigned long) buffer);
+	return 0;
+}
+
+/* read_module_configs()
+ *
+ * Description:	Reload module configurations from the image header.
+ * Returns:	Int. Zero on success, error value otherwise.
+ */
+
+static int read_module_configs(void)
+{
+	struct suspend_module_ops *this_module;
+	char *buffer = (char *) get_zeroed_page(GFP_ATOMIC);
+	int len, result = 0;
+	struct suspend_module_header suspend_module_header;
+
+	if (!buffer) {
+		printk("Failed to allocate a buffer for reloading module "
+				"configuration info.\n");
+		return -ENOMEM;
+	}
+		
+	/* All modules are initially disabled. That way, if we have a module
+	 * loaded now that wasn't loaded when we suspended, it won't be used
+	 * in trying to read the data.
+	 */
+	list_for_each_entry(this_module, &suspend_modules, module_list)
+		this_module->disabled = 1;
+	
+	/* Get the first module header */
+	result = suspend_active_writer->ops.writer.read_header_chunk(
+			(char *) &suspend_module_header, sizeof(suspend_module_header));
+	if (!result) {
+		printk("Failed to read the next module header.\n");
+		free_page((unsigned long) buffer);
+		return -EINVAL;
+	}
+
+	/* For each module (in registration order) */
+	while (suspend_module_header.name[0]) {
+
+		/* Find the module */
+		this_module = suspend_find_module_given_name(suspend_module_header.name);
+
+		if (!this_module) {
+			/* 
+			 * Is it used? Only need to worry about filters. The active
+			 * writer must be loaded!
+			 */
+			if ((!suspend_module_header.disabled) &&
+			    (suspend_module_header.type == FILTER_PLUGIN)) {
+				suspend_early_boot_message(1, SUSPEND_CONTINUE_REQ,
+					"It looks like we need module %s for "
+					"reading the image but it hasn't been "
+					"registered.\n",
+					suspend_module_header.name);
+				if (!(test_suspend_state(SUSPEND_CONTINUE_REQ))) {
+					suspend_active_writer->ops.writer.invalidate_image();
+					result = -EINVAL;
+					noresume_reset_modules();
+					free_page((unsigned long) buffer);
+					return -EINVAL;
+				}
+			} else
+				printk("Plugin %s configuration data found, but the module "
+					"hasn't registered. Looks like it was disabled, so "
+					"we're ignoring it's data.",
+					suspend_module_header.name);
+		}
+		
+		/* Get the length of the data (if any) */
+		result = suspend_active_writer->ops.writer.read_header_chunk(
+				(char *) &len, sizeof(int));
+		if (!result) {
+			printk("Failed to read the length of the module %s's"
+					" configuration data.\n",
+					suspend_module_header.name);
+			free_page((unsigned long) buffer);
+			return -EINVAL;
+		}
+
+		/* Read any data and pass to the module (if we found one) */
+		if (len) {
+			suspend_active_writer->ops.writer.read_header_chunk(buffer, len);
+			if (this_module) {
+				if (!this_module->load_config_info) {
+					printk("Huh? Plugin %s appears to have a "
+						"save_config_info, but not a "
+						"load_config_info function!\n",
+						this_module->name);
+				} else
+					this_module->load_config_info(buffer, len);
+			}
+		}
+
+		if (this_module) {
+			/* Now move this module to the tail of its lists. This will put it
+			 * in order. Any new modules will end up at the top of the lists.
+			 * They should have been set to disabled when loaded (people will
+			 * normally not edit an initrd to load a new module and then
+			 * suspend without using it!).
+			 */
+
+			suspend_move_module_tail(this_module);
+
+			/* 
+			 * We apply the disabled state; modules don't need to save whether they
+			 * were disabled and if they do, we override them anyway.
+			 */
+			this_module->disabled = suspend_module_header.disabled;
+		}
+
+		/* Get the next module header */
+		result = suspend_active_writer->ops.writer.read_header_chunk(
+				(char *) &suspend_module_header,
+				sizeof(suspend_module_header));
+
+		if (!result) {
+			printk("Failed to read the next module header.\n");
+			free_page((unsigned long) buffer);
+			return -EINVAL;
+		}
+
+	}
+
+	free_page((unsigned long) buffer);
+	return 0;
+}
+
+/* write_image_header()
+ *
+ * Description:	Write the image header after writing the image proper.
+ * Returns:	Int. Zero on success or -1 on failure.
+ */
+
+int write_image_header(void)
+{
+	int ret;
+	int total = pagedir1.pageset_size + pagedir2.pageset_size + 2;
+	char *header_buffer = NULL;
+
+	/* Now prepare to write the header */
+	if ((ret = suspend_active_writer->ops.writer.write_header_init())) {
+		abort_suspend("Active writer's write_header_init"
+				" function failed.");
+		goto write_image_header_abort;
+	}
+
+	/* Get a buffer */
+	header_buffer = (char *) get_zeroed_page(GFP_ATOMIC);
+	if (!header_buffer) {
+		abort_suspend("Out of memory when trying to get page "
+				"for header!");
+		goto write_image_header_abort;
+	}
+
+	/* Write suspend header */
+	fill_suspend_header((struct suspend_header *) header_buffer);
+	suspend_active_writer->ops.writer.write_header_chunk(header_buffer,
+			sizeof(struct suspend_header));
+
+	free_page((unsigned long) header_buffer);
+
+	/* Write module configurations */
+	if ((ret = write_module_configs())) {
+		abort_suspend("Failed to write module configs.");
+		goto write_image_header_abort;
+	}
+
+	save_dyn_pageflags(pageset1_map);
+
+	if (suspend_active_writer->ops.writer.serialise_extents &&
+	    (ret = suspend_active_writer->ops.writer.serialise_extents())) {
+		abort_suspend("Active writer's prepare_save_extents "
+				"function failed.");
+		goto write_image_header_abort;
+	}
+
+	/* Flush data and let writer cleanup */
+	if (suspend_active_writer->ops.writer.write_header_cleanup()) {
+		abort_suspend("Failed to cleanup writing header.");
+		goto write_image_header_abort_no_cleanup;
+	}
+
+	if (test_result_state(SUSPEND_ABORTED))
+		goto write_image_header_abort_no_cleanup;
+
+	suspend_message(SUSPEND_IO, SUSPEND_VERBOSE, 1, "|\n");
+	suspend_update_status(total, total, NULL);
+
+	return 0;
+
+write_image_header_abort:
+	suspend_active_writer->ops.writer.write_header_cleanup();
+write_image_header_abort_no_cleanup:
+	return -1;
+}
+
+/* sanity_check()
+ *
+ * Description:	Perform a few checks, seeking to ensure that the kernel being
+ * 		booted matches the one suspended. They need to match so we can
+ * 		be _sure_ things will work. It is not absolutely impossible for
+ * 		resuming from a different kernel to work, just not assured.
+ * Arguments:	Struct suspend_header. The header which was saved at suspend
+ * 		time.
+ */
+static char *sanity_check(struct suspend_header *sh)
+{
+	if (sh->version_code != LINUX_VERSION_CODE)
+		return "Incorrect kernel version.";
+	
+	if (sh->num_physpages != num_physpages)
+		return "Incorrect memory size.";
+
+	if (strncmp(sh->machine, system_utsname.machine, 65))
+		return "Incorrect machine type.";
+
+	if (strncmp(sh->version, system_utsname.version, 65))
+		return "Right kernel version but wrong build number.";
+
+	if (sh->page_size != PAGE_SIZE)
+		return "Incorrect PAGE_SIZE.";
+
+	if ((sh->root_fs == current->fs->rootmnt->mnt_sb->s_dev) &&
+	    (!test_suspend_state(SUSPEND_IGNORE_ROOTFS)))
+		return "Root filesystem has been mounted prior to trying to resume.";
+
+	return 0;
+}
+
+/* __read_pageset1
+ *
+ * Description:	Test for the existence of an image and attempt to load it.
+ * Returns:	Int. Zero if an image was found and pageset1 successfully
+ * 		loaded; an error value otherwise.
+ */
+static int __read_pageset1(void)
+{			
+	int i, result = 0;
+	char *header_buffer = (char *) get_zeroed_page(GFP_ATOMIC), *sanity_error = NULL;
+	struct suspend_header *suspend_header;
+
+	if (!header_buffer)
+		return -ENOMEM;
+	
+	/* Check for an image */
+	if (!(result = suspend_active_writer->ops.writer.image_exists())) {
+		result = -ENODATA;
+		noresume_reset_modules();
+		goto out;
+	}
+
+	/* Check for noresume command line option */
+	if (test_suspend_state(SUSPEND_NORESUME_SPECIFIED)) {
+		suspend_active_writer->ops.writer.invalidate_image();
+		result = -EINVAL;
+		noresume_reset_modules();
+		goto out;
+	}
+
+	/* Check whether we've resumed before */
+	if (test_suspend_state(SUSPEND_RESUMED_BEFORE)) {
+		int resumed_before_default = 0;
+		if (test_suspend_state(SUSPEND_RETRY_RESUME))
+			resumed_before_default = SUSPEND_CONTINUE_REQ;
+		suspend_early_boot_message(1, resumed_before_default, NULL);
+		clear_suspend_state(SUSPEND_RETRY_RESUME);
+		if (!(test_suspend_state(SUSPEND_CONTINUE_REQ))) {
+			suspend_active_writer->ops.writer.invalidate_image();
+			result = -EINVAL;
+			noresume_reset_modules();
+			goto out;
+		}
+	}
+
+	clear_suspend_state(SUSPEND_CONTINUE_REQ);
+
+	/* 
+	 * Prepare the active writer for reading the image header. The
+	 * active writer might read its own configuration.
+	 * 
+	 * NB: This call may never return: we might find a signature for a
+	 * different image, warn the user and have them choose to reboot
+	 * (eg. if the device ids look erroneous (2.4 vs 2.6), or if the
+	 * image was stored on a network connection that is now
+	 * unavailable).
+	 */
+
+	if ((result = suspend_active_writer->ops.writer.read_header_init())) {
+		noresume_reset_modules();
+		goto out;
+	}
+	
+	/* Read suspend header */
+	if ((result = suspend_active_writer->ops.writer.read_header_chunk(
+			header_buffer, sizeof(struct suspend_header))) < 0) {
+		noresume_reset_modules();
+		goto out;
+	}
+	
+	suspend_header = (struct suspend_header *) header_buffer;
+
+	/*
+	 * NB: This call may also result in a reboot rather than returning.
+	 */
+
+	if ((sanity_error = sanity_check(suspend_header)) &&
+	    suspend_early_boot_message(1, SUSPEND_CONTINUE_REQ, sanity_error)) {
+		suspend_active_writer->ops.writer.invalidate_image();
+		result = -EINVAL;
+		noresume_reset_modules();
+		goto out;
+	}
+
+	/*
+	 * We have an image and it looks like it will load okay.
+	 */
+
+	/* Get metadata from header. Don't override commandline parameters.
+	 *
+	 * We don't need to save the image size limit because it's not used
+	 * during resume and will be restored with the image anyway.
+	 */
+	
+	suspend_orig_mem_free = suspend_header->orig_mem_free;
+	memcpy((char *) &pagedir1,
+		(char *) &suspend_header->pagedir, sizeof(pagedir1));
+	suspend_result = suspend_header->param0;
+	if (!test_suspend_state(SUSPEND_ACT_USED))
+		suspend_action = suspend_header->param1;
+	if (!test_suspend_state(SUSPEND_DBG_USED))
+		suspend_debug_state = suspend_header->param2;
+	if (!test_suspend_state(SUSPEND_LVL_USED))
+		suspend_default_console_level = suspend_header->param3;
+	clear_suspend_state(SUSPEND_IGNORE_LOGLEVEL);
+	pagedir2.pageset_size = suspend_header->pageset_2_size;
+	for (i = 0; i < 4; i++)
+		suspend_io_time[i/2][i%2] =
+			suspend_header->io_time[i/2][i%2];
+
+	/* Read module configurations */
+	if ((result = read_module_configs())) {
+		noresume_reset_modules();
+		pagedir1.pageset_size =
+			pagedir2.pageset_size = 0;
+		goto out;
+	}
+
+	suspend_prepare_console();
+
+	check_shift_keys(1, "About to read original pageset1 locations.");
+	/* Read original pageset1 locations. These are the addresses we
+	 * can't use for the data to be restored. */
+	allocate_dyn_pageflags(&pageset1_map);
+	load_dyn_pageflags(pageset1_map);
+
+	allocate_dyn_pageflags(&conflicting_pages_map);
+
+	set_suspend_state(SUSPEND_NOW_RESUMING);
+
+	/* Relocate them so they're not overwritten while we're using them
+	 * to copy the original contents back. */
+	relocate_dyn_pageflags(&pageset1_map);
+	relocate_dyn_pageflags(&conflicting_pages_map);
+	
+	allocate_dyn_pageflags(&pageset1_copy_map);
+	relocate_dyn_pageflags(&pageset1_copy_map);
+
+	/* Read extent pages */
+	if (suspend_active_writer->ops.writer.load_extents &&
+	    (result = suspend_active_writer->ops.writer.load_extents())) {
+		noresume_reset_modules();
+		abort_suspend("Active writer's load_extents "
+				"function failed.");
+		goto out_reset_console;
+	}
+
+	/* Clean up after reading the header */
+	if ((result = suspend_active_writer->ops.writer.read_header_cleanup())) {
+		noresume_reset_modules();
+		goto out_reset_console;
+	}
+
+	check_shift_keys(1, "About to read pagedir.");
+
+	/* 
+	 * Get the addresses of pages into which we will load the kernel to
+	 * be copied back
+	 */
+	if (suspend_get_pageset1_load_addresses()) {
+		result = -ENOMEM;
+		noresume_reset_modules();
+		goto out_reset_console;
+	}
+
+	/* Read the original kernel back */
+	check_shift_keys(1, "About to read pageset 1.");
+
+	if (read_pageset(&pagedir1, 1, 0)) {
+		suspend_prepare_status(CLEAR_BAR, "Failed to read pageset 1.");
+		result = -EPERM;
+		noresume_reset_modules();
+		goto out_reset_console;
+	}
+
+	check_shift_keys(1, "About to restore original kernel.");
+	result = 0;
+
+	if (!test_action_state(SUSPEND_KEEP_IMAGE) &&
+	    suspend_active_writer->ops.writer.mark_resume_attempted)
+		suspend_active_writer->ops.writer.mark_resume_attempted();
+
+out:
+	free_page((unsigned long) header_buffer);
+	return result;
+
+out_reset_console:
+	free_dyn_pageflags(&pageset1_map);
+	free_dyn_pageflags(&pageset1_copy_map);
+	free_dyn_pageflags(&conflicting_pages_map);
+	suspend_cleanup_console();
+	goto out;
+}
+
+/* read_pageset1()
+ *
+ * Description:	Attempt to read the header and pageset1 of a suspend image.
+ * 		Handle the outcome, complaining where appropriate.
+ */
+
+int read_pageset1(void)
+{
+	int error;
+
+	error = __read_pageset1();
+
+	switch (error) {
+		case 0:
+		case -ENODATA:
+		case -EINVAL:	/* non-fatal error */
+			return error;
+		case -EIO:
+			printk(KERN_CRIT name_suspend "I/O error\n");
+			break;
+		case -ENOENT:
+			printk(KERN_CRIT name_suspend "No such file or directory\n");
+			break;
+		case -EPERM:
+			printk(KERN_CRIT name_suspend "Sanity check error\n");
+			break;
+		default:
+			printk(KERN_CRIT name_suspend "Error %d resuming\n", error);
+			break;
+	}
+	abort_suspend("Error %d in read_pageset1",error);
+	return error;
+}
+
+/*
+ * get_have_image_data()
+ *
+ * Description:	Check whether an image exists and return a freshly
+ * 		allocated page describing it: "0\n" if there is no
+ * 		readable image, otherwise "1\n" followed by the machine
+ * 		and kernel version strings from the image header.
+ */
+
+char *get_have_image_data(void)
+{
+	char *output_buffer = (char *) get_zeroed_page(GFP_ATOMIC);
+	struct suspend_header *suspend_header;
+
+	if (!output_buffer) {
+		printk("Output buffer null.\n");
+		return NULL;
+	}
+
+	/* Check for an image */
+	if (!suspend_active_writer->ops.writer.image_exists() ||
+	    suspend_active_writer->ops.writer.read_header_init() ||
+	    suspend_active_writer->ops.writer.read_header_chunk(
+			output_buffer, sizeof(struct suspend_header)) !=
+	    		sizeof(struct suspend_header)) {
+		sprintf(output_buffer, "0\n");
+		goto out;
+	}
+
+	suspend_header = (struct suspend_header *) output_buffer;
+
+	sprintf(output_buffer, "1\n%s\n%s\n",
+			suspend_header->machine,
+			suspend_header->version);
+
+	/* Check whether we've resumed before */
+	if (test_suspend_state(SUSPEND_RESUMED_BEFORE))
+		strcat(output_buffer, "Resumed before.\n");
+
+out:
+	noresume_reset_modules();
+	return output_buffer;
+}
+
+/* read_pageset2()
+ *
+ * Description:	Read in part or all of pageset2 of an image, depending upon
+ * 		whether we are suspending and have only overwritten a portion
+ * 		with pageset1 pages, or are resuming and need to read them 
+ * 		all.
+ * Arguments:	Int. Boolean. Read only pages which would have been
+ * 		overwritten by pageset1?
+ * Returns:	Int. Zero if no error, otherwise the error value.
+ */
+int read_pageset2(int overwrittenpagesonly)
+{
+	int result = 0;
+
+	if (!pagedir2.pageset_size)
+		return 0;
+
+	result = read_pageset(&pagedir2, 2, overwrittenpagesonly);
+
+	suspend_update_status(100, 100, NULL);
+	check_shift_keys(1, "Pagedir 2 read.");
+
+	return result;
+}
diff -urN oldtree/kernel/power/io.h newtree/kernel/power/io.h
--- oldtree/kernel/power/io.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/io.h	2006-03-08 15:22:33.281507250 +0000
@@ -0,0 +1,38 @@
+/*
+ * kernel/power/io.h
+ */
+
+#include "pagedir.h"
+
+/* Non-module data saved in our image header */
+struct suspend_header {
+	u32 version_code;
+	unsigned long num_physpages;
+	unsigned long orig_mem_free;
+	char machine[65];
+	char version[65];
+	int num_cpus;
+	int page_size;
+	int pageset_2_size;
+	int param0;
+	int param1;
+	int param2;
+	int param3;
+	int progress0;
+	int progress1;
+	int progress2;
+	int progress3;
+	int io_time[2][2];
+	struct pagedir pagedir;
+	dev_t root_fs;
+};
+
+extern int write_pageset(struct pagedir *pagedir, int whichtowrite);
+extern int write_image_header(void);
+extern int read_pageset1(void);
+extern int read_pageset2(int overwrittenpagesonly);
+
+extern int attempt_to_parse_resume_device(void);
+extern void attempt_to_parse_resume_device2(void);
+extern dev_t name_to_dev_t(char *line);
+extern __nosavedata unsigned long bytes_in, bytes_out;
diff -urN oldtree/kernel/power/main.c newtree/kernel/power/main.c
--- oldtree/kernel/power/main.c	2006-03-08 18:47:16.957189000 +0000
+++ newtree/kernel/power/main.c	2006-03-08 15:22:33.285507500 +0000
@@ -9,6 +9,7 @@
  */
 
 #include <linux/suspend.h>
+#include <linux/freezer.h>
 #include <linux/kobject.h>
 #include <linux/string.h>
 #include <linux/delay.h>
@@ -95,7 +96,7 @@
 	if (pm_ops->finish)
 		pm_ops->finish(state);
  Thaw:
-	thaw_processes();
+	thaw_processes(FREEZER_ALL_THREADS);
  Enable_cpu:
 	enable_nonboot_cpus();
 	pm_restore_console();
@@ -103,7 +104,7 @@
 }
 
 
-static int suspend_enter(suspend_state_t state)
+int suspend_enter(suspend_state_t state)
 {
 	int error = 0;
 	unsigned long flags;
@@ -133,7 +134,7 @@
 static void suspend_finish(suspend_state_t state)
 {
 	device_resume();
-	thaw_processes();
+	thaw_processes(FREEZER_ALL_THREADS);
 	enable_nonboot_cpus();
 	if (pm_ops && pm_ops->finish)
 		pm_ops->finish(state);
@@ -146,7 +147,7 @@
 static char *pm_states[PM_SUSPEND_MAX] = {
 	[PM_SUSPEND_STANDBY]	= "standby",
 	[PM_SUSPEND_MEM]	= "mem",
-#ifdef CONFIG_SOFTWARE_SUSPEND
+#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_SUSPEND2)
 	[PM_SUSPEND_DISK]	= "disk",
 #endif
 };
@@ -177,7 +178,7 @@
 
 static int enter_state(suspend_state_t state)
 {
-	int error;
+	int error = 0;
 
 	if (!valid_state(state))
 		return -ENODEV;
diff -urN oldtree/kernel/power/modules.c newtree/kernel/power/modules.c
--- oldtree/kernel/power/modules.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/modules.c	2006-03-08 15:22:33.285507500 +0000
@@ -0,0 +1,312 @@
+/*
+ * kernel/power/modules.c
+ *
+ * Copyright (C) 2004-2005 Nigel Cunningham <nigel@suspend2.net>
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/module.h>
+#include "suspend2.h"
+#include "modules.h"
+
+struct list_head suspend_filters, suspend_writers, suspend_modules;
+struct suspend_module_ops *suspend_active_writer;
+static int suspend_num_filters;
+int suspend_num_writers, suspend_num_modules;
+
+/*
+ * suspend_header_storage_for_modules
+ *
+ * Returns the amount of space needed to store configuration
+ * data needed by the modules prior to copying back the original
+ * kernel. We can exclude data for pageset2 because it will be
+ * available anyway once the kernel is copied back.
+ */
+unsigned long suspend_header_storage_for_modules(void)
+{
+	struct suspend_module_ops *this_module;
+	unsigned long bytes = 0;
+	
+	list_for_each_entry(this_module, &suspend_modules, module_list) {
+		if (this_module->disabled)
+			continue;
+		if (this_module->storage_needed)
+			bytes += this_module->storage_needed();
+	}
+
+	return bytes;
+}
+
+/*
+ * suspend_memory_for_modules
+ *
+ * Returns the number of pages needed for the memory that modules
+ * request for doing their work during the cycle (byte counts are
+ * summed, then rounded up to pages).
+ */
+
+unsigned long suspend_memory_for_modules(void)
+{
+	unsigned long bytes = 0;
+	struct suspend_module_ops *this_module;
+
+	list_for_each_entry(this_module, &suspend_modules, module_list) {
+		if (this_module->disabled)
+			continue;
+		if (this_module->memory_needed)
+			bytes += this_module->memory_needed();
+	}
+
+	return ((bytes + PAGE_SIZE - 1) >> PAGE_SHIFT);
+}
+
+/* suspend_find_module_given_name
+ * Functionality :	Return a module (if found), given a pointer
+ * 			to its name
+ */
+
+struct suspend_module_ops *suspend_find_module_given_name(char *name)
+{
+	struct suspend_module_ops *this_module, *found_module = NULL;
+	
+	list_for_each_entry(this_module, &suspend_modules, module_list) {
+		if (!strcmp(name, this_module->name)) {
+			found_module = this_module;
+			break;
+		}			
+	}
+
+	return found_module;
+}
+
+/*
+ * suspend_print_module_debug_info
+ * Functionality   : Get debugging info from modules into a buffer.
+ */
+int suspend_print_module_debug_info(char *buffer, int buffer_size)
+{
+	struct suspend_module_ops *this_module;
+	int len = 0;
+
+	list_for_each_entry(this_module, &suspend_modules, module_list) {
+		if (this_module->disabled)
+			continue;
+		if (this_module->print_debug_info) {
+			int result;
+			result = this_module->print_debug_info(buffer + len, 
+					buffer_size - len);
+			len += result;
+		}
+	}
+
+	return len;
+}
+
+/*
+ * suspend_register_module
+ *
+ * Register a module.
+ */
+int suspend_register_module(struct suspend_module_ops *module)
+{
+	if (suspend_find_module_given_name(module->name))
+		return -EBUSY;
+
+	switch (module->type) {
+		case FILTER_PLUGIN:
+			list_add_tail(&module->ops.filter.filter_list,
+					&suspend_filters);
+			suspend_num_filters++;
+			break;
+
+		case WRITER_PLUGIN:
+			list_add_tail(&module->ops.writer.writer_list,
+					&suspend_writers);
+			suspend_num_writers++;
+			break;
+
+		case MISC_PLUGIN:
+			break;
+
+		default:
+			printk("Hmmm. Plugin '%s' has an invalid type."
+				" It has been ignored.\n", module->name);
+			return -EINVAL;
+	}
+	list_add_tail(&module->module_list, &suspend_modules);
+	suspend_num_modules++;
+
+	return 0;	
+}
+
+/*
+ * suspend_unregister_module
+ *
+ * Remove a module.
+ */
+void suspend_unregister_module(struct suspend_module_ops *module)
+{
+	switch (module->type) {
+		case FILTER_PLUGIN:
+			list_del(&module->ops.filter.filter_list);
+			suspend_num_filters--;
+			break;
+
+		case WRITER_PLUGIN:
+			list_del(&module->ops.writer.writer_list);
+			suspend_num_writers--;
+			if (suspend_active_writer == module) {
+				suspend_active_writer = NULL;
+				set_suspend_state(SUSPEND_DISABLED);
+			}
+			break;
+		
+		case MISC_PLUGIN:
+			break;
+
+		default:
+			printk("Hmmm. Plugin '%s' has an invalid type."
+				" It has been ignored.\n", module->name);
+			return;
+	}
+	list_del(&module->module_list);
+	suspend_num_modules--;
+}
+
+/*
+ * suspend_move_module_tail
+ *
+ * Rearrange modules when reloading the config.
+ */
+void suspend_move_module_tail(struct suspend_module_ops *module)
+{
+	switch (module->type) {
+		case FILTER_PLUGIN:
+			if (suspend_num_filters > 1)
+				list_move_tail(&module->ops.filter.filter_list,
+						&suspend_filters);
+			break;
+
+		case WRITER_PLUGIN:
+			if (suspend_num_writers > 1)
+				list_move_tail(&module->ops.writer.writer_list,
+						&suspend_writers);
+			break;
+		
+		case MISC_PLUGIN:
+			break;
+		default:
+			printk("Hmmm. Plugin '%s' has an invalid type."
+				" It has been ignored.\n", module->name);
+			return;
+	}
+	if ((suspend_num_filters + suspend_num_writers) > 1)
+		list_move_tail(&module->module_list, &suspend_modules);
+}
+
+/*
+ * suspend_initialise_modules
+ *
+ * Get ready to do some work!
+ */
+int suspend_initialise_modules(int starting_cycle)
+{
+	struct suspend_module_ops *this_module;
+	int result;
+	
+	list_for_each_entry(this_module, &suspend_modules, module_list) {
+		if (this_module->disabled)
+			continue;
+		if (this_module->initialise) {
+			suspend_message(SUSPEND_MEMORY, SUSPEND_MEDIUM, 1,
+				"Initialising module %s.\n",
+				this_module->name);
+			if ((result = this_module->initialise(starting_cycle))) {
+				printk("%s didn't initialise okay.\n",
+						this_module->name);
+				return result;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/* 
+ * suspend_cleanup_modules
+ *
+ * Tell modules the work is done.
+ */
+void suspend_cleanup_modules(int finishing_cycle)
+{
+	struct suspend_module_ops *this_module;
+	
+	list_for_each_entry(this_module, &suspend_modules, module_list) {
+		if (this_module->disabled)
+			continue;
+		if (this_module->cleanup) {
+			suspend_message(SUSPEND_MEMORY, SUSPEND_MEDIUM, 1,
+				"Cleaning up module %s.\n",
+				this_module->name);
+			this_module->cleanup(finishing_cycle);
+		}
+	}
+}
+
+/*
+ * suspend_get_next_filter
+ *
+ * Get the next filter in the pipeline.
+ */
+struct suspend_module_ops *suspend_get_next_filter(struct suspend_module_ops *filter_sought)
+{
+	struct suspend_module_ops *last_filter = NULL, *this_filter = NULL;
+
+	list_for_each_entry(this_filter, &suspend_filters, ops.filter.filter_list) {
+		if (this_filter->disabled)
+			continue;
+		if ((last_filter == filter_sought) || (!filter_sought))
+			return this_filter;
+		last_filter = this_filter;
+	}
+
+	return suspend_active_writer;
+}
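+
+/*
+ * Illustrative pipeline walk (a sketch): passing NULL yields the first
+ * enabled filter, and the active writer terminates the chain, so a
+ * caller might do:
+ *
+ *	struct suspend_module_ops *f = suspend_get_next_filter(NULL);
+ *
+ *	while (f != suspend_active_writer) {
+ *		...use f...
+ *		f = suspend_get_next_filter(f);
+ *	}
+ */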
+
+/* suspend_get_modules
+ * 
+ * Take a reference to modules so they can't go away under us.
+ */
+
+int suspend_get_modules(void)
+{
+	struct suspend_module_ops *this_module;
+	
+	list_for_each_entry(this_module, &suspend_modules, module_list) {
+		if (!try_module_get(this_module->module)) {
+			/* Failed! Reverse gets and return error */
+			struct suspend_module_ops *this_module2;
+			list_for_each_entry(this_module2, &suspend_modules, module_list) {
+				if (this_module == this_module2)
+					return -EINVAL;
+				module_put(this_module2->module);
+			}
+		}
+	}
+
+	return 0;
+}
+
+/* suspend_put_modules
+ *
+ * Release our references to modules we used.
+ */
+
+void suspend_put_modules(void)
+{
+	struct suspend_module_ops *this_module;
+	
+	list_for_each_entry(this_module, &suspend_modules, module_list) {
+		module_put(this_module->module);
+	}
+}
diff -urN oldtree/kernel/power/modules.h newtree/kernel/power/modules.h
--- oldtree/kernel/power/modules.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/modules.h	2006-03-08 16:41:35.077851000 +0000
@@ -0,0 +1,179 @@
+/*
+ * kernel/power/modules.h
+ *
+ * Copyright (C) 2004-2005 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * This file is released under the GPLv2.
+ *
+ * It contains declarations for modules. Plugins are additions to
+ * suspend2 that provide facilities such as image compression or
+ * encryption, backends for storage of the image and user interfaces.
+ *
+ */
+
+/* This is the maximum size we store in the image header for a module name */
+#define SUSPEND_MAX_PLUGIN_NAME_LENGTH 30
+
+/* Per-module metadata */
+struct suspend_module_header {
+	char name[SUSPEND_MAX_PLUGIN_NAME_LENGTH];
+	int disabled;
+	int type;
+	int index;
+	int data_length;
+	unsigned long signature;
+};
+
+extern int suspend_num_modules, suspend_num_writers;
+
+enum {
+	FILTER_PLUGIN,
+	WRITER_PLUGIN,
+	MISC_PLUGIN, /* eg. block writer support */
+	CHECKSUM_PLUGIN
+};
+
+enum {
+	SUSPEND_ASYNC,
+	SUSPEND_SYNC
+};
+
+struct suspend_filter_ops {
+	/* Writing the image proper */
+	int (*write_chunk) (struct page *buffer_page);
+
+	/* Reading the image proper */
+	int (*read_chunk) (struct page *buffer_page, int sync);
+
+	/* Reset module if image exists but reading aborted */
+	void (*noresume_reset) (void);
+	struct list_head filter_list;
+};
+
+struct suspend_writer_ops {
+	
+	/* Writing the image proper */
+	int (*write_chunk) (struct page *buffer_page);
+
+	/* Reading the image proper */
+	int (*read_chunk) (struct page *buffer_page, int sync);
+
+	/* Reset module if image exists but reading aborted */
+	void (*noresume_reset) (void);
+
+	/* Calls for allocating storage */
+
+	/* Maximum size of image we can save (incl. space already allocated).*/
+	int (*storage_available) (void);
+	
+	/* Amount of storage already allocated */
+	int (*storage_allocated) (void);
+
+	int (*release_storage) (void);
+	
+	/* 
+	 * Header space is allocated separately. Note that allocation
+	 * of space for the header might result in allocated space 
+	 * being stolen from the main pool if there is no unallocated
+	 * space. We have to be able to allocate enough space for
+	 * the header. We can eat memory to ensure there is enough
+	 * for the main pool.
+	 */
+	int (*allocate_header_space) (int space_requested);
+	int (*allocate_storage) (int space_requested);
+	
+	/* Read and write the metadata */	
+	int (*write_header_init) (void);
+	int (*write_header_chunk) (char *buffer_start, int buffer_size);
+	int (*write_header_cleanup) (void);
+
+	int (*read_header_init) (void);
+	int (*read_header_chunk) (char *buffer_start, int buffer_size);
+	int (*read_header_cleanup) (void);
+
+	/* Prepare metadata to be saved (relativise/absolutise extents) */
+	int (*serialise_extents) (void);
+	int (*load_extents) (void);
+	
+	/* Attempt to parse an image location */
+	int (*parse_sig_location) (char *buffer, int only_writer);
+
+	/* Determine whether image exists that we can restore */
+	int (*image_exists) (void);
+	
+	/* Mark the image as having tried to resume */
+	void (*mark_resume_attempted) (void);
+
+	/* Destroy image if one exists */
+	int (*invalidate_image) (void);
+	
+	/* Wait on I/O */
+	int (*wait_on_io) (int flush_all);
+
+	struct list_head writer_list;
+};
+
+struct suspend_module_ops {
+	/* Functions common to all modules */
+	int type;
+	char *name;
+	struct module *module;
+	int disabled;
+	struct list_head module_list;
+
+	/* Bytes */
+	unsigned long (*memory_needed) (void);
+	unsigned long (*storage_needed) (void);
+
+	int (*print_debug_info) (char *buffer, int size);
+	int (*save_config_info) (char *buffer);
+	void (*load_config_info) (char *buffer, int len);
+	
+	/* Initialise & cleanup - general routines called
+	 * at the start and end of a cycle. */
+	int (*initialise) (int starting_cycle);
+	void (*cleanup) (int finishing_cycle);
+
+	int (*write_init) (int stream_number);
+	int (*write_cleanup) (void);
+
+	int (*read_init) (int stream_number);
+	int (*read_cleanup) (void);
+
+	union {
+		struct suspend_filter_ops filter;
+		struct suspend_writer_ops writer;
+	} ops;
+};
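+
+/*
+ * Minimal registration sketch (illustrative; "my_filter_ops" and its
+ * name are hypothetical):
+ *
+ *	static struct suspend_module_ops my_filter_ops = {
+ *		.type = FILTER_PLUGIN,
+ *		.name = "my filter",
+ *		.module = THIS_MODULE,
+ *	};
+ *
+ *	suspend_register_module(&my_filter_ops);
+ */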
+
+extern struct suspend_module_ops *suspend_active_writer;
+extern struct list_head suspend_filters, suspend_writers, suspend_modules;
+
+extern void suspend_prepare_console_modules(void);
+extern void suspend_cleanup_console_modules(void);
+
+extern struct suspend_module_ops *suspend_find_module_given_name(char *name),
+	*suspend_get_next_filter(struct suspend_module_ops *);
+
+extern int suspend_register_module(struct suspend_module_ops *module);
+extern void suspend_move_module_tail(struct suspend_module_ops *module);
+
+extern unsigned long suspend_header_storage_for_modules(void);
+extern unsigned long suspend_memory_for_modules(void);
+
+extern int suspend_print_module_debug_info(char *buffer, int buffer_size);
+extern int suspend_register_module(struct suspend_module_ops *module);
+extern void suspend_unregister_module(struct suspend_module_ops *module);
+
+extern int suspend_initialise_modules(int starting_cycle);
+extern void suspend_cleanup_modules(int finishing_cycle);
+
+int suspend_get_modules(void);
+void suspend_put_modules(void);
+
+static inline void suspend_initialise_module_lists(void) {
+	INIT_LIST_HEAD(&suspend_filters);
+	INIT_LIST_HEAD(&suspend_writers);
+	INIT_LIST_HEAD(&suspend_modules);
+}
+
diff -urN oldtree/kernel/power/netlink.c newtree/kernel/power/netlink.c
--- oldtree/kernel/power/netlink.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/netlink.c	2006-03-08 15:22:33.289507750 +0000
@@ -0,0 +1,370 @@
+/*
+ * netlink.c
+ *
+ * Functions for communicating with a userspace helper via netlink.
+ */
+
+
+#include <linux/suspend.h>
+#include "netlink.h"
+
+#ifdef CONFIG_NET
+struct user_helper_data *uhd_list = NULL;
+
+/* 
+ * Refill our pool of skbs for use in emergencies (eg. when we're eating
+ * memory and a fresh skb can't be allocated).
+ */
+static void suspend_fill_skb_pool(struct user_helper_data *uhd)
+{
+	while (uhd->pool_level < uhd->pool_limit) {
+		struct sk_buff *new_skb =
+			alloc_skb(NLMSG_SPACE(uhd->skb_size), GFP_ATOMIC);
+
+		if (!new_skb)
+			break;
+
+		new_skb->next = uhd->emerg_skbs;
+		uhd->emerg_skbs = new_skb;
+		uhd->pool_level++;
+	}
+}
+
+/* 
+ * Try to allocate a single skb. If we can't get one, try to use one from
+ * our pool.
+ */
+static struct sk_buff *suspend_get_skb(struct user_helper_data *uhd)
+{
+	struct sk_buff *skb =
+		alloc_skb(NLMSG_SPACE(uhd->skb_size), GFP_ATOMIC);
+
+	if (skb)
+		return skb;
+
+	skb = uhd->emerg_skbs;
+	if (skb) {
+		uhd->pool_level--;
+		uhd->emerg_skbs = skb->next;
+		skb->next = NULL;
+	}
+
+	return skb;
+}
+
+static void put_skb(struct user_helper_data *uhd, struct sk_buff *skb)
+{
+	if (uhd->pool_level < uhd->pool_limit) {
+		/* Recycle into the emergency pool, keeping the level count
+		 * in step with suspend_get_skb's decrement. */
+		skb->next = uhd->emerg_skbs;
+		uhd->emerg_skbs = skb;
+		uhd->pool_level++;
+	} else
+		kfree_skb(skb);
+}
+
+
+static void suspend_notify_userspace(void* data)
+{
+	struct task_struct *t;
+	struct user_helper_data *uhd = (struct user_helper_data *) data;
+
+	BUG_ON(!uhd);
+
+	read_lock(&tasklist_lock);
+	if ((t = find_task_by_pid(uhd->pid)))
+		wake_up_process(t);
+	read_unlock(&tasklist_lock);
+}
+
+DECLARE_WORK(suspend_notify_userspace_work, suspend_notify_userspace, NULL);
+
+void suspend_send_netlink_message(struct user_helper_data *uhd,
+		int type, void* params, size_t len)
+{
+	struct sk_buff *skb;
+	struct nlmsghdr *nlh;
+	void *dest;
+
+	skb = suspend_get_skb(uhd);
+	if (!skb) {
+		printk("suspend_netlink: Can't allocate skb!\n");
+		return;
+	}
+
+	/* NLMSG_PUT contains a hidden goto nlmsg_failure */
+	nlh = NLMSG_PUT(skb, 0, uhd->sock_seq, type, len);
+	uhd->sock_seq++;
+
+	dest = NLMSG_DATA(nlh);
+	if (params && len > 0)
+		memcpy(dest, params, len);
+
+	netlink_unicast(uhd->nl, skb, uhd->pid, 0);
+
+	/* We may be in an interrupt context so defer waking up userspace */
+	suspend_notify_userspace_work.data = uhd;
+	schedule_work(&suspend_notify_userspace_work);
+
+	return;
+
+nlmsg_failure:
+	if (skb)
+		put_skb(uhd, skb);
+}
+
+#ifdef CONFIG_PM_DEBUG
+static int is_debugging = 1;
+#else
+static int is_debugging = 0;
+#endif
+
+static void send_whether_debugging(struct user_helper_data *uhd)
+{
+	suspend_send_netlink_message(uhd, NETLINK_MSG_IS_DEBUGGING,
+			&is_debugging, sizeof(int));
+}
+
+/*
+ * Set the PF_NOFREEZE flag on the given process to ensure it can run whilst we
+ * are suspending.
+ */
+static int nl_set_nofreeze(struct user_helper_data *uhd, int pid)
+{
+	struct task_struct *t;
+
+	read_lock(&tasklist_lock);
+	if ((t = find_task_by_pid(pid)) == NULL) {
+		read_unlock(&tasklist_lock);
+		printk("Strange. Can't find the userspace task %d.\n", pid);
+		return -EINVAL;
+	}
+
+	t->flags |= PF_NOFREEZE;
+
+	read_unlock(&tasklist_lock);
+	uhd->pid = pid;
+
+	suspend_send_netlink_message(uhd, NETLINK_MSG_NOFREEZE_ACK, NULL, 0);
+
+	return 0;
+}
+
+/*
+ * Called when the userspace process has informed us that it's ready to roll.
+ */
+static int nl_ready(struct user_helper_data *uhd, int version)
+{
+	if (version != uhd->interface_version) {
+		printk("%s userspace process using invalid interface version."
+				" Trying to continue without it.\n",
+				uhd->name);
+		if (uhd->not_ready)
+			uhd->not_ready();
+		return 1;
+	}
+
+	complete(&uhd->wait_for_process);
+
+	return 0;
+}
+
+static int suspend_nl_gen_rcv_msg(struct user_helper_data *uhd,
+		struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	int type;
+	int *data;
+	int err;
+
+	/* Let the more specific handler go first. It returns
+	 * 1 for valid messages that it doesn't know. */
+	if ((err = uhd->rcv_msg(skb, nlh)) != 1)
+		return err;
+	
+	type = nlh->nlmsg_type;
+
+	/* Only allow one task to receive NOFREEZE privileges */
+	if (type == NETLINK_MSG_NOFREEZE_ME && uhd->pid != -1) {
+		printk("Received extra nofreeze me requests.\n");
+		return -EBUSY;
+	}
+
+	data = (int*)NLMSG_DATA(nlh);
+
+	switch (type) {
+		case NETLINK_MSG_NOFREEZE_ME:
+			if ((err = nl_set_nofreeze(uhd, nlh->nlmsg_pid)) != 0)
+				return err;
+			break;
+		case NETLINK_MSG_GET_DEBUGGING:
+			send_whether_debugging(uhd);
+			break;
+		case NETLINK_MSG_READY:
+			if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) {
+				printk("Invalid ready mesage.\n");
+				return -EINVAL;
+			}
+			if ((err = nl_ready(uhd, *data)) != 0)
+				return err;
+			break;
+	}
+
+	return 0;
+}
+
+static void suspend_user_rcv_skb(struct user_helper_data *uhd,
+				  struct sk_buff *skb)
+{
+	int err;
+	struct nlmsghdr *nlh;
+
+	while (skb->len >= NLMSG_SPACE(0)) {
+		u32 rlen;
+
+		nlh = (struct nlmsghdr *) skb->data;
+		if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+			return;
+
+		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (rlen > skb->len)
+			rlen = skb->len;
+
+		if ((err = suspend_nl_gen_rcv_msg(uhd, skb, nlh)) != 0)
+			netlink_ack(skb, nlh, err);
+		else if (nlh->nlmsg_flags & NLM_F_ACK)
+			netlink_ack(skb, nlh, 0);
+		skb_pull(skb, rlen);
+	}
+}
+
+static void suspend_netlink_input(struct sock *sk, int len)
+{
+	struct user_helper_data *uhd = uhd_list;
+
+	while (uhd && uhd->netlink_id != sk->sk_protocol)
+		uhd = uhd->next;
+
+	BUG_ON(!uhd);
+
+	do {
+		struct sk_buff *skb;
+		while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+			suspend_user_rcv_skb(uhd, skb);
+			put_skb(uhd, skb);
+		}
+	} while (uhd->nl && uhd->nl->sk_receive_queue.qlen);
+}
+
+static int netlink_prepare(struct user_helper_data *uhd)
+{
+	uhd->next = uhd_list;
+	uhd_list = uhd;
+
+	uhd->sock_seq = 0x42c0ffee;
+	uhd->nl = netlink_kernel_create(uhd->netlink_id, 0,
+			suspend_netlink_input, THIS_MODULE);
+	if (!uhd->nl) {
+		printk("Failed to allocate netlink socket for %s.\n",
+				uhd->name);
+		return -ENOMEM;
+	}
+
+	suspend_fill_skb_pool(uhd);
+
+	return 0;
+}
+
+void suspend_netlink_close(struct user_helper_data *uhd)
+{
+	if (uhd->nl) {
+		sock_release(uhd->nl->sk_socket);
+		uhd->nl = NULL;
+	}
+
+	while (uhd->emerg_skbs) {
+		struct sk_buff *next = uhd->emerg_skbs->next;
+		kfree_skb(uhd->emerg_skbs);
+		uhd->emerg_skbs = next;
+	}
+}
+
+int suspend2_launch_userspace_program(char *command, int channel_no)
+{
+	int retval;
+	static char *envp[] = {
+			"HOME=/",
+			"TERM=linux",
+			"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+			NULL };
+	static char *argv[] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL };
+	char *channel = kmalloc(6, GFP_KERNEL);
+	int arg = 0, size;
+	char test_read[255];
+	char *orig_posn = command;
+
+	if (!strlen(orig_posn))
+		return 1;
+
+	/* Up to 7 args supported */
+	while (arg < 7) {
+		sscanf(orig_posn, "%s", test_read);
+		size = strlen(test_read);
+		if (!(size))
+			break;
+		argv[arg] = kmalloc(size + 1, GFP_ATOMIC);
+		strcpy(argv[arg], test_read);
+		orig_posn += size + 1;
+		*test_read = 0;
+		arg++;
+	}
+	
+	if (channel_no) {
+		sprintf(channel, "-c%d", channel_no);
+		argv[arg] = channel;
+	} else
+		arg--;
+
+	retval = call_usermodehelper(argv[0], argv, envp, 0);
+
+	if (retval)
+		printk("Failed to launch userspace program '%s': Error %d\n",
+				command, retval);
+
+	{
+		int i;
+		for (i = 0; i < arg; i++)
+			if (argv[i] && argv[i] != channel)
+				kfree(argv[i]);
+	}
+
+	kfree(channel);
+
+	return retval;
+}
+
+int suspend_netlink_setup(struct user_helper_data *uhd)
+{
+	if (netlink_prepare(uhd) < 0) {
+		printk("Netlink prepare failed.\n");
+		return 1;
+	}
+
+	if (suspend2_launch_userspace_program(uhd->program, uhd->netlink_id) < 0) {
+		printk("Launch userspace program failed.\n");
+		suspend_netlink_close(uhd);
+		return 1;
+	}
+
+	/* Wait 2 seconds for the userspace process to make contact */
+	wait_for_completion_timeout(&uhd->wait_for_process, 2*HZ);
+
+	if (uhd->pid == -1) {
+		printk("%s: Failed to contact userspace process.\n",
+				uhd->name);
+		suspend_netlink_close(uhd);
+		return 1;
+	}
+
+	return 0;
+}
+
+#else
+#endif
diff -urN oldtree/kernel/power/netlink.h newtree/kernel/power/netlink.h
--- oldtree/kernel/power/netlink.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/netlink.h	2006-03-08 15:22:33.293508000 +0000
@@ -0,0 +1,43 @@
+/*
+ * netlink.h
+ *
+ * Declarations for functions for communicating with a userspace helper
+ * via netlink.
+ */
+
+#include <linux/netlink.h>
+#include <net/sock.h>
+
+#define NETLINK_MSG_BASE 0x10
+
+#define NETLINK_MSG_READY 0x10
+#define	NETLINK_MSG_NOFREEZE_ME 0x16
+#define NETLINK_MSG_GET_DEBUGGING 0x19
+#define NETLINK_MSG_CLEANUP 0x24
+#define NETLINK_MSG_NOFREEZE_ACK 0x27
+#define NETLINK_MSG_IS_DEBUGGING 0x28
+
+struct user_helper_data {
+	int (*rcv_msg) (struct sk_buff *skb, struct nlmsghdr *nlh);
+	void (* not_ready) (void);
+	struct sock *nl;
+	u32 sock_seq;
+	pid_t pid;
+	char *comm;
+	char program[256];
+	int pool_level;
+	int pool_limit;
+	struct sk_buff *emerg_skbs;
+	int skb_size;
+	int netlink_id;	
+	char *name;
+	struct user_helper_data *next;
+	struct completion wait_for_process;
+	int interface_version;
+	int must_init;
+};
+
+void suspend_send_netlink_message(struct user_helper_data *uhd,
+		int type, void* params, size_t len);
+int suspend_netlink_setup(struct user_helper_data *uhd);
+void suspend_netlink_close(struct user_helper_data *uhd);
diff -urN oldtree/kernel/power/pagedir.c newtree/kernel/power/pagedir.c
--- oldtree/kernel/power/pagedir.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/pagedir.c	2006-03-08 18:35:48.742178250 +0000
@@ -0,0 +1,368 @@
+/*
+ * kernel/power/pagedir.c
+ *
+ * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
+ * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
+ * Copyright (C) 2002-2005 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Routines for handling pagesets.
+ * Note that pbes aren't actually stored as such. They're stored as
+ * bitmaps and extents.
+ */
+
+#include <linux/suspend.h>
+#include <linux/highmem.h>
+#include <linux/bootmem.h>
+#include <linux/hardirq.h>
+
+#include "pageflags.h"
+#include "ui.h"
+#include "pagedir.h"
+
+int extra_pagedir_pages_allocated = 0;
+
+/* Not static so allocation routine can BUG if recursively called */
+dyn_pageflags_t conflicting_pages_map;
+
+#define PageConflicting(page) (test_dynpageflag(&conflicting_pages_map, page))
+#define SetPageConflicting(page) (set_dynpageflag(&conflicting_pages_map, page))
+#define ClearPageConflicting(page) (clear_dynpageflag(&conflicting_pages_map, page))
+
+/* suspend_free_extra_pagedir_memory
+ *
+ * Description:	Free previously allocated pagedir metadata.
+ */
+void suspend_free_extra_pagedir_memory(void)
+{
+	unsigned long pagenumber;
+
+	free_dyn_pageflags(&pageset1_map);
+	free_dyn_pageflags(&pageset2_map);
+	free_dyn_pageflags(&pageset1_copy_map);
+
+	/* Free allocated pages */
+	if (allocd_pages_map) {
+		BITMAP_FOR_EACH_SET(allocd_pages_map, pagenumber) {
+			struct page *page = pfn_to_page(pagenumber);
+			ClearPageNosave(page);
+			__free_page(page);
+			extra_pagedir_pages_allocated--;
+		}
+		free_dyn_pageflags(&allocd_pages_map);
+	}
+}
+
+/* suspend_allocate_extra_pagedir_memory
+ *
+ * Description:	Allocate memory for making the atomic copy of pagedir1 in the
+ * 		case where it is bigger than pagedir2.
+ * Arguments:	struct pagedir *: 	The pagedir for which we should 
+ * 					allocate memory.
+ * 		int:			Size of the pageset.
+ * 		int:			Number of pages that can be reused
+ * 					from elsewhere (the lowmem pages of
+ * 					the other pageset), reducing what
+ * 					must be allocated here.
+ * Result:	int. Zero on success. One if unable to allocate enough memory.
+ */
+int suspend_allocate_extra_pagedir_memory(struct pagedir *p, int pageset_size,
+		int alloc_from)
+{
+	int num_to_alloc = pageset_size - alloc_from - extra_pagedir_pages_allocated;
+	int j, order;
+
+	if (num_to_alloc < 1)
+		num_to_alloc = 0;
+
+	if (num_to_alloc) {
+		int num_added = 0;
+	
+		order = num_to_alloc;
+		if (order >= MAX_ORDER)
+			order = MAX_ORDER - 1;
+
+		while (num_added < num_to_alloc) {
+			struct page *newpage;
+			unsigned long virt;
+			
+			while ((1 << order) > (num_to_alloc - num_added))
+				order--;
+
+			virt = __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, order);
+			while ((!virt) && (order > 0)) {
+				order--;
+				virt = __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, order);
+			}
+
+			if (!virt) {
+				p->pageset_size += num_added;
+				return 1;
+			}
+
+			newpage = virt_to_page(virt);
+			for (j = 0; j < (1 << order); j++) {
+				SetPageNosave(newpage + j);
+				/* Pages will be freed one at a time. */
+				SetPageAllocd(newpage + j);
+				extra_pagedir_pages_allocated++;
+			}
+			num_added += (1 << order);
+		}
+	}
+
+	return 0;
+}
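+
+/*
+ * Worked example for the loop above: with num_to_alloc = 37 and
+ * MAX_ORDER = 11, the inner while first trims the order until
+ * (1 << order) fits, so we request an order 5 block (32 pages),
+ * then order 2 (4 pages), then order 0 (1 page). If an allocation
+ * fails, the order is stepped down again before giving up, so
+ * fragmented memory degrades to order-0 allocations rather than
+ * aborting outright.
+ */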
+
+/*
+ * suspend_mark_task_as_pageset1
+ * Functionality   : Marks all the pages belonging to a given process as
+ *                   pageset 1 pages.
+ * Called From     : pagedir.c - mark_pages_for_pageset2
+ *
+ */
+extern struct page *suspend2_follow_page(struct mm_struct *mm, unsigned long address);
+
+void suspend_mark_task_as_pageset1(struct task_struct *t)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+
+	mm = t->active_mm;
+
+	if (!mm || !mm->mmap) return;
+
+	/* Don't try to take the sem when processes are frozen, 
+	 * drivers are suspended and irqs are disabled. We're
+	 * not racing with anything anyway.  */
+	BUG_ON(in_atomic() && !irqs_disabled());
+
+	if (!irqs_disabled())
+		down_read(&mm->mmap_sem);
+	
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (vma->vm_flags & VM_PFNMAP)
+			continue;
+		if (vma->vm_start) {
+			unsigned long posn;
+			for (posn = vma->vm_start; posn < vma->vm_end;
+					posn += PAGE_SIZE) {
+				struct page *page = 
+					suspend2_follow_page(mm, posn);
+				if (page)
+					ClearPagePageset2(page);
+			}
+		}
+	}
+
+	BUG_ON(in_atomic() && !irqs_disabled());
+
+	if (!irqs_disabled())
+		up_read(&mm->mmap_sem);
+}
+
+/* mark_pages_for_pageset2
+ *
+ * Description:	Mark unshared pages in processes not needed for suspend as
+ * 		being able to be written out in a separate pagedir.
+ * 		HighMem pages are simply marked as pageset2. They won't be
+ * 		needed during suspend.
+ */
+
+struct attention_list {
+	struct task_struct *task;
+	struct attention_list *next;
+};
+
+#define HALT_ON(condition) \
+	do { if (unlikely(condition)) { \
+		printk("Suspend2: Halting at line %d. Please report to nigel@suspend2.net.\n", __LINE__); \
+		while(1) \
+			cpu_relax(); \
+		} } while(0)
+
+void suspend_mark_pages_for_pageset2(void)
+{
+	struct zone *zone;
+	struct task_struct *p;
+	struct attention_list *attention_list = NULL, *last = NULL;
+	unsigned long flags, i;
+
+	HALT_ON(in_atomic() && !irqs_disabled());
+
+	clear_dyn_pageflags(pageset2_map);
+
+	if (test_action_state(SUSPEND_NO_PAGESET2))
+		return;
+
+	/*
+	 * If we eat memory, we can lose track of LRU pages that are
+	 * still in use but have been taken off the LRU. If I can
+	 * figure out how the VM keeps track of them, I might be able
+	 * to tweak this a little further and decrease pageset one's
+	 * size.
+	 *
+	 * (Memory grabbing clears the pageset2 flag on pages that
+	 * are really freed!)
+	 */
+	
+	for_each_zone(zone) {
+		spin_lock_irqsave(&zone->lru_lock, flags);
+		if (zone->nr_inactive) {
+			struct page *page;
+			list_for_each_entry(page, &zone->inactive_list, lru)
+				SetPagePageset2(page);
+		}
+		if (zone->nr_active) {
+			struct page *page;
+			list_for_each_entry(page, &zone->active_list, lru)
+				SetPagePageset2(page);
+		}
+		spin_unlock_irqrestore(&zone->lru_lock, flags);
+	}
+
+	HALT_ON(in_atomic() && !irqs_disabled());
+
+	/* Now we find all userspace processes (with task->mm) marked PF_NOFREEZE
+	 * and move them into pageset1.
+	 */
+	read_lock(&tasklist_lock);
+	for_each_process(p)
+		if ((p->mm || p->active_mm) && (p->flags & PF_NOFREEZE)) {
+			struct attention_list *this = kmalloc(sizeof(struct attention_list), GFP_ATOMIC);
+			BUG_ON(!this);
+			this->task = p;
+			this->next = NULL;
+			if (attention_list) {
+				last->next = this;
+				last = this;
+			} else
+				attention_list = last = this;
+		}
+	read_unlock(&tasklist_lock);
+
+	HALT_ON(in_atomic() && !irqs_disabled());
+
+	/* Because the tasks in attention_list are ones related to suspending,
+	 * we know that they won't go away under us.
+	 */
+
+	while (attention_list) {
+		suspend_mark_task_as_pageset1(attention_list->task);
+		last = attention_list;
+		attention_list = attention_list->next;
+		kfree(last);
+	}
+
+	HALT_ON(in_atomic() && !irqs_disabled());
+
+	for_each_zone(zone) {
+		if (!zone->present_pages)
+			continue;
+		for (i = 0; i < zone->spanned_pages; i++) {
+			struct page *page = pfn_to_page(zone->zone_start_pfn + i);
+			BUG_ON(PagePageset2(page) && PageSlab(page));
+		}
+	}
+
+	HALT_ON(in_atomic() && !irqs_disabled());
+
+}
+
+/* suspend_get_nonconflicting_pages
+ *
+ * Description: Gets higher-order pages that won't be overwritten
+ *		while copying the original pages.
+ *
+ *		Note that if even one of the allocated pages conflicts
+ *		with pageset 1, another set must be tried. Therefore,
+ *		you shouldn't use this function much, and not with
+ *		high orders.
+ */
+
+unsigned long suspend_get_nonconflicting_pages(const int order)
+{
+	struct page *page;
+	unsigned long new_page, i;
+	int more = 0;
+
+	do {
+		new_page = __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, order);
+		if (!new_page)
+			return 0;
+		page = virt_to_page(new_page);
+		more = 0;
+		for (i = 0; i < (1UL << order); i++) {
+			if (PagePageset1(page + i)) {
+				more = 1;
+				break;
+			}
+		}
+		if (more) {
+			for (i = 0; i < (1UL << order); i++)
+				if (PagePageset1(page + i))
+					SetPageConflicting(page + i);
+				else {
+					__free_pages(page + i, 0);
+				}
+		}
+	} while (more);
+
+	memset((void*)new_page, 0, PAGE_SIZE * (1<<order));
+	return new_page;
+}
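+
+/*
+ * For example, at order 2 a candidate block of four pages is accepted
+ * only if none of the four is a pageset 1 page. On rejection, the
+ * conflicting pages are marked (so the allocator can't hand them back
+ * on the retry) and only the non-conflicting remainder is freed before
+ * another block is tried - hence the advice above about high orders.
+ */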
+
+/* suspend_relocate_if_required
+ *
+ * Description: Given the address of a pointer to a page, we check if the page
+ * 		needs relocating and do so if needs be, adjusting the pointer
+ * 		too.
+ */
+
+void suspend_relocate_if_required(unsigned long *current_value, unsigned int size)
+{
+	if (PagePageset1(virt_to_page(*current_value))) {
+		unsigned long new_page = suspend_get_nonconflicting_pages(0);
+
+		/* No fallback if we can't relocate - treat as fatal. */
+		BUG_ON(!new_page);
+		memcpy((char *) new_page, (char *) *current_value, size);
+		if (PageSlab(virt_to_page(*current_value)))
+			kfree((void *) *current_value);
+		else
+			free_page((unsigned long) *current_value);
+		*current_value = new_page;
+	}
+}
+
+/* get_pageset1_load_addresses
+ * 
+ * Description: We check here that pagedir & pages it points to won't collide
+ * 		with pages where we're going to restore from the loaded pages
+ * 		later.
+ * Returns:	Zero on success, one if couldn't find enough pages (shouldn't
+ * 		happen).
+ */
+
+int suspend_get_pageset1_load_addresses(void)
+{
+	int i, result = 0;
+	void *this;
+
+	/*
+	 * Because we're trying to make this work when we're saving as much
+	 * memory as possible we need to remember the pages we reject here
+	 * and then free them when we're done.
+	 */
+	
+	for (i = 0; i < pagedir1.pageset_size; i++) {
+		this = (void *) suspend_get_nonconflicting_pages(0);
+		if (!this) {
+			abort_suspend("Error: Ran out of memory seeking locations for reloading data.");
+			result = 1;
+			break;
+		}
+		SetPagePageset1Copy(virt_to_page(this));
+	}
+
+	return result;
+}
diff -urN oldtree/kernel/power/pagedir.h newtree/kernel/power/pagedir.h
--- oldtree/kernel/power/pagedir.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/pagedir.h	2006-03-08 15:22:33.293508000 +0000
@@ -0,0 +1,37 @@
+/*
+ * kernel/power/pagedir.h
+ *
+ * Copyright (C) 2004-2005 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Declarations for routines for handling pagesets.
+ */
+
+/* Pagedir
+ *
+ * Contains the metadata for a set of pages saved in the image.
+ */
+
+struct pagedir {
+	int pageset_size;
+	int lastpageset_size;
+};
+
+extern struct pagedir pagedir1, pagedir2;
+
+extern void suspend_copy_pageset1(void);
+
+extern void suspend_free_extra_pagedir_memory(void);
+
+extern int suspend_allocate_extra_pagedir_memory(struct pagedir *p, int pageset_size, int alloc_from);
+
+extern void suspend_mark_task_as_pageset1(struct task_struct *t);
+extern void suspend_mark_pages_for_pageset2(void);
+
+extern void suspend_relocate_if_required(unsigned long *current_value, unsigned int size);
+extern int suspend_get_pageset1_load_addresses(void);
+
+extern int extra_pagedir_pages_allocated;
+
+extern unsigned long suspend_get_nonconflicting_pages(int order);
diff -urN oldtree/kernel/power/pageflags.c newtree/kernel/power/pageflags.c
--- oldtree/kernel/power/pageflags.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/pageflags.c	2006-03-08 15:22:33.297508250 +0000
@@ -0,0 +1,150 @@
+/*
+ * kernel/power/pageflags.c
+ *
+ * Copyright (C) 2004-2005 Nigel Cunningham <ncunningham@cyclades.com>
+ * 
+ * This file is released under the GPLv2.
+ *
+ * Routines for dynamically allocating and releasing bitmaps
+ * used as pseudo-pageflags.
+ *
+ * Arrays are not contiguous. The first sizeof(void *) bytes are
+ * the pointer to the next page in the bitmap. This allows us to
+ * 1) work under low memory conditions where order 0 might be all
+ *    that's available
+ * 2) save the pages at suspend time, reload and relocate them as
+ *    necessary at resume time without breaking anything (cf
+ *    extent pages).
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/bitops.h>
+#include <linux/list.h>
+#include <linux/suspend.h>
+#include "pageflags.h"
+#include "modules.h"
+#include "pagedir.h"
+
+/* Maps used in copying the image back are in builtin.c */
+dyn_pageflags_t pageset1_map;
+dyn_pageflags_t pageset1_copy_map;
+dyn_pageflags_t pageset2_map;
+dyn_pageflags_t in_use_map;
+dyn_pageflags_t allocd_pages_map;
+#ifdef CONFIG_DEBUG_PAGEALLOC
+dyn_pageflags_t unmap_map;
+#endif
+dyn_pageflags_t checksum_map;
+
+static int num_zones(void)
+{
+	int result = 0;
+	struct zone *zone;
+
+	for_each_zone(zone)
+		result++;
+
+	return result;
+}
+
+static int pages_for_zone(struct zone *zone)
+{
+	return (zone->spanned_pages + (PAGE_SIZE << 3) - 1) /
+			(PAGE_SIZE << 3);
+}
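+
+/*
+ * e.g. with 4K pages, each bitmap page holds PAGE_SIZE << 3 = 32768
+ * bits, so a zone spanning 1,000,000 pages needs
+ * (1000000 + 32767) / 32768 = 31 bitmap pages.
+ */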
+
+/* save_dyn_pageflags
+ *
+ * Description: Save a set of pageflags.
+ * Arguments:   dyn_pageflags_t: The bitmap being saved.
+ */
+
+void save_dyn_pageflags(dyn_pageflags_t pagemap)
+{
+	int i, zone_num = 0;
+	struct zone *zone;
+
+	if (!*pagemap)
+		return;
+
+	for_each_zone(zone) {
+		int size = pages_for_zone(zone);
+		suspend_active_writer->ops.writer.write_header_chunk((char *) &zone_num, sizeof(int));
+		suspend_active_writer->ops.writer.write_header_chunk((char *) &size, sizeof(int));
+
+		for (i = 0; i < size; i++)
+			suspend_active_writer->ops.writer.write_header_chunk((char *) pagemap[zone_num][i], PAGE_SIZE);
+		zone_num++;
+	}
+	zone_num = -1;
+	suspend_active_writer->ops.writer.write_header_chunk((char *) &zone_num, sizeof(int));
+}
+
+/* load_dyn_pageflags
+ *
+ * Description: Load a set of pageflags.
+ * Arguments:   dyn_pageflags_t: The bitmap being loaded.
+ *              (It must be allocated before calling this routine.)
+ */
+
+void load_dyn_pageflags(dyn_pageflags_t pagemap)
+{
+	int i, zone_num = 0, zone_check = 0;
+	struct zone *zone;
+
+	if (!pagemap)
+		return;
+
+	for_each_zone(zone) {
+		int size = 0;
+		suspend_active_writer->ops.writer.read_header_chunk((char *) &zone_check, sizeof(int));
+		if (zone_check != zone_num) {
+			printk("Zone check (%d) != zone_num (%d).\n", zone_check, zone_num);
+			BUG();	       
+		}
+		suspend_active_writer->ops.writer.read_header_chunk((char *) &size, sizeof(int));
+
+		for (i = 0; i < size; i++)
+			suspend_active_writer->ops.writer.read_header_chunk((char *) pagemap[zone_num][i], PAGE_SIZE);
+		zone_num++;
+	}
+	suspend_active_writer->ops.writer.read_header_chunk((char *) &zone_check, sizeof(int));
+	if (zone_check != -1) {
+		printk("Didn't read end of dyn pageflag data marker.(%x)\n", zone_check);
+		BUG();
+	}
+}
+
+/* relocate_dyn_pageflags
+ *
+ * Description: Relocate a set of pageflags to ensure they don't collide with
+ *              pageset 1 data which will get overwritten on copyback.
+ * Arguments:   dyn_pageflags_t *: Pointer to the bitmap being relocated.
+ */
+
+void relocate_dyn_pageflags(dyn_pageflags_t *pagemap)
+{
+	int i, zone_num = 0;
+	struct zone *zone;
+
+	if (!*pagemap)
+		return;
+
+	suspend_relocate_if_required((void *) pagemap, sizeof (void *) * num_zones());
+
+	for_each_zone(zone) {
+		int pages = pages_for_zone(zone);
+
+		suspend_relocate_if_required((void *) &((*pagemap)[zone_num]), sizeof(void *) * pages);
+
+		for (i = 0; i < pages; i++)
+			suspend_relocate_if_required((void *) &((*pagemap)[zone_num][i]),
+				PAGE_SIZE);
+		zone_num++;
+	}
+}
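For reference, the header stream that save_dyn_pageflags() emits and
load_dyn_pageflags() expects is, reading the code above:

	int zone_num, int size, <size x PAGE_SIZE bitmap pages>  (per zone)
	int -1                                                   (terminator)

Both sides walk zones with for_each_zone(), so writer and reader must
agree on zone order; load_dyn_pageflags() BUGs on a zone-number mismatch
or a missing -1 marker.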
diff -urN oldtree/kernel/power/pageflags.h newtree/kernel/power/pageflags.h
--- oldtree/kernel/power/pageflags.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/pageflags.h	2006-03-08 15:22:33.297508250 +0000
@@ -0,0 +1,86 @@
+/*
+ * kernel/power/pageflags.h
+ *
+ * Copyright (C) 2004-2005 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Suspend2 needs a few pageflags while working that aren't otherwise
+ * used. To save the struct page pageflags, we dynamically allocate
+ * a bitmap and use that. These are the only non order-0 allocations
+ * we do.
+ *
+ * NOTE!!!
+ * We assume that PAGE_SIZE - sizeof(void *) is a multiple of
+ * sizeof(unsigned long). Is this ever false?
+ */
+
+#include <linux/dyn_pageflags.h>
+#include <linux/suspend.h>
+
+extern dyn_pageflags_t in_use_map;
+extern dyn_pageflags_t allocd_pages_map;
+#ifdef CONFIG_DEBUG_PAGEALLOC
+extern dyn_pageflags_t unmap_map;
+#endif
+extern dyn_pageflags_t pageset2_map;
+extern dyn_pageflags_t conflicting_pages_map;
+extern dyn_pageflags_t checksum_map;
+
+/*
+ * in_use_map is used in two ways:
+ * - During suspend, to mark which pages are in use and which are free
+ *   (to speed up count_data_pages);
+ * - During resume, to tag pages which are in pagedir1. This does not tag
+ *   pagedir2 pages, so it is not equivalent to the first use.
+ */
+
+#define PageInUse(page) (test_dynpageflag(&in_use_map, page))
+#define SetPageInUse(page) (set_dynpageflag(&in_use_map, page))
+#define ClearPageInUse(page) (clear_dynpageflag(&in_use_map, page))
+
+#define PagePageset1(page) (test_dynpageflag(&pageset1_map, page))
+#define SetPagePageset1(page) (set_dynpageflag(&pageset1_map, page))
+#define ClearPagePageset1(page) (clear_dynpageflag(&pageset1_map, page))
+
+#define PagePageset1Copy(page) (test_dynpageflag(&pageset1_copy_map, page))
+#define SetPagePageset1Copy(page) (set_dynpageflag(&pageset1_copy_map, page))
+#define ClearPagePageset1Copy(page) (clear_dynpageflag(&pageset1_copy_map, page))
+
+#define PagePageset2(page) (test_dynpageflag(&pageset2_map, page))
+#define SetPagePageset2(page) (set_dynpageflag(&pageset2_map, page))
+#define ClearPagePageset2(page) (clear_dynpageflag(&pageset2_map, page))
+
+#define PageAllocd(page) (test_dynpageflag(&allocd_pages_map, page))
+#define SetPageAllocd(page) (set_dynpageflag(&allocd_pages_map, page))
+#define ClearPageAllocd(page) (clear_dynpageflag(&allocd_pages_map, page))
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+#define PageUnmap(page) (test_dynpageflag(&unmap_map, page))
+#define SetPageUnmap(page) (set_dynpageflag(&unmap_map, page))
+#define ClearPageUnmap(page) (clear_dynpageflag(&unmap_map, page))
+#endif
+
+static inline int PageChecksumIgnore(struct page *page)
+{
+	return checksum_map ?
+		test_dynpageflag(&checksum_map, page) :
+		0;
+}
+
+static inline void SetPageChecksumIgnore(struct page *page)
+{
+	if (checksum_map)
+		set_dynpageflag(&checksum_map, page);
+}
+
+static inline void ClearPageChecksumIgnore(struct page *page)
+{
+	if (checksum_map)
+		clear_dynpageflag(&checksum_map, page);
+}
+
+extern void save_dyn_pageflags(dyn_pageflags_t pagemap);
+extern void load_dyn_pageflags(dyn_pageflags_t pagemap);
+void relocate_dyn_pageflags(dyn_pageflags_t *pagemap);
+
diff -urN oldtree/kernel/power/power.h newtree/kernel/power/power.h
--- oldtree/kernel/power/power.h	2006-03-08 18:48:02.956063750 +0000
+++ newtree/kernel/power/power.h	2006-03-08 18:19:45.385972250 +0000
@@ -1,6 +1,8 @@
 #include <linux/suspend.h>
 #include <linux/utsname.h>
 
+#include "suspend.h"
+
 struct swsusp_info {
 	struct new_utsname	uts;
 	u32			version_code;
@@ -36,7 +38,7 @@
 extern struct subsystem power_subsys;
 
 /* References to section boundaries */
-extern const void __nosave_begin, __nosave_end;
+//extern const void __nosave_begin, __nosave_end;
 
 extern struct pbe *pagedir_nosave;
 
@@ -110,5 +112,4 @@
 extern int swsusp_suspend(void);
 extern int swsusp_resume(void);
 extern int swsusp_read(void);
-extern int swsusp_write(void);
 extern void swsusp_close(void);
diff -urN oldtree/kernel/power/power_off.c newtree/kernel/power/power_off.c
--- oldtree/kernel/power/power_off.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/power_off.c	2006-03-08 15:22:33.305508750 +0000
@@ -0,0 +1,78 @@
+/*
+ * kernel/power/power_off.c
+ *
+ * Copyright (C) 2005 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Support for powering down.
+ */
+
+#include <linux/device.h>
+#include <linux/suspend.h>
+#include <linux/mm.h>
+#include <linux/pm.h>
+#include <linux/reboot.h>
+#include "suspend2_common.h"
+#include "suspend2.h"
+#include "ui.h"
+
+unsigned long suspend_powerdown_method = 0; /* 0 - Kernel power off */
+
+extern struct pm_ops *pm_ops;
+
+/* Use suspend_enter from main.c */
+extern int suspend_enter(suspend_state_t state);
+
+int try_pm_state_powerdown(void)
+{
+	if (pm_ops && pm_ops->prepare && suspend_powerdown_method &&
+	    pm_ops->prepare(suspend_powerdown_method))
+		return 0;
+
+	if (suspend_powerdown_method > 3)
+		kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
+	else {
+		if (device_suspend(PMSG_SUSPEND)) {
+			printk(KERN_ERR "Some devices failed to suspend\n");
+			return 0;
+		}
+	}
+
+	if (suspend_enter(suspend_powerdown_method))
+		return 0;
+
+	device_resume();
+
+	if (pm_ops && pm_ops->finish && suspend_powerdown_method)
+		pm_ops->finish(suspend_powerdown_method);
+
+	return 1;
+}
+
+/*
+ * suspend_power_down
+ * Functionality   : Powers down or reboots the computer once the image
+ *                   has been written to disk.
+ * Key Assumptions : Able to reboot/power down via code called or that
+ *                   the warning emitted if the calls fail will be visible
+ *                   to the user (ie printk resumes devices).
+ * Called From     : do_suspend2_suspend_2
+ */
+
+void suspend_power_down(void)
+{
+	if (test_action_state(SUSPEND_REBOOT)) {
+		suspend_prepare_status(DONT_CLEAR_BAR, "Ready to reboot.");
+		kernel_restart(NULL);
+	}
+
+	if (pm_ops && pm_ops->enter && suspend_powerdown_method && try_pm_state_powerdown())
+		return;
+
+	kernel_power_off();
+	suspend_prepare_status(DONT_CLEAR_BAR, "Powerdown failed");
+	while (1)
+		cpu_relax();
+}
+
diff -urN oldtree/kernel/power/power_off.h newtree/kernel/power/power_off.h
--- oldtree/kernel/power/power_off.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/power_off.h	2006-03-08 15:22:33.305508750 +0000
@@ -0,0 +1,13 @@
+/*
+ * kernel/power/power_off.h
+ *
+ * Copyright (C) 2005 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Support for powering down.
+ */
+
+int try_pm_state_powerdown(void);
+void suspend_power_down(void);
+extern unsigned long suspend_powerdown_method;
diff -urN oldtree/kernel/power/prepare_image.c newtree/kernel/power/prepare_image.c
--- oldtree/kernel/power/prepare_image.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/prepare_image.c	2006-03-08 18:40:01.389967750 +0000
@@ -0,0 +1,735 @@
+/*
+ * kernel/power/prepare_image.c
+ *
+ * Copyright (C) 2003-2005 Nigel Cunningham <nigel@suspend.net>
+ *
+ * This file is released under the GPLv2.
+ *
+ * We need to eat memory until we can:
+ * 1. Perform the save without changing anything (RAM_NEEDED < max_pfn)
+ * 2. Fit it all in available space (suspend_active_writer->available_space() >=
+ *    storage_needed())
+ * 3. Reload the pagedir and pageset1 to places that don't collide with their
+ *    final destinations, not knowing to what extent the resumed kernel will
+ *    overlap with the one loaded at boot time. I think the resumed kernel
+ *    should overlap completely, but I don't want to rely on this as it is 
+ *    an unproven assumption. We therefore assume there will be no overlap at
+ *    all (worst case).
+ * 4. Meet the user's requested limit (if any) on the size of the image.
+ *    The limit is in MB, so pages/256 (assuming 4K pages).
+ *
+ */
+
+#include <linux/highmem.h>
+#include <linux/freezer.h>
+#include <linux/hardirq.h>
+
+#include "suspend2.h"
+#include "pageflags.h"
+#include "modules.h"
+#include "suspend2_common.h"
+#include "io.h"
+#include "ui.h"
+#include "extent.h"
+#include "prepare_image.h"
+#include "checksum.h"
+
+static int are_frozen = 0, num_nosave = 0;
+static int header_space_allocated = 0;
+static int storage_allocated = 0;
+static int storage_available = 0;
+int extra_pd1_pages_allowance = 100;
+
+static int num_pcp_pages(void)
+{
+	struct zone *zone;
+	int result = 0, i = 0;
+
+	/* PCP lists */
+	for_each_zone(zone) {
+		struct per_cpu_pageset *pset;
+		int cpu;
+		
+		if (!zone->present_pages)
+			continue;
+		
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			if (!cpu_possible(cpu))
+				continue;
+
+			pset = zone_pcp(zone, cpu);
+
+			for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
+				struct per_cpu_pages *pcp;
+
+				pcp = &(pset->pcp[i]);
+				result += pcp->count;
+			}
+		}
+	}
+	return result;
+}
+
+int real_nr_free_pages(void)
+{
+	return nr_free_pages() + num_pcp_pages();
+}
+
+static void get_extra_pd1_allowance(void)
+{
+	int orig_num_free = real_nr_free_pages(), final;
+	
+	suspend_prepare_status(CLEAR_BAR, "Finding allowance for drivers.");
+	device_suspend(PMSG_FREEZE);
+	local_irq_disable(); /* irqs might have been re-enabled on us */
+	device_power_down(PMSG_FREEZE);
+	
+	final = real_nr_free_pages();
+
+	device_power_up();
+	local_irq_enable();
+
+	device_resume();
+
+	extra_pd1_pages_allowance = orig_num_free - final + 100;
+}
+
+static int main_storage_needed(int use_ecr,
+		int ignore_extra_pd1_allow)
+{
+	return ((pagedir1.pageset_size + pagedir2.pageset_size +
+	  (ignore_extra_pd1_allow ? 0 : extra_pd1_pages_allowance)) *
+	 (use_ecr ? suspend_expected_compression_ratio() : 100) / 100);
+}
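+
+/*
+ * e.g. pageset1 = 20000 pages, pageset2 = 30000 pages,
+ * extra_pd1_pages_allowance = 100 and an expected compression ratio
+ * of 50% give main_storage_needed(1, 0) =
+ * (20000 + 30000 + 100) * 50 / 100 = 25050 pages.
+ */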
+
+static int header_storage_needed(void)
+{
+	unsigned long bytes = ((extents_allocated * 2 * sizeof(unsigned long)) +
+	 sizeof(struct suspend_header) +
+	 sizeof(struct suspend_module_header) +
+	 (int) suspend_header_storage_for_modules() +
+	 (dyn_pageflags_pages_per_bitmap() << PAGE_SHIFT) +
+	 suspend_num_modules *
+	 	(sizeof(struct suspend_module_header) + sizeof(int)));
+
+	return ((int) ((bytes + (int) PAGE_SIZE - 1) >> PAGE_SHIFT));
+}
+	
+static void display_stats(int always, int sub_extra_pd1_allow)
+{ 
+	unsigned long storage_allocated = suspend_active_writer->ops.writer.storage_allocated();
+	char buffer[255];
+	snprintf(buffer, 254, 
+		"Free:%d(%d). Sets:%d(%d),%d(%d). Header:%d. Nosave:%d-%d=%d. Storage:%lu/%u(%u). Needed:%d|%d|%d.\n", 
+		
+		/* Free */
+		nr_free_pages(),
+		nr_free_pages() - nr_free_highpages(),
+		
+		/* Sets */
+		pagedir1.pageset_size, pageset1_sizelow,
+		pagedir2.pageset_size, pageset2_sizelow,
+
+		/* Header */
+		header_storage_needed(),
+
+		/* Nosave */
+		num_nosave, extra_pagedir_pages_allocated,
+		num_nosave - extra_pagedir_pages_allocated,
+
+		/* Storage - converted to pages for comparison */
+		storage_allocated,
+		storage_needed(1, sub_extra_pd1_allow),
+		storage_available,
+
+		/* Needed */
+		ram_to_suspend() - nr_free_pages() - nr_free_highpages(),
+		storage_needed(1, sub_extra_pd1_allow) - storage_available, 
+		(image_size_limit > 0) ? (storage_needed(1, sub_extra_pd1_allow) - (image_size_limit << 8)) : 0);
+	if (always)
+		printk("%s", buffer);
+	else
+		suspend_message(SUSPEND_EAT_MEMORY, SUSPEND_MEDIUM, 1, buffer);
+}
+
+/* generate_free_page_map
+ *
+ * Description:	This routine generates a bitmap of free pages from the
+ * 		lists used by the memory manager. We then use the bitmap
+ * 		to quickly calculate which pages to save and in which
+ * 		pagesets.
+ */
+static void generate_free_page_map(void) 
+{
+	int i, order, loop, cpu;
+	struct page *page;
+	unsigned long flags;
+	struct zone *zone;
+	struct per_cpu_pageset *pset;
+
+	for_each_zone(zone) {
+		if (!zone->present_pages)
+			continue;
+		for (i = 0; i < zone->spanned_pages; i++)
+			SetPageInUse(pfn_to_page(zone->zone_start_pfn + i));
+	}
+	
+	for_each_zone(zone) {
+		if (!zone->present_pages)
+			continue;
+		spin_lock_irqsave(&zone->lock, flags);
+		for (order = MAX_ORDER - 1; order >= 0; --order) {
+			list_for_each_entry(page, &zone->free_area[order].free_list, lru)
+				for (loop = 0; loop < (1 << order); loop++) {
+					ClearPageInUse(page+loop);
+					ClearPagePageset2(page+loop);
+				}
+		}
+
+		
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			if (!cpu_possible(cpu))
+				continue;
+
+			pset = zone_pcp(zone, cpu);
+
+			for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
+				struct per_cpu_pages *pcp;
+				struct page *page;
+
+				pcp = &pset->pcp[i];
+				list_for_each_entry(page, &pcp->list, lru) {
+					ClearPageInUse(page);
+					ClearPagePageset2(page);
+				}
+			}
+		}
+		
+		spin_unlock_irqrestore(&zone->lock, flags);
+	}
+}
+
+static struct page *rotext_start, *rotext_end;
+static struct page *nosave_start, *nosave_end;
+#ifdef CONFIG_DEBUG_RODATA
+static struct page *rtas_start, *rtas_end;
+static struct page *rodata_start, *rodata_end;
+extern char __start_rodata, __end_rodata;
+#endif
+#ifdef CONFIG_PPC_RTAS
+extern unsigned int rtas_data, rtas_size;
+#endif
+#ifdef CONFIG_PPC
+extern char _etext[];
+#else
+extern char _text[], _etext[];
+#endif
+
+#ifdef CONFIG_X86_32	/* 2.6.15 and later */
+extern int bad_ppro;
+
+/* 
+ * Copied from arch/i386/mm/init.c. It should be moved to
+ * an include file after testing.
+ */
+static inline int page_kills_ppro(unsigned long pagenr)
+{
+	if (pagenr >= 0x70000 && pagenr <= 0x7003F)
+		return 1;
+	return 0;
+}
+
+#else
+#define bad_ppro (0)
+#define page_kills_ppro(pfn) (0)
+#endif
+
+static __init int page_nosave_init(void)
+{
+#ifdef CONFIG_DEBUG_RODATA
+	rodata_start = virt_to_page(&__start_rodata);
+	rodata_end = virt_to_page(&__end_rodata);
+#endif
+#ifdef CONFIG_PPC
+	rotext_start = virt_to_page(PAGE_OFFSET);
+#else
+	rotext_start = virt_to_page(&_text);
+#endif
+	rotext_end = virt_to_page(&_etext);
+
+	nosave_start = virt_to_page(&__nosave_begin);
+	nosave_end = virt_to_page(((char *) &__nosave_end) - 1);
+
+#ifdef CONFIG_PPC_RTAS
+	rtas_start = virt_to_page(__va(rtas_data)); 
+	rtas_end = virt_to_page(__va(rtas_data) + rtas_size);
+#endif
+	return 0;
+}
+
+subsys_initcall(page_nosave_init);
+
+/* count_data_pages
+ *
+ * This routine generates our lists of pages to be stored in each
+ * pageset. Since we store the data using extents, and adding new
+ * extents might allocate a new extent page, this routine may well
+ * be called more than once.
+ */
+static struct pageset_sizes_result count_data_pages(void)
+{
+	int num_free = 0;
+	unsigned long loop;
+	int use_pagedir2;
+	struct pageset_sizes_result result;
+	struct zone *zone;
+
+	result.size1 = 0;
+	result.size1low = 0;
+	result.size2 = 0;
+	result.size2low = 0;
+
+	num_nosave = 0;
+
+	clear_dyn_pageflags(pageset1_map);
+	clear_dyn_pageflags(pageset1_copy_map);
+
+	generate_free_page_map();
+
+	if (test_result_state(SUSPEND_ABORTED))
+		return result;
+
+	/*
+	 * Pages not to be saved are marked Nosave irrespective of being reserved
+	 */
+	for_each_zone(zone) {
+		for (loop = 0; loop < zone->spanned_pages; loop++) {
+			unsigned long pfn = zone->zone_start_pfn + loop;
+			struct page *page = pfn_to_page(pfn);
+
+			if (
+#if 0
+#ifdef CONFIG_DEBUG_RODATA
+			    (page >= rodata_start && page <= rodata_end) ||
+#endif
+#ifdef CONFIG_DEBUG_ROTEXT
+			    (page >= rotext_start && page <= rotext_end) ||
+#endif
+#ifdef CONFIG_PPC_RTAS
+			    (page >= rtas_start && page <= rtas_end) ||
+#endif
+			    !pfn_valid(pfn) || 
+			    (bad_ppro && page_kills_ppro(pfn)) ||
+			    (checksum_map && PageChecksumIgnore(page)) ||
+			    !page_is_ram(pfn)) {
+#endif
+			    (page >= nosave_start && page <= nosave_end) ||
+			    PageAllocd(page)) {
+				num_nosave++;
+				continue;
+			}
+			if (!PageReserved(page)) {
+				if (!PageInUse(page)) {
+					/* Cleared in generate_free_page_map */
+					num_free++;
+					continue;
+				}
+			} else {
+				if (PageHighMem(page)) {
+					/* HighMem pages may be marked Reserved. We ignore them. */
+					num_nosave++;
+					continue;
+				}
+			}
+
+			use_pagedir2 = PagePageset2(page);
+
+			if (use_pagedir2) {
+				result.size2++;
+				if (!PageHighMem(page))
+					result.size2low++;
+				SetPagePageset1Copy(page);
+			} else {
+				result.size1++;
+				SetPagePageset1(page);
+				if (!PageHighMem(page))
+					result.size1low++;
+			}
+		}
+	}
+	
+	suspend_message(SUSPEND_EAT_MEMORY, SUSPEND_MEDIUM, 0,
+		"Count data pages: Set1 (%d) + Set2 (%d) + Nosave (%d) + NumFree (%d) = %d.\n",
+		result.size1, result.size2, num_nosave, num_free,
+		result.size1 + result.size2 + num_nosave + num_free);
+	BITMAP_FOR_EACH_SET(allocd_pages_map, loop)
+		SetPagePageset1Copy(pfn_to_page(loop));
+	return result;
+}
+
+/* amount_needed
+ *
+ * Calculates the amount by which the image size needs to be reduced to meet
+ * our constraints.
+ */
+static int amount_needed(int use_image_size_limit)
+{
+
+	int max1 = max( (int) (ram_to_suspend() - real_nr_free_pages() - 
+			  nr_free_highpages()),
+			((int) (storage_needed(1, 0) -  
+			  storage_available)));
+	if (use_image_size_limit)
+		return max( max1,
+			    (image_size_limit > 0) ? 
+			    ((int) (storage_needed(1, 0) - (image_size_limit << 8))) : 0);
+	return max1;
+}
+
+/* suspend_recalculate_stats
+ *
+ * Recount the pagesets and recalculate their sizes. If storage is
+ * flagged as unavailable, skip asking the writer how much storage it
+ * has and skip displaying the stats.
+ */
+struct pageset_sizes_result suspend_recalculate_stats(int storage_unavailable) 
+{
+	struct pageset_sizes_result result;
+
+	suspend_mark_pages_for_pageset2();  /* Need to call this before getting pageset1_size! */
+	BUG_ON(in_atomic() && !irqs_disabled());
+	result = count_data_pages();
+	pageset1_sizelow = result.size1low;
+	pageset2_sizelow = result.size2low;
+	pagedir1.lastpageset_size = pagedir1.pageset_size = result.size1;
+	pagedir2.lastpageset_size = pagedir2.pageset_size = result.size2;
+	if (!storage_unavailable) {
+		storage_available = suspend_active_writer->ops.writer.storage_available();
+		display_stats(0, 0);
+	}
+	BUG_ON(in_atomic() && !irqs_disabled());
+	return result;
+}
+
+/* update_image
+ *
+ * Allocate [more] memory and storage for the image.
+ */
+static int update_image(void) 
+{ 
+	struct pageset_sizes_result result;
+	int result2, param_used;
+
+	result = suspend_recalculate_stats(0);
+
+	if (suspend_allocate_checksum_pages()) {
+		suspend_message(SUSPEND_ANY_SECTION, SUSPEND_LOW, 1,
+			"Still need to get more pages for checksum pages.\n");
+		return 1;
+	}
+
+	/* Include allowance for growth in pagedir1 while writing pagedir 2 */
+	if (suspend_allocate_extra_pagedir_memory(&pagedir1,
+				pagedir1.pageset_size + extra_pd1_pages_allowance,
+				pageset2_sizelow)) {
+		suspend_message(SUSPEND_EAT_MEMORY, SUSPEND_LOW, 1,
+			"Still need to get more pages for pagedir 1.\n");
+		return 1;
+	}
+
+	thaw_processes(FREEZER_KERNEL_THREADS);
+
+	param_used = main_storage_needed(1, 0);
+	if ((result2 = suspend_active_writer->ops.writer.allocate_storage(param_used))) {
+		suspend_message(SUSPEND_EAT_MEMORY, SUSPEND_LOW, 1,
+			"Allocate storage returned %d. Still need to get more storage space for the image proper.\n",
+			result2);
+		storage_allocated = suspend_active_writer->ops.writer.storage_allocated();
+		if (freeze_processes()) {
+			set_result_state(SUSPEND_FREEZING_FAILED);
+			set_result_state(SUSPEND_ABORTED);
+		}
+		return 1;
+	}
+
+	param_used = header_storage_needed();
+	if ((result2 = suspend_active_writer->ops.writer.allocate_header_space(param_used))) {
+		suspend_message(SUSPEND_EAT_MEMORY, SUSPEND_LOW, 1,
+			"Still need to get more storage space for header.\n");
+		if (freeze_processes()) {
+			set_result_state(SUSPEND_FREEZING_FAILED);
+			set_result_state(SUSPEND_ABORTED);
+		}
+		storage_allocated = suspend_active_writer->ops.writer.storage_allocated();
+		return 1;
+	}
+
+	header_space_allocated = param_used;
+
+	/* 
+	 * Allocate remaining storage space, if possible, up to the
+	 * maximum we know we'll need. It's okay to allocate the
+	 * maximum if the writer is the swapwriter, but
+	 * we don't want to grab all available space on an NFS share.
+	 * We therefore ignore the expected compression ratio here,
+	 * thereby trying to allocate the maximum image size we could
+	 * need (assuming compression doesn't expand the image), but
+	 * don't complain if we can't get the full amount we're after.
+	 */
+
+	suspend_active_writer->ops.writer.allocate_storage(
+		min(storage_available,
+		    main_storage_needed(0, 1)));
+
+	storage_allocated = suspend_active_writer->ops.writer.storage_allocated();
+
+	if (freeze_processes()) {
+		set_result_state(SUSPEND_FREEZING_FAILED);
+		set_result_state(SUSPEND_ABORTED);
+	}
+
+	suspend_recalculate_stats(0);
+
+	suspend_message(SUSPEND_EAT_MEMORY, SUSPEND_LOW, 1,
+		"Amount still needed (%d) > 0:%d. Header: %d < %d: %d,"
+		" Storage allocd: %d < %d + %d: %d.\n",
+			amount_needed(0),
+			(amount_needed(0) > 0),
+			header_space_allocated, header_storage_needed(),
+			header_space_allocated < header_storage_needed(),
+		 	storage_allocated,
+			header_storage_needed(), main_storage_needed(1, 1),
+			storage_allocated <
+			(header_storage_needed() + main_storage_needed(1, 1)));
+
+	check_shift_keys(0, NULL);
+
+	return ((amount_needed(0) > 0) ||
+		header_space_allocated < header_storage_needed() ||
+		 storage_allocated < 
+		 (header_storage_needed() + main_storage_needed(1, 1)));
+}
+
+/* attempt_to_freeze
+ * 
+ * Try to freeze processes.
+ */
+
+static int attempt_to_freeze(void)
+{
+	int result;
+	
+	/* Stop processes before checking again */
+	thaw_processes(FREEZER_ALL_THREADS);
+	suspend_prepare_status(CLEAR_BAR, "Freezing processes");
+	result = freeze_processes();
+
+	if (result) {
+		set_result_state(SUSPEND_ABORTED);
+		set_result_state(SUSPEND_FREEZING_FAILED);
+	} else
+		are_frozen = 1;
+
+	return result;
+}
+
+int storage_needed(int use_ecr, int ignore_extra_pd1_allow)
+{
+	return 	(main_storage_needed(use_ecr, ignore_extra_pd1_allow)
+		       + header_storage_needed());
+}
+
+int ram_to_suspend(void)
+{
+	return (1 + 
+		max((pagedir1.pageset_size + extra_pd1_pages_allowance - 
+				pageset2_sizelow), 0) +
+		MIN_FREE_RAM + suspend_memory_for_modules());
+}
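+
+/*
+ * Reading the formula above: only the excess of (pageset1 + allowance)
+ * over pageset2's lowmem size needs spare RAM for the atomic copy.
+ * e.g. with pagedir1.pageset_size = 20000, extra_pd1_pages_allowance =
+ * 500 and pageset2_sizelow = 30000, the max() term is 0 and
+ * ram_to_suspend() reduces to 1 + MIN_FREE_RAM +
+ * suspend_memory_for_modules().
+ */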
+
+
+/* eat_memory
+ *
+ * Try to free some memory, either to meet hard or soft constraints on the image
+ * characteristics.
+ * 
+ * Hard constraints:
+ * - Pageset1 must be < half of memory;
+ * - We must have enough memory free at resume time to have pageset1
+ *   be able to be loaded in pages that don't conflict with where it has to
+ *   be restored.
+ * Soft constraints
+ * - User-specified image size limit.
+ */
+static int eat_memory(void)
+{
+	int orig_memory_still_to_eat, last_amount_needed = 0, times_criteria_met = 0;
+	int free_flags = 0, did_eat_memory = 0;
+	
+	/*
+	 * Note that if we have enough storage space and enough free memory, we may
+	 * exit without eating anything. We give up when the last 10 iterations ate
+	 * no extra pages because we're not going to get much more anyway, but
+	 * the few pages we get will take a lot of time.
+	 *
+	 * We freeze processes before beginning, and then unfreeze them if we
+	 * need to eat memory until we think we have enough. If our attempts
+	 * to freeze fail, we give up and abort.
+	 */
+
+	/* -- Stage 1: Freeze Processes -- */
+
+	
+	suspend_recalculate_stats(0);
+
+	orig_memory_still_to_eat = amount_needed(1);
+	last_amount_needed = orig_memory_still_to_eat;
+
+	switch (image_size_limit) {
+		case -1: /* Don't eat any memory */
+			if (orig_memory_still_to_eat) {
+				set_result_state(SUSPEND_ABORTED);
+				set_result_state(SUSPEND_WOULD_EAT_MEMORY);
+			}
+			break;
+		case -2:  /* Free caches only */
+			free_flags = GFP_NOIO | __GFP_HIGHMEM;
+			break;
+		default:
+			free_flags = GFP_ATOMIC | __GFP_HIGHMEM;
+	}
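+
+	/*
+	 * image_size_limit semantics, as used above and in
+	 * amount_needed(): -1 = never eat memory, -2 = only drop clean
+	 * caches, > 0 = limit in MB (so "<< 8" converts MB to 4K pages
+	 * when comparing against storage_needed()).
+	 */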
+		
+	thaw_processes(FREEZER_KERNEL_THREADS);
+
+	/* -- Stage 2: Eat memory -- */
+
+	while (((amount_needed(1) > 0) || (image_size_limit == -2)) && 
+		(!test_result_state(SUSPEND_ABORTED)) && 
+		(times_criteria_met < 10)) {
+		int amount_freed;
+		int amount_wanted = orig_memory_still_to_eat - amount_needed(1);
+
+		suspend_prepare_status(CLEAR_BAR, "Seeking to free %dMB of memory.", MB(amount_needed(1)));
+
+		if (amount_wanted < 1)
+			amount_wanted = 1; /* image_size_limit == -2 */
+
+		if (orig_memory_still_to_eat)
+			suspend_update_status(orig_memory_still_to_eat - amount_needed(1),
+				orig_memory_still_to_eat,
+				" Image size %d ",
+				MB(storage_needed(1, 0)));
+		else
+			suspend_update_status(0, 1, "Image size %d ",
+					MB(storage_needed(1, 0)));
+		
+		if ((last_amount_needed - amount_needed(1)) < 10)
+			times_criteria_met++;
+		else
+			times_criteria_met = 0;
+		last_amount_needed = amount_needed(1);
+		amount_freed = shrink_all_memory(last_amount_needed);
+		suspend_recalculate_stats(0);
+
+		did_eat_memory = 1;
+
+		check_shift_keys(0, NULL);
+	}
+
+	if (freeze_processes()) {
+		set_result_state(SUSPEND_FREEZING_FAILED);
+		set_result_state(SUSPEND_ABORTED);
+	}
+	
+	if (did_eat_memory) {
+		unsigned long orig_state = get_suspend_state();
+		/* Freeze_processes will call sys_sync too */
+		restore_suspend_state(orig_state);
+		suspend_recalculate_stats(0);
+	}
+
+	/* Blank out image size display */
+	suspend_update_status(100, 100, NULL);
+
+	if (!test_result_state(SUSPEND_ABORTED)) {
+		/* Include image size limit when checking what to report */
+		if (amount_needed(1) - extra_pd1_pages_allowance > 0) 
+			set_result_state(SUSPEND_UNABLE_TO_FREE_ENOUGH_MEMORY);
+
+		/* But don't include it when deciding whether to abort (soft limit) */
+		if ((amount_needed(0) - extra_pd1_pages_allowance > 0)) {
+			printk("Unable to free sufficient memory to suspend. Still need %d pages.\n",
+				amount_needed(1));
+			display_stats(1, 1);
+			set_result_state(SUSPEND_ABORTED);
+		}
+		
+		check_shift_keys(1, "Memory eating completed.");
+	}
+
+	return 0;
+}
+
+/* prepare_image
+ *
+ * Entry point to the whole image preparation section.
+ *
+ * We do four things:
+ * - Freeze processes;
+ * - Ensure image size constraints are met;
+ * - Complete all the preparation for saving the image,
+ *   including allocation of storage. The only memory
+ *   that should be needed when we're finished is that
+ *   for actually storing the image (and we know how
+ *   much is needed for that because the modules tell
+ *   us).
+ * - Make sure that all dirty buffers are written out.
+ */
+
+#define MAX_TRIES 4
+int suspend_prepare_image(void)
+{
+	int result = 1, tries = 0;
+
+	are_frozen = 0;
+
+	header_space_allocated = 0;
+
+	if (attempt_to_freeze())
+		return 1;
+
+	if (!extra_pd1_pages_allowance)
+		get_extra_pd1_allowance();
+
+	storage_available = suspend_active_writer->ops.writer.storage_available();
+
+	if (!storage_available) {
+		printk(KERN_ERR "You need some storage available to be able to suspend.\n");
+		set_result_state(SUSPEND_ABORTED);
+		set_result_state(SUSPEND_NOSTORAGE_AVAILABLE);
+		return 1;
+	}
+
+	do {
+		suspend_prepare_status(CLEAR_BAR, "Preparing Image.");
+	
+		if (eat_memory() || test_result_state(SUSPEND_ABORTED))
+			break;
+
+		result = update_image();
+
+		check_shift_keys(0, NULL);
+		
+		tries++;
+
+	} while ((result) && (tries < MAX_TRIES) && (!test_result_state(SUSPEND_ABORTED)) &&
+		(!test_result_state(SUSPEND_UNABLE_TO_FREE_ENOUGH_MEMORY)));
+
+	if (tries == MAX_TRIES) {
+		abort_suspend("Unable to successfully prepare the image.\n");
+		display_stats(1, 0);
+	}
+
+	check_shift_keys(1, "Image preparation complete.");
+
+	return result;
+}
diff -urN oldtree/kernel/power/prepare_image.h newtree/kernel/power/prepare_image.h
--- oldtree/kernel/power/prepare_image.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/prepare_image.h	2006-03-08 16:41:02.759831250 +0000
@@ -0,0 +1,31 @@
+/*
+ * kernel/power/prepare_image.h
+ */
+
+extern int suspend_prepare_image(void);
+extern struct pageset_sizes_result suspend_recalculate_stats(int storage_unavailable);
+extern int real_nr_free_pages(void);
+extern int image_size_limit;
+extern int pageset1_sizelow, pageset2_sizelow;
+
+struct pageset_sizes_result {
+	int size1; /* Can't be unsigned - breaks MAX function */
+	int size1low;
+	int size2;
+	int size2low;
+};
+
+#ifdef CONFIG_CRYPTO
+extern int suspend_expected_compression_ratio(void);
+#else
+static inline int suspend_expected_compression_ratio(void)
+{
+	/* No compression module built in: expect 100% of original size. */
+	return 100;
+}
+#endif
+
+#define MIN_FREE_RAM (max_low_pfn >> 7)
+
+extern int extra_pd1_pages_allowance;
+extern int storage_needed(int use_ecr, int ignore_extra_p1_allowance);
+extern int ram_to_suspend(void);
diff -urN oldtree/kernel/power/proc.c newtree/kernel/power/proc.c
--- oldtree/kernel/power/proc.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/proc.c	2006-03-08 15:22:33.309509000 +0000
@@ -0,0 +1,305 @@
+/*
+ * kernel/power/proc.c
+ *
+ * Copyright (C) 2002-2005 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * This file is released under the GPLv2.
+ *
+ * This file contains support for proc entries for tuning Suspend2.
+ *
+ * We have a generic handler that deals with the most common cases, and
+ * hooks for special handlers to use.
+ */
+
+#include <linux/suspend.h>
+#include <linux/module.h>
+#include <asm/uaccess.h>
+
+#include "proc.h"
+#include "suspend2.h"
+#include "storage.h"
+
+static int suspend_proc_initialised = 0;
+
+static struct list_head suspend_proc_entries;
+static struct proc_dir_entry *suspend_dir;
+static struct suspend_proc_data proc_params[];
+
+extern void __suspend_try_resume(void);
+extern void suspend_main(void);
+
+/* suspend_read_proc
+ *
+ * Generic handling for reading the contents of bits, integers,
+ * unsigned longs and strings.
+ */
+static int suspend_read_proc(char *page, char **start, off_t off, int count,
+		int *eof, void *data)
+{
+	int len = 0;
+	struct suspend_proc_data *proc_data = (struct suspend_proc_data *) data;
+
+	if (suspend_start_anything(0))
+		return -EBUSY;
+	
+	if (proc_data->needs_storage_manager & 1)
+		suspend_prepare_usm();
+	
+	switch (proc_data->type) {
+		case SUSPEND_PROC_DATA_CUSTOM:
+			if (proc_data->data.special.read_proc) {
+				read_proc_t *read_proc = proc_data->data.special.read_proc;
+				len = read_proc(page, start, off, count, eof, data);
+			} else
+				len = 0;
+			break;
+		case SUSPEND_PROC_DATA_BIT:
+			len = sprintf(page, "%d\n",
+				!!test_bit(proc_data->data.bit.bit,
+					proc_data->data.bit.bit_vector));
+			break;
+		case SUSPEND_PROC_DATA_INTEGER:
+			{
+				int *variable = proc_data->data.integer.variable;
+				len = sprintf(page, "%d\n", *variable);
+				break;
+			}
+		case SUSPEND_PROC_DATA_UL:
+			{
+				long *variable = proc_data->data.ul.variable;
+				len = sprintf(page, "%lu\n", *variable);
+				break;
+			}
+		case SUSPEND_PROC_DATA_STRING:
+			{
+				char *variable = proc_data->data.string.variable;
+				len = sprintf(page, "%s\n", variable);
+				break;
+			}
+	}
+	/* Side effect routine? */
+	if (proc_data->read_proc)
+		proc_data->read_proc();
+	
+	if (len <= count)
+		*eof = 1;
+	
+	if (proc_data->needs_storage_manager & 1)
+		suspend_cleanup_usm();
+
+	suspend_finish_anything(0);
+	
+	return len;
+}
+/* suspend_write_proc
+ *
+ * Generic routine for handling writing to files representing
+ * bits, integers and unsigned longs.
+ */
+
+static int suspend_write_proc(struct file *file, const char *buffer,
+		unsigned long count, void *data)
+{
+	struct suspend_proc_data *proc_data = (struct suspend_proc_data *) data;
+	char *my_buf = (char *) get_zeroed_page(GFP_ATOMIC);
+	int result = count, assigned_temp_buffer = 0;
+
+	if (!my_buf)
+		return -ENOMEM;
+
+	if (count > PAGE_SIZE - 1)
+		count = PAGE_SIZE - 1;
+
+	if (copy_from_user(my_buf, buffer, count)) {
+		free_page((unsigned long) my_buf);
+		return -EFAULT;
+	}
+
+	if (suspend_start_anything(proc_data == &proc_params[0])) {
+		free_page((unsigned long) my_buf);
+		return -EBUSY;
+	}
+
+	my_buf[count] = 0;
+
+	if (proc_data->needs_storage_manager & 2)
+		suspend_prepare_usm();
+
+	switch (proc_data->type) {
+		case SUSPEND_PROC_DATA_CUSTOM:
+			if (proc_data->data.special.write_proc) {
+				write_proc_t *write_proc = proc_data->data.special.write_proc;
+				result = write_proc(file, buffer, count, data);
+			}
+			break;
+		case SUSPEND_PROC_DATA_BIT:
+			{
+			int value = simple_strtoul(my_buf, NULL, 0);
+			if (value)
+				set_bit(proc_data->data.bit.bit, 
+					(proc_data->data.bit.bit_vector));
+			else
+				clear_bit(proc_data->data.bit.bit,
+					(proc_data->data.bit.bit_vector));
+			}
+			break;
+		case SUSPEND_PROC_DATA_INTEGER:
+			{
+				int *variable = proc_data->data.integer.variable;
+				int minimum = proc_data->data.integer.minimum;
+				int maximum = proc_data->data.integer.maximum;
+				*variable = simple_strtol(my_buf, NULL, 0);
+				if (((*variable) < minimum))
+					*variable = minimum;
+
+				if (((*variable) > maximum))
+					*variable = maximum;
+				break;
+			}
+		case SUSPEND_PROC_DATA_UL:
+			{
+				unsigned long *variable = proc_data->data.ul.variable;
+				unsigned long minimum = proc_data->data.ul.minimum;
+				unsigned long maximum = proc_data->data.ul.maximum;
+				*variable = simple_strtoul(my_buf, NULL, 0);
+				
+				if (minimum && ((*variable) < minimum))
+					*variable = minimum;
+
+				if (maximum && ((*variable) > maximum))
+					*variable = maximum;
+				break;
+			}
+		case SUSPEND_PROC_DATA_STRING:
+			{
+				int copy_len = count;
+				char *variable =
+					proc_data->data.string.variable;
+
+				if (proc_data->data.string.max_length &&
+				    (copy_len > proc_data->data.string.max_length))
+					copy_len = proc_data->data.string.max_length;
+
+				if (!variable) {
+					proc_data->data.string.variable =
+						variable = (char *) get_zeroed_page(GFP_ATOMIC);
+					assigned_temp_buffer = 1;
+				}
+				strncpy(variable, my_buf, copy_len);
+				if (copy_len &&
+					 (my_buf[copy_len - 1] == '\n'))
+					variable[copy_len - 1] = 0;
+				variable[copy_len] = 0;
+			}
+			break;
+	}
+	free_page((unsigned long) my_buf);
+	/* Side effect routine? */
+	if (proc_data->write_proc)
+		proc_data->write_proc();
+
+	/* Free temporary buffers */
+	if (assigned_temp_buffer) {
+		free_page((unsigned long) proc_data->data.string.variable);
+		proc_data->data.string.variable = NULL;
+	}
+
+	if (proc_data->needs_storage_manager & 2)
+		suspend_cleanup_usm();
+
+	suspend_finish_anything(proc_data == &proc_params[0]);
+	
+	return result;
+}
+
+/* Non-module proc entries.
+ *
+ * This array contains entries that are automatically registered at
+ * boot. Plugins and the console code register their own entries separately.
+ *
+ * NB: If you move do_suspend, change suspend_write_proc's test so that
+ * suspend_start_anything still gets a 1 when the user echos > do_suspend!
+ */
+
+static struct suspend_proc_data proc_params[] = {
+	{ .filename			= "do_suspend",
+	  .permissions			= PROC_WRITEONLY,
+	  .type				= SUSPEND_PROC_DATA_CUSTOM,
+	  .write_proc			= suspend_main,
+	  .needs_storage_manager	= 2,
+	},
+
+	{ .filename			= "do_resume",
+	  .permissions			= PROC_WRITEONLY,
+	  .type				= SUSPEND_PROC_DATA_CUSTOM,
+	  .write_proc			= __suspend_try_resume,
+	  .needs_storage_manager	= 2,
+	},
+};
+
+/* suspend_initialise_proc
+ *
+ * Initialise the /proc/suspend2 directory.
+ */
+
+static void suspend_initialise_proc(void)
+{
+	int i;
+	int numfiles = sizeof(proc_params) / sizeof(struct suspend_proc_data);
+	
+	if (suspend_proc_initialised)
+		return;
+
+	suspend_dir = proc_mkdir("suspend2", NULL);
+	
+	BUG_ON(!suspend_dir);
+
+	INIT_LIST_HEAD(&suspend_proc_entries);
+
+	suspend_proc_initialised = 1;
+
+	for (i = 0; i < numfiles; i++)
+		suspend_register_procfile(&proc_params[i]);
+}
+
+/* suspend_register_procfile
+ *
+ * Helper for registering a new /proc/suspend2 entry.
+ */
+
+struct proc_dir_entry *suspend_register_procfile(
+		struct suspend_proc_data *suspend_proc_data)
+{
+	struct proc_dir_entry *new_entry;
+	
+	if (!suspend_proc_initialised)
+		suspend_initialise_proc();
+
+	new_entry = create_proc_entry(
+			suspend_proc_data->filename,
+			suspend_proc_data->permissions, 
+			suspend_dir);
+	if (new_entry) {
+		list_add_tail(&suspend_proc_data->proc_data_list, &suspend_proc_entries);
+		new_entry->read_proc = suspend_read_proc;
+		new_entry->write_proc = suspend_write_proc;
+		new_entry->data = suspend_proc_data;
+	} else {
+		printk("Error! create_proc_entry returned NULL.\n");
+		INIT_LIST_HEAD(&suspend_proc_data->proc_data_list);
+	}
+	return new_entry;
+}
+
+/* suspend_unregister_procfile
+ *
+ * Helper for removing unwanted /proc/suspend2 entries.
+ *
+ */
+void suspend_unregister_procfile(struct suspend_proc_data *suspend_proc_data)
+{
+	if (list_empty(&suspend_proc_data->proc_data_list))
+		return;
+
+	remove_proc_entry(
+		suspend_proc_data->filename,
+		suspend_dir);
+	list_del(&suspend_proc_data->proc_data_list);
+}
diff -urN oldtree/kernel/power/proc.h newtree/kernel/power/proc.h
--- oldtree/kernel/power/proc.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/proc.h	2006-03-08 15:22:33.313509250 +0000
@@ -0,0 +1,70 @@
+/*
+ * kernel/power/proc.h
+ *
+ * Copyright (C) 2004-2005 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * This file is released under the GPLv2.
+ *
+ * It provides declarations for suspend to use in managing
+ * /proc/suspend2. When we switch to kobjects,
+ * this will become redundant.
+ *
+ */
+
+#include <linux/proc_fs.h>
+
+struct suspend_proc_data {
+	char *filename;
+	int permissions;
+	int type;
+	int needs_storage_manager;
+	union {
+		struct {
+			unsigned long *bit_vector;
+			int bit;
+		} bit;
+		struct {
+			int *variable;
+			int minimum;
+			int maximum;
+		} integer;
+		struct {
+			unsigned long *variable;
+			unsigned long minimum;
+			unsigned long maximum;
+		} ul;
+		struct {
+			char *variable;
+			int max_length;
+		} string;
+		struct {
+			read_proc_t *read_proc;
+			write_proc_t *write_proc;
+			void *data;
+		} special;
+	} data;
+	
+	/* Side effects routines. Used, eg, for reparsing the
+	 * resume2 entry when it changes */
+	void (*read_proc) (void);
+	void (*write_proc) (void); 
+	struct list_head proc_data_list;
+};
+
+enum {
+	SUSPEND_PROC_DATA_NONE,
+	SUSPEND_PROC_DATA_CUSTOM,
+	SUSPEND_PROC_DATA_BIT,
+	SUSPEND_PROC_DATA_INTEGER,
+	SUSPEND_PROC_DATA_UL,
+	SUSPEND_PROC_DATA_STRING
+};
+
+#define PROC_WRITEONLY 0200
+#define PROC_READONLY 0400
+#define PROC_RW 0600
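+
+/*
+ * Illustrative sketch only: a SUSPEND_PROC_DATA_BIT entry toggles a single
+ * flag bit in a bitmap such as suspend_action. The entry name and bit
+ * number are hypothetical; the fields come from the union above.
+ *
+ *	{ .filename	= "example_flag",
+ *	  .permissions	= PROC_RW,
+ *	  .type		= SUSPEND_PROC_DATA_BIT,
+ *	  .data = {
+ *		.bit = {
+ *			.bit_vector	= &suspend_action,
+ *			.bit		= EXAMPLE_FLAG_BIT,
+ *		}
+ *	  }
+ *	},
+ */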
+
+struct proc_dir_entry *suspend_register_procfile(
+		struct suspend_proc_data *suspend_proc_data);
+void suspend_unregister_procfile(struct suspend_proc_data *suspend_proc_data);
+
diff -urN oldtree/kernel/power/process.c newtree/kernel/power/process.c
--- oldtree/kernel/power/process.c	2006-03-08 18:48:02.960064000 +0000
+++ newtree/kernel/power/process.c	2006-03-08 16:35:25.930780750 +0000
@@ -1,165 +1,417 @@
-/*
- * drivers/power/process.c - Functions for starting/stopping processes on 
- *                           suspend transitions.
+ /*
+ * kernel/power/process.c
+ *
+ * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
+ * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
+ * Copyright (C) 2002-2004 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * This file is released under the GPLv2.
+ *
+ * This file contains the routines software suspend uses to freeze other
+ * processes during the suspend cycle and to (if necessary) free up memory in
+ * accordance with limitations on the image size.
+ *
+ * Ideally, the image saved to disk would be an atomic copy of the entire
+ * contents of all RAM and related hardware state. One of the first
+ * prerequisites for getting our approximation of this is stopping the activity
+ * of other processes. We can't stop all other processes, however, since some
+ * are needed in doing the I/O to save the image. This file contains
+ * the routines that control suspension and resuming of these processes.
+ *
+ * Under high I/O load, we need to be careful about the order in which we
+ * freeze processes. If we freeze processes in the wrong order, we could
+ * deadlock others. Userspace processes are therefore frozen first; kernel
+ * threads are only frozen once userspace has safely entered the
+ * refrigerator.
  *
- * Originally from swsusp.
+ * Another complicating factor is that freeing memory requires that processes
+ * not be frozen, but at the end of freeing memory, they need to be frozen
+ * so that we can be sure we have actually eaten enough memory. This is why
+ * freezing and freeing live in the same file. The freezer is not called from
+ * the main logic, but indirectly, via the code for eating memory. The eat
+ * memory logic is iterative: first freeze processes and check the stats,
+ * then (if necessary) unfreeze them and eat more memory, until it looks
+ * like the criteria are met (at which point processes are frozen and stats
+ * checked again).
  */
 
 
-#undef DEBUG
-
-#include <linux/smp_lock.h>
-#include <linux/interrupt.h>
 #include <linux/suspend.h>
+#include <linux/freezer.h>
 #include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/namespace.h>
+#include <linux/buffer_head.h>
+#include <linux/hardirq.h>
+#include <asm/tlbflush.h>
 #include <linux/syscalls.h>
 
-/* 
- * Timeout for stopping processes
+unsigned long freezer_state = 0;
+
+#if 0	/* Flip to 1 (or key off CONFIG_PM_DEBUG) for freezer debugging output */
+#define freezer_message(msg, a...) do { printk(msg, ##a); } while (0)
+#else
+#define freezer_message(msg, a...) do { } while (0)
+#endif
+
+/* Timeouts when freezing */
+#define FREEZER_TOTAL_TIMEOUT (5 * HZ)
+#define FREEZER_CHECK_TIMEOUT (HZ / 10)
+
+DECLARE_COMPLETION(kernelspace_thaw);
+DECLARE_COMPLETION(userspace_thaw);
+static atomic_t nr_userspace_frozen;
+static atomic_t nr_kernelspace_frozen;
+struct frozen_fs {
+	struct list_head fsb_list;
+	struct super_block *sb;
+};
+
+LIST_HEAD(frozen_fs_list);
+
+void freezer_make_fses_rw(void)
+{
+	struct frozen_fs *fs, *next_fs;
+
+	list_for_each_entry_safe(fs, next_fs, &frozen_fs_list, fsb_list) {
+		thaw_bdev(fs->sb->s_bdev, fs->sb);
+
+		list_del(&fs->fsb_list);
+		kfree(fs);
+	}
+}
+
+/*
+ * Done after userspace is frozen, so there should be no danger of
+ * fses being unmounted while we're in here.
+ */
+int freezer_make_fses_ro(void)
+{
+	struct frozen_fs *fs;
+	struct super_block *sb;
+
+	/* Generate the list */
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (!sb->s_root || !sb->s_bdev ||
+		    (sb->s_frozen == SB_FREEZE_TRANS) ||
+		    (sb->s_flags & MS_RDONLY))
+			continue;
+
+		fs = kmalloc(sizeof(struct frozen_fs), GFP_ATOMIC);
+		if (!fs)
+			continue;	/* Skip this fs rather than oops below */
+		fs->sb = sb;
+		list_add_tail(&fs->fsb_list, &frozen_fs_list);
+	}
+
+	/* Do the freezing in reverse order so filesystems dependent
+	 * upon others are frozen in the right order. (e.g. loopback
+	 * on ext3). */
+	list_for_each_entry_reverse(fs, &frozen_fs_list, fsb_list)
+		freeze_bdev(fs->sb->s_bdev);
+	return 0;
+}
+
+/*
+ * freezeable
+ *
+ * Description:	Determine whether a process should be frozen yet.
+ * Parameters:	struct task_struct *	The process to consider.
+ *		int			0 = userspace tasks only,
+ *					non-zero = all tasks.
+ * Returns:	int			0 = don't freeze yet, non-zero = freeze.
  */
-#define TIMEOUT	(20 * HZ)
+
+static int freezeable(struct task_struct *p, int all_freezable)
-static inline int freezeable(struct task_struct * p)
 {
 	if ((p == current) || 
+	    (p->flags & PF_FROZEN) ||
 	    (p->flags & PF_NOFREEZE) ||
 	    (p->exit_state == EXIT_ZOMBIE) ||
 	    (p->exit_state == EXIT_DEAD) ||
 	    (p->state == TASK_STOPPED) ||
-	    (p->state == TASK_TRACED))
+	    (p->state == TASK_TRACED) ||
+	    (!p->mm && !all_freezable))
 		return 0;
 	return 1;
 }
 
-/* Refrigerator is place where frozen processes are stored :-). */
+static void __freeze_process(struct completion *completion_handler,
+			     atomic_t *nr_frozen)
+{
+	long save;
+
+	freezer_message("%s (%d) frozen.\n",
+			current->comm, current->pid);
+	save = current->state;
+	atomic_inc(nr_frozen);
+	wait_for_completion(completion_handler);
+	atomic_dec(nr_frozen);
+
+	current->state = save;
+	freezer_message("%s (%d) leaving freezer.\n",
+			current->comm, current->pid);
+}
+
+/*
+ * refrigerator
+ *
+ * The place where frozen processes sleep until they are thawed.
+ */
 void refrigerator(void)
 {
-	/* Hmm, should we be allowed to suspend when there are realtime
-	   processes around? */
-	long save;
-	save = current->state;
-	pr_debug("%s entered refrigerator\n", current->comm);
-	printk("=");
-
-	frozen_process(current);
-	spin_lock_irq(&current->sighand->siglock);
-	recalc_sigpending(); /* We sent fake signal, clean it up */
-	spin_unlock_irq(&current->sighand->siglock);
-
-	while (frozen(current)) {
-		current->state = TASK_UNINTERRUPTIBLE;
-		schedule();
+	unsigned long flags;
+
+	might_sleep();
+	/* Locking to handle the race against waking the process in
+	 * freeze_threads(). */
+	spin_lock_irqsave(&current->sighand->siglock, flags);
+	frozen_process(current);
+	recalc_sigpending();
+	spin_unlock_irqrestore(&current->sighand->siglock, flags);
+
+	if (test_freezer_state(FREEZER_ON)) {
+		if (current->mm)
+			__freeze_process(&userspace_thaw, &nr_userspace_frozen);
+		else
+			__freeze_process(&kernelspace_thaw, &nr_kernelspace_frozen);
 	}
-	pr_debug("%s left refrigerator\n", current->comm);
-	current->state = save;
+	spin_lock_irqsave(&current->sighand->siglock, flags);
+	recalc_sigpending();
+	current->flags &= ~PF_FROZEN;
+	spin_unlock_irqrestore(&current->sighand->siglock, flags);
+}
+
+void thaw_processes(int do_all_threads)
+{
+	if (do_all_threads) {
+		clear_freezer_state(FREEZER_ON);
+		clear_freezer_state(ABORT_FREEZING);
+	}
+
+	complete_all(&kernelspace_thaw);
+	while (atomic_read(&nr_kernelspace_frozen) > 0)
+		yield();
+
+	init_completion(&kernelspace_thaw);
+	freezer_make_fses_rw();
+
+	if (do_all_threads) {
+		complete_all(&userspace_thaw);
+		while (atomic_read(&nr_userspace_frozen) > 0)
+			yield();
+		init_completion(&userspace_thaw);
+	}
 }
 
-static inline void freeze_process(struct task_struct *p)
+ /*
+ * num_freezeable
+ *
+ * Description:       Determine how many processes of our type are still to be
+ *            frozen. As a side effect, update the progress bar too.
+ * Parameters:        int     Which type we are trying to freeze.
+ *            int     Whether we are displaying our progress.
+ */
+static int num_freezeable(int do_all_threads) {
+
+      struct task_struct *g, *p;
+      int todo_this_type = 0;
+      read_lock(&tasklist_lock);
+      do_each_thread(g, p) {
+              if (freezeable(p, do_all_threads))
+                      todo_this_type++;
+      } while_each_thread(g, p);
+      read_unlock(&tasklist_lock);
+
+      return todo_this_type;
+}
+
+/*
+ * num_uninterruptible
+ *
+ * Description:	Count how many processes of the given type are in
+ *		TASK_UNINTERRUPTIBLE, i.e. have entered the refrigerator.
+ * Parameters:	int	Whether kernel threads are included as well as
+ *			userspace tasks.
+ */
+static int num_uninterruptible(int do_all_threads)
+{
+	struct task_struct *g, *p;
+	int count = 0;
+
+	read_lock(&tasklist_lock);
+	do_each_thread(g, p) {
+		if (freezeable(p, do_all_threads) &&
+		    p->state == TASK_UNINTERRUPTIBLE)
+			count++;
+	} while_each_thread(g, p);
+	read_unlock(&tasklist_lock);
+
+	return count;
+}
+
+/*
+ * Tell threads of the type to enter the freezer.
+ */
+static void signal_threads(int do_all_threads)
 {
-	unsigned long flags;
+	struct task_struct *g, *p;
+	unsigned long flags;
 
-	if (!freezing(p)) {
-		freeze(p);
-		spin_lock_irqsave(&p->sighand->siglock, flags);
-		signal_wake_up(p, 0);
-		spin_unlock_irqrestore(&p->sighand->siglock, flags);
-	}
+	read_lock(&tasklist_lock);
+	do_each_thread(g, p) {
+		if (!freezeable(p, do_all_threads))
+			continue;
+
+		freeze(p);
+		spin_lock_irqsave(&p->sighand->siglock, flags);
+		signal_wake_up(p, 0);
+		spin_unlock_irqrestore(&p->sighand->siglock, flags);
+	} while_each_thread(g, p);
+	read_unlock(&tasklist_lock);
 }
 
-/* 0 = success, else # of processes that we failed to stop */
+/*
+ * Prod processes that haven't entered the refrigerator yet.
+ */
+static void prod_processes(int do_all_threads)
+{
+	struct task_struct *g, *p;
+	unsigned long flags;
+
+	read_lock(&tasklist_lock);
+	do_each_thread(g, p) {
+		if (!freezeable(p, do_all_threads))
+			continue;
+
+		spin_lock_irqsave(&p->sighand->siglock, flags);
+		if (!(p->flags & PF_FROZEN)) {
+			recalc_sigpending_tsk(p);
+			signal_wake_up(p, 0);
+		}
+		spin_unlock_irqrestore(&p->sighand->siglock, flags);
+	} while_each_thread(g, p);
+	read_unlock(&tasklist_lock);
+}
+
+/*
+ * Freezer failure.
+ *
+ * Check whether we failed to freeze all the processes that
+ * should be frozen. If we find a task that failed to freeze,
+ * we give useful information on what failed and how.
+ */
+static int freezer_failure(int do_all_threads)
+{
+	int result = 0;
+	struct task_struct *g, *p;
+
+	read_lock(&tasklist_lock);
+	do_each_thread(g, p) {
+		if (!freezeable(p, do_all_threads) ||
+		    p->state == TASK_UNINTERRUPTIBLE)
+			continue;
+
+		if (!result) {
+			printk(KERN_ERR "Stopping tasks failed.\n");
+			printk(KERN_ERR "Tasks that refused to be "
+				"refrigerated and haven't since exited:\n");
+			set_freezer_state(ABORT_FREEZING);
+			result = 1;
+		}
+
+		if (freezing(p))
+			printk(" - %s (#%d) signalled but "
+				"didn't enter refrigerator.\n",
+				p->comm, p->pid);
+		else
+			printk(" - %s (#%d) signalled but "
+				"no longer marked as freezing.\n",
+				p->comm, p->pid);
+	} while_each_thread(g, p);
+	read_unlock(&tasklist_lock);
+
+	return result;
+}
+
+/*
+ * freeze_threads
+ *
+ * Freeze a set of threads having particular attributes.
+ *
+ * Parameters:	int	0 = freeze userspace threads only,
+ *			non-zero = freeze kernel threads as well.
+ */
+static int freeze_threads(int do_all_threads)
+{
+	int result = 0, still_to_do;
+	unsigned long start_time = jiffies;
+
+	if (do_all_threads)
+		freezer_make_fses_ro();
+
+	signal_threads(do_all_threads);
+
+	/* Watch them do it, wake them if they ignore us. */
+	do {
+		prod_processes(do_all_threads);
+
+		set_task_state(current, TASK_INTERRUPTIBLE);
+		schedule_timeout(FREEZER_CHECK_TIMEOUT);
+
+		still_to_do = num_freezeable(do_all_threads) -
+			num_uninterruptible(do_all_threads);
+
+	} while (still_to_do && !test_freezer_state(ABORT_FREEZING) &&
+		!time_after(jiffies, start_time + FREEZER_TOTAL_TIMEOUT));
+
+	/* Did we time out? See if we failed to freeze processes as well. */
+	if (time_after(jiffies, start_time + FREEZER_TOTAL_TIMEOUT) &&
+	    still_to_do)
+		result = freezer_failure(do_all_threads);
+
+	BUG_ON(in_atomic());
+
+	return result;
+}
+
+/*
+ * freeze_processes - Freeze processes prior to saving an image of memory.
+ *
+ * Return value: 0 = success, 1 = failure.
+ */
 int freeze_processes(void)
 {
-	int todo, nr_user, user_frozen;
-	unsigned long start_time;
-	struct task_struct *g, *p;
-	unsigned long flags;
-
-	printk( "Stopping tasks: " );
-	start_time = jiffies;
-	user_frozen = 0;
-	do {
-		nr_user = todo = 0;
-		read_lock(&tasklist_lock);
-		do_each_thread(g, p) {
-			if (!freezeable(p))
-				continue;
-			if (frozen(p))
-				continue;
-			if (p->mm && !(p->flags & PF_BORROWED_MM)) {
-				/* The task is a user-space one.
-				 * Freeze it unless there's a vfork completion
-				 * pending
-				 */
-				if (!p->vfork_done)
-					freeze_process(p);
-				nr_user++;
-			} else {
-				/* Freeze only if the user space is frozen */
-				if (user_frozen)
-					freeze_process(p);
-				todo++;
-			}
-		} while_each_thread(g, p);
-		read_unlock(&tasklist_lock);
-		todo += nr_user;
-		if (!user_frozen && !nr_user) {
-			sys_sync();
-			start_time = jiffies;
-		}
-		user_frozen = !nr_user;
-		yield();			/* Yield is okay here */
-		if (todo && time_after(jiffies, start_time + TIMEOUT))
-			break;
-	} while(todo);
-
-	/* This does not unfreeze processes that are already frozen
-	 * (we have slightly ugly calling convention in that respect,
-	 * and caller must call thaw_processes() if something fails),
-	 * but it cleans up leftover PF_FREEZE requests.
-	 */
-	if (todo) {
-		printk( "\n" );
-		printk(KERN_ERR " stopping tasks timed out "
-			"after %d seconds (%d tasks remaining):\n",
-			TIMEOUT / HZ, todo);
-		read_lock(&tasklist_lock);
-		do_each_thread(g, p) {
-			if (freezeable(p) && !frozen(p))
-				printk(KERN_ERR "  %s\n", p->comm);
-			if (freezing(p)) {
-				pr_debug("  clean up: %s\n", p->comm);
-				p->flags &= ~PF_FREEZE;
-				spin_lock_irqsave(&p->sighand->siglock, flags);
-				recalc_sigpending_tsk(p);
-				spin_unlock_irqrestore(&p->sighand->siglock, flags);
-			}
-		} while_each_thread(g, p);
-		read_unlock(&tasklist_lock);
-		return todo;
-	}
+	enum system_states old_state = system_state;
+	int result = 0;
 
-	printk( "|\n" );
-	BUG_ON(in_atomic());
-	return 0;
-}
-
-void thaw_processes(void)
-{
-	struct task_struct *g, *p;
-
-	printk( "Restarting tasks..." );
-	read_lock(&tasklist_lock);
-	do_each_thread(g, p) {
-		if (!freezeable(p))
-			continue;
-		if (!thaw_process(p))
-			printk(KERN_INFO " Strange, %s not stopped\n", p->comm );
-	} while_each_thread(g, p);
-
-	read_unlock(&tasklist_lock);
-	schedule();
-	printk( " done\n" );
+	if (!test_freezer_state(FREEZER_ON)) {
+		/*
+		 * No race. While !FREEZER_ON, processes
+		 * won't enter __freeze_process.
+		 */
+		init_completion(&userspace_thaw);
+		init_completion(&kernelspace_thaw);
+		set_freezer_state(FREEZER_ON);
+	}
+
+	/* Now freeze userspace processes that were syncing and are still
+	 * running. */
+	if (freeze_threads(0) || test_freezer_state(ABORT_FREEZING)) {
+		result = 1;
+		goto out;
+	}
+
+	/* Now the kernel threads. */
+	if (freeze_threads(1) || test_freezer_state(ABORT_FREEZING))
+		result = 1;
+
+out:
+	system_state = old_state;
+	return result;
 }
 
+EXPORT_SYMBOL(freezer_state);
 EXPORT_SYMBOL(refrigerator);
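+
+/*
+ * Usage sketch (illustrative only): a kernel thread that cooperates with
+ * the freezer checks for a pending freeze request in its main loop and
+ * calls refrigerator() to park itself until thaw_processes() runs.
+ * example_kthread() and do_work() are hypothetical; freezing() and
+ * refrigerator() are the real interfaces.
+ *
+ *	static int example_kthread(void *unused)
+ *	{
+ *		while (!kthread_should_stop()) {
+ *			if (freezing(current))
+ *				refrigerator();
+ *
+ *			do_work();
+ *			schedule_timeout_interruptible(HZ);
+ *		}
+ *		return 0;
+ *	}
+ */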
diff -urN oldtree/kernel/power/snapshot.c newtree/kernel/power/snapshot.c
--- oldtree/kernel/power/snapshot.c	2006-03-08 18:48:02.960064000 +0000
+++ newtree/kernel/power/snapshot.c	2006-03-08 15:22:33.321509750 +0000
@@ -176,7 +176,7 @@
 		return 0;
 
 	page = pfn_to_page(pfn);
-	BUG_ON(PageReserved(page) && PageNosave(page));
+	/* Disabled: this assertion can fire with Suspend2, which may mark
+	 * reserved pages Nosave: BUG_ON(PageReserved(page) && PageNosave(page)); */
 	if (PageNosave(page))
 		return 0;
 	if (PageReserved(page) && pfn_is_nosave(pfn))
diff -urN oldtree/kernel/power/storage.c newtree/kernel/power/storage.c
--- oldtree/kernel/power/storage.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/storage.c	2006-03-08 15:22:33.321509750 +0000
@@ -0,0 +1,323 @@
+/*
+ * kernel/power/storage.c
+ *
+ * Copyright (C) 2005 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Routines for talking to a userspace program that manages storage.
+ *
+ * The kernel side:
+ * - starts the userspace program;
+ * - sends messages telling it when to open and close the connection;
+ * - tells it when to quit;
+ *
+ * The user space side:
+ * - passes messages regarding status;
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/freezer.h>
+ 
+#include "proc.h"
+#include "modules.h"
+#include "netlink.h"
+#include "storage.h"
+#include "ui.h"
+
+static struct user_helper_data usm_helper_data;
+static struct suspend_module_ops usm_ops;
+static int message_received = 0;
+static int activations = 0;
+static int usm_prepare_count = 0;
+static int storage_manager_last_action = 0;
+static int storage_manager_action = 0;
+
+static int usm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	int type;
+	int *data;
+
+	type = nlh->nlmsg_type;
+
+	/* A control message: ignore it */
+	if (type < NETLINK_MSG_BASE)
+		return 0;
+
+	/* Unknown message: reply with EINVAL */
+	if (type >= USM_MSG_MAX)
+		return -EINVAL;
+
+	/* All operations require privileges, even GET */
+	if (security_netlink_recv(skb))
+		return -EPERM;
+
+	/* Only allow one task to receive NOFREEZE privileges */
+	if (type == NETLINK_MSG_NOFREEZE_ME && usm_helper_data.pid != -1)
+		return -EBUSY;
+
+	data = (int*)NLMSG_DATA(nlh);
+
+	switch (type) {
+		case USM_MSG_SUCCESS:
+		case USM_MSG_FAILED:
+			message_received = type;
+			complete(&usm_helper_data.wait_for_process);
+			break;
+		default:
+			printk(KERN_INFO "Storage manager doesn't "
+				"recognise message %d.\n", type);
+	}
+
+	return 1;
+}
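+
+/*
+ * Userspace counterpart (illustrative sketch only, not kernel code): a
+ * minimal storage manager loop replying USM_MSG_SUCCESS to each request.
+ * The handshake details (pid registration, NOFREEZE negotiation) are
+ * glossed over here; NETLINK_SUSPEND2_USM and the USM_MSG_* values are
+ * the real protocol constants.
+ *
+ *	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SUSPEND2_USM);
+ *	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
+ *	char buf[256];
+ *
+ *	bind(fd, (struct sockaddr *) &sa, sizeof(sa));
+ *	while (recv(fd, buf, sizeof(buf), 0) > 0) {
+ *		struct nlmsghdr *nlh = (struct nlmsghdr *) buf;
+ *
+ *		if (nlh->nlmsg_type == USM_MSG_CONNECT ||
+ *		    nlh->nlmsg_type == USM_MSG_DISCONNECT) {
+ *			... open or close the storage here ...
+ *			nlh->nlmsg_type = USM_MSG_SUCCESS;
+ *			nlh->nlmsg_len = NLMSG_LENGTH(0);
+ *			send(fd, nlh, nlh->nlmsg_len, 0);
+ *		}
+ *	}
+ */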
+
+int suspend_activate_storage(int force)
+{
+	int tries = 1;
+
+	if (usm_helper_data.pid == -1 || usm_ops.disabled)
+		return 0;
+
+	message_received = 0;
+	activations++;
+
+	if (activations > 1 && !force)
+		return 0;
+
+	while ((!message_received || message_received == USM_MSG_FAILED) &&
+		tries < 2) {
+		suspend_prepare_status(DONT_CLEAR_BAR,
+			"Activate storage attempt %d.\n", tries);
+
+		init_completion(&usm_helper_data.wait_for_process);
+
+		suspend_send_netlink_message(&usm_helper_data,
+			USM_MSG_CONNECT,
+			NULL, 0);
+
+		/* Wait 2 seconds for the userspace process to make contact */
+		wait_for_completion_timeout(&usm_helper_data.wait_for_process, 2*HZ);
+
+		tries++;
+	}
+
+	return 0;
+}
+
+int suspend_deactivate_storage(int force)
+{
+	if (usm_helper_data.pid == -1 || usm_ops.disabled)
+		return 0;
+	
+	message_received = 0;
+	activations--;
+
+	if (activations && !force)
+		return 0;
+
+	init_completion(&usm_helper_data.wait_for_process);
+
+	suspend_send_netlink_message(&usm_helper_data,
+			USM_MSG_DISCONNECT,
+			NULL, 0);
+
+	wait_for_completion_timeout(&usm_helper_data.wait_for_process, 2*HZ);
+
+	if (!message_received || message_received == USM_MSG_FAILED) {
+		printk(KERN_ERR "Returning failure disconnecting storage.\n");
+		return 1;
+	}
+
+	return 0;
+}
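+
+/*
+ * Activation is reference counted: nested callers each use
+ * suspend_activate_storage(0) / suspend_deactivate_storage(0), and only
+ * the outermost transition actually messages the helper; force = 1
+ * bypasses the count, as the atomic copy path does. Sketch:
+ *
+ *	suspend_activate_storage(0);	// activations 0 -> 1: sends CONNECT
+ *	suspend_activate_storage(0);	// activations 1 -> 2: no message
+ *	suspend_deactivate_storage(0);	// activations 2 -> 1: no message
+ *	suspend_deactivate_storage(0);	// activations 1 -> 0: sends DISCONNECT
+ */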
+
+#ifdef CONFIG_PM_DEBUG
+static void storage_manager_simulate(void)
+{
+	printk("--- Storage manager simulate ---\n");
+	suspend_prepare_usm();
+	schedule();
+	printk("--- Deactivate storage 1 ---\n");
+	suspend_deactivate_storage(1);
+	schedule();
+	printk("--- Activate storage 1 ---\n");
+	suspend_activate_storage(1);
+	schedule();
+	printk("--- Cleanup usm ---\n");
+	suspend_cleanup_usm();
+	schedule();
+	printk("--- Storage manager simulate ends ---\n");
+}
+#endif
+
+static unsigned long usm_storage_needed(void)
+{
+	return strlen(usm_helper_data.program);
+}
+
+static int usm_save_config_info(char *buf)
+{
+	int len = strlen(usm_helper_data.program);
+	memcpy(buf, usm_helper_data.program, len);
+	return len;
+}
+
+static void usm_load_config_info(char *buf, int size)
+{
+	/* Don't load the saved path if one has already been set */
+	if (usm_helper_data.program[0])
+		return;
+
+	memcpy(usm_helper_data.program, buf, size);
+}
+
+static unsigned long usm_memory_needed(void)
+{
+	/* Ballpark figure of 32 pages */
+	return (32 * PAGE_SIZE);
+}
+
+/* suspend_prepare_usm
+ *
+ * Start the userspace storage manager, if one is configured, and ask it
+ * to activate its storage. Returns non-zero if the helper was started.
+ */
+int suspend_prepare_usm(void)
+{
+	usm_prepare_count++;
+
+	if (usm_prepare_count > 1 || usm_ops.disabled)
+		return 0;
+	
+	usm_helper_data.pid = -1;
+
+	if (!*usm_helper_data.program)
+		return 0;
+
+	suspend_netlink_setup(&usm_helper_data);
+
+	if (usm_helper_data.pid == -1)
+		printk(KERN_WARNING "Suspend2 Storage Manager wanted, "
+			"but couldn't start it.\n");
+
+	suspend_activate_storage(0);
+
+	return (usm_helper_data.pid != -1);
+}
+
+void suspend_cleanup_usm(void)
+{
+	usm_prepare_count--;
+
+	if (usm_helper_data.pid > -1 && !usm_prepare_count) {
+		struct task_struct *t;
+
+		suspend_deactivate_storage(0);
+
+		suspend_send_netlink_message(&usm_helper_data,
+				NETLINK_MSG_CLEANUP, NULL, 0);
+
+		read_lock(&tasklist_lock);
+		if ((t = find_task_by_pid(usm_helper_data.pid)))
+			t->flags &= ~PF_NOFREEZE;
+		read_unlock(&tasklist_lock);
+
+		suspend_netlink_close(&usm_helper_data);
+
+		usm_helper_data.pid = -1;
+	}
+}
+
+static void storage_manager_activate(void)
+{
+	if (storage_manager_action == storage_manager_last_action)
+		return;
+
+	if (storage_manager_action)
+		suspend_prepare_usm();
+	else
+		suspend_cleanup_usm();
+
+	storage_manager_last_action = storage_manager_action;
+}
+
+/*
+ * User interface specific /proc/suspend entries.
+ */
+
+static struct suspend_proc_data proc_params[] = {
+	{ .filename			= "disable_storage_manager",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_INTEGER,
+	  .data = {
+		.integer = {
+			.variable	= &usm_ops.disabled,
+			.minimum	= 0,
+			.maximum	= 1,
+		}
+	  }
+	},
+	{ .filename			= "storage_manager",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_STRING,
+	  .data = {
+		.string = {
+			.variable	= usm_helper_data.program,
+			.max_length	= 254,
+		}
+	  }
+	},
+	{ .filename			= "activate_storage",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_INTEGER,
+	  .data = {
+		.integer = {
+			.variable	= &storage_manager_action,
+			.minimum	= 0,
+			.maximum	= 1,
+		}
+	  },
+	  .write_proc 			= storage_manager_activate,
+	},
+
+#ifdef CONFIG_PM_DEBUG
+	{ .filename			= "simulate_atomic_copy",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_NONE,
+	  .write_proc 			= storage_manager_simulate,
+	}
+#endif
+};
+
+static struct suspend_module_ops usm_ops = {
+	.type				= MISC_PLUGIN,
+	.name				= "Userspace Storage Manager",
+	.module				= THIS_MODULE,
+	.storage_needed			= usm_storage_needed,
+	.save_config_info		= usm_save_config_info,
+	.load_config_info		= usm_load_config_info,
+	.memory_needed			= usm_memory_needed,
+};
+
+/* suspend_usm_proc_init
+ *
+ * Description: Boot time initialisation for the userspace storage manager.
+ */
+static __init int suspend_usm_proc_init(void)
+{
+	int result, i, numfiles = ARRAY_SIZE(proc_params);
+
+	if (!(result = suspend_register_module(&usm_ops)))
+		for (i = 0; i < numfiles; i++)
+			suspend_register_procfile(&proc_params[i]);
+
+	usm_helper_data.nl = NULL;
+	usm_helper_data.program[0] = '\0';
+	usm_helper_data.pid = -1;
+	usm_helper_data.skb_size = 0;
+	usm_helper_data.pool_limit = 6;
+	usm_helper_data.netlink_id = NETLINK_SUSPEND2_USM;
+	usm_helper_data.name = "userspace storage manager";
+	usm_helper_data.rcv_msg = usm_user_rcv_msg;
+	usm_helper_data.interface_version = 1;
+	usm_helper_data.must_init = 0;
+	init_completion(&usm_helper_data.wait_for_process);
+
+	return result;
+}
+
+late_initcall(suspend_usm_proc_init);
diff -urN oldtree/kernel/power/storage.h newtree/kernel/power/storage.h
--- oldtree/kernel/power/storage.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/storage.h	2006-03-08 15:22:33.325510000 +0000
@@ -0,0 +1,21 @@
+/*
+ * kernel/power/storage.h
+ *
+ * Declarations for the userspace storage manager (see storage.c).
+ */
+
+int suspend_prepare_usm(void);
+void suspend_cleanup_usm(void);
+
+int suspend_activate_storage(int force);
+int suspend_deactivate_storage(int force);
+
+enum {
+	USM_MSG_BASE = 0x10,
+
+	/* Kernel -> Userspace */
+	USM_MSG_CONNECT = 0x30,
+	USM_MSG_DISCONNECT = 0x31,
+	USM_MSG_SUCCESS = 0x40,
+	USM_MSG_FAILED = 0x41,
+
+	USM_MSG_MAX,
+};
diff -urN oldtree/kernel/power/suspend.c newtree/kernel/power/suspend.c
--- oldtree/kernel/power/suspend.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/suspend.c	2006-03-08 15:22:33.325510000 +0000
@@ -0,0 +1,1132 @@
+/*
+ * kernel/power/suspend.c
+ */
+/** \mainpage Suspend2.
+ *
+ * Suspend2 provides support for saving and restoring an image of
+ * system memory to an arbitrary storage device, either on the local computer,
+ * or across some network. The support is entirely OS based, so Suspend2 
+ * works without requiring BIOS, APM or ACPI support. The vast majority of the
+ * code is also architecture independent, so it should be very easy to port
+ * the code to new architectures. Suspend includes support for SMP, 4G HighMem
+ * and preemption. Initramfses and initrds are also supported.
+ *
+ * Suspend2 uses a modular design, in which the method of storing the image is
+ * completely abstracted from the core code, as are transformations on the data
+ * such as compression and/or encryption (multiple 'modules' can be used to
+ * provide arbitrary combinations of functionality). The user interface is also
+ * modular, so that arbitrarily simple or complex interfaces can be used to
+ * provide anything from debugging information through to eye candy.
+ * 
+ * \section Copyright
+ *
+ * Suspend2 is released under the GPLv2.
+ *
+ * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu><BR>
+ * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz><BR>
+ * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr><BR>
+ * Copyright (C) 2002-2005 Nigel Cunningham <ncunningham@cyclades.com><BR>
+ *
+ * \section Credits
+ * 
+ * Nigel would like to thank the following people for their work:
+ * 
+ * Pavel Machek <pavel@ucw.cz><BR>
+ * Modifications, pointing out defects, being with Gabor at the very beginning,
+ * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
+ *
+ * Steve Doddi <dirk@loth.demon.co.uk><BR> 
+ * Support the possibility of hardware state restoring.
+ *
+ * Raph <grey.havens@earthling.net><BR>
+ * Support for preserving states of network devices and virtual console
+ * (including X and svgatextmode)
+ *
+ * Kurt Garloff <garloff@suse.de><BR>
+ * Straightened the critical function in order to prevent compilers from
+ * playing tricks with local variables.
+ *
+ * Andreas Mohr <a.mohr@mailto.de>
+ *
+ * Alex Badea <vampire@go.ro><BR>
+ * Fixed runaway init
+ *
+ * Jeff Snyder <je4d@pobox.com><BR>
+ * ACPI patch
+ *
+ * Nathan Friess <natmanz@shaw.ca><BR>
+ * Some patches.
+ *
+ * Michael Frank <mhf@linuxmail.org><BR>
+ * Extensive testing and help with improving stability. Nigel was constantly
+ * amazed by the quality and quantity of Michael's help.
+ *
+ * Bernard Blackham <bernard@blackham.com.au><BR>
+ * Web page & Wiki administration, some coding. Another person without whom
+ * Suspend would not be where it is.
+ *
+ * ..and of course the myriads of Suspend2 users who have helped diagnose
+ * and fix bugs, made suggestions on how to improve the code, proofread
+ * documentation, and donated time and money.
+ *
+ * Thanks also to corporate sponsors:
+ *
+ * <B>Cyclades.com.</B> Nigel's employers from Dec 2004, who allow him to work on
+ * Suspend and PM related issues on company time.
+ * 
+ * <B>LinuxFund.org.</B> Sponsored Nigel's work on Suspend for four months Oct 2003
+ * to Jan 2004.
+ *
+ * <B>LAC Linux.</B> Donated P4 hardware that enabled development and ongoing
+ * maintenance of SMP and Highmem support.
+ *
+ * <B>OSDL.</B> Provided access to various hardware configurations, and made
+ * occasional small donations to the project.
+ */
+
+#define SUSPEND_MAIN_C
+
+#include <linux/suspend.h>
+#include <linux/module.h>
+#include <linux/console.h>
+#include <linux/version.h>
+#include <linux/reboot.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/freezer.h>
+#include <asm/uaccess.h>
+#include <asm/setup.h>
+
+#include "version.h"
+#include "suspend2.h"
+#include "modules.h"
+#include "proc.h"
+#include "pageflags.h"
+#include "prepare_image.h"
+#include "io.h"
+#include "ui.h"
+#include "suspend2_common.h"
+#include "extent.h"
+#include "power_off.h"
+#include "atomic_copy.h"
+#include "debug_pagealloc.h"
+#include "storage.h"
+
+#ifdef  CONFIG_X86
+#include <asm/i387.h> /* for kernel_fpu_end */
+#endif
+
+/* Variables to be preserved over suspend */
+int pageset1_sizelow = 0, pageset2_sizelow = 0, image_size_limit = 0;
+unsigned long suspend_orig_mem_free = 0;
+
+static dyn_pageflags_t pageset1_check_map;
+static dyn_pageflags_t pageset2_check_map;
+static char *debug_info_buffer;
+static char suspend_core_version[] = SUSPEND_CORE_VERSION;
+
+extern void do_suspend2_lowlevel(int resume);
+extern __nosavedata char resume_commandline[COMMAND_LINE_SIZE];
+
+unsigned long suspend_action = 0;
+unsigned long suspend_result = 0;
+unsigned long suspend_debug_state = 0;
+
+/* 
+ * ---  Variables -----
+ * 
+ * The following are used by the arch specific low level routines 
+ * and only needed if suspend2 is compiled in. Other variables,
+ * used by the freezer even if suspend2 is not compiled in, are
+ * found in process.c
+ */
+
+/*! I/O timing statistics: [0][]=write, [1][]=read; [][0]=pages, [][1]=jiffies. */
+int suspend_io_time[2][2];
+
+/* Compression ratio */
+__nosavedata unsigned long bytes_in = 0, bytes_out = 0;
+
+/*! Pageset metadata. */
+struct pagedir pagedir1 = { 0, 0}, pagedir2 = { 0, 0}; 
+
+/* Suspend2 variables used by built-in routines. */
+
+/*! The number of suspends we have started (some may have been cancelled) */
+unsigned int nr_suspends = 0;
+
+/*! The console log level we default to. */
+int suspend_default_console_level = 0;
+
+/* 
+ * For resume2= kernel option. It's pointless to compile
+ * suspend2 without any writers, but compilation shouldn't
+ * fail if you do.
+ */
+
+unsigned long software_suspend_state = ((1 << SUSPEND_DISABLED) | (1 << SUSPEND_BOOT_TIME) |
+		(1 << SUSPEND_RESUME_NOT_DONE) | (1 << SUSPEND_IGNORE_LOGLEVEL));
+
+mm_segment_t	oldfs;
+
+char resume2_file[256] = CONFIG_SUSPEND2_DEFAULT_RESUME2;
+
+static atomic_t actions_running;
+
+extern int block_dump;
+
+int block_dump_save;
+
+/*
+ * Basic clean-up routine.
+ */
+void suspend_finish_anything(int finishing_cycle)
+{
+	if (atomic_dec_and_test(&actions_running)) {
+		suspend_cleanup_modules(finishing_cycle);
+		suspend_put_modules();
+		clear_suspend_state(SUSPEND_RUNNING);
+	}
+
+	set_fs(oldfs);
+
+	if (finishing_cycle)
+		block_dump = block_dump_save;
+}
+
+/*
+ * Basic set-up routine.
+ */
+int suspend_start_anything(int starting_cycle)
+{
+	oldfs = get_fs();
+
+	if (atomic_add_return(1, &actions_running) == 1) {
+		set_fs(KERNEL_DS);
+
+		set_suspend_state(SUSPEND_RUNNING);
+
+		if (suspend_get_modules()) {
+			printk("Get modules failed!\n");
+			clear_suspend_state(SUSPEND_RUNNING);
+			set_fs(oldfs);
+			return -EBUSY;
+		}
+
+		if (suspend_initialise_modules(starting_cycle)) {
+			printk("Initialise modules failed!\n");
+			suspend_finish_anything(starting_cycle);
+			return -EBUSY;
+		}
+
+		if (starting_cycle) {
+			block_dump_save = block_dump;
+			block_dump = 0;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * save_image
+ * Result code (int): Zero on success, non-zero on failure.
+ * Functionality    : High level routine which performs the steps necessary
+ *                    to prepare and save the image after preparatory steps
+ *                    have been taken.
+ * Key Assumptions  : Processes frozen, sufficient memory available, drivers
+ *                    suspended.
+ * Called from      : suspend_main
+ */
+
+static int save_image(void)
+{
+	int temp_result;
+
+	suspend_message(SUSPEND_ANY_SECTION, SUSPEND_LOW, 1,
+		" - Final values: %d and %d.\n",
+		pagedir1.pageset_size, 
+		pagedir2.pageset_size);
+
+	check_shift_keys(1, "About to write pagedir2.");
+
+	temp_result = write_pageset(&pagedir2, 2);
+	
+	if (temp_result == -1 || test_result_state(SUSPEND_ABORTED))
+		return -1;
+
+	check_shift_keys(1, "About to copy pageset 1.");
+
+	suspend_deactivate_storage(1);
+
+	suspend_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy.");
+	
+	do_suspend2_lowlevel(0);
+
+	return 0;
+}
+
+/*
+ * Save pageset1 and the image header: the second part of the image,
+ * written after the atomic copy has been made.
+ */
+int save_image_part1(void)
+{
+	int temp_result, old_ps1_size = pagedir1.pageset_size;
+	dyn_pageflags_t temp;
+	
+	/* Quick switch: We want to compare the old stats with the new ones. */
+	temp = pageset1_map;
+	pageset1_map = pageset1_check_map;
+	pageset1_check_map = temp;
+
+	temp = pageset2_map;
+	pageset2_map = pageset2_check_map;
+	pageset2_check_map = temp;
+
+	BUG_ON(!irqs_disabled());
+
+	suspend_recalculate_stats(1);
+
+	if ((pagedir1.pageset_size - old_ps1_size) > extra_pd1_pages_allowance) {
+		abort_suspend("Pageset1 has grown by %d pages."
+			" Only %d pages of growth are allowed!\n",
+			pagedir1.pageset_size - old_ps1_size,
+			extra_pd1_pages_allowance);
+		return -1;
+	}
+
+	suspend_map_atomic_copy_pages();
+
+	BUG_ON(!irqs_disabled());
+
+	if (!test_action_state(SUSPEND_TEST_FILTER_SPEED) &&
+	    !test_action_state(SUSPEND_TEST_BIO))
+		suspend_copy_pageset1();
+
+	/*
+	 * --- From here on, pageset2 must be reread if we abort! ---
+	 */
+	
+	suspend_unmap_atomic_copy_pages();
+
+#ifdef CONFIG_X86
+	kernel_fpu_end();
+#endif
+
+	device_power_up();
+	
+	local_irq_enable();
+
+	device_resume();
+
+	if (suspend_activate_storage(1))
+		panic("Failed to reactivate our storage.");
+	
+	suspend_update_status(pagedir2.pageset_size,
+			pagedir1.pageset_size + pagedir2.pageset_size,
+			NULL);
+	
+	if (test_result_state(SUSPEND_ABORTED))
+		goto abort_reloading_pagedir_two;
+
+	check_shift_keys(1, "About to write pageset1.");
+
+	/*
+	 * End of critical section.
+	 */
+	
+	suspend_message(SUSPEND_ANY_SECTION, SUSPEND_LOW, 1,
+			"-- Writing pageset1\n");
+
+	temp_result = write_pageset(&pagedir1, 1);
+
+	/* We didn't overwrite any memory, so no reread needs to be done. */
+	if (test_action_state(SUSPEND_TEST_FILTER_SPEED))
+		return -1;
+
+	if (temp_result == -1 || test_result_state(SUSPEND_ABORTED))
+		goto abort_reloading_pagedir_two;
+
+	check_shift_keys(1, "About to write header.");
+
+	if (test_result_state(SUSPEND_ABORTED))
+		goto abort_reloading_pagedir_two;
+
+	temp_result = write_image_header();
+
+	if (test_action_state(SUSPEND_TEST_BIO))
+		return -1;
+
+	if (temp_result || (test_result_state(SUSPEND_ABORTED)))
+		goto abort_reloading_pagedir_two;
+
+	check_shift_keys(1, "About to power down or reboot.");
+
+	return 0;
+
+abort_reloading_pagedir_two:
+	temp_result = read_pageset2(1);
+
+	/* If that failed, we're sunk. Panic! */
+	if (temp_result)
+		panic("Attempt to reload pagedir 2 while aborting "
+				"a suspend failed.");
+
+	return -1;		
+
+}
+
+#define SNPRINTF(a...) 	len += snprintf_used(debug_info_buffer + len, \
+		PAGE_SIZE - len - 1, ## a)
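+/*
+ * A usage sketch (assuming, as elsewhere in this file, that len is in
+ * scope and that snprintf_used returns the number of bytes written):
+ *
+ *	len = 0;
+ *	SNPRINTF("Attempt number: %d\n", nr_suspends);
+ *
+ * Successive calls append to debug_info_buffer without overrunning the
+ * single page backing it.
+ */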
+
+static int io_MB_per_second(int read_write)
+{
+	if (!suspend_io_time[read_write][1])
+		return 0;
+
+	return MB((unsigned long) suspend_io_time[read_write][0]) * HZ /
+		suspend_io_time[read_write][1];
+}
+
+/* get_suspend_debug_info
+ * Functionality:	Store debug info in a buffer.
+ * Called from:		debuginfo_read_proc, suspend_cleanup.
+ */
+
+static int get_suspend_debug_info(void)
+{
+	int len = 0;
+	if (!debug_info_buffer) {
+		debug_info_buffer = (char *) get_zeroed_page(GFP_ATOMIC);
+		if (!debug_info_buffer) {
+			printk("Error! Unable to allocate buffer for "
+					"software suspend debug info.\n");
+			return 0;
+		}
+	}
+
+	SNPRINTF("Suspend2 debugging info:\n");
+	SNPRINTF("- SUSPEND core   : %s\n", SUSPEND_CORE_VERSION);
+	SNPRINTF("- Kernel Version : %s\n", UTS_RELEASE);
+	SNPRINTF("- Compiler vers. : %d.%d\n", __GNUC__, __GNUC_MINOR__);
+	SNPRINTF("- Attempt number : %d\n", nr_suspends);
+	SNPRINTF("- Parameters     : %ld %ld %ld %d %d %ld\n",
+			suspend_result,
+			suspend_action,
+			suspend_debug_state,
+			suspend_default_console_level,
+			image_size_limit,
+			suspend_powerdown_method);
+	SNPRINTF("- Overall expected compression percentage: %d.\n",
+			100 - suspend_expected_compression_ratio());
+	len += suspend_print_module_debug_info(debug_info_buffer + len,
+			PAGE_SIZE - len - 1);
+	if (suspend_io_time[0][1]) {
+		if ((io_MB_per_second(0) < 5) || (io_MB_per_second(1) < 5)) {
+			SNPRINTF("- I/O speed: Write %d KB/s",
+			  (KB((unsigned long) suspend_io_time[0][0]) * HZ /
+			  suspend_io_time[0][1]));
+			if (suspend_io_time[1][1])
+				SNPRINTF(", Read %d KB/s",
+				  (KB((unsigned long) suspend_io_time[1][0]) * HZ /
+				  suspend_io_time[1][1]));
+		} else {
+			SNPRINTF("- I/O speed: Write %d MB/s",
+			 (MB((unsigned long) suspend_io_time[0][0]) * HZ /
+			  suspend_io_time[0][1]));
+			if (suspend_io_time[1][1])
+				SNPRINTF(", Read %d MB/s",
+				 (MB((unsigned long) suspend_io_time[1][0]) * HZ /
+				  suspend_io_time[1][1]));
+		}
+		SNPRINTF(".\n");
+	} else
+		SNPRINTF("- No I/O speed stats available.\n");
+
+	return len;
+}
+
+/*
+ * debuginfo_read_proc
+ * Functionality   : Displays information that may be helpful in debugging
+ *                   software suspend.
+ */
+int debuginfo_read_proc(char *page, char **start, off_t off, int count,
+		int *eof, void *data)
+{
+	int info_len, copy_len;
+
+	info_len = get_suspend_debug_info();
+
+	copy_len = min(info_len - (int) off, count);
+	if (copy_len < 0)
+		copy_len = 0;
+
+	if (copy_len) {
+		memcpy(page, debug_info_buffer + off, copy_len);
+		*start = page;
+	} 
+
+	if (copy_len + off == info_len)
+		*eof = 1;
+
+	free_page((unsigned long) debug_info_buffer);
+	debug_info_buffer = NULL;
+	return copy_len;
+}
+
+static int allocate_bitmaps(void)
+{
+	suspend_message(SUSPEND_MEMORY, SUSPEND_VERBOSE, 1,
+			"Allocating in_use_map\n");
+	if (allocate_dyn_pageflags(&in_use_map) ||
+	    allocate_dyn_pageflags(&pageset1_map) ||
+	    allocate_dyn_pageflags(&pageset1_copy_map) ||
+	    allocate_dyn_pageflags(&allocd_pages_map) ||
+	    allocate_dyn_pageflags(&pageset2_map) ||
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	    allocate_dyn_pageflags(&unmap_map) ||
+#endif	
+	    allocate_dyn_pageflags(&pageset1_check_map) ||
+	    allocate_dyn_pageflags(&pageset2_check_map))
+		return 1;
+
+	return 0;
+}
+
+static void free_metadata(void)
+{
+	free_dyn_pageflags(&pageset1_map);
+	free_dyn_pageflags(&pageset1_copy_map);
+	free_dyn_pageflags(&allocd_pages_map);
+	free_dyn_pageflags(&pageset2_map);
+	free_dyn_pageflags(&in_use_map);
+	free_dyn_pageflags(&pageset1_check_map);
+	free_dyn_pageflags(&pageset2_check_map);
+}
+
+static int check_still_keeping_image(void)
+{
+	if (test_action_state(SUSPEND_KEEP_IMAGE)) {
+		printk("Image already stored: powering down immediately.\n");
+		suspend_power_down();
+		return 1;	/* Just in case we're using S3 */
+	}
+
+	printk("Invalidating previous image.\n");
+	suspend_active_writer->ops.writer.invalidate_image();
+
+	return 0;
+}
+
+static int suspend_init(void)
+{
+	suspend_result = 0;
+
+	printk(name_suspend "Initiating a software suspend cycle.\n");
+
+	nr_suspends++;
+	clear_suspend_state(SUSPEND_NOW_RESUMING);
+	
+	suspend_io_time[0][0] = suspend_io_time[0][1] = 
+		suspend_io_time[1][0] =
+		suspend_io_time[1][1] = 0;
+
+	suspend_prepare_console();
+
+	free_metadata();	/* We might have kept it */
+
+	/* attempt_to_parse_resume_device(); */
+	
+	if (test_suspend_state(SUSPEND_DISABLED))
+		return 0;
+	
+	if (allocate_bitmaps())
+		return 0;
+	
+	disable_nonboot_cpus();
+
+	return 1;
+}
+
+void suspend_cleanup(void)
+{
+	int i;
+
+	i = get_suspend_debug_info();
+
+	suspend_free_extra_pagedir_memory();
+	
+	pagedir1.pageset_size = pagedir2.pageset_size = 0;
+
+	thaw_processes(FREEZER_KERNEL_THREADS);
+
+#ifdef CONFIG_SUSPEND2_KEEP_IMAGE
+	if (test_action_state(SUSPEND_KEEP_IMAGE) &&
+	    !test_result_state(SUSPEND_ABORTED)) {
+		suspend_message(SUSPEND_ANY_SECTION, SUSPEND_LOW, 1,
+			name_suspend "Not invalidating the image due "
+			"to Keep Image being enabled.\n");
+		set_result_state(SUSPEND_KEPT_IMAGE);
+	} else
+#endif
+		if (suspend_active_writer)
+			suspend_active_writer->ops.writer.invalidate_image();
+
+	free_metadata();
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	free_dyn_pageflags(&unmap_map);
+#endif
+
+	if (debug_info_buffer) {
+		/* Printk can only handle 1023 bytes, including
+		 * its level mangling. */
+		for (i = 0; i < 3; i++)
+			printk("%s", debug_info_buffer + (1023 * i));
+		free_page((unsigned long) debug_info_buffer);
+		debug_info_buffer = NULL;
+	}
+
+	thaw_processes(FREEZER_ALL_THREADS);
+
+	suspend_cleanup_console();
+
+	enable_nonboot_cpus();
+}
+
+static int can_suspend(void)
+{
+	if (test_suspend_state(SUSPEND_DISABLED))
+		attempt_to_parse_resume_device();
+
+	if (test_suspend_state(SUSPEND_DISABLED)) {
+		printk(name_suspend "Software suspend is disabled.\n"
+			"This may be because you haven't put something along the "
+			"lines of\n\nresume2=swap:/dev/hda1\n\n"
+			"in lilo.conf or equivalent (where /dev/hda1 is your "
+			"swap partition).\n");
+		set_result_state(SUSPEND_ABORTED);
+		return 0;
+	}
+	
+	return 1;
+}
+
+/*
+ * suspend_main
+ * Functionality   : First level of code for software suspend invocations.
+ *                   Stores and restores load averages (to avoid a spike),
+ *                   allocates bitmaps, freezes processes and eats memory
+ *                   as required before suspending drivers and invoking
+ *                   the 'low level' code to save the state to disk.
+ *                   By the time we return from do_suspend2_lowlevel, we
+ *                   have either failed to save the image or successfully
+ *                   suspended and reloaded the image. The difference can
+ *                   be discerned by checking SUSPEND_ABORTED.
+ * Called From     : suspend2_try_suspend
+ */
+
+void suspend_main(void)
+{
+	if (suspend_activate_storage(0))
+		return;
+
+	if (!can_suspend())
+		goto cleanup;
+
+	/*
+	 * If we kept the image, are still keeping it and are suspending to
+	 * RAM, we will return here after suspending and resuming (provided
+	 * the power doesn't run out).
+	 */
+	if (test_result_state(SUSPEND_KEPT_IMAGE) && check_still_keeping_image()) 
+		goto cleanup;
+
+	if (suspend_init() && !suspend_prepare_image() &&
+	    !test_result_state(SUSPEND_ABORTED) &&
+	    !test_action_state(SUSPEND_FREEZER_TEST)) {
+		suspend_prepare_status(DONT_CLEAR_BAR,
+				"Starting to save the image...");
+		save_image();
+	}
+	
+	suspend_cleanup();
+cleanup:
+	suspend_deactivate_storage(0);
+}
+
+/* image_exists_read
+ * 
+ * Return 0 or 1, depending on whether an image is found.
+ */
+
+char *get_have_image_data(void);
+
+static int image_exists_read(char *page, char **start, off_t off, int count,
+		int *eof, void *data)
+{
+	int len = 0;
+	char *result;
+	
+	if (suspend_activate_storage(0))
+		return count;
+
+	if (!test_suspend_state(SUSPEND_RESUME_DEVICE_OK))
+		attempt_to_parse_resume_device();
+
+	if (!suspend_active_writer) {
+		len = sprintf(page, "-1\n");
+	} else {
+		result = get_have_image_data();
+		printk("get_have_image_data returned %p.\n", result);
+		if (result) {
+			len = sprintf(page, "%s",  result);
+			free_page((unsigned long) result);
+		}
+	}
+
+	*eof = 1;
+
+	suspend_deactivate_storage(0);
+
+	return len;
+}
+
+/* image_exists_write
+ *
+ * Invalidate any stored image when the user writes to this entry.
+ */
+static int image_exists_write(struct file *file, const char *buffer,
+		unsigned long count, void *data)
+{
+	if (suspend_activate_storage(0))
+		return count;
+
+	if (suspend_active_writer && suspend_active_writer->ops.writer.image_exists())
+		suspend_active_writer->ops.writer.invalidate_image();
+
+	suspend_deactivate_storage(0);
+
+	return count;
+}
+
+/*
+ * Core proc entries that aren't built in.
+ *
+ * This array contains entries that are automatically registered at
+ * boot. Plugins and the console code register their own entries separately.
+ */
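+/*
+ * For example, reading the "image_exists" entry below yields -1 when no
+ * writer is configured, or the active writer's report on whether an image
+ * is present; writing anything to it invalidates any stored image.
+ */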
+static struct suspend_proc_data proc_params[] = {
+	{ .filename			= "debug_info",
+	  .permissions			= PROC_READONLY,
+	  .type				= SUSPEND_PROC_DATA_CUSTOM,
+	  .data = {
+		  .special = {
+			  .read_proc	= debuginfo_read_proc,
+		  }
+	  }
+	},
+	
+	{ .filename			= "extra_pages_allowance",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_INTEGER,
+	  .data = {
+		  .integer = {
+			  .variable	= &extra_pd1_pages_allowance,
+			  .minimum	= 0,
+			  .maximum	= 32767,
+		  }
+	  }
+	},
+	
+	{ .filename			= "ignore_rootfs",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_BIT,
+	  .data = {
+		  .bit = {
+			  .bit_vector	= &suspend_action,
+			  .bit		= SUSPEND_IGNORE_ROOTFS,
+		  }
+	  }
+	},
+	
+	{ .filename			= "image_exists",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_CUSTOM,
+	  .needs_storage_manager	= 3,
+	  .data = {
+		  .special = {
+			  .read_proc	= image_exists_read,
+			  .write_proc	= image_exists_write,
+		  }
+	  }
+	},
+
+	{ .filename			= "image_size_limit",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_INTEGER,
+	  .data = {
+		  .integer = {
+			  .variable	= &image_size_limit,
+			  .minimum	= -2,
+			  .maximum	= 32767,
+		  }
+	  }
+	},
+
+	{ .filename			= "last_result",
+	  .permissions			= PROC_READONLY,
+	  .type				= SUSPEND_PROC_DATA_UL,
+	  .data = {
+		  .ul = {
+			  .variable	=  &suspend_result,
+		  }
+	  }
+	},
+	
+	{ .filename			= "reboot",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_BIT,
+	  .data = {
+		  .bit = {
+			  .bit_vector	= &suspend_action,
+			  .bit		= SUSPEND_REBOOT,
+		  }
+	  }
+	},
+	  
+	{ .filename			= "resume2",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_STRING,
+	  .needs_storage_manager	= 2,
+	  .data = {
+		  .string = {
+			  .variable	= resume2_file,
+			  .max_length	= 255,
+		  }
+	  },
+	  .write_proc			= attempt_to_parse_resume_device2,
+	},
+
+	{ .filename			= "resume_commandline",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_STRING,
+	  .data = {
+		  .string = {
+			  .variable	= resume_commandline,
+			  .max_length	= COMMAND_LINE_SIZE,
+		  }
+	  },
+	},
+
+	{ .filename			= "version",
+	  .permissions			= PROC_READONLY,
+	  .type				= SUSPEND_PROC_DATA_STRING,
+	  .data = {
+		  .string = {
+			  .variable	= suspend_core_version,
+		  }
+	  }
+	},
+
+#ifdef CONFIG_PM_DEBUG
+	{ .filename			= "freezer_test",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_BIT,
+	  .data = {
+		  .bit = {
+			  .bit_vector	= &suspend_action,
+			  .bit		= SUSPEND_FREEZER_TEST,
+		  }
+	  }
+	},
+
+	{ .filename			= "test_bio",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_BIT,
+	  .data = {
+		  .bit = {
+			  .bit_vector	= &suspend_action,
+			  .bit		= SUSPEND_TEST_BIO,
+		  }
+	  }
+	},
+
+	{ .filename			= "test_filter_speed",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_BIT,
+	  .data = {
+		  .bit = {
+			  .bit_vector	= &suspend_action,
+			  .bit		= SUSPEND_TEST_FILTER_SPEED,
+		  }
+	  }
+	},
+
+	{ .filename			= "slow",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_BIT,
+	  .data = {
+		  .bit = {
+			  .bit_vector	= &suspend_action,
+			  .bit		= SUSPEND_SLOW,
+		  }
+	  }
+	},
+	
+	{ .filename			= "no_pageset2",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_BIT,
+	  .data = {
+		  .bit = {
+			  .bit_vector	= &suspend_action,
+			  .bit		= SUSPEND_NO_PAGESET2,
+		  }
+	  }
+	},
+	
+#endif
+	  
+#if defined(CONFIG_ACPI)
+	{ .filename			= "powerdown_method",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_UL,
+	  .data = {
+		  .ul = {
+			  .variable	= &suspend_powerdown_method,
+			  .minimum	= 0,
+			  .maximum	= 5,
+		  }
+	  }
+	},
+#endif
+
+#ifdef CONFIG_SUSPEND2_KEEP_IMAGE
+	{ .filename			= "keep_image",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_BIT,
+	  .data = {
+		  .bit = {
+			  .bit_vector	= &suspend_action,
+			  .bit		= SUSPEND_KEEP_IMAGE,
+		  }
+	  }
+	},
+#endif
+};
+
+
+/*
+ * Called from init kernel_thread.
+ * We check if we have an image and if so we try to resume.
+ * We also start ksuspendd if configuration looks right.
+ */
+
+int suspend_resume(void)
+{
+	int read_image_result = 0;
+
+	if (sizeof(swp_entry_t) != sizeof(long)) {
+		printk(KERN_WARNING name_suspend
+			"The size of swp_entry_t != size of long. "
+			"Please report this!\n");
+		return 1;
+	}
+	
+	if (!resume2_file[0])
+		printk(KERN_WARNING name_suspend
+			"You need to use a resume2= command line parameter to "
+			"tell Suspend2 where to look for an image.\n");
+
+	suspend_activate_storage(0);
+
+	if (!(test_suspend_state(SUSPEND_RESUME_DEVICE_OK)) &&
+		!attempt_to_parse_resume_device()) {
+		/* 
+		 * Without a usable storage device we can do nothing - 
+		 * even if noresume is given
+		 */
+
+		if (!suspend_num_writers)
+			printk(KERN_ALERT name_suspend
+				"No writers have been registered.\n");
+		else
+			printk(KERN_ALERT name_suspend
+				"Missing or invalid storage location "
+				"(resume2= parameter). Please correct and "
+				"rerun lilo (or equivalent) before "
+				"suspending.\n");
+		suspend_deactivate_storage(0);
+		return 1;
+	}
+
+	suspend_orig_mem_free = real_nr_free_pages();
+
+	read_image_result = read_pageset1(); /* non-fatal error ignored */
+
+	if (test_suspend_state(SUSPEND_NORESUME_SPECIFIED))
+		printk(KERN_WARNING name_suspend "Resuming disabled as requested.\n");
+
+	suspend_deactivate_storage(0);
+	
+	if (read_image_result)
+		return 1;
+
+	suspend_atomic_restore();
+
+	BUG();
+
+	return 0;
+}
+
+static __init int core_load(void)
+{
+	int i, numfiles = sizeof(proc_params) / sizeof(struct suspend_proc_data);
+
+	printk("Suspend2 Core.\n");
+	
+	suspend_initialise_module_lists();
+	
+	for (i = 0; i < numfiles; i++)
+		suspend_register_procfile(&proc_params[i]);
+
+	return 0;
+}
+
+/* -- Functions for kickstarting a suspend or resume --- */
+
+/*
+ * Check if we have an image and if so try to resume.
+ */
+
+void __suspend_try_resume(void)
+{
+	set_suspend_state(SUSPEND_TRYING_TO_RESUME);
+	
+	clear_suspend_state(SUSPEND_RESUME_NOT_DONE);
+
+	suspend_resume();
+
+	clear_suspend_state(SUSPEND_IGNORE_LOGLEVEL);
+	clear_suspend_state(SUSPEND_TRYING_TO_RESUME);
+}
+
+/* Wrapper for when called from init/do_mounts.c */
+void suspend2_try_resume(void)
+{
+	if (suspend_start_anything(0))
+		return;
+
+	__suspend_try_resume();
+
+	/* 
+	 * For initramfs, we have to clear the boot time
+	 * flag after trying to resume
+	 */
+	clear_suspend_state(SUSPEND_BOOT_TIME);
+
+	suspend_finish_anything(0);
+}
+
+/*
+ * suspend2_try_suspend
+ * Functionality   : Wrapper around suspend_main.
+ * Called From     : drivers/acpi/sleep/main.c
+ *                   kernel/reboot.c
+ */
+
+void suspend2_try_suspend(void)
+{
+	if (suspend_start_anything(0))
+		return;
+
+	suspend_main();
+
+	suspend_finish_anything(0);
+}
+
+/* --- Commandline Parameter Handling ---
+ *
+ * Resume setup: obtain the storage device.
+ */
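+/* Example, matching the hint printed by can_suspend():
+ *
+ *	resume2=swap:/dev/hda1
+ */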
+
+static int __init resume2_setup(char *str)
+{
+	if (!*str)
+		return 0;
+	
+	strncpy(resume2_file, str, 255);
+	return 0;
+}
+
+/*
+ * Allow the user to set the action parameter from lilo, prior to resuming.
+ */
+static int __init suspend_act_setup(char *str)
+{
+	if (str)
+		suspend_action = simple_strtol(str, NULL, 0);
+	set_suspend_state(SUSPEND_ACT_USED);
+	return 0;
+}
+
+/*
+ * Allow the user to specify that we should ignore any image found and
+ * invalidate the image if necessary. This is equivalent to running
+ * the task queue and a sync and then turning off the power. The same
+ * precautions should be taken: fsck if you're not journalled.
+ */
+static int __init noresume2_setup(char *str)
+{
+	set_suspend_state(SUSPEND_NORESUME_SPECIFIED);
+	return 0;
+}
+
+static int __init suspend_retry_resume_setup(char *str)
+{
+	set_suspend_state(SUSPEND_RETRY_RESUME);
+	return 0;
+}
+
+#ifdef CONFIG_PM_DEBUG
+
+/*
+ * Allow the user to set the debug parameter from lilo, prior to resuming.
+ */
+static int __init suspend_dbg_setup(char *str)
+{
+	if (str)
+		suspend_debug_state = simple_strtol(str, NULL, 0);
+	set_suspend_state(SUSPEND_DBG_USED);
+	return 0;
+}
+
+/*
+ * Allow the user to set the debug level parameter from lilo, prior to
+ * resuming.
+ */
+static int __init suspend_lvl_setup(char *str)
+{
+	if (str)
+		console_loglevel =
+		suspend_default_console_level =
+			simple_strtol(str, NULL, 0);
+	set_suspend_state(SUSPEND_LVL_USED);
+	clear_suspend_state(SUSPEND_IGNORE_LOGLEVEL);
+	return 0;
+}
+
+__setup("suspend_dbg=", suspend_dbg_setup);
+__setup("suspend_lvl=", suspend_lvl_setup);
+#endif
+
+__setup("noresume2", noresume2_setup);
+__setup("resume2=", resume2_setup);
+__setup("suspend_act=", suspend_act_setup);
+__setup("suspend_retry_resume", suspend_retry_resume_setup);
+
+late_initcall(core_load);
+EXPORT_SYMBOL(software_suspend_state);
diff -urN oldtree/kernel/power/suspend.h newtree/kernel/power/suspend.h
--- oldtree/kernel/power/suspend.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/suspend.h	2006-03-08 15:22:33.329510250 +0000
@@ -0,0 +1,28 @@
+/*
+ * kernel/power/suspend.h
+ *
+ * Copyright (C) 2004-2005 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * This file is released under the GPLv2.
+ *
+ * It contains declarations used throughout swsusp.
+ *
+ */
+
+#ifndef KERNEL_POWER_SUSPEND_H
+#define KERNEL_POWER_SUSPEND_H
+
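+/* Pages needed to hold x struct pbe entries (rounds up; one page over
+ * when x * sizeof(struct pbe) is an exact multiple of PAGE_SIZE). */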
+#define SUSPEND_PD_PAGES(x)     (((x)*sizeof(struct pbe))/PAGE_SIZE+1)
+   
+/* mm/page_alloc.c */
+extern void drain_local_pages(void);
+
+void save_processor_state(void);
+void restore_processor_state(void);
+struct saved_context;
+void __save_processor_state(struct saved_context *ctxt);
+void __restore_processor_state(struct saved_context *ctxt);
+
+extern suspend_pagedir_t *pagedir_nosave __nosavedata;
+
+#endif
diff -urN oldtree/kernel/power/suspend2.h newtree/kernel/power/suspend2.h
--- oldtree/kernel/power/suspend2.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/suspend2.h	2006-03-08 15:22:33.329510250 +0000
@@ -0,0 +1,31 @@
+/*
+ * kernel/power/suspend2.h
+ *
+ * Copyright (C) 2004-2005 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * This file is released under the GPLv2.
+ *
+ * It contains declarations used throughout swsusp and suspend2.
+ *
+ */
+#ifndef KERNEL_POWER_SUSPEND_CORE_H
+#define KERNEL_POWER_SUSPEND_CORE_H
+
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+
+extern unsigned long suspend_orig_mem_free;
+
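+/* Convert a page count to kilobytes / megabytes. */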
+#define KB(x) ((x) << (PAGE_SHIFT - 10))
+#define MB(x) ((x) >> (20 - PAGE_SHIFT))
+
+extern int suspend_start_anything(int starting_cycle);
+extern void suspend_finish_anything(int finishing_cycle);
+
+#if 1
+#define PRINTK(a...) do { } while(0)
+#else
+#define PRINTK(fmt, arg...) printk(KERN_DEBUG fmt, ##arg)
+#endif
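+/* Flip the '#if 1' above to '#if 0' to get the KERN_DEBUG output. */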
+
+#endif
diff -urN oldtree/kernel/power/suspend2_common.h newtree/kernel/power/suspend2_common.h
--- oldtree/kernel/power/suspend2_common.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/suspend2_common.h	2006-03-08 15:22:33.329510250 +0000
@@ -0,0 +1,25 @@
+#ifdef CONFIG_PM_DEBUG
+#define set_debug_state(bit) (test_and_set_bit(bit, &suspend_debug_state))
+#define clear_debug_state(bit) (test_and_clear_bit(bit, &suspend_debug_state))
+#else
+#define set_debug_state(bit) (0)
+#define clear_debug_state(bit) (0)
+#endif
+
+#define set_result_state(bit) (test_and_set_bit(bit, &suspend_result))
+#define clear_result_state(bit) (test_and_clear_bit(bit, &suspend_result))
+
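+/* Reasons an attempt may abort; recorded as bits in suspend_result. */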
+enum {
+	SUSPEND_ABORT_REQUESTED = 1,
+	SUSPEND_NOSTORAGE_AVAILABLE,
+	SUSPEND_INSUFFICIENT_STORAGE,
+	SUSPEND_FREEZING_FAILED,
+	SUSPEND_UNEXPECTED_ALLOC,
+	SUSPEND_KEPT_IMAGE,
+	SUSPEND_WOULD_EAT_MEMORY,
+	SUSPEND_UNABLE_TO_FREE_ENOUGH_MEMORY,
+	SUSPEND_ENCRYPTION_SETUP_FAILED
+};
+
+extern int suspend_default_console_level;
+extern unsigned int nr_suspends;
diff -urN oldtree/kernel/power/suspend_block_io.c newtree/kernel/power/suspend_block_io.c
--- oldtree/kernel/power/suspend_block_io.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/suspend_block_io.c	2006-03-08 15:22:33.333510500 +0000
@@ -0,0 +1,1086 @@
+/*
+ * block_io.c
+ *
+ * Copyright 2004-2005 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * Distributed under GPLv2.
+ * 
+ * This file contains block io functions for suspend2. These are
+ * used by the swapwriter and it is planned that they will also
+ * be used by the NFSwriter.
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/module.h>
+#include <linux/highmem.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/kthread.h>
+#include <linux/buffer_head.h>
+#include <linux/syscalls.h>
+#include <asm/types.h>
+
+#include "suspend2.h"
+#include "proc.h"
+#include "modules.h"
+#include "prepare_image.h"
+#include "block_io.h"
+#include "extent.h"
+#include "suspend2_common.h"
+#include "ui.h"
+
+/* Bits in struct io_info->flags */
+enum {
+	IO_WRITING,
+	IO_RESTORE_PAGE_PROT,
+	IO_AWAITING_READ,
+	IO_AWAITING_WRITE,
+	IO_AWAITING_SUBMIT,
+	IO_AWAITING_CLEANUP,
+	IO_HANDLE_PAGE_PROT
+};
+
+#define MAX_OUTSTANDING_IO 2048
+
+/*
+ *
+ *     IO in progress information storage and helpers
+ *
+ */
+
+struct io_info {
+	struct bio *sys_struct;
+	sector_t block[MAX_BUF_PER_PAGE];
+	struct page *buffer_page;
+	struct page *data_page;
+	unsigned long flags;
+	struct block_device *dev;
+	struct list_head list;
+	int readahead_index;
+	struct work_struct work;
+	int printme;
+};
+
+/* Locks separated to allow better SMP support.
+ * An io_struct moves through the lists as follows.
+ * free -> submit_batch -> busy -> ready_for_cleanup -> free
+ */
+static LIST_HEAD(ioinfo_free);
+static DEFINE_SPINLOCK(ioinfo_free_lock);
+
+static LIST_HEAD(ioinfo_ready_for_cleanup);
+static DEFINE_SPINLOCK(ioinfo_ready_lock);
+
+static LIST_HEAD(ioinfo_submit_batch);
+static DEFINE_SPINLOCK(ioinfo_submit_lock);
+
+static LIST_HEAD(ioinfo_busy);
+static DEFINE_SPINLOCK(ioinfo_busy_lock);
+
+static atomic_t submit_batch;
+static int submit_batch_size = 64;
+static int submit_batched(void);
+
+struct task_struct *suspend_bio_task;
+
+/* [Max] number of I/O operations pending */
+static atomic_t outstanding_io;
+static int max_outstanding_io = 0;
+static atomic_t buffer_allocs, buffer_frees;
+
+/* [Max] number of pages used for above struct */
+static int infopages = 0;
+static int maxinfopages = 0;
+
+static volatile unsigned long suspend_readahead_flags[(MAX_OUTSTANDING_IO + BITS_PER_LONG - 1) / BITS_PER_LONG];
+static DEFINE_SPINLOCK(suspend_readahead_flags_lock);
+static struct page *suspend_readahead_pages[MAX_OUTSTANDING_IO];
+static int readahead_index, readahead_submit_index;
+
+static int current_stream;
+struct extent_iterate_saved_state suspend_writer_posn_save[3];
+
+/* Pointer to current entry being loaded/saved. */
+struct extent_iterate_state suspend_writer_posn;
+
+/* Not static, so that the allocators can set up and complete
+ * writing the header */
+char *suspend_writer_buffer;
+int suspend_writer_buffer_posn;
+
+int suspend_read_fd;
+
+static unsigned long nr_schedule_calls[8];
+
+static char *sch_caller[] = {
+	"get_io_info_struct #1    ",
+	"get_io_info_struct #2    ",
+	"suspend_finish_all_io    ",
+	"wait_on_one_page         ",
+	"submit                   ",
+	"start_one                ",
+	"suspend_wait_on_readahead",
+	"(unused)                 ",
+};
+
+static struct suspend_bdev_info *suspend_devinfo;
+int need_extra_next;
+
+/*
+ * suspend_reset_io_stats
+ *
+ * Description:	Reset all our sanity-checking statistics.
+ */
+static void suspend_reset_io_stats(void)
+{
+	int i;
+	
+	max_outstanding_io = 0;
+	maxinfopages = 0;
+	
+	for (i = 0; i < 8; i++)
+		nr_schedule_calls[i] = 0;
+}
+
+/*
+ * suspend_check_io_stats
+ *
+ * Description:	Check that our statistics look right and print
+ * 		any debugging info wanted.
+ */
+static void suspend_check_io_stats(void)
+{
+	int i;
+
+	BUG_ON(atomic_read(&outstanding_io));
+	BUG_ON(infopages);
+	BUG_ON(!list_empty(&ioinfo_submit_batch));
+	BUG_ON(!list_empty(&ioinfo_busy));
+	BUG_ON(!list_empty(&ioinfo_ready_for_cleanup));
+	BUG_ON(!list_empty(&ioinfo_free));
+	BUG_ON(atomic_read(&buffer_allocs) != atomic_read(&buffer_frees));
+
+	suspend_message(SUSPEND_WRITER, SUSPEND_LOW, 0,
+			"Maximum outstanding_io was %d.\n",
+			max_outstanding_io);
+	suspend_message(SUSPEND_WRITER, SUSPEND_LOW, 0,
+			"Max info pages was %d.\n",
+			maxinfopages);
+	if (atomic_read(&buffer_allocs) != atomic_read(&buffer_frees))
+		suspend_message(SUSPEND_WRITER, SUSPEND_MEDIUM, 0,
+			"Buffer allocs (%d) != buffer frees (%d)",
+				atomic_read(&buffer_allocs),
+				atomic_read(&buffer_frees));
+	for (i = 0; i < 8; i++)
+		suspend_message(SUSPEND_WRITER, SUSPEND_MEDIUM, 0,
+			"Nr schedule calls %s: %lu.\n", sch_caller[i], nr_schedule_calls[i]);
+}
+
+/*
+ * __suspend_bio_cleanup_one
+ *
+ * Description: Clean up after completing I/O on a page.
+ * Arguments:	struct io_info:	Data for the I/O that has completed.
+ */
+static void __suspend_bio_cleanup_one(struct io_info *io_info)
+{
+	struct page *buffer_page;
+	struct page *data_page;
+	char *buffer_address, *data_address;
+	int reading;
+
+	buffer_page = io_info->buffer_page;
+	data_page = io_info->data_page;
+
+	reading = test_bit(IO_AWAITING_READ, &io_info->flags);
+	suspend_message(SUSPEND_WRITER, SUSPEND_HIGH, 0,
+		"Cleanup IO: [%p]\n", 
+		io_info);
+
+	if (reading && io_info->readahead_index == -1) {
+		/*
+		 * Copy the page we read into the buffer our caller provided.
+		 */
+		data_address = (char *) kmap(data_page);
+		buffer_address = (char *) kmap(buffer_page);
+		memcpy(data_address, buffer_address, PAGE_SIZE);
+		kunmap(data_page);
+		kunmap(buffer_page);
+	}
+
+	if (!reading || io_info->readahead_index == -1) {
+		/* Sanity check */
+		if (page_count(buffer_page) != 2)
+			printk(KERN_EMERG "Cleanup IO: Page count on page %p is %d. Not good!\n",
+					buffer_page, page_count(buffer_page));
+		put_page(buffer_page);
+		__free_page(buffer_page);
+		atomic_inc(&buffer_frees);
+	} else
+		put_page(buffer_page);
+	
+	bio_put(io_info->sys_struct);
+	io_info->sys_struct = NULL;
+	io_info->flags = 0;
+}
+
+/* suspend_bio_cleanup_one
+ *
+ * Clean up a single completed I/O: record any finished readahead and
+ * return the io_info struct to the free list.
+ */
+
+static int suspend_bio_cleanup_one(void *data)
+{
+	struct io_info *io_info = (struct io_info *) data;
+	int readahead_index;
+	unsigned long flags;
+
+	/*
+	 * If this I/O was a readahead, remember its index.
+	 */
+	readahead_index = io_info->readahead_index;
+
+	/*
+	 * Add it to the free list.
+	 */
+	list_del_init(&io_info->list);
+	
+	/*
+	 * Do the cleanup.
+	 */
+	__suspend_bio_cleanup_one(io_info);
+
+	/*
+	 * Record the readahead as done.
+	 */
+	if (readahead_index > -1) {
+		int index = readahead_index / BITS_PER_LONG;
+		int bit = readahead_index - (index * BITS_PER_LONG);
+		spin_lock_irqsave(&suspend_readahead_flags_lock, flags);
+		set_bit(bit, &suspend_readahead_flags[index]);
+		spin_unlock_irqrestore(&suspend_readahead_flags_lock, flags);
+	}
+
+	spin_lock_irqsave(&ioinfo_free_lock, flags);
+	list_add_tail(&io_info->list, &ioinfo_free);
+	spin_unlock_irqrestore(&ioinfo_free_lock, flags);
+	
+	/* Important: Must be last thing we do to avoid a race with
+	 * finish_all_io when using keventd to do the cleanup */
+	atomic_dec(&outstanding_io);
+
+	return 0;
+}
+
+/* suspend_cleanup_some_completed_io
+ *
+ * NB: This is designed so that multiple callers can be in here simultaneously.
+ */
+
+static void suspend_cleanup_some_completed_io(void)
+{
+	int num_cleaned = 0;
+	struct io_info *first;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ioinfo_ready_lock, flags);
+	while (!list_empty(&ioinfo_ready_for_cleanup)) {
+		int result;
+		first = list_entry(ioinfo_ready_for_cleanup.next, struct io_info, list);
+
+		BUG_ON(!test_and_clear_bit(IO_AWAITING_CLEANUP, &first->flags));
+
+		list_del_init(&first->list);
+
+		spin_unlock_irqrestore(&ioinfo_ready_lock, flags);
+
+		result = suspend_bio_cleanup_one((void *) first);
+
+		spin_lock_irqsave(&ioinfo_ready_lock, flags);
+		if (result)
+			continue;
+		num_cleaned++;
+		if (num_cleaned == submit_batch_size)
+			break;
+	}
+	spin_unlock_irqrestore(&ioinfo_ready_lock, flags);
+}
+
+/* do_bio_wait
+ *
+ * Actions taken when we want some I/O to get run.
+ *
+ * Submit any I/O that's batched up (if we're not already doing that),
+ * unplug queues, schedule and clean up whatever we can.
+ */
+static void do_bio_wait(int caller)
+{
+	nr_schedule_calls[caller]++;
+
+	/* Don't want to wait on I/O we haven't submitted! */
+	submit_batched();
+
+	kblockd_flush();
+	
+	io_schedule();
+
+	suspend_cleanup_some_completed_io();
+}
+
+/*
+ * suspend_finish_all_io
+ *
+ * Description:	Finishes all IO and frees all IO info struct pages.
+ */
+static void suspend_finish_all_io(void)
+{
+	struct io_info *this, *next = NULL;
+	unsigned long flags;
+
+	/* Wait for all I/O to complete. */
+	while (atomic_read(&outstanding_io))
+		do_bio_wait(2);
+
+	spin_lock_irqsave(&ioinfo_free_lock, flags);
+	
+	/* 
+	 * Two stages, to avoid using freed pages.
+	 *
+	 * First free all io_info structs on a page except the first.
+	 */
+	list_for_each_entry_safe(this, next, &ioinfo_free, list) {
+		if (((unsigned long) this) & ~PAGE_MASK)
+			list_del(&this->list);
+	}
+
+	/* 
+	 * Now we have only one reference to each page, and can safely
+	 * free pages, knowing we're not going to be trying to access the
+	 * same page after freeing it.
+	 */
+	list_for_each_entry_safe(this, next, &ioinfo_free, list) {
+		list_del(&this->list);
+		free_page((unsigned long) this);
+		infopages--;
+		suspend_message(SUSPEND_MEMORY, SUSPEND_VERBOSE, 0,
+				"[FreedIOPage %lx]", (unsigned long) this);
+	}
+	
+	spin_unlock_irqrestore(&ioinfo_free_lock, flags);
+}
+
+/*
+ * wait_on_one_page
+ *
+ * Description:	Wait for a particular I/O to complete.
+ */
+static void wait_on_one_page(struct io_info *io_info)
+{
+	do { do_bio_wait(3); } while (io_info->flags);
+}
+
+/*
+ * wait_on_readahead
+ *
+ * Wait until a particular readahead is ready.
+ */
+static void suspend_wait_on_readahead(int readahead_index)
+{
+	int index = readahead_index / BITS_PER_LONG;
+	int bit = readahead_index - index * BITS_PER_LONG;
+
+	/* readahead_index is the one we're waiting on */
+	while (!test_bit(bit, &suspend_readahead_flags[index]))
+		do_bio_wait(6);
+}
+
+/*
+ * readahead_done
+ *
+ * Returns whether the readahead requested is ready.
+ */
+
+static int suspend_readahead_ready(int readahead_index)
+{
+	int index = readahead_index / BITS_PER_LONG;
+	int bit = readahead_index - (index * BITS_PER_LONG);
+
+	return test_bit(bit, &suspend_readahead_flags[index]);
+}
+
+/* suspend_prepare_readahead
+ * Set up for doing readahead on an image */
+static int suspend_prepare_readahead(int index)
+{
+	unsigned long new_page = get_zeroed_page(GFP_ATOMIC);
+
+	if (!new_page)
+		return -ENOMEM;
+
+	suspend_readahead_pages[index] = virt_to_page(new_page);
+	return 0;
+}
+
+/* suspend_cleanup_readahead
+ * Clean up structures used for readahead */
+static void suspend_cleanup_readahead(int page)
+{
+	__free_page(suspend_readahead_pages[page]);
+	suspend_readahead_pages[page] = NULL;
+}
+
+/*
+ * suspend_end_bio
+ *
+ * Description:	Function called by the block driver from interrupt context
+ *		when I/O is completed. This is the reason we use spinlocks
+ *		when manipulating the io_info lists.
+ *		Nearly the fs/buffer.c version, but we want to mark the page
+ *		as done in our own structures too.
+ */
+
+static int suspend_end_bio(struct bio *bio, unsigned int num, int err)
+{
+	struct io_info *io_info = bio->bi_private;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ioinfo_busy_lock, flags);
+	list_del_init(&io_info->list);
+	spin_unlock_irqrestore(&ioinfo_busy_lock, flags);
+
+	set_bit(IO_AWAITING_CLEANUP, &io_info->flags);
+		
+	spin_lock_irqsave(&ioinfo_ready_lock, flags);
+	list_add_tail(&io_info->list, &ioinfo_ready_for_cleanup);
+	spin_unlock_irqrestore(&ioinfo_ready_lock, flags);
+	return 0;
+}
+
+/**
+ *	submit - submit BIO request.
+ *	@rw:	READ or WRITE.
+ *	@io_info: IO info structure.
+ *
+ * 	Based on Patrick's pmdisk code from long ago:
+ *	"Straight from the textbook - allocate and initialize the bio.
+ *	If we're writing, make sure the page is marked as dirty.
+ *	Then submit it and carry on."
+ *
+ *	With a twist, though - we handle block_size != PAGE_SIZE.
+ *	Caller has already checked that our page is not fragmented.
+ */
+
+static int submit(int rw, struct io_info *io_info)
+{
+	int error = 0;
+	struct bio *bio = NULL;
+	unsigned long flags;
+
+	while (!bio) {
+		bio = bio_alloc(GFP_ATOMIC, 1);
+		if (!bio)
+			do_bio_wait(4);
+	}
+
+	bio->bi_bdev = io_info->dev;
+	bio->bi_sector = io_info->block[0];
+	bio->bi_private = io_info;
+	bio->bi_end_io = suspend_end_bio;
+	bio->bi_flags |= (1 << BIO_SUSPEND2);
+	io_info->sys_struct = bio;
+	if (io_info->printme)
+		PRINTK("%s dev %p block %ld => sector %ld\n",
+			rw ? "Write" : "Read",
+			bio->bi_bdev, io_info->block[0],
+			(unsigned long) bio->bi_sector);
+
+	if (bio_add_page(bio, io_info->buffer_page, PAGE_SIZE, 0) < PAGE_SIZE) {
+		printk("ERROR: adding page to bio at %lld\n",
+				(unsigned long long) io_info->block[0]);
+		bio_put(bio);
+		return -EFAULT;
+	}
+
+	if (rw == WRITE)
+		bio_set_pages_dirty(bio);
+
+	spin_lock_irqsave(&ioinfo_busy_lock, flags);
+	list_add_tail(&io_info->list, &ioinfo_busy);
+	spin_unlock_irqrestore(&ioinfo_busy_lock, flags);
+	
+	submit_bio(rw, bio);
+
+	return error;
+}
+
+/*
+ * Submit a batch. The submit function can wait on I/O, so we use a
+ * simple recursion guard to avoid submitting batches within batches.
+ */
+static int submit_batched(void)
+{
+	static int running_already = 0;
+	struct io_info *first;
+	unsigned long flags;
+	int num_submitted = 0;
+
+	if (running_already)
+		return 0;
+
+	running_already = 1;
+	spin_lock_irqsave(&ioinfo_submit_lock, flags);
+	while (!list_empty(&ioinfo_submit_batch)) {
+		first = list_entry(ioinfo_submit_batch.next, struct io_info, list);
+
+		BUG_ON(!test_and_clear_bit(IO_AWAITING_SUBMIT, &first->flags));
+
+		list_del_init(&first->list);
+
+		atomic_dec(&submit_batch);
+
+		spin_unlock_irqrestore(&ioinfo_submit_lock, flags);
+
+		if (test_bit(IO_AWAITING_READ, &first->flags))
+			submit(READ, first);
+		else
+			submit(WRITE, first);
+
+		spin_lock_irqsave(&ioinfo_submit_lock, flags);
+		
+		num_submitted++;
+		if (num_submitted == submit_batch_size)
+			break;
+	}
+	spin_unlock_irqrestore(&ioinfo_submit_lock, flags);
+	running_already = 0;
+
+	return num_submitted;
+}
+
+static void add_to_batch(struct io_info *io_info)
+{
+	unsigned long flags;
+	
+	set_bit(IO_AWAITING_SUBMIT, &io_info->flags);
+
+	/* Put our prepared I/O struct on the batch list. */
+	spin_lock_irqsave(&ioinfo_submit_lock, flags);
+	list_add_tail(&io_info->list, &ioinfo_submit_batch);
+	spin_unlock_irqrestore(&ioinfo_submit_lock, flags);
+
+	atomic_inc(&submit_batch);
+
+	if ((!suspend_bio_task) && (atomic_read(&submit_batch) >= submit_batch_size))
+		submit_batched();
+}
+
+/*
+ * get_io_info_struct
+ *
+ * Description:	Get an I/O struct.
+ * Returns:	Pointer to the struct prepared for use.
+ */
+static struct io_info *get_io_info_struct(void)
+{
+	unsigned long newpage = 0, flags;
+	struct io_info *this = NULL;
+	int remaining = 0;
+
+	do {
+		while (atomic_read(&outstanding_io) >= MAX_OUTSTANDING_IO)
+			do_bio_wait(0);
+
+		/* Can start a new I/O. Is there a free one? */
+		if (!list_empty(&ioinfo_free)) {
+			/* Yes. Grab it. */
+			spin_lock_irqsave(&ioinfo_free_lock, flags);
+			break;
+		}
+
+		/* No. Need to allocate a new page for I/O info structs. */
+		newpage = get_zeroed_page(GFP_ATOMIC);
+		if (!newpage) {
+			do_bio_wait(1);
+			continue;
+		}
+
+		suspend_message(SUSPEND_MEMORY, SUSPEND_VERBOSE, 0,
+				"[NewIOPage %lx]", newpage);
+		infopages++;
+		if (infopages > maxinfopages)
+			maxinfopages++;
+
+		/* Prepare the new page for use. */
+		this = (struct io_info *) newpage;
+		remaining = PAGE_SIZE;
+		spin_lock_irqsave(&ioinfo_free_lock, flags);
+		while (remaining >= (sizeof(struct io_info))) {
+			list_add_tail(&this->list, &ioinfo_free);
+			this = (struct io_info *) (((char *) this) + 
+					sizeof(struct io_info));
+			remaining -= sizeof(struct io_info);
+		}
+		break;
+	} while (1);
+
+	/*
+	 * We have an I/O info struct. Remove it from the free list.
+	 * It will be added to the submit or busy list later.
+	 */
+	this = list_entry(ioinfo_free.next, struct io_info, list);
+	list_del_init(&this->list);
+	spin_unlock_irqrestore(&ioinfo_free_lock, flags);
+	return this;
+}
+
+/*
+ * start_one
+ *
+ * Description:	Prepare and start a read or write operation.
+ * 		Note that we use our own buffer for reading or writing.
+ * 		This simplifies doing readahead and asynchronous writing.
+ * 		We can begin a read without knowing the location into which
+ * 		the data will eventually be placed, and the buffer passed
+ * 		for a write can be reused immediately (essential for the
+ * 		modules system).
+ * 		Failure? What's that?
+ * Returns:	The io_info struct created.
+ */
+static struct io_info *start_one(int rw, struct submit_params *submit_info)
+{
+	struct io_info *io_info = get_io_info_struct();
+	unsigned long buffer_virt = 0;
+	char *to, *from;
+	struct page *buffer_page;
+
+	if (!io_info)
+		return NULL;
+
+	/* Get our local buffer */
+	suspend_message(SUSPEND_WRITER, SUSPEND_HIGH, 1,
+			"Start_IO: [%p]", io_info);
+	
+	/* Copy settings to the io_info struct */
+	io_info->data_page = submit_info->page;
+	io_info->readahead_index = submit_info->readahead_index;
+	io_info->printme = submit_info->printme;
+
+	if (io_info->readahead_index == -1) {
+		while (!(buffer_virt = get_zeroed_page(GFP_ATOMIC)))
+			do_bio_wait(5);
+
+		atomic_inc(&buffer_allocs);
+		suspend_message(SUSPEND_WRITER, SUSPEND_HIGH, 0,
+				"[ALLOC BUFFER]->%d",
+				real_nr_free_pages());
+		buffer_page = virt_to_page(buffer_virt);
+	
+		io_info->buffer_page = buffer_page;
+	} else {
+		unsigned long flags;
+		int index = io_info->readahead_index / BITS_PER_LONG;
+		int bit = io_info->readahead_index - index * BITS_PER_LONG;
+
+		spin_lock_irqsave(&suspend_readahead_flags_lock, flags);
+		clear_bit(bit, &suspend_readahead_flags[index]);
+		spin_unlock_irqrestore(&suspend_readahead_flags_lock, flags);
+
+		io_info->buffer_page = buffer_page = submit_info->page;
+	}
+
+	/* If writing, copy our data. The data is probably in
+	 * lowmem, but we cannot be certain. If there is no
+	 * compression/encryption, we might be passed the
+	 * actual source page's address. */
+	if (rw == WRITE) {
+		set_bit(IO_WRITING, &io_info->flags);
+
+		to = (char *) buffer_virt;
+		from = kmap_atomic(io_info->data_page, KM_USER1);
+		memcpy(to, from, PAGE_SIZE);
+		kunmap_atomic(from, KM_USER1);
+	}
+
+	/* Submit the page */
+	get_page(buffer_page);
+	
+	io_info->dev = submit_info->dev;
+	io_info->block[0] = submit_info->block[0];
+
+	if (rw == READ)
+		set_bit(IO_AWAITING_READ, &io_info->flags);
+	else
+		set_bit(IO_AWAITING_WRITE, &io_info->flags);
+
+	suspend_message(SUSPEND_WRITER, SUSPEND_HIGH, 1,
+			"-> (PRE BRW) %d\n",
+			real_nr_free_pages());
+
+	if (submit_batch_size > 1)
+		add_to_batch(io_info);
+	else
+		submit(rw, io_info);
+	
+	atomic_inc(&outstanding_io);
+	if (atomic_read(&outstanding_io) > max_outstanding_io)
+		max_outstanding_io++;
+	
+	return io_info;
+}
+
+static int suspend_do_io(int rw, 
+		struct submit_params *submit_info, int syncio)
+{
+	struct io_info *io_info;
+
+	if (!submit_info->dev)
+		return 1;
+	
+	io_info = start_one(rw, submit_info);
+
+	if (!io_info)
+		return 1;
+	else if (syncio)
+		wait_on_one_page(io_info);
+
+	/* If we were the only one, clean everything up */
+	if (!atomic_read(&outstanding_io))
+		suspend_finish_all_io();
+	return 0;
+} 
+
+/* We used to use bread here, but it doesn't correctly handle
+ * blocksize != PAGE_SIZE. Now we create a submit_info to get the data we
+ * want and use our normal routines (synchronously).
+ */
+
+static int suspend_bdev_page_io(int rw, struct block_device *bdev, long pos,
+		struct page *page)
+{
+	struct submit_params submit_info;
+
+	if (!bdev)
+		return 0;
+
+	submit_info.page = page;
+	submit_info.dev = bdev;
+	submit_info.block[0] = pos;
+	submit_info.readahead_index = -1;
+	return suspend_do_io(rw, &submit_info, 1);
+}
+
+static unsigned long suspend_bio_memory_needed(void)
+{
+	/* We want to have at least enough memory so as to have
+	 * MAX_OUTSTANDING_IO transactions in flight at once. If we
+	 * can do more, fine. */
+	return (MAX_OUTSTANDING_IO * (PAGE_SIZE + sizeof(struct request) +
+				sizeof(struct bio) + sizeof(struct io_info)));
+}
+
+static void suspend_set_devinfo(struct suspend_bdev_info *info)
+{
+	suspend_devinfo = info;
+}
+
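+/*
+ * Advance suspend_writer_posn by one page's worth of device blocks
+ * (blocks_per_page for the current chain), plus any extra pages
+ * requested via need_extra_next.
+ */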
+static int forward_one_page(void)
+{
+	int i, j;
+
+	for (j = 0; j < need_extra_next + 1; j++) {
+		extent_state_next(&suspend_writer_posn);
+	
+		/* Have to go forward one to ensure we're on the right chain,
+		 * before we can know how many more blocks to skip.*/
+		for (i = 1; i < suspend_devinfo[suspend_writer_posn.current_chain].blocks_per_page; i++)
+			extent_state_next(&suspend_writer_posn);
+
+		if (extent_state_eof(&suspend_writer_posn)) {
+			printk("Extent state eof.\n");
+			return -ENODATA;
+		}
+	}
+
+	need_extra_next = 0;
+
+	return 0;
+}
+
+static int __suspend_rw_page(int rw, struct page *page,
+		int readahead_index, int sync, int debug)
+{
+	int i, current_chain;
+	struct submit_params submit_params;
+
+	if (test_action_state(SUSPEND_TEST_FILTER_SPEED))
+		return 0;
+		
+	submit_params.readahead_index = readahead_index;
+	submit_params.page = page;
+	
+	if (forward_one_page())
+		return -ENODATA;
+
+	current_chain = suspend_writer_posn.current_chain;
+	submit_params.dev = suspend_devinfo[current_chain].bdev;
+	submit_params.block[0] = (suspend_writer_posn.current_offset -
+		       suspend_devinfo[current_chain].blocks_per_page + 1) <<
+		suspend_devinfo[current_chain].bmap_shift;
+
+	if (debug)
+		printk("%s: %lx:%lx.\n", rw ? "Write" : "Read",
+				(long) submit_params.dev->bd_dev,
+				(long) submit_params.block[0]);
+
+	i = suspend_do_io(rw, &submit_params, sync);
+
+	if (i)
+		return -EIO;
+
+	return 0;
+}
+
+static int suspend_rw_page(int rw, struct page *page,
+		int readahead_index, int sync)
+{
+	return __suspend_rw_page(rw, page, readahead_index, sync, 0);
+}
+
+static int suspend_bio_read_chunk(struct page *buffer_page, int sync)
+{
+	static int last_result;
+	unsigned long *virt;
+
+	if (sync == SUSPEND_ASYNC)
+		return suspend_rw_page(READ, buffer_page, -1, sync);
+
+	/* First synchronous call for this stream: initialise the
+	 * readahead ring. */
+	if (readahead_index == -1) {
+		last_result = 0;
+		readahead_index = readahead_submit_index = 0;
+	}
+
+	/* A previous submission failed; don't start more readahead. */
+	if (last_result) {
+		/* We failed to submit a read, and have cleaned up
+		 * all the readahead previously submitted */
+		if (readahead_submit_index == readahead_index)
+			return -EPERM;
+		goto wait;
+	}
+	
+	do {
+		if (suspend_prepare_readahead(readahead_submit_index))
+			break;
+
+		last_result = suspend_rw_page(
+			READ,
+			suspend_readahead_pages[readahead_submit_index], 
+			readahead_submit_index, SUSPEND_ASYNC);
+		if (last_result) {
+			printk("Begin read chunk for page %d returned %d.\n",
+				readahead_submit_index, last_result);
+			suspend_cleanup_readahead(readahead_submit_index);
+			break;
+		}
+
+		readahead_submit_index++;
+
+		if (readahead_submit_index == MAX_OUTSTANDING_IO)
+			readahead_submit_index = 0;
+
+	} while (!last_result && readahead_submit_index != readahead_index &&
+			!suspend_readahead_ready(readahead_index));
+
+wait:
+	suspend_wait_on_readahead(readahead_index);
+
+	virt = kmap_atomic(buffer_page, KM_USER1);
+	memcpy(virt, page_address(suspend_readahead_pages[readahead_index]),
+			PAGE_SIZE);
+	kunmap_atomic(virt, KM_USER1);
+
+	suspend_cleanup_readahead(readahead_index);
+
+	readahead_index++;
+	if (readahead_index == MAX_OUTSTANDING_IO)
+		readahead_index = 0;
+
+	return 0;
+}
+
+static int suspend_read_init(int stream_number)
+{
+	current_stream = stream_number;
+	extent_state_restore(&suspend_writer_posn,
+			&suspend_writer_posn_save[current_stream]);
+
+	BUG_ON(!suspend_writer_posn.current_extent);
+
+	suspend_reset_io_stats();
+
+	readahead_index = readahead_submit_index = -1;
+
+	return 0;
+}
+
+static int suspend_read_cleanup(void)
+{
+	suspend_finish_all_io();
+	while (readahead_index != readahead_submit_index) {
+		suspend_cleanup_readahead(readahead_index);
+		readahead_index++;
+		if (readahead_index == MAX_OUTSTANDING_IO)
+			readahead_index = 0;
+	}
+	suspend_check_io_stats();
+	return 0;
+}
+
+static int suspend_write_init(int stream_number)
+{
+	extent_state_restore(&suspend_writer_posn,
+			&suspend_writer_posn_save[stream_number]);
+	current_stream = stream_number;
+
+	BUG_ON(!suspend_writer_posn.current_extent);
+
+	suspend_reset_io_stats();
+
+	return 0;
+}
+
+static int suspend_write_cleanup(void)
+{
+	if (current_stream == 2)
+		extent_state_save(&suspend_writer_posn,
+				&suspend_writer_posn_save[1]);
+	
+	suspend_finish_all_io();
+	
+	suspend_check_io_stats();
+
+	return 0;
+}
+
+static int suspend_write_chunk(struct page *buffer_page)
+{
+	return suspend_rw_page(WRITE, buffer_page, -1, 0);
+}
+
+static int suspend_rw_header_chunk(int rw, char *buffer, int buffer_size)
+{
+	int bytes_left = buffer_size;
+	
+	/* Read or write a chunk of the header */
+	while (bytes_left) {
+		char *source_start = buffer + buffer_size - bytes_left;
+		char *dest_start = suspend_writer_buffer + suspend_writer_buffer_posn;
+		int capacity = PAGE_SIZE - suspend_writer_buffer_posn;
+		char *to = rw ? dest_start : source_start;
+		char *from = rw ? source_start : dest_start;
+
+		if (bytes_left <= capacity) {
+			if (test_debug_state(SUSPEND_HEADER))
+				printk("Copy %d bytes from %p to %p.\n",
+						bytes_left, to, from);
+			memcpy(to, from, bytes_left);
+			suspend_writer_buffer_posn += bytes_left;
+			return rw ? 0 : buffer_size;
+		}
+
+		/* Buffer full (writing) or empty (reading): transfer a page
+		 * and continue */
+		if (test_debug_state(SUSPEND_HEADER))
+			printk("Copy %d bytes from %p to %p.\n",
+					capacity, to, from);
+		memcpy(to, from, capacity);
+		bytes_left -= capacity;
+
+		if (rw == READ && test_suspend_state(SUSPEND_TRY_RESUME_RD))
+			sys_read(suspend_read_fd,
+				suspend_writer_buffer, BLOCK_SIZE);
+		else {
+			if (__suspend_rw_page(rw,
+					virt_to_page(suspend_writer_buffer),
+					-1, !rw,
+					test_debug_state(SUSPEND_HEADER)))
+				return -EIO;
+		}
+
+		suspend_writer_buffer_posn = 0;
+		check_shift_keys(0, NULL);
+	}
+
+	return rw ? 0 : buffer_size;
+}
+
+static int write_header_chunk_finish(void)
+{
+	return __suspend_rw_page(WRITE,
+		virt_to_page(suspend_writer_buffer),
+		-1, 0, test_debug_state(SUSPEND_HEADER)) ? -EIO : 0;
+}
+
+static int read_header_chunk(char *buffer, int buffer_size)
+{
+	return suspend_rw_header_chunk(READ, buffer, buffer_size);
+}
+
+static int write_header_chunk(char *buffer, int buffer_size)
+{
+	return suspend_rw_header_chunk(WRITE, buffer, buffer_size);
+}
+
+struct suspend_bio_ops suspend_bio_ops = {
+	.submit_io = suspend_do_io,
+	.bdev_page_io = suspend_bdev_page_io,
+	.rw_page = suspend_rw_page,
+	.wait_on_readahead = suspend_wait_on_readahead,
+	.check_io_stats = suspend_check_io_stats,
+	.reset_io_stats = suspend_reset_io_stats,
+	.finish_all_io = suspend_finish_all_io,
+	.prepare_readahead = suspend_prepare_readahead,
+	.cleanup_readahead = suspend_cleanup_readahead,
+	.readahead_pages = suspend_readahead_pages,
+	.readahead_ready = suspend_readahead_ready,
+	.need_extra_next = &need_extra_next,
+	.forward_one_page = forward_one_page,
+	.set_devinfo = suspend_set_devinfo,
+	.read_init = suspend_read_init,
+	.read_chunk = suspend_bio_read_chunk,
+	.read_cleanup = suspend_read_cleanup,
+	.write_init = suspend_write_init,
+	.write_chunk = suspend_write_chunk,
+	.write_cleanup = suspend_write_cleanup,
+	.read_header_chunk = read_header_chunk,
+	.write_header_chunk = write_header_chunk,
+	.write_header_chunk_finish = write_header_chunk_finish,
+};
+
+static struct suspend_module_ops suspend_blockwriter_ops = {
+	.name					= "Block I/O",
+	.type					= MISC_PLUGIN,
+	.module					= THIS_MODULE,
+	.memory_needed				= suspend_bio_memory_needed,
+};
+
+static __init int suspend_block_io_load(void)
+{
+	return suspend_register_module(&suspend_blockwriter_ops);
+}
+
+#ifdef MODULE
+static __exit void suspend_block_io_unload(void)
+{
+	suspend_unregister_module(&suspend_blockwriter_ops);
+}
+
+module_init(suspend_block_io_load);
+module_exit(suspend_block_io_unload);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Nigel Cunningham");
+MODULE_DESCRIPTION("Suspend2 block io functions");
+#else
+late_initcall(suspend_block_io_load);
+#endif
diff -urN oldtree/kernel/power/suspend_checksums.c newtree/kernel/power/suspend_checksums.c
--- oldtree/kernel/power/suspend_checksums.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/suspend_checksums.c	2006-03-08 15:22:33.333510500 +0000
@@ -0,0 +1,509 @@
+#include <linux/suspend.h>
+#include <linux/highmem.h>
+#ifdef CONFIG_KDB
+#include <linux/kdb.h>
+#include <linux/kdbprivate.h>
+#endif
+#include <linux/module.h>
+
+#include "suspend.h"
+#include "modules.h"
+#include "pageflags.h"
+#include "proc.h"
+#include "pagedir.h"
+#include "ui.h"
+
+#define CHECKSUMS_PER_PAGE ((PAGE_SIZE - sizeof(void *)) / sizeof(unsigned long))
+#define NEXT_CHECKSUM_PAGE(page) *((unsigned long *) (((char *) (page)) + PAGE_SIZE - sizeof(void *)))
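+/*
+ * Checksum pages form a singly linked list: each holds CHECKSUMS_PER_PAGE
+ * sums, with the final sizeof(void *) bytes storing the address of the
+ * next checksum page.
+ */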
+
+static int checksum_pages;
+static unsigned long *first_checksum_page, *last_checksum_page;
+static int num_reload_pages = 0;
+
+struct reload_data
+{
+	int pageset;
+	int pagenumber;
+	struct page *page_address;
+	char *base_version;
+	char *compared_version;
+	struct reload_data *next;
+};
+
+static struct reload_data *first_reload_data, *last_reload_data;
+
+unsigned long suspend_page_checksum(struct page *page)
+{
+	unsigned long *virt;
+	int i;
+	unsigned long value = 0;
+
+	virt = (unsigned long *) kmap_atomic(page, KM_USER0);
+	for (i = 0; i < (PAGE_SIZE / sizeof(unsigned long)); i++)
+		value += *(virt + i);
+	kunmap_atomic(virt, KM_USER0);
+	return value;
+}
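+
+/*
+ * The checksum is a plain sum of the page's word-sized contents:
+ * cheap, but order-insensitive, so two swapped words cancel out.
+ * Usage sketch (illustrative only, not compiled):
+ */
+#if 0
+	unsigned long before = suspend_page_checksum(page);
+	/* ... the atomic copy of the page happens here ... */
+	if (suspend_page_checksum(page) != before)
+		/* the page changed while the image was being written */;
+#endif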
+
+extern void get_first_pbe(struct pbe *pbe, struct pagedir *pagedir);
+extern void get_next_pbe(struct pbe *pbe);
+
+void __suspend_calculate_checksums(dyn_pageflags_t map, unsigned long **current_checksum_page,
+		int *page_index)
+{
+	int page_number;
+	
+	BITMAP_FOR_EACH_SET(map, page_number) {
+		*(*current_checksum_page + *page_index) =
+			suspend_page_checksum(pfn_to_page(page_number));
+		(*page_index)++;
+		if (*page_index == CHECKSUMS_PER_PAGE) {
+			*page_index = 0;
+			*current_checksum_page = (unsigned long *)
+				NEXT_CHECKSUM_PAGE(*current_checksum_page);
+		}
+	};
+}
+
+void suspend_calculate_checksums(void)
+{
+	int page_index = 0;
+	unsigned long *current_checksum_page = first_checksum_page;
+	
+	if (!first_checksum_page) {
+		suspend_prepare_status(1, 0, "Unable to checksum at this point.");
+		return;
+	}
+
+	suspend_prepare_status(1, 0, "Calculating checksums... ");
+
+	__suspend_calculate_checksums(pageset1_map, &current_checksum_page,
+			&page_index);
+	
+	__suspend_calculate_checksums(pageset2_map, &current_checksum_page,
+			&page_index);
+
+	suspend_prepare_status(1, 0, "Checksums done.");
+}
+
+int __suspend_check_checksums(int whichpagedir, unsigned long **current_checksum_page,
+		int *page_index, struct reload_data **next_reload_data)
+{
+	int page_number, num_differences = 0;
+	unsigned long sum_now;
+	dyn_pageflags_t map;
+
+	if (whichpagedir == 1)
+		map = pageset1_map;
+	else
+		map = pageset2_map;
+
+	BITMAP_FOR_EACH_SET(map, page_number) {
+		/* Also ignore the page containing our variables */
+		if (!PageChecksumIgnore(pfn_to_page(page_number))) {
+			sum_now = suspend_page_checksum(pfn_to_page(page_number));
+			if (sum_now != *(*current_checksum_page + *page_index)) {
+				num_differences++;
+				if (next_reload_data) {
+					char *virt;
+					struct reload_data *this = *next_reload_data;
+					this->pageset = whichpagedir;
+					this->pagenumber = page_number;
+					this->page_address = pfn_to_page(page_number);
+					virt = kmap_atomic(pfn_to_page(page_number), KM_USER0);
+					memcpy(this->compared_version,
+						virt, PAGE_SIZE);
+					kunmap_atomic(virt, KM_USER0);
+					*next_reload_data = this->next;
+				}
+			}
+		}
+
+		(*page_index)++;
+		if (*page_index == CHECKSUMS_PER_PAGE) {
+			*page_index = 0;
+			*current_checksum_page = (unsigned long *) 
+				NEXT_CHECKSUM_PAGE(*current_checksum_page);
+		}
+	}
+
+	return num_differences;
+}
+
+void suspend_check_checksums(void)
+{
+	int page_index = 0, num_differences = 0;
+	unsigned long *current_checksum_page = first_checksum_page;
+	struct reload_data *next_reload_data = first_reload_data;
+
+	if (!first_checksum_page) {
+		suspend_prepare_status(1, 0, "Unable to checksum at this point.");
+		return;
+	}
+
+	num_differences += __suspend_check_checksums(1, &current_checksum_page,
+			&page_index, &next_reload_data);
+
+	num_differences += __suspend_check_checksums(2, &current_checksum_page,
+			&page_index, &next_reload_data);
+
+	if (num_differences)
+		printk("Checksumming found %d differences.\n",
+			num_differences);
+}
+
+/*
+ * free_reload_data.
+ *
+ * Reload data begins on a page boundary.
+ */
+void suspend_free_reload_data(void)
+{
+	struct reload_data *this_data = first_reload_data;
+	struct reload_data *prev_reload_data = this_data;
+	
+	while (this_data) {
+		if (this_data->compared_version)
+		  free_pages((unsigned long) this_data->compared_version, 0);
+
+		if (this_data->base_version)
+			free_pages((unsigned long) this_data->base_version, 0);
+
+		this_data = this_data->next;
+
+		if (!(((unsigned long) this_data) & ~PAGE_MASK)) {
+			prev_reload_data->next = this_data;
+			prev_reload_data = this_data;
+		}
+	}
+
+	this_data = first_reload_data;
+	while (this_data) {
+		prev_reload_data = this_data;
+		this_data = this_data->next;
+		free_pages((unsigned long) prev_reload_data, 0);
+		num_reload_pages--;
+	}
+
+	first_reload_data = last_reload_data = NULL;
+
+}
+
+/* suspend_reread_pages()
+ *
+ * Description:	Reread pages from an image for diagnosing differences.
+ * Arguments:	page_list:	A list containing information on pages
+ *                              to be reloaded, sorted by pageset and
+ *                              page index.
+ * Returns:	Zero on success or -1 on failure.
+ */
+
+int suspend_reread_pages(struct reload_data *page_list)
+{
+	int result = 0, whichtoread, pageset_offset = -1;
+	long i = 0;
+	struct suspend_module_ops *this_filter, *first_filter = get_next_filter(NULL);
+	dyn_pageflags_t *pageflags = &pageset1_map;
+
+	if (!page_list)
+		return 0;
+
+	for (whichtoread = page_list->pageset; whichtoread <= 2; whichtoread++) {
+		struct pagedir *pagedir;
+
+		switch (whichtoread) {
+			case 1:
+				pagedir = &pagedir1;
+				break;
+			case 2:
+				pagedir = &pagedir2;
+				pageflags = &pageset2_map;
+				pageset_offset = -1;
+				i = -1;
+				break;
+			default:
+				goto out;
+		}
+
+		suspend_message(SUSPEND_IO, SUSPEND_LOW, 0,
+			"Reread pages from pagedir %d.\n", whichtoread);
+
+		/* Initialise page transformers */
+		list_for_each_entry(this_filter, &suspend_filters, ops.filter.filter_list) {
+			if (this_filter->disabled)
+				continue;
+			if (this_filter->read_init && 
+				this_filter->read_init(whichtoread)) {
+				abort_suspend("Failed to initialise a filter.");
+				return 1;
+			}
+		}
+
+		/* Initialise writer */
+		if (active_writer->read_init(whichtoread)) {
+			abort_suspend("Failed to initialise the writer."); 
+			result = 1;
+			goto reread_free_buffers;
+		}
+
+		/* Read the pages */
+		while (i <= page_list->pagenumber) {
+			/* Read */
+			result = first_filter->ops.filter.read_chunk(
+					virt_to_page(page_list->base_version),
+					SUSPEND_SYNC);
+
+			if (result) {
+				abort_suspend("Failed to read a chunk of the image.");
+				goto reread_free_buffers;
+			}
+
+			/* Interactivity */
+			check_shift_keys(0, NULL);
+
+			/* Prepare next */
+			pageset_offset = get_next_bit_on(*pageflags, pageset_offset);
+
+			/* Got the one we're after? */
+			i++;
+
+			if (i == page_list->pagenumber)
+				page_list = page_list->next;
+
+			if (!page_list || page_list->pageset != whichtoread)
+				break;
+		}
+		
+reread_free_buffers:
+
+		/* Cleanup reads from this pageset. */
+		list_for_each_entry(this_filter, &suspend_modules, module_list) {
+			if (this_filter->disabled)
+				continue;
+			if (this_filter->read_cleanup &&
+				this_filter->read_cleanup()) {
+				abort_suspend("Failed to cleanup a filter.");
+				result = 1;
+			}
+		}
+
+		if (active_writer->read_cleanup()) {
+			abort_suspend("Failed to cleanup the writer.");
+			result = 1;
+		}
+	}
+out:
+	printk("\n");
+
+	return result;
+}
+
+void suspend_free_checksum_pages(void)
+{
+	unsigned long *next_checksum_page;
+
+	while (first_checksum_page) {
+		next_checksum_page =
+		  (unsigned long *) NEXT_CHECKSUM_PAGE(first_checksum_page);
+		free_pages((unsigned long) first_checksum_page, 0);
+		first_checksum_page = next_checksum_page;
+	}
+	last_checksum_page = NULL;
+	checksum_pages = 0;
+}
+
+#define PRINTABLE(a) (((a) < 32 || (a) > 122) ? '.' : (a))
+static void local_print_location(
+		unsigned char *real,
+		unsigned char *original,
+		unsigned char *resumetime)
+{
+	int i;
+
+	for (i = 0; i < 8; i++)
+		if (*(original + i) != *(resumetime + i))
+			break;
+	if (i == 8)
+		return;
+	
+	suspend_message(SUSPEND_INTEGRITY, SUSPEND_HIGH, 1, "%p", real);
+	if (PageChecksumIgnore(virt_to_page(real)))
+		suspend_message(SUSPEND_INTEGRITY, SUSPEND_HIGH, 1,
+			" [NoSave]");
+	if (PageSlab(virt_to_page(real)))
+		suspend_message(SUSPEND_INTEGRITY, SUSPEND_HIGH, 1,
+			" [Slab]");
+	suspend_message(SUSPEND_INTEGRITY, SUSPEND_HIGH, 1, "\n");
+
+#ifdef CONFIG_KDB
+	for (i = 0; i < 8; i++) {
+		static const char *last_sym = NULL;
+		if (*(original + i) != *(resumetime + i)) {
+			kdb_symtab_t symtab;
+
+			kdbnearsym((unsigned long) real + i,
+					&symtab);
+
+			if ((!symtab.sym_name) ||
+					(symtab.sym_name == last_sym))
+				continue;
+
+			last_sym = symtab.sym_name;
+
+			suspend_message(SUSPEND_INTEGRITY, SUSPEND_LOW, 1,
+				"%p = %s\n",
+				symtab.sym_start,
+				symtab.sym_name);
+		}
+	}
+#endif
+
+	for (i = 0; i < 8; i++)
+		suspend_message(SUSPEND_INTEGRITY, SUSPEND_HIGH, 1,
+			"%2x ", *(original + i));
+	suspend_message(SUSPEND_INTEGRITY, SUSPEND_HIGH, 1, "    ");
+	for (i = 0; i < 8; i++)
+		suspend_message(SUSPEND_INTEGRITY, SUSPEND_HIGH, 1,
+			"%c", PRINTABLE(*(original + i)));
+	suspend_message(SUSPEND_INTEGRITY, SUSPEND_HIGH, 1, "    ");
+	
+	for (i = 0; i < 8; i++)
+		suspend_message(SUSPEND_INTEGRITY, SUSPEND_HIGH, 1,
+			"%2x ", *(resumetime + i));
+	suspend_message(SUSPEND_INTEGRITY, SUSPEND_HIGH, 1, "    ");
+	for (i = 0; i < 8; i++)
+		suspend_message(SUSPEND_INTEGRITY, SUSPEND_HIGH, 1,
+			"%c", PRINTABLE(*(resumetime + i)));
+	suspend_message(SUSPEND_INTEGRITY, SUSPEND_HIGH, 1, "\n\n");
+}
+
+int suspend_allocate_reload_data(int pages)
+{
+	struct reload_data *this_data;
+	unsigned long data_start;
+	int i;
+
+	if (num_reload_pages >= pages)
+		return 0;
+
+	for (i = 1; i <= pages; i++) {
+		data_start = get_zeroed_page(GFP_ATOMIC);
+
+		if (!data_start)
+			return -ENOMEM;
+
+		SetPageChecksumIgnore(virt_to_page(data_start));
+		this_data = (struct reload_data *) data_start; 
+		num_reload_pages++;
+
+		while (data_start == 
+		  ((((unsigned long) (this_data + 1)) - 1) & PAGE_MASK)) {
+			struct page *page;
+			unsigned long virt;
+
+			virt = get_zeroed_page(GFP_ATOMIC);
+			if (!virt) {
+				printk("Couldn't get a page in which to store "
+					"a changed page.\n");
+				return -ENOMEM;
+			}
+			page = virt_to_page(virt);
+
+			this_data->compared_version = (char *) virt;
+			SetPageChecksumIgnore(page);
+			
+			virt = get_zeroed_page(GFP_ATOMIC);
+			if (!virt) {
+				printk("Couldn't get a page in which to store "
+					"a baseline page.\n");
+				return -ENOMEM;
+			}
+			page = virt_to_page(virt);
+
+			this_data->base_version = (char *) virt;
+			SetPageChecksumIgnore(page);
+
+			if (last_reload_data)
+				last_reload_data->next = this_data;
+			else
+				first_reload_data = this_data;
+			
+			last_reload_data = this_data;
+
+			this_data++;
+		}
+
+		check_shift_keys(0, NULL);
+	}
+
+	return 0;
+}
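+
+/*
+ * Packing sketch (assuming 4KB pages): reload_data structs are laid
+ * out back to back within each zeroed page, and the inner loop above
+ * stops as soon as the next struct would cross the page boundary, so
+ * a short tail of each page may go unused. Every struct additionally
+ * owns two whole pages (base_version and compared_version) holding
+ * the two copies of a changed page.
+ */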
+
+void suspend_print_differences(void)
+{
+	struct reload_data *this_data = first_reload_data;
+	int i;
+
+	suspend_reread_pages(first_reload_data);
+	
+	while (this_data) {
+		if (this_data->pageset && 
+		    this_data->pagenumber) {
+			suspend_message(SUSPEND_INTEGRITY, SUSPEND_MEDIUM, 1,
+				"Pagedir %d. Page %d. Address %p."
+				" Base %p. Copy %p.\n",
+				this_data->pageset,
+				this_data->pagenumber,
+				page_address(this_data->page_address),
+				this_data->base_version,
+				this_data->compared_version);
+			for (i = 0; i < (PAGE_SIZE / 8); i++) {
+				local_print_location(
+					page_address(this_data->page_address) + i * 8,
+					this_data->base_version + i * 8,
+					this_data->compared_version + i * 8);
+				check_shift_keys(0, NULL);
+			}
+			check_shift_keys(1, NULL);
+		} else
+			return;
+		this_data = this_data->next;
+	}
+}
+
+int __suspend_allocate_checksum_pages(void)
+{
+	int pages_required =
+		(pagedir1.pageset_size + pagedir2.pageset_size) / CHECKSUMS_PER_PAGE;
+	unsigned long this_page;
+
+	while (checksum_pages <= pages_required) {
+		this_page = get_zeroed_page(GFP_ATOMIC);
+		if (!this_page)
+			return -ENOMEM;
+
+		if (!first_checksum_page)
+			first_checksum_page = 
+				(unsigned long *) this_page;
+		else
+			NEXT_CHECKSUM_PAGE(last_checksum_page) = this_page;
+		
+		last_checksum_page = (unsigned long *) this_page;
+		SetPageChecksumIgnore(virt_to_page(this_page));
+		checksum_pages++;
+	}
+
+	return suspend_allocate_reload_data(2);
+}
+
+int suspend_checksum_init(void)
+{
+	if (suspend_allocate_dyn_pageflags(&checksum_map))
+		return 1;
+	return 0;
+}
+
+
+void suspend_checksum_cleanup(void)
+{
+	suspend_free_reload_data();
+	suspend_free_checksum_pages();
+	
+	suspend_free_dyn_pageflags(&checksum_map);
+}
diff -urN oldtree/kernel/power/suspend_file.c newtree/kernel/power/suspend_file.c
--- oldtree/kernel/power/suspend_file.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/suspend_file.c	2006-03-08 17:10:10.373050250 +0000
@@ -0,0 +1,1074 @@
+/*
+ * kernel/power/suspend_file.c
+ *
+ * Copyright 2005 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * Distributed under GPLv2.
+ * 
+ * This file encapsulates functions for usage of a simple file as a
+ * backing store. It is based upon the swapwriter, and shares the
+ * same basic working. Here, though, we have nothing to do with
+ * swapspace, and only one device to worry about.
+ *
+ * The user can just
+ *
+ * echo Suspend2 > /path/to/my_file
+ *
+ * and
+ *
+ * echo /path/to/my_file > /proc/software_suspend/filewriter_target
+ *
+ * then put what they find in /proc/software_suspend/resume2
+ * as their resume2= parameter in lilo.conf (and rerun lilo if using it).
+ *
+ * Having done this, they're ready to suspend and resume.
+ *
+ * TODO:
+ * - File resizing.
+ */
+
+#include <linux/suspend.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/mount.h>
+#include <linux/statfs.h>
+#include <linux/syscalls.h>
+#include <linux/namei.h>
+#include <linux/fs.h>
+
+#include "suspend2.h"
+#include "suspend2_common.h"
+#include "version.h"
+#include "proc.h"
+#include "modules.h"
+#include "ui.h"
+#include "extent.h"
+#include "io.h"
+#include "storage.h"
+#include "block_io.h"
+
+static struct suspend_module_ops filewriterops;
+
+/* Details of our target.  */
+
+char filewriter_target[256];
+static struct inode *target_inode;
+static struct file *target_file;
+static struct block_device *target_bdev;
+static int used_devt = 0;
+static sector_t target_firstblock = 0;
+static int target_storage_available = 0;
+static int target_claim = 0;
+
+static char HaveImage[] = "HaveImage\n";
+static char NoImage[] =   "Suspend2\n";
+static const int resumed_before_byte = sizeof(HaveImage) + 1;
+#define sig_size resumed_before_byte
+
+extern dev_t ROOT_DEV;
+extern char *__initdata root_device_name;
+
+/* Header_pages must be big enough for signature */
+static int header_pages, main_pages;
+
+#define target_is_normal_file() (S_ISREG(target_inode->i_mode))
+
+static struct suspend_bdev_info devinfo;
+
+static void set_devinfo(struct block_device *bdev, int target_blkbits)
+{
+	devinfo.bdev = bdev;
+	if (!target_blkbits) {
+		devinfo.bmap_shift = devinfo.blocks_per_page = 0;
+	} else {
+		devinfo.bmap_shift = target_blkbits - 9;
+		devinfo.blocks_per_page = (1 << (PAGE_SHIFT - target_blkbits));
+	}
+}
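+
+/*
+ * Worked example (illustrative only): for a filesystem with 1KB
+ * blocks, target_blkbits == 10, so bmap_shift == 10 - 9 == 1 (two
+ * 512-byte sectors per block) and, with 4KB pages,
+ * blocks_per_page == 1 << (12 - 10) == 4. A block number returned
+ * by bmap() becomes a sector number via "block << bmap_shift".
+ */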
+
+/* Extent chain for blocks */
+static struct extent_chain block_chain;
+
+/* Signature operations */
+enum {
+	GET_IMAGE_EXISTS,
+	INVALIDATE,
+	MARK_RESUME_ATTEMPTED,
+};
+
+/* Helpers. */
+
+static int filewriter_storage_available(void)
+{
+	int result = 0;
+
+	if (!target_inode)
+		return 0;
+
+	switch (target_inode->i_mode & S_IFMT) {
+		case S_IFSOCK:
+		case S_IFCHR:
+		case S_IFIFO: /* Socket, Char, Fifo */
+			return -1;
+		case S_IFREG: /* Regular file: current size - holes + free space on part */
+			result = target_storage_available;
+			break;
+		case S_IFBLK: /* Block device */
+			if (target_bdev->bd_disk) {
+				if (target_bdev->bd_part)
+					result = (unsigned long)target_bdev->bd_part->nr_sects >> (PAGE_SHIFT - 9);
+				else
+					result = (unsigned long)target_bdev->bd_disk->capacity >> (PAGE_SHIFT - 9);
+			} else {
+				printk("bdev->bd_disk null.\n");
+				return 0;
+			}
+	}
+
+	return result;
+}
+
+static int has_contiguous_blocks(int page_num)
+{
+	int j;
+	sector_t last = 0;
+
+	for (j = 0; j < devinfo.blocks_per_page; j++) {
+		sector_t this = bmap(target_inode,
+				page_num * devinfo.blocks_per_page + j);
+
+		if (!this || (last && (last + 1) != this))
+			break;
+
+		last = this;
+	}
+			
+	return (j == devinfo.blocks_per_page);
+}
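+
+/*
+ * Example (hypothetical mapping): with blocks_per_page == 4, page 3
+ * of the file covers file blocks 12-15. If bmap() maps those to
+ * on-disk blocks 200, 201, 202 and 203, the page is usable as one
+ * contiguous run; if block 14 instead mapped to 300, the run breaks
+ * and the page is treated as unmappable.
+ */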
+
+static int size_ignoring_sparseness(void)
+{
+	int mappable = 0, i;
+	
+	if (target_is_normal_file()) {
+		for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT) ; i++)
+			if (has_contiguous_blocks(i))
+				mappable++;
+	
+		return mappable;
+	} else
+		return filewriter_storage_available();
+}
+
+static void get_main_pool_phys_params(void)
+{
+	int i;
+	
+	if (block_chain.first)
+		put_extent_chain(&block_chain);
+
+	if (target_is_normal_file()) {
+		int extent_min = -1, extent_max = -1;
+
+		for (i = 0;
+		     i < (target_inode->i_size >> PAGE_SHIFT);
+		     i++) {
+			sector_t new_sector;
+
+			if (!has_contiguous_blocks(i))
+				continue;
+
+			new_sector = bmap(target_inode,
+				(i * devinfo.blocks_per_page));
+
+			/* 
+			 * I'd love to be able to fill in holes and resize 
+			 * files, but not yet...
+			 */
+
+			if (new_sector == extent_max + 1)
+				extent_max += devinfo.blocks_per_page;
+			else {
+				if (extent_min > -1) {
+					if (test_action_state(SUSPEND_TEST_BIO))
+						printk("Adding extent %d-%d.\n",
+							extent_min << devinfo.bmap_shift,
+						        ((extent_max + 1) << devinfo.bmap_shift) - 1);
+					append_extent_to_extent_chain(
+						&block_chain,
+						extent_min,
+						extent_max);
+				}
+				extent_min = new_sector;
+				extent_max = extent_min + devinfo.blocks_per_page - 1;
+			}
+		}
+		if (extent_min > -1) {
+			append_extent_to_extent_chain(&block_chain,
+				       extent_min, extent_max);
+			if (test_action_state(SUSPEND_TEST_BIO))
+				printk("Adding extent %d-%d.\n",
+					extent_min << devinfo.bmap_shift,
+					((extent_max + 1) << devinfo.bmap_shift) - 1);
+		}
+
+	} else
+		if (target_storage_available > 0) {
+			append_extent_to_extent_chain(&block_chain,
+			 0, 
+			 min(main_pages, target_storage_available) * devinfo.blocks_per_page - 1);
+		}
+}
+
+static void get_target_info(int get_size)
+{
+	if (!target_bdev || IS_ERR(target_bdev)) {
+		target_inode = NULL;
+		set_devinfo(NULL, 0);
+		target_storage_available = 0;
+	} else {
+		if (!target_inode)
+			target_inode = target_bdev->bd_inode;
+		set_devinfo(target_bdev, target_inode->i_blkbits);
+		if (get_size)
+			target_storage_available = size_ignoring_sparseness();
+	}	
+}
+
+static void filewriter_cleanup(int finishing_cycle)
+{
+	if (target_bdev) {
+		if (target_claim) {
+			bd_release(target_bdev);
+			target_claim = 0;
+		}
+
+		if (used_devt) {
+			blkdev_put(target_bdev);
+			used_devt = 0;
+		}
+		target_bdev = NULL;
+		get_target_info(0);
+	}
+
+	if (target_file) {
+		filp_close(target_file, NULL);
+		target_file = NULL;
+	}
+}
+
+static void filewriter_get_target_info(char *target, int get_size,
+		int resume2)
+{
+	if (target_file)
+		filewriter_cleanup(0);
+
+	if (!target || !strlen(target))
+		return;
+
+	target_file = filp_open(target, O_RDWR, 0);
+
+	if (IS_ERR(target_file) || !target_file) {
+		dev_t resume_dev_t;
+
+		if (!resume2) {
+			printk("Open file %s returned %p.\n", target, target_file);
+			target_file = NULL;
+			return;
+		}
+
+		target_file = NULL;
+		resume_dev_t = name_to_dev_t(target);
+		if (!resume_dev_t) {
+			struct kstat stat;
+			int error;
+
+			printk("Open file %s returned %p and "
+				"name_to_dev_t failed.\n",
+				target, target_file);
+			error = vfs_stat(target, &stat);
+			if (error) {
+				printk("stat() on the file also failed. "
+					"Nothing more we can do.\n");
+				return;
+			}
+			resume_dev_t = stat.rdev;
+		}
+	     	target_bdev = open_by_devnum(resume_dev_t, FMODE_READ);
+		if (IS_ERR(target_bdev)) {
+			printk("Got a dev_num (%lx) but failed to open it.\n",
+					(unsigned long) resume_dev_t);
+			return;
+		}
+		used_devt = 1;
+		target_inode = target_bdev->bd_inode;
+	} else
+		target_inode = target_file->f_mapping->host;
+
+	if (S_ISLNK(target_inode->i_mode) ||
+	    S_ISDIR(target_inode->i_mode) ||
+	    S_ISSOCK(target_inode->i_mode) ||
+	    S_ISFIFO(target_inode->i_mode)) {
+		printk("The filewriter works with regular files, character files and block devices.\n");
+		goto cleanup;
+	}
+
+	if (!used_devt) {
+		if (S_ISBLK(target_inode->i_mode)) {
+			target_bdev = I_BDEV(target_inode);
+			if (!bd_claim(target_bdev, &filewriterops))
+				target_claim = 1;
+		} else
+			target_bdev = target_inode->i_sb->s_bdev;
+	}
+
+	get_target_info(get_size);
+
+	if (!resume2)
+		target_firstblock = bmap(target_inode, 0) << devinfo.bmap_shift;
+	
+	return;
+cleanup:
+	target_inode = NULL;
+	if (target_file) {
+		filp_close(target_file, NULL);
+		target_file = NULL;
+	}
+	get_target_info(0);
+}
+
+int parse_signature(char *header)
+{
+	int have_image = !memcmp(HaveImage, header, sizeof(HaveImage) - 1);
+	int no_image_header = !memcmp(NoImage, header, sizeof(NoImage) - 1);
+
+	if (no_image_header)
+		return 0;
+
+	if (!have_image)
+		return -1;
+
+	if (header[resumed_before_byte] & 1)
+		set_suspend_state(SUSPEND_RESUMED_BEFORE);
+	else
+		clear_suspend_state(SUSPEND_RESUMED_BEFORE);
+
+	return 1;
+}
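+
+/*
+ * Signature layout read above (a sketch; offsets follow HaveImage,
+ * NoImage and resumed_before_byte as defined earlier in this file):
+ *
+ *   bytes 0..sig_size-1       "HaveImage\n" or "Suspend2\n"
+ *   byte resumed_before_byte  bit 0 set => a resume from this image
+ *                             has already been attempted
+ */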
+
+/* prepare_signature */
+
+static int prepare_signature(char *current_header)
+{
+	/* 
+	 * Explicitly put the \0 that clears the 'tried to resume from
+	 * this image before' flag.
+	 */
+	strncpy(current_header, HaveImage, sizeof(HaveImage));
+	current_header[resumed_before_byte] = 0;
+	return 0;
+}
+
+static int filewriter_storage_allocated(void)
+{
+	int result;
+
+	if (!target_inode)
+		return 0;
+
+	if (target_is_normal_file()) {
+		result = (int) target_storage_available;
+	} else
+		result = header_pages + main_pages;
+
+	return result;
+}
+
+static int filewriter_release_storage(void)
+{
+	if ((test_action_state(SUSPEND_KEEP_IMAGE)) &&
+	     test_suspend_state(SUSPEND_NOW_RESUMING))
+		return 0;
+
+	put_extent_chain(&block_chain);
+
+	header_pages = main_pages = 0;
+	return 0;
+}
+
+static int filewriter_allocate_header_space(int space_requested)
+{
+	int i;
+
+	/* We only steal pages from the main pool. If it doesn't have any yet... */
+	
+	if (!block_chain.first)
+		return 0;
+
+	extent_state_goto_start(&suspend_writer_posn);
+	
+	for (i = 0; i < space_requested; i++) {
+		if (suspend_bio_ops.forward_one_page())
+			return -ENOSPC;
+	}
+
+	/* The end of header pages will be the start of pageset 2 */
+	extent_state_save(&suspend_writer_posn, &suspend_writer_posn_save[2]);
+	header_pages = space_requested;
+	return 0;
+}
+
+static int filewriter_allocate_storage(int space_requested)
+{
+	int result = 0, prev_header_pages;
+	/* FIXME This looks wrong */
+	int blocks_to_get = (space_requested << devinfo.bmap_shift) - block_chain.size;
+	
+	/* Only release_storage reduces the size */
+	if (blocks_to_get < 1)
+		return 0;
+
+	main_pages = space_requested;
+
+	get_main_pool_phys_params();
+
+	suspend_message(SUSPEND_WRITER, SUSPEND_MEDIUM, 0,
+		"Finished with block_chain.size == %d.\n",
+		block_chain.size);
+
+	if (block_chain.size < (header_pages + main_pages))
+		result = -ENOSPC;
+
+	prev_header_pages = header_pages;
+	header_pages = 0;
+	filewriter_allocate_header_space(prev_header_pages);
+	return result;
+}
+
+static int filewriter_write_header_init(void)
+{
+	char new_sig[sig_size];
+	
+	extent_state_goto_start(&suspend_writer_posn);
+
+	suspend_writer_buffer = (char *) get_zeroed_page(GFP_ATOMIC);
+	suspend_writer_buffer_posn = 0;
+
+	/* We change it once the whole header is written */
+	strcpy(new_sig, NoImage);
+	suspend_bio_ops.write_header_chunk(new_sig, sig_size);
+
+	/* Info needed to bootstrap goes at the start of the header.
+	 * First we save the basic info needed for reading, including the number
+	 * of header pages. Then we save the structs containing data needed
+	 * for reading the header pages back.
+	 * Note that even if header pages take more than one page, when we
+	 * read back the info, we will have restored the location of the
+	 * next header page by the time we go to use it.
+	 */
+	suspend_bio_ops.write_header_chunk((char *) &suspend_writer_posn_save, 
+			3 * sizeof(struct extent_iterate_saved_state));
+
+	suspend_bio_ops.write_header_chunk((char *) &devinfo,
+			sizeof(devinfo));
+
+	serialise_extent_chain(&block_chain);
+	
+	return 0;
+}
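+
+/*
+ * Resulting on-disk header layout (a sketch of the writes above, in
+ * order; filewriter_read_header_init() consumes the same fields):
+ *
+ *   [ signature (sig_size bytes)                               ]
+ *   [ suspend_writer_posn_save: 3 * extent_iterate_saved_state ]
+ *   [ devinfo: struct suspend_bdev_info                        ]
+ *   [ serialised block_chain extents                           ]
+ */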
+
+static int filewriter_write_header_cleanup(void)
+{
+	/* Write any unsaved data */
+	if (suspend_writer_buffer_posn)
+		suspend_bio_ops.write_header_chunk_finish();
+
+	suspend_bio_ops.finish_all_io();
+
+	extent_state_goto_start(&suspend_writer_posn);
+	suspend_bio_ops.forward_one_page();
+
+	/* Adjust image header */
+	suspend_bio_ops.bdev_page_io(READ, target_bdev,
+			target_firstblock,
+			virt_to_page(suspend_writer_buffer));
+
+	prepare_signature(suspend_writer_buffer);
+		
+	suspend_bio_ops.bdev_page_io(WRITE, target_bdev,
+			target_firstblock,
+			virt_to_page(suspend_writer_buffer));
+
+	free_page((unsigned long) suspend_writer_buffer);
+	suspend_writer_buffer = NULL;
+	
+	suspend_bio_ops.finish_all_io();
+
+	return 0;
+}
+
+/* HEADER READING */
+
+#ifdef CONFIG_DEVFS_FS
+int  create_dev(char *name, dev_t dev, char *devfs_name);
+#else
+static int create_dev(char *name, dev_t dev, char *devfs_name)
+{
+	sys_unlink(name);
+	return sys_mknod(name, S_IFBLK|0600, new_encode_dev(dev));
+}
+#endif
+
+static int rd_init(void)
+{
+	suspend_writer_buffer_posn = 0;
+
+	create_dev("/dev/root", ROOT_DEV, root_device_name);
+	create_dev("/dev/ram", MKDEV(RAMDISK_MAJOR, 0), NULL);
+
+	suspend_read_fd = sys_open("/dev/root", O_RDONLY, 0);
+	if (suspend_read_fd < 0)
+		goto out;
+	
+	sys_read(suspend_read_fd, suspend_writer_buffer, BLOCK_SIZE);
+
+	memcpy(&suspend_writer_posn_save,
+		suspend_writer_buffer + suspend_writer_buffer_posn,
+		sizeof(suspend_writer_posn_save));
+	
+	suspend_writer_buffer_posn += sizeof(suspend_writer_posn_save);
+
+	return 0;
+out:
+	sys_unlink("/dev/ram");
+	sys_unlink("/dev/root");
+	return -EIO;
+}
+
+static int file_init(void)
+{
+	suspend_writer_buffer_posn = sig_size;
+
+	/* Read filewriter configuration */
+	suspend_bio_ops.bdev_page_io(READ, target_bdev,
+			target_firstblock,
+			virt_to_page((unsigned long) suspend_writer_buffer));
+	
+	return 0;
+}
+
+/*
+ * read_header_init()
+ * 
+ * Ramdisk support based heavily on init/do_mounts_rd.c
+ *
+ * Description:
+ * 1. Attempt to read the device specified with resume2=.
+ * 2. Check the contents of the header for our signature.
+ * 3. Warn, ignore, reset and/or continue as appropriate.
+ * 4. If continuing, read the filewriter configuration section
+ *    of the header and set up block device info so we can read
+ *    the rest of the header & image.
+ *
+ * Returns:
+ * May not return if the user chooses to reboot at a warning.
+ * -EINVAL if cannot resume at this time. Booting should continue
+ * normally.
+ */
+
+static int filewriter_read_header_init(void)
+{
+	int result;
+	struct block_device *tmp;
+
+	*(suspend_bio_ops.need_extra_next) = 1;
+
+	suspend_writer_buffer = (char *) get_zeroed_page(GFP_ATOMIC);
+	
+	if (test_suspend_state(SUSPEND_TRY_RESUME_RD))
+		result = rd_init();
+	else
+		result = file_init();
+	
+	if (result)
+		return result;
+
+	suspend_writer_buffer_posn = sig_size;
+	memcpy(&suspend_writer_posn_save,
+			suspend_writer_buffer + suspend_writer_buffer_posn,
+			3 * sizeof(struct extent_iterate_saved_state));
+	
+	suspend_writer_buffer_posn += 3 * sizeof(struct extent_iterate_saved_state);
+
+	tmp = devinfo.bdev;
+
+	memcpy(&devinfo,
+			suspend_writer_buffer + suspend_writer_buffer_posn,
+			sizeof(struct suspend_bdev_info));
+	devinfo.bdev = tmp;
+	suspend_writer_buffer_posn += sizeof(struct suspend_bdev_info);
+
+	extent_state_goto_start(&suspend_writer_posn);
+	load_extent_chain(&block_chain);
+
+	return 0;
+}
+
+static int filewriter_read_header_cleanup(void)
+{
+	free_page((unsigned long) suspend_writer_buffer);
+	suspend_writer_buffer = NULL;
+	return 0;
+}
+
+static int filewriter_signature_op(int op)
+{
+	char *cur;
+	int result = 0, changed = 0;
+	
+	if (!target_bdev)
+		return -1;
+
+	cur = (char *) get_zeroed_page(GFP_ATOMIC);
+	if (!cur) {
+		printk("Unable to allocate a page for reading the image signature.\n");
+		return -ENOMEM;
+	}
+
+	suspend_bio_ops.bdev_page_io(READ, target_bdev,
+			target_firstblock,
+			virt_to_page(cur));
+
+	result = parse_signature(cur);
+		
+	switch (op) {
+		case INVALIDATE:
+			if (result == -1)
+				goto out;
+
+			strcpy(cur, NoImage);
+			cur[resumed_before_byte] = 0;
+			result = changed = 1;
+			break;
+		case MARK_RESUME_ATTEMPTED:
+			if (result == 1) {
+				cur[resumed_before_byte] |= 1;
+				changed = 1;
+			}
+			break;
+	}
+
+	if (changed)
+		suspend_bio_ops.bdev_page_io(WRITE, target_bdev,
+				target_firstblock,
+				virt_to_page(cur));
+
+out:
+	suspend_bio_ops.finish_all_io();
+	free_page((unsigned long) cur);
+	return result;
+}
+
+/*
+ * workspace_size
+ *
+ * Description:
+ * Returns the number of bytes of RAM needed for this
+ * code to do its work. (Used when calculating whether
+ * we have enough memory to be able to suspend & resume).
+ *
+ */
+static unsigned long filewriter_memory_needed(void)
+{
+	return 0;
+}
+
+/* Print debug info
+ *
+ * Description:
+ */
+
+static int filewriter_print_debug_stats(char *buffer, int size)
+{
+	int len = 0;
+	
+	if (suspend_active_writer != &filewriterops) {
+		len = snprintf_used(buffer, size, "- Filewriter inactive.\n");
+		return len;
+	}
+
+	len = snprintf_used(buffer, size, "- Filewriter active.\n");
+
+	len += snprintf_used(buffer + len, size - len,
+			"  Storage available for image: %d pages.\n",
+			filewriter_storage_allocated());
+
+	return len;
+}
+
+/*
+ * Storage needed
+ *
+ * Returns amount of space in the image header required
+ * for the filewriter's data.
+ *
+ * We ensure the space is allocated, but actually save the
+ * data from write_header_init and therefore don't also define a
+ * save_config_info routine.
+ */
+static unsigned long filewriter_storage_needed(void)
+{
+	return strlen(filewriter_target) + 1;
+}
+
+/* 
+ * filewriter_invalidate_image
+ * 
+ */
+static int filewriter_invalidate_image(void)
+{
+	int result;
+
+	if (nr_suspends > 0)
+		filewriter_release_storage();
+
+	result = filewriter_signature_op(INVALIDATE);
+	if (result == 1 && !nr_suspends)
+		printk(KERN_WARNING name_suspend "Image invalidated.\n");
+
+	return result;
+}
+
+/*
+ * Image_exists
+ *
+ */
+
+static int filewriter_image_exists(void)
+{
+	return filewriter_signature_op(GET_IMAGE_EXISTS);
+}
+
+/*
+ * Mark resume attempted.
+ *
+ * Record that we tried to resume from this image.
+ */
+
+static void filewriter_mark_resume_attempted(void)
+{
+	filewriter_signature_op(MARK_RESUME_ATTEMPTED);
+}
+
+static void filewriter_set_resume2(void)
+{
+	char *buffer = (char *) get_zeroed_page(GFP_ATOMIC);
+	char *buffer2 = (char *) get_zeroed_page(GFP_ATOMIC);
+	int offset = 0;
+
+	if (target_bdev) {
+		/* target_inode is always set when target_bdev is. */
+		unsigned long sector = bmap(target_inode, 0);
+
+		set_devinfo(target_bdev, target_inode->i_blkbits);
+
+		bdevname(target_bdev, buffer2);
+		offset += snprintf(buffer + offset, PAGE_SIZE - offset, 
+				"/dev/%s", buffer2);
+		
+		if (sector)
+			offset += snprintf(buffer + offset, PAGE_SIZE - offset,
+				":0x%lx", sector << devinfo.bmap_shift);
+	} else
+		offset += snprintf(buffer + offset, PAGE_SIZE - offset,
+				"%s is not a valid target.", filewriter_target);
+			
+	sprintf(resume2_file, "file:%s", buffer);
+
+	free_page((unsigned long) buffer);
+	free_page((unsigned long) buffer2);
+
+	attempt_to_parse_resume_device();
+}
+
+static int __test_filewriter_target(char *target, int resume_time)
+{
+	filewriter_get_target_info(target, 0, 0);
+	if (filewriter_signature_op(GET_IMAGE_EXISTS) > -1) {
+		printk(name_suspend "Filewriter: File signature found.\n");
+		if (!resume_time)
+			filewriter_set_resume2();
+		
+		suspend_bio_ops.set_devinfo(&devinfo);
+		suspend_writer_posn.chains = &block_chain;
+		suspend_writer_posn.num_chains = 1;
+
+		return 0;
+	}
+
+	if (*target)
+		printk(KERN_ERR name_suspend
+			"Filewriter: Sorry. No signature found at %s.\n",
+			target);
+	else
+		printk(KERN_ERR name_suspend
+			"Filewriter: Sorry. No signature found.\n");
+
+	return 1;
+}
+
+static void test_filewriter_target(void)
+{
+	__test_filewriter_target(filewriter_target, 0);
+}
+
+/*
+ * Parse Image Location
+ *
+ * Attempt to parse a resume2= parameter.
+ * The filewriter accepts:
+ * resume2=file:DEVNAME[:FIRSTBLOCK][@BLOCKSIZE]
+ *
+ * Where:
+ * DEVNAME is convertible to a dev_t by name_to_dev_t.
+ * FIRSTBLOCK is the location of the first block in the file.
+ * BLOCKSIZE is the device's logical blocksize: >= SECTOR_SIZE,
+ * <= PAGE_SIZE and a multiple of SECTOR_SIZE.
+ * Data is validated by attempting to read a header from the
+ * location given. Failure will result in filewriter refusing to
+ * save an image, and a reboot with correct parameters will be
+ * necessary.
+ */
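+
+/*
+ * Worked example (hypothetical values):
+ *
+ *   resume2=file:/dev/hda2:0x1a0@1024
+ *
+ * parses as DEVNAME "/dev/hda2", FIRSTBLOCK 0x1a0 (after the ':')
+ * and BLOCKSIZE 1024 (after the '@'; must be a multiple of
+ * SECTOR_SIZE). The ':' and '@' are overwritten with '\0' while
+ * parsing and restored on the way out.
+ */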
+
+static int filewriter_parse_sig_location(char *commandline, int only_writer)
+{
+	char *thischar, *devstart = NULL, *colon = NULL, *at_symbol = NULL;
+	int result = -EINVAL, target_blocksize = 0;
+
+	if (strncmp(commandline, "file:", 5)) {
+		if (!only_writer)
+			return 1;
+	} else
+		commandline += 5;
+
+	/* 
+	 * Don't check signature again if we're beginning a cycle. If we already
+	 * did the initialisation successfully, assume we'll be okay when it comes
+	 * to resuming.
+	 */
+	if (target_bdev)
+		return 0;
+	
+	devstart = thischar = commandline;
+	while ((*thischar != ':') && (*thischar != '@') &&
+		((thischar - commandline) < 250) && (*thischar))
+		thischar++;
+
+	if (*thischar == ':') {
+		colon = thischar;
+		*colon = 0;
+		thischar++;
+	}
+
+	while ((*thischar != '@') && ((thischar - commandline) < 250) && (*thischar))
+		thischar++;
+
+	if (*thischar == '@') {
+		at_symbol = thischar;
+		*at_symbol = 0;
+	}
+	
+	if (colon)
+		target_firstblock = (int) simple_strtoul(colon + 1, NULL, 0);
+	else
+		target_firstblock = 0;
+
+	if (at_symbol) {
+		target_blocksize = (int) simple_strtoul(at_symbol + 1, NULL, 0);
+		if (target_blocksize & (SECTOR_SIZE - 1)) {
+			printk("Filewriter: Blocksizes are multiples of %d.\n", SECTOR_SIZE);
+			result = -EINVAL;
+			goto out;
+		}
+	}
+	
+	filewriter_get_target_info(commandline, 0, 1);
+
+	if (!target_bdev || IS_ERR(target_bdev)) {
+		target_bdev = NULL;
+		result = -1;
+		goto out;
+	}
+
+	result = __test_filewriter_target(commandline, 1);
+
+out:
+	if (colon)
+		*colon = ':';
+	if (at_symbol)
+		*at_symbol = '@';
+
+	return result;
+}
+
+/* filewriter_save_config_info
+ *
+ * Description:	Save the target's name, not for resume time, but for all_settings.
+ * Arguments:	Buffer:		Pointer to a buffer of size PAGE_SIZE.
+ * Returns:	Number of bytes used for saving our data.
+ */
+
+static int filewriter_save_config_info(char *buffer)
+{
+	strcpy(buffer, filewriter_target);
+	return strlen(filewriter_target) + 1;
+}
+
+/* filewriter_load_config_info
+ *
+ * Description:	Reload target's name.
+ * Arguments:	Buffer:		Pointer to the start of the data.
+ *		Size:		Number of bytes that were saved.
+ */
+
+static void filewriter_load_config_info(char *buffer, int size)
+{
+	strcpy(filewriter_target, buffer);
+}
+
+static int filewriter_initialise(int starting_cycle)
+{
+	int result = 0;
+
+	if (starting_cycle) {
+		if (suspend_active_writer != &filewriterops)
+			return 0;
+
+		if (!*filewriter_target) {
+			printk("Filewriter is the active writer, but no filename has been set.\n");
+			return 1;
+		}
+	}
+
+	if (filewriter_target)
+		filewriter_get_target_info(filewriter_target, starting_cycle, 0);
+
+	if (starting_cycle && (filewriter_image_exists() == -1)) {
+		printk("%s is does not have a valid signature for suspending.\n",
+				filewriter_target);
+		result = 1;
+	}
+
+	return result;
+}
+
+static struct suspend_proc_data filewriter_proc_data[] = {
+
+	{
+	 .filename			= "filewriter_target",
+	 .permissions			= PROC_RW,
+	 .type				= SUSPEND_PROC_DATA_STRING,
+	 .needs_storage_manager		= 2,
+	 .data = {
+		 .string = {
+			 .variable	= filewriter_target,
+			 .max_length	= 256,
+		 }
+	 },
+	 .write_proc			= test_filewriter_target,
+	},
+
+	{ .filename			= "disable_filewriter",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_INTEGER,
+	  .data = {
+		.integer = {
+			.variable	= &filewriterops.disabled,
+			.minimum	= 0,
+			.maximum	= 1,
+		}
+	  },
+	  .write_proc			= attempt_to_parse_resume_device2,
+	}
+};
+
+static struct suspend_module_ops filewriterops = {
+	.type					= WRITER_PLUGIN,
+	.name					= "File Writer",
+	.module					= THIS_MODULE,
+	.memory_needed				= filewriter_memory_needed,
+	.print_debug_info			= filewriter_print_debug_stats,
+	.save_config_info			= filewriter_save_config_info,
+	.load_config_info			= filewriter_load_config_info,
+	.storage_needed				= filewriter_storage_needed,
+	.initialise				= filewriter_initialise,
+	.cleanup				= filewriter_cleanup,
+
+	.ops = {
+		.writer = {
+		 .storage_available 	= filewriter_storage_available,
+		 .storage_allocated	= filewriter_storage_allocated,
+		 .release_storage	= filewriter_release_storage,
+		 .allocate_header_space	= filewriter_allocate_header_space,
+		 .allocate_storage	= filewriter_allocate_storage,
+		 .image_exists		= filewriter_image_exists,
+		 .mark_resume_attempted	= filewriter_mark_resume_attempted,
+		 .write_header_init	= filewriter_write_header_init,
+		 .write_header_cleanup	= filewriter_write_header_cleanup,
+		 .read_header_init	= filewriter_read_header_init,
+		 .read_header_cleanup	= filewriter_read_header_cleanup,
+		 .invalidate_image	= filewriter_invalidate_image,
+		 .parse_sig_location	= filewriter_parse_sig_location,
+		}
+	}
+};
+
+/* ---- Registration ---- */
+static __init int filewriter_load(void)
+{
+	int result;
+	int i, numfiles = sizeof(filewriter_proc_data) / sizeof(struct suspend_proc_data);
+	
+	printk("Suspend2 FileWriter loading.\n");
+
+	filewriterops.read_init = suspend_bio_ops.read_init;
+	filewriterops.ops.writer.read_chunk = suspend_bio_ops.read_chunk;
+	filewriterops.read_cleanup = suspend_bio_ops.read_cleanup;
+	filewriterops.write_init = suspend_bio_ops.write_init;
+	filewriterops.ops.writer.write_chunk = suspend_bio_ops.write_chunk;
+	filewriterops.write_cleanup = suspend_bio_ops.write_cleanup;
+	filewriterops.ops.writer.read_header_chunk =
+		suspend_bio_ops.read_header_chunk;
+	filewriterops.ops.writer.write_header_chunk =
+		suspend_bio_ops.write_header_chunk;
+
+	if (!(result = suspend_register_module(&filewriterops))) {
+		for (i = 0; i < numfiles; i++)
+			suspend_register_procfile(&filewriter_proc_data[i]);
+	} else
+		printk("Suspend2 FileWriter unable to register!\n");
+
+	return result;
+}
+
+#ifdef MODULE
+static __exit void filewriter_unload(void)
+{
+	int i, numfiles = sizeof(filewriter_proc_data) / sizeof(struct suspend_proc_data);
+
+	printk("Suspend2 FileWriter unloading.\n");
+
+	for (i = 0; i < numfiles; i++)
+		suspend_unregister_procfile(&filewriter_proc_data[i]);
+	suspend_unregister_module(&filewriterops);
+}
+
+module_init(filewriter_load);
+module_exit(filewriter_unload);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Nigel Cunningham");
+MODULE_DESCRIPTION("Suspend2 filewriter");
+#else
+late_initcall(filewriter_load);
+#endif
diff -urN oldtree/kernel/power/suspend_swap.c newtree/kernel/power/suspend_swap.c
--- oldtree/kernel/power/suspend_swap.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/suspend_swap.c	2006-03-08 16:53:54.012031500 +0000
@@ -0,0 +1,1153 @@
+/*
+ * kernel/power/suspend_swap.c
+ *
+ * Copyright 2004-2005 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * Distributed under GPLv2.
+ * 
+ * This file encapsulates functions for usage of swap space as a
+ * backing store.
+ */
+
+#include <linux/suspend.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/swapops.h>
+#include <linux/swap.h>
+
+#include "suspend2.h"
+#include "suspend2_common.h"
+#include "version.h"
+#include "proc.h"
+#include "modules.h"
+#include "io.h"
+#include "ui.h"
+#include "extent.h"
+#include "block_io.h"
+
+static struct suspend_module_ops swapwriterops;
+
+#define SIGNATURE_VER 6
+
+/* --- Struct of pages stored on disk */
+
+union diskpage {
+	union swap_header swh;	/* swh.magic is the only member used */
+};
+
+union p_diskpage {
+	union diskpage *pointer;
+	char *ptr;
+	unsigned long address;
+};
+
+/* Devices used for swap */
+static struct suspend_bdev_info devinfo[MAX_SWAPFILES];
+
+/* Extent chains for swap & blocks */
+struct extent_chain swapextents;
+struct extent_chain block_chain[MAX_SWAPFILES];
+
+static dev_t header_dev_t;
+static struct block_device *header_block_device;
+static unsigned long headerblock;
+
+/* For swapfile automatically swapon/off'd. */
+static char swapfilename[SWAP_FILENAME_MAXLENGTH] = "";
+extern asmlinkage long sys_swapon(const char *specialfile, int swap_flags);
+extern asmlinkage long sys_swapoff(const char *specialfile);
+static int suspend_swapon_status;
+
+/* Header Page Information */
+static int header_pages_allocated;
+
+/* User Specified Parameters. */
+
+static unsigned long resume_firstblock;
+static int resume_blocksize;
+static dev_t resume_dev_t;
+static struct block_device *resume_block_device;
+
+struct sysinfo swapinfo;
+static int swapwriter_invalidate_image(void);
+
+/* Block devices open. */
+struct bdev_opened
+{
+	dev_t device;
+	struct block_device *bdev;
+	int set_swapinfo;
+	int claimed;
+};
+
+/* 
+ * Entry MAX_SWAPFILES is the resume block device, which may
+ * not be a swap device enabled when we suspend.
+ * Entry MAX_SWAPFILES + 1 is the header block device, which
+ * is needed before we find out which slot it occupies.
+ */
+static struct bdev_opened *bdev_info_list[MAX_SWAPFILES + 2];
+
+static void close_bdev(int i)
+{
+	struct bdev_opened *this = bdev_info_list[i];
+
+	if (this->claimed)
+		bd_release(this->bdev);
+
+	/* Release our reference. */
+	blkdev_put(this->bdev);
+
+	/* Free our info. */
+	kfree(this);
+
+	bdev_info_list[i] = NULL;
+}
+
+static void close_bdevs(void)
+{
+	int i;
+
+	for (i = 0; i < MAX_SWAPFILES; i++)
+		if (bdev_info_list[i])
+			close_bdev(i);
+
+	resume_block_device = header_block_device = NULL;
+}
+
+static struct block_device *open_bdev(int index, dev_t device)
+{
+	struct bdev_opened *this;
+	struct block_device *bdev;
+
+	if (bdev_info_list[index] && (bdev_info_list[index]->device == device)) {
+		bdev = bdev_info_list[index]->bdev;
+		return bdev;
+	}
+	
+	if (bdev_info_list[index] && bdev_info_list[index]->device != device)
+		close_bdev(index);
+
+	bdev = open_by_devnum(device, FMODE_READ);
+
+	if (IS_ERR(bdev) || !bdev) {
+		suspend_early_boot_message(1, SUSPEND_CONTINUE_REQ,
+				"Failed to get access to block device "
+				"%x.\n You could be "
+				"booting with a 2.6 kernel when you "
+				"suspended a 2.4 kernel.", device);
+		return ERR_PTR(-EINVAL);
+	}
+
+	this = kmalloc(sizeof(struct bdev_opened), GFP_KERNEL);
+	BUG_ON(!this);
+
+	bdev_info_list[index] = this;
+	this->device = device;
+	this->bdev = bdev;
+	this->claimed = 0;
+	this->set_swapinfo = 0;
+
+	return bdev;
+}
+
+/* Must be silent - might be called from cat /proc/suspend/debug_info
+ * Returns 0 if was off, -EBUSY if was on, error value otherwise.
+ */
+static int enable_swapfile(void)
+{
+	int activateswapresult = -EINVAL;
+
+	if (suspend_swapon_status)
+		return 0;
+
+	if (swapfilename[0]) {
+		/* Attempt to swap on with maximum priority */
+		activateswapresult = sys_swapon(swapfilename, 0xFFFF);
+		if ((activateswapresult) && (activateswapresult != -EBUSY))
+			printk(name_suspend
+				"The swapfile/partition specified by "
+				"/proc/suspend/swapfile (%s) could not"
+				" be turned on (error %d). Attempting "
+				"to continue.\n",
+				swapfilename, activateswapresult);
+		if (!activateswapresult)
+			suspend_swapon_status = 1;
+	}
+	return activateswapresult;
+}
+
+/* Returns 0 if was on, -EINVAL if was off, error value otherwise */
+static int disable_swapfile(void)
+{
+	int result = -EINVAL;
+	
+	if (!suspend_swapon_status)
+		return 0;
+
+	if (swapfilename[0]) {
+		result = sys_swapoff(swapfilename);
+		if (result == -EINVAL)
+	 		return 0;	/* Wasn't on */
+		if (!result)
+			suspend_swapon_status = 0;
+	}
+
+	return result;
+}
+
+static int try_to_parse_resume_device(char *commandline)
+{
+	struct kstat stat;
+	int error;
+
+	resume_dev_t = name_to_dev_t(commandline);
+
+	if (!resume_dev_t) {
+		error = vfs_stat(commandline, &stat);
+		if (!error)
+			resume_dev_t = stat.rdev;
+	}
+
+	if (!resume_dev_t) {
+		if (test_suspend_state(SUSPEND_TRYING_TO_RESUME))
+			suspend_early_boot_message(1, SUSPEND_CONTINUE_REQ,
+			  "Failed to translate \"%s\" into a device id.\n",
+			  commandline);
+		else
+			printk(name_suspend
+			  "Can't translate \"%s\" into a device id yet.\n",
+			  commandline);
+		return 1;
+	}
+
+	if (IS_ERR(resume_block_device =
+	     open_bdev(MAX_SWAPFILES, resume_dev_t))) {
+		suspend_early_boot_message(1, SUSPEND_CONTINUE_REQ,
+			"Failed to get access to \"%s\", where"
+			" the swap header should be found.",
+			commandline);
+		return 1;
+	}
+
+	return 0;
+}
+
+/* 
+ * If we have read part of the image, we might have filled memory with
+ * data that should be zeroed out.
+ */
+static void swapwriter_noresume_reset(void)
+{
+	memset((char *) &devinfo, 0, sizeof(devinfo));
+	close_bdevs();
+}
+
+static int parse_signature(char *header, int restore)
+{
+	int type = -1;
+	
+	if (!memcmp("SWAP-SPACE",header,10))
+		return 0;
+	else if (!memcmp("SWAPSPACE2",header,10))
+		return 1;
+
+	else if (!memcmp("S1SUSP",header,6))
+		type = 4;
+	else if (!memcmp("S2SUSP",header,6))
+		type = 5;
+	
+	else if (!memcmp("z",header,1))
+		type = 12;
+	else if (!memcmp("Z",header,1))
+		type = 13;
+	
+	/* 
+	 * Put bdev of suspend header in last byte of swap header
+	 * (unsigned short)
+	 */
+	if (type > 11) {
+		dev_t *header_ptr = (dev_t *) &header[1];
+		unsigned char *headerblocksize_ptr =
+			(unsigned char *) &header[5];
+		u32 *headerblock_ptr = (u32 *) &header[6];
+		header_dev_t = *header_ptr;
+		/* 
+		 * We are now using the highest bit of the char to indicate
+		 * whether we have attempted to resume from this image before.
+		 */
+		clear_suspend_state(SUSPEND_RESUMED_BEFORE);
+		if (((int) *headerblocksize_ptr) & 0x80)
+			set_suspend_state(SUSPEND_RESUMED_BEFORE);
+		headerblock = (unsigned long) *headerblock_ptr;
+	}
+
+	if ((restore) && (type > 5)) {
+		/* We only reset our own signatures */
+		if (type & 1)
+			memcpy(header,"SWAPSPACE2",10);
+		else
+			memcpy(header,"SWAP-SPACE",10);
+	}
+
+	return type;
+}
+
+/*
+ * prepare_signature
+ */
+
+static int prepare_signature(dev_t bdev, unsigned long block,
+		char *current_header)
+{
+	int current_type = parse_signature(current_header, 0);
+	dev_t *header_ptr = (dev_t *) (&current_header[1]);
+	unsigned long *headerblock_ptr =
+		(unsigned long *) (&current_header[6]);
+
+	if ((current_type > 1) && (current_type < 6))
+		return 1;
+
+	/* At the moment, I don't have a way to handle the block being
+	 * > 32 bits. Not enough room in the signature and no way to
+	 * safely put the data elsewhere. */
+
+	if (BITS_PER_LONG == 64 && ffs(block) > 31) {
+		suspend_prepare_status(DONT_CLEAR_BAR,
+			"Header sector requires 33+ bits. "
+			"Would not be able to resume.");
+		return 1;
+	}
+
+	if (current_type & 1)
+		current_header[0] = 'Z';
+	else
+		current_header[0] = 'z';
+	*header_ptr = bdev;
+	/* prev is the first/last swap page of the resume area */
+	*headerblock_ptr = (unsigned long) block; 
+	return 0;
+}
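+
+/*
+ * Signature byte layout written above (a sketch; the block number is
+ * limited to 32 bits, as checked for 64-bit machines):
+ *
+ *   byte 0      'z' (was SWAP-SPACE) or 'Z' (was SWAPSPACE2)
+ *   bytes 1-4   dev_t of the device holding the image header
+ *   byte 5      flags; bit 7 => resume already attempted
+ *   bytes 6-9   block number of the first header page
+ *
+ * parse_signature() reads these same fields back.
+ */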
+
+static int swapwriter_allocate_storage(int space_requested);
+
+static int swapwriter_allocate_header_space(int space_requested)
+{
+	int i;
+
+	if (!swapextents.size)
+		swapwriter_allocate_storage(space_requested);
+
+	extent_state_goto_start(&suspend_writer_posn);
+	
+	for (i = 0; i < space_requested; i++) {
+		if (suspend_bio_ops.forward_one_page()) {
+			printk("Out of space while seeking to allocate header pages,\n");
+			return -ENOSPC;
+		}
+
+		header_pages_allocated++;
+	}
+
+	/* The end of header pages will be the start of pageset 2 */
+	extent_state_save(&suspend_writer_posn, &suspend_writer_posn_save[2]);
+	return 0;
+}
+
+static void get_main_pool_phys_params(void)
+{
+	struct extent *extentpointer = NULL;
+	unsigned long address;
+	int i, extent_min = -1, extent_max = -1, last_chain = -1;
+	int prev_header_pages_allocated;
+
+	for (i = 0; i < MAX_SWAPFILES; i++)
+		if (block_chain[i].first)
+			put_extent_chain(&block_chain[i]);
+
+	extent_for_each(&swapextents, extentpointer, address) {
+		swp_entry_t swap_address = extent_val_to_swap_entry(address);
+		unsigned swapfilenum = swp_type(swap_address);
+		pgoff_t offset = swp_offset(swap_address);
+		struct swap_info_struct *sis = get_swap_info_struct(swapfilenum);
+		sector_t new_sector = map_swap_page(sis, offset);
+
+		if ((new_sector == extent_max + 1) &&
+		    (last_chain == swapfilenum))
+			extent_max++;
+		else {
+			if (extent_min > -1) {
+				if (test_action_state(SUSPEND_TEST_BIO))
+					printk("Adding extent %d-%d.\n",
+						extent_min <<
+							devinfo[last_chain].bmap_shift,
+						extent_max <<
+							devinfo[last_chain].bmap_shift);
+						
+				append_extent_to_extent_chain(
+					&block_chain[last_chain],
+					extent_min, extent_max);
+			}
+			extent_min = extent_max = new_sector;
+			last_chain = swapfilenum;
+		}
+	}
+
+	if (extent_min > -1) {
+		if (test_action_state(SUSPEND_TEST_BIO))
+			printk("Adding extent %d-%d.\n",
+				extent_min <<
+					devinfo[last_chain].bmap_shift,
+				extent_max <<
+					devinfo[last_chain].bmap_shift);
+		append_extent_to_extent_chain(
+			&block_chain[last_chain],
+			extent_min, extent_max);
+	}
+
+	prev_header_pages_allocated = header_pages_allocated;
+	header_pages_allocated = 0;
+	swapwriter_allocate_header_space(prev_header_pages_allocated);
+}
+
+static int swapwriter_storage_allocated(void)
+{
+	return swapextents.size;
+}
+
+static int swapwriter_storage_available(void)
+{
+	si_swapinfo(&swapinfo);
+	return swapinfo.freeswap + swapwriter_storage_allocated();
+}
+
+static int swapwriter_initialise(int starting_cycle)
+{
+	if (starting_cycle) {
+		enable_swapfile();
+
+		if (resume_dev_t && !resume_block_device &&
+		    IS_ERR(resume_block_device =
+	    		open_bdev(MAX_SWAPFILES, resume_dev_t)))
+			return 1;
+	}
+	
+	return 0;
+}
+
+static void swapwriter_cleanup(int ending_cycle)
+{
+	if (ending_cycle)
+		disable_swapfile();
+	
+	close_bdevs();
+}
+
+static int swapwriter_release_storage(void)
+{
+	int i = 0;
+
+	if ((test_action_state(SUSPEND_KEEP_IMAGE)) &&
+			test_suspend_state(SUSPEND_NOW_RESUMING))
+		return 0;
+
+	header_pages_allocated = 0;
+	
+	if (swapextents.first) {
+		/* Free swap entries */
+		struct extent *extentpointer;
+		unsigned long extentvalue;
+		swp_entry_t entry;
+		extent_for_each(&swapextents, extentpointer, 
+				extentvalue) {
+			entry = extent_val_to_swap_entry(extentvalue);
+			swap_free(entry);
+		}
+
+		put_extent_chain(&swapextents);
+		
+		for (i = 0; i < MAX_SWAPFILES; i++)
+			if (block_chain[i].first)
+				put_extent_chain(&block_chain[i]);
+	}
+	
+	return 0;
+}
+
+static int swapwriter_allocate_storage(int space_requested)
+{
+	int i, result = 0, first = 1;
+	int pages_to_get = space_requested - swapextents.size;
+	unsigned long extent_min = 0, extent_max = 0;
+	
+	if (pages_to_get < 1)
+		return 0;
+
+	for (i = 0; i < MAX_SWAPFILES; i++) {
+		devinfo[i].bmap_shift = 3;
+		devinfo[i].blocks_per_page = 1;
+	}
+
+	for (i = 0; i < pages_to_get; i++) {
+		swp_entry_t entry;
+		unsigned long new_value;
+
+		entry = get_swap_page();
+		if (!entry.val) {
+			printk("Failed to get a swap page.\n");
+			result = -ENOSPC;
+			break;
+		}
+		
+		new_value = swap_entry_to_extent_val(entry);
+		if (first) {
+			first = 0;
+			extent_min = extent_max = new_value;
+		} else {
+			if (new_value == extent_max + 1)
+				extent_max++;
+			else {
+				append_extent_to_extent_chain(
+					&swapextents,
+					extent_min, extent_max);
+				extent_min = extent_max = new_value;
+			}
+		}
+	}
+
+	if (!first)
+		append_extent_to_extent_chain(
+			&swapextents,
+			extent_min, extent_max);
+
+	get_main_pool_phys_params();
+	return result;
+}
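+
+/*
+ * Coalescing sketch (illustrative values): if get_swap_page() returns
+ * entries whose extent values are 7, 8, 9, 23 and 24, the loop above
+ * records two extents, [7-9] and [23-24], rather than five single
+ * pages, keeping the chain (and the header that serialises it) small.
+ */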
+
+static int swapwriter_write_header_init(void)
+{
+	int i, result;
+
+	extent_state_goto_start(&suspend_writer_posn);
+	/* Forward one page will be done prior to the read */
+
+	for (i = 0; i < MAX_SWAPFILES; i++)
+		devinfo[i].dev_t = (dev_t) 0;
+
+	suspend_writer_buffer = (char *) get_zeroed_page(GFP_ATOMIC);
+	if (!suspend_writer_buffer) {
+		printk("Failed to get swapwriter buffer.\n");
+		return -ENOMEM;
+	}
+
+	suspend_writer_buffer_posn = 0;
+
+	/* Info needed to bootstrap goes at the start of the header.
+	 * First we save the positions and devinfo, including the number
+	 * of header pages. Then we save the structs containing data needed
+	 * for reading the header pages back.
+	 * Note that even if header pages take more than one page, when we
+	 * read back the info, we will have restored the location of the
+	 * next header page by the time we go to use it.
+	 */
+	if ((result = suspend_bio_ops.write_header_chunk((char *) &suspend_writer_posn_save, 
+			sizeof(suspend_writer_posn_save))))
+		return result;
+
+	if ((result = suspend_bio_ops.write_header_chunk((char *) &devinfo, 
+			sizeof(devinfo))))
+		return result;
+
+	for (i = 0; i < MAX_SWAPFILES; i++)
+		serialise_extent_chain(&block_chain[i]);
+
+	return 0;
+}
+
+static int swapwriter_write_header_cleanup(void)
+{
+	int result;
+
+	/* Write any unsaved data */
+	if (suspend_writer_buffer_posn)
+		suspend_bio_ops.write_header_chunk_finish();
+
+	extent_state_goto_start(&suspend_writer_posn);
+	suspend_bio_ops.forward_one_page();
+
+	/* Adjust swap header */
+	suspend_bio_ops.bdev_page_io(READ, resume_block_device,
+			resume_firstblock,
+			virt_to_page(suspend_writer_buffer));
+
+	/* Sign with the device and block holding the first header page. */
+	result = prepare_signature(
+			devinfo[suspend_writer_posn.current_chain].dev_t,
+			suspend_writer_posn.current_offset,
+			((union swap_header *) suspend_writer_buffer)->magic.magic);
+
+	if (!result)
+		suspend_bio_ops.bdev_page_io(WRITE, resume_block_device,
+			resume_firstblock,
+			virt_to_page(suspend_writer_buffer));
+
+	free_page((unsigned long) suspend_writer_buffer);
+	suspend_writer_buffer = NULL;
+	
+	suspend_bio_ops.finish_all_io();
+
+	return result;
+}
+
+/* ------------------------- HEADER READING ------------------------- */
+
+/*
+ * read_header_init()
+ * 
+ * Description:
+ * 1. Attempt to read the device specified with resume2=.
+ * 2. Check the contents of the swap header for our signature.
+ * 3. Warn, ignore, reset and/or continue as appropriate.
+ * 4. If continuing, read the swapwriter configuration section
+ *    of the header and set up block device info so we can read
+ *    the rest of the header & image.
+ *
+ * Returns:
+ * May not return if the user chooses to reboot at a warning.
+ * -EINVAL if cannot resume at this time. Booting should continue
+ * normally.
+ */
+
+static int swapwriter_read_header_init(void)
+{
+	int i;
+	
+	BUG_ON(!resume_block_device);
+	BUG_ON(!resume_dev_t);
+
+	suspend_writer_buffer = (char *) get_zeroed_page(GFP_ATOMIC);
+
+	BUG_ON(!suspend_writer_buffer);
+
+	if (!header_dev_t) {
+		printk("read_header_init called when we haven't "
+				"verified there is an image!\n");
+		return -EINVAL;
+	}
+
+	/* 
+	 * If the header is not on the resume_dev_t, get the resume device first.
+	 */
+	if (header_dev_t != resume_dev_t) {
+		header_block_device = open_bdev(MAX_SWAPFILES + 1,
+				header_dev_t);
+
+		if (IS_ERR(header_block_device))
+			return PTR_ERR(header_block_device);
+	} else
+		header_block_device = resume_block_device;
+
+	/* 
+	 * Read swapwriter configuration.
+	 * Headerblock size taken into account already.
+	 */
+	suspend_bio_ops.bdev_page_io(READ, header_block_device,
+			headerblock << 3,
+			virt_to_page((unsigned long) suspend_writer_buffer));
+	
+	memcpy(&suspend_writer_posn_save, suspend_writer_buffer, 3 * sizeof(struct extent_iterate_saved_state));
+	
+	suspend_writer_buffer_posn = 3 * sizeof(struct extent_iterate_saved_state);
+
+	memcpy(&devinfo, suspend_writer_buffer + suspend_writer_buffer_posn, sizeof(devinfo));
+	
+	suspend_writer_buffer_posn += sizeof(devinfo);
+
+	/* Restore device info */
+	for (i = 0; i < MAX_SWAPFILES; i++) {
+		dev_t thisdevice = devinfo[i].dev_t;
+		struct block_device *result;
+
+		if (!thisdevice)
+			continue;
+
+		if (thisdevice == resume_dev_t) {
+			bdev_info_list[i] = bdev_info_list[MAX_SWAPFILES];
+			BUG_ON(!bdev_info_list[i]);
+			bdev_info_list[MAX_SWAPFILES] = NULL;
+			continue;
+		}
+
+		if (thisdevice == header_dev_t) {
+			bdev_info_list[i] = bdev_info_list[MAX_SWAPFILES + 1];
+			BUG_ON(!bdev_info_list[i]);
+			bdev_info_list[MAX_SWAPFILES + 1] = NULL;
+			continue;
+		}
+
+		result = open_bdev(i, thisdevice);
+		if (IS_ERR(result)) {
+			close_bdevs();
+			return PTR_ERR(result);
+		}
+	}
+
+	extent_state_goto_start(&suspend_writer_posn);
+	*(suspend_bio_ops.need_extra_next) = 1;
+
+	for (i = 0; i < MAX_SWAPFILES; i++)
+		load_extent_chain(&block_chain[i]);
+
+	return 0;
+}
+
+static int swapwriter_read_header_cleanup(void)
+{
+	free_page((unsigned long) suspend_writer_buffer);
+	return 0;
+}
+
+/* swapwriter_invalidate_image
+ *
+ * Restore the original swap signature so that any stored image will no
+ * longer be recognised (and resumed from) on the next boot.
+ */
+static int swapwriter_invalidate_image(void)
+{
+	union p_diskpage cur;
+	int result = 0;
+	
+	cur.address = get_zeroed_page(GFP_ATOMIC);
+	if (!cur.address) {
+		printk("Unable to allocate a page for restoring the swap signature.\n");
+		return -ENOMEM;
+	}
+
+	/*
+	 * If nr_suspends == 0, we must be booting, so no swap pages
+	 * will be recorded as used yet.
+	 */
+
+	if (nr_suspends > 0)
+		swapwriter_release_storage();
+
+	/*
+	 * We don't do a sanity check here: we want to restore the swap
+	 * signature whichever version of the kernel made the suspend image.
+	 *
+	 * We need to write swap, but swap may not be enabled so
+	 * we write the device directly
+	 */
+	
+	suspend_bio_ops.bdev_page_io(READ, resume_block_device,
+			resume_firstblock,
+			virt_to_page(cur.pointer));
+
+	result = parse_signature(cur.pointer->swh.magic.magic, 1);
+		
+	if (result < 4)
+		goto out;
+
+	suspend_bio_ops.bdev_page_io(WRITE, resume_block_device,
+			resume_firstblock,
+			virt_to_page(cur.pointer));
+
+	if (!nr_suspends)
+		printk(KERN_WARNING name_suspend "Image invalidated.\n");
+out:
+	suspend_bio_ops.finish_all_io();
+	free_page(cur.address);
+	return 0;
+}
+
+/*
+ * swapwriter_memory_needed
+ *
+ * Description:
+ * Returns the number of bytes of RAM needed for this
+ * code to do its work. (Used when calculating whether
+ * we have enough memory to be able to suspend & resume.)
+ */
+static unsigned long swapwriter_memory_needed(void)
+{
+	return 1;
+}
+
+/* swapwriter_print_debug_stats
+ *
+ * Description:
+ * Report whether the swapwriter is active, any swap file we will try
+ * to swapon automatically, and how much swap is available for the image.
+ */
+
+static int swapwriter_print_debug_stats(char *buffer, int size)
+{
+	int len = 0;
+	struct sysinfo sysinfo;
+	
+	if (suspend_active_writer != &swapwriterops) {
+		len = snprintf_used(buffer, size, "- Swapwriter inactive.\n");
+		return len;
+	}
+
+	len = snprintf_used(buffer, size, "- Swapwriter active.\n");
+	if (swapfilename[0])
+		len += snprintf_used(buffer + len, size - len,
+			"  Attempting to automatically swapon: %s.\n",
+			swapfilename);
+
+	si_swapinfo(&sysinfo);
+	
+	len += snprintf_used(buffer + len, size - len,
+			"  Swap available for image: %ld pages.\n",
+			sysinfo.freeswap + swapwriter_storage_allocated());
+
+	return len;
+}
+
+/*
+ * Storage needed
+ *
+ * Returns amount of space in the swap header required
+ * for the swapwriter's data. This ignores the links between
+ * pages, which we factor in when allocating the space.
+ *
+ * We ensure the space is allocated, but actually save the
+ * data from write_header_init and therefore don't also define a
+ * save_config_info routine.
+ */
+static unsigned long swapwriter_storage_needed(void)
+{
+	return sizeof(suspend_writer_posn_save) + sizeof(devinfo);
+}
+
+/*
+ * swapwriter_image_exists
+ *
+ * Check the configured resume device for a Suspend2 image signature.
+ */
+
+static int swapwriter_image_exists(void)
+{
+	int signature_found;
+	union p_diskpage diskpage;
+	
+	if (!resume_dev_t) {
+		printk("Not even trying to read header "
+				"because resume_dev_t is not set.\n");
+		return 0;
+	}
+	
+	if (!resume_block_device &&
+	    IS_ERR(resume_block_device = open_bdev(MAX_SWAPFILES, resume_dev_t)))
+		return 0;
+
+	diskpage.address = get_zeroed_page(GFP_ATOMIC);
+
+	suspend_bio_ops.bdev_page_io(READ, resume_block_device,
+			resume_firstblock,
+			virt_to_page(diskpage.ptr));
+	suspend_bio_ops.finish_all_io();
+
+	signature_found = parse_signature(diskpage.pointer->swh.magic.magic, 0);
+	free_page(diskpage.address);
+
+	if (signature_found == -1) {
+		printk(KERN_ERR name_suspend
+			"Unable to find a signature. Could you have moved "
+			"a swap file?\n");
+		return 0;
+	} else if (signature_found < 2) {
+		return 0;	/* Normal swap space */
+	} else if (signature_found < 6) {
+		if ((!(test_suspend_state(SUSPEND_NORESUME_SPECIFIED)))
+				&& suspend_early_boot_message(1,
+				SUSPEND_CONTINUE_REQ,
+				"Detected the signature of an alternate "
+				"implementation.\n"))
+			set_suspend_state(SUSPEND_NORESUME_SPECIFIED);
+		return 0;
+	} else if ((signature_found >> 1) != SIGNATURE_VER) {
+		if ((!(test_suspend_state(SUSPEND_NORESUME_SPECIFIED))) &&
+			suspend_early_boot_message(1, SUSPEND_CONTINUE_REQ,
+			 "Found a different style suspend image signature."))
+			set_suspend_state(SUSPEND_NORESUME_SPECIFIED);
+	}
+
+	return 1;
+}
+
+/*
+ * Mark resume attempted.
+ *
+ * Record that we tried to resume from this image.
+ */
+
+static void swapwriter_mark_resume_attempted(void)
+{
+	union p_diskpage diskpage;
+	int signature_found;
+	
+	if (!resume_dev_t) {
+		printk("Not even trying to record attempt at resuming"
+				" because resume_dev_t is not set.\n");
+		return;
+	}
+	
+	diskpage.address = get_zeroed_page(GFP_ATOMIC);
+
+	suspend_bio_ops.bdev_page_io(READ, resume_block_device,
+			resume_firstblock,
+			virt_to_page(diskpage.ptr));
+	signature_found = parse_signature(diskpage.pointer->swh.magic.magic, 0);
+
+	switch (signature_found) {
+		case 12:
+		case 13:
+			diskpage.pointer->swh.magic.magic[5] |= 0x80;
+			break;
+	}
+	
+	suspend_bio_ops.bdev_page_io(WRITE, resume_block_device,
+			resume_firstblock,
+			virt_to_page(diskpage.ptr));
+	suspend_bio_ops.finish_all_io();
+	free_page(diskpage.address);
+	
+	close_bdevs();
+}
+
+/*
+ * Parse Image Location
+ *
+ * Attempt to parse a resume2= parameter.
+ * Swap Writer accepts:
+ * resume2=swap:DEVNAME[:FIRSTBLOCK][@BLOCKSIZE]
+ *
+ * Where:
+ * DEVNAME is convertible to a dev_t by name_to_dev_t
+ * FIRSTBLOCK is the location of the first block in the swap file
+ * (specifying one for a swap partition is pointless but not prohibited).
+ * Data is validated by attempting to read a swap header from the
+ * location given. Failure will result in swapwriter refusing to
+ * save an image, and a reboot with correct parameters will be
+ * necessary.
+ */
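+/*
+ * Examples (illustrative only - the device names are placeholders):
+ *
+ *	resume2=swap:/dev/hda2
+ *		use the swap partition /dev/hda2, signature at block 0.
+ *	resume2=swap:/dev/hda1:0x2000
+ *		use a swap file on /dev/hda1 whose first block is 0x2000.
+ */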
+
+static int swapwriter_parse_sig_location(char *commandline, int only_writer)
+{
+	char *thischar, *devstart, *colon = NULL, *at_symbol = NULL;
+	union p_diskpage diskpage;
+	int signature_found, result = -EINVAL, temp_result;
+
+	if (strncmp(commandline, "swap:", 5)) {
+		if (!only_writer)
+			return 1;
+	} else
+		commandline += 5;
+
+	devstart = thischar = commandline;
+	while ((*thischar != ':') && (*thischar != '@') &&
+		((thischar - commandline) < 250) && (*thischar))
+		thischar++;
+
+	if (*thischar == ':') {
+		colon = thischar;
+		*colon = 0;
+		thischar++;
+	}
+
+	while ((*thischar != '@') && ((thischar - commandline) < 250) && (*thischar))
+		thischar++;
+
+	if (*thischar == '@') {
+		at_symbol = thischar;
+		*at_symbol = 0;
+	}
+	
+	if (colon)
+		resume_firstblock = (int) simple_strtoul(colon + 1, NULL, 0);
+	else
+		resume_firstblock = 0;
+
+	/* Legacy */
+	if (at_symbol) {
+		resume_blocksize = (int) simple_strtoul(at_symbol + 1, NULL, 0);
+		if (resume_blocksize & (SECTOR_SIZE - 1)) {
+			printk("Swapwriter: Blocksizes are multiples of %d!\n", SECTOR_SIZE);
+			return -EINVAL;
+		}
+		resume_firstblock = resume_firstblock * (resume_blocksize / SECTOR_SIZE);
+	}
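+
+	/*
+	 * e.g. "swap:/dev/hda1:2@4096" (illustrative): first block 2 in
+	 * 4096-byte units becomes 2 * (4096 / 512) = 16 sectors, assuming
+	 * SECTOR_SIZE is 512.
+	 */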
+	
+	temp_result = try_to_parse_resume_device(devstart);
+
+	if (colon)
+		*colon = ':';
+	if (at_symbol)
+		*at_symbol = '@';
+
+	if (temp_result)
+		return -EINVAL;
+
+	diskpage.address = get_zeroed_page(GFP_ATOMIC);
+	if (!diskpage.address) {
+		printk(KERN_ERR name_suspend "Swapwriter: Failed to allocate a diskpage for I/O.\n");
+		return -ENOMEM;
+	}
+
+	temp_result = suspend_bio_ops.bdev_page_io(READ,
+			resume_block_device,
+			resume_firstblock,
+			virt_to_page(diskpage.ptr));
+
+	suspend_bio_ops.finish_all_io();
+	
+	if (temp_result) {
+		printk(KERN_ERR name_suspend "Swapwriter: Failed to submit I/O.\n");
+		goto invalid;
+	}
+	
+	signature_found = parse_signature(diskpage.pointer->swh.magic.magic, 0);
+
+	if (signature_found != -1) {
+		printk(name_suspend "Swapwriter: Signature found.\n");
+		result = 0;
+
+		suspend_bio_ops.set_devinfo(devinfo);
+		suspend_writer_posn.chains = &block_chain[0];
+		suspend_writer_posn.num_chains = MAX_SWAPFILES;
+	} else
+		printk(KERN_ERR name_suspend "Swapwriter: No swap signature found at specified location.\n");
+invalid:
+	free_page((unsigned long) diskpage.address);
+	return result;
+
+}
+
+static int header_locations_read_proc(char *page, char **start, off_t off, int count,
+		int *eof, void *data)
+{
+	int i, printedpartitionsmessage = 0, len = 0, haveswap = 0;
+	struct inode *swapf = NULL;
+	int zone;
+	char *path_page = (char *) __get_free_page(GFP_KERNEL);
+	char *path;
+	int path_len;
+	
+	*eof = 1;
+	if (!page)
+		return 0;
+
+	/*
+	 * The loop that walks the configured swap devices and prints the
+	 * possible resume2= header locations was lost from this hunk; free
+	 * our scratch page and return whatever was printed.
+	 */
+	if (path_page)
+		free_page((unsigned long) path_page);
+
+	return len;
+}
+
+static struct suspend_proc_data swapwriter_proc_data[] = {
+	{
+	 .filename			= "swapfilename",
+	 .permissions			= PROC_RW,
+	 .type				= SUSPEND_PROC_DATA_STRING,
+	 .data = {
+		.string = {
+			.variable	= swapfilename,
+			.max_length	= 255,
+		}
+	 }
+	},
+
+	{
+	 .filename			= "headerlocations",
+	 .permissions			= PROC_READONLY,
+	 .type				= SUSPEND_PROC_DATA_CUSTOM,
+	 .data = {
+		 .special = {
+			.read_proc 	= header_locations_read_proc,
+		}
+	 }
+	},
+
+	{ .filename			= "disable_swapwriter",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_INTEGER,
+	  .data = {
+		.integer = {
+			.variable	= &swapwriterops.disabled,
+			.minimum	= 0,
+			.maximum	= 1,
+		}
+	  },
+	  .write_proc			= attempt_to_parse_resume_device2,
+	}
+};
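+
+/*
+ * Illustrative usage, assuming these entries appear under /proc/suspend
+ * (the exact mount point is defined by the proc code, not here):
+ *
+ *	echo /swapfile > /proc/suspend/swapfilename
+ *	cat /proc/suspend/headerlocations
+ *	echo 1 > /proc/suspend/disable_swapwriter
+ */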
+
+static struct suspend_module_ops swapwriterops = {
+	.type					= WRITER_PLUGIN,
+	.name					= "Swap Writer",
+	.module					= THIS_MODULE,
+	.memory_needed				= swapwriter_memory_needed,
+	.print_debug_info			= swapwriter_print_debug_stats,
+	.storage_needed				= swapwriter_storage_needed,
+	.initialise				= swapwriter_initialise,
+	.cleanup				= swapwriter_cleanup,
+
+	.ops = {
+		.writer = {
+		 .noresume_reset	= swapwriter_noresume_reset,
+		 .storage_available 	= swapwriter_storage_available,
+		 .storage_allocated	= swapwriter_storage_allocated,
+		 .release_storage	= swapwriter_release_storage,
+		 .allocate_header_space	= swapwriter_allocate_header_space,
+		 .allocate_storage	= swapwriter_allocate_storage,
+		 .image_exists		= swapwriter_image_exists,
+		 .mark_resume_attempted	= swapwriter_mark_resume_attempted,
+		 .write_header_init	= swapwriter_write_header_init,
+		 .write_header_cleanup	= swapwriter_write_header_cleanup,
+		 .read_header_init	= swapwriter_read_header_init,
+		 .read_header_cleanup	= swapwriter_read_header_cleanup,
+		 .invalidate_image	= swapwriter_invalidate_image,
+		 .parse_sig_location	= swapwriter_parse_sig_location,
+		}
+	}
+};
+
+/* ---- Registration ---- */
+static __init int swapwriter_load(void)
+{
+	int result;
+	int i, numfiles = ARRAY_SIZE(swapwriter_proc_data);
+	
+	printk("Suspend2 Swap Writer loading.\n");
+
+	swapwriterops.read_init = suspend_bio_ops.read_init;
+	swapwriterops.ops.writer.read_chunk = suspend_bio_ops.read_chunk;
+	swapwriterops.read_cleanup = suspend_bio_ops.read_cleanup;
+	swapwriterops.write_init = suspend_bio_ops.write_init;
+	swapwriterops.ops.writer.write_chunk = suspend_bio_ops.write_chunk;
+	swapwriterops.write_cleanup = suspend_bio_ops.write_cleanup;
+	swapwriterops.ops.writer.read_header_chunk =
+		suspend_bio_ops.read_header_chunk;
+	swapwriterops.ops.writer.write_header_chunk =
+		suspend_bio_ops.write_header_chunk;
+
+	if (!(result = suspend_register_module(&swapwriterops))) {
+		for (i = 0; i < numfiles; i++)
+			suspend_register_procfile(&swapwriter_proc_data[i]);
+	} else
+		printk("Suspend2 Swap Writer unable to register!\n");
+	return result;
+}
+
+#ifdef MODULE
+static __exit void swapwriter_unload(void)
+{
+	int i, numfiles = ARRAY_SIZE(swapwriter_proc_data);
+
+	printk("Suspend2 Swap Writer unloading.\n");
+
+	for (i = 0; i < numfiles; i++)
+		suspend_unregister_procfile(&swapwriter_proc_data[i]);
+	suspend_unregister_module(&swapwriterops);
+}
+
+module_init(swapwriter_load);
+module_exit(swapwriter_unload);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Nigel Cunningham");
+MODULE_DESCRIPTION("Suspend2 swap writer");
+#else
+late_initcall(swapwriter_load);
+#endif
diff -urN oldtree/kernel/power/swsusp.c newtree/kernel/power/swsusp.c
--- oldtree/kernel/power/swsusp.c	2006-03-08 18:48:02.960064000 +0000
+++ newtree/kernel/power/swsusp.c	2006-03-08 16:13:23.888158250 +0000
@@ -41,16 +41,18 @@
 #include <linux/mm.h>
 #include <linux/suspend.h>
 #include <linux/spinlock.h>
-#include <linux/kernel.h>
 #include <linux/major.h>
-#include <linux/swap.h>
 #include <linux/pm.h>
 #include <linux/swapops.h>
 #include <linux/bootmem.h>
 #include <linux/syscalls.h>
 #include <linux/highmem.h>
+#include <linux/freezer.h>
+
 
 #include "power.h"
+#include "swsusp.h"
+#include "suspend.h"
 
 /*
  * Preferred image size in bytes (tunable via /sys/power/image_size).
@@ -184,6 +186,8 @@
 	unsigned int i = 0;
 	char *p = "-\\|/";
 
+	thaw_processes(FREEZER_KERNEL_THREADS);
+
 	printk("Shrinking memory...  ");
 	do {
 		size = 2 * count_highmem_pages();
@@ -207,6 +211,8 @@
 	} while (tmp > 0);
 	printk("\bdone (%lu pages freed)\n", pages);
 
+	freeze_processes();
+
 	return 0;
 }
 
diff -urN oldtree/kernel/power/swsusp.h newtree/kernel/power/swsusp.h
--- oldtree/kernel/power/swsusp.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/swsusp.h	2006-03-08 15:22:33.349511500 +0000
@@ -0,0 +1,24 @@
+#ifndef _KERNEL_POWER_SWSUSP_H
+#define _KERNEL_POWER_SWSUSP_H
+
+struct suspend_header {
+	u32 version_code;
+	unsigned long num_physpages;
+	unsigned long orig_mem_free;
+	char machine[65];
+	char version[65];
+	int num_cpus;
+	int page_size;
+	int pageset_2_size;
+	int param0;
+	int param1;
+	int param2;
+	int param3;
+	int progress0;
+	int progress1;
+	int progress2;
+	int progress3;
+	int io_time[2][2];
+	
+	suspend_pagedir_t *suspend_pagedir;
+	unsigned int num_pbes;
+};
+
+#endif /* _KERNEL_POWER_SWSUSP_H */
diff -urN oldtree/kernel/power/ui.c newtree/kernel/power/ui.c
--- oldtree/kernel/power/ui.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/ui.c	2006-03-08 18:38:29.608231750 +0000
@@ -0,0 +1,853 @@
+/*
+ * kernel/power/ui.c
+ *
+ * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
+ * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
+ * Copyright (C) 2002-2005 Nigel Cunningham <nigel@suspend2.net>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Routines for Suspend2's user interface.
+ *
+ * The user interface code talks to a userspace program via a
+ * netlink socket.
+ *
+ * The kernel side:
+ * - starts the userui program;
+ * - sends text messages and progress bar status;
+ *
+ * The user space side:
+ * - passes messages regarding user requests (abort, toggle reboot etc)
+ *
+ */
+
+#define __KERNEL_SYSCALLS__
+
+#include <linux/suspend.h>
+#include <linux/freezer.h>
+#include <linux/console.h>
+#include <linux/bitops.h>
+#include <linux/ctype.h>
+#include <linux/tty.h>
+#include <linux/vt_kern.h>
+#include <linux/module.h>
+#include <linux/reboot.h>
+#include <linux/kmod.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+ 
+#include "proc.h"
+#include "modules.h"
+#include "suspend2.h"
+#include "suspend2_common.h"
+#include "ui.h"
+#include "version.h"
+#include "netlink.h"
+#include "power.h"
+
+static char local_printf_buf[1024];	/* Same as printk - should be safe */
+
+#ifdef CONFIG_NET
+static struct user_helper_data ui_helper_data;
+static struct suspend_module_ops userui_ops;
+static int orig_loglevel;
+static int orig_default_message_loglevel;
+static int orig_kmsg;
+
+static char lastheader[512];
+static int lastheader_message_len = 0;
+
+/* Number of distinct progress amounts that userspace can display */
+static int progress_granularity = 50;
+
+DECLARE_WAIT_QUEUE_HEAD(userui_wait_for_key);
+
+static void ui_nl_set_state(int n)
+{
+	/* Only let them change certain settings */
+	static const int suspend_action_mask =
+		(1 << SUSPEND_REBOOT) | (1 << SUSPEND_PAUSE) | (1 << SUSPEND_SLOW) |
+		(1 << SUSPEND_LOGALL) | (1 << SUSPEND_SINGLESTEP) |
+		(1 << SUSPEND_PAUSE_NEAR_PAGESET_END);
+
+	suspend_action = (suspend_action & (~suspend_action_mask)) |
+		(n & suspend_action_mask);
+
+	if (!test_action_state(SUSPEND_PAUSE) &&
+			!test_action_state(SUSPEND_SINGLESTEP))
+		wake_up_interruptible(&userui_wait_for_key);
+}
+
+void userui_redraw(void)
+{
+	if (ui_helper_data.pid == -1)
+		return;
+
+	suspend_send_netlink_message(&ui_helper_data,
+			USERUI_MSG_REDRAW, NULL, 0);
+}
+
+/* request_abort_suspend
+ *
+ * Description:	Handle the user requesting the cancellation of a suspend by
+ * 		pressing escape.
+ * Callers:	Invoked from a netlink packet from userspace when the user presses
+ * 	 	escape.
+ */
+void request_abort_suspend(void)
+{
+	if (test_suspend_state(SUSPEND_NOW_RESUMING) || (test_result_state(SUSPEND_ABORT_REQUESTED)))
+		return;
+
+	suspend_prepare_status(CLEAR_BAR, "--- ESCAPE PRESSED :"
+				" ABORTING PROCESS ---");
+	set_result_state(SUSPEND_ABORTED);
+	set_result_state(SUSPEND_ABORT_REQUESTED);
+	
+	wake_up_interruptible(&userui_wait_for_key);
+}
+
+static int userui_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	int type;
+	int *data;
+
+	type = nlh->nlmsg_type;
+
+	/* A control message: ignore it */
+	if (type < NETLINK_MSG_BASE)
+		return 0;
+
+	/* Unknown message: reply with EINVAL */
+	if (type >= USERUI_MSG_MAX)
+		return -EINVAL;
+
+	/* All operations require privileges, even GET */
+	if (security_netlink_recv(skb))
+		return -EPERM;
+
+	/* Only allow one task to receive NOFREEZE privileges */
+	if (type == NETLINK_MSG_NOFREEZE_ME && ui_helper_data.pid != -1)
+		return -EBUSY;
+
+	data = (int*)NLMSG_DATA(nlh);
+
+	switch (type) {
+		case USERUI_MSG_ABORT:
+			request_abort_suspend();
+			break;
+		case USERUI_MSG_GET_STATE:
+			suspend_send_netlink_message(&ui_helper_data, 
+					USERUI_MSG_GET_STATE, &suspend_action,
+					sizeof(suspend_action));
+			break;
+		case USERUI_MSG_GET_DEBUG_STATE:
+			suspend_send_netlink_message(&ui_helper_data,
+					USERUI_MSG_GET_DEBUG_STATE,
+					&suspend_debug_state,
+					sizeof(suspend_debug_state));
+			break;
+		case USERUI_MSG_SET_STATE:
+			if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
+				return -EINVAL;
+			ui_nl_set_state(*data);
+			break;
+		case USERUI_MSG_SET_DEBUG_STATE:
+			if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
+				return -EINVAL;
+			suspend_debug_state = (*data);
+			break;
+		case USERUI_MSG_SPACE:
+			wake_up_interruptible(&userui_wait_for_key);
+			break;
+	}
+
+	return 1;
+}
+
+static unsigned long userui_storage_needed(void)
+{
+	return sizeof(ui_helper_data.program);
+}
+
+static int userui_save_config_info(char *buf)
+{
+	*((int *) buf) = progress_granularity;
+	memcpy(buf + sizeof(int), ui_helper_data.program, sizeof(ui_helper_data.program));
+	return sizeof(ui_helper_data.program) + sizeof(int);
+}
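+
+/*
+ * Saved-config layout (written by userui_save_config_info() above and
+ * consumed by userui_load_config_info() below):
+ *
+ *	[int progress_granularity][ui_helper_data.program buffer]
+ */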
+
+static void userui_load_config_info(char *buf, int size)
+{
+	/* Don't load the saved path if one has already been set */
+	if (ui_helper_data.program[0])
+		return;
+
+	progress_granularity = *((int *) buf);
+	size -= sizeof(int);
+	
+	if (size > sizeof(ui_helper_data.program))
+		size = sizeof(ui_helper_data.program);
+
+	memcpy(ui_helper_data.program, buf + sizeof(int), size);
+	ui_helper_data.program[sizeof(ui_helper_data.program)-1] = '\0';
+}
+
+static unsigned long userui_memory_needed(void)
+{
+	/* ballpark figure of 128 pages */
+	return (128 * PAGE_SIZE);
+}
+
+unsigned long userui_update_progress(unsigned long value, unsigned long maximum,
+		const char *fmt, va_list args)
+{
+	static int last_step = -1;
+	struct userui_msg_params msg;
+	int bitshift;
+	int this_step;
+	unsigned long next_update;
+
+	if (ui_helper_data.pid == -1)
+		return 0;
+
+	if ((!maximum) || (!progress_granularity))
+		return maximum;
+
+	if (value < 0)
+		value = 0;
+
+	if (value > maximum)
+		value = maximum;
+
+	/* Try to avoid math problems - we can't do 64 bit math here
+	 * (and shouldn't need it - anyone got a screen resolution
+	 * of 65536 pixels or more?). If maximum needs more than 16 bits,
+	 * scale both values down. */
+	bitshift = fls(maximum) - 16;
+	if (bitshift > 0) {
+		unsigned long temp_maximum = maximum >> bitshift;
+		unsigned long temp_value = value >> bitshift;
+		this_step = (int)
+			(temp_value * progress_granularity / temp_maximum);
+		next_update = (((this_step + 1) * temp_maximum /
+					progress_granularity) + 1) << bitshift;
+	} else {
+		this_step = (int) (value * progress_granularity / maximum);
+		next_update = ((this_step + 1) * maximum /
+				progress_granularity) + 1;
+	}
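+
+	/*
+	 * Worked example (illustrative): maximum = 1 << 20 gives
+	 * fls(maximum) = 21, so bitshift = 5 and both value and maximum
+	 * are scaled down by 32; the multiplications above then stay
+	 * comfortably within 32 bits.
+	 */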
+
+	if (this_step == last_step)
+		return next_update;
+
+	memset(&msg, 0, sizeof(msg));
+
+	msg.a = this_step;
+	msg.b = progress_granularity;
+
+	if (fmt) {
+		vsnprintf(msg.text, sizeof(msg.text), fmt, args);
+		msg.text[sizeof(msg.text)-1] = '\0';
+	}
+
+	suspend_send_netlink_message(&ui_helper_data, USERUI_MSG_PROGRESS,
+			&msg, sizeof(msg));
+	last_step = this_step;
+
+	return next_update;
+}
+
+/* __suspend_message.
+ *
+ * Description:	This function is intended to do the same job as printk, but
+ * 		without normally logging what is printed. The point is to be
+ * 		able to get debugging info on screen without filling the logs
+ * 		with "1/534. ^M 2/534^M. 3/534^M"
+ *
+ * 		It may be called from an interrupt context - can't sleep!
+ *
+ * Arguments:	unsigned long section: The debugging section(s) this
+ * 			message belongs to.
+ * 		unsigned long level: The level of verbosity of this message.
+ * 		int normally_logged: Whether printk would normally log
+ * 			this line; it is passed on to userspace with the
+ * 			message.
+ * 		const char *fmt, ...: Message to be displayed a la printk.
+ */
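+/*
+ * Illustrative call (nr is a placeholder variable; section 0 and
+ * level 2 are example values, not constants defined by this patch):
+ *
+ *	__suspend_message(0, 2, 1, "Read %d pages.\n", nr);
+ */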
+void __suspend_message(unsigned long section, unsigned long level,
+		int normally_logged,
+		const char *fmt, ...)
+{
+	struct userui_msg_params msg;
+
+	va_list args;
+
+	if ((level) && (level > console_loglevel))
+		return;
+
+	memset(&msg, 0, sizeof(msg));
+
+	msg.a = section;
+	msg.b = level;
+	msg.c = normally_logged;
+
+	if (fmt) {
+		va_start(args, fmt);
+		vsnprintf(msg.text, sizeof(msg.text), fmt, args);
+		va_end(args);
+		msg.text[sizeof(msg.text)-1] = '\0';
+	}
+
+	if (test_action_state(SUSPEND_LOGALL))
+		printk("%s\n", msg.text);
+
+	if (ui_helper_data.pid == -1)
+		return;
+
+	suspend_send_netlink_message(&ui_helper_data, USERUI_MSG_MESSAGE,
+			&msg, sizeof(msg));
+}
+
+static void wait_for_key_via_userui(void)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue(&userui_wait_for_key, &wait);
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	interruptible_sleep_on(&userui_wait_for_key);
+
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(&userui_wait_for_key, &wait);
+}
+
+char suspend_wait_for_keypress(int timeout)
+{
+	int fd;
+	char key = '\0';
+	struct termios t, t_backup;
+
+	if (ui_helper_data.pid != -1) {
+		wait_for_key_via_userui();
+		key = ' ';
+		goto out;
+	}
+	
+	/* We should be guaranteed /dev/console exists after populate_rootfs() in
+	 * init/main.c
+	 */
+	if ((fd = sys_open("/dev/console", O_RDONLY, 0)) < 0) {
+		printk("Couldn't open /dev/console.\n");
+		goto out;
+	}
+
+	if (sys_ioctl(fd, TCGETS, (long)&t) < 0)
+		goto out_close;
+
+	memcpy(&t_backup, &t, sizeof(t));
+
+	t.c_lflag &= ~(ISIG|ICANON|ECHO);
+	t.c_cc[VMIN] = 0;
+	if (timeout)
+		t.c_cc[VTIME] = timeout*10;
+
+	if (sys_ioctl(fd, TCSETS, (long)&t) < 0)
+		goto out_restore;
+
+	while (1) {
+		if (sys_read(fd, &key, 1) <= 0) {
+			key = '\0';
+			break;
+		}
+		key = tolower(key);
+		if (test_suspend_state(SUSPEND_SANITY_CHECK_PROMPT)) {
+			if (key == 'c') {
+				set_suspend_state(SUSPEND_CONTINUE_REQ);
+				break;
+			} else if (key == ' ')
+				break;
+		} else
+			break;
+	}
+
+out_restore:
+	sys_ioctl(fd, TCSETS, (long)&t_backup);
+
+out_close:
+	sys_close(fd);
+out:
+	return key;
+}
+
+/* abort_suspend
+ *
+ * Description: Begin to abort a cycle. If this wasn't at the user's request
+ * 		(and we're displaying output), tell the user why and wait for
+ * 		them to acknowledge the message.
+ * Arguments:	A parameterised string (imagine this is printk) to display,
+ *	 	telling the user why we're aborting.
+ */
+
+void abort_suspend(const char *fmt, ...)
+{
+	va_list args;
+	int printed_len = 0;
+
+	if (!test_result_state(SUSPEND_ABORTED)) {
+		if (!test_result_state(SUSPEND_ABORT_REQUESTED)) {
+			va_start(args, fmt);
+			printed_len = vsnprintf(local_printf_buf, 
+					sizeof(local_printf_buf), fmt, args);
+			va_end(args);
+			if (ui_helper_data.pid != -1)
+				printed_len = sprintf(local_printf_buf + printed_len,
+					" (Press SPACE to continue)");
+			suspend_prepare_status(CLEAR_BAR, local_printf_buf);
+
+			/* 
+			 * Make sure message seen - wait for shift to be
+			 * released if being pressed 
+			 */
+			if (ui_helper_data.pid != -1)
+				suspend_wait_for_keypress(0);
+		}
+		/* Turn on aborting flag */
+		set_result_state(SUSPEND_ABORTED);
+	}
+}
+
+/* suspend_prepare_status
+ * Description:	Prepare the 'nice display', drawing the header and version,
+ * 		along with the current action and perhaps also resetting the
+ * 		progress bar.
+ * Arguments:	
+ * 		int clearbar: Whether to reset the progress bar.
+ * 		const char *fmt, ...: The action to be displayed.
+ */
+void suspend_prepare_status(int clearbar, const char *fmt, ...)
+{
+	va_list args;
+
+	if (fmt) {
+		va_start(args, fmt);
+		lastheader_message_len = vsnprintf(lastheader,
+				sizeof(lastheader), fmt, args);
+		va_end(args);
+	}
+
+	if (clearbar)
+		userui_update_progress(0, 1, NULL, NULL);
+
+	__suspend_message(0, SUSPEND_STATUS, 1, "%s", lastheader);
+
+	if (ui_helper_data.pid == -1)
+		printk(KERN_EMERG "%s\n", lastheader);
+}
+
+/* update_status
+ *
+ * Description: Update the progress bar and (if on) in-bar message.
+ * Arguments:	UL value, maximum: Current progress percentage (value/max).
+ * 		const char *fmt, ...: Message to be displayed in the middle
+ * 		of the progress bar.
+ * 		Note that a NULL message does not mean that any previous
+ * 		message is erased! For that, you need suspend_prepare_status with
+ * 		clearbar on.
+ * Returns:	Unsigned long: The next value where status needs to be updated.
+ * 		This is to reduce unnecessary calls to update_status.
+ */
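+/*
+ * Illustrative caller pattern, using the returned threshold to avoid
+ * calling again on every iteration (do_work() and total are
+ * placeholders):
+ *
+ *	unsigned long next_update = 0;
+ *	for (i = 0; i < total; i++) {
+ *		do_work(i);
+ *		if (i >= next_update)
+ *			next_update = suspend_update_status(i, total,
+ *						"Copying pages...");
+ *	}
+ */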
+unsigned long suspend_update_status(unsigned long value, unsigned long maximum,
+		const char *fmt, ...)
+{
+	unsigned long next_update = maximum;
+	va_list args;
+
+	if (!maximum)
+		return maximum;
+
+	if (value < 0)
+		value = 0;
+
+	if (value > maximum)
+		value = maximum;
+
+	va_start(args, fmt);
+
+	next_update = userui_update_progress(value, maximum, fmt, args);
+
+	va_end(args);
+
+	return next_update;
+}
+
+/* check_shift_keys
+ * 
+ * Description:	Potentially pause and wait for the user to tell us to continue.
+ * 		We normally only pause when @pause is set.
+ * Arguments:	int pause: Whether we normally pause.
+ * 		char *message: The message to display. Not parameterised
+ * 		 because it's normally a constant.
+ */
+
+void check_shift_keys(int pause, char *message)
+{
+#ifdef CONFIG_PM_DEBUG
+	int displayed_message = 0, last_key = 0;
+	
+	while (last_key != 32 &&
+		ui_helper_data.pid != -1 &&
+		(!test_result_state(SUSPEND_ABORTED)) &&
+		((test_action_state(SUSPEND_PAUSE) && pause) || 
+		 (test_action_state(SUSPEND_SINGLESTEP)))) {
+		if (!displayed_message) {
+			suspend_prepare_status(DONT_CLEAR_BAR, 
+			   "%s Press SPACE to continue.%s",
+			   message ? message : "",
+			   (test_action_state(SUSPEND_SINGLESTEP)) ? 
+			   " Single step on." : "");
+			displayed_message = 1;
+		}
+		last_key = suspend_wait_for_keypress(0);
+	}
+#endif
+	schedule();
+}
+
+extern asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, 
+		unsigned long arg);
+
+/* suspend_prepare_console
+ *
+ * Description:	Prepare a console for use and save the current settings.
+ * 		Errors aren't treated as fatal, but a warning is printed.
+ */
+void suspend_prepare_console(void)
+{
+	orig_loglevel = console_loglevel;
+	orig_default_message_loglevel = default_message_loglevel;
+	orig_kmsg = kmsg_redirect;
+	kmsg_redirect = fg_console + 1;
+	default_message_loglevel = 1;
+	console_loglevel = suspend_default_console_level;
+
+	ui_helper_data.pid = -1;
+
+	if (userui_ops.disabled)
+		return;
+
+	if (!*ui_helper_data.program) {
+		printk("suspend_userui: program not configured. suspend_userui disabled.\n");
+		return;
+	}
+
+	suspend_netlink_setup(&ui_helper_data);
+}
+
+/* suspend_cleanup_console
+ *
+ * Description: Restore the console settings we saved above.
+ */
+
+void suspend_cleanup_console(void)
+{
+	suspend_default_console_level = console_loglevel;
+
+	if (ui_helper_data.pid > -1) {
+		struct task_struct *t;
+
+		suspend_send_netlink_message(&ui_helper_data,
+				NETLINK_MSG_CLEANUP, NULL, 0);
+
+		read_lock(&tasklist_lock);
+		if ((t = find_task_by_pid(ui_helper_data.pid)))
+			t->flags &= ~PF_NOFREEZE;
+		read_unlock(&tasklist_lock);
+
+		suspend_netlink_close(&ui_helper_data);
+
+		ui_helper_data.pid = -1;
+	}
+
+	console_loglevel = orig_loglevel;
+	kmsg_redirect = orig_kmsg;
+	default_message_loglevel = orig_default_message_loglevel;
+}
+#else
+static char suspend_wait_for_keypress(int timeout)
+{
+	return 0;
+}
+
+unsigned long suspend_update_status(unsigned long value, unsigned long maximum,
+		const char *fmt, ...)
+{
+	return maximum;
+}
+
+void __suspend_message(unsigned long section, unsigned long level,
+		int normally_logged,
+		const char *fmt, ...)  { }
+void suspend_prepare_status(int clearbar, const char *fmt, ...) { }
+void check_shift_keys(int pause, char *message) { }
+void abort_suspend(const char *fmt, ...) { }
+void suspend_prepare_console(void) { }
+void suspend_cleanup_console(void) { }
+void userui_redraw(void) { }
+#endif
+
+/* suspend_early_boot_message()
+ * Description:	Handle errors early in the process of booting.
+ * 		The user may press C to continue booting, perhaps
+ * 		invalidating the image,  or space to reboot. 
+ * 		This works from either the serial console or normally 
+ * 		attached keyboard.
+ *
+ * 		Note that we come in here from init, while the kernel is
+ * 		locked. If we want to get events from the serial console,
+ * 		we need to temporarily unlock the kernel.
+ *
+ * 		suspend_early_boot_message may also be called post-boot.
+ * 		In this case, it simply printks the message and returns.
+ *
+ * Arguments:	int	message_detail: Which warning text to display.
+ * 		int	default_answer: What to do when we time out. This
+ * 			will normally be to continue, but the user might
+ * 			provide command line options (__setup) to override
+ * 			particular cases.
+ * 		char *	warning_reason: printf-style string explaining why
+ * 			we're moaning.
+ */
+
+#define say(message, a...) printk(KERN_EMERG message, ##a)
+#define message_timeout 25 /* message_timeout * 10 must fit in 8 bits */
+
+int suspend_early_boot_message(int message_detail, int default_answer, char *warning_reason, ...)
+{
+	unsigned long orig_state = get_suspend_state(), continue_req = 0;
+	va_list args;
+	int printed_len;
+
+	if (warning_reason) {
+		va_start(args, warning_reason);
+		printed_len = vsnprintf(local_printf_buf, 
+				sizeof(local_printf_buf), 
+				warning_reason,
+				args);
+		va_end(args);
+	}
+
+	if (!test_suspend_state(SUSPEND_BOOT_TIME)) {
+		printk(name_suspend "%s\n", local_printf_buf);
+		return default_answer;
+	}
+
+	/* We might be called directly from do_mounts_initrd if the
+	 * user fails to set up their initrd properly. We need to
+	 * enable the keyboard handler by setting the running flag */
+	set_suspend_state(SUSPEND_RUNNING);
+
+#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE)
+	console_loglevel = 7;
+
+	say("=== Suspend2 ===\n\n");
+	if (warning_reason) {
+		say("BIG FAT WARNING!! %s\n\n", local_printf_buf);
+		switch (message_detail) {
+		 case 0:
+			say("If you continue booting, note that any image WILL NOT BE REMOVED.\n");
+			say("Suspend is unable to do so because the appropriate modules aren't\n");
+			say("loaded. You should manually remove the image to avoid any\n");
+			say("possibility of corrupting your filesystem(s) later.\n");
+			break;
+		 case 1:
+			say("If you want to use the current suspend image, reboot and try\n");
+			say("again with the same kernel that you suspended from. If you want\n");
+			say("to forget that image, continue and the image will be erased.\n");
+			break;
+		}
+		say("Press SPACE to reboot or C to continue booting with this kernel\n\n");
+		say("Default action if you don't select one in %d seconds is: %s.\n",
+			message_timeout,
+			default_answer == SUSPEND_CONTINUE_REQ ?
+			"continue booting" : "reboot");
+	} else {
+		say("BIG FAT WARNING!!\n\n");
+		say("You have tried to resume from this image before.\n");
+		say("If it failed once, it may well fail again.\n");
+		say("Would you like to remove the image and boot normally?\n");
+		say("This will be equivalent to entering noresume2 on the\n");
+		say("kernel command line.\n\n");
+		say("Press SPACE to remove the image or C to continue resuming.\n\n");
+		say("Default action if you don't select one in %d seconds is: %s.\n",
+			message_timeout,
+			!!default_answer ?
+			"continue resuming" : "remove the image");
+	}
+	
+	set_suspend_state(SUSPEND_SANITY_CHECK_PROMPT);
+	clear_suspend_state(SUSPEND_CONTINUE_REQ);
+
+	if (suspend_wait_for_keypress(message_timeout) == 0) /* We timed out */
+		continue_req = !!default_answer;
+	else
+		continue_req = test_suspend_state(SUSPEND_CONTINUE_REQ);
+
+	if ((warning_reason) && (!continue_req))
+		machine_restart(NULL);
+	
+	restore_suspend_state(orig_state);
+	if (continue_req)
+		set_suspend_state(SUSPEND_CONTINUE_REQ);
+
+#endif /* CONFIG_VT || CONFIG_SERIAL_CONSOLE */
+	return -EPERM;
+}
+#undef say
+
+/*
+ * User interface specific /proc/suspend entries.
+ */
+
+static struct suspend_proc_data proc_params[] = {
+#ifdef CONFIG_NET
+#ifdef CONFIG_PROC_FS
+	{ .filename			= "default_console_level",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_INTEGER,
+	  .data = {
+		  .integer = {
+			  .variable	= &suspend_default_console_level,
+			  .minimum	= 0,
+#ifdef CONFIG_PM_DEBUG
+			  .maximum	= 7,
+#else
+			  .maximum	= 1,
+#endif
+
+		  }
+	  }
+	},
+
+	{ .filename			= "enable_escape",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_BIT,
+	  .data = {
+		  .bit = {
+			  .bit_vector	= &suspend_action,
+			  .bit		= SUSPEND_CAN_CANCEL,
+		  }
+	  }
+	},
+
+#ifdef CONFIG_PM_DEBUG
+	{ .filename			= "debug_sections",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_UL,
+	  .data = {
+		  .ul = {
+			  .variable	= &suspend_debug_state,
+			  .minimum	= 0,
+			  .maximum	= 1UL << 31,
+		  }
+	  }
+	},
+
+	{ .filename			= "log_everything",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_BIT,
+	  .data = {
+		  .bit = {
+			  .bit_vector	= &suspend_action,
+			  .bit		= SUSPEND_LOGALL,
+		  }
+	  }
+	},
+	  
+	{ .filename			= "pause_between_steps",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_BIT,
+	  .data = {
+		  .bit = {
+			  .bit_vector	= &suspend_action,
+			  .bit		= SUSPEND_PAUSE,
+		  }
+	  }
+	},
+#endif
+	{ .filename			= "disable_userui_support",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_INTEGER,
+	  .data = {
+		.integer = {
+			.variable	= &userui_ops.disabled,
+			.minimum	= 0,
+			.maximum	= 1,
+		}
+	  }
+	},
+	{ .filename			= "userui_progress_granularity",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_INTEGER,
+	  .data = {
+		.integer = {
+			.variable	= &progress_granularity,
+			.minimum	= 1,
+			.maximum	= 2048,
+		}
+	  }
+	},
+	{ .filename			= "userui_program",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_STRING,
+	  .data = {
+		.string = {
+			.variable	= ui_helper_data.program,
+			.max_length	= 255,
+		}
+	  }
+	}
+#endif
+#endif
+};
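+
+/*
+ * Illustrative usage, assuming these entries appear under /proc/suspend
+ * and a text-mode helper is installed at the path shown (both are
+ * placeholders):
+ *
+ *	echo /usr/sbin/suspend2ui_text > /proc/suspend/userui_program
+ *	echo 100 > /proc/suspend/userui_progress_granularity
+ */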
+
+static struct suspend_module_ops userui_ops = {
+	.type				= MISC_PLUGIN,
+	.name				= "Userspace UI Support",
+	.module				= THIS_MODULE,
+#ifdef CONFIG_NET
+	.storage_needed			= userui_storage_needed,
+	.save_config_info		= userui_save_config_info,
+	.load_config_info		= userui_load_config_info,
+	.memory_needed			= userui_memory_needed,
+#endif
+};
+
+/* suspend_console_proc_init
+ * Description: Boot time initialisation for user interface.
+ */
+static __init int suspend_console_proc_init(void)
+{
+	int result, i, numfiles = ARRAY_SIZE(proc_params);
+
+	if (!(result = suspend_register_module(&userui_ops)))
+		for (i = 0; i < numfiles; i++)
+			suspend_register_procfile(&proc_params[i]);
+
+#ifdef CONFIG_NET
+	ui_helper_data.nl = NULL;
+	ui_helper_data.program[0] = '\0';
+#endif
+	ui_helper_data.pid = -1;
+	ui_helper_data.skb_size = sizeof(struct userui_msg_params);
+	ui_helper_data.pool_limit = 6;
+	ui_helper_data.netlink_id = NETLINK_SUSPEND2_USERUI;
+	ui_helper_data.name = "userspace ui";
+	ui_helper_data.rcv_msg = userui_user_rcv_msg;
+	ui_helper_data.interface_version = 6;
+	ui_helper_data.must_init = 0;
+	ui_helper_data.not_ready = suspend_cleanup_console;
+	init_completion(&ui_helper_data.wait_for_process);
+
+	return result;
+}
+
+late_initcall(suspend_console_proc_init);
diff -urN oldtree/kernel/power/ui.h newtree/kernel/power/ui.h
--- oldtree/kernel/power/ui.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/ui.h	2006-03-08 15:22:33.353511750 +0000
@@ -0,0 +1,44 @@
+/*
+ * kernel/power/ui.h
+ *
+ * Declarations for Suspend2's user interface support.
+ */
+
+#ifndef _KERNEL_POWER_UI_H
+#define _KERNEL_POWER_UI_H
+
+extern void suspend_prepare_console(void);
+extern void suspend_cleanup_console(void);
+
+extern void check_shift_keys(int pause, char *message);
+extern unsigned long suspend_update_status(unsigned long value, unsigned long maximum,
+		const char *fmt, ...);
+
+extern void abort_suspend(const char *fmt, ...);
+
+extern void userui_redraw(void);
+
+enum {
+	DONT_CLEAR_BAR,
+	CLEAR_BAR
+};
+
+enum {
+	/* Userspace -> Kernel */
+	USERUI_MSG_ABORT = 0x11,
+	USERUI_MSG_SET_STATE = 0x12,
+	USERUI_MSG_GET_STATE = 0x13,
+	USERUI_MSG_GET_DEBUG_STATE = 0x14,
+	USERUI_MSG_SET_DEBUG_STATE = 0x15,
+	USERUI_MSG_SET_PROGRESS_GRANULARITY = 0x17,
+	USERUI_MSG_SPACE = 0x18,
+
+	/* Kernel -> Userspace */
+	USERUI_MSG_MESSAGE = 0x21,
+	USERUI_MSG_PROGRESS = 0x22,
+	USERUI_MSG_REDRAW = 0x25,
+	USERUI_MSG_KEYPRESS = 0x26,
+	USERUI_MSG_DEBUG_STATE = 0x29,
+
+	USERUI_MSG_MAX,
+};
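+
+/*
+ * Typical exchange (illustrative): while writing the image the kernel
+ * sends USERUI_MSG_PROGRESS updates; if the user presses Escape the
+ * helper replies with USERUI_MSG_ABORT, which the kernel handles in
+ * request_abort_suspend().
+ */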
+
+struct userui_msg_params {
+	unsigned long a, b, c, d;
+	char text[255];
+};
+
+#endif /* _KERNEL_POWER_UI_H */
diff -urN oldtree/kernel/power/version.h newtree/kernel/power/version.h
--- oldtree/kernel/power/version.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/version.h	2006-03-08 15:22:33.353511750 +0000
@@ -0,0 +1,2 @@
+#define SUSPEND_CORE_VERSION "2.2.0.1"
+#define name_suspend "Suspend2 " SUSPEND_CORE_VERSION ": "
diff -urN oldtree/kernel/sched.c newtree/kernel/sched.c
--- oldtree/kernel/sched.c	2006-03-08 18:48:02.972064750 +0000
+++ newtree/kernel/sched.c	2006-03-08 15:22:33.357512000 +0000
@@ -4981,7 +4981,6 @@
 		p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
 		if (IS_ERR(p))
 			return NOTIFY_BAD;
-		p->flags |= PF_NOFREEZE;
 		kthread_bind(p, cpu);
 		/* Must be high prio: stop_machine expects to yield to it. */
 		rq = task_rq_lock(p, &flags);
diff -urN oldtree/kernel/sched.c.orig newtree/kernel/sched.c.orig
--- oldtree/kernel/sched.c.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/sched.c.orig	2006-03-08 15:21:19.240880000 +0000
@@ -0,0 +1,6504 @@
+/*
+ *  kernel/sched.c
+ *
+ *  Kernel scheduler and related syscalls
+ *
+ *  Copyright (C) 1991-2002  Linus Torvalds
+ *
+ *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
+ *		make semaphores SMP safe
+ *  1998-11-19	Implemented schedule_timeout() and related stuff
+ *		by Andrea Arcangeli
+ *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
+ *		hybrid priority-list and round-robin design with
+ *		an array-switch method of distributing timeslices
+ *		and per-CPU runqueues.  Cleanups and useful suggestions
+ *		by Davide Libenzi, preemptible kernel bits by Robert Love.
+ *  2003-09-03	Interactivity tuning by Con Kolivas.
+ *  2004-04-02	Scheduler domains code by Nick Piggin
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+#include <linux/highmem.h>
+#include <linux/smp_lock.h>
+#include <asm/mmu_context.h>
+#include <linux/interrupt.h>
+#include <linux/capability.h>
+#include <linux/completion.h>
+#include <linux/kgdb.h>
+#include <linux/kernel_stat.h>
+#include <linux/security.h>
+#include <linux/notifier.h>
+#include <linux/profile.h>
+#include <linux/suspend.h>
+#include <linux/vmalloc.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/smp.h>
+#include <linux/threads.h>
+#include <linux/timer.h>
+#include <linux/rcupdate.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/percpu.h>
+#include <linux/kthread.h>
+#include <linux/seq_file.h>
+#include <linux/sysctl.h>
+#include <linux/syscalls.h>
+#include <linux/times.h>
+#include <linux/acct.h>
+#include <linux/kprobes.h>
+#include <asm/tlb.h>
+
+#include <asm/unistd.h>
+
+/*
+ * Convert user-nice values [ -20 ... 0 ... 19 ]
+ * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
+ * and back.
+ */
+#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
+#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
+#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)
+
+/*
+ * 'User priority' is the nice value converted to something we
+ * can work with better when scaling various scheduler parameters,
+ * it's a [ 0 ... 39 ] range.
+ */
+#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
+#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
+#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
+
+/*
+ * Some helpers for converting nanosecond timing to jiffy resolution
+ */
+#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
+#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
+
+/*
+ * These are the 'tuning knobs' of the scheduler:
+ *
+ * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
+ * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
+ * Timeslices get refilled after they expire.
+ */
+#define MIN_TIMESLICE		max(5 * HZ / 1000, 1)
+#define DEF_TIMESLICE		(100 * HZ / 1000)
+#define ON_RUNQUEUE_WEIGHT	 30
+#define CHILD_PENALTY		 95
+#define PARENT_PENALTY		100
+#define EXIT_WEIGHT		  3
+#define PRIO_BONUS_RATIO	 25
+#define MAX_BONUS		(MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
+#define INTERACTIVE_DELTA	  2
+#define MAX_SLEEP_AVG		(DEF_TIMESLICE * MAX_BONUS)
+#define STARVATION_LIMIT	(MAX_SLEEP_AVG)
+#define NS_MAX_SLEEP_AVG	(JIFFIES_TO_NS(MAX_SLEEP_AVG))
+
+/*
+ * If a task is 'interactive' then we reinsert it in the active
+ * array after it has expired its current timeslice. (it will not
+ * continue to run immediately, it will still roundrobin with
+ * other interactive tasks.)
+ *
+ * This part scales the interactivity limit depending on niceness.
+ *
+ * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
+ * Here are a few examples of different nice levels:
+ *
+ *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
+ *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
+ *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
+ *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
+ *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
+ *
+ * (the X axis represents the possible -5 ... 0 ... +5 dynamic
+ *  priority range a task can explore, a value of '1' means the
+ *  task is rated interactive.)
+ *
+ * Ie. nice +19 tasks can never get 'interactive' enough to be
+ * reinserted into the active array. And only heavily CPU-hog nice -20
+ * tasks will be expired. Default nice 0 tasks are somewhere between,
+ * it takes some effort for them to get interactive, but it's not
+ * too hard.
+ */
+
+#define CURRENT_BONUS(p) \
+	(NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
+		MAX_SLEEP_AVG)
+
+#define GRANULARITY	(10 * HZ / 1000 ? : 1)
+
+#ifdef CONFIG_SMP
+#define TIMESLICE_GRANULARITY(p)	(GRANULARITY * \
+		(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
+			num_online_cpus())
+#else
+#define TIMESLICE_GRANULARITY(p)	(GRANULARITY * \
+		(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
+#endif
+
+#define SCALE(v1,v1_max,v2_max) \
+	(v1) * (v2_max) / (v1_max)
+
+#define DELTA(p) \
+	(SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
+		INTERACTIVE_DELTA)
+
+#define TASK_INTERACTIVE(p) \
+	((p)->prio <= (p)->static_prio - DELTA(p))
+
+#define INTERACTIVE_SLEEP(p) \
+	(JIFFIES_TO_NS(MAX_SLEEP_AVG * \
+		(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
+
+#define TASK_PREEMPTS_CURR(p, rq) \
+	((p)->prio < (rq)->curr->prio)
+
+/*
+ * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
+ * to time slice values: [800ms ... 100ms ... 5ms]
+ *
+ * The higher a thread's priority, the bigger timeslices
+ * it gets during one round of execution. But even the lowest
+ * priority thread gets MIN_TIMESLICE worth of execution time.
+ */
+
+#define SCALE_PRIO(x, prio) \
+	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
+
+static unsigned int static_prio_timeslice(int static_prio)
+{
+	if (static_prio < NICE_TO_PRIO(0))
+		return SCALE_PRIO(DEF_TIMESLICE*4, static_prio);
+	else
+		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
+}
+
+static inline unsigned int task_timeslice(task_t *p)
+{
+	return static_prio_timeslice(p->static_prio);
+}
+
+#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)	\
+				< (long long) (sd)->cache_hot_time)
+
+void __put_task_struct_cb(struct rcu_head *rhp)
+{
+	__put_task_struct(container_of(rhp, struct task_struct, rcu));
+}
+
+EXPORT_SYMBOL_GPL(__put_task_struct_cb);
+
+/*
+ * These are the runqueue data structures:
+ */
+
+#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
+
+typedef struct runqueue runqueue_t;
+
+struct prio_array {
+	unsigned int nr_active;
+	unsigned long bitmap[BITMAP_SIZE];
+	struct list_head queue[MAX_PRIO];
+};
+
+/*
+ * This is the main, per-CPU runqueue data structure.
+ *
+ * Locking rule: those places that want to lock multiple runqueues
+ * (such as the load balancing or the thread migration code), lock
+ * acquire operations must be ordered by ascending &runqueue.
+ */
+struct runqueue {
+	spinlock_t lock;
+
+	/*
+	 * nr_running and cpu_load should be in the same cacheline because
+	 * remote CPUs use both these fields when doing load calculation.
+	 */
+	unsigned long nr_running;
+#ifdef CONFIG_SMP
+	unsigned long raw_weighted_load;
+	unsigned long cpu_load[3];
+#endif
+	unsigned long long nr_switches;
+
+	/*
+	 * This is part of a global counter where only the total sum
+	 * over all CPUs matters. A task can increase this counter on
+	 * one CPU and if it got migrated afterwards it may decrease
+	 * it on another CPU. Always updated under the runqueue lock:
+	 */
+	unsigned long nr_uninterruptible;
+
+	unsigned long expired_timestamp;
+	unsigned long long timestamp_last_tick;
+	task_t *curr, *idle;
+	struct mm_struct *prev_mm;
+	prio_array_t *active, *expired, arrays[2];
+	int best_expired_prio;
+	atomic_t nr_iowait;
+
+#ifdef CONFIG_SMP
+	struct sched_domain *sd;
+
+	/* For active balancing */
+	int active_balance;
+	int push_cpu;
+
+	task_t *migration_thread;
+	struct list_head migration_queue;
+#endif
+
+#ifdef CONFIG_SCHEDSTATS
+	/* latency stats */
+	struct sched_info rq_sched_info;
+
+	/* sys_sched_yield() stats */
+	unsigned long yld_exp_empty;
+	unsigned long yld_act_empty;
+	unsigned long yld_both_empty;
+	unsigned long yld_cnt;
+
+	/* schedule() stats */
+	unsigned long sched_switch;
+	unsigned long sched_cnt;
+	unsigned long sched_goidle;
+
+	/* try_to_wake_up() stats */
+	unsigned long ttwu_cnt;
+	unsigned long ttwu_local;
+#endif
+};
+
+static DEFINE_PER_CPU(struct runqueue, runqueues);
+
+/*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See detach_destroy_domains: synchronize_sched for details.
+ *
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections.
+ */
+#define for_each_domain(cpu, domain) \
+for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
+
+#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
+#define this_rq()		(&__get_cpu_var(runqueues))
+#define task_rq(p)		cpu_rq(task_cpu(p))
+#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
+
+#ifndef prepare_arch_switch
+# define prepare_arch_switch(next)	do { } while (0)
+#endif
+#ifndef finish_arch_switch
+# define finish_arch_switch(prev)	do { } while (0)
+#endif
+
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+static inline int task_running(runqueue_t *rq, task_t *p)
+{
+	return rq->curr == p;
+}
+
+static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+{
+}
+
+static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+{
+#ifdef CONFIG_DEBUG_SPINLOCK
+	/* this is a valid case when another task releases the spinlock */
+	rq->lock.owner = current;
+#endif
+	spin_unlock_irq(&rq->lock);
+}
+
+#else /* __ARCH_WANT_UNLOCKED_CTXSW */
+static inline int task_running(runqueue_t *rq, task_t *p)
+{
+#ifdef CONFIG_SMP
+	return p->oncpu;
+#else
+	return rq->curr == p;
+#endif
+}
+
+static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * We can optimise this out completely for !SMP, because the
+	 * SMP rebalancing from interrupt is the only thing that cares
+	 * here.
+	 */
+	next->oncpu = 1;
+#endif
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	spin_unlock_irq(&rq->lock);
+#else
+	spin_unlock(&rq->lock);
+#endif
+}
+
+static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * After ->oncpu is cleared, the task can be moved to a different CPU.
+	 * We must ensure this doesn't happen until the switch is completely
+	 * finished.
+	 */
+	smp_wmb();
+	prev->oncpu = 0;
+#endif
+#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	local_irq_enable();
+#endif
+}
+#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
+
+/*
+ * task_rq_lock - lock the runqueue a given task resides on and disable
+ * interrupts.  Note the ordering: we can safely look up the task_rq
+ * without explicitly disabling preemption.
+ */
+static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
+	__acquires(rq->lock)
+{
+	struct runqueue *rq;
+
+repeat_lock_task:
+	local_irq_save(*flags);
+	rq = task_rq(p);
+	spin_lock(&rq->lock);
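+	/*
+	 * The task may have been migrated to another CPU between the
+	 * task_rq() lookup and taking the lock; if so, drop the lock
+	 * and retry against its new runqueue.
+	 */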
+	if (unlikely(rq != task_rq(p))) {
+		spin_unlock_irqrestore(&rq->lock, *flags);
+		goto repeat_lock_task;
+	}
+	return rq;
+}
+
+static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
+	__releases(rq->lock)
+{
+	spin_unlock_irqrestore(&rq->lock, *flags);
+}
+
+#ifdef CONFIG_SCHEDSTATS
+/*
+ * bump this up when changing the output format or the meaning of an existing
+ * format, so that tools can adapt (or abort)
+ */
+#define SCHEDSTAT_VERSION 12
+
+static int show_schedstat(struct seq_file *seq, void *v)
+{
+	int cpu;
+
+	seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
+	seq_printf(seq, "timestamp %lu\n", jiffies);
+	for_each_online_cpu(cpu) {
+		runqueue_t *rq = cpu_rq(cpu);
+#ifdef CONFIG_SMP
+		struct sched_domain *sd;
+		int dcnt = 0;
+#endif
+
+		/* runqueue-specific stats */
+		seq_printf(seq,
+		    "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
+		    cpu, rq->yld_both_empty,
+		    rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
+		    rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
+		    rq->ttwu_cnt, rq->ttwu_local,
+		    rq->rq_sched_info.cpu_time,
+		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
+
+		seq_printf(seq, "\n");
+
+#ifdef CONFIG_SMP
+		/* domain-specific stats */
+		preempt_disable();
+		for_each_domain(cpu, sd) {
+			enum idle_type itype;
+			char mask_str[NR_CPUS];
+
+			cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
+			seq_printf(seq, "domain%d %s", dcnt++, mask_str);
+			for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
+					itype++) {
+				seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
+				    sd->lb_cnt[itype],
+				    sd->lb_balanced[itype],
+				    sd->lb_failed[itype],
+				    sd->lb_imbalance[itype],
+				    sd->lb_gained[itype],
+				    sd->lb_hot_gained[itype],
+				    sd->lb_nobusyq[itype],
+				    sd->lb_nobusyg[itype]);
+			}
+			seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
+			    sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
+			    sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
+			    sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
+			    sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
+		}
+		preempt_enable();
+#endif
+	}
+	return 0;
+}
+
+static int schedstat_open(struct inode *inode, struct file *file)
+{
+	unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
+	char *buf = kmalloc(size, GFP_KERNEL);
+	struct seq_file *m;
+	int res;
+
+	if (!buf)
+		return -ENOMEM;
+	res = single_open(file, show_schedstat, NULL);
+	if (!res) {
+		m = file->private_data;
+		m->buf = buf;
+		m->size = size;
+	} else
+		kfree(buf);
+	return res;
+}
+
+struct file_operations proc_schedstat_operations = {
+	.open    = schedstat_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = single_release,
+};
+
+# define schedstat_inc(rq, field)	do { (rq)->field++; } while (0)
+# define schedstat_add(rq, field, amt)	do { (rq)->field += (amt); } while (0)
+#else /* !CONFIG_SCHEDSTATS */
+# define schedstat_inc(rq, field)	do { } while (0)
+# define schedstat_add(rq, field, amt)	do { } while (0)
+#endif
+
+/*
+ * this_rq_lock - lock this CPU's runqueue and disable interrupts.
+ */
+static inline runqueue_t *this_rq_lock(void)
+	__acquires(rq->lock)
+{
+	runqueue_t *rq;
+
+	local_irq_disable();
+	rq = this_rq();
+	spin_lock(&rq->lock);
+
+	return rq;
+}
+
+#ifdef CONFIG_SCHEDSTATS
+/*
+ * Called when a process is dequeued from the active array and given
+ * the cpu.  We should note that with the exception of interactive
+ * tasks, the expired queue will become the active queue after the active
+ * queue is empty, without explicitly dequeuing and requeuing tasks in the
+ * expired queue.  (Interactive tasks may be requeued directly to the
+ * active queue, thus delaying tasks in the expired queue from running;
+ * see scheduler_tick()).
+ *
+ * This function is only called from sched_info_arrive(), rather than
+ * dequeue_task(). Even though a task may be queued and dequeued multiple
+ * times as it is shuffled about, we're really interested in knowing how
+ * long it was from the *first* time it was queued to the time that it
+ * finally hit a cpu.
+ */
+static inline void sched_info_dequeued(task_t *t)
+{
+	t->sched_info.last_queued = 0;
+}
+
+/*
+ * Called when a task finally hits the cpu.  We can now calculate how
+ * long it was waiting to run.  We also note when it began so that we
+ * can keep stats on how long its timeslice is.
+ */
+static void sched_info_arrive(task_t *t)
+{
+	unsigned long now = jiffies, diff = 0;
+	struct runqueue *rq = task_rq(t);
+
+	if (t->sched_info.last_queued)
+		diff = now - t->sched_info.last_queued;
+	sched_info_dequeued(t);
+	t->sched_info.run_delay += diff;
+	t->sched_info.last_arrival = now;
+	t->sched_info.pcnt++;
+
+	if (!rq)
+		return;
+
+	rq->rq_sched_info.run_delay += diff;
+	rq->rq_sched_info.pcnt++;
+}
+
+/*
+ * Called when a process is queued into either the active or expired
+ * array.  The time is noted and later used to determine how long the
+ * task had to wait to reach the cpu.  Since the expired queue will
+ * become the active queue after the active queue is empty, without
+ * dequeuing and requeuing any tasks, we are interested in queuing to
+ * either.  It is unusual but not impossible for tasks to be dequeued
+ * and immediately requeued in the same or another array: this can
+ * happen in sched_yield(), set_user_nice(), and even load_balance()
+ * as it moves tasks from runqueue to runqueue.
+ *
+ * This function is only called from enqueue_task(), but it also only
+ * updates the timestamp if it is not already set.  It's assumed that
+ * sched_info_dequeued() will clear that stamp when appropriate.
+ */
+static inline void sched_info_queued(task_t *t)
+{
+	if (!t->sched_info.last_queued)
+		t->sched_info.last_queued = jiffies;
+}
+
+/*
+ * Called when a process ceases being the active-running process, either
+ * voluntarily or involuntarily.  Now we can calculate how long we ran.
+ */
+static inline void sched_info_depart(task_t *t)
+{
+	struct runqueue *rq = task_rq(t);
+	unsigned long diff = jiffies - t->sched_info.last_arrival;
+
+	t->sched_info.cpu_time += diff;
+
+	if (rq)
+		rq->rq_sched_info.cpu_time += diff;
+}
+
+/*
+ * Called when tasks are switched involuntarily, typically due to their
+ * time slice expiring.  (This may also be called when switching to or
+ * from the idle task.)  We are only called when prev != next.
+ */
+static inline void sched_info_switch(task_t *prev, task_t *next)
+{
+	struct runqueue *rq = task_rq(prev);
+
+	/*
+	 * prev now departs the cpu.  It's not interesting to record
+	 * stats about how efficient we were at scheduling the idle
+	 * process, however.
+	 */
+	if (prev != rq->idle)
+		sched_info_depart(prev);
+
+	if (next != rq->idle)
+		sched_info_arrive(next);
+}
+#else
+#define sched_info_queued(t)		do { } while (0)
+#define sched_info_switch(t, next)	do { } while (0)
+#endif /* CONFIG_SCHEDSTATS */
+
+/*
+ * Adding/removing a task to/from a priority array:
+ */
+static void dequeue_task(struct task_struct *p, prio_array_t *array)
+{
+	array->nr_active--;
+	list_del(&p->run_list);
+	if (list_empty(array->queue + p->prio))
+		__clear_bit(p->prio, array->bitmap);
+}
+
+static void enqueue_task(struct task_struct *p, prio_array_t *array)
+{
+	sched_info_queued(p);
+	list_add_tail(&p->run_list, array->queue + p->prio);
+	__set_bit(p->prio, array->bitmap);
+	array->nr_active++;
+	p->array = array;
+}
+
+/*
+ * Put a task at the end of the run list without the overhead of a
+ * dequeue followed by an enqueue.
+ */
+static void requeue_task(struct task_struct *p, prio_array_t *array)
+{
+	list_move_tail(&p->run_list, array->queue + p->prio);
+}
+
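+/*
+ * Add a task to the *front* of its priority queue, so it is picked
+ * first within that priority (used below to activate the idle task).
+ */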
+static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
+{
+	list_add(&p->run_list, array->queue + p->prio);
+	__set_bit(p->prio, array->bitmap);
+	array->nr_active++;
+	p->array = array;
+}
+
+/*
+ * effective_prio - return the priority that is based on the static
+ * priority but is modified by bonuses/penalties.
+ *
+ * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
+ * into the -5 ... 0 ... +5 bonus/penalty range.
+ *
+ * We use 25% of the full 0...39 priority range so that:
+ *
+ * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
+ * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
+ *
+ * Both properties are important to certain workloads.
+ */
+static int effective_prio(task_t *p)
+{
+	int bonus, prio;
+
+	if (rt_task(p))
+		return p->prio;
+
+	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
+
+	prio = p->static_prio - bonus;
+	if (prio < MAX_RT_PRIO)
+		prio = MAX_RT_PRIO;
+	if (prio > MAX_PRIO-1)
+		prio = MAX_PRIO-1;
+	return prio;
+}
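+/*
+ * Worked example (assuming the common MAX_BONUS of 10 and nice-0
+ * mapping to static_prio 120): a task with a maximal sleep average
+ * has CURRENT_BONUS(p) == 10, so bonus == 10 - 10/2 == 5 and it runs
+ * at dynamic priority 115; a pure cpu hog gets bonus == 0 - 5 == -5
+ * and runs at 125.  The exact constants depend on the kernel's
+ * priority configuration.
+ */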
+
+#ifdef CONFIG_SMP
+/*
+ * To help avoid the subversion of "niceness" due to uneven distribution
+ * of tasks with abnormal "nice" values across CPUs, the contribution
+ * each task makes to its run queue's load is weighted according to its
+ * scheduling class and "nice" value.  For SCHED_NORMAL tasks this is
+ * just a scaled version of the new time slice allocation they receive
+ * on time slice expiry etc.
+ */
+
+/*
+ * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
+ * If static_prio_timeslice() is ever changed to break this assumption then
+ * this code will need modification
+ */
+#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
+#define LOAD_WEIGHT(lp) \
+	(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
+#define PRIO_TO_LOAD_WEIGHT(prio) \
+	LOAD_WEIGHT(static_prio_timeslice(prio))
+#define RTPRIO_TO_LOAD_WEIGHT(rp) \
+	(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
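+/*
+ * Worked example under the assumption stated above: a nice-0 task's
+ * timeslice is TIME_SLICE_NICE_ZERO, so LOAD_WEIGHT() scales it to
+ * exactly SCHED_LOAD_SCALE (e.g. (100 * 128) / 100 == 128 with a
+ * 100ms DEF_TIMESLICE and a load scale of 128 -- both are
+ * configuration-dependent).  Lower nice values get longer timeslices
+ * and thus proportionally higher weights, and RTPRIO_TO_LOAD_WEIGHT()
+ * keeps every RT task heavier than any SCHED_NORMAL task.
+ */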
+
+static inline void set_load_weight(task_t *p)
+{
+	if (rt_task(p)) {
+		if (p == task_rq(p)->migration_thread)
+			/*
+			 * The migration thread does the actual balancing.
+			 * Giving its load any weight will skew balancing
+			 * adversely.
+			 */
+			p->load_weight = 0;
+		else
+			p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
+	} else
+		p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+}
+
+static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p)
+{
+	rq->raw_weighted_load += p->load_weight;
+}
+
+static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p)
+{
+	rq->raw_weighted_load -= p->load_weight;
+}
+#else
+static inline void set_load_weight(task_t *p)
+{
+}
+
+static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p)
+{
+}
+
+static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p)
+{
+}
+#endif
+
+static inline void inc_nr_running(task_t *p, runqueue_t *rq)
+{
+	rq->nr_running++;
+	inc_raw_weighted_load(rq, p);
+}
+
+static inline void dec_nr_running(task_t *p, runqueue_t *rq)
+{
+	rq->nr_running--;
+	dec_raw_weighted_load(rq, p);
+}
+
+/*
+ * __activate_task - move a task to the runqueue.
+ */
+static inline void __activate_task(task_t *p, runqueue_t *rq)
+{
+	enqueue_task(p, rq->active);
+	inc_nr_running(p, rq);
+}
+
+/*
+ * __activate_idle_task - move idle task to the _front_ of runqueue.
+ */
+static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
+{
+	enqueue_task_head(p, rq->active);
+	inc_nr_running(p, rq);
+}
+
+static int recalc_task_prio(task_t *p, unsigned long long now)
+{
+	/* Caller must always ensure 'now >= p->timestamp' */
+	unsigned long long __sleep_time = now - p->timestamp;
+	unsigned long sleep_time;
+
+	if (unlikely(p->policy == SCHED_BATCH))
+		sleep_time = 0;
+	else {
+		if (__sleep_time > NS_MAX_SLEEP_AVG)
+			sleep_time = NS_MAX_SLEEP_AVG;
+		else
+			sleep_time = (unsigned long)__sleep_time;
+	}
+
+	if (likely(sleep_time > 0)) {
+		/*
+		 * User tasks that sleep a long time are categorised as
+		 * idle. Their sleep_avg is only raised to a level that
+		 * keeps them at just-interactive priority: enough to stay
+		 * active, yet preventing them from suddenly becoming cpu
+		 * hogs and starving other processes.
+		 */
+		if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) {
+			unsigned long ceiling;
+
+			ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG -
+				DEF_TIMESLICE);
+			if (p->sleep_avg < ceiling)
+				p->sleep_avg = ceiling;
+		} else {
+			/*
+			 * The lower a task's sleep avg, the more rapidly
+			 * it will rise with sleep time.  This enables tasks
+			 * to rapidly recover to a low latency priority.  If
+			 * a task was sleeping with the noninteractive label,
+			 * do not apply this non-linear boost.
+			 */
+			if (p->sleep_type != SLEEP_NONINTERACTIVE || !p->mm)
+				sleep_time *=
+					(MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
+
+			/*
+			 * This code gives a bonus to interactive tasks.
+			 *
+			 * The boost works by updating the 'average sleep time'
+			 * value here, based on ->timestamp. The more time a
+			 * task spends sleeping, the higher the average gets -
+			 * and the higher the priority boost gets as well.
+			 */
+			p->sleep_avg += sleep_time;
+
+			if (p->sleep_avg > NS_MAX_SLEEP_AVG)
+				p->sleep_avg = NS_MAX_SLEEP_AVG;
+		}
+	}
+
+	return effective_prio(p);
+}
+
+/*
+ * activate_task - move a task to the runqueue and do priority recalculation
+ *
+ * Update all the scheduling statistics stuff. (sleep average
+ * calculation, priority modifiers, etc.)
+ */
+static void activate_task(task_t *p, runqueue_t *rq, int local)
+{
+	unsigned long long now;
+
+	now = sched_clock();
+#ifdef CONFIG_SMP
+	if (!local) {
+		/* Compensate for drifting sched_clock */
+		runqueue_t *this_rq = this_rq();
+		now = (now - this_rq->timestamp_last_tick)
+			+ rq->timestamp_last_tick;
+	}
+#endif
+
+	if (!rt_task(p))
+		p->prio = recalc_task_prio(p, now);
+
+	if (p->sleep_type != SLEEP_NONINTERACTIVE) {
+		/*
+		 * Tasks which were woken up by interrupts (ie. hw events)
+		 * are most likely of interactive nature. So we give them
+		 * the credit of extending their sleep time to the period
+		 * of time they spend on the runqueue, waiting for execution
+		 * on a CPU, first time around:
+		 */
+		if (in_interrupt())
+			p->sleep_type = SLEEP_INTERRUPTED;
+		else {
+			/*
+			 * Normal first-time wakeups get a credit too for
+			 * on-runqueue time, but it will be weighted down:
+			 */
+			p->sleep_type = SLEEP_INTERACTIVE;
+		}
+	}
+	p->timestamp = now;
+
+	__activate_task(p, rq);
+}
+
+/*
+ * deactivate_task - remove a task from the runqueue.
+ */
+static void deactivate_task(struct task_struct *p, runqueue_t *rq)
+{
+	dec_nr_running(p, rq);
+	dequeue_task(p, p->array);
+	p->array = NULL;
+}
+
+/*
+ * resched_task - mark a task 'to be rescheduled now'.
+ *
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
+ */
+#ifdef CONFIG_SMP
+static void resched_task(task_t *p)
+{
+	int cpu;
+
+	assert_spin_locked(&task_rq(p)->lock);
+
+	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+		return;
+
+	set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+
+	cpu = task_cpu(p);
+	if (cpu == smp_processor_id())
+		return;
+
+	/* NEED_RESCHED must be visible before we test POLLING_NRFLAG */
+	smp_mb();
+	if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG))
+		smp_send_reschedule(cpu);
+}
+#else
+static inline void resched_task(task_t *p)
+{
+	assert_spin_locked(&task_rq(p)->lock);
+	set_tsk_need_resched(p);
+}
+#endif
+
+/**
+ * task_curr - is this task currently executing on a CPU?
+ * @p: the task in question.
+ */
+inline int task_curr(const task_t *p)
+{
+	return cpu_curr(task_cpu(p)) == p;
+}
+
+#ifdef CONFIG_SMP
+typedef struct {
+	struct list_head list;
+
+	task_t *task;
+	int dest_cpu;
+
+	struct completion done;
+} migration_req_t;
+
+/*
+ * The task's runqueue lock must be held.
+ * Returns true if you have to wait for migration thread.
+ */
+static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
+{
+	runqueue_t *rq = task_rq(p);
+
+	/*
+	 * If the task is not on a runqueue (and not running), then
+	 * it is sufficient to simply update the task's cpu field.
+	 */
+	if (!p->array && !task_running(rq, p)) {
+		set_task_cpu(p, dest_cpu);
+		return 0;
+	}
+
+	init_completion(&req->done);
+	req->task = p;
+	req->dest_cpu = dest_cpu;
+	list_add(&req->list, &rq->migration_queue);
+	return 1;
+}
+
+/*
+ * wait_task_inactive - wait for a thread to unschedule.
+ *
+ * The caller must ensure that the task *will* unschedule sometime soon,
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
+ */
+void wait_task_inactive(task_t *p)
+{
+	unsigned long flags;
+	runqueue_t *rq;
+	int preempted;
+
+repeat:
+	rq = task_rq_lock(p, &flags);
+	/* Must be off runqueue entirely, not preempted. */
+	if (unlikely(p->array || task_running(rq, p))) {
+		/* If it's preempted, we yield.  It could be a while. */
+		preempted = !task_running(rq, p);
+		task_rq_unlock(rq, &flags);
+		cpu_relax();
+		if (preempted)
+			yield();
+		goto repeat;
+	}
+	task_rq_unlock(rq, &flags);
+}
+
+/***
+ * kick_process - kick a running thread to enter/exit the kernel
+ * @p: the to-be-kicked thread
+ *
+ * Cause a process which is running on another CPU to enter
+ * kernel-mode, without any delay. (to get signals handled.)
+ *
+ * NOTE: this function doesn't have to take the runqueue lock,
+ * because all it wants to ensure is that the remote task enters
+ * the kernel. If the IPI races and the task has been migrated
+ * to another CPU then no harm is done and the purpose has been
+ * achieved as well.
+ */
+void kick_process(task_t *p)
+{
+	int cpu;
+
+	preempt_disable();
+	cpu = task_cpu(p);
+	if ((cpu != smp_processor_id()) && task_curr(p))
+		smp_send_reschedule(cpu);
+	preempt_enable();
+}
+
+/*
+ * Return a low guess at the load of a migration-source cpu weighted
+ * according to the scheduling class and "nice" value.
+ *
+ * We want to under-estimate the load of migration sources, to
+ * balance conservatively.
+ */
+static inline unsigned long source_load(int cpu, int type)
+{
+	runqueue_t *rq = cpu_rq(cpu);
+
+	if (type == 0)
+		return rq->raw_weighted_load;
+
+	return min(rq->cpu_load[type-1], rq->raw_weighted_load);
+}
+
+/*
+ * Return a high guess at the load of a migration-target cpu weighted
+ * according to the scheduling class and "nice" value.
+ */
+static inline unsigned long target_load(int cpu, int type)
+{
+	runqueue_t *rq = cpu_rq(cpu);
+
+	if (type == 0)
+		return rq->raw_weighted_load;
+
+	return max(rq->cpu_load[type-1], rq->raw_weighted_load);
+}
+
+/*
+ * Return the average load per task on the cpu's run queue
+ */
+static inline unsigned long cpu_avg_load_per_task(int cpu)
+{
+	runqueue_t *rq = cpu_rq(cpu);
+	unsigned long n = rq->nr_running;
+
+	return n ?  rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
+}
+
+/*
+ * find_idlest_group finds and returns the least busy CPU group within the
+ * domain.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+{
+	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+	unsigned long min_load = ULONG_MAX, this_load = 0;
+	int load_idx = sd->forkexec_idx;
+	int imbalance = 100 + (sd->imbalance_pct-100)/2;
+
+	do {
+		unsigned long load, avg_load;
+		int local_group;
+		int i;
+
+		/* Skip over this group if it has no CPUs allowed */
+		if (!cpus_intersects(group->cpumask, p->cpus_allowed))
+			goto nextgroup;
+
+		local_group = cpu_isset(this_cpu, group->cpumask);
+
+		/* Tally up the load of all CPUs in the group */
+		avg_load = 0;
+
+		for_each_cpu_mask(i, group->cpumask) {
+			/* Bias balancing toward cpus of our domain */
+			if (local_group)
+				load = source_load(i, load_idx);
+			else
+				load = target_load(i, load_idx);
+
+			avg_load += load;
+		}
+
+		/* Adjust by relative CPU power of the group */
+		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+
+		if (local_group) {
+			this_load = avg_load;
+			this = group;
+		} else if (avg_load < min_load) {
+			min_load = avg_load;
+			idlest = group;
+		}
+nextgroup:
+		group = group->next;
+	} while (group != sd->groups);
+
+	if (!idlest || 100*this_load < imbalance*min_load)
+		return NULL;
+	return idlest;
+}
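+/*
+ * Illustrative numbers for the final imbalance test above: with
+ * imbalance_pct == 125, imbalance == 100 + 25/2 == 112, so a remote
+ * group is only chosen when 100*this_load >= 112*min_load, i.e. the
+ * idlest remote group is at least ~12% less loaded than the local
+ * one.  The margin prevents tasks from ping-ponging between nearly
+ * equally loaded groups.
+ */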
+
+/*
+ * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ */
+static int
+find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+{
+	cpumask_t tmp;
+	unsigned long load, min_load = ULONG_MAX;
+	int idlest = -1;
+	int i;
+
+	/* Traverse only the allowed CPUs */
+	cpus_and(tmp, group->cpumask, p->cpus_allowed);
+
+	for_each_cpu_mask(i, tmp) {
+		load = source_load(i, 0);
+
+		if (load < min_load || (load == min_load && i == this_cpu)) {
+			min_load = load;
+			idlest = i;
+		}
+	}
+
+	return idlest;
+}
+
+/*
+ * sched_balance_self: balance the current task (running on cpu) in domains
+ * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
+ * SD_BALANCE_EXEC.
+ *
+ * Balance, ie. select the least loaded group.
+ *
+ * Returns the target CPU number, or the same CPU if no balancing is needed.
+ *
+ * preempt must be disabled.
+ */
+static int sched_balance_self(int cpu, int flag)
+{
+	struct task_struct *t = current;
+	struct sched_domain *tmp, *sd = NULL;
+
+	for_each_domain(cpu, tmp)
+		if (tmp->flags & flag)
+			sd = tmp;
+
+	while (sd) {
+		cpumask_t span;
+		struct sched_group *group;
+		int new_cpu;
+		int weight;
+
+		span = sd->span;
+		group = find_idlest_group(sd, t, cpu);
+		if (!group)
+			goto nextlevel;
+
+		new_cpu = find_idlest_cpu(group, t, cpu);
+		if (new_cpu == -1 || new_cpu == cpu)
+			goto nextlevel;
+
+		/* Now try balancing at a lower domain level */
+		cpu = new_cpu;
+nextlevel:
+		sd = NULL;
+		weight = cpus_weight(span);
+		for_each_domain(cpu, tmp) {
+			if (weight <= cpus_weight(tmp->span))
+				break;
+			if (tmp->flags & flag)
+				sd = tmp;
+		}
+		/* the enclosing while (sd) loop exits if sd is still NULL */
+	}
+
+	return cpu;
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * wake_idle() will wake a task on an idle cpu if task->cpu is
+ * not idle and an idle cpu is available.  The span of cpus
+ * searched starts with the closest cpus and widens as needed,
+ * so we always favor a closer, idle cpu.
+ *
+ * Returns the CPU we should wake onto.
+ */
+#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
+static int wake_idle(int cpu, task_t *p)
+{
+	cpumask_t tmp;
+	struct sched_domain *sd;
+	int i;
+
+	if (idle_cpu(cpu))
+		return cpu;
+
+	for_each_domain(cpu, sd) {
+		if (sd->flags & SD_WAKE_IDLE) {
+			cpus_and(tmp, sd->span, p->cpus_allowed);
+			for_each_cpu_mask(i, tmp) {
+				if (idle_cpu(i))
+					return i;
+			}
+		}
+		else
+			break;
+	}
+	return cpu;
+}
+#else
+static inline int wake_idle(int cpu, task_t *p)
+{
+	return cpu;
+}
+#endif
+
+/***
+ * try_to_wake_up - wake up a thread
+ * @p: the to-be-woken-up thread
+ * @state: the mask of task states that can be woken
+ * @sync: do a synchronous wakeup?
+ *
+ * Put it on the run-queue if it's not already there. The "current"
+ * thread is always on the run-queue (except when the actual
+ * re-schedule is in progress), and as such you're allowed to do
+ * the simpler "current->state = TASK_RUNNING" to mark yourself
+ * runnable without the overhead of this.
+ *
+ * returns failure only if the task is already active.
+ */
+static int try_to_wake_up(task_t *p, unsigned int state, int sync)
+{
+	int cpu, this_cpu, success = 0;
+	unsigned long flags;
+	long old_state;
+	runqueue_t *rq;
+#ifdef CONFIG_SMP
+	unsigned long load, this_load;
+	struct sched_domain *sd, *this_sd = NULL;
+	int new_cpu;
+#endif
+
+	rq = task_rq_lock(p, &flags);
+	old_state = p->state;
+	if (!(old_state & state))
+		goto out;
+
+	if (p->array)
+		goto out_running;
+
+	cpu = task_cpu(p);
+	this_cpu = smp_processor_id();
+
+#ifdef CONFIG_SMP
+	if (unlikely(task_running(rq, p)))
+		goto out_activate;
+
+	new_cpu = cpu;
+
+	schedstat_inc(rq, ttwu_cnt);
+	if (cpu == this_cpu) {
+		schedstat_inc(rq, ttwu_local);
+		goto out_set_cpu;
+	}
+
+	for_each_domain(this_cpu, sd) {
+		if (cpu_isset(cpu, sd->span)) {
+			schedstat_inc(sd, ttwu_wake_remote);
+			this_sd = sd;
+			break;
+		}
+	}
+
+	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+		goto out_set_cpu;
+
+	/*
+	 * Check for affine wakeup and passive balancing possibilities.
+	 */
+	if (this_sd) {
+		int idx = this_sd->wake_idx;
+		unsigned int imbalance;
+
+		imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+
+		load = source_load(cpu, idx);
+		this_load = target_load(this_cpu, idx);
+
+		new_cpu = this_cpu; /* Wake to this CPU if we can */
+
+		if (this_sd->flags & SD_WAKE_AFFINE) {
+			unsigned long tl = this_load;
+			unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu);
+
+			/*
+			 * If sync wakeup then subtract the (maximum possible)
+			 * effect of the currently running task from the load
+			 * of the current CPU:
+			 */
+			if (sync)
+				tl -= current->load_weight;
+
+			if ((tl <= load &&
+				tl + target_load(cpu, idx) <= tl_per_task) ||
+				100*(tl + p->load_weight) <= imbalance*load) {
+				/*
+				 * This domain has SD_WAKE_AFFINE and
+				 * p is cache cold in this domain, and
+				 * there is no bad imbalance.
+				 */
+				schedstat_inc(this_sd, ttwu_move_affine);
+				goto out_set_cpu;
+			}
+		}
+
+		/*
+		 * Start passive balancing when half the imbalance_pct
+		 * limit is reached.
+		 */
+		if (this_sd->flags & SD_WAKE_BALANCE) {
+			if (imbalance*this_load <= 100*load) {
+				schedstat_inc(this_sd, ttwu_move_balance);
+				goto out_set_cpu;
+			}
+		}
+	}
+
+	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
+out_set_cpu:
+	new_cpu = wake_idle(new_cpu, p);
+	if (new_cpu != cpu) {
+		set_task_cpu(p, new_cpu);
+		task_rq_unlock(rq, &flags);
+		/* might preempt at this point */
+		rq = task_rq_lock(p, &flags);
+		old_state = p->state;
+		if (!(old_state & state))
+			goto out;
+		if (p->array)
+			goto out_running;
+
+		this_cpu = smp_processor_id();
+		cpu = task_cpu(p);
+	}
+
+out_activate:
+#endif /* CONFIG_SMP */
+	if (old_state == TASK_UNINTERRUPTIBLE) {
+		rq->nr_uninterruptible--;
+		/*
+		 * Tasks waking from uninterruptible sleep are likely
+		 * to be sleeping involuntarily on I/O and are otherwise
+		 * cpu bound so label them as noninteractive.
+		 */
+		p->sleep_type = SLEEP_NONINTERACTIVE;
+	} else {
+		/*
+		 * Tasks that have marked their sleep as noninteractive get
+		 * woken up with their sleep average not weighted in an
+		 * interactive way.
+		 */
+		if (old_state & TASK_NONINTERACTIVE)
+			p->sleep_type = SLEEP_NONINTERACTIVE;
+	}
+
+	activate_task(p, rq, cpu == this_cpu);
+	/*
+	 * Sync wakeups (i.e. those types of wakeups where the waker
+	 * has indicated that it will leave the CPU in short order)
+	 * don't trigger a preemption, if the woken up task will run on
+	 * this cpu. (in this case the 'I will reschedule' promise of
+	 * the waker guarantees that the freshly woken up task is going
+	 * to be considered on this CPU.)
+	 */
+	if (!sync || cpu != this_cpu) {
+		if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+	}
+	success = 1;
+
+out_running:
+	p->state = TASK_RUNNING;
+out:
+	task_rq_unlock(rq, &flags);
+
+	return success;
+}
+
+int fastcall wake_up_process(task_t *p)
+{
+	return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
+				 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
+}
+
+EXPORT_SYMBOL(wake_up_process);
+
+int fastcall wake_up_state(task_t *p, unsigned int state)
+{
+	return try_to_wake_up(p, state, 0);
+}
+
+/*
+ * Perform scheduler related setup for a newly forked process p.
+ * p is forked by current.
+ */
+void fastcall sched_fork(task_t *p, int clone_flags)
+{
+	int cpu = get_cpu();
+
+#ifdef CONFIG_SMP
+	cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
+#endif
+	set_task_cpu(p, cpu);
+
+	/*
+	 * We mark the process as running here, but have not actually
+	 * inserted it onto the runqueue yet. This guarantees that
+	 * nobody will actually run it, and a signal or other external
+	 * event cannot wake it up and insert it on the runqueue either.
+	 */
+	p->state = TASK_RUNNING;
+	INIT_LIST_HEAD(&p->run_list);
+	p->array = NULL;
+#ifdef CONFIG_SCHEDSTATS
+	memset(&p->sched_info, 0, sizeof(p->sched_info));
+#endif
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+	p->oncpu = 0;
+#endif
+#ifdef CONFIG_PREEMPT
+	/* Want to start with kernel preemption disabled. */
+	task_thread_info(p)->preempt_count = 1;
+#endif
+	/*
+	 * Share the timeslice between parent and child, thus the
+	 * total amount of pending timeslices in the system doesn't change,
+	 * resulting in more scheduling fairness.
+	 */
+	local_irq_disable();
+	p->time_slice = (current->time_slice + 1) >> 1;
+	/*
+	 * The remainder of the first timeslice might be recovered by
+	 * the parent if the child exits early enough.
+	 */
+	p->first_time_slice = 1;
+	current->time_slice >>= 1;
+	p->timestamp = sched_clock();
+	if (unlikely(!current->time_slice)) {
+		/*
+		 * This case is rare, it happens when the parent has only
+		 * a single jiffy left from its timeslice. Taking the
+		 * runqueue lock is not a problem.
+		 */
+		current->time_slice = 1;
+		scheduler_tick();
+	}
+	local_irq_enable();
+	put_cpu();
+}
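+/*
+ * Timeslice-split example: if the parent has 7 ticks left, the child
+ * gets (7 + 1) >> 1 == 4 and the parent keeps 7 >> 1 == 3, so no
+ * timeslice is created out of thin air.  first_time_slice lets an
+ * early-exiting child hand its share back to the parent in
+ * sched_exit().
+ */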
+
+/*
+ * wake_up_new_task - wake up a newly created task for the first time.
+ *
+ * This function will do some initial scheduler statistics housekeeping
+ * that must be done for every newly created context, then puts the task
+ * on the runqueue and wakes it.
+ */
+void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
+{
+	unsigned long flags;
+	int this_cpu, cpu;
+	runqueue_t *rq, *this_rq;
+
+	rq = task_rq_lock(p, &flags);
+	BUG_ON(p->state != TASK_RUNNING);
+	this_cpu = smp_processor_id();
+	cpu = task_cpu(p);
+
+	/*
+	 * We decrease the sleep average of forking parents
+	 * and children as well, to keep max-interactive tasks
+	 * from forking tasks that are max-interactive. The parent
+	 * (current) is done further down, under its lock.
+	 */
+	p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
+		CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
+
+	p->prio = effective_prio(p);
+
+	if (likely(cpu == this_cpu)) {
+		if (!(clone_flags & CLONE_VM)) {
+			/*
+			 * The VM isn't cloned, so we're in a good position to
+			 * do child-runs-first in anticipation of an exec. This
+			 * usually avoids a lot of COW overhead.
+			 */
+			if (unlikely(!current->array))
+				__activate_task(p, rq);
+			else {
+				p->prio = current->prio;
+				list_add_tail(&p->run_list, &current->run_list);
+				p->array = current->array;
+				p->array->nr_active++;
+				inc_nr_running(p, rq);
+			}
+			set_need_resched();
+		} else
+			/* Run child last */
+			__activate_task(p, rq);
+		/*
+		 * We skip the following code due to cpu == this_cpu
+	 	 *
+		 *   task_rq_unlock(rq, &flags);
+		 *   this_rq = task_rq_lock(current, &flags);
+		 */
+		this_rq = rq;
+	} else {
+		this_rq = cpu_rq(this_cpu);
+
+		/*
+		 * Not the local CPU - must adjust timestamp. This should
+		 * get optimised away in the !CONFIG_SMP case.
+		 */
+		p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
+					+ rq->timestamp_last_tick;
+		__activate_task(p, rq);
+		if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+
+		/*
+		 * Parent and child are on different CPUs, now get the
+		 * parent runqueue to update the parent's ->sleep_avg:
+		 */
+		task_rq_unlock(rq, &flags);
+		this_rq = task_rq_lock(current, &flags);
+	}
+	current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
+		PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
+	task_rq_unlock(this_rq, &flags);
+}
+
+/*
+ * Potentially available exiting-child timeslices are
+ * retrieved here - this way the parent does not get
+ * penalized for creating too many threads.
+ *
+ * (this cannot be used to 'generate' timeslices
+ * artificially, because any timeslice recovered here
+ * was given away by the parent in the first place.)
+ */
+void fastcall sched_exit(task_t *p)
+{
+	unsigned long flags;
+	runqueue_t *rq;
+
+	/*
+	 * If the child was a (relative-) CPU hog then decrease
+	 * the sleep_avg of the parent as well.
+	 */
+	rq = task_rq_lock(p->parent, &flags);
+	if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
+		p->parent->time_slice += p->time_slice;
+		if (unlikely(p->parent->time_slice > task_timeslice(p)))
+			p->parent->time_slice = task_timeslice(p);
+	}
+	if (p->sleep_avg < p->parent->sleep_avg)
+		p->parent->sleep_avg = p->parent->sleep_avg /
+		(EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
+		(EXIT_WEIGHT + 1);
+	task_rq_unlock(rq, &flags);
+}
+
+/**
+ * prepare_task_switch - prepare to switch tasks
+ * @rq: the runqueue preparing to switch
+ * @next: the task we are going to switch to.
+ *
+ * This is called with the rq lock held and interrupts off. It must
+ * be paired with a subsequent finish_task_switch after the context
+ * switch.
+ *
+ * prepare_task_switch sets up locking and calls architecture specific
+ * hooks.
+ */
+static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
+{
+	prepare_lock_switch(rq, next);
+	prepare_arch_switch(next);
+}
+
+/**
+ * finish_task_switch - clean up after a task-switch
+ * @rq: runqueue associated with task-switch
+ * @prev: the thread we just switched away from.
+ *
+ * finish_task_switch must be called after the context switch, paired
+ * with a prepare_task_switch call before the context switch.
+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
+ * and do any other architecture-specific cleanup actions.
+ *
+ * Note that we may have delayed dropping an mm in context_switch(). If
+ * so, we finish that here outside of the runqueue lock.  (Doing it
+ * with the lock held can cause deadlocks; see schedule() for
+ * details.)
+ */
+static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
+	__releases(rq->lock)
+{
+	struct mm_struct *mm = rq->prev_mm;
+	unsigned long prev_task_flags;
+
+	rq->prev_mm = NULL;
+
+	/*
+	 * A task struct has one reference for the use as "current".
+	 * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
+	 * calls schedule one last time. The schedule call will never return,
+	 * and the scheduled task must drop that reference.
+	 * The test for EXIT_ZOMBIE must occur while the runqueue locks are
+	 * still held, otherwise prev could be scheduled on another cpu, die
+	 * there before we look at prev->state, and then the reference would
+	 * be dropped twice.
+	 *		Manfred Spraul <manfred@colorfullife.com>
+	 */
+	prev_task_flags = prev->flags;
+	finish_arch_switch(prev);
+	finish_lock_switch(rq, prev);
+	if (mm)
+		mmdrop(mm);
+	if (unlikely(prev_task_flags & PF_DEAD)) {
+		/*
+		 * Remove function-return probe instances associated with this
+		 * task and put them back on the free list.
+	 	 */
+		kprobe_flush_task(prev);
+		put_task_struct(prev);
+	}
+}
+
+/**
+ * schedule_tail - first thing a freshly forked thread must call.
+ * @prev: the thread we just switched away from.
+ */
+asmlinkage void schedule_tail(task_t *prev)
+	__releases(rq->lock)
+{
+	runqueue_t *rq = this_rq();
+	finish_task_switch(rq, prev);
+#ifdef __ARCH_WANT_UNLOCKED_CTXSW
+	/* In this case, finish_task_switch does not reenable preemption */
+	preempt_enable();
+#endif
+	if (current->set_child_tid)
+		put_user(current->pid, current->set_child_tid);
+}
+
+/*
+ * context_switch - switch to the new MM and the new
+ * thread's register state.
+ */
+static inline
+task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
+{
+	struct mm_struct *mm = next->mm;
+	struct mm_struct *oldmm = prev->active_mm;
+
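+	/*
+	 * Kernel threads have no mm of their own; they keep running on
+	 * the previous task's address space (active_mm) in lazy-TLB
+	 * mode instead of switching page tables.
+	 */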
+	if (unlikely(!mm)) {
+		next->active_mm = oldmm;
+		atomic_inc(&oldmm->mm_count);
+		enter_lazy_tlb(oldmm, next);
+	} else
+		switch_mm(oldmm, mm, next);
+
+	if (unlikely(!prev->mm)) {
+		prev->active_mm = NULL;
+		WARN_ON(rq->prev_mm);
+		rq->prev_mm = oldmm;
+	}
+
+	/* Here we just switch the register state and the stack. */
+	switch_to(prev, next, prev);
+
+	return prev;
+}
+
+/*
+ * nr_running, nr_uninterruptible and nr_context_switches:
+ *
+ * externally visible scheduler statistics: current number of runnable
+ * threads, current number of uninterruptible-sleeping threads, total
+ * number of context switches performed since bootup.
+ */
+unsigned long nr_running(void)
+{
+	unsigned long i, sum = 0;
+
+	for_each_online_cpu(i)
+		sum += cpu_rq(i)->nr_running;
+
+	return sum;
+}
+
+unsigned long nr_uninterruptible(void)
+{
+	unsigned long i, sum = 0;
+
+	for_each_cpu(i)
+		sum += cpu_rq(i)->nr_uninterruptible;
+
+	/*
+	 * Since we read the counters lockless, it might be slightly
+	 * inaccurate. Do not allow it to go below zero though:
+	 */
+	if (unlikely((long)sum < 0))
+		sum = 0;
+
+	return sum;
+}
+
+unsigned long long nr_context_switches(void)
+{
+	unsigned long long i, sum = 0;
+
+	for_each_cpu(i)
+		sum += cpu_rq(i)->nr_switches;
+
+	return sum;
+}
+
+unsigned long nr_iowait(void)
+{
+	unsigned long i, sum = 0;
+
+	for_each_cpu(i)
+		sum += atomic_read(&cpu_rq(i)->nr_iowait);
+
+	return sum;
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock;
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
+	__acquires(rq1->lock)
+	__acquires(rq2->lock)
+{
+	if (rq1 == rq2) {
+		spin_lock(&rq1->lock);
+		__acquire(rq2->lock);	/* Fake it out ;) */
+	} else {
+		if (rq1 < rq2) {
+			spin_lock(&rq1->lock);
+			spin_lock(&rq2->lock);
+		} else {
+			spin_lock(&rq2->lock);
+			spin_lock(&rq1->lock);
+		}
+	}
+}
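+/*
+ * Ordering example: if one CPU held rq1 and waited for rq2 while
+ * another held rq2 and waited for rq1, both would deadlock.  Always
+ * taking the lower-addressed runqueue lock first gives a global
+ * acquisition order, per the locking rule at the top of this file.
+ */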
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock;
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
+	__releases(rq1->lock)
+	__releases(rq2->lock)
+{
+	spin_unlock(&rq1->lock);
+	if (rq1 != rq2)
+		spin_unlock(&rq2->lock);
+	else
+		__release(rq2->lock);
+}
+
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
+	__releases(this_rq->lock)
+	__acquires(busiest->lock)
+	__acquires(this_rq->lock)
+{
+	if (unlikely(!spin_trylock(&busiest->lock))) {
+		if (busiest < this_rq) {
+			spin_unlock(&this_rq->lock);
+			spin_lock(&busiest->lock);
+			spin_lock(&this_rq->lock);
+		} else
+			spin_lock(&busiest->lock);
+	}
+}
+
+/*
+ * If dest_cpu is allowed for this process, migrate the task to it.
+ * This is accomplished by queueing a migration request on the task's
+ * runqueue and waking the migration thread, then waiting for the
+ * request to complete.
+ */
+static void sched_migrate_task(task_t *p, int dest_cpu)
+{
+	migration_req_t req;
+	runqueue_t *rq;
+	unsigned long flags;
+
+	rq = task_rq_lock(p, &flags);
+	if (!cpu_isset(dest_cpu, p->cpus_allowed)
+	    || unlikely(cpu_is_offline(dest_cpu)))
+		goto out;
+
+	/* force the process onto the specified CPU */
+	if (migrate_task(p, dest_cpu, &req)) {
+		/* Need to wait for migration thread (might exit: take ref). */
+		struct task_struct *mt = rq->migration_thread;
+		get_task_struct(mt);
+		task_rq_unlock(rq, &flags);
+		wake_up_process(mt);
+		put_task_struct(mt);
+		wait_for_completion(&req.done);
+		return;
+	}
+out:
+	task_rq_unlock(rq, &flags);
+}
+
+/*
+ * sched_exec - execve() is a valuable balancing opportunity, because at
+ * this point the task has the smallest effective memory and cache footprint.
+ */
+void sched_exec(void)
+{
+	int new_cpu, this_cpu = get_cpu();
+	new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
+	put_cpu();
+	if (new_cpu != this_cpu)
+		sched_migrate_task(current, new_cpu);
+}
+
+/*
+ * pull_task - move a task from a remote runqueue to the local runqueue.
+ * Both runqueues must be locked.
+ */
+static
+void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
+	       runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
+{
+	dequeue_task(p, src_array);
+	dec_nr_running(p, src_rq);
+	set_task_cpu(p, this_cpu);
+	inc_nr_running(p, this_rq);
+	enqueue_task(p, this_array);
+	p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
+				+ this_rq->timestamp_last_tick;
+	/*
+	 * Note that idle threads have a prio of MAX_PRIO, so this test
+	 * is always true for them.
+	 */
+	if (TASK_PREEMPTS_CURR(p, this_rq))
+		resched_task(this_rq->curr);
+}
+
+/*
+ * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
+ */
+static
+int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
+		     struct sched_domain *sd, enum idle_type idle,
+		     int *all_pinned)
+{
+	/*
+	 * We do not migrate tasks that:
+	 * 1) are running (obviously), or
+	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
+	 * 3) are cache-hot on their current CPU.
+	 */
+	if (!cpu_isset(this_cpu, p->cpus_allowed))
+		return 0;
+	*all_pinned = 0;
+
+	if (task_running(rq, p))
+		return 0;
+
+	/*
+	 * Aggressive migration if:
+	 * 1) task is cache cold, or
+	 * 2) too many balance attempts have failed.
+	 */
+
+	if (sd->nr_balance_failed > sd->cache_nice_tries)
+		return 1;
+
+	if (task_hot(p, rq->timestamp_last_tick, sd))
+		return 0;
+	return 1;
+}
+
+/*
+ * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
+ * load from busiest to this_rq, as part of a balancing operation within
+ * "domain". Returns the number of tasks moved.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
+		      unsigned long max_nr_move, unsigned long max_load_move,
+		      struct sched_domain *sd, enum idle_type idle,
+		      int *all_pinned)
+{
+	prio_array_t *array, *dst_array;
+	struct list_head *head, *curr;
+	int idx, pulled = 0, pinned = 0;
+	long rem_load_move;
+	task_t *tmp;
+
+	if (max_nr_move == 0 || max_load_move == 0)
+		goto out;
+
+	rem_load_move = max_load_move;
+	pinned = 1;
+
+	/*
+	 * We first consider expired tasks. Those will likely not be
+	 * executed in the near future, and they are most likely to
+	 * be cache-cold, thus switching CPUs has the least effect
+	 * on them.
+	 */
+	if (busiest->expired->nr_active) {
+		array = busiest->expired;
+		dst_array = this_rq->expired;
+	} else {
+		array = busiest->active;
+		dst_array = this_rq->active;
+	}
+
+new_array:
+	/* Start searching at priority 0: */
+	idx = 0;
+skip_bitmap:
+	if (!idx)
+		idx = sched_find_first_bit(array->bitmap);
+	else
+		idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
+	if (idx >= MAX_PRIO) {
+		if (array == busiest->expired && busiest->active->nr_active) {
+			array = busiest->active;
+			dst_array = this_rq->active;
+			goto new_array;
+		}
+		goto out;
+	}
+
+	head = array->queue + idx;
+	curr = head->prev;
+skip_queue:
+	tmp = list_entry(curr, task_t, run_list);
+
+	curr = curr->prev;
+
+	if (tmp->load_weight > rem_load_move ||
+	    !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
+		if (curr != head)
+			goto skip_queue;
+		idx++;
+		goto skip_bitmap;
+	}
+
+#ifdef CONFIG_SCHEDSTATS
+	if (task_hot(tmp, busiest->timestamp_last_tick, sd))
+		schedstat_inc(sd, lb_hot_gained[idle]);
+#endif
+
+	pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
+	pulled++;
+	rem_load_move -= tmp->load_weight;
+
+	/*
+	 * We only want to steal up to the prescribed number of tasks
+	 * and the prescribed amount of weighted load.
+	 */
+	if (pulled < max_nr_move && rem_load_move > 0) {
+		if (curr != head)
+			goto skip_queue;
+		idx++;
+		goto skip_bitmap;
+	}
+out:
+	/*
+	 * Right now, this is the only place pull_task() is called,
+	 * so we can safely collect pull_task() stats here rather than
+	 * inside pull_task().
+	 */
+	schedstat_add(sd, lb_gained[idle], pulled);
+
+	if (all_pinned)
+		*all_pinned = pinned;
+	return pulled;
+}
+
+/*
+ * find_busiest_group finds and returns the busiest CPU group within the
+ * domain.  It also calculates the amount of weighted load that should
+ * be moved to restore balance, returned via the imbalance parameter.
+ */
+static struct sched_group *
+find_busiest_group(struct sched_domain *sd, int this_cpu,
+		   unsigned long *imbalance, enum idle_type idle, int *sd_idle)
+{
+	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
+	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+	unsigned long max_pull;
+	unsigned long busiest_load_per_task, busiest_nr_running;
+	unsigned long this_load_per_task, this_nr_running;
+	int load_idx;
+
+	max_load = this_load = total_load = total_pwr = 0;
+	busiest_load_per_task = busiest_nr_running = 0;
+	this_load_per_task = this_nr_running = 0;
+	if (idle == NOT_IDLE)
+		load_idx = sd->busy_idx;
+	else if (idle == NEWLY_IDLE)
+		load_idx = sd->newidle_idx;
+	else
+		load_idx = sd->idle_idx;
+
+	do {
+		unsigned long load;
+		int local_group;
+		int i;
+		unsigned long sum_nr_running, sum_weighted_load;
+
+		local_group = cpu_isset(this_cpu, group->cpumask);
+
+		/* Tally up the load of all CPUs in the group */
+		sum_weighted_load = sum_nr_running = avg_load = 0;
+
+		for_each_cpu_mask(i, group->cpumask) {
+			runqueue_t *rq = cpu_rq(i);
+
+			if (*sd_idle && !idle_cpu(i))
+				*sd_idle = 0;
+
+			/* Bias balancing toward cpus of our domain */
+			if (local_group)
+				load = target_load(i, load_idx);
+			else
+				load = source_load(i, load_idx);
+
+			avg_load += load;
+			sum_nr_running += rq->nr_running;
+			sum_weighted_load += rq->raw_weighted_load;
+		}
+
+		total_load += avg_load;
+		total_pwr += group->cpu_power;
+
+		/* Adjust by relative CPU power of the group */
+		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+
+		if (local_group) {
+			this_load = avg_load;
+			this = group;
+			this_nr_running = sum_nr_running;
+			this_load_per_task = sum_weighted_load;
+		} else if (avg_load > max_load) {
+			max_load = avg_load;
+			busiest = group;
+			busiest_nr_running = sum_nr_running;
+			busiest_load_per_task = sum_weighted_load;
+		}
+		group = group->next;
+	} while (group != sd->groups);
+
+	if (!busiest || this_load >= max_load || busiest_nr_running <= 1)
+		goto out_balanced;
+
+	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
+
+	if (this_load >= avg_load ||
+			100*max_load <= sd->imbalance_pct*this_load)
+		goto out_balanced;
+
+	busiest_load_per_task /= busiest_nr_running;
+	/*
+	 * We're trying to get all the cpus to the average_load, so we don't
+	 * want to push ourselves above the average load, nor do we wish to
+	 * reduce the max loaded cpu below the average load, as either of these
+	 * actions would just result in more rebalancing later, and ping-pong
+	 * tasks around. Thus we look for the minimum possible imbalance.
+	 * Negative imbalances (*we* are more loaded than anyone else) will
+	 * be counted as no imbalance for these purposes -- we can't fix that
+	 * by pulling tasks to us.  Be careful of negative numbers as they'll
+	 * appear as very large values with unsigned longs.
+	 */
+
+	/* Don't want to pull so many tasks that a group would go idle */
+	max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
+
+	/* How much load to actually move to equalise the imbalance */
+	*imbalance = min(max_pull * busiest->cpu_power,
+				(avg_load - this_load) * this->cpu_power)
+			/ SCHED_LOAD_SCALE;
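+	/*
+	 * Illustrative numbers: with avg_load 100, max_load 150 and
+	 * busiest_load_per_task 40, max_pull == min(150 - 100,
+	 * 150 - 40) == 50, so at most enough load is moved to bring
+	 * the busiest group down to the average without pushing this
+	 * group above it.
+	 */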
+
+	/*
+	 * If *imbalance is less than the average load per runnable task
+	 * there is no guarantee that any tasks will be moved, so we may
+	 * need to bump its value to force at least one task to be moved.
+	 */
+	if (*imbalance < busiest_load_per_task) {
+		unsigned long pwr_now = 0, pwr_move = 0;
+		unsigned long tmp;
+
+		if (max_load - this_load >= busiest_load_per_task*2) {
+			*imbalance = busiest_load_per_task;
+			return busiest;
+		}
+
+		/*
+		 * OK, we don't have enough imbalance to justify moving tasks,
+		 * however we may be able to increase total CPU power used by
+		 * moving them.
+		 */
+
+		pwr_now += busiest->cpu_power *
+			min(busiest_load_per_task, max_load);
+		if (this_nr_running)
+			this_load_per_task /= this_nr_running;
+		else
+			this_load_per_task = SCHED_LOAD_SCALE;
+		pwr_now += this->cpu_power *
+			min(this_load_per_task, this_load);
+		pwr_now /= SCHED_LOAD_SCALE;
+
+		/* Amount of load we'd subtract */
+		tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
+		if (max_load > tmp)
+			pwr_move += busiest->cpu_power *
+				min(busiest_load_per_task, max_load - tmp);
+
+		/* Amount of load we'd add */
+		if (max_load*busiest->cpu_power <
+				busiest_load_per_task*SCHED_LOAD_SCALE)
+			tmp = max_load*busiest->cpu_power/this->cpu_power;
+		else
+			tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
+		pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp);
+		pwr_move /= SCHED_LOAD_SCALE;
+
+		/* Move if we gain throughput... */
+		if (pwr_move > pwr_now)
+			*imbalance = busiest_load_per_task;
+		/*
+		 * ...otherwise bail out unless there's a reasonable chance
+		 * that *imbalance is big enough to cause a move.
+		 */
+		else if (*imbalance <= busiest_load_per_task / 2)
+			goto out_balanced;
+	}
+
+	return busiest;
+
+out_balanced:
+
+	*imbalance = 0;
+	return NULL;
+}
+
+/*
+ * find_busiest_queue - find the busiest runqueue among the cpus in group.
+ */
+static runqueue_t *find_busiest_queue(struct sched_group *group,
+	enum idle_type idle)
+{
+	unsigned long load, max_load = 0;
+	runqueue_t *busiest = NULL;
+	int i;
+
+	for_each_cpu_mask(i, group->cpumask) {
+		load = source_load(i, 0);
+
+		if (load > max_load) {
+			max_load = load;
+			busiest = cpu_rq(i);
+		}
+	}
+
+	return busiest;
+}
+
+/*
+ * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
+ * any value works so long as it is large enough.
+ */
+#define MAX_PINNED_INTERVAL	512
+
+#define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0)
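+/*
+ * When pulling from a busy runqueue, leave at least one task behind:
+ * the currently running task cannot be migrated anyway (see
+ * can_migrate_task()).
+ */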
+/*
+ * Check this_cpu to ensure it is balanced within domain. Attempt to move
+ * tasks if there is an imbalance.
+ *
+ * Called with this_rq unlocked.
+ */
+static int load_balance(int this_cpu, runqueue_t *this_rq,
+			struct sched_domain *sd, enum idle_type idle)
+{
+	struct sched_group *group;
+	runqueue_t *busiest;
+	unsigned long imbalance;
+	int nr_moved, all_pinned = 0;
+	int active_balance = 0;
+	int sd_idle = 0;
+
+	if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
+		sd_idle = 1;
+
+	schedstat_inc(sd, lb_cnt[idle]);
+
+	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
+	if (!group) {
+		schedstat_inc(sd, lb_nobusyg[idle]);
+		goto out_balanced;
+	}
+
+	busiest = find_busiest_queue(group, idle);
+	if (!busiest) {
+		schedstat_inc(sd, lb_nobusyq[idle]);
+		goto out_balanced;
+	}
+
+	BUG_ON(busiest == this_rq);
+
+	schedstat_add(sd, lb_imbalance[idle], imbalance);
+
+	nr_moved = 0;
+	if (busiest->nr_running > 1) {
+		/*
+		 * Attempt to move tasks. If find_busiest_group has found
+		 * an imbalance but busiest->nr_running <= 1, the group is
+		 * still unbalanced. nr_moved simply stays zero, so it is
+		 * correctly treated as an imbalance.
+		 */
+		double_rq_lock(this_rq, busiest);
+		nr_moved = move_tasks(this_rq, this_cpu, busiest,
+					minus_1_or_zero(busiest->nr_running),
+					imbalance, sd, idle, &all_pinned);
+		double_rq_unlock(this_rq, busiest);
+
+		/* All tasks on this runqueue were pinned by CPU affinity */
+		if (unlikely(all_pinned))
+			goto out_balanced;
+	}
+
+	if (!nr_moved) {
+		schedstat_inc(sd, lb_failed[idle]);
+		sd->nr_balance_failed++;
+
+		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
+
+			spin_lock(&busiest->lock);
+
+			/*
+			 * Don't kick the migration_thread if the curr task
+			 * on the busiest cpu can't be moved to this_cpu.
+			 */
+			if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+				spin_unlock(&busiest->lock);
+				all_pinned = 1;
+				goto out_one_pinned;
+			}
+
+			if (!busiest->active_balance) {
+				busiest->active_balance = 1;
+				busiest->push_cpu = this_cpu;
+				active_balance = 1;
+			}
+			spin_unlock(&busiest->lock);
+			if (active_balance)
+				wake_up_process(busiest->migration_thread);
+
+			/*
+			 * We've kicked active balancing, reset the failure
+			 * counter.
+			 */
+			sd->nr_balance_failed = sd->cache_nice_tries+1;
+		}
+	} else
+		sd->nr_balance_failed = 0;
+
+	if (likely(!active_balance)) {
+		/* We were unbalanced, so reset the balancing interval */
+		sd->balance_interval = sd->min_interval;
+	} else {
+		/*
+		 * If we've begun active balancing, start to back off. This
+		 * case may not be covered by the all_pinned logic if there
+		 * is only 1 task on the busy runqueue (because we don't call
+		 * move_tasks).
+		 */
+		if (sd->balance_interval < sd->max_interval)
+			sd->balance_interval *= 2;
+	}
+
+	if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+		return -1;
+	return nr_moved;
+
+out_balanced:
+	schedstat_inc(sd, lb_balanced[idle]);
+
+	sd->nr_balance_failed = 0;
+
+out_one_pinned:
+	/* tune up the balancing interval */
+	if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
+			(sd->balance_interval < sd->max_interval))
+		sd->balance_interval *= 2;
+
+	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+		return -1;
+	return 0;
+}
+
+/*
+ * Check this_cpu to ensure it is balanced within domain. Attempt to move
+ * tasks if there is an imbalance.
+ *
+ * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
+ * this_rq is locked.
+ */
+static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
+				struct sched_domain *sd)
+{
+	struct sched_group *group;
+	runqueue_t *busiest = NULL;
+	unsigned long imbalance;
+	int nr_moved = 0;
+	int sd_idle = 0;
+
+	if (sd->flags & SD_SHARE_CPUPOWER)
+		sd_idle = 1;
+
+	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
+	group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
+	if (!group) {
+		schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
+		goto out_balanced;
+	}
+
+	busiest = find_busiest_queue(group, NEWLY_IDLE);
+	if (!busiest) {
+		schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
+		goto out_balanced;
+	}
+
+	BUG_ON(busiest == this_rq);
+
+	schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
+
+	nr_moved = 0;
+	if (busiest->nr_running > 1) {
+		/* Attempt to move tasks */
+		double_lock_balance(this_rq, busiest);
+		nr_moved = move_tasks(this_rq, this_cpu, busiest,
+					minus_1_or_zero(busiest->nr_running),
+					imbalance, sd, NEWLY_IDLE, NULL);
+		spin_unlock(&busiest->lock);
+	}
+
+	if (!nr_moved) {
+		schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
+		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+			return -1;
+	} else
+		sd->nr_balance_failed = 0;
+
+	return nr_moved;
+
+out_balanced:
+	schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+		return -1;
+	sd->nr_balance_failed = 0;
+	return 0;
+}
+
+/*
+ * idle_balance is called by schedule() if this_cpu is about to become
+ * idle. Attempts to pull tasks from other CPUs.
+ */
+static void idle_balance(int this_cpu, runqueue_t *this_rq)
+{
+	struct sched_domain *sd;
+
+	for_each_domain(this_cpu, sd) {
+		if (sd->flags & SD_BALANCE_NEWIDLE) {
+			if (load_balance_newidle(this_cpu, this_rq, sd)) {
+				/* We've pulled tasks over so stop searching */
+				break;
+			}
+		}
+	}
+}
+
+/*
+ * active_load_balance is run by migration threads. It pushes running tasks
+ * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
+ * running on each physical CPU where possible, and avoids physical /
+ * logical imbalances.
+ *
+ * Called with busiest_rq locked.
+ */
+static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
+{
+	struct sched_domain *sd;
+	runqueue_t *target_rq;
+	int target_cpu = busiest_rq->push_cpu;
+
+	if (busiest_rq->nr_running <= 1)
+		/* no task to move */
+		return;
+
+	target_rq = cpu_rq(target_cpu);
+
+	/*
+	 * This condition is "impossible"; if it occurs,
+	 * we need to fix it.  Originally reported by
+	 * Bjorn Helgaas on a 128-cpu setup.
+	 */
+	BUG_ON(busiest_rq == target_rq);
+
+	/* move a task from busiest_rq to target_rq */
+	double_lock_balance(busiest_rq, target_rq);
+
+	/* Search for an sd spanning us and the target CPU. */
+	for_each_domain(target_cpu, sd)
+		if ((sd->flags & SD_LOAD_BALANCE) &&
+			cpu_isset(busiest_cpu, sd->span))
+				break;
+
+	if (unlikely(sd == NULL))
+		goto out;
+
+	schedstat_inc(sd, alb_cnt);
+
+	if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
+			RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, NULL))
+		schedstat_inc(sd, alb_pushed);
+	else
+		schedstat_inc(sd, alb_failed);
+out:
+	spin_unlock(&target_rq->lock);
+}
+
+/*
+ * rebalance_tick will get called every timer tick, on every CPU.
+ *
+ * It checks each scheduling domain to see if it is due to be balanced,
+ * and initiates a balancing operation if so.
+ *
+ * Balancing parameters are set up in arch_init_sched_domains.
+ */
+
+/* Don't have all balancing operations going off at once */
+#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
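+/*
+ * E.g. (illustrative values): with HZ=1000 and NR_CPUS=4 the offsets
+ * are 0, 250, 500 and 750 jiffies for cpus 0-3, staggering the
+ * balancing checks across the tick range.
+ */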
+
+static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
+			   enum idle_type idle)
+{
+	unsigned long old_load, this_load;
+	unsigned long j = jiffies + CPU_OFFSET(this_cpu);
+	struct sched_domain *sd;
+	int i;
+
+	this_load = this_rq->raw_weighted_load;
+	/* Update our load */
+	for (i = 0; i < 3; i++) {
+		unsigned long new_load = this_load;
+		int scale = 1 << i;
+		old_load = this_rq->cpu_load[i];
+		/*
+		 * Round up the averaging division if load is increasing. This
+		 * prevents us from getting stuck on 9 if the load is 10, for
+		 * example.
+		 */
+		if (new_load > old_load)
+			new_load += scale-1;
+		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
+	}
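+	/*
+	 * Worked example of the average above (illustrative numbers):
+	 * with i=2 (scale=4), old_load=9 and a raw load of 10, the
+	 * round-up yields new_load=13, so cpu_load[2] becomes
+	 * (9*3 + 13)/4 = 10 rather than staying stuck at (27 + 10)/4 = 9.
+	 */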
+
+	for_each_domain(this_cpu, sd) {
+		unsigned long interval;
+
+		if (!(sd->flags & SD_LOAD_BALANCE))
+			continue;
+
+		interval = sd->balance_interval;
+		if (idle != SCHED_IDLE)
+			interval *= sd->busy_factor;
+
+		/* scale ms to jiffies */
+		interval = msecs_to_jiffies(interval);
+		if (unlikely(!interval))
+			interval = 1;
+
+		if (j - sd->last_balance >= interval) {
+			if (load_balance(this_cpu, this_rq, sd, idle)) {
+				/*
+				 * We've pulled tasks over so either we're no
+				 * longer idle, or one of our SMT siblings is
+				 * not idle.
+				 */
+				idle = NOT_IDLE;
+			}
+			sd->last_balance += interval;
+		}
+	}
+}
+#else
+/*
+ * on UP we do not need to balance between CPUs:
+ */
+static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
+{
+}
+static inline void idle_balance(int cpu, runqueue_t *rq)
+{
+}
+#endif
+
+static inline int wake_priority_sleeper(runqueue_t *rq)
+{
+	int ret = 0;
+#ifdef CONFIG_SCHED_SMT
+	spin_lock(&rq->lock);
+	/*
+	 * If an SMT sibling task has been put to sleep for priority
+	 * reasons reschedule the idle task to see if it can now run.
+	 */
+	if (rq->nr_running) {
+		resched_task(rq->idle);
+		ret = 1;
+	}
+	spin_unlock(&rq->lock);
+#endif
+	return ret;
+}
+
+DEFINE_PER_CPU(struct kernel_stat, kstat);
+
+EXPORT_PER_CPU_SYMBOL(kstat);
+
+/*
+ * This is called on clock ticks and on context switches.
+ * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ */
+static inline void update_cpu_clock(task_t *p, runqueue_t *rq,
+				    unsigned long long now)
+{
+	unsigned long long last = max(p->timestamp, rq->timestamp_last_tick);
+	p->sched_time += now - last;
+}
+
+/*
+ * Return current->sched_time plus any more ns on the sched_clock
+ * that have not yet been banked.
+ */
+unsigned long long current_sched_time(const task_t *tsk)
+{
+	unsigned long long ns;
+	unsigned long flags;
+	local_irq_save(flags);
+	ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick);
+	ns = tsk->sched_time + (sched_clock() - ns);
+	local_irq_restore(flags);
+	return ns;
+}
+
+/*
+ * We place interactive tasks back into the active array, if possible.
+ *
+ * To guarantee that this does not starve expired tasks we ignore the
+ * interactivity of a task if the first expired task had to wait more
+ * than a 'reasonable' amount of time. This deadline timeout is
+ * load-dependent, as the frequency of array switches decreases with
+ * increasing number of running tasks. We also ignore the interactivity
+ * if a better static_prio task has expired:
+ */
+#define EXPIRED_STARVING(rq) \
+	((STARVATION_LIMIT && ((rq)->expired_timestamp && \
+		(jiffies - (rq)->expired_timestamp >= \
+			STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
+			((rq)->curr->static_prio > (rq)->best_expired_prio))
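+/*
+ * E.g. (illustrative values): with STARVATION_LIMIT == HZ and four
+ * runnable tasks, the expired array counts as starving once its first
+ * task expired more than 4*HZ+1 jiffies ago - the deadline stretches
+ * as the runqueue grows busier.
+ */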
+
+/*
+ * Account user cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in user space since the last update
+ */
+void account_user_time(struct task_struct *p, cputime_t cputime)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	cputime64_t tmp;
+
+	p->utime = cputime_add(p->utime, cputime);
+
+	/* Add user time to cpustat. */
+	tmp = cputime_to_cputime64(cputime);
+	if (TASK_NICE(p) > 0)
+		cpustat->nice = cputime64_add(cpustat->nice, tmp);
+	else
+		cpustat->user = cputime64_add(cpustat->user, tmp);
+}
+
+/*
+ * Account system cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @hardirq_offset: the offset to subtract from hardirq_count()
+ * @cputime: the cpu time spent in kernel space since the last update
+ */
+void account_system_time(struct task_struct *p, int hardirq_offset,
+			 cputime_t cputime)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	runqueue_t *rq = this_rq();
+	cputime64_t tmp;
+
+	p->stime = cputime_add(p->stime, cputime);
+
+	/* Add system time to cpustat. */
+	tmp = cputime_to_cputime64(cputime);
+	if (hardirq_count() - hardirq_offset)
+		cpustat->irq = cputime64_add(cpustat->irq, tmp);
+	else if (softirq_count())
+		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+	else if (p != rq->idle)
+		cpustat->system = cputime64_add(cpustat->system, tmp);
+	else if (atomic_read(&rq->nr_iowait) > 0)
+		cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
+	else
+		cpustat->idle = cputime64_add(cpustat->idle, tmp);
+	/* Account for system time used */
+	acct_update_integrals(p);
+}
+
+/*
+ * Account for involuntary wait time.
+ * @p: the process from which the cpu time has been stolen
+ * @steal: the cpu time spent in involuntary wait
+ */
+void account_steal_time(struct task_struct *p, cputime_t steal)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	cputime64_t tmp = cputime_to_cputime64(steal);
+	runqueue_t *rq = this_rq();
+
+	if (p == rq->idle) {
+		p->stime = cputime_add(p->stime, steal);
+		if (atomic_read(&rq->nr_iowait) > 0)
+			cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
+		else
+			cpustat->idle = cputime64_add(cpustat->idle, tmp);
+	} else
+		cpustat->steal = cputime64_add(cpustat->steal, tmp);
+}
+
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled.
+ *
+ * It also gets called by the fork code, when changing the parent's
+ * timeslices.
+ */
+void scheduler_tick(void)
+{
+	int cpu = smp_processor_id();
+	runqueue_t *rq = this_rq();
+	task_t *p = current;
+	unsigned long long now = sched_clock();
+
+	update_cpu_clock(p, rq, now);
+
+	rq->timestamp_last_tick = now;
+
+	if (p == rq->idle) {
+		if (wake_priority_sleeper(rq))
+			goto out;
+		rebalance_tick(cpu, rq, SCHED_IDLE);
+		return;
+	}
+
+	/* Task might have expired already, but not scheduled off yet */
+	if (p->array != rq->active) {
+		set_tsk_need_resched(p);
+		goto out;
+	}
+	spin_lock(&rq->lock);
+	/*
+	 * The task was running during this tick - update the
+	 * time slice counter. Note: we do not update a thread's
+	 * priority until it either goes to sleep or uses up its
+	 * timeslice. This makes it possible for interactive tasks
+	 * to use up their timeslices at their highest priority levels.
+	 */
+	if (rt_task(p)) {
+		/*
+		 * RR tasks need a special form of timeslice management.
+		 * FIFO tasks have no timeslices.
+		 */
+		if ((p->policy == SCHED_RR) && !--p->time_slice) {
+			p->time_slice = task_timeslice(p);
+			p->first_time_slice = 0;
+			set_tsk_need_resched(p);
+
+			/* put it at the end of the queue: */
+			requeue_task(p, rq->active);
+		}
+		goto out_unlock;
+	}
+	if (!--p->time_slice) {
+		dequeue_task(p, rq->active);
+		set_tsk_need_resched(p);
+		p->prio = effective_prio(p);
+		p->time_slice = task_timeslice(p);
+		p->first_time_slice = 0;
+
+		if (!rq->expired_timestamp)
+			rq->expired_timestamp = jiffies;
+		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
+			enqueue_task(p, rq->expired);
+			if (p->static_prio < rq->best_expired_prio)
+				rq->best_expired_prio = p->static_prio;
+		} else
+			enqueue_task(p, rq->active);
+	} else {
+		/*
+		 * Prevent an overly long timeslice from allowing a task to
+		 * monopolize the CPU. We do this by splitting up the
+		 * timeslice into smaller pieces.
+		 *
+		 * Note: this does not mean the task's timeslices expire or
+		 * get lost in any way, they just might be preempted by
+		 * another task of equal priority. (one with higher
+		 * priority would have preempted this task already.) We
+		 * requeue this task to the end of the list on this priority
+		 * level, which is in essence a round-robin of tasks with
+		 * equal priority.
+		 *
+		 * This only applies to tasks in the interactive
+		 * delta range with at least TIMESLICE_GRANULARITY to requeue.
+		 */
+		if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
+			p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
+			(p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
+			(p->array == rq->active)) {
+
+			requeue_task(p, rq->active);
+			set_tsk_need_resched(p);
+		}
+	}
+out_unlock:
+	spin_unlock(&rq->lock);
+out:
+	rebalance_tick(cpu, rq, NOT_IDLE);
+}
+
+#ifdef CONFIG_SCHED_SMT
+static inline void wakeup_busy_runqueue(runqueue_t *rq)
+{
+	/* If an SMT runqueue is sleeping due to priority reasons wake it up */
+	if (rq->curr == rq->idle && rq->nr_running)
+		resched_task(rq->idle);
+}
+
+static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+{
+	struct sched_domain *tmp, *sd = NULL;
+	cpumask_t sibling_map;
+	int i;
+
+	for_each_domain(this_cpu, tmp)
+		if (tmp->flags & SD_SHARE_CPUPOWER)
+			sd = tmp;
+
+	if (!sd)
+		return;
+
+	/*
+	 * Unlock the current runqueue because we have to lock in
+	 * CPU order to avoid deadlocks. Caller knows that we might
+	 * unlock. We keep IRQs disabled.
+	 */
+	spin_unlock(&this_rq->lock);
+
+	sibling_map = sd->span;
+
+	for_each_cpu_mask(i, sibling_map)
+		spin_lock(&cpu_rq(i)->lock);
+	/*
+	 * We clear this CPU from the mask. This both simplifies the
+	 * inner loop and keeps this_rq locked when we exit:
+	 */
+	cpu_clear(this_cpu, sibling_map);
+
+	for_each_cpu_mask(i, sibling_map) {
+		runqueue_t *smt_rq = cpu_rq(i);
+
+		wakeup_busy_runqueue(smt_rq);
+	}
+
+	for_each_cpu_mask(i, sibling_map)
+		spin_unlock(&cpu_rq(i)->lock);
+	/*
+	 * We exit with this_cpu's rq still held and IRQs
+	 * still disabled:
+	 */
+}
+
+/*
+ * number of 'lost' timeslices this task won't be able to fully
+ * utilize if another task runs on a sibling. This models the
+ * slowdown effect of other tasks running on siblings:
+ */
+static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
+{
+	return p->time_slice * (100 - sd->per_cpu_gain) / 100;
+}
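+/*
+ * E.g. (illustrative values): with per_cpu_gain=25, a task holding a
+ * 100ms time_slice is modelled as losing 100*(100-25)/100 = 75ms of
+ * it to a busy sibling.
+ */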
+
+static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+{
+	struct sched_domain *tmp, *sd = NULL;
+	cpumask_t sibling_map;
+	prio_array_t *array;
+	int ret = 0, i;
+	task_t *p;
+
+	for_each_domain(this_cpu, tmp)
+		if (tmp->flags & SD_SHARE_CPUPOWER)
+			sd = tmp;
+
+	if (!sd)
+		return 0;
+
+	/*
+	 * The same locking rules and details apply as for
+	 * wake_sleeping_dependent():
+	 */
+	spin_unlock(&this_rq->lock);
+	sibling_map = sd->span;
+	for_each_cpu_mask(i, sibling_map)
+		spin_lock(&cpu_rq(i)->lock);
+	cpu_clear(this_cpu, sibling_map);
+
+	/*
+	 * Establish next task to be run - it might have gone away because
+	 * we released the runqueue lock above:
+	 */
+	if (!this_rq->nr_running)
+		goto out_unlock;
+	array = this_rq->active;
+	if (!array->nr_active)
+		array = this_rq->expired;
+	BUG_ON(!array->nr_active);
+
+	p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
+		task_t, run_list);
+
+	for_each_cpu_mask(i, sibling_map) {
+		runqueue_t *smt_rq = cpu_rq(i);
+		task_t *smt_curr = smt_rq->curr;
+
+		/* Kernel threads do not participate in dependent sleeping */
+		if (!p->mm || !smt_curr->mm || rt_task(p))
+			goto check_smt_task;
+
+		/*
+		 * If a user task with lower static priority than the
+		 * running task on the SMT sibling is trying to schedule,
+		 * delay it till there is proportionately less timeslice
+		 * left of the sibling task to prevent a lower priority
+		 * task from using an unfair proportion of the
+		 * physical cpu's resources. -ck
+		 */
+		if (rt_task(smt_curr)) {
+			/*
+			 * With real time tasks we run non-rt tasks only
+			 * per_cpu_gain% of the time.
+			 */
+			if ((jiffies % DEF_TIMESLICE) >
+				(sd->per_cpu_gain * DEF_TIMESLICE / 100))
+					ret = 1;
+		} else
+			if (smt_curr->static_prio < p->static_prio &&
+				!TASK_PREEMPTS_CURR(p, smt_rq) &&
+				smt_slice(smt_curr, sd) > task_timeslice(p))
+					ret = 1;
+
+check_smt_task:
+		if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
+			rt_task(smt_curr))
+				continue;
+		if (!p->mm) {
+			wakeup_busy_runqueue(smt_rq);
+			continue;
+		}
+
+		/*
+		 * Reschedule a lower priority task on the SMT sibling for
+		 * it to be put to sleep, or wake it up if it has been put to
+		 * sleep for priority reasons to see if it should run now.
+		 */
+		if (rt_task(p)) {
+			if ((jiffies % DEF_TIMESLICE) >
+				(sd->per_cpu_gain * DEF_TIMESLICE / 100))
+					resched_task(smt_curr);
+		} else {
+			if (TASK_PREEMPTS_CURR(p, smt_rq) &&
+				smt_slice(p, sd) > task_timeslice(smt_curr))
+					resched_task(smt_curr);
+			else
+				wakeup_busy_runqueue(smt_rq);
+		}
+	}
+out_unlock:
+	for_each_cpu_mask(i, sibling_map)
+		spin_unlock(&cpu_rq(i)->lock);
+	return ret;
+}
+#else
+static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+{
+}
+
+static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+{
+	return 0;
+}
+#endif
+
+#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+
+void fastcall add_preempt_count(int val)
+{
+	/*
+	 * Underflow?
+	 */
+	BUG_ON(preempt_count() < 0);
+	preempt_count() += val;
+	/*
+	 * Spinlock count overflowing soon?
+	 */
+	BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
+}
+EXPORT_SYMBOL(add_preempt_count);
+
+void fastcall sub_preempt_count(int val)
+{
+	/*
+	 * Underflow?
+	 */
+	BUG_ON(val > preempt_count());
+	/*
+	 * Is the spinlock portion underflowing?
+	 */
+	BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK));
+	preempt_count() -= val;
+}
+EXPORT_SYMBOL(sub_preempt_count);
+
+#endif
+
+static inline int interactive_sleep(enum sleep_type sleep_type)
+{
+	return (sleep_type == SLEEP_INTERACTIVE ||
+		sleep_type == SLEEP_INTERRUPTED);
+}
+
+/*
+ * schedule() is the main scheduler function.
+ */
+asmlinkage void __sched schedule(void)
+{
+	long *switch_count;
+	task_t *prev, *next;
+	runqueue_t *rq;
+	prio_array_t *array;
+	struct list_head *queue;
+	unsigned long long now;
+	unsigned long run_time;
+	int cpu, idx, new_prio;
+
+	/*
+	 * Test if we are atomic.  Since do_exit() needs to call into
+	 * schedule() atomically, we ignore that path for now.
+	 * Otherwise, whine if we are scheduling when we should not be.
+	 */
+	if (likely(!current->exit_state)) {
+		if (unlikely(in_atomic())) {
+			printk(KERN_ERR "BUG: scheduling while atomic: "
+				"%s/0x%08x/%d\n",
+				current->comm, preempt_count(), current->pid);
+			dump_stack();
+		}
+	}
+	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
+
+need_resched:
+	preempt_disable();
+	prev = current;
+	release_kernel_lock(prev);
+need_resched_nonpreemptible:
+	rq = this_rq();
+
+	/*
+	 * The idle thread is not allowed to schedule!
+	 * Remove this check after it has been exercised a bit.
+	 */
+	if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
+		printk(KERN_ERR "bad: scheduling from the idle thread!\n");
+		dump_stack();
+	}
+
+	schedstat_inc(rq, sched_cnt);
+	now = sched_clock();
+	if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
+		run_time = now - prev->timestamp;
+		if (unlikely((long long)(now - prev->timestamp) < 0))
+			run_time = 0;
+	} else
+		run_time = NS_MAX_SLEEP_AVG;
+
+	/*
+	 * Tasks are charged proportionately less run_time at high sleep_avg to
+	 * delay them losing their interactive status
+	 */
+	run_time /= (CURRENT_BONUS(prev) ? : 1);
+
+	spin_lock_irq(&rq->lock);
+
+	if (unlikely(prev->flags & PF_DEAD))
+		prev->state = EXIT_DEAD;
+
+	switch_count = &prev->nivcsw;
+	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+		switch_count = &prev->nvcsw;
+		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
+				unlikely(signal_pending(prev))))
+			prev->state = TASK_RUNNING;
+		else {
+			if (prev->state == TASK_UNINTERRUPTIBLE)
+				rq->nr_uninterruptible++;
+			deactivate_task(prev, rq);
+		}
+	}
+
+	cpu = smp_processor_id();
+	if (unlikely(!rq->nr_running)) {
+go_idle:
+		idle_balance(cpu, rq);
+		if (!rq->nr_running) {
+			next = rq->idle;
+			rq->expired_timestamp = 0;
+			wake_sleeping_dependent(cpu, rq);
+			/*
+			 * wake_sleeping_dependent() might have released
+			 * the runqueue, so break out if we got new
+			 * tasks meanwhile:
+			 */
+			if (!rq->nr_running)
+				goto switch_tasks;
+		}
+	} else {
+		if (dependent_sleeper(cpu, rq)) {
+			next = rq->idle;
+			goto switch_tasks;
+		}
+		/*
+		 * dependent_sleeper() releases and reacquires the runqueue
+		 * lock, hence go into the idle loop if the rq went
+		 * empty meanwhile:
+		 */
+		if (unlikely(!rq->nr_running))
+			goto go_idle;
+	}
+
+	array = rq->active;
+	if (unlikely(!array->nr_active)) {
+		/*
+		 * Switch the active and expired arrays.
+		 */
+		schedstat_inc(rq, sched_switch);
+		rq->active = rq->expired;
+		rq->expired = array;
+		array = rq->active;
+		rq->expired_timestamp = 0;
+		rq->best_expired_prio = MAX_PRIO;
+	}
+
+	idx = sched_find_first_bit(array->bitmap);
+	queue = array->queue + idx;
+	next = list_entry(queue->next, task_t, run_list);
+
+	if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
+		unsigned long long delta = now - next->timestamp;
+		if (unlikely((long long)(now - next->timestamp) < 0))
+			delta = 0;
+
+		if (next->sleep_type == SLEEP_INTERACTIVE)
+			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
+
+		array = next->array;
+		new_prio = recalc_task_prio(next, next->timestamp + delta);
+
+		if (unlikely(next->prio != new_prio)) {
+			dequeue_task(next, array);
+			next->prio = new_prio;
+			enqueue_task(next, array);
+		}
+	}
+	next->sleep_type = SLEEP_NORMAL;
+switch_tasks:
+	if (next == rq->idle)
+		schedstat_inc(rq, sched_goidle);
+	prefetch(next);
+	prefetch_stack(next);
+	clear_tsk_need_resched(prev);
+	rcu_qsctr_inc(task_cpu(prev));
+
+	update_cpu_clock(prev, rq, now);
+
+	prev->sleep_avg -= run_time;
+	if ((long)prev->sleep_avg <= 0)
+		prev->sleep_avg = 0;
+	prev->timestamp = prev->last_ran = now;
+
+	sched_info_switch(prev, next);
+	if (likely(prev != next)) {
+		next->timestamp = now;
+		rq->nr_switches++;
+		rq->curr = next;
+		++*switch_count;
+
+		prepare_task_switch(rq, next);
+		prev = context_switch(rq, prev, next);
+		barrier();
+		/*
+		 * this_rq must be evaluated again because prev may have moved
+		 * CPUs since it called schedule(), thus the 'rq' on its stack
+		 * frame will be invalid.
+		 */
+		finish_task_switch(this_rq(), prev);
+	} else
+		spin_unlock_irq(&rq->lock);
+
+	prev = current;
+	if (unlikely(reacquire_kernel_lock(prev) < 0))
+		goto need_resched_nonpreemptible;
+	preempt_enable_no_resched();
+	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+		goto need_resched;
+}
+
+EXPORT_SYMBOL(schedule);
+
+#ifdef CONFIG_PREEMPT
+/*
+ * this is the entry point to schedule() from in-kernel preemption
+ * off of preempt_enable.  Kernel preemption off of return-from-
+ * interrupt is handled by preempt_schedule_irq(), which calls
+ * schedule() directly.
+ */
+asmlinkage void __sched preempt_schedule(void)
+{
+	struct thread_info *ti = current_thread_info();
+#ifdef CONFIG_PREEMPT_BKL
+	struct task_struct *task = current;
+	int saved_lock_depth;
+#endif
+	/*
+	 * If there is a non-zero preempt_count or interrupts are disabled,
+	 * we do not want to preempt the current task.  Just return.
+	 */
+	if (unlikely(ti->preempt_count || irqs_disabled()))
+		return;
+
+need_resched:
+	add_preempt_count(PREEMPT_ACTIVE);
+	/*
+	 * We keep the big kernel semaphore locked, but we
+	 * clear ->lock_depth so that schedule() doesn't
+	 * auto-release the semaphore:
+	 */
+#ifdef CONFIG_PREEMPT_BKL
+	saved_lock_depth = task->lock_depth;
+	task->lock_depth = -1;
+#endif
+	schedule();
+#ifdef CONFIG_PREEMPT_BKL
+	task->lock_depth = saved_lock_depth;
+#endif
+	sub_preempt_count(PREEMPT_ACTIVE);
+
+	/* we could miss a preemption opportunity between schedule and now */
+	barrier();
+	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+		goto need_resched;
+}
+
+EXPORT_SYMBOL(preempt_schedule);
+
+/*
+ * this is the entry point to schedule() from kernel preemption
+ * off of irq context.
+ * Note that this is called and returns with irqs disabled. This
+ * protects us against recursive calling from irq context.
+ */
+asmlinkage void __sched preempt_schedule_irq(void)
+{
+	struct thread_info *ti = current_thread_info();
+#ifdef CONFIG_PREEMPT_BKL
+	struct task_struct *task = current;
+	int saved_lock_depth;
+#endif
+	/* Catch callers which need to be fixed */
+	BUG_ON(ti->preempt_count || !irqs_disabled());
+
+need_resched:
+	add_preempt_count(PREEMPT_ACTIVE);
+	/*
+	 * We keep the big kernel semaphore locked, but we
+	 * clear ->lock_depth so that schedule() doesn't
+	 * auto-release the semaphore:
+	 */
+#ifdef CONFIG_PREEMPT_BKL
+	saved_lock_depth = task->lock_depth;
+	task->lock_depth = -1;
+#endif
+	local_irq_enable();
+	schedule();
+	local_irq_disable();
+#ifdef CONFIG_PREEMPT_BKL
+	task->lock_depth = saved_lock_depth;
+#endif
+	sub_preempt_count(PREEMPT_ACTIVE);
+
+	/* we could miss a preemption opportunity between schedule and now */
+	barrier();
+	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+		goto need_resched;
+}
+
+#endif /* CONFIG_PREEMPT */
+
+int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
+			  void *key)
+{
+	task_t *p = curr->private;
+	return try_to_wake_up(p, mode, sync);
+}
+
+EXPORT_SYMBOL(default_wake_function);
+
+/*
+ * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
+ * number) then we wake all the non-exclusive tasks and one exclusive task.
+ *
+ * There are circumstances in which we can try to wake a task which has already
+ * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
+ */
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+			     int nr_exclusive, int sync, void *key)
+{
+	struct list_head *tmp, *next;
+
+	list_for_each_safe(tmp, next, &q->task_list) {
+		wait_queue_t *curr;
+		unsigned flags;
+		curr = list_entry(tmp, wait_queue_t, task_list);
+		flags = curr->flags;
+		if (curr->func(curr, mode, sync, key) &&
+		    (flags & WQ_FLAG_EXCLUSIVE) &&
+		    !--nr_exclusive)
+			break;
+	}
+}
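+/*
+ * E.g. with nr_exclusive == 1 and a queue holding two non-exclusive
+ * waiters followed by two exclusive ones, the loop above wakes both
+ * non-exclusive waiters plus the first exclusive waiter, then stops;
+ * exclusive waiters are queued at the tail, so they are scanned last.
+ */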
+
+/**
+ * __wake_up - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: is directly passed to the wakeup function
+ */
+void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
+			int nr_exclusive, void *key)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&q->lock, flags);
+	__wake_up_common(q, mode, nr_exclusive, 0, key);
+	spin_unlock_irqrestore(&q->lock, flags);
+}
+
+EXPORT_SYMBOL(__wake_up);
+
+/*
+ * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
+ */
+void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
+{
+	__wake_up_common(q, mode, 1, 0, NULL);
+}
+
+/**
+ * __wake_up_sync - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ *
+ * The sync wakeup differs in that the waker knows that it will schedule
+ * away soon, so while the target thread will be woken up, it will not
+ * be migrated to another CPU - i.e. the two threads are 'synchronized'
+ * with each other. This can prevent needless bouncing between CPUs.
+ *
+ * On UP it can prevent extra preemption.
+ */
+void fastcall
+__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+{
+	unsigned long flags;
+	int sync = 1;
+
+	if (unlikely(!q))
+		return;
+
+	if (unlikely(!nr_exclusive))
+		sync = 0;
+
+	spin_lock_irqsave(&q->lock, flags);
+	__wake_up_common(q, mode, nr_exclusive, sync, NULL);
+	spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
+
+void fastcall complete(struct completion *x)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&x->wait.lock, flags);
+	x->done++;
+	__wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
+			 1, 0, NULL);
+	spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete);
+
+void fastcall complete_all(struct completion *x)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&x->wait.lock, flags);
+	x->done += UINT_MAX/2;
+	__wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
+			 0, 0, NULL);
+	spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete_all);
+
+void fastcall __sched wait_for_completion(struct completion *x)
+{
+	might_sleep();
+	spin_lock_irq(&x->wait.lock);
+	if (!x->done) {
+		DECLARE_WAITQUEUE(wait, current);
+
+		wait.flags |= WQ_FLAG_EXCLUSIVE;
+		__add_wait_queue_tail(&x->wait, &wait);
+		do {
+			__set_current_state(TASK_UNINTERRUPTIBLE);
+			spin_unlock_irq(&x->wait.lock);
+			schedule();
+			spin_lock_irq(&x->wait.lock);
+		} while (!x->done);
+		__remove_wait_queue(&x->wait, &wait);
+	}
+	x->done--;
+	spin_unlock_irq(&x->wait.lock);
+}
+EXPORT_SYMBOL(wait_for_completion);
+
+unsigned long fastcall __sched
+wait_for_completion_timeout(struct completion *x, unsigned long timeout)
+{
+	might_sleep();
+
+	spin_lock_irq(&x->wait.lock);
+	if (!x->done) {
+		DECLARE_WAITQUEUE(wait, current);
+
+		wait.flags |= WQ_FLAG_EXCLUSIVE;
+		__add_wait_queue_tail(&x->wait, &wait);
+		do {
+			__set_current_state(TASK_UNINTERRUPTIBLE);
+			spin_unlock_irq(&x->wait.lock);
+			timeout = schedule_timeout(timeout);
+			spin_lock_irq(&x->wait.lock);
+			if (!timeout) {
+				__remove_wait_queue(&x->wait, &wait);
+				goto out;
+			}
+		} while (!x->done);
+		__remove_wait_queue(&x->wait, &wait);
+	}
+	x->done--;
+out:
+	spin_unlock_irq(&x->wait.lock);
+	return timeout;
+}
+EXPORT_SYMBOL(wait_for_completion_timeout);
+
+int fastcall __sched wait_for_completion_interruptible(struct completion *x)
+{
+	int ret = 0;
+
+	might_sleep();
+
+	spin_lock_irq(&x->wait.lock);
+	if (!x->done) {
+		DECLARE_WAITQUEUE(wait, current);
+
+		wait.flags |= WQ_FLAG_EXCLUSIVE;
+		__add_wait_queue_tail(&x->wait, &wait);
+		do {
+			if (signal_pending(current)) {
+				ret = -ERESTARTSYS;
+				__remove_wait_queue(&x->wait, &wait);
+				goto out;
+			}
+			__set_current_state(TASK_INTERRUPTIBLE);
+			spin_unlock_irq(&x->wait.lock);
+			schedule();
+			spin_lock_irq(&x->wait.lock);
+		} while (!x->done);
+		__remove_wait_queue(&x->wait, &wait);
+	}
+	x->done--;
+out:
+	spin_unlock_irq(&x->wait.lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible);
+
+unsigned long fastcall __sched
+wait_for_completion_interruptible_timeout(struct completion *x,
+					  unsigned long timeout)
+{
+	might_sleep();
+
+	spin_lock_irq(&x->wait.lock);
+	if (!x->done) {
+		DECLARE_WAITQUEUE(wait, current);
+
+		wait.flags |= WQ_FLAG_EXCLUSIVE;
+		__add_wait_queue_tail(&x->wait, &wait);
+		do {
+			if (signal_pending(current)) {
+				timeout = -ERESTARTSYS;
+				__remove_wait_queue(&x->wait, &wait);
+				goto out;
+			}
+			__set_current_state(TASK_INTERRUPTIBLE);
+			spin_unlock_irq(&x->wait.lock);
+			timeout = schedule_timeout(timeout);
+			spin_lock_irq(&x->wait.lock);
+			if (!timeout) {
+				__remove_wait_queue(&x->wait, &wait);
+				goto out;
+			}
+		} while (!x->done);
+		__remove_wait_queue(&x->wait, &wait);
+	}
+	x->done--;
+out:
+	spin_unlock_irq(&x->wait.lock);
+	return timeout;
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
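+/*
+ * Illustrative pairing of the completion API above (a sketch only;
+ * 'setup_done' is a hypothetical example):
+ *
+ *	DECLARE_COMPLETION(setup_done);
+ *
+ * One thread blocks in wait_for_completion(&setup_done) until another
+ * thread signals it with complete(&setup_done).
+ */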
+
+
+#define	SLEEP_ON_VAR					\
+	unsigned long flags;				\
+	wait_queue_t wait;				\
+	init_waitqueue_entry(&wait, current);
+
+#define SLEEP_ON_HEAD					\
+	spin_lock_irqsave(&q->lock,flags);		\
+	__add_wait_queue(q, &wait);			\
+	spin_unlock(&q->lock);
+
+#define	SLEEP_ON_TAIL					\
+	spin_lock_irq(&q->lock);			\
+	__remove_wait_queue(q, &wait);			\
+	spin_unlock_irqrestore(&q->lock, flags);
+
+void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
+{
+	SLEEP_ON_VAR
+
+	current->state = TASK_INTERRUPTIBLE;
+
+	SLEEP_ON_HEAD
+	schedule();
+	SLEEP_ON_TAIL
+}
+
+EXPORT_SYMBOL(interruptible_sleep_on);
+
+long fastcall __sched
+interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
+{
+	SLEEP_ON_VAR
+
+	current->state = TASK_INTERRUPTIBLE;
+
+	SLEEP_ON_HEAD
+	timeout = schedule_timeout(timeout);
+	SLEEP_ON_TAIL
+
+	return timeout;
+}
+
+EXPORT_SYMBOL(interruptible_sleep_on_timeout);
+
+void fastcall __sched sleep_on(wait_queue_head_t *q)
+{
+	SLEEP_ON_VAR
+
+	current->state = TASK_UNINTERRUPTIBLE;
+
+	SLEEP_ON_HEAD
+	schedule();
+	SLEEP_ON_TAIL
+}
+
+EXPORT_SYMBOL(sleep_on);
+
+long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
+{
+	SLEEP_ON_VAR
+
+	current->state = TASK_UNINTERRUPTIBLE;
+
+	SLEEP_ON_HEAD
+	timeout = schedule_timeout(timeout);
+	SLEEP_ON_TAIL
+
+	return timeout;
+}
+
+EXPORT_SYMBOL(sleep_on_timeout);
+
+void set_user_nice(task_t *p, long nice)
+{
+	unsigned long flags;
+	prio_array_t *array;
+	runqueue_t *rq;
+	int old_prio, new_prio, delta;
+
+	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
+		return;
+	/*
+	 * We have to be careful: if called from sys_setpriority(),
+	 * the task might be in the middle of scheduling on another CPU.
+	 */
+	rq = task_rq_lock(p, &flags);
+	/*
+	 * The RT priorities are set via sched_setscheduler(), but we still
+	 * allow the 'normal' nice value to be set - but as expected
+	 * it won't have any effect on scheduling until the task
+	 * becomes SCHED_NORMAL/SCHED_BATCH again:
+	 */
+	if (rt_task(p)) {
+		p->static_prio = NICE_TO_PRIO(nice);
+		goto out_unlock;
+	}
+	array = p->array;
+	if (array) {
+		dequeue_task(p, array);
+		dec_raw_weighted_load(rq, p);
+	}
+
+	old_prio = p->prio;
+	new_prio = NICE_TO_PRIO(nice);
+	delta = new_prio - old_prio;
+	p->static_prio = NICE_TO_PRIO(nice);
+	set_load_weight(p);
+	p->prio += delta;
+
+	if (array) {
+		enqueue_task(p, array);
+		inc_raw_weighted_load(rq, p);
+		/*
+		 * If the task increased its priority or is running and
+		 * lowered its priority, then reschedule its CPU:
+		 */
+		if (delta < 0 || (delta > 0 && task_running(rq, p)))
+			resched_task(rq->curr);
+	}
+out_unlock:
+	task_rq_unlock(rq, &flags);
+}
+
+EXPORT_SYMBOL(set_user_nice);
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const task_t *p, const int nice)
+{
+	/* convert nice value [19,-20] to rlimit style value [1,40] */
+	int nice_rlim = 20 - nice;
+	return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
+		capable(CAP_SYS_NICE));
+}
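+/*
+ * E.g. raising priority to nice -5 maps to a nice_rlim of 25, so an
+ * unprivileged task may do it only if RLIMIT_NICE is at least 25 (or
+ * it has CAP_SYS_NICE).
+ */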
+
+struct task_struct *kgdb_get_idle(int this_cpu)
+{
+	return cpu_rq(this_cpu)->idle;
+}
+
+#ifdef __ARCH_WANT_SYS_NICE
+
+/*
+ * sys_nice - change the priority of the current process.
+ * @increment: priority increment
+ *
+ * sys_setpriority is a more generic, but much slower function that
+ * does similar things.
+ */
+asmlinkage long sys_nice(int increment)
+{
+	int retval;
+	long nice;
+
+	/*
+	 * Setpriority might change our priority at the same moment.
+	 * We don't have to worry. Conceptually one call occurs first
+	 * and we have a single winner.
+	 */
+	if (increment < -40)
+		increment = -40;
+	if (increment > 40)
+		increment = 40;
+
+	nice = PRIO_TO_NICE(current->static_prio) + increment;
+	if (nice < -20)
+		nice = -20;
+	if (nice > 19)
+		nice = 19;
+
+	if (increment < 0 && !can_nice(current, nice))
+		return -EPERM;
+
+	retval = security_task_setnice(current, nice);
+	if (retval)
+		return retval;
+
+	set_user_nice(current, nice);
+	return 0;
+}
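+/*
+ * E.g. (assuming default rlimits): an unprivileged task at nice 0
+ * calling sys_nice(5) ends up at nice 5; a later sys_nice(-5) is then
+ * rejected with -EPERM because can_nice() refuses to lower the value.
+ */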
+
+#endif
+
+/**
+ * task_prio - return the priority value of a given task.
+ * @p: the task in question.
+ *
+ * This is the priority value as seen by users in /proc.
+ * RT tasks are offset by -200. Normal tasks are centered
+ * around 0, value goes from -16 to +15.
+ */
+int task_prio(const task_t *p)
+{
+	return p->prio - MAX_RT_PRIO;
+}
+
+/**
+ * task_nice - return the nice value of a given task.
+ * @p: the task in question.
+ */
+int task_nice(const task_t *p)
+{
+	return TASK_NICE(p);
+}
+EXPORT_SYMBOL_GPL(task_nice);
+
+/**
+ * idle_cpu - is a given cpu idle currently?
+ * @cpu: the processor in question.
+ */
+int idle_cpu(int cpu)
+{
+	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
+}
+
+/**
+ * idle_task - return the idle task for a given cpu.
+ * @cpu: the processor in question.
+ */
+task_t *idle_task(int cpu)
+{
+	return cpu_rq(cpu)->idle;
+}
+
+/**
+ * find_process_by_pid - find a process with a matching PID value.
+ * @pid: the pid in question.
+ */
+static inline task_t *find_process_by_pid(pid_t pid)
+{
+	return pid ? find_task_by_pid(pid) : current;
+}
+
+/* Actually do priority change: must hold rq lock. */
+static void __setscheduler(struct task_struct *p, int policy, int prio)
+{
+	BUG_ON(p->array);
+	p->policy = policy;
+	p->rt_priority = prio;
+	if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
+		p->prio = MAX_RT_PRIO-1 - p->rt_priority;
+	} else {
+		p->prio = p->static_prio;
+		/*
+		 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
+		 */
+		if (policy == SCHED_BATCH)
+			p->sleep_avg = 0;
+	}
+	set_load_weight(p);
+}
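+/*
+ * E.g. (with MAX_RT_PRIO == 100): SCHED_FIFO with rt_priority 50 maps
+ * to p->prio == 99 - 50 == 49, while SCHED_NORMAL/SCHED_BATCH simply
+ * inherit the nice-derived static_prio.
+ */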
+
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of
+ * a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+		       struct sched_param *param)
+{
+	int retval;
+	int oldprio, oldpolicy = -1;
+	prio_array_t *array;
+	unsigned long flags;
+	runqueue_t *rq;
+
+recheck:
+	/* double check policy once rq lock held */
+	if (policy < 0)
+		policy = oldpolicy = p->policy;
+	else if (policy != SCHED_FIFO && policy != SCHED_RR &&
+			policy != SCHED_NORMAL && policy != SCHED_BATCH)
+		return -EINVAL;
+	/*
+	 * Valid priorities for SCHED_FIFO and SCHED_RR are
+	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
+	 * SCHED_BATCH is 0.
+	 */
+	if (param->sched_priority < 0 ||
+	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
+	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
+		return -EINVAL;
+	if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
+					!= (param->sched_priority == 0))
+		return -EINVAL;
+
+	/*
+	 * Allow unprivileged RT tasks to decrease priority:
+	 */
+	if (!capable(CAP_SYS_NICE)) {
+		/*
+		 * can't change policy, except between SCHED_NORMAL
+		 * and SCHED_BATCH:
+		 */
+		if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
+			(policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
+				!p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
+			return -EPERM;
+		/* can't increase priority */
+		if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
+		    param->sched_priority > p->rt_priority &&
+		    param->sched_priority >
+				p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
+			return -EPERM;
+		/* can't change other user's priorities */
+		if ((current->euid != p->euid) &&
+		    (current->euid != p->uid))
+			return -EPERM;
+	}
+
+	retval = security_task_setscheduler(p, policy, param);
+	if (retval)
+		return retval;
+	/*
+	 * To be able to change p->policy safely, the appropriate
+	 * runqueue lock must be held.
+	 */
+	rq = task_rq_lock(p, &flags);
+	/* recheck policy now with rq lock held */
+	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
+		policy = oldpolicy = -1;
+		task_rq_unlock(rq, &flags);
+		goto recheck;
+	}
+	array = p->array;
+	if (array)
+		deactivate_task(p, rq);
+	oldprio = p->prio;
+	__setscheduler(p, policy, param->sched_priority);
+	if (array) {
+		__activate_task(p, rq);
+		/*
+		 * Reschedule if we are currently running on this runqueue and
+		 * our priority decreased, or if we are not currently running on
+		 * this runqueue and our priority is higher than the current's
+		 */
+		if (task_running(rq, p)) {
+			if (p->prio > oldprio)
+				resched_task(rq->curr);
+		} else if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+	}
+	task_rq_unlock(rq, &flags);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sched_setscheduler);
+
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+{
+	int retval;
+	struct sched_param lparam;
+	struct task_struct *p;
+
+	if (!param || pid < 0)
+		return -EINVAL;
+	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
+		return -EFAULT;
+	read_lock_irq(&tasklist_lock);
+	p = find_process_by_pid(pid);
+	if (!p) {
+		read_unlock_irq(&tasklist_lock);
+		return -ESRCH;
+	}
+	retval = sched_setscheduler(p, policy, &lparam);
+	read_unlock_irq(&tasklist_lock);
+	return retval;
+}
+
+/**
+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
+ * @pid: the pid in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ */
+asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
+				       struct sched_param __user *param)
+{
+	/* negative values for policy are not valid */
+	if (policy < 0)
+		return -EINVAL;
+
+	return do_sched_setscheduler(pid, policy, param);
+}
+
+/**
+ * sys_sched_setparam - set/change the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the new RT priority.
+ */
+asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
+{
+	return do_sched_setscheduler(pid, -1, param);
+}
+
+/**
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
+ * @pid: the pid in question.
+ */
+asmlinkage long sys_sched_getscheduler(pid_t pid)
+{
+	int retval = -EINVAL;
+	task_t *p;
+
+	if (pid < 0)
+		goto out_nounlock;
+
+	retval = -ESRCH;
+	read_lock(&tasklist_lock);
+	p = find_process_by_pid(pid);
+	if (p) {
+		retval = security_task_getscheduler(p);
+		if (!retval)
+			retval = p->policy;
+	}
+	read_unlock(&tasklist_lock);
+
+out_nounlock:
+	return retval;
+}
+
+/**
+ * sys_sched_getparam - get the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the RT priority.
+ */
+asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
+{
+	struct sched_param lp;
+	int retval = -EINVAL;
+	task_t *p;
+
+	if (!param || pid < 0)
+		goto out_nounlock;
+
+	read_lock(&tasklist_lock);
+	p = find_process_by_pid(pid);
+	retval = -ESRCH;
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	lp.sched_priority = p->rt_priority;
+	read_unlock(&tasklist_lock);
+
+	/*
+	 * This one might sleep; we cannot do it with a spinlock held ...
+	 */
+	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+
+out_nounlock:
+	return retval;
+
+out_unlock:
+	read_unlock(&tasklist_lock);
+	return retval;
+}
+
+long sched_setaffinity(pid_t pid, cpumask_t new_mask)
+{
+	task_t *p;
+	int retval;
+	cpumask_t cpus_allowed;
+
+	lock_cpu_hotplug();
+	read_lock(&tasklist_lock);
+
+	p = find_process_by_pid(pid);
+	if (!p) {
+		read_unlock(&tasklist_lock);
+		unlock_cpu_hotplug();
+		return -ESRCH;
+	}
+
+	/*
+	 * It is not safe to call set_cpus_allowed with the
+	 * tasklist_lock held.  We will bump the task_struct's
+	 * usage count and then drop tasklist_lock.
+	 */
+	get_task_struct(p);
+	read_unlock(&tasklist_lock);
+
+	retval = -EPERM;
+	if ((current->euid != p->euid) && (current->euid != p->uid) &&
+			!capable(CAP_SYS_NICE))
+		goto out_unlock;
+
+	cpus_allowed = cpuset_cpus_allowed(p);
+	cpus_and(new_mask, new_mask, cpus_allowed);
+	retval = set_cpus_allowed(p, new_mask);
+
+out_unlock:
+	put_task_struct(p);
+	unlock_cpu_hotplug();
+	return retval;
+}
+
+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
+			     cpumask_t *new_mask)
+{
+	if (len < sizeof(cpumask_t)) {
+		memset(new_mask, 0, sizeof(cpumask_t));
+	} else if (len > sizeof(cpumask_t)) {
+		len = sizeof(cpumask_t);
+	}
+	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+}
+
+/**
+ * sys_sched_setaffinity - set the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to the new cpu mask
+ */
+asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
+				      unsigned long __user *user_mask_ptr)
+{
+	cpumask_t new_mask;
+	int retval;
+
+	retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
+	if (retval)
+		return retval;
+
+	return sched_setaffinity(pid, new_mask);
+}
+
+/*
+ * Represents all CPUs present in the system.
+ * In systems capable of hotplug, this map could dynamically grow
+ * as new CPUs are detected in the system via any platform-specific
+ * method, such as ACPI.
+ */
+
+cpumask_t cpu_present_map __read_mostly;
+EXPORT_SYMBOL(cpu_present_map);
+
+#ifndef CONFIG_SMP
+cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
+cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
+#endif
+
+long sched_getaffinity(pid_t pid, cpumask_t *mask)
+{
+	int retval;
+	task_t *p;
+
+	lock_cpu_hotplug();
+	read_lock(&tasklist_lock);
+
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (!p)
+		goto out_unlock;
+
+	retval = 0;
+	cpus_and(*mask, p->cpus_allowed, cpu_online_map);
+
+out_unlock:
+	read_unlock(&tasklist_lock);
+	unlock_cpu_hotplug();
+	return retval;
+}
+
+/**
+ * sys_sched_getaffinity - get the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to hold the current cpu mask
+ */
+asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
+				      unsigned long __user *user_mask_ptr)
+{
+	int ret;
+	cpumask_t mask;
+
+	if (len < sizeof(cpumask_t))
+		return -EINVAL;
+
+	ret = sched_getaffinity(pid, &mask);
+	if (ret < 0)
+		return ret;
+
+	if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
+		return -EFAULT;
+
+	return sizeof(cpumask_t);
+}
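+/*
+ * Illustrative raw userspace call (a sketch, not part of this file;
+ * the buffer size is an assumption - it merely needs to be at least
+ * sizeof(cpumask_t) bytes):
+ *
+ *	unsigned long mask[128];
+ *	long ret = syscall(__NR_sched_getaffinity, pid,
+ *			   sizeof(mask), mask);
+ *
+ * On success ret is sizeof(cpumask_t), not the number of cpus.
+ */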
+
+/**
+ * sys_sched_yield - yield the current processor to other threads.
+ *
+ * this function yields the current CPU by moving the calling thread
+ * to the expired array. If there are no other threads running on this
+ * CPU then this function will return.
+ */
+asmlinkage long sys_sched_yield(void)
+{
+	runqueue_t *rq = this_rq_lock();
+	prio_array_t *array = current->array;
+	prio_array_t *target = rq->expired;
+
+	schedstat_inc(rq, yld_cnt);
+	/*
+	 * We implement yielding by moving the task into the expired
+	 * queue.
+	 *
+	 * (special rule: RT tasks will just roundrobin in the active
+	 *  array.)
+	 */
+	if (rt_task(current))
+		target = rq->active;
+
+	if (array->nr_active == 1) {
+		schedstat_inc(rq, yld_act_empty);
+		if (!rq->expired->nr_active)
+			schedstat_inc(rq, yld_both_empty);
+	} else if (!rq->expired->nr_active)
+		schedstat_inc(rq, yld_exp_empty);
+
+	if (array != target) {
+		dequeue_task(current, array);
+		enqueue_task(current, target);
+	} else
+		/*
+		 * requeue_task is cheaper so perform that if possible.
+		 */
+		requeue_task(current, array);
+
+	/*
+	 * Since we are going to call schedule() anyway, there's
+	 * no need to preempt or enable interrupts:
+	 */
+	__release(rq->lock);
+	_raw_spin_unlock(&rq->lock);
+	preempt_enable_no_resched();
+
+	schedule();
+
+	return 0;
+}
+
+static inline void __cond_resched(void)
+{
+	/*
+	 * The BKS might be reacquired before we have dropped
+	 * PREEMPT_ACTIVE, which could trigger a second
+	 * cond_resched() call.
+	 */
+	if (unlikely(preempt_count()))
+		return;
+	if (unlikely(system_state != SYSTEM_RUNNING))
+		return;
+	do {
+		add_preempt_count(PREEMPT_ACTIVE);
+		schedule();
+		sub_preempt_count(PREEMPT_ACTIVE);
+	} while (need_resched());
+}
+
+int __sched cond_resched(void)
+{
+	if (need_resched()) {
+		__cond_resched();
+		return 1;
+	}
+	return 0;
+}
+
+EXPORT_SYMBOL(cond_resched);
+
+/*
+ * cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
+ *
+ * This works OK both with and without CONFIG_PREEMPT.  We do strange low-level
+ * operations here to prevent schedule() from being called twice (once via
+ * spin_unlock(), once by hand).
+ */
+int cond_resched_lock(spinlock_t *lock)
+{
+	int ret = 0;
+
+	if (need_lockbreak(lock)) {
+		spin_unlock(lock);
+		cpu_relax();
+		ret = 1;
+		spin_lock(lock);
+	}
+	if (need_resched()) {
+		_raw_spin_unlock(lock);
+		preempt_enable_no_resched();
+		__cond_resched();
+		ret = 1;
+		spin_lock(lock);
+	}
+	return ret;
+}
+
+EXPORT_SYMBOL(cond_resched_lock);
+
+int __sched cond_resched_softirq(void)
+{
+	BUG_ON(!in_softirq());
+
+	if (need_resched()) {
+		__local_bh_enable();
+		__cond_resched();
+		local_bh_disable();
+		return 1;
+	}
+	return 0;
+}
+
+EXPORT_SYMBOL(cond_resched_softirq);
+
+
+/**
+ * yield - yield the current processor to other threads.
+ *
+ * this is a shortcut for kernel-space yielding - it marks the
+ * thread runnable and calls sys_sched_yield().
+ */
+void __sched yield(void)
+{
+	set_current_state(TASK_RUNNING);
+	sys_sched_yield();
+}
+
+EXPORT_SYMBOL(yield);
+
+/*
+ * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
+ * that process accounting knows that this is a task in IO wait state.
+ *
+ * But don't do that if it is a deliberate, throttling IO wait (this task
+ * has set its backing_dev_info: the queue against which it should throttle)
+ */
+void __sched io_schedule(void)
+{
+	struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
+
+	atomic_inc(&rq->nr_iowait);
+	schedule();
+	atomic_dec(&rq->nr_iowait);
+}
+
+EXPORT_SYMBOL(io_schedule);
+
+long __sched io_schedule_timeout(long timeout)
+{
+	struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
+	long ret;
+
+	atomic_inc(&rq->nr_iowait);
+	ret = schedule_timeout(timeout);
+	atomic_dec(&rq->nr_iowait);
+	return ret;
+}
+
+/**
+ * sys_sched_get_priority_max - return maximum RT priority.
+ * @policy: scheduling class.
+ *
+ * this syscall returns the maximum rt_priority that can be used
+ * by a given scheduling class.
+ */
+asmlinkage long sys_sched_get_priority_max(int policy)
+{
+	int ret = -EINVAL;
+
+	switch (policy) {
+	case SCHED_FIFO:
+	case SCHED_RR:
+		ret = MAX_USER_RT_PRIO-1;
+		break;
+	case SCHED_NORMAL:
+	case SCHED_BATCH:
+		ret = 0;
+		break;
+	}
+	return ret;
+}
+
+/**
+ * sys_sched_get_priority_min - return minimum RT priority.
+ * @policy: scheduling class.
+ *
+ * this syscall returns the minimum rt_priority that can be used
+ * by a given scheduling class.
+ */
+asmlinkage long sys_sched_get_priority_min(int policy)
+{
+	int ret = -EINVAL;
+
+	switch (policy) {
+	case SCHED_FIFO:
+	case SCHED_RR:
+		ret = 1;
+		break;
+	case SCHED_NORMAL:
+	case SCHED_BATCH:
+		ret = 0;
+	}
+	return ret;
+}
+
+/**
+ * sys_sched_rr_get_interval - return the default timeslice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the timeslice value.
+ *
+ * this syscall writes the default timeslice value of a given process
+ * into the user-space timespec buffer. A value of '0' means infinity.
+ */
+asmlinkage
+long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
+{
+	int retval = -EINVAL;
+	struct timespec t;
+	task_t *p;
+
+	if (pid < 0)
+		goto out_nounlock;
+
+	retval = -ESRCH;
+	read_lock(&tasklist_lock);
+	p = find_process_by_pid(pid);
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	jiffies_to_timespec(p->policy == SCHED_FIFO ?
+				0 : task_timeslice(p), &t);
+	read_unlock(&tasklist_lock);
+	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
+out_nounlock:
+	return retval;
+out_unlock:
+	read_unlock(&tasklist_lock);
+	return retval;
+}
+
+static inline struct task_struct *eldest_child(struct task_struct *p)
+{
+	if (list_empty(&p->children)) return NULL;
+	return list_entry(p->children.next,struct task_struct,sibling);
+}
+
+static inline struct task_struct *older_sibling(struct task_struct *p)
+{
+	if (p->sibling.prev==&p->parent->children) return NULL;
+	return list_entry(p->sibling.prev,struct task_struct,sibling);
+}
+
+static inline struct task_struct *younger_sibling(struct task_struct *p)
+{
+	if (p->sibling.next==&p->parent->children) return NULL;
+	return list_entry(p->sibling.next,struct task_struct,sibling);
+}
+
+static void show_task(task_t *p)
+{
+	task_t *relative;
+	unsigned state;
+	unsigned long free = 0;
+	static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" };
+
+	printk("%-13.13s ", p->comm);
+	state = p->state ? __ffs(p->state) + 1 : 0;
+	if (state < ARRAY_SIZE(stat_nam))
+		printk(stat_nam[state]);
+	else
+		printk("?");
+#if (BITS_PER_LONG == 32)
+	if (state == TASK_RUNNING)
+		printk(" running ");
+	else
+		printk(" %08lX ", thread_saved_pc(p));
+#else
+	if (state == TASK_RUNNING)
+		printk("  running task   ");
+	else
+		printk(" %016lx ", thread_saved_pc(p));
+#endif
+#ifdef CONFIG_DEBUG_STACK_USAGE
+	{
+		unsigned long *n = end_of_stack(p);
+		while (!*n)
+			n++;
+		free = (unsigned long)n - (unsigned long)end_of_stack(p);
+	}
+#endif
+	printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
+	if ((relative = eldest_child(p)))
+		printk("%5d ", relative->pid);
+	else
+		printk("      ");
+	if ((relative = younger_sibling(p)))
+		printk("%7d", relative->pid);
+	else
+		printk("       ");
+	if ((relative = older_sibling(p)))
+		printk(" %5d", relative->pid);
+	else
+		printk("      ");
+	if (!p->mm)
+		printk(" (L-TLB)\n");
+	else
+		printk(" (NOTLB)\n");
+
+	if (state != TASK_RUNNING)
+		show_stack(p, NULL);
+}
+
+void show_state(void)
+{
+	task_t *g, *p;
+
+#if (BITS_PER_LONG == 32)
+	printk("\n"
+	       "                                               sibling\n");
+	printk("  task             PC      pid father child younger older\n");
+#else
+	printk("\n"
+	       "                                                       sibling\n");
+	printk("  task                 PC          pid father child younger older\n");
+#endif
+	read_lock(&tasklist_lock);
+	do_each_thread(g, p) {
+		/*
+		 * reset the NMI-timeout; listing all tasks on a slow
+		 * console might take a lot of time:
+		 */
+		touch_nmi_watchdog();
+		show_task(p);
+	} while_each_thread(g, p);
+
+	read_unlock(&tasklist_lock);
+	mutex_debug_show_all_locks();
+}
+
+/**
+ * init_idle - set up an idle thread for a given CPU
+ * @idle: task in question
+ * @cpu: cpu the idle task belongs to
+ *
+ * NOTE: this function does not set the idle thread's NEED_RESCHED
+ * flag, to make booting more robust.
+ */
+void __devinit init_idle(task_t *idle, int cpu)
+{
+	runqueue_t *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	idle->timestamp = sched_clock();
+	idle->sleep_avg = 0;
+	idle->array = NULL;
+	idle->prio = MAX_PRIO;
+	idle->state = TASK_RUNNING;
+	idle->cpus_allowed = cpumask_of_cpu(cpu);
+	set_task_cpu(idle, cpu);
+
+	spin_lock_irqsave(&rq->lock, flags);
+	rq->curr = rq->idle = idle;
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+	idle->oncpu = 1;
+#endif
+	spin_unlock_irqrestore(&rq->lock, flags);
+
+	/* Set the preempt count _outside_ the spinlocks! */
+#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
+	task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
+#else
+	task_thread_info(idle)->preempt_count = 0;
+#endif
+}
+
+/*
+ * In a system that switches off the HZ timer nohz_cpu_mask
+ * indicates which cpus entered this state. This is used
+ * in the rcu update to wait only for active cpus. For systems
+ * which do not switch off the HZ timer, nohz_cpu_mask should
+ * always be CPU_MASK_NONE.
+ */
+cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
+
+#ifdef CONFIG_SMP
+/*
+ * This is how migration works:
+ *
+ * 1) we queue a migration_req_t structure in the source CPU's
+ *    runqueue and wake up that CPU's migration thread.
+ * 2) we wait on the request's completion => thread blocks.
+ * 3) migration thread wakes up (implicitly it forces the migrated
+ *    thread off the CPU)
+ * 4) it gets the migration request and checks whether the migrated
+ *    task is still in the wrong runqueue.
+ * 5) if it's in the wrong runqueue then the migration thread removes
+ *    it and puts it into the right queue.
+ * 6) migration thread completes the request.
+ * 7) we wake up and the migration is done.
+ */
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely.  The
+ * call is not atomic; no spinlocks may be held.
+ */
+int set_cpus_allowed(task_t *p, cpumask_t new_mask)
+{
+	unsigned long flags;
+	int ret = 0;
+	migration_req_t req;
+	runqueue_t *rq;
+
+	rq = task_rq_lock(p, &flags);
+	if (!cpus_intersects(new_mask, cpu_online_map)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	p->cpus_allowed = new_mask;
+	/* Can the task run on the task's current CPU? If so, we're done */
+	if (cpu_isset(task_cpu(p), new_mask))
+		goto out;
+
+	if (migrate_task(p, any_online_cpu(new_mask), &req)) {
+		/* Need help from migration thread: drop lock and wait. */
+		task_rq_unlock(rq, &flags);
+		wake_up_process(rq->migration_thread);
+		wait_for_completion(&req.done);
+		tlb_migrate_finish(p->mm);
+		return 0;
+	}
+out:
+	task_rq_unlock(rq, &flags);
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(set_cpus_allowed);
+
+/*
+ * Move a (non-current) task off this cpu, onto the dest cpu.  We're doing
+ * this because either it can't run here any more (set_cpus_allowed()
+ * away from this CPU, or CPU going down), or because we're
+ * attempting to rebalance this task on exec (sched_exec).
+ *
+ * So we race with normal scheduler movements, but that's OK, as long
+ * as the task is no longer on this CPU.
+ */
+static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
+{
+	runqueue_t *rq_dest, *rq_src;
+
+	if (unlikely(cpu_is_offline(dest_cpu)))
+		return;
+
+	rq_src = cpu_rq(src_cpu);
+	rq_dest = cpu_rq(dest_cpu);
+
+	double_rq_lock(rq_src, rq_dest);
+	/* Already moved. */
+	if (task_cpu(p) != src_cpu)
+		goto out;
+	/* Affinity changed (again). */
+	if (!cpu_isset(dest_cpu, p->cpus_allowed))
+		goto out;
+
+	set_task_cpu(p, dest_cpu);
+	if (p->array) {
+		/*
+		 * Sync timestamp with rq_dest's before activating.
+		 * The same thing could be achieved by doing this step
+		 * afterwards, and pretending it was a local activate.
+		 * This way is cleaner and logically correct.
+		 */
+		p->timestamp = p->timestamp - rq_src->timestamp_last_tick
+				+ rq_dest->timestamp_last_tick;
+		deactivate_task(p, rq_src);
+		activate_task(p, rq_dest, 0);
+		if (TASK_PREEMPTS_CURR(p, rq_dest))
+			resched_task(rq_dest->curr);
+	}
+
+out:
+	double_rq_unlock(rq_src, rq_dest);
+}
+
+/*
+ * migration_thread - this is a highprio system thread that performs
+ * thread migration by bumping thread off CPU then 'pushing' onto
+ * another runqueue.
+ */
+static int migration_thread(void *data)
+{
+	runqueue_t *rq;
+	int cpu = (long)data;
+
+	rq = cpu_rq(cpu);
+	BUG_ON(rq->migration_thread != current);
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop()) {
+		struct list_head *head;
+		migration_req_t *req;
+
+		try_to_freeze();
+
+		spin_lock_irq(&rq->lock);
+
+		if (cpu_is_offline(cpu)) {
+			spin_unlock_irq(&rq->lock);
+			goto wait_to_die;
+		}
+
+		if (rq->active_balance) {
+			active_load_balance(rq, cpu);
+			rq->active_balance = 0;
+		}
+
+		head = &rq->migration_queue;
+
+		if (list_empty(head)) {
+			spin_unlock_irq(&rq->lock);
+			schedule();
+			set_current_state(TASK_INTERRUPTIBLE);
+			continue;
+		}
+		req = list_entry(head->next, migration_req_t, list);
+		list_del_init(head->next);
+
+		spin_unlock(&rq->lock);
+		__migrate_task(req->task, cpu, req->dest_cpu);
+		local_irq_enable();
+
+		complete(&req->done);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+
+wait_to_die:
+	/* Wait for kthread_stop */
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop()) {
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/* Figure out where a task on a dead CPU should go; use force if necessary. */
+static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
+{
+	int dest_cpu;
+	cpumask_t mask;
+
+	/* On same node? */
+	mask = node_to_cpumask(cpu_to_node(dead_cpu));
+	cpus_and(mask, mask, tsk->cpus_allowed);
+	dest_cpu = any_online_cpu(mask);
+
+	/* On any allowed CPU? */
+	if (dest_cpu == NR_CPUS)
+		dest_cpu = any_online_cpu(tsk->cpus_allowed);
+
+	/* No more Mr. Nice Guy. */
+	if (dest_cpu == NR_CPUS) {
+		cpus_setall(tsk->cpus_allowed);
+		dest_cpu = any_online_cpu(tsk->cpus_allowed);
+
+		/*
+		 * Don't tell them about moving exiting tasks or
+		 * kernel threads (both mm NULL), since they never
+		 * leave kernel.
+		 */
+		if (tsk->mm && printk_ratelimit())
+			printk(KERN_INFO "process %d (%s) no "
+			       "longer affine to cpu%d\n",
+			       tsk->pid, tsk->comm, dead_cpu);
+	}
+	__migrate_task(tsk, dead_cpu, dest_cpu);
+}
+
+/*
+ * While a dead CPU has no uninterruptible tasks queued at this point,
+ * it might still have a nonzero ->nr_uninterruptible counter, because
+ * for performance reasons the counter is not strictly tracking tasks to
+ * their home CPUs. So we just add the counter to another CPU's counter,
+ * to keep the global sum constant after CPU-down:
+ */
+static void migrate_nr_uninterruptible(runqueue_t *rq_src)
+{
+	runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
+	unsigned long flags;
+
+	local_irq_save(flags);
+	double_rq_lock(rq_src, rq_dest);
+	rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
+	rq_src->nr_uninterruptible = 0;
+	double_rq_unlock(rq_src, rq_dest);
+	local_irq_restore(flags);
+}
+
+/* Run through task list and migrate tasks from the dead cpu. */
+static void migrate_live_tasks(int src_cpu)
+{
+	struct task_struct *tsk, *t;
+
+	write_lock_irq(&tasklist_lock);
+
+	do_each_thread(t, tsk) {
+		if (tsk == current)
+			continue;
+
+		if (task_cpu(tsk) == src_cpu)
+			move_task_off_dead_cpu(src_cpu, tsk);
+	} while_each_thread(t, tsk);
+
+	write_unlock_irq(&tasklist_lock);
+}
+
+/* Schedules the idle task to be the next runnable task on the current CPU.
+ * It does so by boosting its priority to the highest possible and adding it
+ * to the _front_ of the runqueue. Used by the CPU offline code.
+ */
+void sched_idle_next(void)
+{
+	int cpu = smp_processor_id();
+	runqueue_t *rq = this_rq();
+	struct task_struct *p = rq->idle;
+	unsigned long flags;
+
+	/* cpu has to be offline */
+	BUG_ON(cpu_online(cpu));
+
+	/* Strictly not necessary, since the rest of the CPUs are stopped by now
+	 * and interrupts are disabled on the current cpu.
+	 */
+	spin_lock_irqsave(&rq->lock, flags);
+
+	__setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
+	/* Add the idle task to the _front_ of its priority queue */
+	__activate_idle_task(p, rq);
+
+	spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+/* Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
+ */
+void idle_task_exit(void)
+{
+	struct mm_struct *mm = current->active_mm;
+
+	BUG_ON(cpu_online(smp_processor_id()));
+
+	if (mm != &init_mm)
+		switch_mm(mm, &init_mm, current);
+	mmdrop(mm);
+}
+
+static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
+{
+	struct runqueue *rq = cpu_rq(dead_cpu);
+
+	/* Must be exiting, otherwise would be on tasklist. */
+	BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD);
+
+	/* Cannot have done final schedule yet: would have vanished. */
+	BUG_ON(tsk->flags & PF_DEAD);
+
+	get_task_struct(tsk);
+
+	/*
+	 * Drop lock around migration; if someone else moves it,
+	 * that's OK.  No task can be added to this CPU, so iteration is
+	 * fine.
+	 */
+	spin_unlock_irq(&rq->lock);
+	move_task_off_dead_cpu(dead_cpu, tsk);
+	spin_lock_irq(&rq->lock);
+
+	put_task_struct(tsk);
+}
+
+/* release_task() removes task from tasklist, so we won't find dead tasks. */
+static void migrate_dead_tasks(unsigned int dead_cpu)
+{
+	unsigned arr, i;
+	struct runqueue *rq = cpu_rq(dead_cpu);
+
+	for (arr = 0; arr < 2; arr++) {
+		for (i = 0; i < MAX_PRIO; i++) {
+			struct list_head *list = &rq->arrays[arr].queue[i];
+			while (!list_empty(list))
+				migrate_dead(dead_cpu,
+					     list_entry(list->next, task_t,
+							run_list));
+		}
+	}
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+#if defined(CONFIG_DEBUG_KERNEL) && defined(CONFIG_SYSCTL)
+static struct ctl_table sd_ctl_dir[] = {
+	{1, "sched_domain", NULL, 0, 0755, NULL, },
+	{0,},
+};
+
+static struct ctl_table sd_ctl_root[] = {
+	{1, "kernel", NULL, 0, 0755, sd_ctl_dir, },
+	{0,},
+};
+
+static char *sched_strdup(char *str)
+{
+	int n = strlen(str)+1;
+	char *s = kmalloc(n, GFP_KERNEL);
+	if (!s)
+		return NULL;
+	return strcpy(s, str);
+}
+
+static struct ctl_table *sd_alloc_ctl_entry(int n)
+{
+	struct ctl_table *entry =
+		kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL);
+	BUG_ON(!entry);
+	memset(entry, 0, n * sizeof(struct ctl_table));
+	return entry;
+}
+
+static void set_table_entry(struct ctl_table *entry, int ctl_name,
+			const char *procname, void *data, int maxlen,
+			mode_t mode, proc_handler *proc_handler)
+{
+	entry->ctl_name = ctl_name;
+	entry->procname = procname;
+	entry->data = data;
+	entry->maxlen = maxlen;
+	entry->mode = mode;
+	entry->proc_handler = proc_handler;
+}
+
+static struct ctl_table *
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
+{
+	struct ctl_table *table;
+	table = sd_alloc_ctl_entry(14);
+
+	set_table_entry(&table[0], 1, "min_interval", &sd->min_interval,
+		sizeof(long), 0644, proc_doulongvec_minmax);
+	set_table_entry(&table[1], 2, "max_interval", &sd->max_interval,
+		sizeof(long), 0644, proc_doulongvec_minmax);
+	set_table_entry(&table[2], 3, "busy_idx", &sd->busy_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[3], 4, "idle_idx", &sd->idle_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[4], 5, "newidle_idx", &sd->newidle_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[5], 6, "wake_idx", &sd->wake_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[6], 7, "forkexec_idx", &sd->forkexec_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[7], 8, "busy_factor", &sd->busy_factor,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[8], 9, "imbalance_pct", &sd->imbalance_pct,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[9], 10, "cache_hot_time", &sd->cache_hot_time,
+		sizeof(long long), 0644, proc_doulongvec_minmax);
+	set_table_entry(&table[10], 11, "cache_nice_tries", &sd->cache_nice_tries,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[11], 12, "per_cpu_gain", &sd->per_cpu_gain,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[12], 13, "flags", &sd->flags,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	return table;
+}
+
+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+{
+	struct sched_domain *sd;
+	int domain_num = 0, i;
+	struct ctl_table *entry, *table;
+	char buf[32];
+	for_each_domain(cpu, sd)
+		domain_num++;
+	entry = table = sd_alloc_ctl_entry(domain_num + 1);
+
+	i = 0;
+	for_each_domain(cpu, sd) {
+		snprintf(buf, 32, "domain%d", i);
+		entry->ctl_name = i + 1;
+		entry->procname = sched_strdup(buf);
+		entry->mode = 0755;
+		entry->child = sd_alloc_ctl_domain_table(sd);
+		entry++;
+		i++;
+	}
+	return table;
+}
+
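+/*
+ * init_sched_domain_sysctl() below wires these tables together and
+ * registers them, producing a tree of the following shape (illustrative;
+ * how many cpuN/domainM directories appear depends on this machine's
+ * topology):
+ *
+ *	/proc/sys/kernel/sched_domain/cpu0/domain0/min_interval
+ *	/proc/sys/kernel/sched_domain/cpu0/domain0/busy_factor
+ *	/proc/sys/kernel/sched_domain/cpu0/domain1/flags
+ *	/proc/sys/kernel/sched_domain/cpu1/...
+ */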
+static struct ctl_table_header *sd_sysctl_header;
+static void init_sched_domain_sysctl(void)
+{
+	int i, cpu_num = num_online_cpus();
+	char buf[32];
+	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+
+	sd_ctl_dir[0].child = entry;
+
+	for (i = 0; i < cpu_num; i++, entry++) {
+		snprintf(buf, 32, "cpu%d", i);
+		entry->ctl_name = i + 1;
+		entry->procname = sched_strdup(buf);
+		entry->mode = 0755;
+		entry->child = sd_alloc_ctl_cpu_table(i);
+	}
+	sd_sysctl_header = register_sysctl_table(sd_ctl_root, 0);
+}
+#else
+static void init_sched_domain_sysctl(void)
+{
+}
+#endif
+
+/*
+ * migration_call - callback that gets triggered on CPU hotplug events.
+ * Here we can start up the necessary migration thread for the new CPU.
+ */
+static int migration_call(struct notifier_block *nfb, unsigned long action,
+			  void *hcpu)
+{
+	int cpu = (long)hcpu;
+	struct task_struct *p;
+	struct runqueue *rq;
+	unsigned long flags;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
+		if (IS_ERR(p))
+			return NOTIFY_BAD;
+		p->flags |= PF_NOFREEZE;
+		kthread_bind(p, cpu);
+		/* Must be high prio: stop_machine expects to yield to it. */
+		rq = task_rq_lock(p, &flags);
+		__setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
+		task_rq_unlock(rq, &flags);
+		cpu_rq(cpu)->migration_thread = p;
+		break;
+	case CPU_ONLINE:
+		/* Strictly unnecessary, as first user will wake it. */
+		wake_up_process(cpu_rq(cpu)->migration_thread);
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
+		/* Unbind it from the offline cpu so it can run. */
+		kthread_bind(cpu_rq(cpu)->migration_thread,
+			     any_online_cpu(cpu_online_map));
+		kthread_stop(cpu_rq(cpu)->migration_thread);
+		cpu_rq(cpu)->migration_thread = NULL;
+		break;
+	case CPU_DEAD:
+		migrate_live_tasks(cpu);
+		rq = cpu_rq(cpu);
+		kthread_stop(rq->migration_thread);
+		rq->migration_thread = NULL;
+		/* Idle task back to normal (off runqueue, low prio) */
+		rq = task_rq_lock(rq->idle, &flags);
+		deactivate_task(rq->idle, rq);
+		rq->idle->static_prio = MAX_PRIO;
+		__setscheduler(rq->idle, SCHED_NORMAL, 0);
+		migrate_dead_tasks(cpu);
+		task_rq_unlock(rq, &flags);
+		migrate_nr_uninterruptible(rq);
+		BUG_ON(rq->nr_running != 0);
+
+		/* No need to migrate the tasks: it was best-effort if
+		 * they didn't do lock_cpu_hotplug().  Just wake up
+		 * the requestors. */
+		spin_lock_irq(&rq->lock);
+		while (!list_empty(&rq->migration_queue)) {
+			migration_req_t *req;
+			req = list_entry(rq->migration_queue.next,
+					 migration_req_t, list);
+			list_del_init(&req->list);
+			complete(&req->done);
+		}
+		spin_unlock_irq(&rq->lock);
+		break;
+#endif
+	}
+	return NOTIFY_OK;
+}
+
+/* Register at highest priority so that task migration (migrate_live_tasks)
+ * happens before everything else.
+ */
+static struct notifier_block __devinitdata migration_notifier = {
+	.notifier_call = migration_call,
+	.priority = 10
+};
+
+int __init migration_init(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+	/* Start one for boot CPU. */
+	migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+	migration_call(&migration_notifier, CPU_ONLINE, cpu);
+	register_cpu_notifier(&migration_notifier);
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_SMP
+#undef SCHED_DOMAIN_DEBUG
+#ifdef SCHED_DOMAIN_DEBUG
+static void sched_domain_debug(struct sched_domain *sd, int cpu)
+{
+	int level = 0;
+
+	if (!sd) {
+		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+		return;
+	}
+
+	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+
+	do {
+		int i;
+		char str[NR_CPUS];
+		struct sched_group *group = sd->groups;
+		cpumask_t groupmask;
+
+		cpumask_scnprintf(str, NR_CPUS, sd->span);
+		cpus_clear(groupmask);
+
+		printk(KERN_DEBUG);
+		for (i = 0; i < level + 1; i++)
+			printk(" ");
+		printk("domain %d: ", level);
+
+		if (!(sd->flags & SD_LOAD_BALANCE)) {
+			printk("does not load-balance\n");
+			if (sd->parent)
+				printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent\n");
+			break;
+		}
+
+		printk("span %s\n", str);
+
+		if (!cpu_isset(cpu, sd->span))
+			printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
+		if (!cpu_isset(cpu, group->cpumask))
+			printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
+
+		printk(KERN_DEBUG);
+		for (i = 0; i < level + 2; i++)
+			printk(" ");
+		printk("groups:");
+		do {
+			if (!group) {
+				printk("\n");
+				printk(KERN_ERR "ERROR: group is NULL\n");
+				break;
+			}
+
+			if (!group->cpu_power) {
+				printk("\n");
+				printk(KERN_ERR "ERROR: domain->cpu_power not set\n");
+			}
+
+			if (!cpus_weight(group->cpumask)) {
+				printk("\n");
+				printk(KERN_ERR "ERROR: empty group\n");
+			}
+
+			if (cpus_intersects(groupmask, group->cpumask)) {
+				printk("\n");
+				printk(KERN_ERR "ERROR: repeated CPUs\n");
+			}
+
+			cpus_or(groupmask, groupmask, group->cpumask);
+
+			cpumask_scnprintf(str, NR_CPUS, group->cpumask);
+			printk(" %s", str);
+
+			group = group->next;
+		} while (group != sd->groups);
+		printk("\n");
+
+		if (!cpus_equal(sd->span, groupmask))
+			printk(KERN_ERR "ERROR: groups don't span domain->span\n");
+
+		level++;
+		sd = sd->parent;
+
+		if (sd) {
+			if (!cpus_subset(groupmask, sd->span))
+				printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
+		}
+
+	} while (sd);
+}
+#else
+#define sched_domain_debug(sd, cpu) do { } while (0)
+#endif
+
+static int sd_degenerate(struct sched_domain *sd)
+{
+	if (cpus_weight(sd->span) == 1)
+		return 1;
+
+	/* Following flags need at least 2 groups */
+	if (sd->flags & (SD_LOAD_BALANCE |
+			 SD_BALANCE_NEWIDLE |
+			 SD_BALANCE_FORK |
+			 SD_BALANCE_EXEC)) {
+		if (sd->groups != sd->groups->next)
+			return 0;
+	}
+
+	/* Following flags don't use groups */
+	if (sd->flags & (SD_WAKE_IDLE |
+			 SD_WAKE_AFFINE |
+			 SD_WAKE_BALANCE))
+		return 0;
+
+	return 1;
+}
+
+static int sd_parent_degenerate(struct sched_domain *sd,
+						struct sched_domain *parent)
+{
+	unsigned long cflags = sd->flags, pflags = parent->flags;
+
+	if (sd_degenerate(parent))
+		return 1;
+
+	if (!cpus_equal(sd->span, parent->span))
+		return 0;
+
+	/* Does parent contain flags not in child? */
+	/* WAKE_BALANCE is a subset of WAKE_AFFINE */
+	if (cflags & SD_WAKE_AFFINE)
+		pflags &= ~SD_WAKE_BALANCE;
+	/* Flags needing groups don't count if only 1 group in parent */
+	if (parent->groups == parent->groups->next) {
+		pflags &= ~(SD_LOAD_BALANCE |
+				SD_BALANCE_NEWIDLE |
+				SD_BALANCE_FORK |
+				SD_BALANCE_EXEC);
+	}
+	if (~cflags & pflags)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
+ * hold the hotplug lock.
+ */
+static void cpu_attach_domain(struct sched_domain *sd, int cpu)
+{
+	runqueue_t *rq = cpu_rq(cpu);
+	struct sched_domain *tmp;
+
+	/* Remove the sched domains which do not contribute to scheduling. */
+	for (tmp = sd; tmp; tmp = tmp->parent) {
+		struct sched_domain *parent = tmp->parent;
+		if (!parent)
+			break;
+		if (sd_parent_degenerate(tmp, parent))
+			tmp->parent = parent->parent;
+	}
+
+	if (sd && sd_degenerate(sd))
+		sd = sd->parent;
+
+	sched_domain_debug(sd, cpu);
+
+	rcu_assign_pointer(rq->sd, sd);
+}
+
+/* cpus with isolated domains */
+static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
+
+/* Setup the mask of cpus configured for isolated domains */
+static int __init isolated_cpu_setup(char *str)
+{
+	int ints[NR_CPUS], i;
+
+	str = get_options(str, ARRAY_SIZE(ints), ints);
+	cpus_clear(cpu_isolated_map);
+	for (i = 1; i <= ints[0]; i++)
+		if (ints[i] < NR_CPUS)
+			cpu_set(ints[i], cpu_isolated_map);
+	return 1;
+}
+
+__setup ("isolcpus=", isolated_cpu_setup);
+
+/*
+ * init_sched_build_groups takes an array of groups, the cpumask we wish
+ * to span, and a pointer to a function which identifies what group a CPU
+ * belongs to. The return value of group_fn must be a valid index into the
+ * groups[] array, and must be >= 0 and < NR_CPUS (because we keep track
+ * of groups covered with a cpumask_t).
+ *
+ * init_sched_build_groups will build a circular linked list of the groups
+ * covered by the given span, and will set each group's ->cpumask correctly,
+ * and ->cpu_power to 0.
+ */
+static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
+				    int (*group_fn)(int cpu))
+{
+	struct sched_group *first = NULL, *last = NULL;
+	cpumask_t covered = CPU_MASK_NONE;
+	int i;
+
+	for_each_cpu_mask(i, span) {
+		int group = group_fn(i);
+		struct sched_group *sg = &groups[group];
+		int j;
+
+		if (cpu_isset(i, covered))
+			continue;
+
+		sg->cpumask = CPU_MASK_NONE;
+		sg->cpu_power = 0;
+
+		for_each_cpu_mask(j, span) {
+			if (group_fn(j) != group)
+				continue;
+
+			cpu_set(j, covered);
+			cpu_set(j, sg->cpumask);
+		}
+		if (!first)
+			first = sg;
+		if (last)
+			last->next = sg;
+		last = sg;
+	}
+	last->next = first;
+}
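+
+/*
+ * For example (a hypothetical 4-CPU box with two physical packages of
+ * two siblings each, group_fn == cpu_to_phys_group): a span of all four
+ * cpus produces two groups with cpumasks {0,1} and {2,3}, chained into
+ * a circular list, each with ->cpu_power zeroed here and filled in
+ * later by build_sched_domains().
+ */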
+
+#define SD_NODES_PER_DOMAIN 16
+
+/*
+ * Self-tuning task migration cost measurement between source and target CPUs.
+ *
+ * This is done by measuring the cost of manipulating buffers of varying
+ * sizes. For a given buffer-size here are the steps that are taken:
+ *
+ * 1) the source CPU reads+dirties a shared buffer
+ * 2) the target CPU reads+dirties the same shared buffer
+ *
+ * We measure how long they take, in the following 4 scenarios:
+ *
+ *  - source: CPU1, target: CPU2 | cost1
+ *  - source: CPU2, target: CPU1 | cost2
+ *  - source: CPU1, target: CPU1 | cost3
+ *  - source: CPU2, target: CPU2 | cost4
+ *
+ * We then calculate the (cost1+cost2) - (cost3+cost4) difference - this
+ * is the cost of migration.
+ *
+ * We then start off from a small buffer-size and iterate up to larger
+ * buffer sizes, in roughly 10% steps - measuring each buffer-size separately, and
+ * doing a maximum search for the cost. (The maximum cost for a migration
+ * normally occurs when the working set size is around the effective cache
+ * size.)
+ */
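+
+/*
+ * A worked example with made-up numbers: if the two cross-CPU runs
+ * measure cost1 = 900 usecs and cost2 = 1100 usecs, while the two
+ * same-CPU runs measure cost3 = 300 usecs and cost4 = 500 usecs, then
+ * the per-direction migration cost reported by measure_cost() below is
+ *
+ *	((900 + 1100) - (300 + 500)) / 2 = 600 usecs.
+ */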
+#define SEARCH_SCOPE		2
+#define MIN_CACHE_SIZE		(64*1024U)
+#define DEFAULT_CACHE_SIZE	(5*1024*1024U)
+#define ITERATIONS		1
+#define SIZE_THRESH		130
+#define COST_THRESH		130
+
+/*
+ * The migration cost is a function of 'domain distance'. Domain
+ * distance is the number of steps a CPU has to iterate down its
+ * domain tree to share a domain with the other CPU. The farther
+ * two CPUs are from each other, the larger the distance gets.
+ *
+ * Note that we use the distance only to cache measurement results,
+ * the distance value is not used numerically otherwise. When two
+ * CPUs have the same distance it is assumed that the migration
+ * cost is the same. (this is a simplification but quite practical)
+ */
+#define MAX_DOMAIN_DISTANCE 32
+
+static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
+		{ [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
+/*
+ * Architectures may override the migration cost and thus avoid
+ * boot-time calibration. Unit is nanoseconds. Mostly useful for
+ * virtualized hardware:
+ */
+#ifdef CONFIG_DEFAULT_MIGRATION_COST
+			CONFIG_DEFAULT_MIGRATION_COST
+#else
+			-1LL
+#endif
+};
+
+/*
+ * Allow override of migration cost - in units of microseconds.
+ * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
+ * of 1 msec, a level-2 cost of 2 msecs and a level-3 cost of 3 msecs:
+ */
+static int __init migration_cost_setup(char *str)
+{
+	int ints[MAX_DOMAIN_DISTANCE+1], i;
+
+	str = get_options(str, ARRAY_SIZE(ints), ints);
+
+	printk("#ints: %d\n", ints[0]);
+	for (i = 1; i <= ints[0]; i++) {
+		migration_cost[i-1] = (unsigned long long)ints[i]*1000;
+		printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
+	}
+	return 1;
+}
+
+__setup ("migration_cost=", migration_cost_setup);
+
+/*
+ * Global multiplier (divisor) for migration-cutoff values,
+ * in percentiles. E.g. use a value of 150 to get 1.5 times
+ * longer cache-hot cutoff times.
+ *
+ * (We scale it from 100 to 128 to make the long long handling easier.)
+ */
+
+#define MIGRATION_FACTOR_SCALE 128
+
+static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
+
+static int __init setup_migration_factor(char *str)
+{
+	get_option(&str, &migration_factor);
+	migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
+	return 1;
+}
+
+__setup("migration_factor=", setup_migration_factor);
+
+/*
+ * Estimated distance of two CPUs, measured via the number of domains
+ * we have to pass for the two CPUs to be in the same span:
+ */
+static unsigned long domain_distance(int cpu1, int cpu2)
+{
+	unsigned long distance = 0;
+	struct sched_domain *sd;
+
+	for_each_domain(cpu1, sd) {
+		WARN_ON(!cpu_isset(cpu1, sd->span));
+		if (cpu_isset(cpu2, sd->span))
+			return distance;
+		distance++;
+	}
+	if (distance >= MAX_DOMAIN_DISTANCE) {
+		WARN_ON(1);
+		distance = MAX_DOMAIN_DISTANCE-1;
+	}
+
+	return distance;
+}
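+
+/*
+ * Example (a hypothetical SMT box with two physical packages): two
+ * siblings of one core already share the span of the lowest (SMT)
+ * domain, so their distance is 0; cpus in different packages first
+ * share a span one level up, in the physical domain, so their
+ * distance is 1.
+ */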
+
+static unsigned int migration_debug;
+
+static int __init setup_migration_debug(char *str)
+{
+	get_option(&str, &migration_debug);
+	return 1;
+}
+
+__setup("migration_debug=", setup_migration_debug);
+
+/*
+ * Maximum cache-size that the scheduler should try to measure.
+ * Architectures with larger caches should tune this up during
+ * bootup. Gets used in the domain-setup code (i.e. during SMP
+ * bootup).
+ */
+unsigned int max_cache_size;
+
+static int __init setup_max_cache_size(char *str)
+{
+	get_option(&str, &max_cache_size);
+	return 1;
+}
+
+__setup("max_cache_size=", setup_max_cache_size);
+
+/*
+ * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
+ * is the operation that is timed, so we try to generate unpredictable
+ * cache misses that still end up filling the L2 cache:
+ */
+static void touch_cache(void *__cache, unsigned long __size)
+{
+	unsigned long size = __size/sizeof(long), chunk1 = size/3,
+			chunk2 = 2*size/3;
+	unsigned long *cache = __cache;
+	int i;
+
+	for (i = 0; i < size/6; i += 8) {
+		switch (i % 6) {
+			case 0: cache[i]++;
+			case 1: cache[size-1-i]++;
+			case 2: cache[chunk1-i]++;
+			case 3: cache[chunk1+i]++;
+			case 4: cache[chunk2-i]++;
+			case 5: cache[chunk2+i]++;
+		}
+	}
+}
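+
+/*
+ * Note that the switch above has no break statements: an iteration
+ * entering at case 0, 2 or 4 (the only values i % 6 takes when i
+ * advances in steps of 8) falls through all later cases, dirtying
+ * several scattered cache lines per pass.
+ */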
+
+/*
+ * Measure the cache-cost of one task migration. Returns in units of nsec.
+ */
+static unsigned long long measure_one(void *cache, unsigned long size,
+				      int source, int target)
+{
+	cpumask_t mask, saved_mask;
+	unsigned long long t0, t1, t2, t3, cost;
+
+	saved_mask = current->cpus_allowed;
+
+	/*
+	 * Flush source caches to RAM and invalidate them:
+	 */
+	sched_cacheflush();
+
+	/*
+	 * Migrate to the source CPU:
+	 */
+	mask = cpumask_of_cpu(source);
+	set_cpus_allowed(current, mask);
+	WARN_ON(smp_processor_id() != source);
+
+	/*
+	 * Dirty the working set:
+	 */
+	t0 = sched_clock();
+	touch_cache(cache, size);
+	t1 = sched_clock();
+
+	/*
+	 * Migrate to the target CPU, dirty the L2 cache and access
+	 * the shared buffer (which represents the working set
+	 * of a migrated task):
+	 */
+	mask = cpumask_of_cpu(target);
+	set_cpus_allowed(current, mask);
+	WARN_ON(smp_processor_id() != target);
+
+	t2 = sched_clock();
+	touch_cache(cache, size);
+	t3 = sched_clock();
+
+	cost = t1-t0 + t3-t2;
+
+	if (migration_debug >= 2)
+		printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
+			source, target, t1-t0, t2-t1, t3-t2, cost);
+	/*
+	 * Flush target caches to RAM and invalidate them:
+	 */
+	sched_cacheflush();
+
+	set_cpus_allowed(current, saved_mask);
+
+	return cost;
+}
+
+/*
+ * Measure a series of task migrations and return the average
+ * result. Since this code runs early during bootup the system
+ * is 'undisturbed' and the average latency makes sense.
+ *
+ * The algorithm in essence auto-detects the relevant cache-size,
+ * so it will properly detect different cachesizes for different
+ * cache-hierarchies, depending on how the CPUs are connected.
+ *
+ * Architectures can prime the upper limit of the search range via
+ * max_cache_size, otherwise the search range defaults to 10MB...64K.
+ */
+static unsigned long long
+measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
+{
+	unsigned long long cost1, cost2;
+	int i;
+
+	/*
+	 * Measure the migration cost of 'size' bytes, averaged over
+	 * 2*ITERATIONS timed runs (ITERATIONS in each direction):
+	 *
+	 * (We perturb the cache size by a small (0..4k)
+	 *  value to compensate for size/alignment related artifacts.
+	 *  We also subtract the cost of the operation done on
+	 *  the same CPU.)
+	 */
+	cost1 = 0;
+
+	/*
+	 * dry run, to make sure we start off cache-cold on cpu1,
+	 * and to get any vmalloc pagefaults in advance:
+	 */
+	measure_one(cache, size, cpu1, cpu2);
+	for (i = 0; i < ITERATIONS; i++)
+		cost1 += measure_one(cache, size - i*1024, cpu1, cpu2);
+
+	measure_one(cache, size, cpu2, cpu1);
+	for (i = 0; i < ITERATIONS; i++)
+		cost1 += measure_one(cache, size - i*1024, cpu2, cpu1);
+
+	/*
+	 * (We measure the non-migrating [cached] cost on both
+	 *  cpu1 and cpu2, to handle CPUs with different speeds)
+	 */
+	cost2 = 0;
+
+	measure_one(cache, size, cpu1, cpu1);
+	for (i = 0; i < ITERATIONS; i++)
+		cost2 += measure_one(cache, size - i*1024, cpu1, cpu1);
+
+	measure_one(cache, size, cpu2, cpu2);
+	for (i = 0; i < ITERATIONS; i++)
+		cost2 += measure_one(cache, size - i*1024, cpu2, cpu2);
+
+	/*
+	 * Get the per-iteration migration cost:
+	 */
+	do_div(cost1, 2*ITERATIONS);
+	do_div(cost2, 2*ITERATIONS);
+
+	return cost1 - cost2;
+}
+
+static unsigned long long measure_migration_cost(int cpu1, int cpu2)
+{
+	unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
+	unsigned int max_size, size, size_found = 0;
+	long long cost = 0, prev_cost;
+	void *cache;
+
+	/*
+	 * Search from max_cache_size*SEARCH_SCOPE down to
+	 * max_cache_size/SEARCH_SCOPE (with a 64K floor) - the real
+	 * relevant cache size has to lie somewhere in between.
+	 */
+	if (max_cache_size) {
+		max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
+		size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
+	} else {
+		/*
+		 * Since we have no estimate of the relevant search
+		 * range, fall back to the defaults:
+		 */
+		max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
+		size = MIN_CACHE_SIZE;
+	}
+
+	if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
+		printk("cpu %d and %d not both online!\n", cpu1, cpu2);
+		return 0;
+	}
+
+	/*
+	 * Allocate the working set:
+	 */
+	cache = vmalloc(max_size);
+	if (!cache) {
+		printk("could not vmalloc %d bytes for cache!\n", max_size);
+		return 1000000;	/* return 1 msec on very small boxen */
+	}
+
+	while (size <= max_size) {
+		prev_cost = cost;
+		cost = measure_cost(cpu1, cpu2, cache, size);
+
+		/*
+		 * Update the max:
+		 */
+		if (cost > 0) {
+			if (max_cost < cost) {
+				max_cost = cost;
+				size_found = size;
+			}
+		}
+		/*
+		 * Calculate average fluctuation, we use this to prevent
+		 * noise from triggering an early break out of the loop:
+		 */
+		fluct = abs(cost - prev_cost);
+		avg_fluct = (avg_fluct + fluct)/2;
+
+		if (migration_debug)
+			printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n",
+				cpu1, cpu2, size,
+				(long)cost / 1000000,
+				((long)cost / 100000) % 10,
+				(long)max_cost / 1000000,
+				((long)max_cost / 100000) % 10,
+				domain_distance(cpu1, cpu2),
+				cost, avg_fluct);
+
+		/*
+		 * If we iterated at least 30% past the previous maximum,
+		 * and the cost has dropped by more than 23% already
+		 * (taking fluctuations into account), then we assume we
+		 * have found the maximum and break out of the loop early:
+		 */
+		if (size_found && (size*100 > size_found*SIZE_THRESH))
+			if (cost+avg_fluct <= 0 ||
+				max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
+
+				if (migration_debug)
+					printk("-> found max.\n");
+				break;
+			}
+		/*
+		 * Increase the cachesize in 10% steps:
+		 */
+		size = size * 10 / 9;
+	}
+
+	if (migration_debug)
+		printk("[%d][%d] working set size found: %d, cost: %Ld\n",
+			cpu1, cpu2, size_found, max_cost);
+
+	vfree(cache);
+
+	/*
+	 * A task is considered 'cache cold' if at least twice the
+	 * worst-case migration cost has passed since it last ran.
+	 *
+	 * (This limit is only honored if the load-balancing
+	 * situation is 'nice' - if there is a large imbalance we
+	 * ignore it for the sake of CPU utilization and
+	 * processing fairness.)
+	 */
+	return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
+}
+
+static void calibrate_migration_costs(const cpumask_t *cpu_map)
+{
+	int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
+	unsigned long j0, j1, distance, max_distance = 0;
+	struct sched_domain *sd;
+
+	j0 = jiffies;
+
+	/*
+	 * First pass - calculate the cacheflush times:
+	 */
+	for_each_cpu_mask(cpu1, *cpu_map) {
+		for_each_cpu_mask(cpu2, *cpu_map) {
+			if (cpu1 == cpu2)
+				continue;
+			distance = domain_distance(cpu1, cpu2);
+			max_distance = max(max_distance, distance);
+			/*
+			 * No result cached yet?
+			 */
+			if (migration_cost[distance] == -1LL)
+				migration_cost[distance] =
+					measure_migration_cost(cpu1, cpu2);
+		}
+	}
+	/*
+	 * Second pass - update the sched domain hierarchy with
+	 * the new cache-hot-time estimations:
+	 */
+	for_each_cpu_mask(cpu, *cpu_map) {
+		distance = 0;
+		for_each_domain(cpu, sd) {
+			sd->cache_hot_time = migration_cost[distance];
+			distance++;
+		}
+	}
+	/*
+	 * Print the matrix:
+	 */
+	if (migration_debug)
+		printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
+			max_cache_size,
+#ifdef CONFIG_X86
+			cpu_khz/1000
+#else
+			-1
+#endif
+		);
+	if (system_state == SYSTEM_BOOTING) {
+		printk("migration_cost=");
+		for (distance = 0; distance <= max_distance; distance++) {
+			if (distance)
+				printk(",");
+			printk("%ld", (long)migration_cost[distance] / 1000);
+		}
+		printk("\n");
+	}
+	j1 = jiffies;
+	if (migration_debug)
+		printk("migration: %ld seconds\n", (j1-j0)/HZ);
+
+	/*
+	 * Move back to the original CPU. NUMA-Q gets confused
+	 * if we migrate to another quad during bootup.
+	 */
+	if (raw_smp_processor_id() != orig_cpu) {
+		cpumask_t mask = cpumask_of_cpu(orig_cpu),
+			saved_mask = current->cpus_allowed;
+
+		set_cpus_allowed(current, mask);
+		set_cpus_allowed(current, saved_mask);
+	}
+}
+
+#ifdef CONFIG_NUMA
+
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain.  Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int find_next_best_node(int node, unsigned long *used_nodes)
+{
+	int i, n, val, min_val, best_node = 0;
+
+	min_val = INT_MAX;
+
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		/* Start at @node */
+		n = (node + i) % MAX_NUMNODES;
+
+		if (!nr_cpus_node(n))
+			continue;
+
+		/* Skip already used nodes */
+		if (test_bit(n, used_nodes))
+			continue;
+
+		/* Simple min distance search */
+		val = node_distance(node, n);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	set_bit(best_node, used_nodes);
+	return best_node;
+}
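+
+/*
+ * Example (a hypothetical 4-node machine where node_distance() grows
+ * with the gap between node numbers): starting at node 0 with only
+ * node 0 marked used, successive calls pick nodes 1, 2 and 3 in that
+ * order, growing the span outwards by increasing distance.
+ */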
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span.  It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+static cpumask_t sched_domain_node_span(int node)
+{
+	int i;
+	cpumask_t span, nodemask;
+	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+
+	cpus_clear(span);
+	bitmap_zero(used_nodes, MAX_NUMNODES);
+
+	nodemask = node_to_cpumask(node);
+	cpus_or(span, span, nodemask);
+	set_bit(node, used_nodes);
+
+	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
+		int next_node = find_next_best_node(node, used_nodes);
+		nodemask = node_to_cpumask(next_node);
+		cpus_or(span, span, nodemask);
+	}
+
+	return span;
+}
+#endif
+
+/*
+ * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
+ * can switch it on easily if needed.
+ */
+#ifdef CONFIG_SCHED_SMT
+static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
+static struct sched_group sched_group_cpus[NR_CPUS];
+static int cpu_to_cpu_group(int cpu)
+{
+	return cpu;
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+static DEFINE_PER_CPU(struct sched_domain, core_domains);
+static struct sched_group sched_group_core[NR_CPUS];
+#endif
+
+#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
+static int cpu_to_core_group(int cpu)
+{
+	return first_cpu(cpu_sibling_map[cpu]);
+}
+#elif defined(CONFIG_SCHED_MC)
+static int cpu_to_core_group(int cpu)
+{
+	return cpu;
+}
+#endif
+
+static DEFINE_PER_CPU(struct sched_domain, phys_domains);
+static struct sched_group sched_group_phys[NR_CPUS];
+static int cpu_to_phys_group(int cpu)
+{
+#if defined(CONFIG_SCHED_MC)
+	cpumask_t mask = cpu_coregroup_map(cpu);
+	return first_cpu(mask);
+#elif defined(CONFIG_SCHED_SMT)
+	return first_cpu(cpu_sibling_map[cpu]);
+#else
+	return cpu;
+#endif
+}
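+
+/*
+ * For example (a hypothetical SMT box): with CONFIG_SCHED_SMT set,
+ * sibling cpus 0 and 1 both map to group 0, so a single
+ * sched_group_phys[] entry stands for the whole physical package.
+ */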
+
+#ifdef CONFIG_NUMA
+/*
+ * The init_sched_build_groups can't handle what we want to do with node
+ * groups, so roll our own. Now each node has its own list of groups which
+ * gets dynamically allocated.
+ */
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
+
+static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
+
+static int cpu_to_allnodes_group(int cpu)
+{
+	return cpu_to_node(cpu);
+}
+static void init_numa_sched_groups_power(struct sched_group *group_head)
+{
+	struct sched_group *sg = group_head;
+	int j;
+
+	if (!sg)
+		return;
+next_sg:
+	for_each_cpu_mask(j, sg->cpumask) {
+		struct sched_domain *sd;
+
+		sd = &per_cpu(phys_domains, j);
+		if (j != first_cpu(sd->groups->cpumask)) {
+			/*
+			 * Only add "power" once for each
+			 * physical package.
+			 */
+			continue;
+		}
+
+		sg->cpu_power += sd->groups->cpu_power;
+	}
+	sg = sg->next;
+	if (sg != group_head)
+		goto next_sg;
+}
+#endif
+
+/*
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
+ */
+void build_sched_domains(const cpumask_t *cpu_map)
+{
+	int i;
+#ifdef CONFIG_NUMA
+	struct sched_group **sched_group_nodes = NULL;
+	struct sched_group *sched_group_allnodes = NULL;
+
+	/*
+	 * Allocate the per-node list of sched groups
+	 */
+	sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+					   GFP_ATOMIC);
+	if (!sched_group_nodes) {
+		printk(KERN_WARNING "Can not alloc sched group node list\n");
+		return;
+	}
+	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+#endif
+
+	/*
+	 * Set up domains for cpus specified by the cpu_map.
+	 */
+	for_each_cpu_mask(i, *cpu_map) {
+		int group;
+		struct sched_domain *sd = NULL, *p;
+		cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
+
+		cpus_and(nodemask, nodemask, *cpu_map);
+
+#ifdef CONFIG_NUMA
+		if (cpus_weight(*cpu_map)
+				> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+			if (!sched_group_allnodes) {
+				sched_group_allnodes
+					= kmalloc(sizeof(struct sched_group)
+							* MAX_NUMNODES,
+						  GFP_KERNEL);
+				if (!sched_group_allnodes) {
+					printk(KERN_WARNING
+					"Can not alloc allnodes sched group\n");
+					break;
+				}
+				sched_group_allnodes_bycpu[i]
+						= sched_group_allnodes;
+			}
+			sd = &per_cpu(allnodes_domains, i);
+			*sd = SD_ALLNODES_INIT;
+			sd->span = *cpu_map;
+			group = cpu_to_allnodes_group(i);
+			sd->groups = &sched_group_allnodes[group];
+			p = sd;
+		} else
+			p = NULL;
+
+		sd = &per_cpu(node_domains, i);
+		*sd = SD_NODE_INIT;
+		sd->span = sched_domain_node_span(cpu_to_node(i));
+		sd->parent = p;
+		cpus_and(sd->span, sd->span, *cpu_map);
+#endif
+
+		p = sd;
+		sd = &per_cpu(phys_domains, i);
+		group = cpu_to_phys_group(i);
+		*sd = SD_CPU_INIT;
+		sd->span = nodemask;
+		sd->parent = p;
+		sd->groups = &sched_group_phys[group];
+
+#ifdef CONFIG_SCHED_MC
+		p = sd;
+		sd = &per_cpu(core_domains, i);
+		group = cpu_to_core_group(i);
+		*sd = SD_MC_INIT;
+		sd->span = cpu_coregroup_map(i);
+		cpus_and(sd->span, sd->span, *cpu_map);
+		sd->parent = p;
+		sd->groups = &sched_group_core[group];
+#endif
+
+#ifdef CONFIG_SCHED_SMT
+		p = sd;
+		sd = &per_cpu(cpu_domains, i);
+		group = cpu_to_cpu_group(i);
+		*sd = SD_SIBLING_INIT;
+		sd->span = cpu_sibling_map[i];
+		cpus_and(sd->span, sd->span, *cpu_map);
+		sd->parent = p;
+		sd->groups = &sched_group_cpus[group];
+#endif
+	}
+
+#ifdef CONFIG_SCHED_SMT
+	/* Set up CPU (sibling) groups */
+	for_each_cpu_mask(i, *cpu_map) {
+		cpumask_t this_sibling_map = cpu_sibling_map[i];
+		cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
+		if (i != first_cpu(this_sibling_map))
+			continue;
+
+		init_sched_build_groups(sched_group_cpus, this_sibling_map,
+						&cpu_to_cpu_group);
+	}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+	/* Set up multi-core groups */
+	for_each_cpu_mask(i, *cpu_map) {
+		cpumask_t this_core_map = cpu_coregroup_map(i);
+		cpus_and(this_core_map, this_core_map, *cpu_map);
+		if (i != first_cpu(this_core_map))
+			continue;
+		init_sched_build_groups(sched_group_core, this_core_map,
+					&cpu_to_core_group);
+	}
+#endif
+
+
+	/* Set up physical groups */
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		cpumask_t nodemask = node_to_cpumask(i);
+
+		cpus_and(nodemask, nodemask, *cpu_map);
+		if (cpus_empty(nodemask))
+			continue;
+
+		init_sched_build_groups(sched_group_phys, nodemask,
+						&cpu_to_phys_group);
+	}
+
+#ifdef CONFIG_NUMA
+	/* Set up node groups */
+	if (sched_group_allnodes)
+		init_sched_build_groups(sched_group_allnodes, *cpu_map,
+					&cpu_to_allnodes_group);
+
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		struct sched_group *sg, *prev;
+		cpumask_t nodemask = node_to_cpumask(i);
+		cpumask_t domainspan;
+		cpumask_t covered = CPU_MASK_NONE;
+		int j;
+
+		cpus_and(nodemask, nodemask, *cpu_map);
+		if (cpus_empty(nodemask)) {
+			sched_group_nodes[i] = NULL;
+			continue;
+		}
+
+		domainspan = sched_domain_node_span(i);
+		cpus_and(domainspan, domainspan, *cpu_map);
+
+		sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+		sched_group_nodes[i] = sg;
+		for_each_cpu_mask(j, nodemask) {
+			struct sched_domain *sd;
+			sd = &per_cpu(node_domains, j);
+			sd->groups = sg;
+			if (sd->groups == NULL) {
+				/* Turn off balancing if we have no groups */
+				sd->flags = 0;
+			}
+		}
+		if (!sg) {
+			printk(KERN_WARNING
+			"Can not alloc domain group for node %d\n", i);
+			continue;
+		}
+		sg->cpu_power = 0;
+		sg->cpumask = nodemask;
+		cpus_or(covered, covered, nodemask);
+		prev = sg;
+
+		for (j = 0; j < MAX_NUMNODES; j++) {
+			cpumask_t tmp, notcovered;
+			int n = (i + j) % MAX_NUMNODES;
+
+			cpus_complement(notcovered, covered);
+			cpus_and(tmp, notcovered, *cpu_map);
+			cpus_and(tmp, tmp, domainspan);
+			if (cpus_empty(tmp))
+				break;
+
+			nodemask = node_to_cpumask(n);
+			cpus_and(tmp, tmp, nodemask);
+			if (cpus_empty(tmp))
+				continue;
+
+			sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+			if (!sg) {
+				printk(KERN_WARNING
+				"Can not alloc domain group for node %d\n", j);
+				break;
+			}
+			sg->cpu_power = 0;
+			sg->cpumask = tmp;
+			cpus_or(covered, covered, tmp);
+			prev->next = sg;
+			prev = sg;
+		}
+		prev->next = sched_group_nodes[i];
+	}
+#endif
+
+	/* Calculate CPU power for physical packages and nodes */
+	for_each_cpu_mask(i, *cpu_map) {
+		int power;
+		struct sched_domain *sd;
+#ifdef CONFIG_SCHED_SMT
+		sd = &per_cpu(cpu_domains, i);
+		power = SCHED_LOAD_SCALE;
+		sd->groups->cpu_power = power;
+#endif
+#ifdef CONFIG_SCHED_MC
+		sd = &per_cpu(core_domains, i);
+		power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
+					    * SCHED_LOAD_SCALE / 10;
+		sd->groups->cpu_power = power;
+
+		sd = &per_cpu(phys_domains, i);
+
+		/*
+		 * This has to be < 2 * SCHED_LOAD_SCALE.
+		 * Let's keep it SCHED_LOAD_SCALE, so that
+		 * while calculating NUMA group's cpu_power
+		 * we can simply do
+		 *  numa_group->cpu_power += phys_group->cpu_power;
+		 *
+		 * See the "only add power once for each physical pkg"
+		 * comment in init_numa_sched_groups_power() above.
+		 */
+		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+#else
+		sd = &per_cpu(phys_domains, i);
+		power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+				(cpus_weight(sd->groups->cpumask)-1) / 10;
+		sd->groups->cpu_power = power;
+#endif
+	}
+
+#ifdef CONFIG_NUMA
+	for (i = 0; i < MAX_NUMNODES; i++)
+		init_numa_sched_groups_power(sched_group_nodes[i]);
+
+	init_numa_sched_groups_power(sched_group_allnodes);
+#endif
+
+	/* Attach the domains */
+	for_each_cpu_mask(i, *cpu_map) {
+		struct sched_domain *sd;
+#ifdef CONFIG_SCHED_SMT
+		sd = &per_cpu(cpu_domains, i);
+#elif defined(CONFIG_SCHED_MC)
+		sd = &per_cpu(core_domains, i);
+#else
+		sd = &per_cpu(phys_domains, i);
+#endif
+		cpu_attach_domain(sd, i);
+	}
+	/*
+	 * Tune cache-hot values:
+	 */
+	calibrate_migration_costs(cpu_map);
+}
+
+/*
+ * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
+ */
+static void arch_init_sched_domains(const cpumask_t *cpu_map)
+{
+	cpumask_t cpu_default_map;
+
+	/*
+	 * Setup mask for cpus without special case scheduling requirements.
+	 * For now this just excludes isolated cpus, but could be used to
+	 * exclude other special cases in the future.
+	 */
+	cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
+
+	build_sched_domains(&cpu_default_map);
+}
+
+static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
+{
+#ifdef CONFIG_NUMA
+	int i;
+	int cpu;
+
+	for_each_cpu_mask(cpu, *cpu_map) {
+		struct sched_group *sched_group_allnodes
+			= sched_group_allnodes_bycpu[cpu];
+		struct sched_group **sched_group_nodes
+			= sched_group_nodes_bycpu[cpu];
+
+		if (sched_group_allnodes) {
+			kfree(sched_group_allnodes);
+			sched_group_allnodes_bycpu[cpu] = NULL;
+		}
+
+		if (!sched_group_nodes)
+			continue;
+
+		for (i = 0; i < MAX_NUMNODES; i++) {
+			cpumask_t nodemask = node_to_cpumask(i);
+			struct sched_group *oldsg, *sg = sched_group_nodes[i];
+
+			cpus_and(nodemask, nodemask, *cpu_map);
+			if (cpus_empty(nodemask))
+				continue;
+
+			if (sg == NULL)
+				continue;
+			sg = sg->next;
+next_sg:
+			oldsg = sg;
+			sg = sg->next;
+			kfree(oldsg);
+			if (oldsg != sched_group_nodes[i])
+				goto next_sg;
+		}
+		kfree(sched_group_nodes);
+		sched_group_nodes_bycpu[cpu] = NULL;
+	}
+#endif
+}
+
+/*
+ * Detach sched domains from a group of cpus specified in cpu_map.
+ * These cpus will now be attached to the NULL domain.
+ */
+static void detach_destroy_domains(const cpumask_t *cpu_map)
+{
+	int i;
+
+	for_each_cpu_mask(i, *cpu_map)
+		cpu_attach_domain(NULL, i);
+	synchronize_sched();
+	arch_destroy_sched_domains(cpu_map);
+}
+
+/*
+ * Partition sched domains as specified by the cpumasks below.
+ * This attaches all cpus from the cpumasks to the NULL domain,
+ * waits for an RCU quiescent period, recalculates sched
+ * domain information and then attaches them back to the
+ * correct sched domains.
+ * Call with the hotplug lock held.
+ */
+void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+{
+	cpumask_t change_map;
+
+	cpus_and(*partition1, *partition1, cpu_online_map);
+	cpus_and(*partition2, *partition2, cpu_online_map);
+	cpus_or(change_map, *partition1, *partition2);
+
+	/* Detach sched domains from all of the affected cpus */
+	detach_destroy_domains(&change_map);
+	if (!cpus_empty(*partition1))
+		build_sched_domains(partition1);
+	if (!cpus_empty(*partition2))
+		build_sched_domains(partition2);
+}
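+
+/*
+ * For example (illustrative): a caller implementing exclusive cpusets
+ * on a 4-CPU box could pass the masks {0,1} and {2,3}; each non-empty
+ * partition then gets its own independent domain hierarchy from
+ * build_sched_domains(), so load balancing never crosses the two sets.
+ */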
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Force a reinitialization of the sched domains hierarchy.  The domains
+ * and groups cannot be updated in place without racing with the balancing
+ * code, so we temporarily attach all running cpus to the NULL domain
+ * which will prevent rebalancing while the sched domains are recalculated.
+ */
+static int update_sched_domains(struct notifier_block *nfb,
+				unsigned long action, void *hcpu)
+{
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_DOWN_PREPARE:
+		detach_destroy_domains(&cpu_online_map);
+		return NOTIFY_OK;
+
+	case CPU_UP_CANCELED:
+	case CPU_DOWN_FAILED:
+	case CPU_ONLINE:
+	case CPU_DEAD:
+		/*
+		 * Fall through and re-initialise the domains.
+		 */
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+
+	/* The hotplug lock is already held by cpu_up/cpu_down */
+	arch_init_sched_domains(&cpu_online_map);
+
+	return NOTIFY_OK;
+}
+#endif
+
+void __init sched_init_smp(void)
+{
+	lock_cpu_hotplug();
+	arch_init_sched_domains(&cpu_online_map);
+	unlock_cpu_hotplug();
+	/* XXX: Theoretical race here - CPU may be hotplugged now */
+	hotcpu_notifier(update_sched_domains, 0);
+	init_sched_domain_sysctl();
+}
+#else
+void __init sched_init_smp(void)
+{
+}
+#endif /* CONFIG_SMP */
+
+int in_sched_functions(unsigned long addr)
+{
+	/* Linker adds these: start and end of __sched functions */
+	extern char __sched_text_start[], __sched_text_end[];
+	return in_lock_functions(addr) ||
+		(addr >= (unsigned long)__sched_text_start
+		&& addr < (unsigned long)__sched_text_end);
+}
+
+void __init sched_init(void)
+{
+	runqueue_t *rq;
+	int i, j, k;
+
+	for_each_cpu(i) {
+		prio_array_t *array;
+
+		rq = cpu_rq(i);
+		spin_lock_init(&rq->lock);
+		rq->nr_running = 0;
+		rq->active = rq->arrays;
+		rq->expired = rq->arrays + 1;
+		rq->best_expired_prio = MAX_PRIO;
+
+#ifdef CONFIG_SMP
+		rq->sd = NULL;
+		for (j = 1; j < 3; j++)
+			rq->cpu_load[j] = 0;
+		rq->active_balance = 0;
+		rq->push_cpu = 0;
+		rq->migration_thread = NULL;
+		INIT_LIST_HEAD(&rq->migration_queue);
+#endif
+		atomic_set(&rq->nr_iowait, 0);
+
+		for (j = 0; j < 2; j++) {
+			array = rq->arrays + j;
+			for (k = 0; k < MAX_PRIO; k++) {
+				INIT_LIST_HEAD(array->queue + k);
+				__clear_bit(k, array->bitmap);
+			}
+			/* delimiter for bitsearch */
+			__set_bit(MAX_PRIO, array->bitmap);
+		}
+	}
+
+	set_load_weight(&init_task);
+	/*
+	 * The boot idle thread does lazy MMU switching as well:
+	 */
+	atomic_inc(&init_mm.mm_count);
+	enter_lazy_tlb(&init_mm, current);
+
+	/*
+	 * Make us the idle thread. Technically, schedule() should not be
+	 * called from this thread; however, somewhere below it might be,
+	 * but because we are the idle thread, we just pick up running again
+	 * when this runqueue becomes "idle".
+	 */
+	init_idle(current, smp_processor_id());
+}
+
+#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+void __might_sleep(char *file, int line)
+{
+#if defined(in_atomic)
+	static unsigned long prev_jiffy;	/* ratelimiting */
+
+	if ((in_atomic() || irqs_disabled()) &&
+	    system_state == SYSTEM_RUNNING && !oops_in_progress) {
+		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+			return;
+		prev_jiffy = jiffies;
+		printk(KERN_ERR "BUG: sleeping function called from invalid"
+				" context at %s:%d\n", file, line);
+		printk("in_atomic():%d, irqs_disabled():%d\n",
+			in_atomic(), irqs_disabled());
+		dump_stack();
+	}
+#endif
+}
+EXPORT_SYMBOL(__might_sleep);
+#endif
+
+#ifdef CONFIG_MAGIC_SYSRQ
+void normalize_rt_tasks(void)
+{
+	struct task_struct *p;
+	prio_array_t *array;
+	unsigned long flags;
+	runqueue_t *rq;
+
+	read_lock_irq(&tasklist_lock);
+	for_each_process (p) {
+		if (!rt_task(p))
+			continue;
+
+		rq = task_rq_lock(p, &flags);
+
+		array = p->array;
+		if (array)
+			deactivate_task(p, task_rq(p));
+		__setscheduler(p, SCHED_NORMAL, 0);
+		if (array) {
+			__activate_task(p, task_rq(p));
+			resched_task(rq->curr);
+		}
+
+		task_rq_unlock(rq, &flags);
+	}
+	read_unlock_irq(&tasklist_lock);
+}
+
+#endif /* CONFIG_MAGIC_SYSRQ */
+
+#ifdef CONFIG_IA64
+/*
+ * These functions are only useful for the IA64 MCA handling.
+ *
+ * They can only be called when the whole system has been
+ * stopped - every CPU needs to be quiescent, and no scheduling
+ * activity can take place. Using them for anything else would
+ * be a serious bug, and as a result, they aren't even visible
+ * under any other configuration.
+ */
+
+/**
+ * curr_task - return the current task for a given cpu.
+ * @cpu: the processor in question.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+task_t *curr_task(int cpu)
+{
+	return cpu_curr(cpu);
+}
+
+/**
+ * set_curr_task - set the current task for a given cpu.
+ * @cpu: the processor in question.
+ * @p: the task pointer to set.
+ *
+ * Description: This function must only be used when non-maskable interrupts
+ * are serviced on a separate stack.  It allows the architecture to switch the
+ * notion of the current task on a cpu in a non-blocking manner.  This function
+ * must be called with all CPUs synchronized and interrupts disabled; the
+ * caller must save the original value of the current task (see
+ * curr_task() above) and restore that value before re-enabling interrupts and
+ * re-starting the system.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+void set_curr_task(int cpu, task_t *p)
+{
+	cpu_curr(cpu) = p;
+}
+
+#endif
diff -urN oldtree/kernel/softirq.c newtree/kernel/softirq.c
--- oldtree/kernel/softirq.c	2006-03-08 18:48:02.972064750 +0000
+++ newtree/kernel/softirq.c	2006-03-08 15:22:33.361512250 +0000
@@ -351,7 +351,6 @@
 static int ksoftirqd(void * __bind_cpu)
 {
 	set_user_nice(current, 19);
-	current->flags |= PF_NOFREEZE;
 
 	set_current_state(TASK_INTERRUPTIBLE);
 
@@ -457,7 +456,7 @@
 	case CPU_UP_PREPARE:
 		BUG_ON(per_cpu(tasklet_vec, hotcpu).list);
 		BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list);
-		p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
+		p = kthread_nofreeze_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
 		if (IS_ERR(p)) {
 			printk("ksoftirqd for %i failed\n", hotcpu);
 			return NOTIFY_BAD;
diff -urN oldtree/kernel/softirq.c.orig newtree/kernel/softirq.c.orig
--- oldtree/kernel/softirq.c.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/softirq.c.orig	2006-03-08 15:21:19.244880250 +0000
@@ -0,0 +1,517 @@
+/*
+ *	linux/kernel/softirq.c
+ *
+ *	Copyright (C) 1992 Linus Torvalds
+ *
+ * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
+ */
+
+#include <linux/module.h>
+#include <linux/kernel_stat.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/notifier.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+#include <linux/kthread.h>
+#include <linux/rcupdate.h>
+#include <linux/smp.h>
+
+#include <asm/irq.h>
+/*
+   - No shared variables, all the data are CPU local.
+   - If a softirq needs serialization, let it serialize itself
+     by its own spinlocks.
+   - Even if softirq is serialized, only local cpu is marked for
+     execution. Hence, we get something sort of weak cpu binding.
+     Though it is still not clear whether it will result in better
+     locality or not.
+
+   Examples:
+   - NET RX softirq. It is multithreaded and does not require
+     any global serialization.
+   - NET TX softirq. It kicks software netdevice queues, hence
+     it is logically serialized per device, but this serialization
+     is invisible to common code.
+   - Tasklets: serialized wrt itself.
+ */
+
+#ifndef __ARCH_IRQ_STAT
+irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned;
+EXPORT_SYMBOL(irq_stat);
+#endif
+
+static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp;
+
+static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
+
+/*
+ * We cannot loop indefinitely here to avoid userspace starvation,
+ * but we also don't want to introduce a worst case 1/HZ latency
+ * to the pending events, so we let the scheduler balance
+ * the softirq load for us.
+ */
+static inline void wakeup_softirqd(void)
+{
+	/* Interrupts are disabled: no need to stop preemption */
+	struct task_struct *tsk = __get_cpu_var(ksoftirqd);
+
+	if (tsk && tsk->state != TASK_RUNNING)
+		wake_up_process(tsk);
+}
+
+/*
+ * We restart softirq processing MAX_SOFTIRQ_RESTART times,
+ * and we fall back to softirqd after that.
+ *
+ * This number has been established via experimentation.
+ * The two things to balance are latency and fairness -
+ * we want to handle softirqs as soon as possible, but they
+ * should not be able to lock up the box.
+ */
+#define MAX_SOFTIRQ_RESTART 10
+
+asmlinkage void __do_softirq(void)
+{
+	struct softirq_action *h;
+	__u32 pending;
+	int max_restart = MAX_SOFTIRQ_RESTART;
+	int cpu;
+
+	pending = local_softirq_pending();
+
+	local_bh_disable();
+	cpu = smp_processor_id();
+restart:
+	/* Reset the pending bitmask before enabling irqs */
+	set_softirq_pending(0);
+
+	local_irq_enable();
+
+	h = softirq_vec;
+
+	do {
+		if (pending & 1) {
+			h->action(h);
+			rcu_bh_qsctr_inc(cpu);
+		}
+		h++;
+		pending >>= 1;
+	} while (pending);
+
+	local_irq_disable();
+
+	pending = local_softirq_pending();
+	if (pending && --max_restart)
+		goto restart;
+
+	if (pending)
+		wakeup_softirqd();
+
+	__local_bh_enable();
+}
+
+#ifndef __ARCH_HAS_DO_SOFTIRQ
+
+asmlinkage void do_softirq(void)
+{
+	__u32 pending;
+	unsigned long flags;
+
+	if (in_interrupt())
+		return;
+
+	local_irq_save(flags);
+
+	pending = local_softirq_pending();
+
+	if (pending)
+		__do_softirq();
+
+	local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(do_softirq);
+
+#endif
+
+void local_bh_enable(void)
+{
+	WARN_ON(irqs_disabled());
+	/*
+	 * Keep preemption disabled until we are done with
+	 * softirq processing:
+	 */
+	sub_preempt_count(SOFTIRQ_OFFSET - 1);
+
+	if (unlikely(!in_interrupt() && local_softirq_pending()))
+		do_softirq();
+
+	dec_preempt_count();
+	preempt_check_resched();
+}
+EXPORT_SYMBOL(local_bh_enable);
+
+#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
+# define invoke_softirq()	__do_softirq()
+#else
+# define invoke_softirq()	do_softirq()
+#endif
+
+/*
+ * Exit an interrupt context. Process softirqs if needed and possible:
+ */
+void irq_exit(void)
+{
+	account_system_vtime(current);
+	sub_preempt_count(IRQ_EXIT_OFFSET);
+	if (!in_interrupt() && local_softirq_pending())
+		invoke_softirq();
+	preempt_enable_no_resched();
+}
+
+/*
+ * This function must run with irqs disabled!
+ */
+inline fastcall void raise_softirq_irqoff(unsigned int nr)
+{
+	__raise_softirq_irqoff(nr);
+
+	/*
+	 * If we're in an interrupt or softirq, we're done
+	 * (this also catches softirq-disabled code). We will
+	 * actually run the softirq once we return from
+	 * the irq or softirq.
+	 *
+	 * Otherwise we wake up ksoftirqd to make sure we
+	 * schedule the softirq soon.
+	 */
+	if (!in_interrupt())
+		wakeup_softirqd();
+}
+
+EXPORT_SYMBOL(raise_softirq_irqoff);
+
+void fastcall raise_softirq(unsigned int nr)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	raise_softirq_irqoff(nr);
+	local_irq_restore(flags);
+}
+
+void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
+{
+	softirq_vec[nr].data = data;
+	softirq_vec[nr].action = action;
+}
+
+EXPORT_SYMBOL(open_softirq);
+
+/* Tasklets */
+struct tasklet_head
+{
+	struct tasklet_struct *list;
+};
+
+/* Some compilers disobey section attribute on statics when not
+   initialized -- RR */
+static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec) = { NULL };
+static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec) = { NULL };
+
+void fastcall __tasklet_schedule(struct tasklet_struct *t)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	t->next = __get_cpu_var(tasklet_vec).list;
+	__get_cpu_var(tasklet_vec).list = t;
+	raise_softirq_irqoff(TASKLET_SOFTIRQ);
+	local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(__tasklet_schedule);
+
+void fastcall __tasklet_hi_schedule(struct tasklet_struct *t)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	t->next = __get_cpu_var(tasklet_hi_vec).list;
+	__get_cpu_var(tasklet_hi_vec).list = t;
+	raise_softirq_irqoff(HI_SOFTIRQ);
+	local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(__tasklet_hi_schedule);
+
+static void tasklet_action(struct softirq_action *a)
+{
+	struct tasklet_struct *list;
+
+	local_irq_disable();
+	list = __get_cpu_var(tasklet_vec).list;
+	__get_cpu_var(tasklet_vec).list = NULL;
+	local_irq_enable();
+
+	while (list) {
+		struct tasklet_struct *t = list;
+
+		list = list->next;
+
+		if (tasklet_trylock(t)) {
+			if (!atomic_read(&t->count)) {
+				if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
+					BUG();
+				t->func(t->data);
+				tasklet_unlock(t);
+				continue;
+			}
+			tasklet_unlock(t);
+		}
+
+		local_irq_disable();
+		t->next = __get_cpu_var(tasklet_vec).list;
+		__get_cpu_var(tasklet_vec).list = t;
+		__raise_softirq_irqoff(TASKLET_SOFTIRQ);
+		local_irq_enable();
+	}
+}
+
+static void tasklet_hi_action(struct softirq_action *a)
+{
+	struct tasklet_struct *list;
+
+	local_irq_disable();
+	list = __get_cpu_var(tasklet_hi_vec).list;
+	__get_cpu_var(tasklet_hi_vec).list = NULL;
+	local_irq_enable();
+
+	while (list) {
+		struct tasklet_struct *t = list;
+
+		list = list->next;
+
+		if (tasklet_trylock(t)) {
+			if (!atomic_read(&t->count)) {
+				if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
+					BUG();
+				t->func(t->data);
+				tasklet_unlock(t);
+				continue;
+			}
+			tasklet_unlock(t);
+		}
+
+		local_irq_disable();
+		t->next = __get_cpu_var(tasklet_hi_vec).list;
+		__get_cpu_var(tasklet_hi_vec).list = t;
+		__raise_softirq_irqoff(HI_SOFTIRQ);
+		local_irq_enable();
+	}
+}
+
+
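+/*
+ * Initialise a tasklet at run time.  The disable count starts at
+ * zero, i.e. the tasklet is enabled.
+ */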
+void tasklet_init(struct tasklet_struct *t,
+		  void (*func)(unsigned long), unsigned long data)
+{
+	t->next = NULL;
+	t->state = 0;
+	atomic_set(&t->count, 0);
+	t->func = func;
+	t->data = data;
+}
+
+EXPORT_SYMBOL(tasklet_init);
+
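+/*
+ * Wait for a scheduled tasklet to run to completion; the caller must
+ * ensure it is not rescheduled afterwards.  May sleep via yield(),
+ * so it must not be called from interrupt context.
+ */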
+void tasklet_kill(struct tasklet_struct *t)
+{
+	if (in_interrupt())
+		printk(KERN_NOTICE "Attempt to kill tasklet from interrupt\n");
+
+	while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
+		do
+			yield();
+		while (test_bit(TASKLET_STATE_SCHED, &t->state));
+	}
+	tasklet_unlock_wait(t);
+	clear_bit(TASKLET_STATE_SCHED, &t->state);
+}
+
+EXPORT_SYMBOL(tasklet_kill);
+
+void __init softirq_init(void)
+{
+	open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL);
+	open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL);
+}
+
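+/*
+ * Per-CPU kernel thread that runs softirqs raised outside of
+ * interrupt context, or left over when inline processing gives up.
+ * It runs at the lowest priority (nice 19) and backs off via
+ * cond_resched() so it cannot monopolise the CPU.
+ */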
+static int ksoftirqd(void * __bind_cpu)
+{
+	set_user_nice(current, 19);
+	current->flags |= PF_NOFREEZE;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	while (!kthread_should_stop()) {
+		preempt_disable();
+		if (!local_softirq_pending()) {
+			preempt_enable_no_resched();
+			schedule();
+			preempt_disable();
+		}
+
+		__set_current_state(TASK_RUNNING);
+
+		while (local_softirq_pending()) {
+			/* Disabling preemption stops this CPU from being
+			   taken offline.  If it is already offline, we are
+			   on the wrong CPU: don't process. */
+			if (cpu_is_offline((long)__bind_cpu))
+				goto wait_to_die;
+			do_softirq();
+			preempt_enable_no_resched();
+			cond_resched();
+			preempt_disable();
+		}
+		preempt_enable();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+
+wait_to_die:
+	preempt_enable();
+	/* Wait for kthread_stop */
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop()) {
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/**
+ * tasklet_kill_immediate - remove a tasklet that may already be scheduled
+ * @t: tasklet to remove
+ * @cpu: CPU the tasklet is queued on; must be in the CPU_DEAD state
+ *
+ * Unlike tasklet_kill(), this function removes the tasklet
+ * _immediately_, even if it is in the TASKLET_STATE_SCHED state.
+ */
+void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
+{
+	struct tasklet_struct **i;
+
+	BUG_ON(cpu_online(cpu));
+	BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state));
+
+	if (!test_bit(TASKLET_STATE_SCHED, &t->state))
+		return;
+
+	/* CPU is dead, so no lock needed. */
+	for (i = &per_cpu(tasklet_vec, cpu).list; *i; i = &(*i)->next) {
+		if (*i == t) {
+			*i = t->next;
+			return;
+		}
+	}
+	BUG();
+}
+
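+/*
+ * When a CPU is taken down, splice its pending tasklet lists onto
+ * the current CPU's lists so that no queued work is lost.
+ */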
+static void takeover_tasklets(unsigned int cpu)
+{
+	struct tasklet_struct **i;
+
+	/* CPU is dead, so no lock needed. */
+	local_irq_disable();
+
+	/* Find end, append list for that CPU. */
+	for (i = &__get_cpu_var(tasklet_vec).list; *i; i = &(*i)->next);
+	*i = per_cpu(tasklet_vec, cpu).list;
+	per_cpu(tasklet_vec, cpu).list = NULL;
+	raise_softirq_irqoff(TASKLET_SOFTIRQ);
+
+	for (i = &__get_cpu_var(tasklet_hi_vec).list; *i; i = &(*i)->next);
+	*i = per_cpu(tasklet_hi_vec, cpu).list;
+	per_cpu(tasklet_hi_vec, cpu).list = NULL;
+	raise_softirq_irqoff(HI_SOFTIRQ);
+
+	local_irq_enable();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
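+/*
+ * CPU hotplug notifier: create and bind a ksoftirqd thread while a
+ * CPU is being brought up, wake it once the CPU is online, and stop
+ * it again (taking over any pending tasklets) when the CPU dies.
+ */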
+static int __devinit cpu_callback(struct notifier_block *nfb,
+				  unsigned long action,
+				  void *hcpu)
+{
+	int hotcpu = (unsigned long)hcpu;
+	struct task_struct *p;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		BUG_ON(per_cpu(tasklet_vec, hotcpu).list);
+		BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list);
+		p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
+		if (IS_ERR(p)) {
+			printk(KERN_ERR "ksoftirqd for %i failed\n", hotcpu);
+			return NOTIFY_BAD;
+		}
+		kthread_bind(p, hotcpu);
+		per_cpu(ksoftirqd, hotcpu) = p;
+		break;
+	case CPU_ONLINE:
+		wake_up_process(per_cpu(ksoftirqd, hotcpu));
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
+		/* Unbind so it can run.  Fall thru. */
+		kthread_bind(per_cpu(ksoftirqd, hotcpu),
+			     any_online_cpu(cpu_online_map));
+	case CPU_DEAD:
+		p = per_cpu(ksoftirqd, hotcpu);
+		per_cpu(ksoftirqd, hotcpu) = NULL;
+		kthread_stop(p);
+		takeover_tasklets(hotcpu);
+		break;
+#endif /* CONFIG_HOTPLUG_CPU */
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata cpu_nfb = {
+	.notifier_call = cpu_callback
+};
+
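+/*
+ * Called early during boot: set up ksoftirqd for the boot CPU by
+ * invoking the notifier directly, then register it so the remaining
+ * CPUs are handled as they come online.
+ */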
+__init int spawn_ksoftirqd(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+	cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+	register_cpu_notifier(&cpu_nfb);
+	return 0;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Call a function on all processors: on the remote CPUs via
+ * smp_call_function() and then, with interrupts disabled, on the
+ * calling CPU itself.
+ */
+int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait)
+{
+	int ret = 0;
+
+	preempt_disable();
+	ret = smp_call_function(func, info, retry, wait);
+	local_irq_disable();
+	func(info);
+	local_irq_enable();
+	preempt_enable();
+	return ret;
+}
+EXPORT_SYMBOL(on_each_cpu);
+#endif
diff -urN oldtree/kernel/sys.c newtree/kernel/sys.c
--- oldtree/kernel/sys.c	2006-03-08 18:48:02.976065000 +0000
+++ newtree/kernel/sys.c	2006-03-08 15:22:33.369512750 +0000
@@ -749,12 +749,12 @@
 		unlock_kernel();
 		return -EINVAL;
 
-#ifdef CONFIG_SOFTWARE_SUSPEND
+#ifdef CONFIG_SUSPEND2
 	case LINUX_REBOOT_CMD_SW_SUSPEND:
 		{
-			int ret = software_suspend();
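+			/* suspend2_try_suspend() returns no status, so
+			 * the syscall reports success once the attempt
+			 * has completed. */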
+			suspend2_try_suspend();
 			unlock_kernel();
-			return ret;
+			return 0;
 		}
 #endif
 
diff -urN oldtree/kernel/sys.c.orig newtree/kernel/sys.c.orig
--- oldtree/kernel/sys.c.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/sys.c.orig	2006-03-08 15:21:19.248880500 +0000
@@ -0,0 +1,2072 @@
+/*
+ *  linux/kernel/sys.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/utsname.h>
+#include <linux/mman.h>
+#include <linux/smp_lock.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/prctl.h>
+#include <linux/init.h>
+#include <linux/highuid.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/kexec.h>
+#include <linux/workqueue.h>
+#include <linux/capability.h>
+#include <linux/device.h>
+#include <linux/key.h>
+#include <linux/times.h>
+#include <linux/posix-timers.h>
+#include <linux/security.h>
+#include <linux/dcookies.h>
+#include <linux/suspend.h>
+#include <linux/tty.h>
+#include <linux/signal.h>
+#include <linux/cn_proc.h>
+
+#include <linux/compat.h>
+#include <linux/syscalls.h>
+#include <linux/kprobes.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/unistd.h>
+
+#ifndef SET_UNALIGN_CTL
+# define SET_UNALIGN_CTL(a,b)	(-EINVAL)
+#endif
+#ifndef GET_UNALIGN_CTL
+# define GET_UNALIGN_CTL(a,b)	(-EINVAL)
+#endif
+#ifndef SET_FPEMU_CTL
+# define SET_FPEMU_CTL(a,b)	(-EINVAL)
+#endif
+#ifndef GET_FPEMU_CTL
+# define GET_FPEMU_CTL(a,b)	(-EINVAL)
+#endif
+#ifndef SET_FPEXC_CTL
+# define SET_FPEXC_CTL(a,b)	(-EINVAL)
+#endif
+#ifndef GET_FPEXC_CTL
+# define GET_FPEXC_CTL(a,b)	(-EINVAL)
+#endif
+
+/*
+ * this is where the system-wide overflow UID and GID are defined, for
+ * architectures that now have 32-bit UID/GID but didn't in the past
+ */
+
+int overflowuid = DEFAULT_OVERFLOWUID;
+int overflowgid = DEFAULT_OVERFLOWGID;
+
+#ifdef CONFIG_UID16
+EXPORT_SYMBOL(overflowuid);
+EXPORT_SYMBOL(overflowgid);
+#endif
+
+/*
+ * the same as above, but for filesystems which can only store a 16-bit
+ * UID and GID. as such, this is needed on all architectures
+ */
+
+int fs_overflowuid = DEFAULT_FS_OVERFLOWUID;
+int fs_overflowgid = DEFAULT_FS_OVERFLOWUID;
+
+EXPORT_SYMBOL(fs_overflowuid);
+EXPORT_SYMBOL(fs_overflowgid);
+
+/*
+ * this indicates whether you can reboot with ctrl-alt-del: the default is yes
+ */
+
+int C_A_D = 1;
+int cad_pid = 1;
+
+/*
+ *	Notifier list for kernel code which wants to be called
+ *	at shutdown. This is used to stop any idling DMA operations
+ *	and the like. 
+ */
+
+static BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
+
+/*
+ *	Notifier chain core routines.  The exported routines below
+ *	are layered on top of these, with appropriate locking added.
+ */
+
+static int notifier_chain_register(struct notifier_block **nl,
+		struct notifier_block *n)
+{
+	while ((*nl) != NULL) {
+		if (n->priority > (*nl)->priority)
+			break;
+		nl = &((*nl)->next);
+	}
+	n->next = *nl;
+	rcu_assign_pointer(*nl, n);
+	return 0;
+}
+
+static int notifier_chain_unregister(struct notifier_block **nl,
+		struct notifier_block *n)
+{
+	while ((*nl) != NULL) {
+		if ((*nl) == n) {
+			rcu_assign_pointer(*nl, n->next);
+			return 0;
+		}
+		nl = &((*nl)->next);
+	}
+	return -ENOENT;
+}
+
+static int __kprobes notifier_call_chain(struct notifier_block **nl,
+		unsigned long val, void *v)
+{
+	int ret = NOTIFY_DONE;
+	struct notifier_block *nb;
+
+	nb = rcu_dereference(*nl);
+	while (nb) {
+		ret = nb->notifier_call(nb, val, v);
+		if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
+			break;
+		nb = rcu_dereference(nb->next);
+	}
+	return ret;
+}
+
+/*
+ *	Atomic notifier chain routines.  Registration and unregistration
+ *	use a mutex, and call_chain is synchronized by RCU (no locks).
+ */
+
+/**
+ *	atomic_notifier_chain_register - Add notifier to an atomic notifier chain
+ *	@nh: Pointer to head of the atomic notifier chain
+ *	@n: New entry in notifier chain
+ *
+ *	Adds a notifier to an atomic notifier chain.
+ *
+ *	Currently always returns zero.
+ */
+
+int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
+		struct notifier_block *n)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&nh->lock, flags);
+	ret = notifier_chain_register(&nh->head, n);
+	spin_unlock_irqrestore(&nh->lock, flags);
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
+
+/**
+ *	atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
+ *	@nh: Pointer to head of the atomic notifier chain
+ *	@n: Entry to remove from notifier chain
+ *
+ *	Removes a notifier from an atomic notifier chain.
+ *
+ *	Returns zero on success or %-ENOENT on failure.
+ */
+int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
+		struct notifier_block *n)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&nh->lock, flags);
+	ret = notifier_chain_unregister(&nh->head, n);
+	spin_unlock_irqrestore(&nh->lock, flags);
+	synchronize_rcu();
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
+
+/**
+ *	atomic_notifier_call_chain - Call functions in an atomic notifier chain
+ *	@nh: Pointer to head of the atomic notifier chain
+ *	@val: Value passed unmodified to notifier function
+ *	@v: Pointer passed unmodified to notifier function
+ *
+ *	Calls each function in a notifier chain in turn.  The functions
+ *	run in an atomic context, so they must not block.
+ *	This routine uses RCU to synchronize with changes to the chain.
+ *
+ *	If the return value of the notifier can be and'ed
+ *	with %NOTIFY_STOP_MASK then atomic_notifier_call_chain
+ *	will return immediately, with the return value of
+ *	the notifier function which halted execution.
+ *	Otherwise the return value is the return value
+ *	of the last notifier function called.
+ */
+ 
+int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+		unsigned long val, void *v)
+{
+	int ret;
+
+	rcu_read_lock();
+	ret = notifier_call_chain(&nh->head, val, v);
+	rcu_read_unlock();
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
+
+/*
+ *	Blocking notifier chain routines.  All access to the chain is
+ *	synchronized by an rwsem.
+ */
+
+/**
+ *	blocking_notifier_chain_register - Add notifier to a blocking notifier chain
+ *	@nh: Pointer to head of the blocking notifier chain
+ *	@n: New entry in notifier chain
+ *
+ *	Adds a notifier to a blocking notifier chain.
+ *	Must be called in process context.
+ *
+ *	Currently always returns zero.
+ */
+ 
+int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
+		struct notifier_block *n)
+{
+	int ret;
+
+	/*
+	 * This code gets used during boot-up, when task switching is
+	 * not yet working and interrupts must remain disabled.  At
+	 * such times we must not call down_write().
+	 */
+	if (unlikely(system_state == SYSTEM_BOOTING))
+		return notifier_chain_register(&nh->head, n);
+
+	down_write(&nh->rwsem);
+	ret = notifier_chain_register(&nh->head, n);
+	up_write(&nh->rwsem);
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
+
+/**
+ *	blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
+ *	@nh: Pointer to head of the blocking notifier chain
+ *	@n: Entry to remove from notifier chain
+ *
+ *	Removes a notifier from a blocking notifier chain.
+ *	Must be called from process context.
+ *
+ *	Returns zero on success or %-ENOENT on failure.
+ */
+int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
+		struct notifier_block *n)
+{
+	int ret;
+
+	/*
+	 * This code gets used during boot-up, when task switching is
+	 * not yet working and interrupts must remain disabled.  At
+	 * such times we must not call down_write().
+	 */
+	if (unlikely(system_state == SYSTEM_BOOTING))
+		return notifier_chain_unregister(&nh->head, n);
+
+	down_write(&nh->rwsem);
+	ret = notifier_chain_unregister(&nh->head, n);
+	up_write(&nh->rwsem);
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
+
+/**
+ *	blocking_notifier_call_chain - Call functions in a blocking notifier chain
+ *	@nh: Pointer to head of the blocking notifier chain
+ *	@val: Value passed unmodified to notifier function
+ *	@v: Pointer passed unmodified to notifier function
+ *
+ *	Calls each function in a notifier chain in turn.  The functions
+ *	run in a process context, so they are allowed to block.
+ *
+ *	If the return value of the notifier can be and'ed
+ *	with %NOTIFY_STOP_MASK then blocking_notifier_call_chain
+ *	will return immediately, with the return value of
+ *	the notifier function which halted execution.
+ *	Otherwise the return value is the return value
+ *	of the last notifier function called.
+ */
+ 
+int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+		unsigned long val, void *v)
+{
+	int ret;
+
+	down_read(&nh->rwsem);
+	ret = notifier_call_chain(&nh->head, val, v);
+	up_read(&nh->rwsem);
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
+
+/*
+ *	Raw notifier chain routines.  There is no protection;
+ *	the caller must provide it.  Use at your own risk!
+ */
+
+/**
+ *	raw_notifier_chain_register - Add notifier to a raw notifier chain
+ *	@nh: Pointer to head of the raw notifier chain
+ *	@n: New entry in notifier chain
+ *
+ *	Adds a notifier to a raw notifier chain.
+ *	All locking must be provided by the caller.
+ *
+ *	Currently always returns zero.
+ */
+
+int raw_notifier_chain_register(struct raw_notifier_head *nh,
+		struct notifier_block *n)
+{
+	return notifier_chain_register(&nh->head, n);
+}
+
+EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
+
+/**
+ *	raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
+ *	@nh: Pointer to head of the raw notifier chain
+ *	@n: Entry to remove from notifier chain
+ *
+ *	Removes a notifier from a raw notifier chain.
+ *	All locking must be provided by the caller.
+ *
+ *	Returns zero on success or %-ENOENT on failure.
+ */
+int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
+		struct notifier_block *n)
+{
+	return notifier_chain_unregister(&nh->head, n);
+}
+
+EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
+
+/**
+ *	raw_notifier_call_chain - Call functions in a raw notifier chain
+ *	@nh: Pointer to head of the raw notifier chain
+ *	@val: Value passed unmodified to notifier function
+ *	@v: Pointer passed unmodified to notifier function
+ *
+ *	Calls each function in a notifier chain in turn.  The functions
+ *	run in an undefined context.
+ *	All locking must be provided by the caller.
+ *
+ *	If the return value of the notifier can be and'ed
+ *	with %NOTIFY_STOP_MASK then raw_notifier_call_chain
+ *	will return immediately, with the return value of
+ *	the notifier function which halted execution.
+ *	Otherwise the return value is the return value
+ *	of the last notifier function called.
+ */
+
+int raw_notifier_call_chain(struct raw_notifier_head *nh,
+		unsigned long val, void *v)
+{
+	return notifier_call_chain(&nh->head, val, v);
+}
+
+EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
+
+/**
+ *	register_reboot_notifier - Register function to be called at reboot time
+ *	@nb: Info about notifier function to be called
+ *
+ *	Registers a function with the list of functions
+ *	to be called at reboot time.
+ *
+ *	Currently always returns zero, as blocking_notifier_chain_register
+ *	always returns zero.
+ */
+ 
+int register_reboot_notifier(struct notifier_block * nb)
+{
+	return blocking_notifier_chain_register(&reboot_notifier_list, nb);
+}
+
+EXPORT_SYMBOL(register_reboot_notifier);
+
+/**
+ *	unregister_reboot_notifier - Unregister previously registered reboot notifier
+ *	@nb: Hook to be unregistered
+ *
+ *	Unregisters a previously registered reboot
+ *	notifier function.
+ *
+ *	Returns zero on success, or %-ENOENT on failure.
+ */
+ 
+int unregister_reboot_notifier(struct notifier_block * nb)
+{
+	return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
+}
+
+EXPORT_SYMBOL(unregister_reboot_notifier);
+
+#ifndef CONFIG_SECURITY
+int capable(int cap)
+{
+        if (cap_raised(current->cap_effective, cap)) {
+	       current->flags |= PF_SUPERPRIV;
+	       return 1;
+        }
+        return 0;
+}
+EXPORT_SYMBOL(capable);
+#endif
+
+static int set_one_prio(struct task_struct *p, int niceval, int error)
+{
+	int no_nice;
+
+	if (p->uid != current->euid &&
+		p->euid != current->euid && !capable(CAP_SYS_NICE)) {
+		error = -EPERM;
+		goto out;
+	}
+	if (niceval < task_nice(p) && !can_nice(p, niceval)) {
+		error = -EACCES;
+		goto out;
+	}
+	no_nice = security_task_setnice(p, niceval);
+	if (no_nice) {
+		error = no_nice;
+		goto out;
+	}
+	if (error == -ESRCH)
+		error = 0;
+	set_user_nice(p, niceval);
+out:
+	return error;
+}
+
+asmlinkage long sys_setpriority(int which, int who, int niceval)
+{
+	struct task_struct *g, *p;
+	struct user_struct *user;
+	int error = -EINVAL;
+
+	if (which > 2 || which < 0)
+		goto out;
+
+	/* normalize: avoid signed division (rounding problems) */
+	error = -ESRCH;
+	if (niceval < -20)
+		niceval = -20;
+	if (niceval > 19)
+		niceval = 19;
+
+	read_lock(&tasklist_lock);
+	switch (which) {
+		case PRIO_PROCESS:
+			if (!who)
+				who = current->pid;
+			p = find_task_by_pid(who);
+			if (p)
+				error = set_one_prio(p, niceval, error);
+			break;
+		case PRIO_PGRP:
+			if (!who)
+				who = process_group(current);
+			do_each_task_pid(who, PIDTYPE_PGID, p) {
+				error = set_one_prio(p, niceval, error);
+			} while_each_task_pid(who, PIDTYPE_PGID, p);
+			break;
+		case PRIO_USER:
+			user = current->user;
+			if (!who)
+				who = current->uid;
+			else
+				if ((who != current->uid) && !(user = find_user(who)))
+					goto out_unlock;	/* No processes for this user */
+
+			do_each_thread(g, p)
+				if (p->uid == who)
+					error = set_one_prio(p, niceval, error);
+			while_each_thread(g, p);
+			if (who != current->uid)
+				free_uid(user);		/* For find_user() */
+			break;
+	}
+out_unlock:
+	read_unlock(&tasklist_lock);
+out:
+	return error;
+}
+
+/*
+ * Ugh. To avoid negative return values, "getpriority()" will
+ * not return the normal nice-value, but a negated value that
+ * has been offset by 20 (ie it returns 40..1 instead of -20..19)
+ * to stay compatible.
+ */
+asmlinkage long sys_getpriority(int which, int who)
+{
+	struct task_struct *g, *p;
+	struct user_struct *user;
+	long niceval, retval = -ESRCH;
+
+	if (which > 2 || which < 0)
+		return -EINVAL;
+
+	read_lock(&tasklist_lock);
+	switch (which) {
+		case PRIO_PROCESS:
+			if (!who)
+				who = current->pid;
+			p = find_task_by_pid(who);
+			if (p) {
+				niceval = 20 - task_nice(p);
+				if (niceval > retval)
+					retval = niceval;
+			}
+			break;
+		case PRIO_PGRP:
+			if (!who)
+				who = process_group(current);
+			do_each_task_pid(who, PIDTYPE_PGID, p) {
+				niceval = 20 - task_nice(p);
+				if (niceval > retval)
+					retval = niceval;
+			} while_each_task_pid(who, PIDTYPE_PGID, p);
+			break;
+		case PRIO_USER:
+			user = current->user;
+			if (!who)
+				who = current->uid;
+			else
+				if ((who != current->uid) && !(user = find_user(who)))
+					goto out_unlock;	/* No processes for this user */
+
+			do_each_thread(g, p)
+				if (p->uid == who) {
+					niceval = 20 - task_nice(p);
+					if (niceval > retval)
+						retval = niceval;
+				}
+			while_each_thread(g, p);
+			if (who != current->uid)
+				free_uid(user);		/* for find_user() */
+			break;
+	}
+out_unlock:
+	read_unlock(&tasklist_lock);
+
+	return retval;
+}
+
+/**
+ *	emergency_restart - reboot the system
+ *
+ *	Without shutting down any hardware or taking any locks
+ *	reboot the system.  This is called when we know we are in
+ *	trouble so this is our best effort to reboot.  This is
+ *	safe to call in interrupt context.
+ */
+void emergency_restart(void)
+{
+	machine_emergency_restart();
+}
+EXPORT_SYMBOL_GPL(emergency_restart);
+
+void kernel_restart_prepare(char *cmd)
+{
+	blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
+	system_state = SYSTEM_RESTART;
+	device_shutdown();
+}
+
+/**
+ *	kernel_restart - reboot the system
+ *	@cmd: pointer to buffer containing command to execute for restart
+ *		or %NULL
+ *
+ *	Shutdown everything and perform a clean reboot.
+ *	This is not safe to call in interrupt context.
+ */
+void kernel_restart(char *cmd)
+{
+	kernel_restart_prepare(cmd);
+	if (!cmd) {
+		printk(KERN_EMERG "Restarting system.\n");
+	} else {
+		printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
+	}
+	printk(".\n");
+	machine_restart(cmd);
+}
+EXPORT_SYMBOL_GPL(kernel_restart);
+
+/**
+ *	kernel_kexec - reboot the system
+ *
+ *	Move into place and start executing a preloaded standalone
+ *	executable.  If nothing was preloaded return an error.
+ */
+void kernel_kexec(void)
+{
+#ifdef CONFIG_KEXEC
+	struct kimage *image;
+	image = xchg(&kexec_image, NULL);
+	if (!image) {
+		return;
+	}
+	kernel_restart_prepare(NULL);
+	printk(KERN_EMERG "Starting new kernel\n");
+	machine_shutdown();
+	machine_kexec(image);
+#endif
+}
+EXPORT_SYMBOL_GPL(kernel_kexec);
+
+void kernel_shutdown_prepare(enum system_states state)
+{
+	blocking_notifier_call_chain(&reboot_notifier_list,
+		(state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
+	system_state = state;
+	device_shutdown();
+}
+/**
+ *	kernel_halt - halt the system
+ *
+ *	Shutdown everything and perform a clean system halt.
+ */
+void kernel_halt(void)
+{
+	kernel_shutdown_prepare(SYSTEM_HALT);
+	printk(KERN_EMERG "System halted.\n");
+	machine_halt();
+}
+
+EXPORT_SYMBOL_GPL(kernel_halt);
+
+/**
+ *	kernel_power_off - power_off the system
+ *
+ *	Shutdown everything and perform a clean system power_off.
+ */
+void kernel_power_off(void)
+{
+	kernel_shutdown_prepare(SYSTEM_POWER_OFF);
+	printk(KERN_EMERG "Power down.\n");
+	machine_power_off();
+}
+EXPORT_SYMBOL_GPL(kernel_power_off);
+/*
+ * Reboot system call: for obvious reasons only root may call it,
+ * and even root needs to set up some magic numbers in the registers
+ * so that some mistake won't make this reboot the whole machine.
+ * You can also set the meaning of the ctrl-alt-del-key here.
+ *
+ * reboot doesn't sync: do that yourself before calling this.
+ */
+asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user * arg)
+{
+	char buffer[256];
+
+	/* We only trust the superuser with rebooting the system. */
+	if (!capable(CAP_SYS_BOOT))
+		return -EPERM;
+
+	/* For safety, we require "magic" arguments. */
+	if (magic1 != LINUX_REBOOT_MAGIC1 ||
+	    (magic2 != LINUX_REBOOT_MAGIC2 &&
+	                magic2 != LINUX_REBOOT_MAGIC2A &&
+			magic2 != LINUX_REBOOT_MAGIC2B &&
+	                magic2 != LINUX_REBOOT_MAGIC2C))
+		return -EINVAL;
+
+	/* Instead of trying to make the power_off code look like
+	 * halt when pm_power_off is not set do it the easy way.
+	 */
+	if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
+		cmd = LINUX_REBOOT_CMD_HALT;
+
+	lock_kernel();
+	switch (cmd) {
+	case LINUX_REBOOT_CMD_RESTART:
+		kernel_restart(NULL);
+		break;
+
+	case LINUX_REBOOT_CMD_CAD_ON:
+		C_A_D = 1;
+		break;
+
+	case LINUX_REBOOT_CMD_CAD_OFF:
+		C_A_D = 0;
+		break;
+
+	case LINUX_REBOOT_CMD_HALT:
+		kernel_halt();
+		unlock_kernel();
+		do_exit(0);
+		break;
+
+	case LINUX_REBOOT_CMD_POWER_OFF:
+		kernel_power_off();
+		unlock_kernel();
+		do_exit(0);
+		break;
+
+	case LINUX_REBOOT_CMD_RESTART2:
+		if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
+			unlock_kernel();
+			return -EFAULT;
+		}
+		buffer[sizeof(buffer) - 1] = '\0';
+
+		kernel_restart(buffer);
+		break;
+
+	case LINUX_REBOOT_CMD_KEXEC:
+		kernel_kexec();
+		unlock_kernel();
+		return -EINVAL;
+
+#ifdef CONFIG_SOFTWARE_SUSPEND
+	case LINUX_REBOOT_CMD_SW_SUSPEND:
+		{
+			int ret = software_suspend();
+			unlock_kernel();
+			return ret;
+		}
+#endif
+
+	default:
+		unlock_kernel();
+		return -EINVAL;
+	}
+	unlock_kernel();
+	return 0;
+}
+
+static void deferred_cad(void *dummy)
+{
+	kernel_restart(NULL);
+}
+
+/*
+ * This function gets called by ctrl-alt-del - ie the keyboard interrupt.
+ * As it's called within an interrupt, it may NOT sync: the only choice
+ * is whether to reboot at once, or just ignore the ctrl-alt-del.
+ */
+void ctrl_alt_del(void)
+{
+	static DECLARE_WORK(cad_work, deferred_cad, NULL);
+
+	if (C_A_D)
+		schedule_work(&cad_work);
+	else
+		kill_proc(cad_pid, SIGINT, 1);
+}
+	
+
+/*
+ * Unprivileged users may change the real gid to the effective gid
+ * or vice versa.  (BSD-style)
+ *
+ * If you set the real gid at all, or set the effective gid to a value not
+ * equal to the real gid, then the saved gid is set to the new effective gid.
+ *
+ * This makes it possible for a setgid program to completely drop its
+ * privileges, which is often a useful assertion to make when you are doing
+ * a security audit over a program.
+ *
+ * The general idea is that a program which uses just setregid() will be
+ * 100% compatible with BSD.  A program which uses just setgid() will be
+ * 100% compatible with POSIX with saved IDs. 
+ *
+ * SMP: There are not races, the GIDs are checked only by filesystem
+ *      operations (as far as semantic preservation is concerned).
+ */
+asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
+{
+	int old_rgid = current->gid;
+	int old_egid = current->egid;
+	int new_rgid = old_rgid;
+	int new_egid = old_egid;
+	int retval;
+
+	retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE);
+	if (retval)
+		return retval;
+
+	if (rgid != (gid_t) -1) {
+		if ((old_rgid == rgid) ||
+		    (current->egid==rgid) ||
+		    capable(CAP_SETGID))
+			new_rgid = rgid;
+		else
+			return -EPERM;
+	}
+	if (egid != (gid_t) -1) {
+		if ((old_rgid == egid) ||
+		    (current->egid == egid) ||
+		    (current->sgid == egid) ||
+		    capable(CAP_SETGID))
+			new_egid = egid;
+		else {
+			return -EPERM;
+		}
+	}
+	if (new_egid != old_egid)
+	{
+		current->mm->dumpable = suid_dumpable;
+		smp_wmb();
+	}
+	if (rgid != (gid_t) -1 ||
+	    (egid != (gid_t) -1 && egid != old_rgid))
+		current->sgid = new_egid;
+	current->fsgid = new_egid;
+	current->egid = new_egid;
+	current->gid = new_rgid;
+	key_fsgid_changed(current);
+	proc_id_connector(current, PROC_EVENT_GID);
+	return 0;
+}
+
+/*
+ * setgid() is implemented like SysV w/ SAVED_IDS 
+ *
+ * SMP: Same implicit races as above.
+ */
+asmlinkage long sys_setgid(gid_t gid)
+{
+	int old_egid = current->egid;
+	int retval;
+
+	retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
+	if (retval)
+		return retval;
+
+	if (capable(CAP_SETGID))
+	{
+		if(old_egid != gid)
+		{
+			current->mm->dumpable = suid_dumpable;
+			smp_wmb();
+		}
+		current->gid = current->egid = current->sgid = current->fsgid = gid;
+	}
+	else if ((gid == current->gid) || (gid == current->sgid))
+	{
+		if(old_egid != gid)
+		{
+			current->mm->dumpable = suid_dumpable;
+			smp_wmb();
+		}
+		current->egid = current->fsgid = gid;
+	}
+	else
+		return -EPERM;
+
+	key_fsgid_changed(current);
+	proc_id_connector(current, PROC_EVENT_GID);
+	return 0;
+}
+  
+static int set_user(uid_t new_ruid, int dumpclear)
+{
+	struct user_struct *new_user;
+
+	new_user = alloc_uid(new_ruid);
+	if (!new_user)
+		return -EAGAIN;
+
+	if (atomic_read(&new_user->processes) >=
+				current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
+			new_user != &root_user) {
+		free_uid(new_user);
+		return -EAGAIN;
+	}
+
+	switch_uid(new_user);
+
+	if(dumpclear)
+	{
+		current->mm->dumpable = suid_dumpable;
+		smp_wmb();
+	}
+	current->uid = new_ruid;
+	return 0;
+}
+
+/*
+ * Unprivileged users may change the real uid to the effective uid
+ * or vice versa.  (BSD-style)
+ *
+ * If you set the real uid at all, or set the effective uid to a value not
+ * equal to the real uid, then the saved uid is set to the new effective uid.
+ *
+ * This makes it possible for a setuid program to completely drop its
+ * privileges, which is often a useful assertion to make when you are doing
+ * a security audit over a program.
+ *
+ * The general idea is that a program which uses just setreuid() will be
+ * 100% compatible with BSD.  A program which uses just setuid() will be
+ * 100% compatible with POSIX with saved IDs. 
+ */
+asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
+{
+	int old_ruid, old_euid, old_suid, new_ruid, new_euid;
+	int retval;
+
+	retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE);
+	if (retval)
+		return retval;
+
+	new_ruid = old_ruid = current->uid;
+	new_euid = old_euid = current->euid;
+	old_suid = current->suid;
+
+	if (ruid != (uid_t) -1) {
+		new_ruid = ruid;
+		if ((old_ruid != ruid) &&
+		    (current->euid != ruid) &&
+		    !capable(CAP_SETUID))
+			return -EPERM;
+	}
+
+	if (euid != (uid_t) -1) {
+		new_euid = euid;
+		if ((old_ruid != euid) &&
+		    (current->euid != euid) &&
+		    (current->suid != euid) &&
+		    !capable(CAP_SETUID))
+			return -EPERM;
+	}
+
+	if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0)
+		return -EAGAIN;
+
+	if (new_euid != old_euid)
+	{
+		current->mm->dumpable = suid_dumpable;
+		smp_wmb();
+	}
+	current->fsuid = current->euid = new_euid;
+	if (ruid != (uid_t) -1 ||
+	    (euid != (uid_t) -1 && euid != old_ruid))
+		current->suid = current->euid;
+	current->fsuid = current->euid;
+
+	key_fsuid_changed(current);
+	proc_id_connector(current, PROC_EVENT_UID);
+
+	return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE);
+}
+
+
+		
+/*
+ * setuid() is implemented like SysV with SAVED_IDS 
+ * 
+ * Note that SAVED_ID's is deficient in that a setuid root program
+ * like sendmail, for example, cannot set its uid to be a normal 
+ * user and then switch back, because if you're root, setuid() sets
+ * the saved uid too.  If you don't like this, blame the bright people
+ * in the POSIX committee and/or USG.  Note that the BSD-style setreuid()
+ * will allow a root program to temporarily drop privileges and be able to
+ * regain them by swapping the real and effective uid.  
+ */
+asmlinkage long sys_setuid(uid_t uid)
+{
+	int old_euid = current->euid;
+	int old_ruid, old_suid, new_ruid, new_suid;
+	int retval;
+
+	retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
+	if (retval)
+		return retval;
+
+	old_ruid = new_ruid = current->uid;
+	old_suid = current->suid;
+	new_suid = old_suid;
+	
+	if (capable(CAP_SETUID)) {
+		if (uid != old_ruid && set_user(uid, old_euid != uid) < 0)
+			return -EAGAIN;
+		new_suid = uid;
+	} else if ((uid != current->uid) && (uid != new_suid))
+		return -EPERM;
+
+	if (old_euid != uid)
+	{
+		current->mm->dumpable = suid_dumpable;
+		smp_wmb();
+	}
+	current->fsuid = current->euid = uid;
+	current->suid = new_suid;
+
+	key_fsuid_changed(current);
+	proc_id_connector(current, PROC_EVENT_UID);
+
+	return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID);
+}
+
+
+/*
+ * This function implements a generic ability to update ruid, euid,
+ * and suid.  This allows you to implement the 4.4 compatible seteuid().
+ */
+asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
+{
+	int old_ruid = current->uid;
+	int old_euid = current->euid;
+	int old_suid = current->suid;
+	int retval;
+
+	retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
+	if (retval)
+		return retval;
+
+	if (!capable(CAP_SETUID)) {
+		if ((ruid != (uid_t) -1) && (ruid != current->uid) &&
+		    (ruid != current->euid) && (ruid != current->suid))
+			return -EPERM;
+		if ((euid != (uid_t) -1) && (euid != current->uid) &&
+		    (euid != current->euid) && (euid != current->suid))
+			return -EPERM;
+		if ((suid != (uid_t) -1) && (suid != current->uid) &&
+		    (suid != current->euid) && (suid != current->suid))
+			return -EPERM;
+	}
+	if (ruid != (uid_t) -1) {
+		if (ruid != current->uid && set_user(ruid, euid != current->euid) < 0)
+			return -EAGAIN;
+	}
+	if (euid != (uid_t) -1) {
+		if (euid != current->euid)
+		{
+			current->mm->dumpable = suid_dumpable;
+			smp_wmb();
+		}
+		current->euid = euid;
+	}
+	current->fsuid = current->euid;
+	if (suid != (uid_t) -1)
+		current->suid = suid;
+
+	key_fsuid_changed(current);
+	proc_id_connector(current, PROC_EVENT_UID);
+
+	return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES);
+}
+
+asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid)
+{
+	int retval;
+
+	if (!(retval = put_user(current->uid, ruid)) &&
+	    !(retval = put_user(current->euid, euid)))
+		retval = put_user(current->suid, suid);
+
+	return retval;
+}
+
+/*
+ * Same as above, but for rgid, egid, sgid.
+ */
+asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
+{
+	int retval;
+
+	retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
+	if (retval)
+		return retval;
+
+	if (!capable(CAP_SETGID)) {
+		if ((rgid != (gid_t) -1) && (rgid != current->gid) &&
+		    (rgid != current->egid) && (rgid != current->sgid))
+			return -EPERM;
+		if ((egid != (gid_t) -1) && (egid != current->gid) &&
+		    (egid != current->egid) && (egid != current->sgid))
+			return -EPERM;
+		if ((sgid != (gid_t) -1) && (sgid != current->gid) &&
+		    (sgid != current->egid) && (sgid != current->sgid))
+			return -EPERM;
+	}
+	if (egid != (gid_t) -1) {
+		if (egid != current->egid)
+		{
+			current->mm->dumpable = suid_dumpable;
+			smp_wmb();
+		}
+		current->egid = egid;
+	}
+	current->fsgid = current->egid;
+	if (rgid != (gid_t) -1)
+		current->gid = rgid;
+	if (sgid != (gid_t) -1)
+		current->sgid = sgid;
+
+	key_fsgid_changed(current);
+	proc_id_connector(current, PROC_EVENT_GID);
+	return 0;
+}
+
+asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid)
+{
+	int retval;
+
+	if (!(retval = put_user(current->gid, rgid)) &&
+	    !(retval = put_user(current->egid, egid)))
+		retval = put_user(current->sgid, sgid);
+
+	return retval;
+}
+
+
+/*
+ * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This
+ * is used for "access()" and for the NFS daemon (letting nfsd stay at
+ * whatever uid it wants to). It normally shadows "euid", except when
+ * explicitly set by setfsuid() or for access..
+ */
+asmlinkage long sys_setfsuid(uid_t uid)
+{
+	int old_fsuid;
+
+	old_fsuid = current->fsuid;
+	if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS))
+		return old_fsuid;
+
+	if (uid == current->uid || uid == current->euid ||
+	    uid == current->suid || uid == current->fsuid || 
+	    capable(CAP_SETUID))
+	{
+		if (uid != old_fsuid)
+		{
+			current->mm->dumpable = suid_dumpable;
+			smp_wmb();
+		}
+		current->fsuid = uid;
+	}
+
+	key_fsuid_changed(current);
+	proc_id_connector(current, PROC_EVENT_UID);
+
+	security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS);
+
+	return old_fsuid;
+}
+
+/*
+ * Samma på svenska..
+ */
+asmlinkage long sys_setfsgid(gid_t gid)
+{
+	int old_fsgid;
+
+	old_fsgid = current->fsgid;
+	if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
+		return old_fsgid;
+
+	if (gid == current->gid || gid == current->egid ||
+	    gid == current->sgid || gid == current->fsgid || 
+	    capable(CAP_SETGID))
+	{
+		if (gid != old_fsgid)
+		{
+			current->mm->dumpable = suid_dumpable;
+			smp_wmb();
+		}
+		current->fsgid = gid;
+		key_fsgid_changed(current);
+		proc_id_connector(current, PROC_EVENT_GID);
+	}
+	return old_fsgid;
+}
+
+asmlinkage long sys_times(struct tms __user * tbuf)
+{
+	/*
+	 *	In the SMP world we might just be unlucky and have one of
+	 *	the times increment as we use it. Since the value is an
+	 *	atomically safe type this is just fine. Conceptually its
+	 *	as if the syscall took an instant longer to occur.
+	 */
+	if (tbuf) {
+		struct tms tmp;
+		struct task_struct *tsk = current;
+		struct task_struct *t;
+		cputime_t utime, stime, cutime, cstime;
+
+		spin_lock_irq(&tsk->sighand->siglock);
+		utime = tsk->signal->utime;
+		stime = tsk->signal->stime;
+		t = tsk;
+		do {
+			utime = cputime_add(utime, t->utime);
+			stime = cputime_add(stime, t->stime);
+			t = next_thread(t);
+		} while (t != tsk);
+
+		cutime = tsk->signal->cutime;
+		cstime = tsk->signal->cstime;
+		spin_unlock_irq(&tsk->sighand->siglock);
+
+		tmp.tms_utime = cputime_to_clock_t(utime);
+		tmp.tms_stime = cputime_to_clock_t(stime);
+		tmp.tms_cutime = cputime_to_clock_t(cutime);
+		tmp.tms_cstime = cputime_to_clock_t(cstime);
+		if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
+			return -EFAULT;
+	}
+	return (long) jiffies_64_to_clock_t(get_jiffies_64());
+}
+
+/*
+ * This needs some heavy checking ...
+ * I just haven't the stomach for it. I also don't fully
+ * understand sessions/pgrp etc. Let somebody who does explain it.
+ *
+ * OK, I think I have the protection semantics right.... this is really
+ * only important on a multi-user system anyway, to make sure one user
+ * can't send a signal to a process owned by another.  -TYT, 12/12/91
+ *
+ * Auch. Had to add the 'did_exec' flag to conform completely to POSIX.
+ * LBT 04.03.94
+ */
+
+asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
+{
+	struct task_struct *p;
+	struct task_struct *group_leader = current->group_leader;
+	int err = -EINVAL;
+
+	if (!pid)
+		pid = group_leader->pid;
+	if (!pgid)
+		pgid = pid;
+	if (pgid < 0)
+		return -EINVAL;
+
+	/* From this point forward we keep holding onto the tasklist lock
+	 * so that our parent does not change from under us. -DaveM
+	 */
+	write_lock_irq(&tasklist_lock);
+
+	err = -ESRCH;
+	p = find_task_by_pid(pid);
+	if (!p)
+		goto out;
+
+	err = -EINVAL;
+	if (!thread_group_leader(p))
+		goto out;
+
+	if (p->real_parent == group_leader) {
+		err = -EPERM;
+		if (p->signal->session != group_leader->signal->session)
+			goto out;
+		err = -EACCES;
+		if (p->did_exec)
+			goto out;
+	} else {
+		err = -ESRCH;
+		if (p != group_leader)
+			goto out;
+	}
+
+	err = -EPERM;
+	if (p->signal->leader)
+		goto out;
+
+	if (pgid != pid) {
+		struct task_struct *p;
+
+		do_each_task_pid(pgid, PIDTYPE_PGID, p) {
+			if (p->signal->session == group_leader->signal->session)
+				goto ok_pgid;
+		} while_each_task_pid(pgid, PIDTYPE_PGID, p);
+		goto out;
+	}
+
+ok_pgid:
+	err = security_task_setpgid(p, pgid);
+	if (err)
+		goto out;
+
+	if (process_group(p) != pgid) {
+		detach_pid(p, PIDTYPE_PGID);
+		p->signal->pgrp = pgid;
+		attach_pid(p, PIDTYPE_PGID, pgid);
+	}
+
+	err = 0;
+out:
+	/* All paths lead to here, thus we are safe. -DaveM */
+	write_unlock_irq(&tasklist_lock);
+	return err;
+}
+
+asmlinkage long sys_getpgid(pid_t pid)
+{
+	if (!pid) {
+		return process_group(current);
+	} else {
+		int retval;
+		struct task_struct *p;
+
+		read_lock(&tasklist_lock);
+		p = find_task_by_pid(pid);
+
+		retval = -ESRCH;
+		if (p) {
+			retval = security_task_getpgid(p);
+			if (!retval)
+				retval = process_group(p);
+		}
+		read_unlock(&tasklist_lock);
+		return retval;
+	}
+}
+
+#ifdef __ARCH_WANT_SYS_GETPGRP
+
+asmlinkage long sys_getpgrp(void)
+{
+	/* SMP - assuming writes are word atomic this is fine */
+	return process_group(current);
+}
+
+#endif
+
+asmlinkage long sys_getsid(pid_t pid)
+{
+	if (!pid) {
+		return current->signal->session;
+	} else {
+		int retval;
+		struct task_struct *p;
+
+		read_lock(&tasklist_lock);
+		p = find_task_by_pid(pid);
+
+		retval = -ESRCH;
+		if(p) {
+			retval = security_task_getsid(p);
+			if (!retval)
+				retval = p->signal->session;
+		}
+		read_unlock(&tasklist_lock);
+		return retval;
+	}
+}
+
+asmlinkage long sys_setsid(void)
+{
+	struct task_struct *group_leader = current->group_leader;
+	struct pid *pid;
+	int err = -EPERM;
+
+	mutex_lock(&tty_mutex);
+	write_lock_irq(&tasklist_lock);
+
+	pid = find_pid(PIDTYPE_PGID, group_leader->pid);
+	if (pid)
+		goto out;
+
+	group_leader->signal->leader = 1;
+	__set_special_pids(group_leader->pid, group_leader->pid);
+	group_leader->signal->tty = NULL;
+	group_leader->signal->tty_old_pgrp = 0;
+	err = process_group(group_leader);
+out:
+	write_unlock_irq(&tasklist_lock);
+	mutex_unlock(&tty_mutex);
+	return err;
+}
+
+/*
+ * Supplementary group IDs
+ */
+
+/* init to 2 - one for init_task, one to ensure it is never freed */
+struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
+
+struct group_info *groups_alloc(int gidsetsize)
+{
+	struct group_info *group_info;
+	int nblocks;
+	int i;
+
+	nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
+	/* Make sure we always allocate at least one indirect block pointer */
+	nblocks = nblocks ? : 1;
+	group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
+	if (!group_info)
+		return NULL;
+	group_info->ngroups = gidsetsize;
+	group_info->nblocks = nblocks;
+	atomic_set(&group_info->usage, 1);
+
+	if (gidsetsize <= NGROUPS_SMALL) {
+		group_info->blocks[0] = group_info->small_block;
+	} else {
+		for (i = 0; i < nblocks; i++) {
+			gid_t *b;
+			b = (void *)__get_free_page(GFP_USER);
+			if (!b)
+				goto out_undo_partial_alloc;
+			group_info->blocks[i] = b;
+		}
+	}
+	return group_info;
+
+out_undo_partial_alloc:
+	while (--i >= 0) {
+		free_page((unsigned long)group_info->blocks[i]);
+	}
+	kfree(group_info);
+	return NULL;
+}
+
+EXPORT_SYMBOL(groups_alloc);
+
+void groups_free(struct group_info *group_info)
+{
+	if (group_info->blocks[0] != group_info->small_block) {
+		int i;
+		for (i = 0; i < group_info->nblocks; i++)
+			free_page((unsigned long)group_info->blocks[i]);
+	}
+	kfree(group_info);
+}
+
+EXPORT_SYMBOL(groups_free);
+
+/* export the group_info to a user-space array */
+static int groups_to_user(gid_t __user *grouplist,
+    struct group_info *group_info)
+{
+	int i;
+	int count = group_info->ngroups;
+
+	for (i = 0; i < group_info->nblocks; i++) {
+		int cp_count = min(NGROUPS_PER_BLOCK, count);
+		int off = i * NGROUPS_PER_BLOCK;
+		int len = cp_count * sizeof(*grouplist);
+
+		if (copy_to_user(grouplist+off, group_info->blocks[i], len))
+			return -EFAULT;
+
+		count -= cp_count;
+	}
+	return 0;
+}
+
+/* fill a group_info from a user-space array - it must be allocated already */
+static int groups_from_user(struct group_info *group_info,
+    gid_t __user *grouplist)
+ {
+	int i;
+	int count = group_info->ngroups;
+
+	for (i = 0; i < group_info->nblocks; i++) {
+		int cp_count = min(NGROUPS_PER_BLOCK, count);
+		int off = i * NGROUPS_PER_BLOCK;
+		int len = cp_count * sizeof(*grouplist);
+
+		if (copy_from_user(group_info->blocks[i], grouplist+off, len))
+			return -EFAULT;
+
+		count -= cp_count;
+	}
+	return 0;
+}
+
+/* a simple Shell sort */
+static void groups_sort(struct group_info *group_info)
+{
+	int base, max, stride;
+	int gidsetsize = group_info->ngroups;
+
+	for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
+		; /* nothing */
+	stride /= 3;
+
+	while (stride) {
+		max = gidsetsize - stride;
+		for (base = 0; base < max; base++) {
+			int left = base;
+			int right = left + stride;
+			gid_t tmp = GROUP_AT(group_info, right);
+
+			while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
+				GROUP_AT(group_info, right) =
+				    GROUP_AT(group_info, left);
+				right = left;
+				left -= stride;
+			}
+			GROUP_AT(group_info, right) = tmp;
+		}
+		stride /= 3;
+	}
+}
+
+/* a simple bsearch */
+int groups_search(struct group_info *group_info, gid_t grp)
+{
+	int left, right;
+
+	if (!group_info)
+		return 0;
+
+	left = 0;
+	right = group_info->ngroups;
+	while (left < right) {
+		int mid = (left+right)/2;
+		int cmp = grp - GROUP_AT(group_info, mid);
+		if (cmp > 0)
+			left = mid + 1;
+		else if (cmp < 0)
+			right = mid;
+		else
+			return 1;
+	}
+	return 0;
+}
+
+/* validate and set current->group_info */
+int set_current_groups(struct group_info *group_info)
+{
+	int retval;
+	struct group_info *old_info;
+
+	retval = security_task_setgroups(group_info);
+	if (retval)
+		return retval;
+
+	groups_sort(group_info);
+	get_group_info(group_info);
+
+	task_lock(current);
+	old_info = current->group_info;
+	current->group_info = group_info;
+	task_unlock(current);
+
+	put_group_info(old_info);
+
+	return 0;
+}
+
+EXPORT_SYMBOL(set_current_groups);
+
+asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
+{
+	int i = 0;
+
+	/*
+	 *	SMP: Nobody else can change our grouplist. Thus we are
+	 *	safe.
+	 */
+
+	if (gidsetsize < 0)
+		return -EINVAL;
+
+	/* no need to grab task_lock here; it cannot change */
+	get_group_info(current->group_info);
+	i = current->group_info->ngroups;
+	if (gidsetsize) {
+		if (i > gidsetsize) {
+			i = -EINVAL;
+			goto out;
+		}
+		if (groups_to_user(grouplist, current->group_info)) {
+			i = -EFAULT;
+			goto out;
+		}
+	}
+out:
+	put_group_info(current->group_info);
+	return i;
+}
+
+/*
+ *	SMP: Our groups are copy-on-write. We can set them safely
+ *	without another task interfering.
+ */
+ 
+asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist)
+{
+	struct group_info *group_info;
+	int retval;
+
+	if (!capable(CAP_SETGID))
+		return -EPERM;
+	if ((unsigned)gidsetsize > NGROUPS_MAX)
+		return -EINVAL;
+
+	group_info = groups_alloc(gidsetsize);
+	if (!group_info)
+		return -ENOMEM;
+	retval = groups_from_user(group_info, grouplist);
+	if (retval) {
+		put_group_info(group_info);
+		return retval;
+	}
+
+	retval = set_current_groups(group_info);
+	put_group_info(group_info);
+
+	return retval;
+}
+
+/*
+ * Check whether we're fsgid/egid or in the supplemental group..
+ */
+int in_group_p(gid_t grp)
+{
+	int retval = 1;
+	if (grp != current->fsgid) {
+		get_group_info(current->group_info);
+		retval = groups_search(current->group_info, grp);
+		put_group_info(current->group_info);
+	}
+	return retval;
+}
+
+EXPORT_SYMBOL(in_group_p);
+
+int in_egroup_p(gid_t grp)
+{
+	int retval = 1;
+	if (grp != current->egid) {
+		get_group_info(current->group_info);
+		retval = groups_search(current->group_info, grp);
+		put_group_info(current->group_info);
+	}
+	return retval;
+}
+
+EXPORT_SYMBOL(in_egroup_p);
+
+DECLARE_RWSEM(uts_sem);
+
+EXPORT_SYMBOL(uts_sem);
+
+asmlinkage long sys_newuname(struct new_utsname __user * name)
+{
+	int errno = 0;
+
+	down_read(&uts_sem);
+	if (copy_to_user(name,&system_utsname,sizeof *name))
+		errno = -EFAULT;
+	up_read(&uts_sem);
+	return errno;
+}
+
+asmlinkage long sys_sethostname(char __user *name, int len)
+{
+	int errno;
+	char tmp[__NEW_UTS_LEN];
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (len < 0 || len > __NEW_UTS_LEN)
+		return -EINVAL;
+	down_write(&uts_sem);
+	errno = -EFAULT;
+	if (!copy_from_user(tmp, name, len)) {
+		memcpy(system_utsname.nodename, tmp, len);
+		system_utsname.nodename[len] = 0;
+		errno = 0;
+	}
+	up_write(&uts_sem);
+	return errno;
+}
+
+#ifdef __ARCH_WANT_SYS_GETHOSTNAME
+
+asmlinkage long sys_gethostname(char __user *name, int len)
+{
+	int i, errno;
+
+	if (len < 0)
+		return -EINVAL;
+	down_read(&uts_sem);
+	i = 1 + strlen(system_utsname.nodename);
+	if (i > len)
+		i = len;
+	errno = 0;
+	if (copy_to_user(name, system_utsname.nodename, i))
+		errno = -EFAULT;
+	up_read(&uts_sem);
+	return errno;
+}
+
+#endif
+
+/*
+ * Only setdomainname; getdomainname can be implemented by calling
+ * uname()
+ */
+asmlinkage long sys_setdomainname(char __user *name, int len)
+{
+	int errno;
+	char tmp[__NEW_UTS_LEN];
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (len < 0 || len > __NEW_UTS_LEN)
+		return -EINVAL;
+
+	down_write(&uts_sem);
+	errno = -EFAULT;
+	if (!copy_from_user(tmp, name, len)) {
+		memcpy(system_utsname.domainname, tmp, len);
+		system_utsname.domainname[len] = 0;
+		errno = 0;
+	}
+	up_write(&uts_sem);
+	return errno;
+}
+
+asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim)
+{
+	if (resource >= RLIM_NLIMITS)
+		return -EINVAL;
+	else {
+		struct rlimit value;
+		task_lock(current->group_leader);
+		value = current->signal->rlim[resource];
+		task_unlock(current->group_leader);
+		return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0;
+	}
+}
+
+#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
+
+/*
+ *	Back compatibility for getrlimit. Needed for some apps.
+ */
+ 
+asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *rlim)
+{
+	struct rlimit x;
+	if (resource >= RLIM_NLIMITS)
+		return -EINVAL;
+
+	task_lock(current->group_leader);
+	x = current->signal->rlim[resource];
+	task_unlock(current->group_leader);
+	if(x.rlim_cur > 0x7FFFFFFF)
+		x.rlim_cur = 0x7FFFFFFF;
+	if(x.rlim_max > 0x7FFFFFFF)
+		x.rlim_max = 0x7FFFFFFF;
+	return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0;
+}
+
+#endif
+
+asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
+{
+	struct rlimit new_rlim, *old_rlim;
+	unsigned long it_prof_secs;
+	int retval;
+
+	if (resource >= RLIM_NLIMITS)
+		return -EINVAL;
+	if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
+		return -EFAULT;
+	if (new_rlim.rlim_cur > new_rlim.rlim_max)
+		return -EINVAL;
+	old_rlim = current->signal->rlim + resource;
+	if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
+	    !capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+	if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN)
+		return -EPERM;
+
+	retval = security_task_setrlimit(resource, &new_rlim);
+	if (retval)
+		return retval;
+
+	task_lock(current->group_leader);
+	*old_rlim = new_rlim;
+	task_unlock(current->group_leader);
+
+	if (resource != RLIMIT_CPU)
+		goto out;
+
+	/*
+	 * RLIMIT_CPU handling.   Note that the kernel fails to return an error
+	 * code if it rejected the user's attempt to set RLIMIT_CPU.  This is a
+	 * very long-standing error, and fixing it now risks breakage of
+	 * applications, so we live with it
+	 */
+	if (new_rlim.rlim_cur == RLIM_INFINITY)
+		goto out;
+
+	it_prof_secs = cputime_to_secs(current->signal->it_prof_expires);
+	if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) {
+		unsigned long rlim_cur = new_rlim.rlim_cur;
+		cputime_t cputime;
+
+		if (rlim_cur == 0) {
+			/*
+			 * The caller is asking for an immediate RLIMIT_CPU
+			 * expiry.  But we use the zero value to mean "it was
+			 * never set".  So let's cheat and make it one second
+			 * instead
+			 */
+			rlim_cur = 1;
+		}
+		cputime = secs_to_cputime(rlim_cur);
+		read_lock(&tasklist_lock);
+		spin_lock_irq(&current->sighand->siglock);
+		set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
+		spin_unlock_irq(&current->sighand->siglock);
+		read_unlock(&tasklist_lock);
+	}
+out:
+	return 0;
+}
+
+/*
+ * It would make sense to put struct rusage in the task_struct,
+ * except that would make the task_struct be *really big*.  After
+ * task_struct gets moved into malloc'ed memory, it would
+ * make sense to do this.  It will make moving the rest of the information
+ * a lot simpler!  (Which we're not doing right now because we're not
+ * measuring them yet).
+ *
+ * When sampling multiple threads for RUSAGE_SELF, under SMP we might have
+ * races with threads incrementing their own counters.  But since word
+ * reads are atomic, we either get new values or old values and we don't
+ * care which for the sums.  We always take the siglock to protect reading
+ * the c* fields from p->signal from races with exit.c updating those
+ * fields when reaping, so a sample either gets all the additions of a
+ * given child after it's reaped, or none so this sample is before reaping.
+ *
+ * tasklist_lock locking optimisation:
+ * If we are current and single threaded, we do not need to take the tasklist
+ * lock or the siglock.  No one else can take our signal_struct away,
+ * no one else can reap the children to update signal->c* counters, and
+ * no one else can race with the signal-> fields.
+ * If we do not take the tasklist_lock, the signal-> fields could be read
+ * out of order while another thread was just exiting. So we place a
+ * read memory barrier when we avoid the lock.  On the writer side,
+ * write memory barrier is implied in  __exit_signal as __exit_signal releases
+ * the siglock spinlock after updating the signal-> fields.
+ *
+ * We don't really need the siglock when we access the non c* fields
+ * of the signal_struct (for RUSAGE_SELF) even in multithreaded
+ * case, since we take the tasklist lock for read and the non c* signal->
+ * fields are updated only in __exit_signal, which is called with
+ * tasklist_lock taken for write, hence these two threads cannot execute
+ * concurrently.
+ *
+ */
+
+static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
+{
+	struct task_struct *t;
+	unsigned long flags;
+	cputime_t utime, stime;
+	int need_lock = 0;
+
+	memset((char *) r, 0, sizeof *r);
+	utime = stime = cputime_zero;
+
+	if (p != current || !thread_group_empty(p))
+		need_lock = 1;
+
+	if (need_lock) {
+		read_lock(&tasklist_lock);
+		if (unlikely(!p->signal)) {
+			read_unlock(&tasklist_lock);
+			return;
+		}
+	} else
+		/* See locking comments above */
+		smp_rmb();
+
+	switch (who) {
+		case RUSAGE_BOTH:
+		case RUSAGE_CHILDREN:
+			spin_lock_irqsave(&p->sighand->siglock, flags);
+			utime = p->signal->cutime;
+			stime = p->signal->cstime;
+			r->ru_nvcsw = p->signal->cnvcsw;
+			r->ru_nivcsw = p->signal->cnivcsw;
+			r->ru_minflt = p->signal->cmin_flt;
+			r->ru_majflt = p->signal->cmaj_flt;
+			spin_unlock_irqrestore(&p->sighand->siglock, flags);
+
+			if (who == RUSAGE_CHILDREN)
+				break;
+
+		case RUSAGE_SELF:
+			utime = cputime_add(utime, p->signal->utime);
+			stime = cputime_add(stime, p->signal->stime);
+			r->ru_nvcsw += p->signal->nvcsw;
+			r->ru_nivcsw += p->signal->nivcsw;
+			r->ru_minflt += p->signal->min_flt;
+			r->ru_majflt += p->signal->maj_flt;
+			t = p;
+			do {
+				utime = cputime_add(utime, t->utime);
+				stime = cputime_add(stime, t->stime);
+				r->ru_nvcsw += t->nvcsw;
+				r->ru_nivcsw += t->nivcsw;
+				r->ru_minflt += t->min_flt;
+				r->ru_majflt += t->maj_flt;
+				t = next_thread(t);
+			} while (t != p);
+			break;
+
+		default:
+			BUG();
+	}
+
+	if (need_lock)
+		read_unlock(&tasklist_lock);
+	cputime_to_timeval(utime, &r->ru_utime);
+	cputime_to_timeval(stime, &r->ru_stime);
+}
+
+int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
+{
+	struct rusage r;
+	k_getrusage(p, who, &r);
+	return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
+}
+
+asmlinkage long sys_getrusage(int who, struct rusage __user *ru)
+{
+	if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN)
+		return -EINVAL;
+	return getrusage(current, who, ru);
+}
+
+asmlinkage long sys_umask(int mask)
+{
+	mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
+	return mask;
+}
+    
+asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
+			  unsigned long arg4, unsigned long arg5)
+{
+	long error;
+
+	error = security_task_prctl(option, arg2, arg3, arg4, arg5);
+	if (error)
+		return error;
+
+	switch (option) {
+		case PR_SET_PDEATHSIG:
+			if (!valid_signal(arg2)) {
+				error = -EINVAL;
+				break;
+			}
+			current->pdeath_signal = arg2;
+			break;
+		case PR_GET_PDEATHSIG:
+			error = put_user(current->pdeath_signal, (int __user *)arg2);
+			break;
+		case PR_GET_DUMPABLE:
+			error = current->mm->dumpable;
+			break;
+		case PR_SET_DUMPABLE:
+			if (arg2 < 0 || arg2 > 2) {
+				error = -EINVAL;
+				break;
+			}
+			current->mm->dumpable = arg2;
+			break;
+
+		case PR_SET_UNALIGN:
+			error = SET_UNALIGN_CTL(current, arg2);
+			break;
+		case PR_GET_UNALIGN:
+			error = GET_UNALIGN_CTL(current, arg2);
+			break;
+		case PR_SET_FPEMU:
+			error = SET_FPEMU_CTL(current, arg2);
+			break;
+		case PR_GET_FPEMU:
+			error = GET_FPEMU_CTL(current, arg2);
+			break;
+		case PR_SET_FPEXC:
+			error = SET_FPEXC_CTL(current, arg2);
+			break;
+		case PR_GET_FPEXC:
+			error = GET_FPEXC_CTL(current, arg2);
+			break;
+		case PR_GET_TIMING:
+			error = PR_TIMING_STATISTICAL;
+			break;
+		case PR_SET_TIMING:
+			if (arg2 == PR_TIMING_STATISTICAL)
+				error = 0;
+			else
+				error = -EINVAL;
+			break;
+
+		case PR_GET_KEEPCAPS:
+			if (current->keep_capabilities)
+				error = 1;
+			break;
+		case PR_SET_KEEPCAPS:
+			if (arg2 != 0 && arg2 != 1) {
+				error = -EINVAL;
+				break;
+			}
+			current->keep_capabilities = arg2;
+			break;
+		case PR_SET_NAME: {
+			struct task_struct *me = current;
+			unsigned char ncomm[sizeof(me->comm)];
+
+			ncomm[sizeof(me->comm)-1] = 0;
+			if (strncpy_from_user(ncomm, (char __user *)arg2,
+						sizeof(me->comm)-1) < 0)
+				return -EFAULT;
+			set_task_comm(me, ncomm);
+			return 0;
+		}
+		case PR_GET_NAME: {
+			struct task_struct *me = current;
+			unsigned char tcomm[sizeof(me->comm)];
+
+			get_task_comm(tcomm, me);
+			if (copy_to_user((char __user *)arg2, tcomm, sizeof(tcomm)))
+				return -EFAULT;
+			return 0;
+		}
+		default:
+			error = -EINVAL;
+			break;
+	}
+	return error;
+}
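
The PR_SET_NAME/PR_GET_NAME cases above can be exercised from userspace
as in the hedged sketch below (not part of this patch). The 16-byte
buffer mirrors sizeof(task_struct.comm); names longer than 15
characters are silently truncated by the strncpy_from_user() above.

	#include <stdio.h>
	#include <sys/prctl.h>

	int main(void)
	{
		char comm[16];

		if (prctl(PR_SET_NAME, "demo-thread", 0, 0, 0) ||
		    prctl(PR_GET_NAME, comm, 0, 0, 0))
			return 1;
		printf("comm is now \"%s\"\n", comm);
		return 0;
	}
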
diff -urN oldtree/kernel/workqueue.c newtree/kernel/workqueue.c
--- oldtree/kernel/workqueue.c	2006-03-08 18:48:02.984065500 +0000
+++ newtree/kernel/workqueue.c	2006-03-08 15:22:33.385513750 +0000
@@ -192,8 +192,6 @@
 	struct k_sigaction sa;
 	sigset_t blocked;
 
-	current->flags |= PF_NOFREEZE;
-
 	set_user_nice(current, -5);
 
 	/* Block and flush all signals */
@@ -214,6 +212,7 @@
 			schedule();
 		else
 			__set_current_state(TASK_RUNNING);
+		try_to_freeze();
 		remove_wait_queue(&cwq->more_work, &wait);
 
 		if (!list_empty(&cwq->worklist))
@@ -283,7 +282,8 @@
 }
 
 static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
-						   int cpu)
+						   int cpu,
+						   unsigned long freezer_flags)
 {
 	struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
 	struct task_struct *p;
@@ -297,10 +297,21 @@
 	init_waitqueue_head(&cwq->more_work);
 	init_waitqueue_head(&cwq->work_done);
 
-	if (is_single_threaded(wq))
-		p = kthread_create(worker_thread, cwq, "%s", wq->name);
-	else
-		p = kthread_create(worker_thread, cwq, "%s/%d", wq->name, cpu);
+	if (is_single_threaded(wq)) {
+		if (freezer_flags)
+			p = kthread_nofreeze_create(worker_thread, cwq,
+					"%s", wq->name);
+		else
+			p = kthread_create(worker_thread, cwq,
+					"%s", wq->name);
+	} else {
+		if (freezer_flags)
+			p = kthread_nofreeze_create(worker_thread, cwq,
+					"%s/%d", wq->name, cpu);
+		else
+			p = kthread_create(worker_thread, cwq,
+					"%s/%d", wq->name, cpu);
+	}
 	if (IS_ERR(p))
 		return NULL;
 	cwq->thread = p;
@@ -308,7 +319,8 @@
 }
 
 struct workqueue_struct *__create_workqueue(const char *name,
-					    int singlethread)
+					    int singlethread,
+					    unsigned long freezer_flags)
 {
 	int cpu, destroy = 0;
 	struct workqueue_struct *wq;
@@ -329,7 +341,7 @@
 	lock_cpu_hotplug();
 	if (singlethread) {
 		INIT_LIST_HEAD(&wq->list);
-		p = create_workqueue_thread(wq, singlethread_cpu);
+		p = create_workqueue_thread(wq, singlethread_cpu, freezer_flags);
 		if (!p)
 			destroy = 1;
 		else
@@ -339,7 +351,7 @@
 		list_add(&wq->list, &workqueues);
 		spin_unlock(&workqueue_lock);
 		for_each_online_cpu(cpu) {
-			p = create_workqueue_thread(wq, cpu);
+			p = create_workqueue_thread(wq, cpu, freezer_flags);
 			if (p) {
 				kthread_bind(p, cpu);
 				wake_up_process(p);
@@ -558,7 +570,7 @@
 	case CPU_UP_PREPARE:
 		/* Create a new workqueue thread for it. */
 		list_for_each_entry(wq, &workqueues, list) {
-			if (!create_workqueue_thread(wq, hotcpu)) {
+			if (!create_workqueue_thread(wq, hotcpu, 0)) {
 				printk("workqueue for %i failed\n", hotcpu);
 				return NOTIFY_BAD;
 			}
@@ -601,7 +613,7 @@
 {
 	singlethread_cpu = first_cpu(cpu_possible_map);
 	hotcpu_notifier(workqueue_cpu_callback, 0);
-	keventd_wq = create_workqueue("events");
+	keventd_wq = create_nofreeze_workqueue("events");
 	BUG_ON(!keventd_wq);
 }
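
The client-visible workqueue API is unchanged by this patch; only the
creation path grows a freezer_flags argument, and keventd is switched
to the non-freezing variant (create_nofreeze_workqueue() is presumably
a wrapper around __create_workqueue() defined in the companion
workqueue.h change, which is not part of this hunk). A hedged sketch
of ordinary driver usage against the 2.6.16-era API, with illustrative
names my_wq/my_handler:

	#include <linux/workqueue.h>

	static struct workqueue_struct *my_wq;
	static struct work_struct my_work;

	/* 2.6.16-era handlers take the (void *) passed to INIT_WORK() */
	static void my_handler(void *data)
	{
		/* runs in process context, in a "mydrv/<cpu>" worker */
	}

	static int my_init(void)
	{
		my_wq = create_workqueue("mydrv"); /* now freezable by default */
		if (!my_wq)
			return -ENOMEM;
		INIT_WORK(&my_work, my_handler, NULL);
		queue_work(my_wq, &my_work);
		return 0;
	}

	static void my_exit(void)
	{
		flush_workqueue(my_wq);	/* wait for queued handlers */
		destroy_workqueue(my_wq);
	}
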
 
diff -urN oldtree/kernel/workqueue.c.orig newtree/kernel/workqueue.c.orig
--- oldtree/kernel/workqueue.c.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/workqueue.c.orig	2006-03-08 15:21:19.256881000 +0000
@@ -0,0 +1,617 @@
+/*
+ * linux/kernel/workqueue.c
+ *
+ * Generic mechanism for defining kernel helper threads for running
+ * arbitrary tasks in process context.
+ *
+ * Started by Ingo Molnar, Copyright (C) 2002
+ *
+ * Derived from the taskqueue/keventd code by:
+ *
+ *   David Woodhouse <dwmw2@infradead.org>
+ *   Andrew Morton <andrewm@uow.edu.au>
+ *   Kai Petzke <wpp@marie.physik.tu-berlin.de>
+ *   Theodore Ts'o <tytso@mit.edu>
+ *
+ * Made to use alloc_percpu by Christoph Lameter <clameter@sgi.com>.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/signal.h>
+#include <linux/completion.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/kthread.h>
+#include <linux/hardirq.h>
+
+/*
+ * The per-CPU workqueue (if single thread, we always use the first
+ * possible cpu).
+ *
+ * The sequence counters are for flush_scheduled_work().  It wants to wait
+ * until all currently-scheduled works are completed, but it doesn't
+ * want to be livelocked by new, incoming ones.  So it waits until
+ * remove_sequence is >= the insert_sequence which pertained when
+ * flush_scheduled_work() was called.
+ */
+struct cpu_workqueue_struct {
+
+	spinlock_t lock;
+
+	long remove_sequence;	/* Least-recently added (next to run) */
+	long insert_sequence;	/* Next to add */
+
+	struct list_head worklist;
+	wait_queue_head_t more_work;
+	wait_queue_head_t work_done;
+
+	struct workqueue_struct *wq;
+	task_t *thread;
+
+	int run_depth;		/* Detect run_workqueue() recursion depth */
+} ____cacheline_aligned;
+
+/*
+ * The externally visible workqueue abstraction is an array of
+ * per-CPU workqueues:
+ */
+struct workqueue_struct {
+	struct cpu_workqueue_struct *cpu_wq;
+	const char *name;
+	struct list_head list; 	/* Empty if single thread */
+};
+
+/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
+   threads to each one as cpus come/go. */
+static DEFINE_SPINLOCK(workqueue_lock);
+static LIST_HEAD(workqueues);
+
+static int singlethread_cpu;
+
+/* If it's single threaded, it isn't in the list of workqueues. */
+static inline int is_single_threaded(struct workqueue_struct *wq)
+{
+	return list_empty(&wq->list);
+}
+
+/* Preempt must be disabled. */
+static void __queue_work(struct cpu_workqueue_struct *cwq,
+			 struct work_struct *work)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cwq->lock, flags);
+	work->wq_data = cwq;
+	list_add_tail(&work->entry, &cwq->worklist);
+	cwq->insert_sequence++;
+	wake_up(&cwq->more_work);
+	spin_unlock_irqrestore(&cwq->lock, flags);
+}
+
+/*
+ * Queue work on a workqueue. Return non-zero if it was successfully
+ * added.
+ *
+ * We queue the work to the CPU it was submitted, but there is no
+ * guarantee that it will be processed by that CPU.
+ */
+int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
+{
+	int ret = 0, cpu = get_cpu();
+
+	if (!test_and_set_bit(0, &work->pending)) {
+		if (unlikely(is_single_threaded(wq)))
+			cpu = singlethread_cpu;
+		BUG_ON(!list_empty(&work->entry));
+		__queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
+		ret = 1;
+	}
+	put_cpu();
+	return ret;
+}
+
+static void delayed_work_timer_fn(unsigned long __data)
+{
+	struct work_struct *work = (struct work_struct *)__data;
+	struct workqueue_struct *wq = work->wq_data;
+	int cpu = smp_processor_id();
+
+	if (unlikely(is_single_threaded(wq)))
+		cpu = singlethread_cpu;
+
+	__queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
+}
+
+int fastcall queue_delayed_work(struct workqueue_struct *wq,
+			struct work_struct *work, unsigned long delay)
+{
+	int ret = 0;
+	struct timer_list *timer = &work->timer;
+
+	if (!test_and_set_bit(0, &work->pending)) {
+		BUG_ON(timer_pending(timer));
+		BUG_ON(!list_empty(&work->entry));
+
+		/* This stores wq for the moment, for the timer_fn */
+		work->wq_data = wq;
+		timer->expires = jiffies + delay;
+		timer->data = (unsigned long)work;
+		timer->function = delayed_work_timer_fn;
+		add_timer(timer);
+		ret = 1;
+	}
+	return ret;
+}
+
+static void run_workqueue(struct cpu_workqueue_struct *cwq)
+{
+	unsigned long flags;
+
+	/*
+	 * Keep taking off work from the queue until
+	 * done.
+	 */
+	spin_lock_irqsave(&cwq->lock, flags);
+	cwq->run_depth++;
+	if (cwq->run_depth > 3) {
+		/* morton gets to eat his hat */
+		printk("%s: recursion depth exceeded: %d\n",
+			__FUNCTION__, cwq->run_depth);
+		dump_stack();
+	}
+	while (!list_empty(&cwq->worklist)) {
+		struct work_struct *work = list_entry(cwq->worklist.next,
+						struct work_struct, entry);
+		void (*f) (void *) = work->func;
+		void *data = work->data;
+
+		list_del_init(cwq->worklist.next);
+		spin_unlock_irqrestore(&cwq->lock, flags);
+
+		BUG_ON(work->wq_data != cwq);
+		clear_bit(0, &work->pending);
+		f(data);
+
+		spin_lock_irqsave(&cwq->lock, flags);
+		cwq->remove_sequence++;
+		wake_up(&cwq->work_done);
+	}
+	cwq->run_depth--;
+	spin_unlock_irqrestore(&cwq->lock, flags);
+}
+
+static int worker_thread(void *__cwq)
+{
+	struct cpu_workqueue_struct *cwq = __cwq;
+	DECLARE_WAITQUEUE(wait, current);
+	struct k_sigaction sa;
+	sigset_t blocked;
+
+	current->flags |= PF_NOFREEZE;
+
+	set_user_nice(current, -5);
+
+	/* Block and flush all signals */
+	sigfillset(&blocked);
+	sigprocmask(SIG_BLOCK, &blocked, NULL);
+	flush_signals(current);
+
+	/* SIG_IGN makes children autoreap: see do_notify_parent(). */
+	sa.sa.sa_handler = SIG_IGN;
+	sa.sa.sa_flags = 0;
+	siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
+	do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0);
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop()) {
+		add_wait_queue(&cwq->more_work, &wait);
+		if (list_empty(&cwq->worklist))
+			schedule();
+		else
+			__set_current_state(TASK_RUNNING);
+		remove_wait_queue(&cwq->more_work, &wait);
+
+		if (!list_empty(&cwq->worklist))
+			run_workqueue(cwq);
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
+{
+	if (cwq->thread == current) {
+		/*
+		 * Probably keventd trying to flush its own queue. So simply run
+		 * it by hand rather than deadlocking.
+		 */
+		run_workqueue(cwq);
+	} else {
+		DEFINE_WAIT(wait);
+		long sequence_needed;
+
+		spin_lock_irq(&cwq->lock);
+		sequence_needed = cwq->insert_sequence;
+
+		while (sequence_needed - cwq->remove_sequence > 0) {
+			prepare_to_wait(&cwq->work_done, &wait,
+					TASK_UNINTERRUPTIBLE);
+			spin_unlock_irq(&cwq->lock);
+			schedule();
+			spin_lock_irq(&cwq->lock);
+		}
+		finish_wait(&cwq->work_done, &wait);
+		spin_unlock_irq(&cwq->lock);
+	}
+}
+
+/*
+ * flush_workqueue - ensure that any scheduled work has run to completion.
+ *
+ * Forces execution of the workqueue and blocks until its completion.
+ * This is typically used in driver shutdown handlers.
+ *
+ * This function will sample each workqueue's current insert_sequence number and
+ * will sleep until the head sequence is greater than or equal to that.  This
+ * means that we sleep until all works which were queued on entry have been
+ * handled, but we are not livelocked by new incoming ones.
+ *
+ * This function used to run the workqueues itself.  Now we just wait for the
+ * helper threads to do it.
+ */
+void fastcall flush_workqueue(struct workqueue_struct *wq)
+{
+	might_sleep();
+
+	if (is_single_threaded(wq)) {
+		/* Always use first cpu's area. */
+		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu));
+	} else {
+		int cpu;
+
+		lock_cpu_hotplug();
+		for_each_online_cpu(cpu)
+			flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
+		unlock_cpu_hotplug();
+	}
+}
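
A worked example of the sequence counters used above: if
flush_cpu_workqueue() samples insert_sequence at 42 while
remove_sequence is 38, the flusher sleeps on work_done until
remove_sequence reaches 42. Work queued after the sample (pushing
insert_sequence to 43, 44, ...) is not waited for, which is what keeps
a steady stream of new work from livelocking the flush.
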
+
+static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
+						   int cpu)
+{
+	struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
+	struct task_struct *p;
+
+	spin_lock_init(&cwq->lock);
+	cwq->wq = wq;
+	cwq->thread = NULL;
+	cwq->insert_sequence = 0;
+	cwq->remove_sequence = 0;
+	INIT_LIST_HEAD(&cwq->worklist);
+	init_waitqueue_head(&cwq->more_work);
+	init_waitqueue_head(&cwq->work_done);
+
+	if (is_single_threaded(wq))
+		p = kthread_create(worker_thread, cwq, "%s", wq->name);
+	else
+		p = kthread_create(worker_thread, cwq, "%s/%d", wq->name, cpu);
+	if (IS_ERR(p))
+		return NULL;
+	cwq->thread = p;
+	return p;
+}
+
+struct workqueue_struct *__create_workqueue(const char *name,
+					    int singlethread)
+{
+	int cpu, destroy = 0;
+	struct workqueue_struct *wq;
+	struct task_struct *p;
+
+	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
+	if (!wq)
+		return NULL;
+
+	wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
+	if (!wq->cpu_wq) {
+		kfree(wq);
+		return NULL;
+	}
+
+	wq->name = name;
+	/* We don't need the distraction of CPUs appearing and vanishing. */
+	lock_cpu_hotplug();
+	if (singlethread) {
+		INIT_LIST_HEAD(&wq->list);
+		p = create_workqueue_thread(wq, singlethread_cpu);
+		if (!p)
+			destroy = 1;
+		else
+			wake_up_process(p);
+	} else {
+		spin_lock(&workqueue_lock);
+		list_add(&wq->list, &workqueues);
+		spin_unlock(&workqueue_lock);
+		for_each_online_cpu(cpu) {
+			p = create_workqueue_thread(wq, cpu);
+			if (p) {
+				kthread_bind(p, cpu);
+				wake_up_process(p);
+			} else
+				destroy = 1;
+		}
+	}
+	unlock_cpu_hotplug();
+
+	/*
+	 * Was there any error during startup? If yes then clean up:
+	 */
+	if (destroy) {
+		destroy_workqueue(wq);
+		wq = NULL;
+	}
+	return wq;
+}
+
+static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
+{
+	struct cpu_workqueue_struct *cwq;
+	unsigned long flags;
+	struct task_struct *p;
+
+	cwq = per_cpu_ptr(wq->cpu_wq, cpu);
+	spin_lock_irqsave(&cwq->lock, flags);
+	p = cwq->thread;
+	cwq->thread = NULL;
+	spin_unlock_irqrestore(&cwq->lock, flags);
+	if (p)
+		kthread_stop(p);
+}
+
+void destroy_workqueue(struct workqueue_struct *wq)
+{
+	int cpu;
+
+	flush_workqueue(wq);
+
+	/* We don't need the distraction of CPUs appearing and vanishing. */
+	lock_cpu_hotplug();
+	if (is_single_threaded(wq))
+		cleanup_workqueue_thread(wq, singlethread_cpu);
+	else {
+		for_each_online_cpu(cpu)
+			cleanup_workqueue_thread(wq, cpu);
+		spin_lock(&workqueue_lock);
+		list_del(&wq->list);
+		spin_unlock(&workqueue_lock);
+	}
+	unlock_cpu_hotplug();
+	free_percpu(wq->cpu_wq);
+	kfree(wq);
+}
+
+static struct workqueue_struct *keventd_wq;
+
+int fastcall schedule_work(struct work_struct *work)
+{
+	return queue_work(keventd_wq, work);
+}
+
+int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay)
+{
+	return queue_delayed_work(keventd_wq, work, delay);
+}
+
+int schedule_delayed_work_on(int cpu,
+			struct work_struct *work, unsigned long delay)
+{
+	int ret = 0;
+	struct timer_list *timer = &work->timer;
+
+	if (!test_and_set_bit(0, &work->pending)) {
+		BUG_ON(timer_pending(timer));
+		BUG_ON(!list_empty(&work->entry));
+		/* This stores keventd_wq for the moment, for the timer_fn */
+		work->wq_data = keventd_wq;
+		timer->expires = jiffies + delay;
+		timer->data = (unsigned long)work;
+		timer->function = delayed_work_timer_fn;
+		add_timer_on(timer, cpu);
+		ret = 1;
+	}
+	return ret;
+}
+
+int schedule_on_each_cpu(void (*func) (void *info), void *info)
+{
+	int cpu;
+	struct work_struct *work;
+
+	work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL);
+
+	if (!work)
+		return -ENOMEM;
+	for_each_online_cpu(cpu) {
+		INIT_WORK(work + cpu, func, info);
+		__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
+				work + cpu);
+	}
+	flush_workqueue(keventd_wq);
+	kfree(work);
+	return 0;
+}
+
+void flush_scheduled_work(void)
+{
+	flush_workqueue(keventd_wq);
+}
+
+/**
+ * cancel_rearming_delayed_workqueue - reliably kill off a delayed
+ *			work whose handler rearms the delayed work.
+ * @wq:   the controlling workqueue structure
+ * @work: the delayed work struct
+ */
+void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
+				       struct work_struct *work)
+{
+	while (!cancel_delayed_work(work))
+		flush_workqueue(wq);
+}
+EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
+
+/**
+ * cancel_rearming_delayed_work - reliably kill off a delayed keventd
+ *			work whose handler rearms the delayed work.
+ * @work: the delayed work struct
+ */
+void cancel_rearming_delayed_work(struct work_struct *work)
+{
+	cancel_rearming_delayed_workqueue(keventd_wq, work);
+}
+EXPORT_SYMBOL(cancel_rearming_delayed_work);
+
+/**
+ * execute_in_process_context - reliably execute the routine with user context
+ * @fn:		the function to execute
+ * @data:	data to pass to the function
+ * @ew:		guaranteed storage for the execute work structure (must
+ *		be available when the work executes)
+ *
+ * Executes the function immediately if process context is available,
+ * otherwise schedules the function for delayed execution.
+ *
+ * Returns:	0 - function was executed
+ *		1 - function was scheduled for execution
+ */
+int execute_in_process_context(void (*fn)(void *data), void *data,
+			       struct execute_work *ew)
+{
+	if (!in_interrupt()) {
+		fn(data);
+		return 0;
+	}
+
+	INIT_WORK(&ew->work, fn, data);
+	schedule_work(&ew->work);
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(execute_in_process_context);
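
Typical usage of execute_in_process_context(), as a hedged sketch with
illustrative names not taken from this patch. The execute_work storage
must outlive the call, because the deferred path hands it to keventd.

	static struct execute_work my_ew;	/* must not be stack memory */

	static void my_teardown(void *data)
	{
		/* process-context cleanup */
	}

	/* Safe from interrupt context: runs my_teardown() immediately
	 * when sleeping is allowed, otherwise defers it to keventd. */
	execute_in_process_context(my_teardown, NULL, &my_ew);
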
+
+int keventd_up(void)
+{
+	return keventd_wq != NULL;
+}
+
+int current_is_keventd(void)
+{
+	struct cpu_workqueue_struct *cwq;
+	int cpu = smp_processor_id();	/* preempt-safe: keventd is per-cpu */
+	int ret = 0;
+
+	BUG_ON(!keventd_wq);
+
+	cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu);
+	if (current == cwq->thread)
+		ret = 1;
+
+	return ret;
+
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/* Take the work from this (downed) CPU. */
+static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
+{
+	struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
+	LIST_HEAD(list);
+	struct work_struct *work;
+
+	spin_lock_irq(&cwq->lock);
+	list_splice_init(&cwq->worklist, &list);
+
+	while (!list_empty(&list)) {
+		printk("Taking work for %s\n", wq->name);
+		work = list_entry(list.next,struct work_struct,entry);
+		list_del(&work->entry);
+		__queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work);
+	}
+	spin_unlock_irq(&cwq->lock);
+}
+
+/* We're holding the cpucontrol mutex here */
+static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
+				  unsigned long action,
+				  void *hcpu)
+{
+	unsigned int hotcpu = (unsigned long)hcpu;
+	struct workqueue_struct *wq;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		/* Create a new workqueue thread for it. */
+		list_for_each_entry(wq, &workqueues, list) {
+			if (!create_workqueue_thread(wq, hotcpu)) {
+				printk("workqueue for %i failed\n", hotcpu);
+				return NOTIFY_BAD;
+			}
+		}
+		break;
+
+	case CPU_ONLINE:
+		/* Kick off worker threads. */
+		list_for_each_entry(wq, &workqueues, list) {
+			struct cpu_workqueue_struct *cwq;
+
+			cwq = per_cpu_ptr(wq->cpu_wq, hotcpu);
+			kthread_bind(cwq->thread, hotcpu);
+			wake_up_process(cwq->thread);
+		}
+		break;
+
+	case CPU_UP_CANCELED:
+		list_for_each_entry(wq, &workqueues, list) {
+			/* Unbind so it can run. */
+			kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread,
+				     any_online_cpu(cpu_online_map));
+			cleanup_workqueue_thread(wq, hotcpu);
+		}
+		break;
+
+	case CPU_DEAD:
+		list_for_each_entry(wq, &workqueues, list)
+			cleanup_workqueue_thread(wq, hotcpu);
+		list_for_each_entry(wq, &workqueues, list)
+			take_over_work(wq, hotcpu);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+#endif
+
+void init_workqueues(void)
+{
+	singlethread_cpu = first_cpu(cpu_possible_map);
+	hotcpu_notifier(workqueue_cpu_callback, 0);
+	keventd_wq = create_workqueue("events");
+	BUG_ON(!keventd_wq);
+}
+
+EXPORT_SYMBOL_GPL(__create_workqueue);
+EXPORT_SYMBOL_GPL(queue_work);
+EXPORT_SYMBOL_GPL(queue_delayed_work);
+EXPORT_SYMBOL_GPL(flush_workqueue);
+EXPORT_SYMBOL_GPL(destroy_workqueue);
+
+EXPORT_SYMBOL(schedule_work);
+EXPORT_SYMBOL(schedule_delayed_work);
+EXPORT_SYMBOL(schedule_delayed_work_on);
+EXPORT_SYMBOL(flush_scheduled_work);
diff -urN oldtree/lib/Kconfig newtree/lib/Kconfig
--- oldtree/lib/Kconfig	2006-01-03 03:21:10.000000000 +0000
+++ newtree/lib/Kconfig	2006-03-08 15:22:33.385513750 +0000
@@ -38,6 +38,9 @@
 	  require M here.  See Castagnoli93.
 	  Module will be libcrc32c.
 
+config DYN_PAGEFLAGS
+	bool
+
 #
 # compression support is select'ed if needed
 #
diff -urN oldtree/lib/Makefile newtree/lib/Makefile
--- oldtree/lib/Makefile	2006-03-08 18:48:02.988065750 +0000
+++ newtree/lib/Makefile	2006-03-08 15:22:33.389514000 +0000
@@ -29,6 +29,8 @@
   lib-y += dec_and_lock.o
 endif
 
+obj-$(CONFIG_DYN_PAGEFLAGS) += dyn_pageflags.o
+
 obj-$(CONFIG_CRC_CCITT)	+= crc-ccitt.o
 obj-$(CONFIG_CRC16)	+= crc16.o
 obj-$(CONFIG_CRC32)	+= crc32.o
diff -urN oldtree/lib/Makefile.orig newtree/lib/Makefile.orig
--- oldtree/lib/Makefile.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/lib/Makefile.orig	2006-03-08 15:21:19.260881250 +0000
@@ -0,0 +1,59 @@
+#
+# Makefile for some libs needed in the kernel.
+#
+
+lib-y := errno.o ctype.o string.o vsprintf.o cmdline.o \
+	 bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \
+	 idr.o div64.o int_sqrt.o bitmap.o extable.o prio_tree.o \
+	 sha1.o
+
+lib-y	+= kobject.o kref.o kobject_uevent.o klist.o
+
+obj-y += sort.o parser.o halfmd4.o iomap_copy.o
+
+ifeq ($(CONFIG_DEBUG_KOBJECT),y)
+CFLAGS_kobject.o += -DDEBUG
+CFLAGS_kobject_uevent.o += -DDEBUG
+endif
+
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
+lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
+lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
+lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o
+lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
+lib-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
+obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
+obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
+
+ifneq ($(CONFIG_HAVE_DEC_LOCK),y)
+  lib-y += dec_and_lock.o
+endif
+
+obj-$(CONFIG_CRC_CCITT)	+= crc-ccitt.o
+obj-$(CONFIG_CRC16)	+= crc16.o
+obj-$(CONFIG_CRC32)	+= crc32.o
+obj-$(CONFIG_LIBCRC32C)	+= libcrc32c.o
+obj-$(CONFIG_GENERIC_IOMAP) += iomap.o
+obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
+
+obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/
+obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/
+obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
+
+obj-$(CONFIG_TEXTSEARCH) += textsearch.o
+obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o
+obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o
+obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
+
+obj-$(CONFIG_SWIOTLB) += swiotlb.o
+
+hostprogs-y	:= gen_crc32table
+clean-files	:= crc32table.h
+
+$(obj)/crc32.o: $(obj)/crc32table.h
+
+quiet_cmd_crc32 = GEN     $@
+      cmd_crc32 = $< > $@
+
+$(obj)/crc32table.h: $(obj)/gen_crc32table
+	$(call cmd,crc32)
diff -urN oldtree/lib/dyn_pageflags.c newtree/lib/dyn_pageflags.c
--- oldtree/lib/dyn_pageflags.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/lib/dyn_pageflags.c	2006-03-08 17:07:59.696883500 +0000
@@ -0,0 +1,332 @@
+/*
+ * lib/dyn_pageflags.c
+ *
+ * Copyright (C) 2004-2006 Nigel Cunningham <nigel@suspend2.net>
+ * 
+ * This file is released under the GPLv2.
+ *
+ * Routines for dynamically allocating and releasing bitmaps
+ * used as pseudo-pageflags.
+ *
+ * The bitmaps are not contiguous: each zone gets an array of
+ * pointers to individual order-0 pages. This allows us to
+ * work under low memory conditions where order 0 might be all
+ * that's available. In their original use (suspend2), it also
+ * lets us save the pages at suspend time, and reload and relocate
+ * them as necessary at resume time without much effort.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/dyn_pageflags.h>
+#include <linux/bootmem.h>
+
+#define page_to_zone_offset(pg) (page_to_pfn(pg) - page_zone(pg)->zone_start_pfn)
+
+/* 
+ * num_zones
+ * 
+ * How many zones are there?
+ *
+ */
+
+static int num_zones(void)
+{
+	int result = 0;
+	struct zone *zone;
+
+	for_each_zone(zone)
+		result++;
+
+	return result;
+}
+
+/* 
+ * pages_for_zone(struct zone *zone)
+ * 
+ * How many pages do we need for a bitmap for this zone?
+ *
+ */
+
+static int pages_for_zone(struct zone *zone)
+{
+	return (zone->spanned_pages + (PAGE_SIZE << 3) - 1) >>
+			(PAGE_SHIFT + 3);
+}
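
For example, with 4 KB pages (PAGE_SHIFT = 12) each bitmap page holds
PAGE_SIZE << 3 = 32768 bits, so a zone spanning 1,000,000 pages needs
(1000000 + 32767) >> 15 = 31 bitmap pages.
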
+
+/*
+ * page_zone_number(struct page *page)
+ *
+ * Which zone index does the page match?
+ *
+ */
+
+static int page_zone_number(struct page *page)
+{
+	struct zone *zone, *zone_sought = page_zone(page);
+	int zone_num = 0;
+
+	for_each_zone(zone)
+		if (zone == zone_sought)
+			return zone_num;
+		else
+			zone_num++;
+
+	printk(KERN_ERR "Was looking for a zone for page %p.\n", page);
+	BUG();
+
+	return 0;
+}
+
+/*
+ * dyn_pageflags_pages_per_bitmap
+ *
+ * Number of pages needed for a bitmap covering all zones.
+ *
+ */
+int dyn_pageflags_pages_per_bitmap(void)
+{
+	int total = 0;
+	struct zone *zone;
+
+	for_each_zone(zone)
+		total += pages_for_zone(zone);
+
+	return total;
+}
+
+/* 
+ * clear_dyn_pageflags(dyn_pageflags_t pagemap)
+ *
+ * Clear an array used to store local page flags.
+ *
+ */
+
+void clear_dyn_pageflags(dyn_pageflags_t pagemap)
+{
+	int i = 0, zone_num = 0;
+	struct zone *zone;
+	
+	BUG_ON(!pagemap);
+
+	for_each_zone(zone) {
+		for (i = 0; i < pages_for_zone(zone); i++)
+			memset((pagemap[zone_num][i]), 0, PAGE_SIZE);
+		zone_num++;
+	}
+}
+
+/* 
+ * allocate_dyn_pageflags(dyn_pageflags_t *pagemap)
+ *
+ * Allocate a bitmap for dynamic page flags.
+ *
+ */
+int allocate_dyn_pageflags(dyn_pageflags_t *pagemap)
+{
+	int i, zone_num = 0;
+	struct zone *zone;
+
+	BUG_ON(*pagemap);
+
+	*pagemap = kzalloc(sizeof(void *) * num_zones(), GFP_ATOMIC);
+
+	if (!*pagemap)
+		return -ENOMEM;
+
+	for_each_zone(zone) {
+		int zone_pages = pages_for_zone(zone);
+		(*pagemap)[zone_num] = kzalloc(sizeof(void *) * zone_pages,
+					       GFP_ATOMIC);
+
+		if (!(*pagemap)[zone_num]) {
+			free_dyn_pageflags(pagemap);
+			return -ENOMEM;
+		}
+
+		for (i = 0; i < zone_pages; i++) {
+			unsigned long address = get_zeroed_page(GFP_ATOMIC);
+			(*pagemap)[zone_num][i] = (unsigned long *) address;
+			if (!(*pagemap)[zone_num][i]) {
+				printk(KERN_ERR "Unable to allocate memory "
+					"for dynamic pageflags.\n");
+				free_dyn_pageflags(pagemap);
+				return -ENOMEM;
+			}
+		}
+		zone_num++;
+	}
+
+	return 0;
+}
+
+/* 
+ * free_dyn_pageflags(dyn_pageflags_t *pagemap)
+ *
+ * Free a dynamically allocated pageflags bitmap. For Suspend2 usage, we
+ * support data being relocated from slab to pages that don't conflict
+ * with the image that will be copied back. This is the reason for the
+ * PageSlab tests below.
+ *
+ */
+void free_dyn_pageflags(dyn_pageflags_t *pagemap)
+{
+	int i = 0, zone_num = 0;
+	struct zone *zone;
+
+	if (!*pagemap)
+		return;
+	
+	for_each_zone(zone) {
+		int zone_pages = pages_for_zone(zone);
+
+		if (!(*pagemap)[zone_num]) {
+			zone_num++;
+			continue;
+		}
+		for (i = 0; i < zone_pages; i++)
+			if ((*pagemap)[zone_num][i])
+				free_page((unsigned long) (*pagemap)[zone_num][i]);
+	
+		if (PageSlab(virt_to_page((*pagemap)[zone_num])))
+			kfree((*pagemap)[zone_num]);
+		else
+			free_page((unsigned long) (*pagemap)[zone_num]);
+
+		zone_num++;
+	}
+
+	if (PageSlab(virt_to_page((*pagemap))))
+		kfree(*pagemap);
+	else
+		free_page((unsigned long) (*pagemap));
+
+	*pagemap = NULL;
+	return;
+}
+
+/*
+ * dyn_pageflags_ul_ptr(dyn_pageflags_t *bitmap, struct page *pg)
+ *
+ * Get a pointer to the unsigned long containing the flag in the bitmap
+ * for the given page.
+ *
+ */
+
+unsigned long *dyn_pageflags_ul_ptr(dyn_pageflags_t *bitmap, struct page *pg)
+{
+	int zone_pfn = page_to_zone_offset(pg);
+	int zone_num = page_zone_number(pg);
+	int pagenum = PAGENUMBER(zone_pfn);
+	int page_offset = PAGEINDEX(zone_pfn);
+	return ((*bitmap)[zone_num][pagenum]) + page_offset;
+}
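
PAGENUMBER(), PAGEINDEX() and PAGEBIT() come from the
linux/dyn_pageflags.h header added elsewhere in this patch and are not
shown in this hunk. To be consistent with the pointer arithmetic above
they must decompose a zone offset roughly as follows (a sketch of the
intent, not the header's actual text; BITS_PER_BITMAP_PAGE is an
illustrative name):

	#define BITS_PER_BITMAP_PAGE	(PAGE_SIZE << 3)

	/* which bitmap page of the zone the offset falls in */
	#define PAGENUMBER(zone_off)	((zone_off) / BITS_PER_BITMAP_PAGE)
	/* which unsigned long within that page */
	#define PAGEINDEX(zone_off)	(((zone_off) % BITS_PER_BITMAP_PAGE) \
					 / BITS_PER_LONG)
	/* which bit within that unsigned long */
	#define PAGEBIT(zone_off)	((zone_off) % BITS_PER_LONG)
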
+
+/*
+ * test_dynpageflag(dyn_pageflags_t *bitmap, struct page *page)
+ *
+ * Is the page flagged in the given bitmap?
+ *
+ */
+
+int test_dynpageflag(dyn_pageflags_t *bitmap, struct page *page)
+{
+	unsigned long *ul = dyn_pageflags_ul_ptr(bitmap, page);
+	int zone_offset = page_to_zone_offset(page);
+	int bit = PAGEBIT(zone_offset);
+	
+	return test_bit(bit, ul);
+}
+
+/*
+ * set_dynpageflag(dyn_pageflags_t *bitmap, struct page *page)
+ *
+ * Set the flag for the page in the given bitmap.
+ *
+ */
+
+void set_dynpageflag(dyn_pageflags_t *bitmap, struct page *page)
+{
+	unsigned long *ul = dyn_pageflags_ul_ptr(bitmap, page);
+	int zone_offset = page_to_zone_offset(page);
+	int bit = PAGEBIT(zone_offset);
+	set_bit(bit, ul);
+}
+
+/*
+ * clear_dynpageflags(dyn_pageflags_t *bitmap, struct page *page)
+ *
+ * Clear the flag for the page in the given bitmap.
+ *
+ */
+
+void clear_dynpageflag(dyn_pageflags_t *bitmap, struct page *page)
+{
+	unsigned long *ul = dyn_pageflags_ul_ptr(bitmap, page);
+	int zone_offset = page_to_zone_offset(page);
+	int bit = PAGEBIT(zone_offset);
+	clear_bit(bit, ul);
+}
+
+/*
+ * get_next_bit_on(dyn_pageflags_t bitmap, int counter)
+ *
+ * Given a pfn (possibly -1), find the next pfn in the bitmap that
+ * is set. If there are no more flags set, return max_pfn.
+ *
+ */
+
+int get_next_bit_on(dyn_pageflags_t bitmap, int counter)
+{
+	struct page *page;
+	struct zone *zone;
+	unsigned long *ul;
+	int zone_offset, pagebit, zone_num, first;
+
+	BUG_ON(counter == max_pfn);
+
+	first = (counter == -1);
+	
+	if (first)
+		counter = 0;
+
+	page = pfn_to_page(counter);
+	zone = page_zone(page);
+	zone_num = page_zone_number(page);
+
+	if (!first)
+		counter++;
+	
+	zone_offset = counter - zone->zone_start_pfn;
+
+	do {
+		if (zone_offset >= zone->spanned_pages) {
+			do {
+				zone = next_zone(zone);
+				if (!zone)
+					return max_pfn;
+				zone_num++;
+			} while (!zone->spanned_pages);
+
+			counter = zone->zone_start_pfn;
+			zone_offset = 0;
+			page = pfn_to_page(counter);
+		}
+		
+		/*
+		 * This could be optimised, but there are more
+		 * important things and the code is simple at
+		 * the moment 
+		 */
+		ul = (bitmap[zone_num][PAGENUMBER(zone_offset)]) + PAGEINDEX(zone_offset);
+		
+		pagebit = PAGEBIT(zone_offset);
+
+		counter++;
+		zone_offset++;
+		page = pfn_to_page(counter);
+	
+	} while ((counter <= max_pfn) && !test_bit(pagebit, ul));
+	return counter - 1;
+}
+
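
A hedged usage sketch for the interface above (in-kernel code;
"in_suspend_image" and the helpers are illustrative names, not part of
this patch):

	static dyn_pageflags_t in_suspend_image;

	static int mark_page(struct page *page)
	{
		if (!in_suspend_image &&
		    allocate_dyn_pageflags(&in_suspend_image))
			return -ENOMEM;

		set_dynpageflag(&in_suspend_image, page);
		return 0;
	}

	static void forget_marked_pages(void)
	{
		int pfn = get_next_bit_on(in_suspend_image, -1);

		while (pfn < max_pfn) {
			/* ... use pfn_to_page(pfn) ... */
			pfn = get_next_bit_on(in_suspend_image, pfn);
		}

		free_dyn_pageflags(&in_suspend_image);
	}
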
diff -urN oldtree/lib/vsprintf.c newtree/lib/vsprintf.c
--- oldtree/lib/vsprintf.c	2006-01-03 03:21:10.000000000 +0000
+++ newtree/lib/vsprintf.c	2006-03-08 15:22:33.393514250 +0000
@@ -236,6 +236,34 @@
 	return buf;
 }
 
+/*
+ * snprintf_used
+ *
+ * Functionality    : Print a string with parameters to a buffer of a
+ *                    limited size. Unlike vsnprintf, we return the number
+ *                    of bytes actually put in the buffer, not the number
+ *                    that would have been put in had it been big enough.
+ */
+int snprintf_used(char *buffer, int buffer_size, const char *fmt, ...)
+{
+	int result;
+	va_list args;
+
+	if (!buffer_size) {
+		return 0;
+	}
+
+	va_start(args, fmt);
+	result = vsnprintf(buffer, buffer_size, fmt, args);
+	va_end(args);
+
+	if (result >= buffer_size) {
+		return buffer_size - 1;
+	}
+
+	return result;
+}
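
Truncation behaviour of snprintf_used(), as a short sketch: with an
8-byte buffer,

	char buf[8];
	int used = snprintf_used(buf, sizeof(buf), "%s", "suspend2");

buf ends up holding "suspend" plus the terminating NUL, and used is 7,
the byte count actually stored (excluding the NUL), where snprintf
itself would have returned 8.
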
+
 /**
  * vsnprintf - Format a string and place it in a buffer
  * @buf: The buffer to place the result into
diff -urN oldtree/mm/memory.c newtree/mm/memory.c
--- oldtree/mm/memory.c	2006-03-08 18:48:03.000066500 +0000
+++ newtree/mm/memory.c	2006-03-08 15:22:33.397514500 +0000
@@ -953,6 +953,15 @@
 	return page;
 }
 
+/*
+ * Get the struct page for an address, so that Suspend2 can mark it as being in pageset1.
+ */
+
+struct page *suspend2_follow_page(struct mm_struct *mm, unsigned long address)
+{
+	return follow_page(mm->mmap, address, 0);
+}
+
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long start, int len, int write, int force,
 		struct page **pages, struct vm_area_struct **vmas)
diff -urN oldtree/mm/memory.c.orig newtree/mm/memory.c.orig
--- oldtree/mm/memory.c.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/mm/memory.c.orig	2006-03-08 15:21:19.272882000 +0000
@@ -0,0 +1,2437 @@
+/*
+ *  linux/mm/memory.c
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ */
+
+/*
+ * demand-loading started 01.12.91 - seems it is high on the list of
+ * things wanted, and it should be easy to implement. - Linus
+ */
+
+/*
+ * Ok, demand-loading was easy, shared pages a little bit tricker. Shared
+ * pages started 02.12.91, seems to work. - Linus.
+ *
+ * Tested sharing by executing about 30 /bin/sh: under the old kernel it
+ * would have taken more than the 6M I have free, but it worked well as
+ * far as I could see.
+ *
+ * Also corrected some "invalidate()"s - I wasn't doing enough of them.
+ */
+
+/*
+ * Real VM (paging to/from disk) started 18.12.91. Much more work and
+ * thought has to go into this. Oh, well..
+ * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
+ *		Found it. Everything seems to work now.
+ * 20.12.91  -  Ok, making the swap-device changeable like the root.
+ */
+
+/*
+ * 05.04.94  -  Multi-page memory management added for v1.1.
+ * 		Idea by Alex Bligh (alex@cconcepts.co.uk)
+ *
+ * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
+ *		(Gerhard.Wichert@pdb.siemens.de)
+ *
+ * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
+ */
+
+#include <linux/kernel_stat.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <asm/pgalloc.h>
+#include <asm/uaccess.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/pgtable.h>
+
+#include <linux/swapops.h>
+#include <linux/elf.h>
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+/* use the per-pgdat data instead for discontigmem - mbligh */
+unsigned long max_mapnr;
+struct page *mem_map;
+
+EXPORT_SYMBOL(max_mapnr);
+EXPORT_SYMBOL(mem_map);
+#endif
+
+unsigned long num_physpages;
+/*
+ * A number of key systems in x86 including ioremap() rely on the assumption
+ * that high_memory defines the upper bound on direct map memory, then end
+ * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
+ * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
+ * and ZONE_HIGHMEM.
+ */
+void * high_memory;
+unsigned long vmalloc_earlyreserve;
+
+EXPORT_SYMBOL(num_physpages);
+EXPORT_SYMBOL(high_memory);
+EXPORT_SYMBOL(vmalloc_earlyreserve);
+
+int randomize_va_space __read_mostly = 1;
+
+static int __init disable_randmaps(char *s)
+{
+	randomize_va_space = 0;
+	return 0;
+}
+__setup("norandmaps", disable_randmaps);
+
+
+/*
+ * If a p?d_bad entry is found while walking page tables, report
+ * the error, before resetting entry to p?d_none.  Usually (but
+ * very seldom) called out from the p?d_none_or_clear_bad macros.
+ */
+
+void pgd_clear_bad(pgd_t *pgd)
+{
+	pgd_ERROR(*pgd);
+	pgd_clear(pgd);
+}
+
+void pud_clear_bad(pud_t *pud)
+{
+	pud_ERROR(*pud);
+	pud_clear(pud);
+}
+
+void pmd_clear_bad(pmd_t *pmd)
+{
+	pmd_ERROR(*pmd);
+	pmd_clear(pmd);
+}
+
+/*
+ * Note: this doesn't free the actual pages themselves. That
+ * has been handled earlier when unmapping all the memory regions.
+ */
+static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
+{
+	struct page *page = pmd_page(*pmd);
+	pmd_clear(pmd);
+	pte_lock_deinit(page);
+	pte_free_tlb(tlb, page);
+	dec_page_state(nr_page_table_pages);
+	tlb->mm->nr_ptes--;
+}
+
+static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+				unsigned long addr, unsigned long end,
+				unsigned long floor, unsigned long ceiling)
+{
+	pmd_t *pmd;
+	unsigned long next;
+	unsigned long start;
+
+	start = addr;
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none_or_clear_bad(pmd))
+			continue;
+		free_pte_range(tlb, pmd);
+	} while (pmd++, addr = next, addr != end);
+
+	start &= PUD_MASK;
+	if (start < floor)
+		return;
+	if (ceiling) {
+		ceiling &= PUD_MASK;
+		if (!ceiling)
+			return;
+	}
+	if (end - 1 > ceiling - 1)
+		return;
+
+	pmd = pmd_offset(pud, start);
+	pud_clear(pud);
+	pmd_free_tlb(tlb, pmd);
+}
+
+static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+				unsigned long addr, unsigned long end,
+				unsigned long floor, unsigned long ceiling)
+{
+	pud_t *pud;
+	unsigned long next;
+	unsigned long start;
+
+	start = addr;
+	pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+	} while (pud++, addr = next, addr != end);
+
+	start &= PGDIR_MASK;
+	if (start < floor)
+		return;
+	if (ceiling) {
+		ceiling &= PGDIR_MASK;
+		if (!ceiling)
+			return;
+	}
+	if (end - 1 > ceiling - 1)
+		return;
+
+	pud = pud_offset(pgd, start);
+	pgd_clear(pgd);
+	pud_free_tlb(tlb, pud);
+}
+
+/*
+ * This function frees user-level page tables of a process.
+ *
+ * Must be called with pagetable lock held.
+ */
+void free_pgd_range(struct mmu_gather **tlb,
+			unsigned long addr, unsigned long end,
+			unsigned long floor, unsigned long ceiling)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	unsigned long start;
+
+	/*
+	 * The next few lines have given us lots of grief...
+	 *
+	 * Why are we testing PMD* at this top level?  Because often
+	 * there will be no work to do at all, and we'd prefer not to
+	 * go all the way down to the bottom just to discover that.
+	 *
+	 * Why all these "- 1"s?  Because 0 represents both the bottom
+	 * of the address space and the top of it (using -1 for the
+	 * top wouldn't help much: the masks would do the wrong thing).
+	 * The rule is that addr 0 and floor 0 refer to the bottom of
+	 * the address space, but end 0 and ceiling 0 refer to the top
+	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
+	 * that end 0 case should be mythical).
+	 *
+	 * Wherever addr is brought up or ceiling brought down, we must
+	 * be careful to reject "the opposite 0" before it confuses the
+	 * subsequent tests.  But what about where end is brought down
+	 * by PMD_SIZE below? no, end can't go down to 0 there.
+	 *
+	 * Whereas we round start (addr) and ceiling down, by different
+	 * masks at different levels, in order to test whether a table
+	 * now has no other vmas using it, so can be freed, we don't
+	 * bother to round floor or end up - the tests don't need that.
+	 */
+
+	addr &= PMD_MASK;
+	if (addr < floor) {
+		addr += PMD_SIZE;
+		if (!addr)
+			return;
+	}
+	if (ceiling) {
+		ceiling &= PMD_MASK;
+		if (!ceiling)
+			return;
+	}
+	if (end - 1 > ceiling - 1)
+		end -= PMD_SIZE;
+	if (addr > end - 1)
+		return;
+
+	start = addr;
+	pgd = pgd_offset((*tlb)->mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
+	} while (pgd++, addr = next, addr != end);
+
+	if (!(*tlb)->fullmm)
+		flush_tlb_pgtables((*tlb)->mm, start, end);
+}
+
+void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
+		unsigned long floor, unsigned long ceiling)
+{
+	while (vma) {
+		struct vm_area_struct *next = vma->vm_next;
+		unsigned long addr = vma->vm_start;
+
+		/*
+		 * Hide vma from rmap and vmtruncate before freeing pgtables
+		 */
+		anon_vma_unlink(vma);
+		unlink_file_vma(vma);
+
+		if (is_vm_hugetlb_page(vma)) {
+			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
+				floor, next? next->vm_start: ceiling);
+		} else {
+			/*
+			 * Optimization: gather nearby vmas into one call down
+			 */
+			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
+			       && !is_vm_hugetlb_page(next)) {
+				vma = next;
+				next = vma->vm_next;
+				anon_vma_unlink(vma);
+				unlink_file_vma(vma);
+			}
+			free_pgd_range(tlb, addr, vma->vm_end,
+				floor, next? next->vm_start: ceiling);
+		}
+		vma = next;
+	}
+}
+
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+{
+	struct page *new = pte_alloc_one(mm, address);
+	if (!new)
+		return -ENOMEM;
+
+	pte_lock_init(new);
+	spin_lock(&mm->page_table_lock);
+	if (pmd_present(*pmd)) {	/* Another has populated it */
+		pte_lock_deinit(new);
+		pte_free(new);
+	} else {
+		mm->nr_ptes++;
+		inc_page_state(nr_page_table_pages);
+		pmd_populate(mm, pmd, new);
+	}
+	spin_unlock(&mm->page_table_lock);
+	return 0;
+}
+
+int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
+{
+	pte_t *new = pte_alloc_one_kernel(&init_mm, address);
+	if (!new)
+		return -ENOMEM;
+
+	spin_lock(&init_mm.page_table_lock);
+	if (pmd_present(*pmd))		/* Another has populated it */
+		pte_free_kernel(new);
+	else
+		pmd_populate_kernel(&init_mm, pmd, new);
+	spin_unlock(&init_mm.page_table_lock);
+	return 0;
+}
+
+static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
+{
+	if (file_rss)
+		add_mm_counter(mm, file_rss, file_rss);
+	if (anon_rss)
+		add_mm_counter(mm, anon_rss, anon_rss);
+}
+
+/*
+ * This function is called to print an error when a bad pte
+ * is found. For example, we might have a PFN-mapped pte in
+ * a region that doesn't allow it.
+ *
+ * The calling function must still handle the error.
+ */
+void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
+{
+	printk(KERN_ERR "Bad pte = %08llx, process = %s, "
+			"vm_flags = %lx, vaddr = %lx\n",
+		(long long)pte_val(pte),
+		(vma->vm_mm == current->mm ? current->comm : "???"),
+		vma->vm_flags, vaddr);
+	dump_stack();
+}
+
+static inline int is_cow_mapping(unsigned int flags)
+{
+	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+}
+
+/*
+ * This function gets the "struct page" associated with a pte.
+ *
+ * NOTE! Some mappings do not have "struct pages". A raw PFN mapping
+ * will have each page table entry just pointing to a raw page frame
+ * number, and as far as the VM layer is concerned, those do not have
+ * pages associated with them - even if the PFN might point to memory
+ * that otherwise is perfectly fine and has a "struct page".
+ *
+ * The way we recognize those mappings is through the rules set up
+ * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set,
+ * and the vm_pgoff will point to the first PFN mapped: thus every
+ * page that is a raw mapping will always honor the rule
+ *
+ *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
+ *
+ * and if that isn't true, the page has been COW'ed (in which case it
+ * _does_ have a "struct page" associated with it even if it is in a
+ * VM_PFNMAP range).
+ */
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
+{
+	unsigned long pfn = pte_pfn(pte);
+
+	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
+		unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
+		if (pfn == vma->vm_pgoff + off)
+			return NULL;
+		if (!is_cow_mapping(vma->vm_flags))
+			return NULL;
+	}
+
+#ifdef CONFIG_DEBUG_VM
+	if (unlikely(!pfn_valid(pfn))) {
+		print_bad_pte(vma, pte, addr);
+		return NULL;
+	}
+#endif
+
+	/*
+	 * NOTE! We still have PageReserved() pages in the page 
+	 * tables. 
+	 *
+	 * The PAGE_ZERO() pages and various VDSO mappings can
+	 * cause them to exist.
+	 */
+	return pfn_to_page(pfn);
+}
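
Concretely, assuming 4 KB pages: for a VM_PFNMAP vma with vm_start =
0x40000000 and vm_pgoff = 0x100, the address 0x40003000 lies three
pages into the vma, so a pte there holding pfn 0x103 obeys the linear
rule and is treated as a raw frame with no struct page (NULL is
returned); in a private (COW) mapping, a pte holding any other pfn
must have been COW'ed, and its struct page is returned as normal.
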
+
+/*
+ * copy one vm_area from one task to the other. Assumes the page tables
+ * already present in the new task to be cleared in the whole range
+ * covered by this vma.
+ */
+
+static inline void
+copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
+		unsigned long addr, int *rss)
+{
+	unsigned long vm_flags = vma->vm_flags;
+	pte_t pte = *src_pte;
+	struct page *page;
+
+	/* pte contains position in swap or file, so copy. */
+	if (unlikely(!pte_present(pte))) {
+		if (!pte_file(pte)) {
+			swap_duplicate(pte_to_swp_entry(pte));
+			/* make sure dst_mm is on swapoff's mmlist. */
+			if (unlikely(list_empty(&dst_mm->mmlist))) {
+				spin_lock(&mmlist_lock);
+				if (list_empty(&dst_mm->mmlist))
+					list_add(&dst_mm->mmlist,
+						 &src_mm->mmlist);
+				spin_unlock(&mmlist_lock);
+			}
+		}
+		goto out_set_pte;
+	}
+
+	/*
+	 * If it's a COW mapping, write protect it both
+	 * in the parent and the child
+	 */
+	if (is_cow_mapping(vm_flags)) {
+		ptep_set_wrprotect(src_mm, addr, src_pte);
+		pte = *src_pte;
+	}
+
+	/*
+	 * If it's a shared mapping, mark it clean in
+	 * the child
+	 */
+	if (vm_flags & VM_SHARED)
+		pte = pte_mkclean(pte);
+	pte = pte_mkold(pte);
+
+	page = vm_normal_page(vma, addr, pte);
+	if (page) {
+		get_page(page);
+		page_dup_rmap(page);
+		rss[!!PageAnon(page)]++;
+	}
+
+out_set_pte:
+	set_pte_at(dst_mm, addr, dst_pte, pte);
+}
+
+static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+		pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
+		unsigned long addr, unsigned long end)
+{
+	pte_t *src_pte, *dst_pte;
+	spinlock_t *src_ptl, *dst_ptl;
+	int progress = 0;
+	int rss[2];
+
+again:
+	rss[1] = rss[0] = 0;
+	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
+	if (!dst_pte)
+		return -ENOMEM;
+	src_pte = pte_offset_map_nested(src_pmd, addr);
+	src_ptl = pte_lockptr(src_mm, src_pmd);
+	spin_lock(src_ptl);
+
+	do {
+		/*
+		 * We are holding two locks at this point - either of them
+		 * could generate latencies in another task on another CPU.
+		 */
+		if (progress >= 32) {
+			progress = 0;
+			if (need_resched() ||
+			    need_lockbreak(src_ptl) ||
+			    need_lockbreak(dst_ptl))
+				break;
+		}
+		if (pte_none(*src_pte)) {
+			progress++;
+			continue;
+		}
+		copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
+		progress += 8;
+	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
+
+	spin_unlock(src_ptl);
+	pte_unmap_nested(src_pte - 1);
+	add_mm_rss(dst_mm, rss[0], rss[1]);
+	pte_unmap_unlock(dst_pte - 1, dst_ptl);
+	cond_resched();
+	if (addr != end)
+		goto again;
+	return 0;
+}
+
+static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
+		unsigned long addr, unsigned long end)
+{
+	pmd_t *src_pmd, *dst_pmd;
+	unsigned long next;
+
+	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
+	if (!dst_pmd)
+		return -ENOMEM;
+	src_pmd = pmd_offset(src_pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none_or_clear_bad(src_pmd))
+			continue;
+		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
+						vma, addr, next))
+			return -ENOMEM;
+	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
+	return 0;
+}
+
+static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
+		unsigned long addr, unsigned long end)
+{
+	pud_t *src_pud, *dst_pud;
+	unsigned long next;
+
+	dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
+	if (!dst_pud)
+		return -ENOMEM;
+	src_pud = pud_offset(src_pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(src_pud))
+			continue;
+		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
+						vma, addr, next))
+			return -ENOMEM;
+	} while (dst_pud++, src_pud++, addr = next, addr != end);
+	return 0;
+}
+
+int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+		struct vm_area_struct *vma)
+{
+	pgd_t *src_pgd, *dst_pgd;
+	unsigned long next;
+	unsigned long addr = vma->vm_start;
+	unsigned long end = vma->vm_end;
+
+	/*
+	 * Don't copy ptes where a page fault will fill them correctly.
+	 * Fork becomes much lighter when there are big shared or private
+	 * readonly mappings. The tradeoff is that copy_page_range is more
+	 * efficient than faulting.
+	 */
+	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
+		if (!vma->anon_vma)
+			return 0;
+	}
+
+	if (is_vm_hugetlb_page(vma))
+		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
+
+	dst_pgd = pgd_offset(dst_mm, addr);
+	src_pgd = pgd_offset(src_mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(src_pgd))
+			continue;
+		if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
+						vma, addr, next))
+			return -ENOMEM;
+	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
+	return 0;
+}
+
+static unsigned long zap_pte_range(struct mmu_gather *tlb,
+				struct vm_area_struct *vma, pmd_t *pmd,
+				unsigned long addr, unsigned long end,
+				long *zap_work, struct zap_details *details)
+{
+	struct mm_struct *mm = tlb->mm;
+	pte_t *pte;
+	spinlock_t *ptl;
+	int file_rss = 0;
+	int anon_rss = 0;
+
+	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	do {
+		pte_t ptent = *pte;
+		if (pte_none(ptent)) {
+			(*zap_work)--;
+			continue;
+		}
+		if (pte_present(ptent)) {
+			struct page *page;
+
+			(*zap_work) -= PAGE_SIZE;
+
+			page = vm_normal_page(vma, addr, ptent);
+			if (unlikely(details) && page) {
+				/*
+				 * unmap_shared_mapping_pages() wants to
+				 * invalidate cache without truncating:
+				 * unmap shared but keep private pages.
+				 */
+				if (details->check_mapping &&
+				    details->check_mapping != page->mapping)
+					continue;
+				/*
+				 * Each page->index must be checked when
+				 * invalidating or truncating nonlinear.
+				 */
+				if (details->nonlinear_vma &&
+				    (page->index < details->first_index ||
+				     page->index > details->last_index))
+					continue;
+			}
+			ptent = ptep_get_and_clear_full(mm, addr, pte,
+							tlb->fullmm);
+			tlb_remove_tlb_entry(tlb, pte, addr);
+			if (unlikely(!page))
+				continue;
+			if (unlikely(details) && details->nonlinear_vma
+			    && linear_page_index(details->nonlinear_vma,
+						addr) != page->index)
+				set_pte_at(mm, addr, pte,
+					   pgoff_to_pte(page->index));
+			if (PageAnon(page))
+				anon_rss--;
+			else {
+				if (pte_dirty(ptent))
+					set_page_dirty(page);
+				if (pte_young(ptent))
+					mark_page_accessed(page);
+				file_rss--;
+			}
+			page_remove_rmap(page);
+			tlb_remove_page(tlb, page);
+			continue;
+		}
+		/*
+		 * If details->check_mapping, we leave swap entries;
+		 * if details->nonlinear_vma, we leave file entries.
+		 */
+		if (unlikely(details))
+			continue;
+		if (!pte_file(ptent))
+			free_swap_and_cache(pte_to_swp_entry(ptent));
+		pte_clear_full(mm, addr, pte, tlb->fullmm);
+	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
+
+	add_mm_rss(mm, file_rss, anon_rss);
+	pte_unmap_unlock(pte - 1, ptl);
+
+	return addr;
+}
+
+static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
+				struct vm_area_struct *vma, pud_t *pud,
+				unsigned long addr, unsigned long end,
+				long *zap_work, struct zap_details *details)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none_or_clear_bad(pmd)) {
+			(*zap_work)--;
+			continue;
+		}
+		next = zap_pte_range(tlb, vma, pmd, addr, next,
+						zap_work, details);
+	} while (pmd++, addr = next, (addr != end && *zap_work > 0));
+
+	return addr;
+}
+
+static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
+				struct vm_area_struct *vma, pgd_t *pgd,
+				unsigned long addr, unsigned long end,
+				long *zap_work, struct zap_details *details)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud)) {
+			(*zap_work)--;
+			continue;
+		}
+		next = zap_pmd_range(tlb, vma, pud, addr, next,
+						zap_work, details);
+	} while (pud++, addr = next, (addr != end && *zap_work > 0));
+
+	return addr;
+}
+
+static unsigned long unmap_page_range(struct mmu_gather *tlb,
+				struct vm_area_struct *vma,
+				unsigned long addr, unsigned long end,
+				long *zap_work, struct zap_details *details)
+{
+	pgd_t *pgd;
+	unsigned long next;
+
+	if (details && !details->check_mapping && !details->nonlinear_vma)
+		details = NULL;
+
+	BUG_ON(addr >= end);
+	tlb_start_vma(tlb, vma);
+	pgd = pgd_offset(vma->vm_mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd)) {
+			(*zap_work)--;
+			continue;
+		}
+		next = zap_pud_range(tlb, vma, pgd, addr, next,
+						zap_work, details);
+	} while (pgd++, addr = next, (addr != end && *zap_work > 0));
+	tlb_end_vma(tlb, vma);
+
+	return addr;
+}
+
+#ifdef CONFIG_PREEMPT
+# define ZAP_BLOCK_SIZE	(8 * PAGE_SIZE)
+#else
+/* No preempt: go for improved straight-line efficiency */
+# define ZAP_BLOCK_SIZE	(1024 * PAGE_SIZE)
+#endif
+
+/**
+ * unmap_vmas - unmap a range of memory covered by a list of vma's
+ * @tlbp: address of the caller's struct mmu_gather
+ * @vma: the starting vma
+ * @start_addr: virtual address at which to start unmapping
+ * @end_addr: virtual address at which to end unmapping
+ * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
+ * @details: details of nonlinear truncation or shared cache invalidation
+ *
+ * Returns the end address of the unmapping (restart addr if interrupted).
+ *
+ * Unmap all pages in the vma list.
+ *
+ * We aim to not hold locks for too long (for scheduling latency reasons).
+ * So zap pages in ZAP_BLOCK_SIZE bytecounts.  This means we need to
+ * return the ending mmu_gather to the caller.
+ *
+ * Only addresses between `start' and `end' will be unmapped.
+ *
+ * The VMA list must be sorted in ascending virtual address order.
+ *
+ * unmap_vmas() assumes that the caller will flush the whole unmapped address
+ * range after unmap_vmas() returns.  So the only responsibility here is to
+ * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
+ * drops the lock and schedules.
+ */
+unsigned long unmap_vmas(struct mmu_gather **tlbp,
+		struct vm_area_struct *vma, unsigned long start_addr,
+		unsigned long end_addr, unsigned long *nr_accounted,
+		struct zap_details *details)
+{
+	long zap_work = ZAP_BLOCK_SIZE;
+	unsigned long tlb_start = 0;	/* For tlb_finish_mmu */
+	int tlb_start_valid = 0;
+	unsigned long start = start_addr;
+	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
+	int fullmm = (*tlbp)->fullmm;
+
+	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
+		unsigned long end;
+
+		start = max(vma->vm_start, start_addr);
+		if (start >= vma->vm_end)
+			continue;
+		end = min(vma->vm_end, end_addr);
+		if (end <= vma->vm_start)
+			continue;
+
+		if (vma->vm_flags & VM_ACCOUNT)
+			*nr_accounted += (end - start) >> PAGE_SHIFT;
+
+		while (start != end) {
+			if (!tlb_start_valid) {
+				tlb_start = start;
+				tlb_start_valid = 1;
+			}
+
+			if (unlikely(is_vm_hugetlb_page(vma))) {
+				unmap_hugepage_range(vma, start, end);
+				zap_work -= (end - start) /
+						(HPAGE_SIZE / PAGE_SIZE);
+				start = end;
+			} else
+				start = unmap_page_range(*tlbp, vma,
+						start, end, &zap_work, details);
+
+			if (zap_work > 0) {
+				BUG_ON(start != end);
+				break;
+			}
+
+			tlb_finish_mmu(*tlbp, tlb_start, start);
+
+			if (need_resched() ||
+				(i_mmap_lock && need_lockbreak(i_mmap_lock))) {
+				if (i_mmap_lock) {
+					*tlbp = NULL;
+					goto out;
+				}
+				cond_resched();
+			}
+
+			*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
+			tlb_start_valid = 0;
+			zap_work = ZAP_BLOCK_SIZE;
+		}
+	}
+out:
+	return start;	/* which is now the end (or restart) address */
+}
+
+/**
+ * zap_page_range - remove user pages in a given range
+ * @vma: vm_area_struct holding the applicable pages
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
+ * @details: details of nonlinear truncation or shared cache invalidation
+ */
+unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
+		unsigned long size, struct zap_details *details)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_gather *tlb;
+	unsigned long end = address + size;
+	unsigned long nr_accounted = 0;
+
+	lru_add_drain();
+	tlb = tlb_gather_mmu(mm, 0);
+	update_hiwater_rss(mm);
+	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
+	if (tlb)
+		tlb_finish_mmu(tlb, address, end);
+	return end;
+}
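+
+/*
+ * Usage sketch (illustrative only, not part of this patch): unmapping a
+ * single page from a vma, where `vma' and `uaddr' are a hypothetical
+ * valid vma and a page-aligned address inside it, with the mm semaphore
+ * held by the caller:
+ *
+ *	zap_page_range(vma, uaddr, PAGE_SIZE, NULL);
+ *
+ * A NULL `details' means a plain unmap, with none of the nonlinear
+ * truncation or shared cache invalidation handling described above.
+ */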
+
+/*
+ * Do a quick page-table lookup for a single page.
+ */
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+			unsigned int flags)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+	struct page *page;
+	struct mm_struct *mm = vma->vm_mm;
+
+	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
+	if (!IS_ERR(page)) {
+		BUG_ON(flags & FOLL_GET);
+		goto out;
+	}
+
+	page = NULL;
+	pgd = pgd_offset(mm, address);
+	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+		goto no_page_table;
+
+	pud = pud_offset(pgd, address);
+	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+		goto no_page_table;
+	
+	pmd = pmd_offset(pud, address);
+	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		goto no_page_table;
+
+	if (pmd_huge(*pmd)) {
+		BUG_ON(flags & FOLL_GET);
+		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
+		goto out;
+	}
+
+	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!ptep)
+		goto out;
+
+	pte = *ptep;
+	if (!pte_present(pte))
+		goto unlock;
+	if ((flags & FOLL_WRITE) && !pte_write(pte))
+		goto unlock;
+	page = vm_normal_page(vma, address, pte);
+	if (unlikely(!page))
+		goto unlock;
+
+	if (flags & FOLL_GET)
+		get_page(page);
+	if (flags & FOLL_TOUCH) {
+		if ((flags & FOLL_WRITE) &&
+		    !pte_dirty(pte) && !PageDirty(page))
+			set_page_dirty(page);
+		mark_page_accessed(page);
+	}
+unlock:
+	pte_unmap_unlock(ptep, ptl);
+out:
+	return page;
+
+no_page_table:
+	/*
+	 * When core dumping an enormous anonymous area that nobody
+	 * has touched so far, we don't want to allocate page tables.
+	 */
+	if (flags & FOLL_ANON) {
+		page = ZERO_PAGE(address);
+		if (flags & FOLL_GET)
+			get_page(page);
+		BUG_ON(flags & FOLL_WRITE);
+	}
+	return page;
+}
+
+int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		unsigned long start, int len, int write, int force,
+		struct page **pages, struct vm_area_struct **vmas)
+{
+	int i;
+	unsigned int vm_flags;
+
+	/* 
+	 * Require read or write permissions.
+	 * If 'force' is set, we only require the "MAY" flags.
+	 */
+	vm_flags  = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+	vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+	i = 0;
+
+	do {
+		struct vm_area_struct *vma;
+		unsigned int foll_flags;
+
+		vma = find_extend_vma(mm, start);
+		if (!vma && in_gate_area(tsk, start)) {
+			unsigned long pg = start & PAGE_MASK;
+			struct vm_area_struct *gate_vma = get_gate_vma(tsk);
+			pgd_t *pgd;
+			pud_t *pud;
+			pmd_t *pmd;
+			pte_t *pte;
+			if (write) /* user gate pages are read-only */
+				return i ? : -EFAULT;
+			if (pg > TASK_SIZE)
+				pgd = pgd_offset_k(pg);
+			else
+				pgd = pgd_offset_gate(mm, pg);
+			BUG_ON(pgd_none(*pgd));
+			pud = pud_offset(pgd, pg);
+			BUG_ON(pud_none(*pud));
+			pmd = pmd_offset(pud, pg);
+			if (pmd_none(*pmd))
+				return i ? : -EFAULT;
+			pte = pte_offset_map(pmd, pg);
+			if (pte_none(*pte)) {
+				pte_unmap(pte);
+				return i ? : -EFAULT;
+			}
+			if (pages) {
+				struct page *page = vm_normal_page(gate_vma, start, *pte);
+				pages[i] = page;
+				if (page)
+					get_page(page);
+			}
+			pte_unmap(pte);
+			if (vmas)
+				vmas[i] = gate_vma;
+			i++;
+			start += PAGE_SIZE;
+			len--;
+			continue;
+		}
+
+		if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
+				|| !(vm_flags & vma->vm_flags))
+			return i ? : -EFAULT;
+
+		if (is_vm_hugetlb_page(vma)) {
+			i = follow_hugetlb_page(mm, vma, pages, vmas,
+						&start, &len, i);
+			continue;
+		}
+
+		foll_flags = FOLL_TOUCH;
+		if (pages)
+			foll_flags |= FOLL_GET;
+		if (!write && !(vma->vm_flags & VM_LOCKED) &&
+		    (!vma->vm_ops || !vma->vm_ops->nopage))
+			foll_flags |= FOLL_ANON;
+
+		do {
+			struct page *page;
+
+			if (write)
+				foll_flags |= FOLL_WRITE;
+
+			cond_resched();
+			while (!(page = follow_page(vma, start, foll_flags))) {
+				int ret;
+				ret = __handle_mm_fault(mm, vma, start,
+						foll_flags & FOLL_WRITE);
+				/*
+				 * The VM_FAULT_WRITE bit tells us that do_wp_page has
+				 * broken COW when necessary, even if maybe_mkwrite
+				 * decided not to set pte_write. We can thus safely do
+				 * subsequent page lookups as if they were reads.
+				 */
+				if (ret & VM_FAULT_WRITE)
+					foll_flags &= ~FOLL_WRITE;
+				
+				switch (ret & ~VM_FAULT_WRITE) {
+				case VM_FAULT_MINOR:
+					tsk->min_flt++;
+					break;
+				case VM_FAULT_MAJOR:
+					tsk->maj_flt++;
+					break;
+				case VM_FAULT_SIGBUS:
+					return i ? i : -EFAULT;
+				case VM_FAULT_OOM:
+					return i ? i : -ENOMEM;
+				default:
+					BUG();
+				}
+			}
+			if (pages) {
+				pages[i] = page;
+				flush_dcache_page(page);
+			}
+			if (vmas)
+				vmas[i] = vma;
+			i++;
+			start += PAGE_SIZE;
+			len--;
+		} while (len && start < vma->vm_end);
+	} while (len);
+	return i;
+}
+EXPORT_SYMBOL(get_user_pages);
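+
+/*
+ * Usage sketch (illustrative only, not part of this patch): pinning one
+ * page of user memory for reading, where `uaddr' is a hypothetical user
+ * address.  The mm semaphore must be held across the call:
+ *
+ *	struct page *page;
+ *	int ret;
+ *
+ *	down_read(&current->mm->mmap_sem);
+ *	ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK,
+ *			     1, 0, 0, &page, NULL);
+ *	up_read(&current->mm->mmap_sem);
+ *	if (ret == 1) {
+ *		... use the pinned page ...
+ *		page_cache_release(page);
+ *	}
+ *
+ * Here write=0 and force=0; passing write=1 would fault in writable
+ * ptes, breaking COW first as the VM_FAULT_WRITE handling above shows.
+ */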
+
+static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
+			unsigned long addr, unsigned long end, pgprot_t prot)
+{
+	pte_t *pte;
+	spinlock_t *ptl;
+
+	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+	if (!pte)
+		return -ENOMEM;
+	do {
+		struct page *page = ZERO_PAGE(addr);
+		pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
+		page_cache_get(page);
+		page_add_file_rmap(page);
+		inc_mm_counter(mm, file_rss);
+		BUG_ON(!pte_none(*pte));
+		set_pte_at(mm, addr, pte, zero_pte);
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	pte_unmap_unlock(pte - 1, ptl);
+	return 0;
+}
+
+static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
+			unsigned long addr, unsigned long end, pgprot_t prot)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pmd = pmd_alloc(mm, pud, addr);
+	if (!pmd)
+		return -ENOMEM;
+	do {
+		next = pmd_addr_end(addr, end);
+		if (zeromap_pte_range(mm, pmd, addr, next, prot))
+			return -ENOMEM;
+	} while (pmd++, addr = next, addr != end);
+	return 0;
+}
+
+static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
+			unsigned long addr, unsigned long end, pgprot_t prot)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_alloc(mm, pgd, addr);
+	if (!pud)
+		return -ENOMEM;
+	do {
+		next = pud_addr_end(addr, end);
+		if (zeromap_pmd_range(mm, pud, addr, next, prot))
+			return -ENOMEM;
+	} while (pud++, addr = next, addr != end);
+	return 0;
+}
+
+int zeromap_page_range(struct vm_area_struct *vma,
+			unsigned long addr, unsigned long size, pgprot_t prot)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	unsigned long end = addr + size;
+	struct mm_struct *mm = vma->vm_mm;
+	int err;
+
+	BUG_ON(addr >= end);
+	pgd = pgd_offset(mm, addr);
+	flush_cache_range(vma, addr, end);
+	do {
+		next = pgd_addr_end(addr, end);
+		err = zeromap_pud_range(mm, pgd, addr, next, prot);
+		if (err)
+			break;
+	} while (pgd++, addr = next, addr != end);
+	return err;
+}
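+
+/*
+ * Usage sketch (illustrative only, not part of this patch): this is the
+ * kind of call a /dev/zero-style mmap handler makes to back a private
+ * mapping entirely with the zero page:
+ *
+ *	if (zeromap_page_range(vma, vma->vm_start,
+ *			       vma->vm_end - vma->vm_start,
+ *			       vma->vm_page_prot))
+ *		return -EAGAIN;
+ *	return 0;
+ */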
+
+pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
+{
+	pgd_t * pgd = pgd_offset(mm, addr);
+	pud_t * pud = pud_alloc(mm, pgd, addr);
+	if (pud) {
+		pmd_t * pmd = pmd_alloc(mm, pud, addr);
+		if (pmd)
+			return pte_alloc_map_lock(mm, pmd, addr, ptl);
+	}
+	return NULL;
+}
+
+/*
+ * This is the old fallback for page remapping.
+ *
+ * For historical reasons, it only allows reserved pages. Only
+ * old drivers should use this, and they needed to mark their
+ * pages reserved for the old functions anyway.
+ */
+static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot)
+{
+	int retval;
+	pte_t *pte;
+	spinlock_t *ptl;  
+
+	retval = -EINVAL;
+	if (PageAnon(page))
+		goto out;
+	retval = -ENOMEM;
+	flush_dcache_page(page);
+	pte = get_locked_pte(mm, addr, &ptl);
+	if (!pte)
+		goto out;
+	retval = -EBUSY;
+	if (!pte_none(*pte))
+		goto out_unlock;
+
+	/* Ok, finally just insert the thing.. */
+	get_page(page);
+	inc_mm_counter(mm, file_rss);
+	page_add_file_rmap(page);
+	set_pte_at(mm, addr, pte, mk_pte(page, prot));
+
+	retval = 0;
+out_unlock:
+	pte_unmap_unlock(pte, ptl);
+out:
+	return retval;
+}
+
+/*
+ * This allows drivers to insert individual pages they've allocated
+ * into a user vma.
+ *
+ * The page has to be a nice clean _individual_ kernel allocation.
+ * If you allocate a compound page, you need to have marked it as
+ * such (__GFP_COMP), or manually just split the page up yourself
+ * (see split_page()).
+ *
+ * NOTE! Traditionally this was done with "remap_pfn_range()" which
+ * took an arbitrary page protection parameter. This doesn't allow
+ * that. Your vma protection will have to be set up correctly, which
+ * means that if you want a shared writable mapping, you'd better
+ * ask for a shared writable mapping!
+ *
+ * The page does not need to be reserved.
+ */
+int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page)
+{
+	if (addr < vma->vm_start || addr >= vma->vm_end)
+		return -EFAULT;
+	if (!page_count(page))
+		return -EINVAL;
+	vma->vm_flags |= VM_INSERTPAGE;
+	return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_insert_page);
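+
+/*
+ * Usage sketch (illustrative only, not part of this patch): a driver
+ * inserting one kernel-allocated page into a user vma from its mmap
+ * handler.  The names are hypothetical; the driver keeps its own
+ * reference to free the page later.
+ *
+ *	static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
+ *	{
+ *		struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ *
+ *		if (!page)
+ *			return -ENOMEM;
+ *		return vm_insert_page(vma, vma->vm_start, page);
+ *	}
+ *
+ * Unlike remap_pfn_range() below, the mapping has a struct page, so a
+ * plain kernel allocation works and needs no PageReserved marking.
+ */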
+
+/*
+ * Maps a range of physical memory into the requested pages. The old
+ * mappings are removed. Any references to nonexistent pages result
+ * in null mappings (currently treated as "copy-on-access").
+ */
+static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
+			unsigned long addr, unsigned long end,
+			unsigned long pfn, pgprot_t prot)
+{
+	pte_t *pte;
+	spinlock_t *ptl;
+
+	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+	if (!pte)
+		return -ENOMEM;
+	do {
+		BUG_ON(!pte_none(*pte));
+		set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
+		pfn++;
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	pte_unmap_unlock(pte - 1, ptl);
+	return 0;
+}
+
+static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
+			unsigned long addr, unsigned long end,
+			unsigned long pfn, pgprot_t prot)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pfn -= addr >> PAGE_SHIFT;
+	pmd = pmd_alloc(mm, pud, addr);
+	if (!pmd)
+		return -ENOMEM;
+	do {
+		next = pmd_addr_end(addr, end);
+		if (remap_pte_range(mm, pmd, addr, next,
+				pfn + (addr >> PAGE_SHIFT), prot))
+			return -ENOMEM;
+	} while (pmd++, addr = next, addr != end);
+	return 0;
+}
+
+static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
+			unsigned long addr, unsigned long end,
+			unsigned long pfn, pgprot_t prot)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pfn -= addr >> PAGE_SHIFT;
+	pud = pud_alloc(mm, pgd, addr);
+	if (!pud)
+		return -ENOMEM;
+	do {
+		next = pud_addr_end(addr, end);
+		if (remap_pmd_range(mm, pud, addr, next,
+				pfn + (addr >> PAGE_SHIFT), prot))
+			return -ENOMEM;
+	} while (pud++, addr = next, addr != end);
+	return 0;
+}
+
+/*  Note: this is only safe if the mm semaphore is held when called. */
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+		    unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	unsigned long end = addr + PAGE_ALIGN(size);
+	struct mm_struct *mm = vma->vm_mm;
+	int err;
+
+	/*
+	 * Physically remapped pages are special. Tell the
+	 * rest of the world about it:
+	 *   VM_IO tells people not to look at these pages
+	 *	(accesses can have side effects).
+	 *   VM_RESERVED is specified all over the place, because
+	 *	in 2.4 it kept swapout's vma scan off this vma; but
+	 *	in 2.6 the LRU scan won't even find its pages, so this
+	 *	flag means no more than count its pages in reserved_vm,
+	 *	and omit it from core dump, even when VM_IO is turned off.
+	 *   VM_PFNMAP tells the core MM that the base pages are just
+	 *	raw PFN mappings, and do not have a "struct page" associated
+	 *	with them.
+	 *
+	 * There's a horrible special case to handle copy-on-write
+	 * behaviour that some programs depend on. We mark the "original"
+	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
+	 */
+	if (is_cow_mapping(vma->vm_flags)) {
+		if (addr != vma->vm_start || end != vma->vm_end)
+			return -EINVAL;
+		vma->vm_pgoff = pfn;
+	}
+
+	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+
+	BUG_ON(addr >= end);
+	pfn -= addr >> PAGE_SHIFT;
+	pgd = pgd_offset(mm, addr);
+	flush_cache_range(vma, addr, end);
+	do {
+		next = pgd_addr_end(addr, end);
+		err = remap_pud_range(mm, pgd, addr, next,
+				pfn + (addr >> PAGE_SHIFT), prot);
+		if (err)
+			break;
+	} while (pgd++, addr = next, addr != end);
+	return err;
+}
+EXPORT_SYMBOL(remap_pfn_range);
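+
+/*
+ * Usage sketch (illustrative only, not part of this patch): the classic
+ * driver mmap handler for device memory.  MYDRV_PHYS_BASE is a
+ * hypothetical physical base address.
+ *
+ *	static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
+ *	{
+ *		unsigned long size = vma->vm_end - vma->vm_start;
+ *
+ *		if (remap_pfn_range(vma, vma->vm_start,
+ *				    MYDRV_PHYS_BASE >> PAGE_SHIFT,
+ *				    size, vma->vm_page_prot))
+ *			return -EAGAIN;
+ *		return 0;
+ *	}
+ */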
+
+/*
+ * handle_pte_fault chooses page fault handler according to an entry
+ * which was read non-atomically.  Before making any commitment, on
+ * those architectures or configurations (e.g. i386 with PAE) which
+ * might give a mix of unmatched parts, do_swap_page and do_file_page
+ * must check under lock before unmapping the pte and proceeding
+ * (but do_wp_page is only called after already making such a check;
+ * and do_anonymous_page and do_no_page can safely check later on).
+ */
+static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
+				pte_t *page_table, pte_t orig_pte)
+{
+	int same = 1;
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
+	if (sizeof(pte_t) > sizeof(unsigned long)) {
+		spinlock_t *ptl = pte_lockptr(mm, pmd);
+		spin_lock(ptl);
+		same = pte_same(*page_table, orig_pte);
+		spin_unlock(ptl);
+	}
+#endif
+	pte_unmap(page_table);
+	return same;
+}
+
+/*
+ * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
+ * servicing faults for write access.  In the normal case, do always want
+ * pte_mkwrite.  But get_user_pages can cause write faults for mappings
+ * that do not have writing enabled, when used by access_process_vm.
+ */
+static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+{
+	if (likely(vma->vm_flags & VM_WRITE))
+		pte = pte_mkwrite(pte);
+	return pte;
+}
+
+static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
+{
+	/*
+	 * If the source page was a PFN mapping, we don't have
+	 * a "struct page" for it. We do a best-effort copy by
+	 * just copying from the original user address. If that
+	 * fails, we just zero-fill it. Live with it.
+	 */
+	if (unlikely(!src)) {
+		void *kaddr = kmap_atomic(dst, KM_USER0);
+		void __user *uaddr = (void __user *)(va & PAGE_MASK);
+
+		/*
+		 * This really shouldn't fail, because the page is there
+		 * in the page tables. But it might just be unreadable,
+		 * in which case we just give up and fill the result with
+		 * zeroes.
+		 */
+		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
+			memset(kaddr, 0, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		return;
+	}
+	copy_user_highpage(dst, src, va);
+}
+
+/*
+ * This routine handles present pages, when users try to write
+ * to a shared page. It is done by copying the page to a new address
+ * and decrementing the shared-page counter for the old page.
+ *
+ * Note that this routine assumes that the protection checks have been
+ * done by the caller (the low-level page fault routine in most cases).
+ * Thus we can safely just mark it writable once we've done any necessary
+ * COW.
+ *
+ * We also mark the page dirty at this point even though the page will
+ * change only once the write actually happens. This avoids a few races,
+ * and potentially makes it more efficient.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), with pte both mapped and locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */
+static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		spinlock_t *ptl, pte_t orig_pte)
+{
+	struct page *old_page, *new_page;
+	pte_t entry;
+	int ret = VM_FAULT_MINOR;
+
+	old_page = vm_normal_page(vma, address, orig_pte);
+	if (!old_page)
+		goto gotten;
+
+	if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
+		int reuse = can_share_swap_page(old_page);
+		unlock_page(old_page);
+		if (reuse) {
+			flush_cache_page(vma, address, pte_pfn(orig_pte));
+			entry = pte_mkyoung(orig_pte);
+			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+			ptep_set_access_flags(vma, address, page_table, entry, 1);
+			update_mmu_cache(vma, address, entry);
+			lazy_mmu_prot_update(entry);
+			ret |= VM_FAULT_WRITE;
+			goto unlock;
+		}
+	}
+
+	/*
+	 * Ok, we need to copy. Oh, well..
+	 */
+	page_cache_get(old_page);
+gotten:
+	pte_unmap_unlock(page_table, ptl);
+
+	if (unlikely(anon_vma_prepare(vma)))
+		goto oom;
+	if (old_page == ZERO_PAGE(address)) {
+		new_page = alloc_zeroed_user_highpage(vma, address);
+		if (!new_page)
+			goto oom;
+	} else {
+		new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+		if (!new_page)
+			goto oom;
+		cow_user_page(new_page, old_page, address);
+	}
+
+	/*
+	 * Re-check the pte - we dropped the lock
+	 */
+	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (likely(pte_same(*page_table, orig_pte))) {
+		if (old_page) {
+			page_remove_rmap(old_page);
+			if (!PageAnon(old_page)) {
+				dec_mm_counter(mm, file_rss);
+				inc_mm_counter(mm, anon_rss);
+			}
+		} else
+			inc_mm_counter(mm, anon_rss);
+		flush_cache_page(vma, address, pte_pfn(orig_pte));
+		entry = mk_pte(new_page, vma->vm_page_prot);
+		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		ptep_establish(vma, address, page_table, entry);
+		update_mmu_cache(vma, address, entry);
+		lazy_mmu_prot_update(entry);
+		lru_cache_add_active(new_page);
+		page_add_new_anon_rmap(new_page, vma, address);
+
+		/* Free the old page.. */
+		new_page = old_page;
+		ret |= VM_FAULT_WRITE;
+	}
+	if (new_page)
+		page_cache_release(new_page);
+	if (old_page)
+		page_cache_release(old_page);
+unlock:
+	pte_unmap_unlock(page_table, ptl);
+	return ret;
+oom:
+	if (old_page)
+		page_cache_release(old_page);
+	return VM_FAULT_OOM;
+}
+
+/*
+ * Helper functions for unmap_mapping_range().
+ *
+ * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
+ *
+ * We have to restart searching the prio_tree whenever we drop the lock,
+ * since the iterator is only valid while the lock is held, and anyway
+ * a later vma might be split and reinserted earlier while lock dropped.
+ *
+ * The list of nonlinear vmas could be handled more efficiently, using
+ * a placeholder, but handle it in the same way until a need is shown.
+ * It is important to search the prio_tree before nonlinear list: a vma
+ * may become nonlinear and be shifted from prio_tree to nonlinear list
+ * while the lock is dropped; but never shifted from list to prio_tree.
+ *
+ * In order to make forward progress despite restarting the search,
+ * vm_truncate_count is used to mark a vma as now dealt with, so we can
+ * quickly skip it next time around.  Since the prio_tree search only
+ * shows us those vmas affected by unmapping the range in question, we
+ * can't efficiently keep all vmas in step with mapping->truncate_count:
+ * so instead reset them all whenever it wraps back to 0 (then go to 1).
+ * mapping->truncate_count and vma->vm_truncate_count are protected by
+ * i_mmap_lock.
+ *
+ * In order to make forward progress despite repeatedly restarting some
+ * large vma, note the restart_addr from unmap_vmas when it breaks out:
+ * and restart from that address when we reach that vma again.  It might
+ * have been split or merged, shrunk or extended, but never shifted: so
+ * restart_addr remains valid so long as it remains in the vma's range.
+ * unmap_mapping_range forces truncate_count to leap over page-aligned
+ * values so we can save vma's restart_addr in its truncate_count field.
+ */
+#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
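+
+/*
+ * Worked example (illustrative): with 4K pages on 32-bit, PAGE_MASK is
+ * 0xfffff000, so is_restart_addr(0x3000) is true (page-aligned, hence a
+ * saved restart address) while is_restart_addr(0x3001) is false (an
+ * ordinary truncate_count).  That is why unmap_mapping_range() below
+ * makes truncate_count skip over page-aligned values.
+ */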
+
+static void reset_vma_truncate_counts(struct address_space *mapping)
+{
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
+		vma->vm_truncate_count = 0;
+	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
+		vma->vm_truncate_count = 0;
+}
+
+static int unmap_mapping_range_vma(struct vm_area_struct *vma,
+		unsigned long start_addr, unsigned long end_addr,
+		struct zap_details *details)
+{
+	unsigned long restart_addr;
+	int need_break;
+
+again:
+	restart_addr = vma->vm_truncate_count;
+	if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
+		start_addr = restart_addr;
+		if (start_addr >= end_addr) {
+			/* Top of vma has been split off since last time */
+			vma->vm_truncate_count = details->truncate_count;
+			return 0;
+		}
+	}
+
+	restart_addr = zap_page_range(vma, start_addr,
+					end_addr - start_addr, details);
+	need_break = need_resched() ||
+			need_lockbreak(details->i_mmap_lock);
+
+	if (restart_addr >= end_addr) {
+		/* We have now completed this vma: mark it so */
+		vma->vm_truncate_count = details->truncate_count;
+		if (!need_break)
+			return 0;
+	} else {
+		/* Note restart_addr in vma's truncate_count field */
+		vma->vm_truncate_count = restart_addr;
+		if (!need_break)
+			goto again;
+	}
+
+	spin_unlock(details->i_mmap_lock);
+	cond_resched();
+	spin_lock(details->i_mmap_lock);
+	return -EINTR;
+}
+
+static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
+					    struct zap_details *details)
+{
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	pgoff_t vba, vea, zba, zea;
+
+restart:
+	vma_prio_tree_foreach(vma, &iter, root,
+			details->first_index, details->last_index) {
+		/* Skip quickly over those we have already dealt with */
+		if (vma->vm_truncate_count == details->truncate_count)
+			continue;
+
+		vba = vma->vm_pgoff;
+		vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
+		/* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
+		zba = details->first_index;
+		if (zba < vba)
+			zba = vba;
+		zea = details->last_index;
+		if (zea > vea)
+			zea = vea;
+
+		if (unmap_mapping_range_vma(vma,
+			((zba - vba) << PAGE_SHIFT) + vma->vm_start,
+			((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
+				details) < 0)
+			goto restart;
+	}
+}
+
+static inline void unmap_mapping_range_list(struct list_head *head,
+					    struct zap_details *details)
+{
+	struct vm_area_struct *vma;
+
+	/*
+	 * In nonlinear VMAs there is no correspondence between virtual address
+	 * offset and file offset.  So we must perform an exhaustive search
+	 * across *all* the pages in each nonlinear VMA, not just the pages
+	 * whose virtual address lies outside the file truncation point.
+	 */
+restart:
+	list_for_each_entry(vma, head, shared.vm_set.list) {
+		/* Skip quickly over those we have already dealt with */
+		if (vma->vm_truncate_count == details->truncate_count)
+			continue;
+		details->nonlinear_vma = vma;
+		if (unmap_mapping_range_vma(vma, vma->vm_start,
+					vma->vm_end, details) < 0)
+			goto restart;
+	}
+}
+
+/**
+ * unmap_mapping_range - unmap the portion of all mmaps
+ * in the specified address_space corresponding to the specified
+ * page range in the underlying file.
+ * @mapping: the address space containing mmaps to be unmapped.
+ * @holebegin: byte in first page to unmap, relative to the start of
+ * the underlying file.  This will be rounded down to a PAGE_SIZE
+ * boundary.  Note that this is different from vmtruncate(), which
+ * must keep the partial page.  In contrast, we must get rid of
+ * partial pages.
+ * @holelen: size of prospective hole in bytes.  This will be rounded
+ * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
+ * end of the file.
+ * @even_cows: 1 when truncating a file, unmap even private COWed pages;
+ * but 0 when invalidating pagecache, don't throw away private data.
+ */
+void unmap_mapping_range(struct address_space *mapping,
+		loff_t const holebegin, loff_t const holelen, int even_cows)
+{
+	struct zap_details details;
+	pgoff_t hba = holebegin >> PAGE_SHIFT;
+	pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+	/* Check for overflow. */
+	if (sizeof(holelen) > sizeof(hlen)) {
+		long long holeend =
+			(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		if (holeend & ~(long long)ULONG_MAX)
+			hlen = ULONG_MAX - hba + 1;
+	}
+
+	details.check_mapping = even_cows ? NULL : mapping;
+	details.nonlinear_vma = NULL;
+	details.first_index = hba;
+	details.last_index = hba + hlen - 1;
+	if (details.last_index < details.first_index)
+		details.last_index = ULONG_MAX;
+	details.i_mmap_lock = &mapping->i_mmap_lock;
+
+	spin_lock(&mapping->i_mmap_lock);
+
+	/* serialize i_size write against truncate_count write */
+	smp_wmb();
+	/* Protect against page faults, and endless unmapping loops */
+	mapping->truncate_count++;
+	/*
+	 * For archs where spin_lock has inclusive semantics, like ia64,
+	 * this smp_mb() prevents pagetable contents from being read
+	 * before the truncate_count increment is visible to
+	 * other CPUs.
+	 */
+	smp_mb();
+	if (unlikely(is_restart_addr(mapping->truncate_count))) {
+		if (mapping->truncate_count == 0)
+			reset_vma_truncate_counts(mapping);
+		mapping->truncate_count++;
+	}
+	details.truncate_count = mapping->truncate_count;
+
+	if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
+		unmap_mapping_range_tree(&mapping->i_mmap, &details);
+	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
+		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
+	spin_unlock(&mapping->i_mmap_lock);
+}
+EXPORT_SYMBOL(unmap_mapping_range);
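+
+/*
+ * Usage sketch (illustrative only, not part of this patch): vmtruncate()
+ * below is the canonical caller, passing even_cows=1 when shrinking a
+ * file.  A filesystem invalidating pagecache, without destroying private
+ * COWed copies, would instead do something like:
+ *
+ *	unmap_mapping_range(mapping, holebegin, holelen, 0);
+ */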
+
+/*
+ * Handle all mappings that got truncated by a "truncate()"
+ * system call.
+ *
+ * NOTE! We have to be ready to update the memory sharing
+ * between the file and the memory map for a potential last
+ * incomplete page.  Ugly, but necessary.
+ */
+int vmtruncate(struct inode * inode, loff_t offset)
+{
+	struct address_space *mapping = inode->i_mapping;
+	unsigned long limit;
+
+	if (inode->i_size < offset)
+		goto do_expand;
+	/*
+	 * truncation of in-use swapfiles is disallowed - it would cause
+	 * subsequent swapout to scribble on the now-freed blocks.
+	 */
+	if (IS_SWAPFILE(inode))
+		goto out_busy;
+	i_size_write(inode, offset);
+	unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
+	truncate_inode_pages(mapping, offset);
+	goto out_truncate;
+
+do_expand:
+	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+	if (limit != RLIM_INFINITY && offset > limit)
+		goto out_sig;
+	if (offset > inode->i_sb->s_maxbytes)
+		goto out_big;
+	i_size_write(inode, offset);
+
+out_truncate:
+	if (inode->i_op && inode->i_op->truncate)
+		inode->i_op->truncate(inode);
+	return 0;
+out_sig:
+	send_sig(SIGXFSZ, current, 0);
+out_big:
+	return -EFBIG;
+out_busy:
+	return -ETXTBSY;
+}
+EXPORT_SYMBOL(vmtruncate);
+
+int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
+{
+	struct address_space *mapping = inode->i_mapping;
+
+	/*
+	 * If the underlying filesystem is not going to provide
+	 * a way to truncate a range of blocks (punch a hole) -
+	 * we should return failure right now.
+	 */
+	if (!inode->i_op || !inode->i_op->truncate_range)
+		return -ENOSYS;
+
+	mutex_lock(&inode->i_mutex);
+	down_write(&inode->i_alloc_sem);
+	unmap_mapping_range(mapping, offset, (end - offset), 1);
+	truncate_inode_pages_range(mapping, offset, end);
+	inode->i_op->truncate_range(inode, offset, end);
+	up_write(&inode->i_alloc_sem);
+	mutex_unlock(&inode->i_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL(vmtruncate_range);
+
+/* 
+ * Primitive swap readahead code. We simply read an aligned block of
+ * (1 << page_cluster) entries in the swap area. This method is chosen
+ * because it doesn't cost us any seek time.  We also make sure to queue
+ * the 'original' request together with the readahead ones...  
+ *
+ * This has been extended to use the NUMA policies from the mm triggering
+ * the readahead.
+ *
+ * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
+ */
+void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma)
+{
+#ifdef CONFIG_NUMA
+	struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL;
+#endif
+	int i, num;
+	struct page *new_page;
+	unsigned long offset;
+
+	/*
+	 * Get the number of swap entries we should do readahead I/O on.
+	 */
+	num = valid_swaphandles(entry, &offset);
+	for (i = 0; i < num; offset++, i++) {
+		/* Ok, do the async read-ahead now */
+		new_page = read_swap_cache_async(swp_entry(swp_type(entry),
+							   offset), vma, addr);
+		if (!new_page)
+			break;
+		page_cache_release(new_page);
+#ifdef CONFIG_NUMA
+		/*
+		 * Find the next applicable VMA for the NUMA policy.
+		 */
+		addr += PAGE_SIZE;
+		if (addr == 0)
+			vma = NULL;
+		if (vma) {
+			if (addr >= vma->vm_end) {
+				vma = next_vma;
+				next_vma = vma ? vma->vm_next : NULL;
+			}
+			if (vma && addr < vma->vm_start)
+				vma = NULL;
+		} else {
+			if (next_vma && addr >= next_vma->vm_start) {
+				vma = next_vma;
+				next_vma = vma->vm_next;
+			}
+		}
+#endif
+	}
+	lru_add_drain();	/* Push any new pages onto the LRU now */
+}
+
+/*
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */
+static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		int write_access, pte_t orig_pte)
+{
+	spinlock_t *ptl;
+	struct page *page;
+	swp_entry_t entry;
+	pte_t pte;
+	int ret = VM_FAULT_MINOR;
+
+	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
+		goto out;
+
+	entry = pte_to_swp_entry(orig_pte);
+again:
+	page = lookup_swap_cache(entry);
+	if (!page) {
+		swapin_readahead(entry, address, vma);
+		page = read_swap_cache_async(entry, vma, address);
+		if (!page) {
+			/*
+			 * Back out if somebody else faulted in this pte
+			 * while we released the pte lock.
+			 */
+			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+			if (likely(pte_same(*page_table, orig_pte)))
+				ret = VM_FAULT_OOM;
+			goto unlock;
+		}
+
+		/* Had to read the page from swap area: Major fault */
+		ret = VM_FAULT_MAJOR;
+		inc_page_state(pgmajfault);
+		grab_swap_token();
+	}
+
+	mark_page_accessed(page);
+	lock_page(page);
+	if (!PageSwapCache(page)) {
+		/* Page migration has occurred */
+		unlock_page(page);
+		page_cache_release(page);
+		goto again;
+	}
+
+	/*
+	 * Back out if somebody else already faulted in this pte.
+	 */
+	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (unlikely(!pte_same(*page_table, orig_pte)))
+		goto out_nomap;
+
+	if (unlikely(!PageUptodate(page))) {
+		ret = VM_FAULT_SIGBUS;
+		goto out_nomap;
+	}
+
+	/* The page isn't present yet, go ahead with the fault. */
+
+	inc_mm_counter(mm, anon_rss);
+	pte = mk_pte(page, vma->vm_page_prot);
+	if (write_access && can_share_swap_page(page)) {
+		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+		write_access = 0;
+	}
+
+	flush_icache_page(vma, page);
+	set_pte_at(mm, address, page_table, pte);
+	page_add_anon_rmap(page, vma, address);
+
+	swap_free(entry);
+	if (vm_swap_full())
+		remove_exclusive_swap_page(page);
+	unlock_page(page);
+
+	if (write_access) {
+		if (do_wp_page(mm, vma, address,
+				page_table, pmd, ptl, pte) == VM_FAULT_OOM)
+			ret = VM_FAULT_OOM;
+		goto out;
+	}
+
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(vma, address, pte);
+	lazy_mmu_prot_update(pte);
+unlock:
+	pte_unmap_unlock(page_table, ptl);
+out:
+	return ret;
+out_nomap:
+	pte_unmap_unlock(page_table, ptl);
+	unlock_page(page);
+	page_cache_release(page);
+	return ret;
+}
+
+/*
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */
+static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		int write_access)
+{
+	struct page *page;
+	spinlock_t *ptl;
+	pte_t entry;
+
+	if (write_access) {
+		/* Allocate our own private page. */
+		pte_unmap(page_table);
+
+		if (unlikely(anon_vma_prepare(vma)))
+			goto oom;
+		page = alloc_zeroed_user_highpage(vma, address);
+		if (!page)
+			goto oom;
+
+		entry = mk_pte(page, vma->vm_page_prot);
+		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+
+		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+		if (!pte_none(*page_table))
+			goto release;
+		inc_mm_counter(mm, anon_rss);
+		lru_cache_add_active(page);
+		page_add_new_anon_rmap(page, vma, address);
+	} else {
+		/* Map the ZERO_PAGE - vm_page_prot is readonly */
+		page = ZERO_PAGE(address);
+		page_cache_get(page);
+		entry = mk_pte(page, vma->vm_page_prot);
+
+		ptl = pte_lockptr(mm, pmd);
+		spin_lock(ptl);
+		if (!pte_none(*page_table))
+			goto release;
+		inc_mm_counter(mm, file_rss);
+		page_add_file_rmap(page);
+	}
+
+	set_pte_at(mm, address, page_table, entry);
+
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(vma, address, entry);
+	lazy_mmu_prot_update(entry);
+unlock:
+	pte_unmap_unlock(page_table, ptl);
+	return VM_FAULT_MINOR;
+release:
+	page_cache_release(page);
+	goto unlock;
+oom:
+	return VM_FAULT_OOM;
+}
+
+/*
+ * do_no_page() tries to create a new page mapping. It aggressively
+ * tries to share with existing pages, but makes a separate copy if
+ * the "write_access" parameter is true in order to avoid the next
+ * page fault.
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */
+static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		int write_access)
+{
+	spinlock_t *ptl;
+	struct page *new_page;
+	struct address_space *mapping = NULL;
+	pte_t entry;
+	unsigned int sequence = 0;
+	int ret = VM_FAULT_MINOR;
+	int anon = 0;
+
+	pte_unmap(page_table);
+	BUG_ON(vma->vm_flags & VM_PFNMAP);
+
+	if (vma->vm_file) {
+		mapping = vma->vm_file->f_mapping;
+		sequence = mapping->truncate_count;
+		smp_rmb(); /* serializes i_size against truncate_count */
+	}
+retry:
+	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
+	/*
+	 * No smp_rmb is needed here as long as there's a full
+	 * spin_lock/unlock sequence inside the ->nopage callback
+	 * (for the pagecache lookup) that acts as an implicit
+	 * smp_mb() and prevents the i_size read from happening
+	 * after the next truncate_count read.
+	 */
+
+	/* no page was available -- either SIGBUS or OOM */
+	if (new_page == NOPAGE_SIGBUS)
+		return VM_FAULT_SIGBUS;
+	if (new_page == NOPAGE_OOM)
+		return VM_FAULT_OOM;
+
+	/*
+	 * Should we do an early C-O-W break?
+	 */
+	if (write_access && !(vma->vm_flags & VM_SHARED)) {
+		struct page *page;
+
+		if (unlikely(anon_vma_prepare(vma)))
+			goto oom;
+		page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+		if (!page)
+			goto oom;
+		copy_user_highpage(page, new_page, address);
+		page_cache_release(new_page);
+		new_page = page;
+		anon = 1;
+	}
+
+	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	/*
+	 * For a file-backed vma, someone could have truncated or otherwise
+	 * invalidated this page.  If unmap_mapping_range got called,
+	 * retry getting the page.
+	 */
+	if (mapping && unlikely(sequence != mapping->truncate_count)) {
+		pte_unmap_unlock(page_table, ptl);
+		page_cache_release(new_page);
+		cond_resched();
+		sequence = mapping->truncate_count;
+		smp_rmb();
+		goto retry;
+	}
+
+	/*
+	 * This silly early PAGE_DIRTY setting removes a race
+	 * due to the bad i386 page protection. But it's valid
+	 * for other architectures too.
+	 *
+	 * Note that if write_access is true, we either now have
+	 * an exclusive copy of the page, or this is a shared mapping,
+	 * so we can make it writable and dirty to avoid having to
+	 * handle that later.
+	 */
+	/* Only go through if we didn't race with anybody else... */
+	if (pte_none(*page_table)) {
+		flush_icache_page(vma, new_page);
+		entry = mk_pte(new_page, vma->vm_page_prot);
+		if (write_access)
+			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		set_pte_at(mm, address, page_table, entry);
+		if (anon) {
+			inc_mm_counter(mm, anon_rss);
+			lru_cache_add_active(new_page);
+			page_add_new_anon_rmap(new_page, vma, address);
+		} else {
+			inc_mm_counter(mm, file_rss);
+			page_add_file_rmap(new_page);
+		}
+	} else {
+		/* One of our sibling threads was faster, back out. */
+		page_cache_release(new_page);
+		goto unlock;
+	}
+
+	/* no need to invalidate: a not-present page shouldn't be cached */
+	update_mmu_cache(vma, address, entry);
+	lazy_mmu_prot_update(entry);
+unlock:
+	pte_unmap_unlock(page_table, ptl);
+	return ret;
+oom:
+	page_cache_release(new_page);
+	return VM_FAULT_OOM;
+}
+
+/*
+ * Fault of a previously existing named mapping. Repopulate the pte
+ * from the encoded file_pte if possible. This enables swappable
+ * nonlinear vmas.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */
+static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		int write_access, pte_t orig_pte)
+{
+	pgoff_t pgoff;
+	int err;
+
+	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
+		return VM_FAULT_MINOR;
+
+	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
+		/*
+		 * Page table corrupted: show pte and kill process.
+		 */
+		print_bad_pte(vma, orig_pte, address);
+		return VM_FAULT_OOM;
+	}
+	/* We can then assume vma->vm_ops && vma->vm_ops->populate */
+
+	pgoff = pte_to_pgoff(orig_pte);
+	err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
+					vma->vm_page_prot, pgoff, 0);
+	if (err == -ENOMEM)
+		return VM_FAULT_OOM;
+	if (err)
+		return VM_FAULT_SIGBUS;
+	return VM_FAULT_MAJOR;
+}
+
+/*
+ * These routines also need to handle stuff like marking pages dirty
+ * and/or accessed for architectures that don't do it in hardware (most
+ * RISC architectures).  The early dirtying is also good on the i386.
+ *
+ * There is also a hook called "update_mmu_cache()" that architectures
+ * with external mmu caches can use to update those (ie the Sparc or
+ * PowerPC hashed page tables that act as extended TLBs).
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */
+static inline int handle_pte_fault(struct mm_struct *mm,
+		struct vm_area_struct *vma, unsigned long address,
+		pte_t *pte, pmd_t *pmd, int write_access)
+{
+	pte_t entry;
+	pte_t old_entry;
+	spinlock_t *ptl;
+
+	old_entry = entry = *pte;
+	if (!pte_present(entry)) {
+		if (pte_none(entry)) {
+			if (!vma->vm_ops || !vma->vm_ops->nopage)
+				return do_anonymous_page(mm, vma, address,
+					pte, pmd, write_access);
+			return do_no_page(mm, vma, address,
+					pte, pmd, write_access);
+		}
+		if (pte_file(entry))
+			return do_file_page(mm, vma, address,
+					pte, pmd, write_access, entry);
+		return do_swap_page(mm, vma, address,
+					pte, pmd, write_access, entry);
+	}
+
+	ptl = pte_lockptr(mm, pmd);
+	spin_lock(ptl);
+	if (unlikely(!pte_same(*pte, entry)))
+		goto unlock;
+	if (write_access) {
+		if (!pte_write(entry))
+			return do_wp_page(mm, vma, address,
+					pte, pmd, ptl, entry);
+		entry = pte_mkdirty(entry);
+	}
+	entry = pte_mkyoung(entry);
+	if (!pte_same(old_entry, entry)) {
+		ptep_set_access_flags(vma, address, pte, entry, write_access);
+		update_mmu_cache(vma, address, entry);
+		lazy_mmu_prot_update(entry);
+	} else {
+		/*
+		 * This is needed only for protection faults but the arch code
+		 * is not yet telling us if this is a protection fault or not.
+		 * This still avoids useless tlb flushes for .text page faults
+		 * with threads.
+		 */
+		if (write_access)
+			flush_tlb_page(vma, address);
+	}
+unlock:
+	pte_unmap_unlock(pte, ptl);
+	return VM_FAULT_MINOR;
+}
+
+/*
+ * By the time we get here, we already hold the mm semaphore
+ */
+int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, int write_access)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	__set_current_state(TASK_RUNNING);
+
+	inc_page_state(pgfault);
+
+	if (unlikely(is_vm_hugetlb_page(vma)))
+		return hugetlb_fault(mm, vma, address, write_access);
+
+	pgd = pgd_offset(mm, address);
+	pud = pud_alloc(mm, pgd, address);
+	if (!pud)
+		return VM_FAULT_OOM;
+	pmd = pmd_alloc(mm, pud, address);
+	if (!pmd)
+		return VM_FAULT_OOM;
+	pte = pte_alloc_map(mm, pmd, address);
+	if (!pte)
+		return VM_FAULT_OOM;
+
+	return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
+}
+
+EXPORT_SYMBOL_GPL(__handle_mm_fault);
+
+#ifndef __PAGETABLE_PUD_FOLDED
+/*
+ * Allocate page upper directory.
+ * We've already handled the fast-path in-line.
+ */
+int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
+{
+	pud_t *new = pud_alloc_one(mm, address);
+	if (!new)
+		return -ENOMEM;
+
+	spin_lock(&mm->page_table_lock);
+	if (pgd_present(*pgd))		/* Another has populated it */
+		pud_free(new);
+	else
+		pgd_populate(mm, pgd, new);
+	spin_unlock(&mm->page_table_lock);
+	return 0;
+}
+#else
+/* Workaround for gcc 2.96 */
+int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
+{
+	return 0;
+}
+#endif /* __PAGETABLE_PUD_FOLDED */
+
+#ifndef __PAGETABLE_PMD_FOLDED
+/*
+ * Allocate page middle directory.
+ * We've already handled the fast-path in-line.
+ */
+int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
+{
+	pmd_t *new = pmd_alloc_one(mm, address);
+	if (!new)
+		return -ENOMEM;
+
+	spin_lock(&mm->page_table_lock);
+#ifndef __ARCH_HAS_4LEVEL_HACK
+	if (pud_present(*pud))		/* Another has populated it */
+		pmd_free(new);
+	else
+		pud_populate(mm, pud, new);
+#else
+	if (pgd_present(*pud))		/* Another has populated it */
+		pmd_free(new);
+	else
+		pgd_populate(mm, pud, new);
+#endif /* __ARCH_HAS_4LEVEL_HACK */
+	spin_unlock(&mm->page_table_lock);
+	return 0;
+}
+#else
+/* Workaround for gcc 2.96 */
+int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
+{
+	return 0;
+}
+#endif /* __PAGETABLE_PMD_FOLDED */
+
+int make_pages_present(unsigned long addr, unsigned long end)
+{
+	int ret, len, write;
+	struct vm_area_struct * vma;
+
+	vma = find_vma(current->mm, addr);
+	if (!vma)
+		return -1;
+	write = (vma->vm_flags & VM_WRITE) != 0;
+	if (addr >= end)
+		BUG();
+	if (end > vma->vm_end)
+		BUG();
+	len = (end + PAGE_SIZE - 1)/PAGE_SIZE - addr/PAGE_SIZE;
+	ret = get_user_pages(current, current->mm, addr,
+			len, write, 0, NULL, NULL);
+	if (ret < 0)
+		return ret;
+	return ret == len ? 0 : -1;
+}
+
+/* 
+ * Map a vmalloc()-space virtual address to the physical page.
+ */
+struct page * vmalloc_to_page(void * vmalloc_addr)
+{
+	unsigned long addr = (unsigned long) vmalloc_addr;
+	struct page *page = NULL;
+	pgd_t *pgd = pgd_offset_k(addr);
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep, pte;
+  
+	if (!pgd_none(*pgd)) {
+		pud = pud_offset(pgd, addr);
+		if (!pud_none(*pud)) {
+			pmd = pmd_offset(pud, addr);
+			if (!pmd_none(*pmd)) {
+				ptep = pte_offset_map(pmd, addr);
+				pte = *ptep;
+				if (pte_present(pte))
+					page = pte_page(pte);
+				pte_unmap(ptep);
+			}
+		}
+	}
+	return page;
+}
+
+EXPORT_SYMBOL(vmalloc_to_page);
+
+/*
+ * Map a vmalloc()-space virtual address to the physical page frame number.
+ */
+unsigned long vmalloc_to_pfn(void * vmalloc_addr)
+{
+	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
+}
+
+EXPORT_SYMBOL(vmalloc_to_pfn);
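+
+/*
+ * Usage sketch (illustrative only, not part of this patch): looking up
+ * the struct page and pfn behind one page of a vmalloc()ed buffer,
+ * where `buf' is a hypothetical vmalloc address:
+ *
+ *	struct page *page = vmalloc_to_page(buf);
+ *	unsigned long pfn = vmalloc_to_pfn(buf);
+ */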
+
+#if !defined(__HAVE_ARCH_GATE_AREA)
+
+#if defined(AT_SYSINFO_EHDR)
+static struct vm_area_struct gate_vma;
+
+static int __init gate_vma_init(void)
+{
+	gate_vma.vm_mm = NULL;
+	gate_vma.vm_start = FIXADDR_USER_START;
+	gate_vma.vm_end = FIXADDR_USER_END;
+	gate_vma.vm_page_prot = PAGE_READONLY;
+	gate_vma.vm_flags = 0;
+	return 0;
+}
+__initcall(gate_vma_init);
+#endif
+
+struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+{
+#ifdef AT_SYSINFO_EHDR
+	return &gate_vma;
+#else
+	return NULL;
+#endif
+}
+
+int in_gate_area_no_task(unsigned long addr)
+{
+#ifdef AT_SYSINFO_EHDR
+	if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
+		return 1;
+#endif
+	return 0;
+}
+
+#endif	/* __HAVE_ARCH_GATE_AREA */
diff -urN oldtree/mm/page_alloc.c newtree/mm/page_alloc.c
--- oldtree/mm/page_alloc.c	2006-03-08 18:48:03.008067000 +0000
+++ newtree/mm/page_alloc.c	2006-03-08 15:22:33.405515000 +0000
@@ -25,6 +25,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/suspend.h>
+#include <linux/freezer.h>
 #include <linux/pagevec.h>
 #include <linux/blkdev.h>
 #include <linux/slab.h>
@@ -469,6 +470,7 @@
 {
 	if (order == 0) {
 		__ClearPageReserved(page);
+		ClearPageNosave(page);
 		set_page_count(page, 0);
 		set_page_refcounted(page);
 		__free_page(page);
@@ -482,6 +484,7 @@
 			if (loop + 1 < BITS_PER_LONG)
 				prefetchw(p + 1);
 			__ClearPageReserved(p);
+			ClearPageNosave(p);
 			set_page_count(p, 0);
 		}
 
@@ -1040,8 +1043,8 @@
 
 	/* This allocation should allow future memory freeing. */
 
-	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
-			&& !in_interrupt()) {
+	if ((((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) &&
+				!in_interrupt()) || (test_freezer_state(FREEZER_ON))) {
 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
 nofail_alloc:
 			/* go through the zonelist yet again, ignoring mins */
diff -urN oldtree/mm/page_alloc.c.orig newtree/mm/page_alloc.c.orig
--- oldtree/mm/page_alloc.c.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/mm/page_alloc.c.orig	2006-03-08 15:21:19.280882500 +0000
@@ -0,0 +1,2862 @@
+/*
+ *  linux/mm/page_alloc.c
+ *
+ *  Manages the free list; the system allocates free pages here.
+ *  Note that kmalloc() lives in slab.c
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *  Swap reorganised 29.12.95, Stephen Tweedie
+ *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
+ *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
+ *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
+ *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
+ *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
+ */
+
+#include <linux/config.h>
+#include <linux/stddef.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/interrupt.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/suspend.h>
+#include <linux/pagevec.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/notifier.h>
+#include <linux/topology.h>
+#include <linux/sysctl.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/memory_hotplug.h>
+#include <linux/nodemask.h>
+#include <linux/vmalloc.h>
+#include <linux/mempolicy.h>
+
+#include <asm/tlbflush.h>
+#include "internal.h"
+
+/*
+ * MCD - HACK: Find somewhere to initialize this EARLY, or make this
+ * initializer cleaner
+ */
+nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
+EXPORT_SYMBOL(node_online_map);
+nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
+EXPORT_SYMBOL(node_possible_map);
+unsigned long totalram_pages __read_mostly;
+unsigned long totalhigh_pages __read_mostly;
+long nr_swap_pages;
+int percpu_pagelist_fraction;
+
+static void __free_pages_ok(struct page *page, unsigned int order);
+
+/*
+ * results with 256, 32 in the lowmem_reserve sysctl:
+ *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
+ *	1G machine -> (16M dma, 784M normal, 224M high)
+ *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
+ *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
+ *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
+ *
+ * TBD: should special case ZONE_DMA32 machines here - in those we normally
+ * don't need any ZONE_NORMAL reservation
+ */
+int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
+
+EXPORT_SYMBOL(totalram_pages);
+
+/*
+ * Used by page_zone() to look up the address of the struct zone whose
+ * id is encoded in the upper bits of page->flags
+ */
+struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
+EXPORT_SYMBOL(zone_table);
+
+static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
+int min_free_kbytes = 1024;
+
+unsigned long __initdata nr_kernel_pages;
+unsigned long __initdata nr_all_pages;
+
+#ifdef CONFIG_DEBUG_VM
+static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
+{
+	int ret = 0;
+	unsigned seq;
+	unsigned long pfn = page_to_pfn(page);
+
+	do {
+		seq = zone_span_seqbegin(zone);
+		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
+			ret = 1;
+		else if (pfn < zone->zone_start_pfn)
+			ret = 1;
+	} while (zone_span_seqretry(zone, seq));
+
+	return ret;
+}
+
+static int page_is_consistent(struct zone *zone, struct page *page)
+{
+#ifdef CONFIG_HOLES_IN_ZONE
+	if (!pfn_valid(page_to_pfn(page)))
+		return 0;
+#endif
+	if (zone != page_zone(page))
+		return 0;
+
+	return 1;
+}
+/*
+ * Temporary debugging check for pages not lying within a given zone.
+ */
+static int bad_range(struct zone *zone, struct page *page)
+{
+	if (page_outside_zone_boundaries(zone, page))
+		return 1;
+	if (!page_is_consistent(zone, page))
+		return 1;
+
+	return 0;
+}
+
+#else
+static inline int bad_range(struct zone *zone, struct page *page)
+{
+	return 0;
+}
+#endif
+
+static void bad_page(struct page *page)
+{
+	printk(KERN_EMERG "Bad page state in process '%s'\n"
+		KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
+		KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
+		KERN_EMERG "Backtrace:\n",
+		current->comm, page, (int)(2*sizeof(unsigned long)),
+		(unsigned long)page->flags, page->mapping,
+		page_mapcount(page), page_count(page));
+	dump_stack();
+	{
+		int i;
+		unsigned char *ptr = (unsigned char *)page;
+		ptr -= 64;
+
+		printk(KERN_EMERG "Hexdump:");
+		for (i = 0; i < 192; i++) {
+			if ((i % 16) == 0) {
+				printk("\n");
+				printk(KERN_EMERG "%03x:", i);
+			}
+			printk(" %02x", ptr[i]);
+		}
+		printk("\n");
+	}
+	page->flags &= ~(1 << PG_lru	|
+			1 << PG_private |
+			1 << PG_locked	|
+			1 << PG_active	|
+			1 << PG_dirty	|
+			1 << PG_reclaim |
+			1 << PG_slab    |
+			1 << PG_swapcache |
+			1 << PG_writeback );
+	set_page_count(page, 0);
+	reset_page_mapcount(page);
+	page->mapping = NULL;
+	add_taint(TAINT_BAD_PAGE);
+}
+
+/*
+ * Higher-order pages are called "compound pages".  They are structured thusly:
+ *
+ * The first PAGE_SIZE page is called the "head page".
+ *
+ * The remaining PAGE_SIZE pages are called "tail pages".
+ *
+ * All pages have PG_compound set.  All pages have their ->private pointing at
+ * the head page (even the head page has this).
+ *
+ * The first tail page's ->lru.next holds the address of the compound page's
+ * put_page() function.  Its ->lru.prev holds the order of allocation.
+ * This usage means that zero-order pages may not be compound.
+ */
+
+static void free_compound_page(struct page *page)
+{
+	__free_pages_ok(page, (unsigned long)page[1].lru.prev);
+}
+
+static void prep_compound_page(struct page *page, unsigned long order)
+{
+	int i;
+	int nr_pages = 1 << order;
+
+	page[1].lru.next = (void *)free_compound_page;	/* set dtor */
+	page[1].lru.prev = (void *)order;
+	for (i = 0; i < nr_pages; i++) {
+		struct page *p = page + i;
+
+		__SetPageCompound(p);
+		set_page_private(p, (unsigned long)page);
+	}
+}
+
+static void destroy_compound_page(struct page *page, unsigned long order)
+{
+	int i;
+	int nr_pages = 1 << order;
+
+	if (unlikely((unsigned long)page[1].lru.prev != order))
+		bad_page(page);
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *p = page + i;
+
+		if (unlikely(!PageCompound(p) |
+				(page_private(p) != (unsigned long)page)))
+			bad_page(page);
+		__ClearPageCompound(p);
+	}
+}
+
+static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
+{
+	int i;
+
+	BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
+	/*
+	 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
+	 * and __GFP_HIGHMEM from hard or soft interrupt context.
+	 */
+	BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
+	for (i = 0; i < (1 << order); i++)
+		clear_highpage(page + i);
+}
+
+/*
+ * function for dealing with page's order in buddy system.
+ * zone->lock is already acquired when we use these.
+ * So, we don't need atomic page->flags operations here.
+ */
+static inline unsigned long page_order(struct page *page)
+{
+	return page_private(page);
+}
+
+static inline void set_page_order(struct page *page, int order)
+{
+	set_page_private(page, order);
+	__SetPagePrivate(page);
+}
+
+static inline void rmv_page_order(struct page *page)
+{
+	__ClearPagePrivate(page);
+	set_page_private(page, 0);
+}
+
+/*
+ * Locate the struct page for both the matching buddy in our
+ * pair (buddy1) and the combined order O+1 page they form (page).
+ *
+ * 1) Any buddy B1 will have an order O twin B2 which satisfies
+ * the following equation:
+ *     B2 = B1 ^ (1 << O)
+ * For example, if the starting buddy (buddy2) is #8 its order
+ * 1 buddy is #10:
+ *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
+ *
+ * 2) Any buddy B will have an order O+1 parent P which
+ * satisfies the following equation:
+ *     P = B & ~(1 << O)
+ *
+ * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
+ */
+static inline struct page *
+__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
+{
+	unsigned long buddy_idx = page_idx ^ (1 << order);
+
+	return page + (buddy_idx - page_idx);
+}
+
+static inline unsigned long
+__find_combined_index(unsigned long page_idx, unsigned int order)
+{
+	return (page_idx & ~(1 << order));
+}
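+
+/*
+ * Worked example (illustrative): for page_idx 8 at order 1,
+ * __page_find_buddy() computes buddy index 8 ^ 2 = 10, and if the two
+ * merge, __find_combined_index() gives 8 & ~2 = 8 as the order-2 head.
+ */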
+
+/*
+ * This function checks whether a page is free and is the buddy.
+ * We can coalesce a page and its buddy if
+ * (a) the buddy is not in a hole &&
+ * (b) the buddy is free &&
+ * (c) the buddy is on the buddy system &&
+ * (d) a page and its buddy have the same order.
+ * For recording a page's order, we use page_private(page) and PG_private.
+ */
+static inline int page_is_buddy(struct page *page, int order)
+{
+#ifdef CONFIG_HOLES_IN_ZONE
+	if (!pfn_valid(page_to_pfn(page)))
+		return 0;
+#endif
+
+	if (PagePrivate(page)		&&
+	    (page_order(page) == order) &&
+	    page_count(page) == 0)
+		return 1;
+	return 0;
+}
+
+/*
+ * Freeing function for a buddy system allocator.
+ *
+ * The concept of a buddy system is to maintain a direct-mapped table
+ * (containing bit values) for memory blocks of various "orders".
+ * The bottom level table contains the map for the smallest allocatable
+ * units of memory (here, pages), and each level above it describes
+ * pairs of units from the levels below, hence, "buddies".
+ * At a high level, all that happens here is marking the table entry
+ * at the bottom level available, and propagating the changes upward
+ * as necessary, plus some accounting needed to play nicely with other
+ * parts of the VM system.
+ * At each level, we keep a list of pages, which are heads of contiguous
+ * free pages of length (1 << order), marked with PG_private. A page's
+ * order is recorded in the page_private(page) field.
+ * So when we are allocating or freeing one, we can derive the state of the
+ * other.  That is, if we allocate a small block, and both were
+ * free, the remainder of the region must be split into blocks.
+ * If a block is freed, and its buddy is also free, then this
+ * triggers coalescing into a block of larger size.
+ *
+ * -- wli
+ */
+
+static inline void __free_one_page(struct page *page,
+		struct zone *zone, unsigned int order)
+{
+	unsigned long page_idx;
+	int order_size = 1 << order;
+
+	if (unlikely(PageCompound(page)))
+		destroy_compound_page(page, order);
+
+	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
+
+	BUG_ON(page_idx & (order_size - 1));
+	BUG_ON(bad_range(zone, page));
+
+	zone->free_pages += order_size;
+	while (order < MAX_ORDER-1) {
+		unsigned long combined_idx;
+		struct free_area *area;
+		struct page *buddy;
+
+		buddy = __page_find_buddy(page, page_idx, order);
+		if (!page_is_buddy(buddy, order))
+			break;		/* Move the buddy up one level. */
+
+		list_del(&buddy->lru);
+		area = zone->free_area + order;
+		area->nr_free--;
+		rmv_page_order(buddy);
+		combined_idx = __find_combined_index(page_idx, order);
+		page = page + (combined_idx - page_idx);
+		page_idx = combined_idx;
+		order++;
+	}
+	set_page_order(page, order);
+	list_add(&page->lru, &zone->free_area[order].free_list);
+	zone->free_area[order].nr_free++;
+}
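+
+/*
+ * Worked example of the coalescing loop above (illustrative page
+ * numbers only): freeing order-0 page #5 while #4 is already free
+ * merges them into the order-1 block at #4 (5 ^ 1 = 4).  If the
+ * order-1 block at #6 is free as well, the next pass merges #4 and
+ * #6 (4 ^ 2 = 6) into the order-2 block at #4, and so on until a
+ * buddy is found busy or order MAX_ORDER-1 is reached.
+ */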
+
+static inline int free_pages_check(struct page *page)
+{
+	if (unlikely(page_mapcount(page) |
+		(page->mapping != NULL)  |
+		(page_count(page) != 0)  |
+		(page->flags & (
+			1 << PG_lru	|
+			1 << PG_private |
+			1 << PG_locked	|
+			1 << PG_active	|
+			1 << PG_reclaim	|
+			1 << PG_slab	|
+			1 << PG_swapcache |
+			1 << PG_writeback |
+			1 << PG_reserved ))))
+		bad_page(page);
+	if (PageDirty(page))
+		__ClearPageDirty(page);
+	/*
+	 * For now, we report if PG_reserved was found set, but do not
+	 * clear it, and do not free the page.  But we shall soon need
+	 * to do more, for when the ZERO_PAGE count wraps negative.
+	 */
+	return PageReserved(page);
+}
+
+/*
+ * Frees a list of pages. 
+ * Assumes all pages on list are in same zone, and of same order.
+ * count is the number of pages to free.
+ *
+ * If the zone was previously in an "all pages pinned" state then look to
+ * see if this freeing clears that state.
+ *
+ * And clear the zone's pages_scanned counter, to hold off the "all pages are
+ * pinned" detection logic.
+ */
+static void free_pages_bulk(struct zone *zone, int count,
+					struct list_head *list, int order)
+{
+	spin_lock(&zone->lock);
+	zone->all_unreclaimable = 0;
+	zone->pages_scanned = 0;
+	while (count--) {
+		struct page *page;
+
+		BUG_ON(list_empty(list));
+		page = list_entry(list->prev, struct page, lru);
+		/* must delete it, as __free_one_page manipulates the list */
+		list_del(&page->lru);
+		__free_one_page(page, zone, order);
+	}
+	spin_unlock(&zone->lock);
+}
+
+static void free_one_page(struct zone *zone, struct page *page, int order)
+{
+	LIST_HEAD(list);
+	list_add(&page->lru, &list);
+	free_pages_bulk(zone, 1, &list, order);
+}
+
+static void __free_pages_ok(struct page *page, unsigned int order)
+{
+	unsigned long flags;
+	int i;
+	int reserved = 0;
+
+	arch_free_page(page, order);
+	if (!PageHighMem(page))
+		mutex_debug_check_no_locks_freed(page_address(page),
+						 PAGE_SIZE<<order);
+
+	for (i = 0 ; i < (1 << order) ; ++i)
+		reserved += free_pages_check(page + i);
+	if (reserved)
+		return;
+
+	kernel_map_pages(page, 1 << order, 0);
+	local_irq_save(flags);
+	__mod_page_state(pgfree, 1 << order);
+	free_one_page(page_zone(page), page, order);
+	local_irq_restore(flags);
+}
+
+/*
+ * permit the bootmem allocator to evade page validation on high-order frees
+ */
+void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
+{
+	if (order == 0) {
+		__ClearPageReserved(page);
+		set_page_count(page, 0);
+		set_page_refcounted(page);
+		__free_page(page);
+	} else {
+		int loop;
+
+		prefetchw(page);
+		for (loop = 0; loop < BITS_PER_LONG; loop++) {
+			struct page *p = &page[loop];
+
+			if (loop + 1 < BITS_PER_LONG)
+				prefetchw(p + 1);
+			__ClearPageReserved(p);
+			set_page_count(p, 0);
+		}
+
+		set_page_refcounted(page);
+		__free_pages(page, order);
+	}
+}
+
+
+/*
+ * The order of subdivision here is critical for the IO subsystem.
+ * Please do not alter this order without good reasons and regression
+ * testing. Specifically, as large blocks of memory are subdivided,
+ * the order in which smaller blocks are delivered depends on the order
+ * they're subdivided in this function. This is the primary factor
+ * influencing the order in which pages are delivered to the IO
+ * subsystem according to empirical testing, and this is also justified
+ * by considering the behavior of a buddy system containing a single
+ * large block of memory acted on by a series of small allocations.
+ * This behavior is a critical factor in sglist merging's success.
+ *
+ * -- wli
+ */
+static inline void expand(struct zone *zone, struct page *page,
+	int low, int high, struct free_area *area)
+{
+	unsigned long size = 1 << high;
+
+	while (high > low) {
+		area--;
+		high--;
+		size >>= 1;
+		BUG_ON(bad_range(zone, &page[size]));
+		list_add(&page[size].lru, &area->free_list);
+		area->nr_free++;
+		set_page_order(&page[size], high);
+	}
+}
+
+/*
+ * This page is about to be returned from the page allocator
+ */
+static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+{
+	if (unlikely(page_mapcount(page) |
+		(page->mapping != NULL)  |
+		(page_count(page) != 0)  |
+		(page->flags & (
+			1 << PG_lru	|
+			1 << PG_private	|
+			1 << PG_locked	|
+			1 << PG_active	|
+			1 << PG_dirty	|
+			1 << PG_reclaim	|
+			1 << PG_slab    |
+			1 << PG_swapcache |
+			1 << PG_writeback |
+			1 << PG_reserved ))))
+		bad_page(page);
+
+	/*
+	 * For now, we report if PG_reserved was found set, but do not
+	 * clear it, and do not allocate the page: as a safety net.
+	 */
+	if (PageReserved(page))
+		return 1;
+
+	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
+			1 << PG_referenced | 1 << PG_arch_1 |
+			1 << PG_checked | 1 << PG_mappedtodisk);
+	set_page_private(page, 0);
+	set_page_refcounted(page);
+	kernel_map_pages(page, 1 << order, 1);
+
+	if (gfp_flags & __GFP_ZERO)
+		prep_zero_page(page, order, gfp_flags);
+
+	if (order && (gfp_flags & __GFP_COMP))
+		prep_compound_page(page, order);
+
+	return 0;
+}
+
+/* 
+ * Do the hard work of removing an element from the buddy allocator.
+ * Call me with the zone->lock already held.
+ */
+static struct page *__rmqueue(struct zone *zone, unsigned int order)
+{
+	struct free_area * area;
+	unsigned int current_order;
+	struct page *page;
+
+	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
+		area = zone->free_area + current_order;
+		if (list_empty(&area->free_list))
+			continue;
+
+		page = list_entry(area->free_list.next, struct page, lru);
+		list_del(&page->lru);
+		rmv_page_order(page);
+		area->nr_free--;
+		zone->free_pages -= 1UL << order;
+		expand(zone, page, order, current_order, area);
+		return page;
+	}
+
+	return NULL;
+}
+
+/* 
+ * Obtain a specified number of elements from the buddy allocator, all under
+ * a single hold of the lock, for efficiency.  Add them to the supplied list.
+ * Returns the number of new pages which were placed at *list.
+ */
+static int rmqueue_bulk(struct zone *zone, unsigned int order, 
+			unsigned long count, struct list_head *list)
+{
+	int i;
+
+	spin_lock(&zone->lock);
+	for (i = 0; i < count; ++i) {
+		struct page *page = __rmqueue(zone, order);
+		if (unlikely(page == NULL))
+			break;
+		list_add_tail(&page->lru, list);
+	}
+	spin_unlock(&zone->lock);
+	return i;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Called from the slab reaper to drain pagesets on a particular node that
+ * belong to the currently executing processor.
+ */
+void drain_node_pages(int nodeid)
+{
+	int i, z;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	for (z = 0; z < MAX_NR_ZONES; z++) {
+		struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
+		struct per_cpu_pageset *pset;
+
+		pset = zone_pcp(zone, smp_processor_id());
+		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
+			struct per_cpu_pages *pcp;
+
+			pcp = &pset->pcp[i];
+			free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+			pcp->count = 0;
+		}
+	}
+	local_irq_restore(flags);
+}
+#endif
+
+#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
+static void __drain_pages(unsigned int cpu)
+{
+	unsigned long flags;
+	struct zone *zone;
+	int i;
+
+	for_each_zone(zone) {
+		struct per_cpu_pageset *pset;
+
+		pset = zone_pcp(zone, cpu);
+		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
+			struct per_cpu_pages *pcp;
+
+			pcp = &pset->pcp[i];
+			local_irq_save(flags);
+			free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+			pcp->count = 0;
+			local_irq_restore(flags);
+		}
+	}
+}
+#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
+
+#ifdef CONFIG_PM
+
+void mark_free_pages(struct zone *zone)
+{
+	unsigned long zone_pfn, flags;
+	int order;
+	struct list_head *curr;
+
+	if (!zone->spanned_pages)
+		return;
+
+	spin_lock_irqsave(&zone->lock, flags);
+	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
+		ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));
+
+	for (order = MAX_ORDER - 1; order >= 0; --order) {
+		list_for_each(curr, &zone->free_area[order].free_list) {
+			unsigned long start_pfn, i;
+
+			start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
+
+			for (i = 0; i < (1 << order); i++)
+				SetPageNosaveFree(pfn_to_page(start_pfn + i));
+		}
+	}
+	spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+/*
+ * Spill all of this CPU's per-cpu pages back into the buddy allocator.
+ */
+void drain_local_pages(void)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__drain_pages(smp_processor_id());
+	local_irq_restore(flags);
+}
+#endif /* CONFIG_PM */
+
+static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
+{
+#ifdef CONFIG_NUMA
+	pg_data_t *pg = z->zone_pgdat;
+	pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
+	struct per_cpu_pageset *p;
+
+	p = zone_pcp(z, cpu);
+	if (pg == orig) {
+		p->numa_hit++;
+	} else {
+		p->numa_miss++;
+		zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
+	}
+	if (pg == NODE_DATA(numa_node_id()))
+		p->local_node++;
+	else
+		p->other_node++;
+#endif
+}
+
+/*
+ * Free a 0-order page
+ */
+static void fastcall free_hot_cold_page(struct page *page, int cold)
+{
+	struct zone *zone = page_zone(page);
+	struct per_cpu_pages *pcp;
+	unsigned long flags;
+
+	arch_free_page(page, 0);
+
+	if (PageAnon(page))
+		page->mapping = NULL;
+	if (free_pages_check(page))
+		return;
+
+	kernel_map_pages(page, 1, 0);
+
+	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
+	local_irq_save(flags);
+	__inc_page_state(pgfree);
+	list_add(&page->lru, &pcp->list);
+	pcp->count++;
+	if (pcp->count >= pcp->high) {
+		free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+		pcp->count -= pcp->batch;
+	}
+	local_irq_restore(flags);
+	put_cpu();
+}
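+
+/*
+ * Example of the high/batch interaction above, assuming the defaults
+ * from setup_pageset() with batch == 15 (so high == 6 * 15 == 90):
+ * the free that pushes pcp->count to 90 triggers one bulk free of 15
+ * pages back to the buddy lists, and the count drops back to 75.
+ */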
+
+void fastcall free_hot_page(struct page *page)
+{
+	free_hot_cold_page(page, 0);
+}
+
+void fastcall free_cold_page(struct page *page)
+{
+	free_hot_cold_page(page, 1);
+}
+
+/*
+ * split_page takes a non-compound higher-order page, and splits it into
+ * n (1<<order) sub-pages: page[0] .. page[n-1]
+ * Each sub-page must be freed individually.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+void split_page(struct page *page, unsigned int order)
+{
+	int i;
+
+	BUG_ON(PageCompound(page));
+	BUG_ON(!page_count(page));
+	for (i = 1; i < (1 << order); i++)
+		set_page_refcounted(page + i);
+}
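+
+/*
+ * Sketch of a typical split_page() caller (hypothetical, shown only
+ * for illustration):
+ *
+ *	struct page *page = alloc_pages(GFP_KERNEL, 2);
+ *	if (page) {
+ *		split_page(page, 2);
+ *		__free_page(page + 3);
+ *	}
+ *
+ * After the split, the order-2 unit behaves as four independent
+ * order-0 pages, each of which must be freed on its own.
+ */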
+
+/*
+ * Really, prep_compound_page() should be called from rmqueue_bulk().  But
+ * we cheat by calling it from here, in the order > 0 path.  Saves a branch
+ * or two.
+ */
+static struct page *buffered_rmqueue(struct zonelist *zonelist,
+			struct zone *zone, int order, gfp_t gfp_flags)
+{
+	unsigned long flags;
+	struct page *page;
+	int cold = !!(gfp_flags & __GFP_COLD);
+	int cpu;
+
+again:
+	cpu  = get_cpu();
+	if (likely(order == 0)) {
+		struct per_cpu_pages *pcp;
+
+		pcp = &zone_pcp(zone, cpu)->pcp[cold];
+		local_irq_save(flags);
+		if (!pcp->count) {
+			pcp->count += rmqueue_bulk(zone, 0,
+						pcp->batch, &pcp->list);
+			if (unlikely(!pcp->count))
+				goto failed;
+		}
+		page = list_entry(pcp->list.next, struct page, lru);
+		list_del(&page->lru);
+		pcp->count--;
+	} else {
+		spin_lock_irqsave(&zone->lock, flags);
+		page = __rmqueue(zone, order);
+		spin_unlock(&zone->lock);
+		if (!page)
+			goto failed;
+	}
+
+	__mod_page_state_zone(zone, pgalloc, 1 << order);
+	zone_statistics(zonelist, zone, cpu);
+	local_irq_restore(flags);
+	put_cpu();
+
+	BUG_ON(bad_range(zone, page));
+	if (prep_new_page(page, order, gfp_flags))
+		goto again;
+	return page;
+
+failed:
+	local_irq_restore(flags);
+	put_cpu();
+	return NULL;
+}
+
+#define ALLOC_NO_WATERMARKS	0x01 /* don't check watermarks at all */
+#define ALLOC_WMARK_MIN		0x02 /* use pages_min watermark */
+#define ALLOC_WMARK_LOW		0x04 /* use pages_low watermark */
+#define ALLOC_WMARK_HIGH	0x08 /* use pages_high watermark */
+#define ALLOC_HARDER		0x10 /* try to alloc harder */
+#define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
+#define ALLOC_CPUSET		0x40 /* check for correct cpuset */
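+
+/*
+ * For illustration, the combinations built up in __alloc_pages()
+ * below: a GFP_ATOMIC request (!wait, __GFP_HIGH) ends up with
+ * ALLOC_WMARK_MIN | ALLOC_HARDER | ALLOC_HIGH | ALLOC_CPUSET and may
+ * dip further into the reserves than an ordinary GFP_KERNEL
+ * allocation, which gets only ALLOC_WMARK_MIN | ALLOC_CPUSET.
+ */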
+
+/*
+ * Return 1 if free pages are above 'mark'. This takes into account the order
+ * of the allocation.
+ */
+int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags)
+{
+	/* free_pages may go negative - that's OK */
+	long min = mark, free_pages = z->free_pages - (1 << order) + 1;
+	int o;
+
+	if (alloc_flags & ALLOC_HIGH)
+		min -= min / 2;
+	if (alloc_flags & ALLOC_HARDER)
+		min -= min / 4;
+
+	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
+		return 0;
+	for (o = 0; o < order; o++) {
+		/* At the next order, this order's pages become unavailable */
+		free_pages -= z->free_area[o].nr_free << o;
+
+		/* Require fewer higher order pages to be free */
+		min >>= 1;
+
+		if (free_pages <= min)
+			return 0;
+	}
+	return 1;
+}
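+
+/*
+ * Worked example (illustrative numbers, zero lowmem_reserve): an
+ * order-2 request against mark = 128 in a zone with free_pages = 200,
+ * of which 100 pages sit in order-0 blocks.  The base check passes
+ * since 200 - 4 + 1 = 197 > 128.  At o = 0 the order-0 pages become
+ * unavailable: 197 - 100 = 97 against min >> 1 = 64, still OK; at
+ * o = 1 the check repeats with min = 32, up to the requested order.
+ */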
+
+/*
+ * get_page_from_freelist goes through the zonelist trying to allocate
+ * a page.
+ */
+static struct page *
+get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist, int alloc_flags)
+{
+	struct zone **z = zonelist->zones;
+	struct page *page = NULL;
+	int classzone_idx = zone_idx(*z);
+
+	/*
+	 * Go through the zonelist once, looking for a zone with enough free.
+	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+	 */
+	do {
+		if ((alloc_flags & ALLOC_CPUSET) &&
+				!cpuset_zone_allowed(*z, gfp_mask))
+			continue;
+
+		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
+			unsigned long mark;
+			if (alloc_flags & ALLOC_WMARK_MIN)
+				mark = (*z)->pages_min;
+			else if (alloc_flags & ALLOC_WMARK_LOW)
+				mark = (*z)->pages_low;
+			else
+				mark = (*z)->pages_high;
+			if (!zone_watermark_ok(*z, order, mark,
+				    classzone_idx, alloc_flags))
+				if (!zone_reclaim_mode ||
+				    !zone_reclaim(*z, gfp_mask, order))
+					continue;
+		}
+
+		page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
+		if (page) {
+			break;
+		}
+	} while (*(++z) != NULL);
+	return page;
+}
+
+#ifdef CONFIG_PAGE_OWNER
+static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
+{
+	return	p > (void *)tinfo &&
+		p < (void *)tinfo + THREAD_SIZE - 3;
+}
+
+static inline void __stack_trace(struct page *page, unsigned long *stack,
+			unsigned long bp)
+{
+	int i = 0;
+	unsigned long addr;
+	struct thread_info *tinfo = (struct thread_info *)
+		((unsigned long)stack & (~(THREAD_SIZE - 1)));
+
+	memset(page->trace, 0, sizeof(long) * 8);
+
+#ifdef CONFIG_FRAME_POINTER
+	while (valid_stack_ptr(tinfo, (void *)bp)) {
+		addr = *(unsigned long *)(bp + sizeof(long));
+		page->trace[i] = addr;
+		if (++i >= 8)
+			break;
+		bp = *(unsigned long *)bp;
+	}
+#else
+	while (valid_stack_ptr(tinfo, stack)) {
+		addr = *stack++;
+		if (__kernel_text_address(addr)) {
+			page->trace[i] = addr;
+			if (++i >= 8)
+				break;
+		}
+	}
+#endif
+}
+
+static inline void set_page_owner(struct page *page,
+			unsigned int order, unsigned int gfp_mask)
+{
+	unsigned long address, bp;
+#ifdef CONFIG_X86_64
+	asm ("movq %%rbp, %0" : "=r" (bp) : );
+#else
+	asm ("movl %%ebp, %0" : "=r" (bp) : );
+#endif
+	page->order = (int) order;
+	page->gfp_mask = gfp_mask;
+	__stack_trace(page, &address, bp);
+}
+#endif /* CONFIG_PAGE_OWNER */
+
+/*
+ * This is the 'heart' of the zoned buddy allocator.
+ */
+struct page * fastcall
+__alloc_pages(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist)
+{
+	const gfp_t wait = gfp_mask & __GFP_WAIT;
+	struct zone **z;
+	struct page *page;
+	struct reclaim_state reclaim_state;
+	struct task_struct *p = current;
+	int do_retry;
+	int alloc_flags;
+	int did_some_progress;
+
+	might_sleep_if(wait);
+
+restart:
+	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
+
+	if (unlikely(*z == NULL)) {
+		/* Should this ever happen?? */
+		return NULL;
+	}
+
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+				zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+	if (page)
+		goto got_pg;
+
+	do {
+		if (cpuset_zone_allowed(*z, gfp_mask))
+			wakeup_kswapd(*z, order);
+	} while (*(++z));
+
+	/*
+	 * OK, we're below the kswapd watermark and have kicked background
+	 * reclaim. Now things get more complex, so set up alloc_flags according
+	 * to how we want to proceed.
+	 *
+	 * The caller may dip into page reserves a bit more if the caller
+	 * cannot run direct reclaim, or if the caller has realtime scheduling
+	 * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
+	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
+	 */
+	alloc_flags = ALLOC_WMARK_MIN;
+	if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
+		alloc_flags |= ALLOC_HARDER;
+	if (gfp_mask & __GFP_HIGH)
+		alloc_flags |= ALLOC_HIGH;
+	alloc_flags |= ALLOC_CPUSET;
+
+	/*
+	 * Go through the zonelist again. Let __GFP_HIGH and allocations
+	 * coming from realtime tasks go deeper into reserves.
+	 *
+	 * This is the last chance, in general, before the goto nopage.
+	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
+	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+	 */
+	page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
+	if (page)
+		goto got_pg;
+
+	/* This allocation should allow future memory freeing. */
+
+	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
+			&& !in_interrupt()) {
+		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
+nofail_alloc:
+			/* go through the zonelist yet again, ignoring mins */
+			page = get_page_from_freelist(gfp_mask, order,
+				zonelist, ALLOC_NO_WATERMARKS);
+			if (page)
+				goto got_pg;
+			if (gfp_mask & __GFP_NOFAIL) {
+				blk_congestion_wait(WRITE, HZ/50);
+				goto nofail_alloc;
+			}
+		}
+		goto nopage;
+	}
+
+	/* Atomic allocations - we can't balance anything */
+	if (!wait)
+		goto nopage;
+
+rebalance:
+	cond_resched();
+
+	/* We now go into synchronous reclaim */
+	cpuset_memory_pressure_bump();
+	p->flags |= PF_MEMALLOC;
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
+
+	did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
+
+	p->reclaim_state = NULL;
+	p->flags &= ~PF_MEMALLOC;
+
+	cond_resched();
+
+	if (likely(did_some_progress)) {
+		page = get_page_from_freelist(gfp_mask, order,
+						zonelist, alloc_flags);
+		if (page)
+			goto got_pg;
+	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+		/*
+		 * Go through the zonelist yet one more time, keep
+		 * very high watermark here, this is only to catch
+		 * a parallel oom killing, we must fail if we're still
+		 * under heavy pressure.
+		 */
+		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+				zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
+		if (page)
+			goto got_pg;
+
+		out_of_memory(zonelist, gfp_mask, order);
+		goto restart;
+	}
+
+	/*
+	 * Don't let big-order allocations loop unless the caller explicitly
+	 * requests that.  Wait for some write requests to complete then retry.
+	 *
+	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
+	 * <= 3, but that may not be true in other implementations.
+	 */
+	do_retry = 0;
+	if (!(gfp_mask & __GFP_NORETRY)) {
+		if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
+			do_retry = 1;
+		if (gfp_mask & __GFP_NOFAIL)
+			do_retry = 1;
+	}
+	if (do_retry) {
+		blk_congestion_wait(WRITE, HZ/50);
+		goto rebalance;
+	}
+
+nopage:
+	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
+		printk(KERN_WARNING "%s: page allocation failure."
+			" order:%d, mode:0x%x\n",
+			p->comm, order, gfp_mask);
+		dump_stack();
+		show_mem();
+	}
+got_pg:
+#ifdef CONFIG_PAGE_OWNER
+	if (page)
+		set_page_owner(page, order, gfp_mask);
+#endif
+	return page;
+}
+
+EXPORT_SYMBOL(__alloc_pages);
+
+/*
+ * Common helper functions.
+ */
+fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
+{
+	struct page * page;
+	page = alloc_pages(gfp_mask, order);
+	if (!page)
+		return 0;
+	return (unsigned long) page_address(page);
+}
+
+EXPORT_SYMBOL(__get_free_pages);
+
+fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
+{
+	struct page * page;
+
+	/*
+	 * get_zeroed_page() returns a 32-bit address, which cannot represent
+	 * a highmem page
+	 */
+	BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
+
+	page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
+	if (page)
+		return (unsigned long) page_address(page);
+	return 0;
+}
+
+EXPORT_SYMBOL(get_zeroed_page);
+
+void __pagevec_free(struct pagevec *pvec)
+{
+	int i = pagevec_count(pvec);
+
+	while (--i >= 0)
+		free_hot_cold_page(pvec->pages[i], pvec->cold);
+}
+
+fastcall void __free_pages(struct page *page, unsigned int order)
+{
+	if (put_page_testzero(page)) {
+		if (order == 0)
+			free_hot_page(page);
+		else
+			__free_pages_ok(page, order);
+#ifdef CONFIG_PAGE_OWNER
+		page->order = -1;
+#endif
+	}
+}
+
+EXPORT_SYMBOL(__free_pages);
+
+fastcall void free_pages(unsigned long addr, unsigned int order)
+{
+	if (addr != 0) {
+		BUG_ON(!virt_addr_valid((void *)addr));
+		__free_pages(virt_to_page((void *)addr), order);
+	}
+}
+
+EXPORT_SYMBOL(free_pages);
+
+/*
+ * Total amount of free (allocatable) RAM:
+ */
+unsigned int nr_free_pages(void)
+{
+	unsigned int sum = 0;
+	struct zone *zone;
+
+	for_each_zone(zone)
+		sum += zone->free_pages;
+
+	return sum;
+}
+
+EXPORT_SYMBOL(nr_free_pages);
+
+#ifdef CONFIG_NUMA
+unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
+{
+	unsigned int i, sum = 0;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		sum += pgdat->node_zones[i].free_pages;
+
+	return sum;
+}
+#endif
+
+static unsigned int nr_free_zone_pages(int offset)
+{
+	/* Just pick one node, since fallback list is circular */
+	pg_data_t *pgdat = NODE_DATA(numa_node_id());
+	unsigned int sum = 0;
+
+	struct zonelist *zonelist = pgdat->node_zonelists + offset;
+	struct zone **zonep = zonelist->zones;
+	struct zone *zone;
+
+	for (zone = *zonep++; zone; zone = *zonep++) {
+		unsigned long size = zone->present_pages;
+		unsigned long high = zone->pages_high;
+		if (size > high)
+			sum += size - high;
+	}
+
+	return sum;
+}
+
+/*
+ * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
+ */
+unsigned int nr_free_buffer_pages(void)
+{
+	return nr_free_zone_pages(gfp_zone(GFP_USER));
+}
+
+/*
+ * Amount of free RAM allocatable within all zones
+ */
+unsigned int nr_free_pagecache_pages(void)
+{
+	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
+}
+
+#ifdef CONFIG_HIGHMEM
+unsigned int nr_free_highpages (void)
+{
+	pg_data_t *pgdat;
+	unsigned int pages = 0;
+
+	for_each_online_pgdat(pgdat)
+		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+
+	return pages;
+}
+#endif
+
+#ifdef CONFIG_NUMA
+static void show_node(struct zone *zone)
+{
+	printk("Node %d ", zone->zone_pgdat->node_id);
+}
+#else
+#define show_node(zone)	do { } while (0)
+#endif
+
+/*
+ * Accumulate the page_state information across all CPUs.
+ * The result is unavoidably approximate - it can change
+ * during and after execution of this function.
+ */
+static DEFINE_PER_CPU(struct page_state, page_states) = {0};
+
+atomic_t nr_pagecache = ATOMIC_INIT(0);
+EXPORT_SYMBOL(nr_pagecache);
+#ifdef CONFIG_SMP
+DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
+#endif
+
+static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
+{
+	int cpu = 0;
+
+	memset(ret, 0, nr * sizeof(unsigned long));
+	cpus_and(*cpumask, *cpumask, cpu_online_map);
+
+	cpu = first_cpu(*cpumask);
+	while (cpu < NR_CPUS) {
+		unsigned long *in, *out, off;
+
+		if (!cpu_isset(cpu, *cpumask))
+			continue;
+
+		in = (unsigned long *)&per_cpu(page_states, cpu);
+
+		cpu = next_cpu(cpu, *cpumask);
+
+		if (likely(cpu < NR_CPUS))
+			prefetch(&per_cpu(page_states, cpu));
+
+		out = (unsigned long *)ret;
+		for (off = 0; off < nr; off++)
+			*out++ += *in++;
+	}
+}
+
+void get_page_state_node(struct page_state *ret, int node)
+{
+	int nr;
+	cpumask_t mask = node_to_cpumask(node);
+
+	nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
+	nr /= sizeof(unsigned long);
+
+	__get_page_state(ret, nr+1, &mask);
+}
+
+void get_page_state(struct page_state *ret)
+{
+	int nr;
+	cpumask_t mask = CPU_MASK_ALL;
+
+	nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
+	nr /= sizeof(unsigned long);
+
+	__get_page_state(ret, nr + 1, &mask);
+}
+
+void get_full_page_state(struct page_state *ret)
+{
+	cpumask_t mask = CPU_MASK_ALL;
+
+	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
+}
+
+unsigned long read_page_state_offset(unsigned long offset)
+{
+	unsigned long ret = 0;
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		unsigned long in;
+
+		in = (unsigned long)&per_cpu(page_states, cpu) + offset;
+		ret += *((unsigned long *)in);
+	}
+	return ret;
+}
+
+void __mod_page_state_offset(unsigned long offset, unsigned long delta)
+{
+	void *ptr;
+
+	ptr = &__get_cpu_var(page_states);
+	*(unsigned long *)(ptr + offset) += delta;
+}
+EXPORT_SYMBOL(__mod_page_state_offset);
+
+void mod_page_state_offset(unsigned long offset, unsigned long delta)
+{
+	unsigned long flags;
+	void *ptr;
+
+	local_irq_save(flags);
+	ptr = &__get_cpu_var(page_states);
+	*(unsigned long *)(ptr + offset) += delta;
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_page_state_offset);
+
+void __get_zone_counts(unsigned long *active, unsigned long *inactive,
+			unsigned long *free, struct pglist_data *pgdat)
+{
+	struct zone *zones = pgdat->node_zones;
+	int i;
+
+	*active = 0;
+	*inactive = 0;
+	*free = 0;
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		*active += zones[i].nr_active;
+		*inactive += zones[i].nr_inactive;
+		*free += zones[i].free_pages;
+	}
+}
+
+void get_zone_counts(unsigned long *active,
+		unsigned long *inactive, unsigned long *free)
+{
+	struct pglist_data *pgdat;
+
+	*active = 0;
+	*inactive = 0;
+	*free = 0;
+	for_each_online_pgdat(pgdat) {
+		unsigned long l, m, n;
+		__get_zone_counts(&l, &m, &n, pgdat);
+		*active += l;
+		*inactive += m;
+		*free += n;
+	}
+}
+
+void si_meminfo(struct sysinfo *val)
+{
+	val->totalram = totalram_pages;
+	val->sharedram = 0;
+	val->freeram = nr_free_pages();
+	val->bufferram = nr_blockdev_pages();
+#ifdef CONFIG_HIGHMEM
+	val->totalhigh = totalhigh_pages;
+	val->freehigh = nr_free_highpages();
+#else
+	val->totalhigh = 0;
+	val->freehigh = 0;
+#endif
+	val->mem_unit = PAGE_SIZE;
+}
+
+EXPORT_SYMBOL(si_meminfo);
+
+#ifdef CONFIG_NUMA
+void si_meminfo_node(struct sysinfo *val, int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+
+	val->totalram = pgdat->node_present_pages;
+	val->freeram = nr_free_pages_pgdat(pgdat);
+	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
+	val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+	val->mem_unit = PAGE_SIZE;
+}
+#endif
+
+#define K(x) ((x) << (PAGE_SHIFT-10))
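+
+/* e.g. with 4K pages (PAGE_SHIFT == 12), K(256) == 256 << 2 == 1024kB */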
+
+/*
+ * Show free area list (used inside shift_scroll-lock stuff).
+ * For each zone we report the per-cpu pageset state and the number of
+ * free blocks at each order, which together describe fragmentation.
+ */
+void show_free_areas(void)
+{
+	struct page_state ps;
+	int cpu, temperature;
+	unsigned long active;
+	unsigned long inactive;
+	unsigned long free;
+	struct zone *zone;
+
+	for_each_zone(zone) {
+		show_node(zone);
+		printk("%s per-cpu:", zone->name);
+
+		if (!populated_zone(zone)) {
+			printk(" empty\n");
+			continue;
+		} else
+			printk("\n");
+
+		for_each_online_cpu(cpu) {
+			struct per_cpu_pageset *pageset;
+
+			pageset = zone_pcp(zone, cpu);
+
+			for (temperature = 0; temperature < 2; temperature++)
+				printk("cpu %d %s: high %d, batch %d used:%d\n",
+					cpu,
+					temperature ? "cold" : "hot",
+					pageset->pcp[temperature].high,
+					pageset->pcp[temperature].batch,
+					pageset->pcp[temperature].count);
+		}
+	}
+
+	get_page_state(&ps);
+	get_zone_counts(&active, &inactive, &free);
+
+	printk("Free pages: %11ukB (%ukB HighMem)\n",
+		K(nr_free_pages()),
+		K(nr_free_highpages()));
+
+	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
+		"unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
+		active,
+		inactive,
+		ps.nr_dirty,
+		ps.nr_writeback,
+		ps.nr_unstable,
+		nr_free_pages(),
+		ps.nr_slab,
+		ps.nr_mapped,
+		ps.nr_page_table_pages);
+
+	for_each_zone(zone) {
+		int i;
+
+		show_node(zone);
+		printk("%s"
+			" free:%lukB"
+			" min:%lukB"
+			" low:%lukB"
+			" high:%lukB"
+			" active:%lukB"
+			" inactive:%lukB"
+			" present:%lukB"
+			" pages_scanned:%lu"
+			" all_unreclaimable? %s"
+			"\n",
+			zone->name,
+			K(zone->free_pages),
+			K(zone->pages_min),
+			K(zone->pages_low),
+			K(zone->pages_high),
+			K(zone->nr_active),
+			K(zone->nr_inactive),
+			K(zone->present_pages),
+			zone->pages_scanned,
+			(zone->all_unreclaimable ? "yes" : "no")
+			);
+		printk("lowmem_reserve[]:");
+		for (i = 0; i < MAX_NR_ZONES; i++)
+			printk(" %lu", zone->lowmem_reserve[i]);
+		printk("\n");
+	}
+
+	for_each_zone(zone) {
+		unsigned long nr, flags, order, total = 0;
+
+		show_node(zone);
+		printk("%s: ", zone->name);
+		if (!populated_zone(zone)) {
+			printk("empty\n");
+			continue;
+		}
+
+		spin_lock_irqsave(&zone->lock, flags);
+		for (order = 0; order < MAX_ORDER; order++) {
+			nr = zone->free_area[order].nr_free;
+			total += nr << order;
+			printk("%lu*%lukB ", nr, K(1UL) << order);
+		}
+		spin_unlock_irqrestore(&zone->lock, flags);
+		printk("= %lukB\n", K(total));
+	}
+
+	show_swap_cache_info();
+}
+
+/*
+ * Builds allocation fallback zone lists.
+ *
+ * Add all populated zones of a node to the zonelist.
+ */
+static int __init build_zonelists_node(pg_data_t *pgdat,
+			struct zonelist *zonelist, int nr_zones, int zone_type)
+{
+	struct zone *zone;
+
+	BUG_ON(zone_type > ZONE_HIGHMEM);
+
+	do {
+		zone = pgdat->node_zones + zone_type;
+		if (populated_zone(zone)) {
+#ifndef CONFIG_HIGHMEM
+			BUG_ON(zone_type > ZONE_NORMAL);
+#endif
+			zonelist->zones[nr_zones++] = zone;
+			check_highest_zone(zone_type);
+		}
+		zone_type--;
+
+	} while (zone_type >= 0);
+	return nr_zones;
+}
+
+static inline int highest_zone(int zone_bits)
+{
+	int res = ZONE_NORMAL;
+	if (zone_bits & (__force int)__GFP_HIGHMEM)
+		res = ZONE_HIGHMEM;
+	if (zone_bits & (__force int)__GFP_DMA32)
+		res = ZONE_DMA32;
+	if (zone_bits & (__force int)__GFP_DMA)
+		res = ZONE_DMA;
+	return res;
+}
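+
+/*
+ * For example (illustrative only): plain GFP_KERNEL bits map to
+ * ZONE_NORMAL and __GFP_HIGHMEM selects ZONE_HIGHMEM, but because the
+ * checks above run from highest to lowest zone, __GFP_DMA wins even
+ * when combined with __GFP_HIGHMEM.
+ */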
+
+#ifdef CONFIG_NUMA
+#define MAX_NODE_LOAD (num_online_nodes())
+static int __initdata node_load[MAX_NUMNODES];
+/**
+ * find_next_best_node - find the next node that should appear in a given node's fallback list
+ * @node: node whose fallback list we're appending
+ * @used_node_mask: nodemask_t of already used nodes
+ *
+ * We use a number of factors to determine which is the next node that should
+ * appear on a given node's fallback list.  The node should not have appeared
+ * already in @node's fallback list, and it should be the next closest node
+ * according to the distance array (which contains arbitrary distance values
+ * from each node to each node in the system), and should also prefer nodes
+ * with no CPUs, since presumably they'll have very little allocation pressure
+ * on them otherwise.
+ * It returns -1 if no node is found.
+ */
+static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
+{
+	int n, val;
+	int min_val = INT_MAX;
+	int best_node = -1;
+
+	/* Use the local node if we haven't already */
+	if (!node_isset(node, *used_node_mask)) {
+		node_set(node, *used_node_mask);
+		return node;
+	}
+
+	for_each_online_node(n) {
+		cpumask_t tmp;
+
+		/* Don't want a node to appear more than once */
+		if (node_isset(n, *used_node_mask))
+			continue;
+
+		/* Use the distance array to find the distance */
+		val = node_distance(node, n);
+
+		/* Penalize nodes under us ("prefer the next node") */
+		val += (n < node);
+
+		/* Give preference to headless and unused nodes */
+		tmp = node_to_cpumask(n);
+		if (!cpus_empty(tmp))
+			val += PENALTY_FOR_NODE_WITH_CPUS;
+
+		/* Slight preference for less loaded node */
+		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
+		val += node_load[n];
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	if (best_node >= 0)
+		node_set(best_node, *used_node_mask);
+
+	return best_node;
+}
+
+static void __init build_zonelists(pg_data_t *pgdat)
+{
+	int i, j, k, node, local_node;
+	int prev_node, load;
+	struct zonelist *zonelist;
+	nodemask_t used_mask;
+
+	/* initialize zonelists */
+	for (i = 0; i < GFP_ZONETYPES; i++) {
+		zonelist = pgdat->node_zonelists + i;
+		zonelist->zones[0] = NULL;
+	}
+
+	/* NUMA-aware ordering of nodes */
+	local_node = pgdat->node_id;
+	load = num_online_nodes();
+	prev_node = local_node;
+	nodes_clear(used_mask);
+	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+		int distance = node_distance(local_node, node);
+
+		/*
+		 * If another node is sufficiently far away then it is better
+		 * to reclaim pages in a zone before going off node.
+		 */
+		if (distance > RECLAIM_DISTANCE)
+			zone_reclaim_mode = 1;
+
+		/*
+		 * We don't want to pressure a particular node.
+		 * So adding penalty to the first node in same
+		 * distance group to make it round-robin.
+		 */
+
+		if (distance != node_distance(local_node, prev_node))
+			node_load[node] += load;
+		prev_node = node;
+		load--;
+		for (i = 0; i < GFP_ZONETYPES; i++) {
+			zonelist = pgdat->node_zonelists + i;
+			for (j = 0; zonelist->zones[j] != NULL; j++);
+
+			k = highest_zone(i);
+
+			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+			zonelist->zones[j] = NULL;
+		}
+	}
+}
+
+#else	/* CONFIG_NUMA */
+
+static void __init build_zonelists(pg_data_t *pgdat)
+{
+	int i, j, k, node, local_node;
+
+	local_node = pgdat->node_id;
+	for (i = 0; i < GFP_ZONETYPES; i++) {
+		struct zonelist *zonelist;
+
+		zonelist = pgdat->node_zonelists + i;
+
+		j = 0;
+		k = highest_zone(i);
+		j = build_zonelists_node(pgdat, zonelist, j, k);
+		/*
+		 * Now we build the zonelist so that it contains the zones
+		 * of all the other nodes.
+		 * We don't want to pressure a particular node, so when
+		 * building the zones for node N, we make sure that the
+		 * zones coming right after the local ones are those from
+		 * node N+1 (modulo N)
+		 */
+		for (node = local_node + 1; node < MAX_NUMNODES; node++) {
+			if (!node_online(node))
+				continue;
+			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+		}
+		for (node = 0; node < local_node; node++) {
+			if (!node_online(node))
+				continue;
+			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+		}
+
+		zonelist->zones[j] = NULL;
+	}
+}
+
+#endif	/* CONFIG_NUMA */
+
+void __init build_all_zonelists(void)
+{
+	int i;
+
+	for_each_online_node(i)
+		build_zonelists(NODE_DATA(i));
+	printk("Built %i zonelists\n", num_online_nodes());
+	cpuset_init_current_mems_allowed();
+}
+
+/*
+ * Helper functions to size the waitqueue hash table.
+ * Essentially these want to choose hash table sizes sufficiently
+ * large so that collisions trying to wait on pages are rare.
+ * But in fact, the number of active page waitqueues on typical
+ * systems is ridiculously low, less than 200. So this is even
+ * conservative, even though it seems large.
+ *
+ * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
+ * waitqueues, i.e. the size of the waitq table given the number of pages.
+ */
+#define PAGES_PER_WAITQUEUE	256
+
+static inline unsigned long wait_table_size(unsigned long pages)
+{
+	unsigned long size = 1;
+
+	pages /= PAGES_PER_WAITQUEUE;
+
+	while (size < pages)
+		size <<= 1;
+
+	/*
+	 * Once we have dozens or even hundreds of threads sleeping
+	 * on IO we've got bigger problems than wait queue collision.
+	 * Limit the size of the wait table to a reasonable size.
+	 */
+	size = min(size, 4096UL);
+
+	return max(size, 4UL);
+}
+
+/*
+ * This is an integer logarithm so that shifts can be used later
+ * to extract the more random high bits from the multiplicative
+ * hash function before the remainder is taken.
+ */
+static inline unsigned long wait_table_bits(unsigned long size)
+{
+	return ffz(~size);
+}
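+
+/*
+ * Worked example (illustrative): a zone of 1M pages gives
+ * wait_table_size(1 << 20) = (1 << 20) / PAGES_PER_WAITQUEUE = 4096
+ * queues, which already sits at the 4096 cap, and
+ * wait_table_bits(4096) = ffz(~4096) = 12, the shift the hashing
+ * code uses.
+ */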
+
+#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
+
+static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long *zholes_size)
+{
+	unsigned long realtotalpages, totalpages = 0;
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		totalpages += zones_size[i];
+	pgdat->node_spanned_pages = totalpages;
+
+	realtotalpages = totalpages;
+	if (zholes_size)
+		for (i = 0; i < MAX_NR_ZONES; i++)
+			realtotalpages -= zholes_size[i];
+	pgdat->node_present_pages = realtotalpages;
+	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
+}
+
+
+/*
+ * Initially all pages are reserved - free ones are freed
+ * up by free_all_bootmem() once the early boot process is
+ * done. Non-atomic initialization, single-pass.
+ */
+void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+		unsigned long start_pfn)
+{
+	struct page *page;
+	unsigned long end_pfn = start_pfn + size;
+	unsigned long pfn;
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+		if (!early_pfn_valid(pfn))
+			continue;
+		page = pfn_to_page(pfn);
+		set_page_links(page, zone, nid, pfn);
+		init_page_count(page);
+		reset_page_mapcount(page);
+		SetPageReserved(page);
+		INIT_LIST_HEAD(&page->lru);
+#ifdef WANT_PAGE_VIRTUAL
+		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
+		if (!is_highmem_idx(zone))
+			set_page_address(page, __va(pfn << PAGE_SHIFT));
+#endif
+#ifdef CONFIG_PAGE_OWNER
+		page->order = -1;
+#endif
+	}
+}
+
+void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
+				unsigned long size)
+{
+	int order;
+	for (order = 0; order < MAX_ORDER ; order++) {
+		INIT_LIST_HEAD(&zone->free_area[order].free_list);
+		zone->free_area[order].nr_free = 0;
+	}
+}
+
+#define ZONETABLE_INDEX(x, zone_nr)	((x << ZONES_SHIFT) | zone_nr)
+void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
+		unsigned long size)
+{
+	unsigned long snum = pfn_to_section_nr(pfn);
+	unsigned long end = pfn_to_section_nr(pfn + size);
+
+	if (FLAGS_HAS_NODE)
+		zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
+	else
+		for (; snum <= end; snum++)
+			zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
+}
+
+#ifndef __HAVE_ARCH_MEMMAP_INIT
+#define memmap_init(size, nid, zone, start_pfn) \
+	memmap_init_zone((size), (nid), (zone), (start_pfn))
+#endif
+
+static int __cpuinit zone_batchsize(struct zone *zone)
+{
+	int batch;
+
+	/*
+	 * The per-cpu-pages pools are set to around 1000th of the
+	 * size of the zone.  But no more than 1/2 of a meg.
+	 *
+	 * OK, so we don't know how big the cache is.  So guess.
+	 */
+	batch = zone->present_pages / 1024;
+	if (batch * PAGE_SIZE > 512 * 1024)
+		batch = (512 * 1024) / PAGE_SIZE;
+	batch /= 4;		/* We effectively *= 4 below */
+	if (batch < 1)
+		batch = 1;
+
+	/*
+	 * Clamp the batch to a 2^n - 1 value. Having a power
+	 * of 2 value was found to be more likely to have
+	 * suboptimal cache aliasing properties in some cases.
+	 *
+	 * For example if 2 tasks are alternately allocating
+	 * batches of pages, one task can end up with a lot
+	 * of pages of one half of the possible page colors
+	 * and the other with pages of the other colors.
+	 */
+	batch = (1 << (fls(batch + batch/2)-1)) - 1;
+
+	return batch;
+}
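+
+/*
+ * Worked example (assuming 4K pages): a 256MB zone has 65536 pages,
+ * so batch starts at 65536 / 1024 = 64; 64 * 4K = 256K is under the
+ * 512K cap, batch /= 4 gives 16, and the 2^n - 1 rounding yields
+ * (1 << (fls(16 + 8) - 1)) - 1 = 15.
+ */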
+
+inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+{
+	struct per_cpu_pages *pcp;
+
+	memset(p, 0, sizeof(*p));
+
+	pcp = &p->pcp[0];		/* hot */
+	pcp->count = 0;
+	pcp->high = 6 * batch;
+	pcp->batch = max(1UL, 1 * batch);
+	INIT_LIST_HEAD(&pcp->list);
+
+	pcp = &p->pcp[1];		/* cold */
+	pcp->count = 0;
+	pcp->high = 2 * batch;
+	pcp->batch = max(1UL, batch/2);
+	INIT_LIST_HEAD(&pcp->list);
+}
+
+/*
+ * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
+ * to the value high for the pageset p.
+ */
+
+static void setup_pagelist_highmark(struct per_cpu_pageset *p,
+				unsigned long high)
+{
+	struct per_cpu_pages *pcp;
+
+	pcp = &p->pcp[0]; /* hot list */
+	pcp->high = high;
+	pcp->batch = max(1UL, high/4);
+	if ((high/4) > (PAGE_SHIFT * 8))
+		pcp->batch = PAGE_SHIFT * 8;
+}
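+
+/*
+ * Example (assuming PAGE_SHIFT == 12): a requested high of 1000 gives
+ * high/4 = 250, which exceeds PAGE_SHIFT * 8 = 96, so the batch is
+ * clamped to 96 rather than 250.
+ */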
+
+
+#ifdef CONFIG_NUMA
+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * Some NUMA counter updates may also be caught by the boot pagesets.
+ *
+ * The boot_pagesets must be kept even after bootup is complete for
+ * unused processors and/or zones. They do play a role for bootstrapping
+ * hotplugged processors.
+ *
+ * zoneinfo_show() and maybe other functions do
+ * not check if the processor is online before following the pageset pointer.
+ * Other parts of the kernel may not check if the zone is available.
+ */
+static struct per_cpu_pageset boot_pageset[NR_CPUS];
+
+/*
+ * Dynamically allocate memory for the
+ * per cpu pageset array in struct zone.
+ */
+static int __cpuinit process_zones(int cpu)
+{
+	struct zone *zone, *dzone;
+
+	for_each_zone(zone) {
+
+		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
+					 GFP_KERNEL, cpu_to_node(cpu));
+		if (!zone_pcp(zone, cpu))
+			goto bad;
+
+		setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
+
+		if (percpu_pagelist_fraction)
+			setup_pagelist_highmark(zone_pcp(zone, cpu),
+			 	(zone->present_pages / percpu_pagelist_fraction));
+	}
+
+	return 0;
+bad:
+	for_each_zone(dzone) {
+		if (dzone == zone)
+			break;
+		kfree(zone_pcp(dzone, cpu));
+		zone_pcp(dzone, cpu) = NULL;
+	}
+	return -ENOMEM;
+}
+
+static inline void free_zone_pagesets(int cpu)
+{
+	struct zone *zone;
+
+	for_each_zone(zone) {
+		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+
+		zone_pcp(zone, cpu) = NULL;
+		kfree(pset);
+	}
+}
+
+static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
+		unsigned long action,
+		void *hcpu)
+{
+	int cpu = (long)hcpu;
+	int ret = NOTIFY_OK;
+
+	switch (action) {
+		case CPU_UP_PREPARE:
+			if (process_zones(cpu))
+				ret = NOTIFY_BAD;
+			break;
+		case CPU_UP_CANCELED:
+		case CPU_DEAD:
+			free_zone_pagesets(cpu);
+			break;
+		default:
+			break;
+	}
+	return ret;
+}
+
+static struct notifier_block pageset_notifier =
+	{ &pageset_cpuup_callback, NULL, 0 };
+
+void __init setup_per_cpu_pageset(void)
+{
+	int err;
+
+	/* Initialize per_cpu_pageset for cpu 0.
+	 * A cpuup callback will do this for every cpu
+	 * as it comes online
+	 */
+	err = process_zones(smp_processor_id());
+	BUG_ON(err);
+	register_cpu_notifier(&pageset_notifier);
+}
+
+#endif
+
+static __meminit
+void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+{
+	int i;
+	struct pglist_data *pgdat = zone->zone_pgdat;
+
+	/*
+	 * The per-page waitqueue mechanism uses hashed waitqueues
+	 * per zone.
+	 */
+	zone->wait_table_size = wait_table_size(zone_size_pages);
+	zone->wait_table_bits =	wait_table_bits(zone->wait_table_size);
+	zone->wait_table = (wait_queue_head_t *)
+		alloc_bootmem_node(pgdat, zone->wait_table_size
+					* sizeof(wait_queue_head_t));
+
+	for (i = 0; i < zone->wait_table_size; ++i)
+		init_waitqueue_head(zone->wait_table + i);
+}
+
+static __meminit void zone_pcp_init(struct zone *zone)
+{
+	int cpu;
+	unsigned long batch = zone_batchsize(zone);
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+#ifdef CONFIG_NUMA
+		/* Early boot. Slab allocator not functional yet */
+		zone_pcp(zone, cpu) = &boot_pageset[cpu];
+		setup_pageset(&boot_pageset[cpu], 0);
+#else
+		setup_pageset(zone_pcp(zone, cpu), batch);
+#endif
+	}
+	printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
+		zone->name, zone->present_pages, batch);
+}
+
+static __meminit void init_currently_empty_zone(struct zone *zone,
+		unsigned long zone_start_pfn, unsigned long size)
+{
+	struct pglist_data *pgdat = zone->zone_pgdat;
+
+	zone_wait_table_init(zone, size);
+	pgdat->nr_zones = zone_idx(zone) + 1;
+
+	zone->zone_start_pfn = zone_start_pfn;
+
+	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
+
+	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+}
+
+/*
+ * Set up the zone data structures:
+ *   - mark all pages reserved
+ *   - mark all memory queues empty
+ *   - clear the memory bitmaps
+ */
+static void __init free_area_init_core(struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long *zholes_size)
+{
+	unsigned long j;
+	int nid = pgdat->node_id;
+	unsigned long zone_start_pfn = pgdat->node_start_pfn;
+
+	pgdat_resize_init(pgdat);
+	pgdat->nr_zones = 0;
+	init_waitqueue_head(&pgdat->kswapd_wait);
+	pgdat->kswapd_max_order = 0;
+	
+	for (j = 0; j < MAX_NR_ZONES; j++) {
+		struct zone *zone = pgdat->node_zones + j;
+		unsigned long size, realsize;
+
+		realsize = size = zones_size[j];
+		if (zholes_size)
+			realsize -= zholes_size[j];
+
+		if (j < ZONE_HIGHMEM)
+			nr_kernel_pages += realsize;
+		nr_all_pages += realsize;
+
+		zone->spanned_pages = size;
+		zone->present_pages = realsize;
+		zone->name = zone_names[j];
+		spin_lock_init(&zone->lock);
+		spin_lock_init(&zone->lru_lock);
+		zone_seqlock_init(zone);
+		zone->zone_pgdat = pgdat;
+		zone->free_pages = 0;
+
+		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
+
+		zone_pcp_init(zone);
+		INIT_LIST_HEAD(&zone->active_list);
+		INIT_LIST_HEAD(&zone->inactive_list);
+		zone->nr_scan_active = 0;
+		zone->nr_scan_inactive = 0;
+		zone->nr_active = 0;
+		zone->nr_inactive = 0;
+		atomic_set(&zone->reclaim_in_progress, 0);
+		if (!size)
+			continue;
+
+		zonetable_add(zone, nid, j, zone_start_pfn, size);
+		init_currently_empty_zone(zone, zone_start_pfn, size);
+		zone_start_pfn += size;
+	}
+}
+
+static void __init alloc_node_mem_map(struct pglist_data *pgdat)
+{
+	/* Skip empty nodes */
+	if (!pgdat->node_spanned_pages)
+		return;
+
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+	/* ia64 gets its own node_mem_map, before this, without bootmem */
+	if (!pgdat->node_mem_map) {
+		unsigned long size;
+		struct page *map;
+
+		size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
+		map = alloc_remap(pgdat->node_id, size);
+		if (!map)
+			map = alloc_bootmem_node(pgdat, size);
+		pgdat->node_mem_map = map;
+	}
+#ifdef CONFIG_FLATMEM
+	/*
+	 * With no DISCONTIG, the global mem_map is just set as node 0's
+	 */
+	if (pgdat == NODE_DATA(0))
+		mem_map = NODE_DATA(0)->node_mem_map;
+#endif
+#endif /* CONFIG_FLAT_NODE_MEM_MAP */
+}
+
+void __init free_area_init_node(int nid, struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long node_start_pfn,
+		unsigned long *zholes_size)
+{
+	pgdat->node_id = nid;
+	pgdat->node_start_pfn = node_start_pfn;
+	calculate_zone_totalpages(pgdat, zones_size, zholes_size);
+
+	alloc_node_mem_map(pgdat);
+
+	free_area_init_core(pgdat, zones_size, zholes_size);
+}
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+static bootmem_data_t contig_bootmem_data;
+struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
+
+EXPORT_SYMBOL(contig_page_data);
+#endif
+
+void __init free_area_init(unsigned long *zones_size)
+{
+	free_area_init_node(0, NODE_DATA(0), zones_size,
+			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
+}
+
+#ifdef CONFIG_PROC_FS
+
+#include <linux/seq_file.h>
+
+static void *frag_start(struct seq_file *m, loff_t *pos)
+{
+	pg_data_t *pgdat;
+	loff_t node = *pos;
+	for (pgdat = first_online_pgdat();
+	     pgdat && node;
+	     pgdat = next_online_pgdat(pgdat))
+		--node;
+
+	return pgdat;
+}
+
+static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+	pg_data_t *pgdat = (pg_data_t *)arg;
+
+	(*pos)++;
+	return next_online_pgdat(pgdat);
+}
+
+static void frag_stop(struct seq_file *m, void *arg)
+{
+}
+
+/* 
+ * This walks the free areas for each zone.
+ */
+static int frag_show(struct seq_file *m, void *arg)
+{
+	pg_data_t *pgdat = (pg_data_t *)arg;
+	struct zone *zone;
+	struct zone *node_zones = pgdat->node_zones;
+	unsigned long flags;
+	int order;
+
+	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+		if (!populated_zone(zone))
+			continue;
+
+		spin_lock_irqsave(&zone->lock, flags);
+		seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+		for (order = 0; order < MAX_ORDER; ++order)
+			seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
+		spin_unlock_irqrestore(&zone->lock, flags);
+		seq_putc(m, '\n');
+	}
+	return 0;
+}
+
+struct seq_operations fragmentation_op = {
+	.start	= frag_start,
+	.next	= frag_next,
+	.stop	= frag_stop,
+	.show	= frag_show,
+};
+
+/*
+ * Output information about zones in @pgdat.
+ */
+static int zoneinfo_show(struct seq_file *m, void *arg)
+{
+	pg_data_t *pgdat = arg;
+	struct zone *zone;
+	struct zone *node_zones = pgdat->node_zones;
+	unsigned long flags;
+
+	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
+		int i;
+
+		if (!populated_zone(zone))
+			continue;
+
+		spin_lock_irqsave(&zone->lock, flags);
+		seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+		seq_printf(m,
+			   "\n  pages free     %lu"
+			   "\n        min      %lu"
+			   "\n        low      %lu"
+			   "\n        high     %lu"
+			   "\n        active   %lu"
+			   "\n        inactive %lu"
+			   "\n        scanned  %lu (a: %lu i: %lu)"
+			   "\n        spanned  %lu"
+			   "\n        present  %lu",
+			   zone->free_pages,
+			   zone->pages_min,
+			   zone->pages_low,
+			   zone->pages_high,
+			   zone->nr_active,
+			   zone->nr_inactive,
+			   zone->pages_scanned,
+			   zone->nr_scan_active, zone->nr_scan_inactive,
+			   zone->spanned_pages,
+			   zone->present_pages);
+		seq_printf(m,
+			   "\n        protection: (%lu",
+			   zone->lowmem_reserve[0]);
+		for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
+			seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
+		seq_printf(m,
+			   ")"
+			   "\n  pagesets");
+		for_each_online_cpu(i) {
+			struct per_cpu_pageset *pageset;
+			int j;
+
+			pageset = zone_pcp(zone, i);
+			for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
+				if (pageset->pcp[j].count)
+					break;
+			}
+			if (j == ARRAY_SIZE(pageset->pcp))
+				continue;
+			for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
+				seq_printf(m,
+					   "\n    cpu: %i pcp: %i"
+					   "\n              count: %i"
+					   "\n              high:  %i"
+					   "\n              batch: %i",
+					   i, j,
+					   pageset->pcp[j].count,
+					   pageset->pcp[j].high,
+					   pageset->pcp[j].batch);
+			}
+#ifdef CONFIG_NUMA
+			seq_printf(m,
+				   "\n            numa_hit:       %lu"
+				   "\n            numa_miss:      %lu"
+				   "\n            numa_foreign:   %lu"
+				   "\n            interleave_hit: %lu"
+				   "\n            local_node:     %lu"
+				   "\n            other_node:     %lu",
+				   pageset->numa_hit,
+				   pageset->numa_miss,
+				   pageset->numa_foreign,
+				   pageset->interleave_hit,
+				   pageset->local_node,
+				   pageset->other_node);
+#endif
+		}
+		seq_printf(m,
+			   "\n  all_unreclaimable: %u"
+			   "\n  prev_priority:     %i"
+			   "\n  temp_priority:     %i"
+			   "\n  start_pfn:         %lu",
+			   zone->all_unreclaimable,
+			   zone->prev_priority,
+			   zone->temp_priority,
+			   zone->zone_start_pfn);
+		spin_unlock_irqrestore(&zone->lock, flags);
+		seq_putc(m, '\n');
+	}
+	return 0;
+}
+
+struct seq_operations zoneinfo_op = {
+	.start	= frag_start, /* iterate over all zones. The same as in
+			       * fragmentation. */
+	.next	= frag_next,
+	.stop	= frag_stop,
+	.show	= zoneinfo_show,
+};
+
+static char *vmstat_text[] = {
+	"nr_dirty",
+	"nr_writeback",
+	"nr_unstable",
+	"nr_page_table_pages",
+	"nr_mapped",
+	"nr_slab",
+
+	"pgpgin",
+	"pgpgout",
+	"pswpin",
+	"pswpout",
+
+	"pgalloc_high",
+	"pgalloc_normal",
+	"pgalloc_dma32",
+	"pgalloc_dma",
+
+	"pgfree",
+	"pgactivate",
+	"pgdeactivate",
+
+	"pgfault",
+	"pgmajfault",
+
+	"pgrefill_high",
+	"pgrefill_normal",
+	"pgrefill_dma32",
+	"pgrefill_dma",
+
+	"pgsteal_high",
+	"pgsteal_normal",
+	"pgsteal_dma32",
+	"pgsteal_dma",
+
+	"pgscan_kswapd_high",
+	"pgscan_kswapd_normal",
+	"pgscan_kswapd_dma32",
+	"pgscan_kswapd_dma",
+
+	"pgscan_direct_high",
+	"pgscan_direct_normal",
+	"pgscan_direct_dma32",
+	"pgscan_direct_dma",
+
+	"pginodesteal",
+	"slabs_scanned",
+	"kswapd_steal",
+	"kswapd_inodesteal",
+	"pageoutrun",
+	"allocstall",
+
+	"pgrotated",
+	"nr_bounce",
+};
+
+static void *vmstat_start(struct seq_file *m, loff_t *pos)
+{
+	struct page_state *ps;
+
+	if (*pos >= ARRAY_SIZE(vmstat_text))
+		return NULL;
+
+	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
+	m->private = ps;
+	if (!ps)
+		return ERR_PTR(-ENOMEM);
+	get_full_page_state(ps);
+	ps->pgpgin /= 2;		/* sectors -> kbytes */
+	ps->pgpgout /= 2;
+	return (unsigned long *)ps + *pos;
+}
+
+static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+	(*pos)++;
+	if (*pos >= ARRAY_SIZE(vmstat_text))
+		return NULL;
+	return (unsigned long *)m->private + *pos;
+}
+
+static int vmstat_show(struct seq_file *m, void *arg)
+{
+	unsigned long *l = arg;
+	unsigned long off = l - (unsigned long *)m->private;
+
+	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
+	return 0;
+}
+
+static void vmstat_stop(struct seq_file *m, void *arg)
+{
+	kfree(m->private);
+	m->private = NULL;
+}
+
+struct seq_operations vmstat_op = {
+	.start	= vmstat_start,
+	.next	= vmstat_next,
+	.stop	= vmstat_stop,
+	.show	= vmstat_show,
+};
+
+#endif /* CONFIG_PROC_FS */
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int page_alloc_cpu_notify(struct notifier_block *self,
+				 unsigned long action, void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+	long *count;
+	unsigned long *src, *dest;
+
+	if (action == CPU_DEAD) {
+		int i;
+
+		/* Drain local pagecache count. */
+		count = &per_cpu(nr_pagecache_local, cpu);
+		atomic_add(*count, &nr_pagecache);
+		*count = 0;
+		local_irq_disable();
+		__drain_pages(cpu);
+
+		/* Add dead cpu's page_states to our own. */
+		dest = (unsigned long *)&__get_cpu_var(page_states);
+		src = (unsigned long *)&per_cpu(page_states, cpu);
+
+		for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long);
+				i++) {
+			dest[i] += src[i];
+			src[i] = 0;
+		}
+
+		local_irq_enable();
+	}
+	return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+void __init page_alloc_init(void)
+{
+	hotcpu_notifier(page_alloc_cpu_notify, 0);
+}
+
+/*
+ * setup_per_zone_lowmem_reserve - called whenever
+ *	sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
+ *	has a correct pages reserved value, so an adequate number of
+ *	pages are left in the zone after a successful __alloc_pages().
+ */
+static void setup_per_zone_lowmem_reserve(void)
+{
+	struct pglist_data *pgdat;
+	int j, idx;
+
+	for_each_online_pgdat(pgdat) {
+		for (j = 0; j < MAX_NR_ZONES; j++) {
+			struct zone *zone = pgdat->node_zones + j;
+			unsigned long present_pages = zone->present_pages;
+
+			zone->lowmem_reserve[j] = 0;
+
+			for (idx = j-1; idx >= 0; idx--) {
+				struct zone *lower_zone;
+
+				if (sysctl_lowmem_reserve_ratio[idx] < 1)
+					sysctl_lowmem_reserve_ratio[idx] = 1;
+
+				lower_zone = pgdat->node_zones + idx;
+				lower_zone->lowmem_reserve[j] = present_pages /
+					sysctl_lowmem_reserve_ratio[idx];
+				present_pages += lower_zone->present_pages;
+			}
+		}
+	}
+}
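+
+/*
+ * Illustration (hypothetical zone sizes, assuming the default DMA
+ * ratio of 256): with a 100000-page NORMAL zone above a DMA zone, the
+ * loop sets DMA->lowmem_reserve[NORMAL] = 100000 / 256 ~= 390 pages
+ * that NORMAL-capable allocations must leave free in ZONE_DMA.
+ */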
+
+/*
+ * setup_per_zone_pages_min - called when min_free_kbytes changes.  Ensures
+ *	that the pages_{min,low,high} values for each zone are set correctly
+ *	with respect to min_free_kbytes.
+ */
+void setup_per_zone_pages_min(void)
+{
+	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+	unsigned long lowmem_pages = 0;
+	struct zone *zone;
+	unsigned long flags;
+
+	/* Calculate total number of !ZONE_HIGHMEM pages */
+	for_each_zone(zone) {
+		if (!is_highmem(zone))
+			lowmem_pages += zone->present_pages;
+	}
+
+	for_each_zone(zone) {
+		unsigned long tmp;
+		spin_lock_irqsave(&zone->lru_lock, flags);
+		tmp = (pages_min * zone->present_pages) / lowmem_pages;
+		if (is_highmem(zone)) {
+			/*
+			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
+			 * need highmem pages, so cap pages_min to a small
+			 * value here.
+			 *
+			 * The (pages_high-pages_low) and (pages_low-pages_min)
+			 * deltas control async page reclaim, and so should
+			 * not be capped for highmem.
+			 */
+			int min_pages;
+
+			min_pages = zone->present_pages / 1024;
+			if (min_pages < SWAP_CLUSTER_MAX)
+				min_pages = SWAP_CLUSTER_MAX;
+			if (min_pages > 128)
+				min_pages = 128;
+			zone->pages_min = min_pages;
+		} else {
+			/*
+			 * If it's a lowmem zone, reserve a number of pages
+			 * proportionate to the zone's size.
+			 */
+			zone->pages_min = tmp;
+		}
+
+		zone->pages_low   = zone->pages_min + tmp / 4;
+		zone->pages_high  = zone->pages_min + tmp / 2;
+		spin_unlock_irqrestore(&zone->lru_lock, flags);
+	}
+}
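+
+/*
+ * A worked example with hypothetical sizes: min_free_kbytes = 1024 on
+ * 4K pages gives pages_min = 1024 >> 2 = 256.  A lowmem zone holding
+ * half of all lowmem gets tmp = 128, hence pages_min = 128,
+ * pages_low = 160 and pages_high = 192: the allocator wakes kswapd
+ * below pages_low, and kswapd reclaims until pages_high is restored.
+ */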
+
+/*
+ * Initialise min_free_kbytes.
+ *
+ * For small machines we want it small (128k min).  For large machines
+ * we want it large (64MB max).  But it is not linear, because network
+ * bandwidth does not increase linearly with machine size.  We use
+ *
+ * 	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
+ *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
+ *
+ * which yields
+ *
+ * 16MB:	512k
+ * 32MB:	724k
+ * 64MB:	1024k
+ * 128MB:	1448k
+ * 256MB:	2048k
+ * 512MB:	2896k
+ * 1024MB:	4096k
+ * 2048MB:	5792k
+ * 4096MB:	8192k
+ * 8192MB:	11584k
+ * 16384MB:	16384k
+ */
+static int __init init_per_zone_pages_min(void)
+{
+	unsigned long lowmem_kbytes;
+
+	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
+
+	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
+	if (min_free_kbytes < 128)
+		min_free_kbytes = 128;
+	if (min_free_kbytes > 65536)
+		min_free_kbytes = 65536;
+	setup_per_zone_pages_min();
+	setup_per_zone_lowmem_reserve();
+	return 0;
+}
+module_init(init_per_zone_pages_min)
+
+/*
+ * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
+ *	that we can call two helper functions whenever min_free_kbytes
+ *	changes.
+ */
+int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	proc_dointvec(table, write, file, buffer, length, ppos);
+	setup_per_zone_pages_min();
+	return 0;
+}
+
+/*
+ * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
+ *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
+ *	whenever sysctl_lowmem_reserve_ratio changes.
+ *
+ * The reserve ratio has no relation to the pages_min watermarks; it is
+ * only meaningful as a function of the boot-time zone sizes.
+ */
+int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	setup_per_zone_lowmem_reserve();
+	return 0;
+}
+
+/*
+ * percpu_pagelist_fraction - changes pcp->high for each zone on each cpu.
+ * It sets the fraction of a zone's total pages that a hot per-cpu
+ * pagelist may hold before it is flushed back to the buddy allocator.
+ */
+
+int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	unsigned int cpu;
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (!write || (ret == -EINVAL))
+		return ret;
+	for_each_zone(zone) {
+		for_each_online_cpu(cpu) {
+			unsigned long  high;
+			high = zone->present_pages / percpu_pagelist_fraction;
+			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
+		}
+	}
+	return 0;
+}
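+
+/*
+ * For example, writing 8 to /proc/sys/vm/percpu_pagelist_fraction caps
+ * each hot per-cpu pagelist at 1/8th of its zone, so a (hypothetical)
+ * 200704-page zone gets pcp->high = 25088, and
+ * setup_pagelist_highmark() derives pcp->batch from that value.
+ */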
+
+__initdata int hashdist = HASHDIST_DEFAULT;
+
+#ifdef CONFIG_NUMA
+static int __init set_hashdist(char *str)
+{
+	if (!str)
+		return 0;
+	hashdist = simple_strtoul(str, &str, 0);
+	return 1;
+}
+__setup("hashdist=", set_hashdist);
+#endif
+
+/*
+ * allocate a large system hash table from bootmem
+ * - it is assumed that the hash table must contain an exact power-of-2
+ *   quantity of entries
+ * - limit is the number of hash buckets, not the total allocation size
+ */
+void *__init alloc_large_system_hash(const char *tablename,
+				     unsigned long bucketsize,
+				     unsigned long numentries,
+				     int scale,
+				     int flags,
+				     unsigned int *_hash_shift,
+				     unsigned int *_hash_mask,
+				     unsigned long limit)
+{
+	unsigned long long max = limit;
+	unsigned long log2qty, size;
+	void *table = NULL;
+
+	/* allow the kernel cmdline to have a say */
+	if (!numentries) {
+		/* round applicable memory size up to nearest megabyte */
+		numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
+		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
+		numentries >>= 20 - PAGE_SHIFT;
+		numentries <<= 20 - PAGE_SHIFT;
+
+		/* limit to 1 bucket per 2^scale bytes of low memory */
+		if (scale > PAGE_SHIFT)
+			numentries >>= (scale - PAGE_SHIFT);
+		else
+			numentries <<= (PAGE_SHIFT - scale);
+	}
+	/* rounded up to nearest power of 2 in size */
+	numentries = 1UL << (long_log2(numentries) + 1);
+
+	/* limit allocation size to 1/16 total memory by default */
+	if (max == 0) {
+		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
+		do_div(max, bucketsize);
+	}
+
+	if (numentries > max)
+		numentries = max;
+
+	log2qty = long_log2(numentries);
+
+	do {
+		size = bucketsize << log2qty;
+		if (flags & HASH_EARLY)
+			table = alloc_bootmem(size);
+		else if (hashdist)
+			table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
+		else {
+			unsigned long order;
+			for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
+				;
+			table = (void*) __get_free_pages(GFP_ATOMIC, order);
+		}
+	} while (!table && size > PAGE_SIZE && --log2qty);
+
+	if (!table)
+		panic("Failed to allocate %s hash table\n", tablename);
+
+	printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
+	       tablename,
+	       (1U << log2qty),
+	       long_log2(size) - PAGE_SHIFT,
+	       size);
+
+	if (_hash_shift)
+		*_hash_shift = log2qty;
+	if (_hash_mask)
+		*_hash_mask = (1 << log2qty) - 1;
+
+	return table;
+}
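+
+/*
+ * A typical caller looks like the inode-cache setup (a sketch modelled
+ * on fs/inode.c; ihash_entries, i_hash_shift and i_hash_mask live
+ * there):
+ *
+ *	inode_hashtable =
+ *		alloc_large_system_hash("Inode-cache",
+ *					sizeof(struct hlist_head),
+ *					ihash_entries,
+ *					14,	(one bucket per 16KB lowmem)
+ *					0,
+ *					&i_hash_shift,
+ *					&i_hash_mask,
+ *					0);
+ */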
+
+#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
+/*
+ * pfn <-> page translation. out-of-line version.
+ * (see asm-generic/memory_model.h)
+ */
+#if defined(CONFIG_FLATMEM)
+struct page *pfn_to_page(unsigned long pfn)
+{
+	return mem_map + (pfn - ARCH_PFN_OFFSET);
+}
+unsigned long page_to_pfn(struct page *page)
+{
+	return (page - mem_map) + ARCH_PFN_OFFSET;
+}
+#elif defined(CONFIG_DISCONTIGMEM)
+struct page *pfn_to_page(unsigned long pfn)
+{
+	int nid = arch_pfn_to_nid(pfn);
+	return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid);
+}
+unsigned long page_to_pfn(struct page *page)
+{
+	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+	return (page - pgdat->node_mem_map) + pgdat->node_start_pfn;
+}
+#elif defined(CONFIG_SPARSEMEM)
+struct page *pfn_to_page(unsigned long pfn)
+{
+	return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn;
+}
+
+unsigned long page_to_pfn(struct page *page)
+{
+	long section_id = page_to_section(page);
+	return page - __section_mem_map_addr(__nr_to_section(section_id));
+}
+#endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */
+EXPORT_SYMBOL(pfn_to_page);
+EXPORT_SYMBOL(page_to_pfn);
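+
+/*
+ * Whichever memory model is configured, the two helpers above are exact
+ * inverses: page_to_pfn(pfn_to_page(pfn)) == pfn for any valid pfn.
+ */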
+#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
diff -urN oldtree/mm/swapfile.c newtree/mm/swapfile.c
--- oldtree/mm/swapfile.c	2006-03-08 18:48:03.016067500 +0000
+++ newtree/mm/swapfile.c	2006-03-08 15:22:33.409515250 +0000
@@ -1248,6 +1248,7 @@
 	swap_file = p->swap_file;
 	p->swap_file = NULL;
 	p->max = 0;
+	p->bdev = NULL;
 	swap_map = p->swap_map;
 	p->swap_map = NULL;
 	p->flags = 0;
diff -urN oldtree/mm/swapfile.c.orig newtree/mm/swapfile.c.orig
--- oldtree/mm/swapfile.c.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/mm/swapfile.c.orig	2006-03-08 15:21:19.292883250 +0000
@@ -0,0 +1,1771 @@
+/*
+ *  linux/mm/swapfile.c
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *  Swap reorganised 29.12.95, Stephen Tweedie
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/mman.h>
+#include <linux/slab.h>
+#include <linux/kernel_stat.h>
+#include <linux/swap.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/namei.h>
+#include <linux/shm.h>
+#include <linux/blkdev.h>
+#include <linux/writeback.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/rmap.h>
+#include <linux/security.h>
+#include <linux/backing-dev.h>
+#include <linux/mutex.h>
+#include <linux/capability.h>
+#include <linux/syscalls.h>
+
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <linux/swapops.h>
+
+DEFINE_SPINLOCK(swap_lock);
+unsigned int nr_swapfiles;
+long total_swap_pages;
+static int swap_overflow;
+
+static const char Bad_file[] = "Bad swap file entry ";
+static const char Unused_file[] = "Unused swap file entry ";
+static const char Bad_offset[] = "Bad swap offset entry ";
+static const char Unused_offset[] = "Unused swap offset entry ";
+
+struct swap_list_t swap_list = {-1, -1};
+
+static struct swap_info_struct swap_info[MAX_SWAPFILES];
+
+static DEFINE_MUTEX(swapon_mutex);
+
+/*
+ * We need this because the bdev->unplug_fn can sleep and we cannot
+ * hold swap_lock while calling the unplug_fn. And swap_lock
+ * cannot be turned into a mutex.
+ */
+static DECLARE_RWSEM(swap_unplug_sem);
+
+void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
+{
+	swp_entry_t entry;
+
+	down_read(&swap_unplug_sem);
+	entry.val = page_private(page);
+	if (PageSwapCache(page)) {
+		struct block_device *bdev = swap_info[swp_type(entry)].bdev;
+		struct backing_dev_info *bdi;
+
+		/*
+		 * If the page is removed from swapcache from under us (with a
+		 * racy try_to_unuse/swapoff) we need an additional reference
+		 * count to avoid reading garbage from page_private(page) above.
+		 * If the WARN_ON triggers during a swapoff it may be this
+		 * race condition and is harmless. However, if it triggers
+		 * without a swapoff it signals a problem.
+		 */
+		WARN_ON(page_count(page) <= 1);
+
+		bdi = bdev->bd_inode->i_mapping->backing_dev_info;
+		blk_run_backing_dev(bdi, page);
+	}
+	up_read(&swap_unplug_sem);
+}
+
+#define SWAPFILE_CLUSTER	256
+#define LATENCY_LIMIT		256
+
+static inline unsigned long scan_swap_map(struct swap_info_struct *si)
+{
+	unsigned long offset, last_in_cluster;
+	int latency_ration = LATENCY_LIMIT;
+
+	/* 
+	 * We try to cluster swap pages by allocating them sequentially
+	 * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
+	 * way, however, we resort to first-free allocation, starting
+	 * a new cluster.  This prevents us from scattering swap pages
+	 * all over the entire swap partition, so that we reduce
+	 * overall disk seek times between swap pages.  -- sct
+	 * But we do now try to find an empty cluster.  -Andrea
+	 */
+
+	si->flags += SWP_SCANNING;
+	if (unlikely(!si->cluster_nr)) {
+		si->cluster_nr = SWAPFILE_CLUSTER - 1;
+		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER)
+			goto lowest;
+		spin_unlock(&swap_lock);
+
+		offset = si->lowest_bit;
+		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
+
+		/* Locate the first empty (unaligned) cluster */
+		for (; last_in_cluster <= si->highest_bit; offset++) {
+			if (si->swap_map[offset])
+				last_in_cluster = offset + SWAPFILE_CLUSTER;
+			else if (offset == last_in_cluster) {
+				spin_lock(&swap_lock);
+				si->cluster_next = offset-SWAPFILE_CLUSTER-1;
+				goto cluster;
+			}
+			if (unlikely(--latency_ration < 0)) {
+				cond_resched();
+				latency_ration = LATENCY_LIMIT;
+			}
+		}
+		spin_lock(&swap_lock);
+		goto lowest;
+	}
+
+	si->cluster_nr--;
+cluster:
+	offset = si->cluster_next;
+	if (offset > si->highest_bit)
+lowest:		offset = si->lowest_bit;
+checks:	if (!(si->flags & SWP_WRITEOK))
+		goto no_page;
+	if (!si->highest_bit)
+		goto no_page;
+	if (!si->swap_map[offset]) {
+		if (offset == si->lowest_bit)
+			si->lowest_bit++;
+		if (offset == si->highest_bit)
+			si->highest_bit--;
+		si->inuse_pages++;
+		if (si->inuse_pages == si->pages) {
+			si->lowest_bit = si->max;
+			si->highest_bit = 0;
+		}
+		si->swap_map[offset] = 1;
+		si->cluster_next = offset + 1;
+		si->flags -= SWP_SCANNING;
+		return offset;
+	}
+
+	spin_unlock(&swap_lock);
+	while (++offset <= si->highest_bit) {
+		if (!si->swap_map[offset]) {
+			spin_lock(&swap_lock);
+			goto checks;
+		}
+		if (unlikely(--latency_ration < 0)) {
+			cond_resched();
+			latency_ration = LATENCY_LIMIT;
+		}
+	}
+	spin_lock(&swap_lock);
+	goto lowest;
+
+no_page:
+	si->flags -= SWP_SCANNING;
+	return 0;
+}
+
+swp_entry_t get_swap_page(void)
+{
+	struct swap_info_struct *si;
+	pgoff_t offset;
+	int type, next;
+	int wrapped = 0;
+
+	spin_lock(&swap_lock);
+	if (nr_swap_pages <= 0)
+		goto noswap;
+	nr_swap_pages--;
+
+	for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
+		si = swap_info + type;
+		next = si->next;
+		if (next < 0 ||
+		    (!wrapped && si->prio != swap_info[next].prio)) {
+			next = swap_list.head;
+			wrapped++;
+		}
+
+		if (!si->highest_bit)
+			continue;
+		if (!(si->flags & SWP_WRITEOK))
+			continue;
+
+		swap_list.next = next;
+		offset = scan_swap_map(si);
+		if (offset) {
+			spin_unlock(&swap_lock);
+			return swp_entry(type, offset);
+		}
+		next = swap_list.next;
+	}
+
+	nr_swap_pages++;
+noswap:
+	spin_unlock(&swap_lock);
+	return (swp_entry_t) {0};
+}
+
+swp_entry_t get_swap_page_of_type(int type)
+{
+	struct swap_info_struct *si;
+	pgoff_t offset;
+
+	spin_lock(&swap_lock);
+	si = swap_info + type;
+	if (si->flags & SWP_WRITEOK) {
+		nr_swap_pages--;
+		offset = scan_swap_map(si);
+		if (offset) {
+			spin_unlock(&swap_lock);
+			return swp_entry(type, offset);
+		}
+		nr_swap_pages++;
+	}
+	spin_unlock(&swap_lock);
+	return (swp_entry_t) {0};
+}
+
+static struct swap_info_struct * swap_info_get(swp_entry_t entry)
+{
+	struct swap_info_struct * p;
+	unsigned long offset, type;
+
+	if (!entry.val)
+		goto out;
+	type = swp_type(entry);
+	if (type >= nr_swapfiles)
+		goto bad_nofile;
+	p = & swap_info[type];
+	if (!(p->flags & SWP_USED))
+		goto bad_device;
+	offset = swp_offset(entry);
+	if (offset >= p->max)
+		goto bad_offset;
+	if (!p->swap_map[offset])
+		goto bad_free;
+	spin_lock(&swap_lock);
+	return p;
+
+bad_free:
+	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
+	goto out;
+bad_offset:
+	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
+	goto out;
+bad_device:
+	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
+	goto out;
+bad_nofile:
+	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
+out:
+	return NULL;
+}	
+
+static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
+{
+	int count = p->swap_map[offset];
+
+	if (count < SWAP_MAP_MAX) {
+		count--;
+		p->swap_map[offset] = count;
+		if (!count) {
+			if (offset < p->lowest_bit)
+				p->lowest_bit = offset;
+			if (offset > p->highest_bit)
+				p->highest_bit = offset;
+			if (p->prio > swap_info[swap_list.next].prio)
+				swap_list.next = p - swap_info;
+			nr_swap_pages++;
+			p->inuse_pages--;
+		}
+	}
+	return count;
+}
+
+/*
+ * Caller has made sure that the swapdevice corresponding to entry
+ * is still around or has not been recycled.
+ */
+void swap_free(swp_entry_t entry)
+{
+	struct swap_info_struct * p;
+
+	p = swap_info_get(entry);
+	if (p) {
+		swap_entry_free(p, swp_offset(entry));
+		spin_unlock(&swap_lock);
+	}
+}
+
+/*
+ * How many references to page are currently swapped out?
+ */
+static inline int page_swapcount(struct page *page)
+{
+	int count = 0;
+	struct swap_info_struct *p;
+	swp_entry_t entry;
+
+	entry.val = page_private(page);
+	p = swap_info_get(entry);
+	if (p) {
+		/* Subtract the 1 for the swap cache itself */
+		count = p->swap_map[swp_offset(entry)] - 1;
+		spin_unlock(&swap_lock);
+	}
+	return count;
+}
+
+/*
+ * We can use this swap cache entry directly
+ * if there are no other references to it.
+ */
+int can_share_swap_page(struct page *page)
+{
+	int count;
+
+	BUG_ON(!PageLocked(page));
+	count = page_mapcount(page);
+	if (count <= 1 && PageSwapCache(page))
+		count += page_swapcount(page);
+	return count == 1;
+}
+
+/*
+ * Work out if there are any other processes sharing this
+ * swap cache page. Free it if you can. Return success.
+ */
+int remove_exclusive_swap_page(struct page *page)
+{
+	int retval;
+	struct swap_info_struct * p;
+	swp_entry_t entry;
+
+	BUG_ON(PagePrivate(page));
+	BUG_ON(!PageLocked(page));
+
+	if (!PageSwapCache(page))
+		return 0;
+	if (PageWriteback(page))
+		return 0;
+	if (page_count(page) != 2) /* 2: us + cache */
+		return 0;
+
+	entry.val = page_private(page);
+	p = swap_info_get(entry);
+	if (!p)
+		return 0;
+
+	/* Is the only swap cache user the cache itself? */
+	retval = 0;
+	if (p->swap_map[swp_offset(entry)] == 1) {
+		/* Recheck the page count with the swapcache lock held.. */
+		write_lock_irq(&swapper_space.tree_lock);
+		if ((page_count(page) == 2) && !PageWriteback(page)) {
+			__delete_from_swap_cache(page);
+			SetPageDirty(page);
+			retval = 1;
+		}
+		write_unlock_irq(&swapper_space.tree_lock);
+	}
+	spin_unlock(&swap_lock);
+
+	if (retval) {
+		swap_free(entry);
+		page_cache_release(page);
+	}
+
+	return retval;
+}
+
+/*
+ * Free the swap entry like above, but also try to
+ * free the page cache entry if it is the last user.
+ */
+void free_swap_and_cache(swp_entry_t entry)
+{
+	struct swap_info_struct * p;
+	struct page *page = NULL;
+
+	p = swap_info_get(entry);
+	if (p) {
+		if (swap_entry_free(p, swp_offset(entry)) == 1)
+			page = find_trylock_page(&swapper_space, entry.val);
+		spin_unlock(&swap_lock);
+	}
+	if (page) {
+		int one_user;
+
+		BUG_ON(PagePrivate(page));
+		page_cache_get(page);
+		one_user = (page_count(page) == 2);
+		/* Only cache user (+us), or swap space full? Free it! */
+		if (!PageWriteback(page) && (one_user || vm_swap_full())) {
+			delete_from_swap_cache(page);
+			SetPageDirty(page);
+		}
+		unlock_page(page);
+		page_cache_release(page);
+	}
+}
+
+#ifdef CONFIG_SOFTWARE_SUSPEND
+/*
+ * Find the swap type that corresponds to given device (if any)
+ *
+ * This is needed for software suspend and is done in such a way that inode
+ * aliasing is allowed.
+ */
+int swap_type_of(dev_t device)
+{
+	int i;
+
+	spin_lock(&swap_lock);
+	for (i = 0; i < nr_swapfiles; i++) {
+		struct inode *inode;
+
+		if (!(swap_info[i].flags & SWP_WRITEOK))
+			continue;
+		if (!device) {
+			spin_unlock(&swap_lock);
+			return i;
+		}
+		inode = swap_info[i].swap_file->f_dentry->d_inode;
+		if (S_ISBLK(inode->i_mode) &&
+		    device == MKDEV(imajor(inode), iminor(inode))) {
+			spin_unlock(&swap_lock);
+			return i;
+		}
+	}
+	spin_unlock(&swap_lock);
+	return -ENODEV;
+}
+
+/*
+ * Return either the total number of swap pages of given type, or the number
+ * of free pages of that type (depending on @free)
+ *
+ * This is needed for software suspend
+ */
+unsigned int count_swap_pages(int type, int free)
+{
+	unsigned int n = 0;
+
+	if (type < nr_swapfiles) {
+		spin_lock(&swap_lock);
+		if (swap_info[type].flags & SWP_WRITEOK) {
+			n = swap_info[type].pages;
+			if (free)
+				n -= swap_info[type].inuse_pages;
+		}
+		spin_unlock(&swap_lock);
+	}
+	return n;
+}
+#endif
+
+/*
+ * No need to decide whether this PTE shares the swap entry with others,
+ * just let do_wp_page work it out if a write is requested later - to
+ * force COW, vm_page_prot omits write permission from any private vma.
+ */
+static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
+		unsigned long addr, swp_entry_t entry, struct page *page)
+{
+	inc_mm_counter(vma->vm_mm, anon_rss);
+	get_page(page);
+	set_pte_at(vma->vm_mm, addr, pte,
+		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
+	page_add_anon_rmap(page, vma, addr);
+	swap_free(entry);
+	/*
+	 * Move the page to the active list so it is not
+	 * immediately swapped out again after swapon.
+	 */
+	activate_page(page);
+}
+
+static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+				unsigned long addr, unsigned long end,
+				swp_entry_t entry, struct page *page)
+{
+	pte_t swp_pte = swp_entry_to_pte(entry);
+	pte_t *pte;
+	spinlock_t *ptl;
+	int found = 0;
+
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	do {
+		/*
+		 * swapoff spends a _lot_ of time in this loop!
+		 * Test inline before going to call unuse_pte.
+		 */
+		if (unlikely(pte_same(*pte, swp_pte))) {
+			unuse_pte(vma, pte++, addr, entry, page);
+			found = 1;
+			break;
+		}
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	pte_unmap_unlock(pte - 1, ptl);
+	return found;
+}
+
+static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+				unsigned long addr, unsigned long end,
+				swp_entry_t entry, struct page *page)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none_or_clear_bad(pmd))
+			continue;
+		if (unuse_pte_range(vma, pmd, addr, next, entry, page))
+			return 1;
+	} while (pmd++, addr = next, addr != end);
+	return 0;
+}
+
+static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+				unsigned long addr, unsigned long end,
+				swp_entry_t entry, struct page *page)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		if (unuse_pmd_range(vma, pud, addr, next, entry, page))
+			return 1;
+	} while (pud++, addr = next, addr != end);
+	return 0;
+}
+
+static int unuse_vma(struct vm_area_struct *vma,
+				swp_entry_t entry, struct page *page)
+{
+	pgd_t *pgd;
+	unsigned long addr, end, next;
+
+	if (page->mapping) {
+		addr = page_address_in_vma(page, vma);
+		if (addr == -EFAULT)
+			return 0;
+		else
+			end = addr + PAGE_SIZE;
+	} else {
+		addr = vma->vm_start;
+		end = vma->vm_end;
+	}
+
+	pgd = pgd_offset(vma->vm_mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		if (unuse_pud_range(vma, pgd, addr, next, entry, page))
+			return 1;
+	} while (pgd++, addr = next, addr != end);
+	return 0;
+}
+
+static int unuse_mm(struct mm_struct *mm,
+				swp_entry_t entry, struct page *page)
+{
+	struct vm_area_struct *vma;
+
+	if (!down_read_trylock(&mm->mmap_sem)) {
+		/*
+		 * Activate page so shrink_cache is unlikely to unmap its
+		 * ptes while lock is dropped, so swapoff can make progress.
+		 */
+		activate_page(page);
+		unlock_page(page);
+		down_read(&mm->mmap_sem);
+		lock_page(page);
+	}
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (vma->anon_vma && unuse_vma(vma, entry, page))
+			break;
+	}
+	up_read(&mm->mmap_sem);
+	/*
+	 * Currently unuse_mm cannot fail, but leave error handling
+	 * at call sites for now, since we change it from time to time.
+	 */
+	return 0;
+}
+
+#ifdef CONFIG_MIGRATION
+int remove_vma_swap(struct vm_area_struct *vma, struct page *page)
+{
+	swp_entry_t entry = { .val = page_private(page) };
+
+	return unuse_vma(vma, entry, page);
+}
+#endif
+
+/*
+ * Scan swap_map from current position to next entry still in use.
+ * Recycle to start on reaching the end, returning 0 when empty.
+ */
+static unsigned int find_next_to_unuse(struct swap_info_struct *si,
+					unsigned int prev)
+{
+	unsigned int max = si->max;
+	unsigned int i = prev;
+	int count;
+
+	/*
+	 * No need for swap_lock here: we're just looking
+	 * for whether an entry is in use, not modifying it; false
+	 * hits are okay, and sys_swapoff() has already prevented new
+	 * allocations from this area (while holding swap_lock).
+	 */
+	for (;;) {
+		if (++i >= max) {
+			if (!prev) {
+				i = 0;
+				break;
+			}
+			/*
+			 * No entries in use at top of swap_map,
+			 * loop back to start and recheck there.
+			 */
+			max = prev + 1;
+			prev = 0;
+			i = 1;
+		}
+		count = si->swap_map[i];
+		if (count && count != SWAP_MAP_BAD)
+			break;
+	}
+	return i;
+}
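+
+/*
+ * For example, with max == 8 and prev == 5, offsets 6 and 7 are checked
+ * first; the scan then wraps to 1 and rechecks up to and including 5,
+ * so every slot except the header (offset 0) is visited once before 0
+ * is returned to signal "empty".
+ */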
+
+/*
+ * We completely avoid races by reading each swap page in advance,
+ * and then search for the process using it.  All the necessary
+ * page table adjustments can then be made atomically.
+ */
+static int try_to_unuse(unsigned int type)
+{
+	struct swap_info_struct * si = &swap_info[type];
+	struct mm_struct *start_mm;
+	unsigned short *swap_map;
+	unsigned short swcount;
+	struct page *page;
+	swp_entry_t entry;
+	unsigned int i = 0;
+	int retval = 0;
+	int reset_overflow = 0;
+	int shmem;
+
+	/*
+	 * When searching mms for an entry, a good strategy is to
+	 * start at the first mm we freed the previous entry from
+	 * (though actually we don't notice whether we or coincidence
+	 * freed the entry).  Initialize this start_mm with a hold.
+	 *
+	 * A simpler strategy would be to start at the last mm we
+	 * freed the previous entry from; but that would take less
+	 * advantage of mmlist ordering, which clusters forked mms
+	 * together, child after parent.  If we race with dup_mmap(), we
+	 * prefer to resolve parent before child, lest we miss entries
+	 * duplicated after we scanned child: using last mm would invert
+	 * that.  Though it's only a serious concern when an overflowed
+	 * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
+	 */
+	start_mm = &init_mm;
+	atomic_inc(&init_mm.mm_users);
+
+	/*
+	 * Keep on scanning until all entries have gone.  Usually,
+	 * one pass through swap_map is enough, but not necessarily:
+	 * there are races when an instance of an entry might be missed.
+	 */
+	while ((i = find_next_to_unuse(si, i)) != 0) {
+		if (signal_pending(current)) {
+			retval = -EINTR;
+			break;
+		}
+
+		/* 
+		 * Get a page for the entry, using the existing swap
+		 * cache page if there is one.  Otherwise, get a clean
+		 * page and read the swap into it. 
+		 */
+		swap_map = &si->swap_map[i];
+		entry = swp_entry(type, i);
+again:
+		page = read_swap_cache_async(entry, NULL, 0);
+		if (!page) {
+			/*
+			 * Either swap_duplicate() failed because entry
+			 * has been freed independently, and will not be
+			 * reused since sys_swapoff() already disabled
+			 * allocation from here, or alloc_page() failed.
+			 */
+			if (!*swap_map)
+				continue;
+			retval = -ENOMEM;
+			break;
+		}
+
+		/*
+		 * Don't hold on to start_mm if it looks like exiting.
+		 */
+		if (atomic_read(&start_mm->mm_users) == 1) {
+			mmput(start_mm);
+			start_mm = &init_mm;
+			atomic_inc(&init_mm.mm_users);
+		}
+
+		/*
+		 * Wait for and lock page.  When do_swap_page races with
+		 * try_to_unuse, do_swap_page can handle the fault much
+		 * faster than try_to_unuse can locate the entry.  This
+		 * apparently redundant "wait_on_page_locked" lets try_to_unuse
+		 * defer to do_swap_page in such a case - in some tests,
+		 * do_swap_page and try_to_unuse repeatedly compete.
+		 */
+		wait_on_page_locked(page);
+		wait_on_page_writeback(page);
+		lock_page(page);
+		if (!PageSwapCache(page)) {
+			/* Page migration has occurred */
+			unlock_page(page);
+			page_cache_release(page);
+			goto again;
+		}
+		wait_on_page_writeback(page);
+
+		/*
+		 * Remove all references to entry.
+		 * Whenever we reach init_mm, there's no address space
+		 * to search, but use it as a reminder to search shmem.
+		 */
+		shmem = 0;
+		swcount = *swap_map;
+		if (swcount > 1) {
+			if (start_mm == &init_mm)
+				shmem = shmem_unuse(entry, page);
+			else
+				retval = unuse_mm(start_mm, entry, page);
+		}
+		if (*swap_map > 1) {
+			int set_start_mm = (*swap_map >= swcount);
+			struct list_head *p = &start_mm->mmlist;
+			struct mm_struct *new_start_mm = start_mm;
+			struct mm_struct *prev_mm = start_mm;
+			struct mm_struct *mm;
+
+			atomic_inc(&new_start_mm->mm_users);
+			atomic_inc(&prev_mm->mm_users);
+			spin_lock(&mmlist_lock);
+			while (*swap_map > 1 && !retval &&
+					(p = p->next) != &start_mm->mmlist) {
+				mm = list_entry(p, struct mm_struct, mmlist);
+				if (atomic_inc_return(&mm->mm_users) == 1) {
+					atomic_dec(&mm->mm_users);
+					continue;
+				}
+				spin_unlock(&mmlist_lock);
+				mmput(prev_mm);
+				prev_mm = mm;
+
+				cond_resched();
+
+				swcount = *swap_map;
+				if (swcount <= 1)
+					;
+				else if (mm == &init_mm) {
+					set_start_mm = 1;
+					shmem = shmem_unuse(entry, page);
+				} else
+					retval = unuse_mm(mm, entry, page);
+				if (set_start_mm && *swap_map < swcount) {
+					mmput(new_start_mm);
+					atomic_inc(&mm->mm_users);
+					new_start_mm = mm;
+					set_start_mm = 0;
+				}
+				spin_lock(&mmlist_lock);
+			}
+			spin_unlock(&mmlist_lock);
+			mmput(prev_mm);
+			mmput(start_mm);
+			start_mm = new_start_mm;
+		}
+		if (retval) {
+			unlock_page(page);
+			page_cache_release(page);
+			break;
+		}
+
+		/*
+		 * How could swap count reach 0x7fff when the maximum
+		 * pid is 0x7fff, and there's no way to repeat a swap
+		 * page within an mm (except in shmem, where it's the
+		 * shared object which takes the reference count)?
+		 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
+		 *
+		 * If that's wrong, then we should worry more about
+		 * exit_mmap() and do_munmap() cases described above:
+		 * we might be resetting SWAP_MAP_MAX too early here.
+		 * We know "Undead"s can happen, they're okay, so don't
+		 * report them; but do report if we reset SWAP_MAP_MAX.
+		 */
+		if (*swap_map == SWAP_MAP_MAX) {
+			spin_lock(&swap_lock);
+			*swap_map = 1;
+			spin_unlock(&swap_lock);
+			reset_overflow = 1;
+		}
+
+		/*
+		 * If a reference remains (rare), we would like to leave
+		 * the page in the swap cache; but try_to_unmap could
+		 * then re-duplicate the entry once we drop page lock,
+		 * so we might loop indefinitely; also, that page could
+		 * not be swapped out to other storage meanwhile.  So:
+		 * delete from cache even if there's another reference,
+		 * after ensuring that the data has been saved to disk -
+		 * since if the reference remains (rarer), it will be
+		 * read from disk into another page.  Splitting into two
+		 * pages would be incorrect if swap supported "shared
+		 * private" pages, but they are handled by tmpfs files.
+		 *
+		 * Note shmem_unuse already deleted a swappage from
+		 * the swap cache, unless the move to filepage failed:
+		 * in which case it left swappage in cache, lowered its
+		 * swap count to pass quickly through the loops above,
+		 * and now we must reincrement count to try again later.
+		 */
+		if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
+			struct writeback_control wbc = {
+				.sync_mode = WB_SYNC_NONE,
+			};
+
+			swap_writepage(page, &wbc);
+			lock_page(page);
+			wait_on_page_writeback(page);
+		}
+		if (PageSwapCache(page)) {
+			if (shmem)
+				swap_duplicate(entry);
+			else
+				delete_from_swap_cache(page);
+		}
+
+		/*
+		 * So that we could skip searching mms once the swap count
+		 * went to 1, we did not mark any present ptes as dirty:
+		 * we must now mark the page dirty so that shrink_list
+		 * will preserve it.
+		 */
+		SetPageDirty(page);
+		unlock_page(page);
+		page_cache_release(page);
+
+		/*
+		 * Make sure that we aren't completely killing
+		 * interactive performance.
+		 */
+		cond_resched();
+	}
+
+	mmput(start_mm);
+	if (reset_overflow) {
+		printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
+		swap_overflow = 0;
+	}
+	return retval;
+}
+
+/*
+ * After a successful try_to_unuse, if no swap is now in use, we know
+ * we can empty the mmlist.  swap_lock must be held on entry and exit.
+ * Note that mmlist_lock nests inside swap_lock, and an mm must be
+ * added to the mmlist just after page_duplicate - before would be racy.
+ */
+static void drain_mmlist(void)
+{
+	struct list_head *p, *next;
+	unsigned int i;
+
+	for (i = 0; i < nr_swapfiles; i++)
+		if (swap_info[i].inuse_pages)
+			return;
+	spin_lock(&mmlist_lock);
+	list_for_each_safe(p, next, &init_mm.mmlist)
+		list_del_init(p);
+	spin_unlock(&mmlist_lock);
+}
+
+/*
+ * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
+ * corresponds to page offset `offset'.
+ */
+sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
+{
+	struct swap_extent *se = sis->curr_swap_extent;
+	struct swap_extent *start_se = se;
+
+	for ( ; ; ) {
+		struct list_head *lh;
+
+		if (se->start_page <= offset &&
+				offset < (se->start_page + se->nr_pages)) {
+			return se->start_block + (offset - se->start_page);
+		}
+		lh = se->list.next;
+		if (lh == &sis->extent_list)
+			lh = lh->next;
+		se = list_entry(lh, struct swap_extent, list);
+		sis->curr_swap_extent = se;
+		BUG_ON(se == start_se);		/* It *must* be present */
+	}
+}
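+
+/*
+ * For instance, with a (hypothetical) cached extent of
+ * { .start_page = 0, .nr_pages = 100, .start_block = 5000 }, page
+ * offset 37 resolves immediately to disk block 5000 + 37 = 5037 with
+ * no list walking at all.
+ */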
+
+/*
+ * Free all of a swapdev's extent information
+ */
+static void destroy_swap_extents(struct swap_info_struct *sis)
+{
+	while (!list_empty(&sis->extent_list)) {
+		struct swap_extent *se;
+
+		se = list_entry(sis->extent_list.next,
+				struct swap_extent, list);
+		list_del(&se->list);
+		kfree(se);
+	}
+}
+
+/*
+ * Add a block range (and the corresponding page range) into this swapdev's
+ * extent list.  The extent list is kept sorted in page order.
+ *
+ * This function rather assumes that it is called in ascending page order.
+ */
+static int
+add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
+		unsigned long nr_pages, sector_t start_block)
+{
+	struct swap_extent *se;
+	struct swap_extent *new_se;
+	struct list_head *lh;
+
+	lh = sis->extent_list.prev;	/* The highest page extent */
+	if (lh != &sis->extent_list) {
+		se = list_entry(lh, struct swap_extent, list);
+		BUG_ON(se->start_page + se->nr_pages != start_page);
+		if (se->start_block + se->nr_pages == start_block) {
+			/* Merge it */
+			se->nr_pages += nr_pages;
+			return 0;
+		}
+	}
+
+	/*
+	 * No merge.  Insert a new extent, preserving ordering.
+	 */
+	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
+	if (new_se == NULL)
+		return -ENOMEM;
+	new_se->start_page = start_page;
+	new_se->nr_pages = nr_pages;
+	new_se->start_block = start_block;
+
+	list_add_tail(&new_se->list, &sis->extent_list);
+	return 1;
+}
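+
+/*
+ * Because setup_swap_extents() probes in ascending page order,
+ * physically contiguous runs collapse here: e.g.
+ * add_swap_extent(sis, 0, 1, 100) followed by
+ * add_swap_extent(sis, 1, 1, 101) leaves one 2-page extent starting at
+ * block 100 instead of two list entries.
+ */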
+
+/*
+ * A `swap extent' is a simple thing which maps a contiguous range of pages
+ * onto a contiguous range of disk blocks.  An ordered list of swap extents
+ * is built at swapon time and is then used at swap_writepage/swap_readpage
+ * time for locating where on disk a page belongs.
+ *
+ * If the swapfile is an S_ISBLK block device, a single extent is installed.
+ * This is done so that the main operating code can treat S_ISBLK and S_ISREG
+ * swap files identically.
+ *
+ * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
+ * extent list operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
+ * swapfiles are handled *identically* after swapon time.
+ *
+ * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
+ * and will parse them into an ordered extent list, in PAGE_SIZE chunks.  If
+ * some stray blocks are found which do not fall within the PAGE_SIZE alignment
+ * requirements, they are simply tossed out - we will never use those blocks
+ * for swapping.
+ *
+ * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon.  This
+ * prevents root from shooting her foot off by ftruncating an in-use swapfile,
+ * which will scribble on the fs.
+ *
+ * The amount of disk space which a single swap extent represents varies.
+ * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
+ * extents in the list.  To avoid much list walking, we cache the previous
+ * search location in `curr_swap_extent', and start new searches from there.
+ * This is extremely effective.  The average number of iterations in
+ * map_swap_page() has been measured at about 0.3 per page.  - akpm.
+ */
+static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
+{
+	struct inode *inode;
+	unsigned blocks_per_page;
+	unsigned long page_no;
+	unsigned blkbits;
+	sector_t probe_block;
+	sector_t last_block;
+	sector_t lowest_block = -1;
+	sector_t highest_block = 0;
+	int nr_extents = 0;
+	int ret;
+
+	inode = sis->swap_file->f_mapping->host;
+	if (S_ISBLK(inode->i_mode)) {
+		ret = add_swap_extent(sis, 0, sis->max, 0);
+		*span = sis->pages;
+		goto done;
+	}
+
+	blkbits = inode->i_blkbits;
+	blocks_per_page = PAGE_SIZE >> blkbits;
+
+	/*
+	 * Map all the blocks into the extent list.  This code doesn't try
+	 * to be very smart.
+	 */
+	probe_block = 0;
+	page_no = 0;
+	last_block = i_size_read(inode) >> blkbits;
+	while ((probe_block + blocks_per_page) <= last_block &&
+			page_no < sis->max) {
+		unsigned block_in_page;
+		sector_t first_block;
+
+		first_block = bmap(inode, probe_block);
+		if (first_block == 0)
+			goto bad_bmap;
+
+		/*
+		 * It must be PAGE_SIZE aligned on-disk
+		 */
+		if (first_block & (blocks_per_page - 1)) {
+			probe_block++;
+			goto reprobe;
+		}
+
+		for (block_in_page = 1; block_in_page < blocks_per_page;
+					block_in_page++) {
+			sector_t block;
+
+			block = bmap(inode, probe_block + block_in_page);
+			if (block == 0)
+				goto bad_bmap;
+			if (block != first_block + block_in_page) {
+				/* Discontiguity */
+				probe_block++;
+				goto reprobe;
+			}
+		}
+
+		first_block >>= (PAGE_SHIFT - blkbits);
+		if (page_no) {	/* exclude the header page */
+			if (first_block < lowest_block)
+				lowest_block = first_block;
+			if (first_block > highest_block)
+				highest_block = first_block;
+		}
+
+		/*
+		 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
+		 */
+		ret = add_swap_extent(sis, page_no, 1, first_block);
+		if (ret < 0)
+			goto out;
+		nr_extents += ret;
+		page_no++;
+		probe_block += blocks_per_page;
+reprobe:
+		continue;
+	}
+	ret = nr_extents;
+	*span = 1 + highest_block - lowest_block;
+	if (page_no == 0)
+		page_no = 1;	/* force Empty message */
+	sis->max = page_no;
+	sis->pages = page_no - 1;
+	sis->highest_bit = page_no - 1;
+done:
+	sis->curr_swap_extent = list_entry(sis->extent_list.prev,
+					struct swap_extent, list);
+	goto out;
+bad_bmap:
+	printk(KERN_ERR "swapon: swapfile has holes\n");
+	ret = -EINVAL;
+out:
+	return ret;
+}
+
+#if 0	/* We don't need this yet */
+#include <linux/backing-dev.h>
+int page_queue_congested(struct page *page)
+{
+	struct backing_dev_info *bdi;
+
+	BUG_ON(!PageLocked(page));	/* It pins the swap_info_struct */
+
+	if (PageSwapCache(page)) {
+		swp_entry_t entry = { .val = page_private(page) };
+		struct swap_info_struct *sis;
+
+		sis = get_swap_info_struct(swp_type(entry));
+		bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
+	} else
+		bdi = page->mapping->backing_dev_info;
+	return bdi_write_congested(bdi);
+}
+#endif
+
+asmlinkage long sys_swapoff(const char __user * specialfile)
+{
+	struct swap_info_struct * p = NULL;
+	unsigned short *swap_map;
+	struct file *swap_file, *victim;
+	struct address_space *mapping;
+	struct inode *inode;
+	char * pathname;
+	int i, type, prev;
+	int err;
+	
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	pathname = getname(specialfile);
+	err = PTR_ERR(pathname);
+	if (IS_ERR(pathname))
+		goto out;
+
+	victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
+	putname(pathname);
+	err = PTR_ERR(victim);
+	if (IS_ERR(victim))
+		goto out;
+
+	mapping = victim->f_mapping;
+	prev = -1;
+	spin_lock(&swap_lock);
+	for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
+		p = swap_info + type;
+		if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
+			if (p->swap_file->f_mapping == mapping)
+				break;
+		}
+		prev = type;
+	}
+	if (type < 0) {
+		err = -EINVAL;
+		spin_unlock(&swap_lock);
+		goto out_dput;
+	}
+	if (!security_vm_enough_memory(p->pages))
+		vm_unacct_memory(p->pages);
+	else {
+		err = -ENOMEM;
+		spin_unlock(&swap_lock);
+		goto out_dput;
+	}
+	if (prev < 0) {
+		swap_list.head = p->next;
+	} else {
+		swap_info[prev].next = p->next;
+	}
+	if (type == swap_list.next) {
+		/* just pick something that's safe... */
+		swap_list.next = swap_list.head;
+	}
+	nr_swap_pages -= p->pages;
+	total_swap_pages -= p->pages;
+	p->flags &= ~SWP_WRITEOK;
+	spin_unlock(&swap_lock);
+
+	current->flags |= PF_SWAPOFF;
+	err = try_to_unuse(type);
+	current->flags &= ~PF_SWAPOFF;
+
+	if (err) {
+		/* re-insert swap space back into swap_list */
+		spin_lock(&swap_lock);
+		for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
+			if (p->prio >= swap_info[i].prio)
+				break;
+		p->next = i;
+		if (prev < 0)
+			swap_list.head = swap_list.next = p - swap_info;
+		else
+			swap_info[prev].next = p - swap_info;
+		nr_swap_pages += p->pages;
+		total_swap_pages += p->pages;
+		p->flags |= SWP_WRITEOK;
+		spin_unlock(&swap_lock);
+		goto out_dput;
+	}
+
+	/* wait for any unplug function to finish */
+	down_write(&swap_unplug_sem);
+	up_write(&swap_unplug_sem);
+
+	destroy_swap_extents(p);
+	mutex_lock(&swapon_mutex);
+	spin_lock(&swap_lock);
+	drain_mmlist();
+
+	/* wait for anyone still in scan_swap_map */
+	p->highest_bit = 0;		/* cuts scans short */
+	while (p->flags >= SWP_SCANNING) {
+		spin_unlock(&swap_lock);
+		schedule_timeout_uninterruptible(1);
+		spin_lock(&swap_lock);
+	}
+
+	swap_file = p->swap_file;
+	p->swap_file = NULL;
+	p->max = 0;
+	swap_map = p->swap_map;
+	p->swap_map = NULL;
+	p->flags = 0;
+	spin_unlock(&swap_lock);
+	mutex_unlock(&swapon_mutex);
+	vfree(swap_map);
+	inode = mapping->host;
+	if (S_ISBLK(inode->i_mode)) {
+		struct block_device *bdev = I_BDEV(inode);
+		set_blocksize(bdev, p->old_block_size);
+		bd_release(bdev);
+	} else {
+		mutex_lock(&inode->i_mutex);
+		inode->i_flags &= ~S_SWAPFILE;
+		mutex_unlock(&inode->i_mutex);
+	}
+	filp_close(swap_file, NULL);
+	err = 0;
+
+out_dput:
+	filp_close(victim, NULL);
+out:
+	return err;
+}
+
+#ifdef CONFIG_PROC_FS
+/* iterator */
+static void *swap_start(struct seq_file *swap, loff_t *pos)
+{
+	struct swap_info_struct *ptr = swap_info;
+	int i;
+	loff_t l = *pos;
+
+	mutex_lock(&swapon_mutex);
+
+	for (i = 0; i < nr_swapfiles; i++, ptr++) {
+		if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
+			continue;
+		if (!l--)
+			return ptr;
+	}
+
+	return NULL;
+}
+
+static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
+{
+	struct swap_info_struct *ptr = v;
+	struct swap_info_struct *endptr = swap_info + nr_swapfiles;
+
+	for (++ptr; ptr < endptr; ptr++) {
+		if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
+			continue;
+		++*pos;
+		return ptr;
+	}
+
+	return NULL;
+}
+
+static void swap_stop(struct seq_file *swap, void *v)
+{
+	mutex_unlock(&swapon_mutex);
+}
+
+static int swap_show(struct seq_file *swap, void *v)
+{
+	struct swap_info_struct *ptr = v;
+	struct file *file;
+	int len;
+
+	if (v == swap_info)
+		seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+
+	file = ptr->swap_file;
+	len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\");
+	seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
+		       len < 40 ? 40 - len : 1, " ",
+		       S_ISBLK(file->f_dentry->d_inode->i_mode) ?
+				"partition" : "file\t",
+		       ptr->pages << (PAGE_SHIFT - 10),
+		       ptr->inuse_pages << (PAGE_SHIFT - 10),
+		       ptr->prio);
+	return 0;
+}
+
+static struct seq_operations swaps_op = {
+	.start =	swap_start,
+	.next =		swap_next,
+	.stop =		swap_stop,
+	.show =		swap_show
+};
+
+static int swaps_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &swaps_op);
+}
+
+static struct file_operations proc_swaps_operations = {
+	.open		= swaps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int __init procswaps_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = create_proc_entry("swaps", 0, NULL);
+	if (entry)
+		entry->proc_fops = &proc_swaps_operations;
+	return 0;
+}
+__initcall(procswaps_init);
+#endif /* CONFIG_PROC_FS */
+
+/*
+ * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
+ *
+ * The swapon system call
+ */
+asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
+{
+	struct swap_info_struct * p;
+	char *name = NULL;
+	struct block_device *bdev = NULL;
+	struct file *swap_file = NULL;
+	struct address_space *mapping;
+	unsigned int type;
+	int i, prev;
+	int error;
+	static int least_priority;
+	union swap_header *swap_header = NULL;
+	int swap_header_version;
+	unsigned int nr_good_pages = 0;
+	int nr_extents = 0;
+	sector_t span;
+	unsigned long maxpages = 1;
+	int swapfilesize;
+	unsigned short *swap_map;
+	struct page *page = NULL;
+	struct inode *inode = NULL;
+	int did_down = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	spin_lock(&swap_lock);
+	p = swap_info;
+	for (type = 0 ; type < nr_swapfiles ; type++,p++)
+		if (!(p->flags & SWP_USED))
+			break;
+	error = -EPERM;
+	/*
+	 * Test if adding another swap device is possible. There are
+	 * two limiting factors: 1) the number of bits for the swap
+	 * type swp_entry_t definition and 2) the number of bits for
+	 * the swap type in the swap ptes as defined by the different
+	 * architectures. To honor both limitations a swap entry
+	 * with swap offset 0 and swap type ~0UL is created, encoded
+	 * to a swap pte, decoded to a swp_entry_t again and finally
+	 * the swap type part is extracted. This will mask all bits
+	 * from the initial ~0UL that can't be encoded in either the
+	 * swp_entry_t or the architecture definition of a swap pte.
+	 */
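+	/*
+	 * On common configurations this works out to MAX_SWAPFILES - 1
+	 * (e.g. 31 where the pte leaves five bits for the type), so the
+	 * check only bites on architectures with narrower swap ptes.
+	 */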
+	if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) {
+		spin_unlock(&swap_lock);
+		goto out;
+	}
+	if (type >= nr_swapfiles)
+		nr_swapfiles = type+1;
+	INIT_LIST_HEAD(&p->extent_list);
+	p->flags = SWP_USED;
+	p->swap_file = NULL;
+	p->old_block_size = 0;
+	p->swap_map = NULL;
+	p->lowest_bit = 0;
+	p->highest_bit = 0;
+	p->cluster_nr = 0;
+	p->inuse_pages = 0;
+	p->next = -1;
+	if (swap_flags & SWAP_FLAG_PREFER) {
+		p->prio =
+		  (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
+	} else {
+		p->prio = --least_priority;
+	}
+	spin_unlock(&swap_lock);
+	name = getname(specialfile);
+	error = PTR_ERR(name);
+	if (IS_ERR(name)) {
+		name = NULL;
+		goto bad_swap_2;
+	}
+	swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
+	error = PTR_ERR(swap_file);
+	if (IS_ERR(swap_file)) {
+		swap_file = NULL;
+		goto bad_swap_2;
+	}
+
+	p->swap_file = swap_file;
+	mapping = swap_file->f_mapping;
+	inode = mapping->host;
+
+	error = -EBUSY;
+	for (i = 0; i < nr_swapfiles; i++) {
+		struct swap_info_struct *q = &swap_info[i];
+
+		if (i == type || !q->swap_file)
+			continue;
+		if (mapping == q->swap_file->f_mapping)
+			goto bad_swap;
+	}
+
+	error = -EINVAL;
+	if (S_ISBLK(inode->i_mode)) {
+		bdev = I_BDEV(inode);
+		error = bd_claim(bdev, sys_swapon);
+		if (error < 0) {
+			bdev = NULL;
+			error = -EINVAL;
+			goto bad_swap;
+		}
+		p->old_block_size = block_size(bdev);
+		error = set_blocksize(bdev, PAGE_SIZE);
+		if (error < 0)
+			goto bad_swap;
+		p->bdev = bdev;
+	} else if (S_ISREG(inode->i_mode)) {
+		p->bdev = inode->i_sb->s_bdev;
+		mutex_lock(&inode->i_mutex);
+		did_down = 1;
+		if (IS_SWAPFILE(inode)) {
+			error = -EBUSY;
+			goto bad_swap;
+		}
+	} else {
+		goto bad_swap;
+	}
+
+	swapfilesize = i_size_read(inode) >> PAGE_SHIFT;
+
+	/*
+	 * Read the swap header.
+	 */
+	if (!mapping->a_ops->readpage) {
+		error = -EINVAL;
+		goto bad_swap;
+	}
+	page = read_cache_page(mapping, 0,
+			(filler_t *)mapping->a_ops->readpage, swap_file);
+	if (IS_ERR(page)) {
+		error = PTR_ERR(page);
+		goto bad_swap;
+	}
+	wait_on_page_locked(page);
+	if (!PageUptodate(page))
+		goto bad_swap;
+	kmap(page);
+	swap_header = page_address(page);
+
+	if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
+		swap_header_version = 1;
+	else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
+		swap_header_version = 2;
+	else {
+		printk(KERN_ERR "Unable to find swap-space signature\n");
+		error = -EINVAL;
+		goto bad_swap;
+	}
+	
+	switch (swap_header_version) {
+	case 1:
+		printk(KERN_ERR "version 0 swap is no longer supported. "
+			"Use mkswap -v1 %s\n", name);
+		error = -EINVAL;
+		goto bad_swap;
+	case 2:
+		/* Check the swap header's sub-version and the size of
+                   the swap file and bad block lists */
+		if (swap_header->info.version != 1) {
+			printk(KERN_WARNING
+			       "Unable to handle swap header version %d\n",
+			       swap_header->info.version);
+			error = -EINVAL;
+			goto bad_swap;
+		}
+
+		p->lowest_bit  = 1;
+		p->cluster_next = 1;
+
+		/*
+		 * Find out how many pages are allowed for a single swap
+		 * device. There are two limiting factors: 1) the number of
+		 * bits for the swap offset in the swp_entry_t type and
+		 * 2) the number of bits in a swap pte as defined by
+		 * the different architectures. In order to find the
+		 * largest possible bit mask a swap entry with swap type 0
+		 * and swap offset ~0UL is created, encoded to a swap pte,
+		 * decoded to a swp_entry_t again and finally the swap
+		 * offset is extracted. This will mask all the bits from
+		 * the initial ~0UL mask that can't be encoded in either
+		 * the swp_entry_t or the architecture definition of a
+		 * swap pte.
+		 */
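+		/*
+		 * For example, an architecture whose swap ptes leave 25
+		 * usable offset bits allows roughly 2^25 pages here,
+		 * i.e. up to 128GB in one swap area with 4K pages.
+		 */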
+		maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1;
+		if (maxpages > swap_header->info.last_page)
+			maxpages = swap_header->info.last_page;
+		p->highest_bit = maxpages - 1;
+
+		error = -EINVAL;
+		if (!maxpages)
+			goto bad_swap;
+		if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
+			goto bad_swap;
+		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
+			goto bad_swap;
+
+		/* OK, set up the swap map and apply the bad block list */
+		if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
+			error = -ENOMEM;
+			goto bad_swap;
+		}
+
+		error = 0;
+		memset(p->swap_map, 0, maxpages * sizeof(short));
+		for (i = 0; i < swap_header->info.nr_badpages; i++) {
+			int page_nr = swap_header->info.badpages[i];
+			if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
+				error = -EINVAL;
+			else
+				p->swap_map[page_nr] = SWAP_MAP_BAD;
+		}
+		nr_good_pages = swap_header->info.last_page -
+				swap_header->info.nr_badpages -
+				1 /* header page */;
+		if (error)
+			goto bad_swap;
+	}
+
+	if (swapfilesize && maxpages > swapfilesize) {
+		printk(KERN_WARNING
+		       "Swap area shorter than signature indicates\n");
+		error = -EINVAL;
+		goto bad_swap;
+	}
+	if (nr_good_pages) {
+		p->swap_map[0] = SWAP_MAP_BAD;
+		p->max = maxpages;
+		p->pages = nr_good_pages;
+		nr_extents = setup_swap_extents(p, &span);
+		if (nr_extents < 0) {
+			error = nr_extents;
+			goto bad_swap;
+		}
+		nr_good_pages = p->pages;
+	}
+	if (!nr_good_pages) {
+		printk(KERN_WARNING "Empty swap-file\n");
+		error = -EINVAL;
+		goto bad_swap;
+	}
+
+	mutex_lock(&swapon_mutex);
+	spin_lock(&swap_lock);
+	p->flags = SWP_ACTIVE;
+	nr_swap_pages += nr_good_pages;
+	total_swap_pages += nr_good_pages;
+
+	printk(KERN_INFO "Adding %uk swap on %s.  "
+			"Priority:%d extents:%d across:%lluk\n",
+		nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
+		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10));
+
+	/* insert swap space into swap_list: */
+	prev = -1;
+	for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
+		if (p->prio >= swap_info[i].prio) {
+			break;
+		}
+		prev = i;
+	}
+	p->next = i;
+	if (prev < 0) {
+		swap_list.head = swap_list.next = p - swap_info;
+	} else {
+		swap_info[prev].next = p - swap_info;
+	}
+	spin_unlock(&swap_lock);
+	mutex_unlock(&swapon_mutex);
+	error = 0;
+	goto out;
+bad_swap:
+	if (bdev) {
+		set_blocksize(bdev, p->old_block_size);
+		bd_release(bdev);
+	}
+	destroy_swap_extents(p);
+bad_swap_2:
+	spin_lock(&swap_lock);
+	swap_map = p->swap_map;
+	p->swap_file = NULL;
+	p->swap_map = NULL;
+	p->flags = 0;
+	if (!(swap_flags & SWAP_FLAG_PREFER))
+		++least_priority;
+	spin_unlock(&swap_lock);
+	vfree(swap_map);
+	if (swap_file)
+		filp_close(swap_file, NULL);
+out:
+	if (page && !IS_ERR(page)) {
+		kunmap(page);
+		page_cache_release(page);
+	}
+	if (name)
+		putname(name);
+	if (did_down) {
+		if (!error)
+			inode->i_flags |= S_SWAPFILE;
+		mutex_unlock(&inode->i_mutex);
+	}
+	return error;
+}
+
+void si_swapinfo(struct sysinfo *val)
+{
+	unsigned int i;
+	unsigned long nr_to_be_unused = 0;
+
+	spin_lock(&swap_lock);
+	for (i = 0; i < nr_swapfiles; i++) {
+		if (!(swap_info[i].flags & SWP_USED) ||
+		     (swap_info[i].flags & SWP_WRITEOK))
+			continue;
+		nr_to_be_unused += swap_info[i].inuse_pages;
+	}
+	val->freeswap = nr_swap_pages + nr_to_be_unused;
+	val->totalswap = total_swap_pages + nr_to_be_unused;
+	spin_unlock(&swap_lock);
+}
+
+/*
+ * Verify that a swap entry is valid and increment its swap map count.
+ *
+ * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
+ * "permanent", but will be reclaimed by the next swapoff.
+ */
+int swap_duplicate(swp_entry_t entry)
+{
+	struct swap_info_struct * p;
+	unsigned long offset, type;
+	int result = 0;
+
+	type = swp_type(entry);
+	if (type >= nr_swapfiles)
+		goto bad_file;
+	p = type + swap_info;
+	offset = swp_offset(entry);
+
+	spin_lock(&swap_lock);
+	if (offset < p->max && p->swap_map[offset]) {
+		if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
+			p->swap_map[offset]++;
+			result = 1;
+		} else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
+			if (swap_overflow++ < 5)
+				printk(KERN_WARNING "swap_dup: swap entry overflow\n");
+			p->swap_map[offset] = SWAP_MAP_MAX;
+			result = 1;
+		}
+	}
+	spin_unlock(&swap_lock);
+out:
+	return result;
+
+bad_file:
+	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
+	goto out;
+}
+
+struct swap_info_struct *
+get_swap_info_struct(unsigned type)
+{
+	return &swap_info[type];
+}
+
+/*
+ * swap_lock prevents swap_map being freed. Don't grab an extra
+ * reference on the swaphandle, it doesn't matter if it becomes unused.
+ */
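+/*
+ * Readahead example: with page_cluster == 3 the window is 8 pages, so
+ * an entry at offset 21 gives *offset = 16 and slots 16-23 are
+ * eligible, stopping early at the first free or bad slot or at the end
+ * of the area.
+ */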
+int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
+{
+	int ret = 0, i = 1 << page_cluster;
+	unsigned long toff;
+	struct swap_info_struct *swapdev = swp_type(entry) + swap_info;
+
+	if (!page_cluster)	/* no readahead */
+		return 0;
+	toff = (swp_offset(entry) >> page_cluster) << page_cluster;
+	if (!toff)		/* first page is swap header */
+		toff++, i--;
+	*offset = toff;
+
+	spin_lock(&swap_lock);
+	do {
+		/* Don't read-ahead past the end of the swap area */
+		if (toff >= swapdev->max)
+			break;
+		/* Don't read in free or bad pages */
+		if (!swapdev->swap_map[toff])
+			break;
+		if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
+			break;
+		toff++;
+		ret++;
+	} while (--i);
+	spin_unlock(&swap_lock);
+	return ret;
+}
diff -urN oldtree/mm/vmscan.c newtree/mm/vmscan.c
--- oldtree/mm/vmscan.c	2006-03-08 18:48:03.020067750 +0000
+++ newtree/mm/vmscan.c	2006-03-08 15:22:33.413515500 +0000
@@ -1739,7 +1739,8 @@
 	for ( ; ; ) {
 		unsigned long new_order;
 
-		try_to_freeze();
+		if (try_to_freeze())
+			pgdat->kswapd_max_order = 0;
 
 		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 		new_order = pgdat->kswapd_max_order;
diff -urN oldtree/mm/vmscan.c.orig newtree/mm/vmscan.c.orig
--- oldtree/mm/vmscan.c.orig	1970-01-01 00:00:00.000000000 +0000
+++ newtree/mm/vmscan.c.orig	2006-03-08 15:21:19.296883500 +0000
@@ -0,0 +1,1994 @@
+/*
+ *  linux/mm/vmscan.c
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  Swap reorganised 29.12.95, Stephen Tweedie.
+ *  kswapd added: 7.1.96  sct
+ *  Removed kswapd_ctl limits, and swap out as many pages as needed
+ *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
+ *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
+ *  Multiqueue VM started 5.8.00, Rik van Riel.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/kernel_stat.h>
+#include <linux/swap.h>
+#include <linux/swap-prefetch.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/file.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>	/* for try_to_release_page(),
+					buffer_heads_over_limit */
+#include <linux/mm_inline.h>
+#include <linux/pagevec.h>
+#include <linux/backing-dev.h>
+#include <linux/rmap.h>
+#include <linux/topology.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/notifier.h>
+#include <linux/rwsem.h>
+#include <linux/delay.h>
+
+#include <asm/tlbflush.h>
+#include <asm/div64.h>
+
+#include <linux/swapops.h>
+
+#include "internal.h"
+
+/* possible outcome of pageout() */
+typedef enum {
+	/* failed to write page out, page is locked */
+	PAGE_KEEP,
+	/* move page to the active list, page is locked */
+	PAGE_ACTIVATE,
+	/* page has been sent to the disk successfully, page is unlocked */
+	PAGE_SUCCESS,
+	/* page is clean and locked */
+	PAGE_CLEAN,
+} pageout_t;
+
+struct scan_control {
+	/* Incremented by the number of inactive pages that were scanned */
+	unsigned long nr_scanned;
+
+	unsigned long nr_mapped;	/* From page_state */
+
+	/* This context's GFP mask */
+	gfp_t gfp_mask;
+
+	int may_writepage;
+
+	/* Can pages be swapped as part of reclaim? */
+	int may_swap;
+
+	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
+	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
+	 * In this context, it doesn't matter that we scan the
+	 * whole list at once. */
+	int swap_cluster_max;
+};
+
+#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+
+#ifdef ARCH_HAS_PREFETCH
+#define prefetch_prev_lru_page(_page, _base, _field)			\
+	do {								\
+		if ((_page)->lru.prev != _base) {			\
+			struct page *prev;				\
+									\
+			prev = lru_to_page(&(_page->lru));		\
+			prefetch(&prev->_field);			\
+		}							\
+	} while (0)
+#else
+#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
+#endif
+
+#ifdef ARCH_HAS_PREFETCHW
+#define prefetchw_prev_lru_page(_page, _base, _field)			\
+	do {								\
+		if ((_page)->lru.prev != _base) {			\
+			struct page *prev;				\
+									\
+			prev = lru_to_page(&(_page->lru));		\
+			prefetchw(&prev->_field);			\
+		}							\
+	} while (0)
+#else
+#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
+#endif
+
+/*
+ * From 0 .. 100.  Higher means more swappy.
+ */
+int vm_swappiness = 60;
+static long total_memory;
+
+static LIST_HEAD(shrinker_list);
+static DECLARE_RWSEM(shrinker_rwsem);
+
+/*
+ * Add a shrinker callback to be called from the vm
+ */
+struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker)
+{
+        struct shrinker *shrinker;
+
+        shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL);
+        if (shrinker) {
+	        shrinker->shrinker = theshrinker;
+	        shrinker->seeks = seeks;
+	        shrinker->nr = 0;
+		shrinker->s_stats = alloc_percpu(struct shrinker_stats);
+		if (!shrinker->s_stats) {
+			kfree(shrinker);
+			return NULL;
+		}
+	        down_write(&shrinker_rwsem);
+	        list_add_tail(&shrinker->list, &shrinker_list);
+	        up_write(&shrinker_rwsem);
+	}
+	return shrinker;
+}
+EXPORT_SYMBOL(set_shrinker);
+
+/*
+ * Remove one
+ */
+void remove_shrinker(struct shrinker *shrinker)
+{
+	down_write(&shrinker_rwsem);
+	list_del(&shrinker->list);
+	up_write(&shrinker_rwsem);
+	free_percpu(shrinker->s_stats);
+	kfree(shrinker);
+}
+EXPORT_SYMBOL(remove_shrinker);
+
+#define SHRINK_BATCH 128
+/*
+ * Call the shrink functions to age shrinkable caches
+ *
+ * Here we assume it costs one seek to replace a lru page and that it also
+ * takes a seek to recreate a cache object.  With this in mind we age equal
+ * percentages of the lru and ageable caches.  This should balance the seeks
+ * generated by these structures.
+ *
+ * If the vm encountered mapped pages on the LRU it increases the pressure on
+ * slab to avoid swapping.
+ *
+ * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
+ *
+ * `lru_pages' represents the number of on-LRU pages in all the zones which
+ * are eligible for the caller's allocation attempt.  It is used for balancing
+ * slab reclaim versus page reclaim.
+ *
+ * Returns the number of slab objects which we shrunk.
+ */
+unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
+			unsigned long lru_pages)
+{
+	struct shrinker *shrinker;
+	unsigned long ret = 0;
+
+	if (scanned == 0)
+		scanned = SWAP_CLUSTER_MAX;
+
+	if (!down_read_trylock(&shrinker_rwsem))
+		return 1;	/* Assume we'll be able to shrink next time */
+
+	list_for_each_entry(shrinker, &shrinker_list, list) {
+		unsigned long long delta;
+		unsigned long total_scan;
+		unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);
+
+		delta = (4 * scanned) / shrinker->seeks;
+		delta *= max_pass;
+		do_div(delta, lru_pages + 1);
+		shrinker->nr += delta;
+		if (shrinker->nr < 0) {
+			printk(KERN_ERR "%s: nr=%ld\n",
+					__FUNCTION__, shrinker->nr);
+			shrinker->nr = max_pass;
+		}
+
+		/*
+		 * Avoid the risk of looping forever due to a too-large nr
+		 * value: never try to free more than twice the estimated
+		 * number of freeable entries.
+		 */
+		if (shrinker->nr > max_pass * 2)
+			shrinker->nr = max_pass * 2;
+
+		total_scan = shrinker->nr;
+		shrinker->nr = 0;
+
+		while (total_scan >= SHRINK_BATCH) {
+			long this_scan = SHRINK_BATCH;
+			int shrink_ret;
+			int nr_before;
+
+			nr_before = (*shrinker->shrinker)(0, gfp_mask);
+			shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
+			if (shrink_ret == -1)
+				break;
+			if (shrink_ret < nr_before) {
+				ret += nr_before - shrink_ret;
+				shrinker_stat_add(shrinker, nr_freed,
+					(nr_before - shrink_ret));
+			}
+			shrinker_stat_add(shrinker, nr_req, this_scan);
+			mod_page_state(slabs_scanned, this_scan);
+			total_scan -= this_scan;
+
+			cond_resched();
+		}
+
+		shrinker->nr += total_scan;
+	}
+	up_read(&shrinker_rwsem);
+	return ret;
+}
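
To make the seek-balancing arithmetic above concrete, a hedged walk-through
with made-up numbers (delta = 4 * scanned / seeks, scaled by the cache size
over the LRU size):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long scanned = 1024;	/* LRU pages scanned */
		unsigned long long seeks = 2;		/* cost to recreate an object */
		unsigned long long max_pass = 5000;	/* freeable objects in this cache */
		unsigned long long lru_pages = 100000;
		unsigned long long delta;

		delta = (4 * scanned) / seeks;
		delta = delta * max_pass / (lru_pages + 1);

		/* ~102 objects: the cache is aged in proportion to the LRU scan */
		printf("scan %llu slab objects\n", delta);
		return 0;
	}
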
+
+/* Called without lock on whether page is mapped, so answer is unstable */
+static inline int page_mapping_inuse(struct page *page)
+{
+	struct address_space *mapping;
+
+	/* Page is in somebody's page tables. */
+	if (page_mapped(page))
+		return 1;
+
+	/* Be more reluctant to reclaim swapcache than pagecache */
+	if (PageSwapCache(page))
+		return 1;
+
+	mapping = page_mapping(page);
+	if (!mapping)
+		return 0;
+
+	/* File is mmap'd by somebody? */
+	return mapping_mapped(mapping);
+}
+
+static inline int is_page_cache_freeable(struct page *page)
+{
+	return page_count(page) - !!PagePrivate(page) == 2;
+}
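
A hedged sketch of the refcount arithmetic behind is_page_cache_freeable():
a freeable pagecache page is held only by the page cache and by us, plus
one extra count when PagePrivate pins buffer heads (numbers hypothetical).

	#include <assert.h>

	int main(void)
	{
		int has_private = 1;			/* PagePrivate(page) */
		int page_count = 2 + !!has_private;	/* cache + us + buffers */

		assert(page_count - !!has_private == 2);	/* freeable */
		return 0;
	}
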
+
+static int may_write_to_queue(struct backing_dev_info *bdi)
+{
+	if (current->flags & PF_SWAPWRITE)
+		return 1;
+	if (!bdi_write_congested(bdi))
+		return 1;
+	if (bdi == current->backing_dev_info)
+		return 1;
+	return 0;
+}
+
+/*
+ * We detected a synchronous write error writing a page out.  Probably
+ * -ENOSPC.  We need to propagate that into the address_space for a subsequent
+ * fsync(), msync() or close().
+ *
+ * The tricky part is that after writepage we cannot touch the mapping: nothing
+ * prevents it from being freed up.  But we have a ref on the page and once
+ * that page is locked, the mapping is pinned.
+ *
+ * We're allowed to run sleeping lock_page() here because we know the caller has
+ * __GFP_FS.
+ */
+static void handle_write_error(struct address_space *mapping,
+				struct page *page, int error)
+{
+	lock_page(page);
+	if (page_mapping(page) == mapping) {
+		if (error == -ENOSPC)
+			set_bit(AS_ENOSPC, &mapping->flags);
+		else
+			set_bit(AS_EIO, &mapping->flags);
+	}
+	unlock_page(page);
+}
+
+/*
+ * pageout is called by shrink_page_list() for each dirty page.
+ * Calls ->writepage().
+ */
+static pageout_t pageout(struct page *page, struct address_space *mapping)
+{
+	/*
+	 * If the page is dirty, only perform writeback if that write
+	 * will be non-blocking, to prevent this allocation from being
+	 * stalled by pagecache activity.  But note that there may be
+	 * stalls if we need to run get_block().  We could test
+	 * PagePrivate for that.
+	 *
+	 * If this process is currently in generic_file_write() against
+	 * this page's queue, we can perform writeback even if that
+	 * will block.
+	 *
+	 * If the page is swapcache, write it back even if that would
+	 * block, for some throttling. This happens by accident, because
+	 * swap_backing_dev_info is bust: it doesn't reflect the
+	 * congestion state of the swapdevs.  Easy to fix, if needed.
+	 * See swapfile.c:page_queue_congested().
+	 */
+	if (!is_page_cache_freeable(page))
+		return PAGE_KEEP;
+	if (!mapping) {
+		/*
+		 * Some data journaling orphaned pages can have
+		 * page->mapping == NULL while being dirty with clean buffers.
+		 */
+		if (PagePrivate(page)) {
+			if (try_to_free_buffers(page)) {
+				ClearPageDirty(page);
+				printk("%s: orphaned page\n", __FUNCTION__);
+				return PAGE_CLEAN;
+			}
+		}
+		return PAGE_KEEP;
+	}
+	if (mapping->a_ops->writepage == NULL)
+		return PAGE_ACTIVATE;
+	if (!may_write_to_queue(mapping->backing_dev_info))
+		return PAGE_KEEP;
+
+	if (clear_page_dirty_for_io(page)) {
+		int res;
+		struct writeback_control wbc = {
+			.sync_mode = WB_SYNC_NONE,
+			.nr_to_write = SWAP_CLUSTER_MAX,
+			.nonblocking = 1,
+			.for_reclaim = 1,
+		};
+
+		SetPageReclaim(page);
+		res = mapping->a_ops->writepage(page, &wbc);
+		if (res < 0)
+			handle_write_error(mapping, page, res);
+		if (res == AOP_WRITEPAGE_ACTIVATE) {
+			ClearPageReclaim(page);
+			return PAGE_ACTIVATE;
+		}
+		if (!PageWriteback(page)) {
+			/* synchronous write or broken a_ops? */
+			ClearPageReclaim(page);
+		}
+
+		return PAGE_SUCCESS;
+	}
+
+	return PAGE_CLEAN;
+}
+
+static int remove_mapping(struct address_space *mapping, struct page *page)
+{
+	if (!mapping)
+		return 0;		/* truncate got there first */
+
+	write_lock_irq(&mapping->tree_lock);
+
+	/*
+	 * The non-racy check for busy page.  It is critical to check
+	 * PageDirty _after_ making sure that the page is freeable and
+	 * not in use by anybody. 	(pagecache + us == 2)
+	 */
+	if (unlikely(page_count(page) != 2))
+		goto cannot_free;
+	smp_rmb();
+	if (unlikely(PageDirty(page)))
+		goto cannot_free;
+
+	if (PageSwapCache(page)) {
+		swp_entry_t swap = { .val = page_private(page) };
+		add_to_swapped_list(page);
+		__delete_from_swap_cache(page);
+		write_unlock_irq(&mapping->tree_lock);
+		swap_free(swap);
+		__put_page(page);	/* The pagecache ref */
+		return 1;
+	}
+
+	__remove_from_page_cache(page);
+	write_unlock_irq(&mapping->tree_lock);
+	__put_page(page);
+	return 1;
+
+cannot_free:
+	write_unlock_irq(&mapping->tree_lock);
+	return 0;
+}
+
+/*
+ * shrink_page_list() returns the number of reclaimed pages
+ */
+static unsigned long shrink_page_list(struct list_head *page_list,
+					struct scan_control *sc)
+{
+	LIST_HEAD(ret_pages);
+	struct pagevec freed_pvec;
+	int pgactivate = 0;
+	unsigned long nr_reclaimed = 0;
+
+	cond_resched();
+
+	pagevec_init(&freed_pvec, 1);
+	while (!list_empty(page_list)) {
+		struct address_space *mapping;
+		struct page *page;
+		int may_enter_fs;
+		int referenced;
+
+		cond_resched();
+
+		page = lru_to_page(page_list);
+		list_del(&page->lru);
+
+		if (TestSetPageLocked(page))
+			goto keep;
+
+		BUG_ON(PageActive(page));
+
+		sc->nr_scanned++;
+
+		if (!sc->may_swap && page_mapped(page))
+			goto keep_locked;
+
+		/* Double the slab pressure for mapped and swapcache pages */
+		if (page_mapped(page) || PageSwapCache(page))
+			sc->nr_scanned++;
+
+		if (PageWriteback(page))
+			goto keep_locked;
+
+		referenced = page_referenced(page, 1);
+		/* In active use or really unfreeable?  Activate it. */
+		if (referenced && page_mapping_inuse(page))
+			goto activate_locked;
+
+#ifdef CONFIG_SWAP
+		/*
+		 * Anonymous process memory has backing store?
+		 * Try to allocate it some swap space here.
+		 */
+		if (PageAnon(page) && !PageSwapCache(page))
+			if (!add_to_swap(page, GFP_ATOMIC))
+				goto activate_locked;
+#endif /* CONFIG_SWAP */
+
+		mapping = page_mapping(page);
+		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
+			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
+
+		/*
+		 * The page is mapped into the page tables of one or more
+		 * processes. Try to unmap it here.
+		 */
+		if (page_mapped(page) && mapping) {
+			switch (try_to_unmap(page, 0)) {
+			case SWAP_FAIL:
+				goto activate_locked;
+			case SWAP_AGAIN:
+				goto keep_locked;
+			case SWAP_SUCCESS:
+				; /* try to free the page below */
+			}
+		}
+
+		if (PageDirty(page)) {
+			if (referenced)
+				goto keep_locked;
+			if (!may_enter_fs)
+				goto keep_locked;
+			if (!sc->may_writepage)
+				goto keep_locked;
+
+			/* Page is dirty, try to write it out here */
+			switch(pageout(page, mapping)) {
+			case PAGE_KEEP:
+				goto keep_locked;
+			case PAGE_ACTIVATE:
+				goto activate_locked;
+			case PAGE_SUCCESS:
+				if (PageWriteback(page) || PageDirty(page))
+					goto keep;
+				/*
+				 * A synchronous write - probably a ramdisk.  Go
+				 * ahead and try to reclaim the page.
+				 */
+				if (TestSetPageLocked(page))
+					goto keep;
+				if (PageDirty(page) || PageWriteback(page))
+					goto keep_locked;
+				mapping = page_mapping(page);
+			case PAGE_CLEAN:
+				; /* try to free the page below */
+			}
+		}
+
+		/*
+		 * If the page has buffers, try to free the buffer mappings
+		 * associated with this page. If we succeed we try to free
+		 * the page as well.
+		 *
+		 * We do this even if the page is PageDirty().
+		 * try_to_release_page() does not perform I/O, but it is
+		 * possible for a page to have PageDirty set, but it is actually
+		 * clean (all its buffers are clean).  This happens if the
+		 * buffers were written out directly, with submit_bh(). ext3
+		 * will do this, as well as the blockdev mapping. 
+		 * try_to_release_page() will discover that cleanness and will
+		 * drop the buffers and mark the page clean - it can be freed.
+		 *
+		 * Rarely, pages can have buffers and no ->mapping.  These are
+		 * the pages which were not successfully invalidated in
+		 * truncate_complete_page().  We try to drop those buffers here
+		 * and if that worked, and the page is no longer mapped into
+		 * process address space (page_count == 1) it can be freed.
+		 * Otherwise, leave the page on the LRU so it is swappable.
+		 */
+		if (PagePrivate(page)) {
+			if (!try_to_release_page(page, sc->gfp_mask))
+				goto activate_locked;
+			/*
+			 * file system may manually remove page from the page
+			 * cache in ->releasepage(). Check for this.
+			 */
+			mapping = page_mapping(page);
+			if (!mapping && page_count(page) == 1)
+				goto free_it;
+		}
+
+		if (!remove_mapping(mapping, page))
+			goto keep_locked;
+
+free_it:
+		unlock_page(page);
+		nr_reclaimed++;
+		if (!pagevec_add(&freed_pvec, page))
+			__pagevec_release_nonlru(&freed_pvec);
+		continue;
+
+activate_locked:
+		SetPageActive(page);
+		pgactivate++;
+keep_locked:
+		unlock_page(page);
+keep:
+		list_add(&page->lru, &ret_pages);
+		BUG_ON(PageLRU(page));
+	}
+	list_splice(&ret_pages, page_list);
+	if (pagevec_count(&freed_pvec))
+		__pagevec_release_nonlru(&freed_pvec);
+	mod_page_state(pgactivate, pgactivate);
+	return nr_reclaimed;
+}
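
The freed_pvec handling above follows the usual pagevec batching pattern.
A rough standalone sketch (hypothetical types and batch size; the real
pagevec holds a small fixed number of pages):

	#define BATCH 14	/* assumed batch size, like PAGEVEC_SIZE */

	struct batchvec { int nr; void *items[BATCH]; };

	/* Mirrors pagevec_add(): returns the slots left, 0 when a flush ran */
	static int batch_add(struct batchvec *bv, void *item,
			     void (*flush)(void **, int))
	{
		bv->items[bv->nr++] = item;
		if (bv->nr == BATCH) {
			flush(bv->items, bv->nr);	/* e.g. drop all refs */
			bv->nr = 0;
			return 0;
		}
		return BATCH - bv->nr;
	}
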
+
+#ifdef CONFIG_MIGRATION
+static inline void move_to_lru(struct page *page)
+{
+	list_del(&page->lru);
+	if (PageActive(page)) {
+		/*
+		 * lru_cache_add_active checks that
+		 * the PG_active bit is off.
+		 */
+		ClearPageActive(page);
+		lru_cache_add_active(page);
+	} else {
+		lru_cache_add(page);
+	}
+	put_page(page);
+}
+
+/*
+ * Add isolated pages on the list back to the LRU.
+ *
+ * returns the number of pages put back.
+ */
+unsigned long putback_lru_pages(struct list_head *l)
+{
+	struct page *page;
+	struct page *page2;
+	unsigned long count = 0;
+
+	list_for_each_entry_safe(page, page2, l, lru) {
+		move_to_lru(page);
+		count++;
+	}
+	return count;
+}
+
+/*
+ * Non-migratable page
+ */
+int fail_migrate_page(struct page *newpage, struct page *page)
+{
+	return -EIO;
+}
+EXPORT_SYMBOL(fail_migrate_page);
+
+/*
+ * swapout a single page
+ * page is locked upon entry, unlocked on exit
+ */
+static int swap_page(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+
+	if (page_mapped(page) && mapping)
+		if (try_to_unmap(page, 1) != SWAP_SUCCESS)
+			goto unlock_retry;
+
+	if (PageDirty(page)) {
+		/* Page is dirty, try to write it out here */
+		switch(pageout(page, mapping)) {
+		case PAGE_KEEP:
+		case PAGE_ACTIVATE:
+			goto unlock_retry;
+
+		case PAGE_SUCCESS:
+			goto retry;
+
+		case PAGE_CLEAN:
+			; /* try to free the page below */
+		}
+	}
+
+	if (PagePrivate(page)) {
+		if (!try_to_release_page(page, GFP_KERNEL) ||
+		    (!mapping && page_count(page) == 1))
+			goto unlock_retry;
+	}
+
+	if (remove_mapping(mapping, page)) {
+		/* Success */
+		unlock_page(page);
+		return 0;
+	}
+
+unlock_retry:
+	unlock_page(page);
+
+retry:
+	return -EAGAIN;
+}
+EXPORT_SYMBOL(swap_page);
+
+/*
+ * Page migration was first developed in the context of the memory hotplug
+ * project. The main authors of the migration code are:
+ *
+ * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
+ * Hirokazu Takahashi <taka@valinux.co.jp>
+ * Dave Hansen <haveblue@us.ibm.com>
+ * Christoph Lameter <clameter@sgi.com>
+ */
+
+/*
+ * Remove references for a page and establish the new page with the correct
+ * basic settings to be able to stop accesses to the page.
+ */
+int migrate_page_remove_references(struct page *newpage,
+				struct page *page, int nr_refs)
+{
+	struct address_space *mapping = page_mapping(page);
+	struct page **radix_pointer;
+
+	/*
+	 * Avoid doing any of the following work if the page count
+	 * indicates that the page is in use or truncate has removed
+	 * the page.
+	 */
+	if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
+		return -EAGAIN;
+
+	/*
+	 * Establish swap ptes for anonymous pages or destroy pte
+	 * maps for files.
+	 *
+	 * In order to reestablish file backed mappings the fault handlers
+	 * will take the radix tree_lock which may then be used to stop
+	 * processes from accessing this page until the new page is ready.
+	 *
+	 * A process accessing via a swap pte (an anonymous page) will take a
+	 * page_lock on the old page which will block the process until the
+	 * migration attempt is complete. At that time the PageSwapCache bit
+	 * will be examined. If the page was migrated then the PageSwapCache
+	 * bit will be clear and the operation to retrieve the page will be
+	 * retried which will find the new page in the radix tree. Then a new
+	 * direct mapping may be generated based on the radix tree contents.
+	 *
+	 * If the page was not migrated then the PageSwapCache bit
+	 * is still set and the operation may continue.
+	 */
+	if (try_to_unmap(page, 1) == SWAP_FAIL)
+		/* A vma has VM_LOCKED set -> Permanent failure */
+		return -EPERM;
+
+	/*
+	 * Give up if we were unable to remove all mappings.
+	 */
+	if (page_mapcount(page))
+		return -EAGAIN;
+
+	write_lock_irq(&mapping->tree_lock);
+
+	radix_pointer = (struct page **)radix_tree_lookup_slot(
+						&mapping->page_tree,
+						page_index(page));
+
+	if (!page_mapping(page) || page_count(page) != nr_refs ||
+			*radix_pointer != page) {
+		write_unlock_irq(&mapping->tree_lock);
+		return -EAGAIN;
+	}
+
+	/*
+	 * Now we know that no one else is looking at the page.
+	 *
+	 * Certain minimal information about a page must be available
+	 * in order for other subsystems to properly handle the page if they
+	 * find it through the radix tree update before we are finished
+	 * copying the page.
+	 */
+	get_page(newpage);
+	newpage->index = page->index;
+	newpage->mapping = page->mapping;
+	if (PageSwapCache(page)) {
+		SetPageSwapCache(newpage);
+		set_page_private(newpage, page_private(page));
+	}
+
+	*radix_pointer = newpage;
+	__put_page(page);
+	write_unlock_irq(&mapping->tree_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(migrate_page_remove_references);
+
+/*
+ * Copy the page to its new location
+ */
+void migrate_page_copy(struct page *newpage, struct page *page)
+{
+	copy_highpage(newpage, page);
+
+	if (PageError(page))
+		SetPageError(newpage);
+	if (PageReferenced(page))
+		SetPageReferenced(newpage);
+	if (PageUptodate(page))
+		SetPageUptodate(newpage);
+	if (PageActive(page))
+		SetPageActive(newpage);
+	if (PageChecked(page))
+		SetPageChecked(newpage);
+	if (PageMappedToDisk(page))
+		SetPageMappedToDisk(newpage);
+
+	if (PageDirty(page)) {
+		clear_page_dirty_for_io(page);
+		set_page_dirty(newpage);
+ 	}
+
+	ClearPageSwapCache(page);
+	ClearPageActive(page);
+	ClearPagePrivate(page);
+	set_page_private(page, 0);
+	page->mapping = NULL;
+
+	/*
+	 * If any waiters have accumulated on the new page then
+	 * wake them up.
+	 */
+	if (PageWriteback(newpage))
+		end_page_writeback(newpage);
+}
+EXPORT_SYMBOL(migrate_page_copy);
+
+/*
+ * Common logic to directly migrate a single page suitable for
+ * pages that do not use PagePrivate.
+ *
+ * Pages are locked upon entry and exit.
+ */
+int migrate_page(struct page *newpage, struct page *page)
+{
+	int rc;
+
+	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
+
+	rc = migrate_page_remove_references(newpage, page, 2);
+
+	if (rc)
+		return rc;
+
+	migrate_page_copy(newpage, page);
+
+	/*
+	 * Remove auxiliary swap entries and replace
+	 * them with real ptes.
+	 *
+	 * Note that a real pte entry will allow processes that are not
+	 * waiting on the page lock to use the new page via the page tables
+	 * before the new page is unlocked.
+	 */
+	remove_from_swap(newpage);
+	return 0;
+}
+EXPORT_SYMBOL(migrate_page);
+
+/*
+ * migrate_pages
+ *
+ * Two lists are passed to this function. The first list
+ * contains the pages isolated from the LRU to be migrated.
+ * The second list contains new pages that the pages isolated
+ * can be moved to. If the second list is NULL then all
+ * pages are swapped out.
+ *
+ * The function returns after 10 attempts or if no pages
+ * are movable anymore because the "to" list has become empty
+ * or no retryable pages exist anymore.
+ *
+ * Return: Number of pages not migrated when "to" ran empty.
+ */
+unsigned long migrate_pages(struct list_head *from, struct list_head *to,
+		  struct list_head *moved, struct list_head *failed)
+{
+	unsigned long retry;
+	unsigned long nr_failed = 0;
+	int pass = 0;
+	struct page *page;
+	struct page *page2;
+	int swapwrite = current->flags & PF_SWAPWRITE;
+	int rc;
+
+	if (!swapwrite)
+		current->flags |= PF_SWAPWRITE;
+
+redo:
+	retry = 0;
+
+	list_for_each_entry_safe(page, page2, from, lru) {
+		struct page *newpage = NULL;
+		struct address_space *mapping;
+
+		cond_resched();
+
+		rc = 0;
+		if (page_count(page) == 1)
+			/* page was freed from under us. So we are done. */
+			goto next;
+
+		if (to && list_empty(to))
+			break;
+
+		/*
+		 * Skip locked pages during the first two passes to give the
+		 * functions holding the lock time to release the page. Later we
+		 * use lock_page() to have a higher chance of acquiring the
+		 * lock.
+		 */
+		rc = -EAGAIN;
+		if (pass > 2)
+			lock_page(page);
+		else
+			if (TestSetPageLocked(page))
+				goto next;
+
+		/*
+		 * Only wait on writeback if we have already done a pass where
+		 * we may have triggered writeouts for lots of pages.
+		 */
+		if (pass > 0) {
+			wait_on_page_writeback(page);
+		} else {
+			if (PageWriteback(page))
+				goto unlock_page;
+		}
+
+		/*
+		 * Anonymous pages must have swap cache references otherwise
+		 * the information contained in the page maps cannot be
+		 * preserved.
+		 */
+		if (PageAnon(page) && !PageSwapCache(page)) {
+			if (!add_to_swap(page, GFP_KERNEL)) {
+				rc = -ENOMEM;
+				goto unlock_page;
+			}
+		}
+
+		if (!to) {
+			rc = swap_page(page);
+			goto next;
+		}
+
+		newpage = lru_to_page(to);
+		lock_page(newpage);
+
+		/*
+		 * Pages are properly locked and writeback is complete.
+		 * Try to migrate the page.
+		 */
+		mapping = page_mapping(page);
+		if (!mapping)
+			goto unlock_both;
+
+		if (mapping->a_ops->migratepage) {
+			/*
+			 * Most pages have a mapping and most filesystems
+			 * should provide a migration function. Anonymous
+			 * pages are part of swap space which also has its
+			 * own migration function. This is the most common
+			 * path for page migration.
+			 */
+			rc = mapping->a_ops->migratepage(newpage, page);
+			goto unlock_both;
+		}
+
+		/*
+		 * Default handling if a filesystem does not provide
+		 * a migration function. We can only migrate clean
+		 * pages so try to write out any dirty pages first.
+		 */
+		if (PageDirty(page)) {
+			switch (pageout(page, mapping)) {
+			case PAGE_KEEP:
+			case PAGE_ACTIVATE:
+				goto unlock_both;
+
+			case PAGE_SUCCESS:
+				unlock_page(newpage);
+				goto next;
+
+			case PAGE_CLEAN:
+				; /* try to migrate the page below */
+			}
+		}
+
+		/*
+		 * Buffers are managed in a filesystem specific way.
+		 * We must have no buffers or drop them.
+		 */
+		if (!page_has_buffers(page) ||
+		    try_to_release_page(page, GFP_KERNEL)) {
+			rc = migrate_page(newpage, page);
+			goto unlock_both;
+		}
+
+		/*
+		 * On early passes, simply retry: a lock may be held on some
+		 * buffers and may go away later. On later passes, swap the
+		 * page out instead.
+		 */
+		if (pass > 4) {
+			/*
+			 * Persistently unable to drop buffers..... As a
+			 * measure of last resort we fall back to
+			 * swap_page().
+			 */
+			unlock_page(newpage);
+			newpage = NULL;
+			rc = swap_page(page);
+			goto next;
+		}
+
+unlock_both:
+		unlock_page(newpage);
+
+unlock_page:
+		unlock_page(page);
+
+next:
+		if (rc == -EAGAIN) {
+			retry++;
+		} else if (rc) {
+			/* Permanent failure */
+			list_move(&page->lru, failed);
+			nr_failed++;
+		} else {
+			if (newpage) {
+				/* Successful migration. Return page to LRU */
+				move_to_lru(newpage);
+			}
+			list_move(&page->lru, moved);
+		}
+	}
+	if (retry && pass++ < 10)
+		goto redo;
+
+	if (!swapwrite)
+		current->flags &= ~PF_SWAPWRITE;
+
+	return nr_failed + retry;
+}
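
A hedged sketch of the multi-pass retry discipline used by migrate_pages():
each pass revisits the remaining items, and the per-item handler is expected
to become more aggressive on later passes (blocking locks, waiting on
writeback). All names here are hypothetical.

	/* Returns the number of items still pending after the final pass. */
	static int process_with_passes(int (*try_one)(void *item, int pass),
				       void **items, int n)
	{
		int pass, retry = 0;

		for (pass = 0; pass < 10; pass++) {
			int i;

			retry = 0;
			for (i = 0; i < n; i++) {
				if (!items[i])
					continue;	/* already done */
				if (try_one(items[i], pass))
					retry++;	/* -EAGAIN equivalent */
				else
					items[i] = NULL;
			}
			if (!retry)
				break;
		}
		return retry;
	}
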
+
+/*
+ * Isolate one page from the LRU lists and put it on the
+ * indicated list with elevated refcount.
+ *
+ * Result:
+ *  0 = page not on LRU list
+ *  1 = page removed from LRU list and added to the specified list.
+ */
+int isolate_lru_page(struct page *page)
+{
+	int ret = 0;
+
+	if (PageLRU(page)) {
+		struct zone *zone = page_zone(page);
+		spin_lock_irq(&zone->lru_lock);
+		if (PageLRU(page)) {
+			ret = 1;
+			get_page(page);
+			ClearPageLRU(page);
+			if (PageActive(page))
+				del_page_from_active_list(zone, page);
+			else
+				del_page_from_inactive_list(zone, page);
+		}
+		spin_unlock_irq(&zone->lru_lock);
+	}
+
+	return ret;
+}
+#endif
+
+/*
+ * zone->lru_lock is heavily contended.  Some of the functions that
+ * shrink the lists perform better by taking out a batch of pages
+ * and working on them outside the LRU lock.
+ *
+ * For pagecache intensive workloads, this function is the hottest
+ * spot in the kernel (apart from copy_*_user functions).
+ *
+ * Appropriate locks must be held before calling this function.
+ *
+ * @nr_to_scan:	The number of pages to look through on the list.
+ * @src:	The LRU list to pull pages off.
+ * @dst:	The temp list to put pages on to.
+ * @scanned:	The number of pages that were scanned.
+ *
+ * returns how many pages were moved onto *@dst.
+ */
+static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
+		struct list_head *src, struct list_head *dst,
+		unsigned long *scanned)
+{
+	unsigned long nr_taken = 0;
+	struct page *page;
+	unsigned long scan;
+
+	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
+		struct list_head *target;
+		page = lru_to_page(src);
+		prefetchw_prev_lru_page(page, src, flags);
+
+		BUG_ON(!PageLRU(page));
+
+		list_del(&page->lru);
+		target = src;
+		if (likely(get_page_unless_zero(page))) {
+			/*
+			 * Be careful not to clear PageLRU until after we're
+			 * sure the page is not being freed elsewhere -- the
+			 * page release code relies on it.
+			 */
+			ClearPageLRU(page);
+			target = dst;
+			nr_taken++;
+		} /* else it is being freed elsewhere */
+
+		list_add(&page->lru, target);
+	}
+
+	*scanned = scan;
+	return nr_taken;
+}
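
The get_page_unless_zero() call above relies on a compare-and-swap idiom:
only take a reference if the count has not already dropped to zero (i.e.
the page is not being freed). A hedged userspace rendering with C11
atomics, not the kernel implementation:

	#include <stdatomic.h>

	static int get_ref_unless_zero(atomic_int *count)
	{
		int old = atomic_load(count);

		while (old != 0)
			if (atomic_compare_exchange_weak(count, &old, old + 1))
				return 1;	/* got a reference */
		return 0;			/* object is being freed */
	}
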
+
+/*
+ * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
+ * of reclaimed pages
+ */
+static unsigned long shrink_inactive_list(unsigned long max_scan,
+				struct zone *zone, struct scan_control *sc)
+{
+	LIST_HEAD(page_list);
+	struct pagevec pvec;
+	unsigned long nr_scanned = 0;
+	unsigned long nr_reclaimed = 0;
+
+	pagevec_init(&pvec, 1);
+
+	lru_add_drain();
+	spin_lock_irq(&zone->lru_lock);
+	do {
+		struct page *page;
+		unsigned long nr_taken;
+		unsigned long nr_scan;
+		unsigned long nr_freed;
+
+		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
+					     &zone->inactive_list,
+					     &page_list, &nr_scan);
+		zone->nr_inactive -= nr_taken;
+		zone->pages_scanned += nr_scan;
+		spin_unlock_irq(&zone->lru_lock);
+
+		nr_scanned += nr_scan;
+		nr_freed = shrink_page_list(&page_list, sc);
+		nr_reclaimed += nr_freed;
+		local_irq_disable();
+		if (current_is_kswapd()) {
+			__mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
+			__mod_page_state(kswapd_steal, nr_freed);
+		} else
+			__mod_page_state_zone(zone, pgscan_direct, nr_scan);
+		__mod_page_state_zone(zone, pgsteal, nr_freed);
+
+		if (nr_taken == 0)
+			goto done;
+
+		spin_lock(&zone->lru_lock);
+		/*
+		 * Put back any unfreeable pages.
+		 */
+		while (!list_empty(&page_list)) {
+			page = lru_to_page(&page_list);
+			BUG_ON(PageLRU(page));
+			SetPageLRU(page);
+			list_del(&page->lru);
+			if (PageActive(page))
+				add_page_to_active_list(zone, page);
+			else
+				add_page_to_inactive_list(zone, page);
+			if (!pagevec_add(&pvec, page)) {
+				spin_unlock_irq(&zone->lru_lock);
+				__pagevec_release(&pvec);
+				spin_lock_irq(&zone->lru_lock);
+			}
+		}
+  	} while (nr_scanned < max_scan);
+	spin_unlock(&zone->lru_lock);
+done:
+	local_irq_enable();
+	pagevec_release(&pvec);
+	return nr_reclaimed;
+}
+
+/*
+ * This moves pages from the active list to the inactive list.
+ *
+ * We move them the other way if the page is referenced by one or more
+ * processes, from rmap.
+ *
+ * If the pages are mostly unmapped, the processing is fast and it is
+ * appropriate to hold zone->lru_lock across the whole operation.  But if
+ * the pages are mapped, the processing is slow (page_referenced()) so we
+ * should drop zone->lru_lock around each page.  It's impossible to balance
+ * this, so instead we remove the pages from the LRU while processing them.
+ * It is safe to rely on PG_active against the non-LRU pages in here because
+ * nobody will play with that bit on a non-LRU page.
+ *
+ * The downside is that we have to touch page->_count against each page.
+ * But we had to alter page->flags anyway.
+ */
+static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
+				struct scan_control *sc)
+{
+	unsigned long pgmoved;
+	int pgdeactivate = 0;
+	unsigned long pgscanned;
+	LIST_HEAD(l_hold);	/* The pages which were snipped off */
+	LIST_HEAD(l_inactive);	/* Pages to go onto the inactive_list */
+	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
+	struct page *page;
+	struct pagevec pvec;
+	int reclaim_mapped = 0;
+
+	if (sc->may_swap) {
+		long mapped_ratio;
+		long distress;
+		long swap_tendency;
+
+		/*
+		 * `distress' is a measure of how much trouble we're having
+		 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
+		 */
+		distress = 100 >> zone->prev_priority;
+
+		/*
+		 * The point of this algorithm is to decide when to start
+		 * reclaiming mapped memory instead of just pagecache.  Work
+		 * out how much memory is mapped.
+		 */
+		mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+
+		/*
+		 * Now decide how strongly we want to unmap pages.  The
+		 * mapped ratio is downgraded - just because there's a lot of
+		 * mapped memory doesn't necessarily mean that page reclaim
+		 * isn't succeeding.
+		 *
+		 * The distress ratio is important - we don't want to start
+		 * going oom.
+		 *
+		 * A 100% value of vm_swappiness overrides this algorithm
+		 * altogether.
+		 */
+		swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+
+		/*
+		 * Now use this metric to decide whether to start moving mapped
+		 * memory onto the inactive list.
+		 */
+		if (swap_tendency >= 100)
+			reclaim_mapped = 1;
+	}
+
+	lru_add_drain();
+	spin_lock_irq(&zone->lru_lock);
+	pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
+				    &l_hold, &pgscanned);
+	zone->pages_scanned += pgscanned;
+	zone->nr_active -= pgmoved;
+	spin_unlock_irq(&zone->lru_lock);
+
+	while (!list_empty(&l_hold)) {
+		cond_resched();
+		page = lru_to_page(&l_hold);
+		list_del(&page->lru);
+		if (page_mapped(page)) {
+			if (!reclaim_mapped ||
+			    (total_swap_pages == 0 && PageAnon(page)) ||
+			    page_referenced(page, 0)) {
+				list_add(&page->lru, &l_active);
+				continue;
+			}
+		}
+		list_add(&page->lru, &l_inactive);
+	}
+
+	pagevec_init(&pvec, 1);
+	pgmoved = 0;
+	spin_lock_irq(&zone->lru_lock);
+	while (!list_empty(&l_inactive)) {
+		page = lru_to_page(&l_inactive);
+		prefetchw_prev_lru_page(page, &l_inactive, flags);
+		BUG_ON(PageLRU(page));
+		SetPageLRU(page);
+		BUG_ON(!PageActive(page));
+		ClearPageActive(page);
+
+		list_move(&page->lru, &zone->inactive_list);
+		pgmoved++;
+		if (!pagevec_add(&pvec, page)) {
+			zone->nr_inactive += pgmoved;
+			spin_unlock_irq(&zone->lru_lock);
+			pgdeactivate += pgmoved;
+			pgmoved = 0;
+			if (buffer_heads_over_limit)
+				pagevec_strip(&pvec);
+			__pagevec_release(&pvec);
+			spin_lock_irq(&zone->lru_lock);
+		}
+	}
+	zone->nr_inactive += pgmoved;
+	pgdeactivate += pgmoved;
+	if (buffer_heads_over_limit) {
+		spin_unlock_irq(&zone->lru_lock);
+		pagevec_strip(&pvec);
+		spin_lock_irq(&zone->lru_lock);
+	}
+
+	pgmoved = 0;
+	while (!list_empty(&l_active)) {
+		page = lru_to_page(&l_active);
+		prefetchw_prev_lru_page(page, &l_active, flags);
+		BUG_ON(PageLRU(page));
+		SetPageLRU(page);
+		BUG_ON(!PageActive(page));
+		list_move(&page->lru, &zone->active_list);
+		pgmoved++;
+		if (!pagevec_add(&pvec, page)) {
+			zone->nr_active += pgmoved;
+			pgmoved = 0;
+			spin_unlock_irq(&zone->lru_lock);
+			__pagevec_release(&pvec);
+			spin_lock_irq(&zone->lru_lock);
+		}
+	}
+	zone->nr_active += pgmoved;
+	spin_unlock(&zone->lru_lock);
+
+	__mod_page_state_zone(zone, pgrefill, pgscanned);
+	__mod_page_state(pgdeactivate, pgdeactivate);
+	local_irq_enable();
+
+	pagevec_release(&pvec);
+}
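
A worked example of the swap_tendency heuristic above, with made-up inputs:
mapped_ratio/2 + distress + vm_swappiness must reach 100 before mapped
pages are reclaimed.

	#include <stdio.h>

	int main(void)
	{
		int vm_swappiness = 60;			/* the default */
		int prev_priority = 6;			/* little trouble so far */
		long distress = 100 >> prev_priority;	/* == 1 */
		long mapped_ratio = 50;			/* half of memory mapped */
		long swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;

		/* 25 + 1 + 60 = 86 < 100: leave mapped pages alone for now */
		printf("tendency %ld -> %s mapped pages\n", swap_tendency,
		       swap_tendency >= 100 ? "reclaim" : "spare");
		return 0;
	}
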
+
+/*
+ * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
+ */
+static unsigned long shrink_zone(int priority, struct zone *zone,
+				struct scan_control *sc)
+{
+	unsigned long nr_active;
+	unsigned long nr_inactive;
+	unsigned long nr_to_scan;
+	unsigned long nr_reclaimed = 0;
+
+	atomic_inc(&zone->reclaim_in_progress);
+
+	/*
+	 * Add one to `nr_to_scan' just to make sure that the kernel will
+	 * slowly sift through the active list.
+	 */
+	zone->nr_scan_active += (zone->nr_active >> priority) + 1;
+	nr_active = zone->nr_scan_active;
+	if (nr_active >= sc->swap_cluster_max)
+		zone->nr_scan_active = 0;
+	else
+		nr_active = 0;
+
+	zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
+	nr_inactive = zone->nr_scan_inactive;
+	if (nr_inactive >= sc->swap_cluster_max)
+		zone->nr_scan_inactive = 0;
+	else
+		nr_inactive = 0;
+
+	while (nr_active || nr_inactive) {
+		if (nr_active) {
+			nr_to_scan = min(nr_active,
+					(unsigned long)sc->swap_cluster_max);
+			nr_active -= nr_to_scan;
+			shrink_active_list(nr_to_scan, zone, sc);
+		}
+
+		if (nr_inactive) {
+			nr_to_scan = min(nr_inactive,
+					(unsigned long)sc->swap_cluster_max);
+			nr_inactive -= nr_to_scan;
+			nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
+								sc);
+		}
+	}
+
+	throttle_vm_writeout();
+
+	atomic_dec(&zone->reclaim_in_progress);
+	return nr_reclaimed;
+}
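
The nr_scan_active/nr_scan_inactive bookkeeping above batches scan work. A
hedged sketch of the idea (hypothetical helper): work accumulates across
calls and is only handed out once a full batch is available, so small zones
still get scanned eventually under light pressure.

	static unsigned long batch_scan_work(unsigned long *pending,
					     unsigned long lru_size,
					     int priority, unsigned long batch)
	{
		unsigned long todo;

		*pending += (lru_size >> priority) + 1;
		if (*pending < batch)
			return 0;	/* keep accumulating */
		todo = *pending;
		*pending = 0;
		return todo;
	}
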
+
+/*
+ * This is the direct reclaim path, for page-allocating processes.  We only
+ * try to reclaim pages from zones which will satisfy the caller's allocation
+ * request.
+ *
+ * We reclaim from a zone even if that zone is over pages_high.  Because:
+ * a) The caller may be trying to free *extra* pages to satisfy a higher-order
+ *    allocation or
+ * b) The zones may be over pages_high but they must go *over* pages_high to
+ *    satisfy the `incremental min' zone defense algorithm.
+ *
+ * Returns the number of reclaimed pages.
+ *
+ * If a zone is deemed to be full of pinned pages then just give it a light
+ * scan then give up on it.
+ */
+static unsigned long shrink_zones(int priority, struct zone **zones,
+					struct scan_control *sc)
+{
+	unsigned long nr_reclaimed = 0;
+	int i;
+
+	for (i = 0; zones[i] != NULL; i++) {
+		struct zone *zone = zones[i];
+
+		if (!populated_zone(zone))
+			continue;
+
+		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+			continue;
+
+		zone->temp_priority = priority;
+		if (zone->prev_priority > priority)
+			zone->prev_priority = priority;
+
+		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+			continue;	/* Let kswapd poll it */
+
+		nr_reclaimed += shrink_zone(priority, zone, sc);
+	}
+	return nr_reclaimed;
+}
+ 
+/*
+ * This is the main entry point to direct page reclaim.
+ *
+ * If a full scan of the inactive list fails to free enough memory then we
+ * are "out of memory" and something needs to be killed.
+ *
+ * If the caller is !__GFP_FS then the probability of a failure is reasonably
+ * high - the zone may be full of dirty or under-writeback pages, which this
+ * caller can't do much about.  We kick pdflush and take explicit naps in the
+ * hope that some of these pages can be written.  But if the allocating task
+ * holds filesystem locks which prevent writeout this might not work, and the
+ * allocation attempt will fail.
+ */
+unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
+{
+	int priority;
+	int ret = 0;
+	unsigned long total_scanned = 0;
+	unsigned long nr_reclaimed = 0;
+	struct reclaim_state *reclaim_state = current->reclaim_state;
+	unsigned long lru_pages = 0;
+	int i;
+	struct scan_control sc = {
+		.gfp_mask = gfp_mask,
+		.may_writepage = !laptop_mode,
+		.swap_cluster_max = SWAP_CLUSTER_MAX,
+		.may_swap = 1,
+	};
+
+	delay_swap_prefetch();
+
+	inc_page_state(allocstall);
+
+	for (i = 0; zones[i] != NULL; i++) {
+		struct zone *zone = zones[i];
+
+		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+			continue;
+
+		zone->temp_priority = DEF_PRIORITY;
+		lru_pages += zone->nr_active + zone->nr_inactive;
+	}
+
+	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+		sc.nr_mapped = read_page_state(nr_mapped);
+		sc.nr_scanned = 0;
+		if (!priority)
+			disable_swap_token();
+		nr_reclaimed += shrink_zones(priority, zones, &sc);
+		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
+		if (reclaim_state) {
+			nr_reclaimed += reclaim_state->reclaimed_slab;
+			reclaim_state->reclaimed_slab = 0;
+		}
+		total_scanned += sc.nr_scanned;
+		if (nr_reclaimed >= sc.swap_cluster_max) {
+			ret = 1;
+			goto out;
+		}
+
+		/*
+		 * Try to write back as many pages as we just scanned.  This
+		 * tends to cause slow streaming writers to write data to the
+		 * disk smoothly, at the dirtying rate, which is nice.   But
+		 * that's undesirable in laptop mode, where we *want* lumpy
+		 * writeout.  So in laptop mode, write out the whole world.
+		 */
+		if (total_scanned > sc.swap_cluster_max +
+					sc.swap_cluster_max / 2) {
+			wakeup_pdflush(laptop_mode ? 0 : total_scanned);
+			sc.may_writepage = 1;
+		}
+
+		/* Take a nap, wait for some writeback to complete */
+		if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
+			blk_congestion_wait(WRITE, HZ/10);
+	}
+out:
+	for (i = 0; zones[i] != 0; i++) {
+		struct zone *zone = zones[i];
+
+		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+			continue;
+
+		zone->prev_priority = zone->temp_priority;
+	}
+	return ret;
+}
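
The priority loop above ramps up scan intensity exponentially. A hedged
illustration (assuming DEF_PRIORITY is 12, as in kernels of this vintage,
and a made-up LRU size):

	#include <stdio.h>

	int main(void)
	{
		unsigned long lru_size = 1UL << 20;	/* 1M pages, hypothetical */
		int priority;

		for (priority = 12; priority >= 0; priority--)
			printf("priority %2d: scan ~%lu pages per zone\n",
			       priority, (lru_size >> priority) + 1);
		return 0;
	}
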
+
+/*
+ * For kswapd, balance_pgdat() will work across all this node's zones until
+ * they are all at pages_high.
+ *
+ * If `nr_pages' is non-zero then it is the number of pages which are to be
+ * reclaimed, regardless of the zone occupancies.  This is a software suspend
+ * special.
+ *
+ * Returns the number of pages which were actually freed.
+ *
+ * There is special handling here for zones which are full of pinned pages.
+ * This can happen if the pages are all mlocked, or if they are all used by
+ * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
+ * What we do is to detect the case where all pages in the zone have been
+ * scanned twice and there has been zero successful reclaim.  Mark the zone as
+ * dead and from now on, only perform a short scan.  Basically we're polling
+ * the zone for when the problem goes away.
+ *
+ * kswapd scans the zones in the highmem->normal->dma direction.  It skips
+ * zones which have free_pages > pages_high, but once a zone is found to have
+ * free_pages <= pages_high, we scan that zone and the lower zones regardless
+ * of the number of free pages in the lower zones.  This interoperates with
+ * the page allocator fallback scheme to ensure that aging of pages is balanced
+ * across the zones.
+ */
+static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
+				int order)
+{
+	unsigned long to_free = nr_pages;
+	int all_zones_ok;
+	int priority;
+	int i;
+	unsigned long total_scanned;
+	unsigned long nr_reclaimed;
+	struct reclaim_state *reclaim_state = current->reclaim_state;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.may_swap = 1,
+		.swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX,
+	};
+
+loop_again:
+	total_scanned = 0;
+	nr_reclaimed = 0;
+	sc.may_writepage = !laptop_mode;
+	sc.nr_mapped = read_page_state(nr_mapped);
+
+	inc_page_state(pageoutrun);
+
+	for (i = 0; i < pgdat->nr_zones; i++) {
+		struct zone *zone = pgdat->node_zones + i;
+
+		zone->temp_priority = DEF_PRIORITY;
+	}
+
+	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+		int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
+		unsigned long lru_pages = 0;
+
+		/* The swap token gets in the way of swapout... */
+		if (!priority)
+			disable_swap_token();
+
+		all_zones_ok = 1;
+
+		if (nr_pages == 0) {
+			/*
+			 * Scan in the highmem->dma direction for the highest
+			 * zone which needs scanning
+			 */
+			for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+				struct zone *zone = pgdat->node_zones + i;
+
+				if (!populated_zone(zone))
+					continue;
+
+				if (zone->all_unreclaimable &&
+						priority != DEF_PRIORITY)
+					continue;
+
+				if (!zone_watermark_ok(zone, order,
+						zone->pages_high, 0, 0)) {
+					end_zone = i;
+					goto scan;
+				}
+			}
+			goto out;
+		} else {
+			end_zone = pgdat->nr_zones - 1;
+		}
+scan:
+		for (i = 0; i <= end_zone; i++) {
+			struct zone *zone = pgdat->node_zones + i;
+
+			lru_pages += zone->nr_active + zone->nr_inactive;
+		}
+
+		/*
+		 * Now scan the zone in the dma->highmem direction, stopping
+		 * at the last zone which needs scanning.
+		 *
+		 * We do this because the page allocator works in the opposite
+		 * direction.  This prevents the page allocator from allocating
+		 * pages behind kswapd's direction of progress, which would
+		 * cause too much scanning of the lower zones.
+		 */
+		for (i = 0; i <= end_zone; i++) {
+			struct zone *zone = pgdat->node_zones + i;
+			int nr_slab;
+
+			if (!populated_zone(zone))
+				continue;
+
+			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+				continue;
+
+			if (nr_pages == 0) {	/* Not software suspend */
+				if (!zone_watermark_ok(zone, order,
+						zone->pages_high, end_zone, 0))
+					all_zones_ok = 0;
+			}
+			zone->temp_priority = priority;
+			if (zone->prev_priority > priority)
+				zone->prev_priority = priority;
+			sc.nr_scanned = 0;
+			nr_reclaimed += shrink_zone(priority, zone, &sc);
+			reclaim_state->reclaimed_slab = 0;
+			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
+						lru_pages);
+			nr_reclaimed += reclaim_state->reclaimed_slab;
+			total_scanned += sc.nr_scanned;
+			if (zone->all_unreclaimable)
+				continue;
+			if (nr_slab == 0 && zone->pages_scanned >=
+				    (zone->nr_active + zone->nr_inactive) * 4)
+				zone->all_unreclaimable = 1;
+			/*
+			 * If we've done a decent amount of scanning and
+			 * the reclaim ratio is low, start doing writepage
+			 * even in laptop mode
+			 */
+			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
+			    total_scanned > nr_reclaimed + nr_reclaimed / 2)
+				sc.may_writepage = 1;
+		}
+		if (nr_pages && to_free > nr_reclaimed)
+			continue;	/* swsusp: need to do more work */
+		if (all_zones_ok)
+			break;		/* kswapd: all done */
+		/*
+		 * OK, kswapd is getting into trouble.  Take a nap, then take
+		 * another pass across the zones.
+		 */
+		if (total_scanned && priority < DEF_PRIORITY - 2)
+			blk_congestion_wait(WRITE, HZ/10);
+
+		/*
+		 * We do this so kswapd doesn't build up large priorities for
+		 * example when it is freeing in parallel with allocators. It
+		 * matches the direct reclaim path behaviour in terms of impact
+		 * on zone->*_priority.
+		 */
+		if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages)
+			break;
+	}
+out:
+	for (i = 0; i < pgdat->nr_zones; i++) {
+		struct zone *zone = pgdat->node_zones + i;
+
+		zone->prev_priority = zone->temp_priority;
+	}
+	if (!all_zones_ok) {
+		cond_resched();
+		goto loop_again;
+	}
+
+	return nr_reclaimed;
+}
+
+/*
+ * The background pageout daemon, started as a kernel thread
+ * from the init process. 
+ *
+ * This basically trickles out pages so that we have _some_
+ * free memory available even if there is no other activity
+ * that frees anything up. This is needed for things like routing
+ * etc, where we otherwise might have all activity going on in
+ * asynchronous contexts that cannot page things out.
+ *
+ * If there are applications that are active memory-allocators
+ * (most normal use), this basically shouldn't matter.
+ */
+static int kswapd(void *p)
+{
+	unsigned long order;
+	pg_data_t *pgdat = (pg_data_t*)p;
+	struct task_struct *tsk = current;
+	DEFINE_WAIT(wait);
+	struct reclaim_state reclaim_state = {
+		.reclaimed_slab = 0,
+	};
+	cpumask_t cpumask;
+
+	daemonize("kswapd%d", pgdat->node_id);
+	cpumask = node_to_cpumask(pgdat->node_id);
+	if (!cpus_empty(cpumask))
+		set_cpus_allowed(tsk, cpumask);
+	current->reclaim_state = &reclaim_state;
+
+	/*
+	 * Tell the memory management that we're a "memory allocator",
+	 * and that if we need more memory we should get access to it
+	 * regardless (see "__alloc_pages()"). "kswapd" should
+	 * never get caught in the normal page freeing logic.
+	 *
+	 * (Kswapd normally doesn't need memory anyway, but sometimes
+	 * you need a small amount of memory in order to be able to
+	 * page out something else, and this flag essentially protects
+	 * us from recursively trying to free more memory as we're
+	 * trying to free the first piece of memory in the first place).
+	 */
+	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+
+	order = 0;
+	for ( ; ; ) {
+		unsigned long new_order;
+
+		try_to_freeze();
+
+		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+		new_order = pgdat->kswapd_max_order;
+		pgdat->kswapd_max_order = 0;
+		if (order < new_order) {
+			/*
+			 * Don't sleep if someone wants a larger 'order'
+			 * allocation
+			 */
+			order = new_order;
+		} else {
+			schedule();
+			order = pgdat->kswapd_max_order;
+		}
+		finish_wait(&pgdat->kswapd_wait, &wait);
+
+		balance_pgdat(pgdat, 0, order);
+	}
+	return 0;
+}
+
+/*
+ * A zone is low on free memory, so wake its kswapd task to service it.
+ */
+void wakeup_kswapd(struct zone *zone, int order)
+{
+	pg_data_t *pgdat;
+
+	if (!populated_zone(zone))
+		return;
+
+	pgdat = zone->zone_pgdat;
+	if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
+		return;
+	if (pgdat->kswapd_max_order < order)
+		pgdat->kswapd_max_order = order;
+	if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+		return;
+	if (!waitqueue_active(&pgdat->kswapd_wait))
+		return;
+	wake_up_interruptible(&pgdat->kswapd_wait);
+}
+
+#ifdef CONFIG_PM
+/*
+ * Try to free `nr_pages' of memory, system-wide.  Returns the number of freed
+ * pages.
+ */
+unsigned long shrink_all_memory(unsigned long nr_pages)
+{
+	pg_data_t *pgdat;
+	unsigned long nr_to_free = nr_pages;
+	unsigned long ret = 0;
+	unsigned retry = 2;
+	struct reclaim_state reclaim_state = {
+		.reclaimed_slab = 0,
+	};
+
+	delay_swap_prefetch();
+
+	current->reclaim_state = &reclaim_state;
+repeat:
+	for_each_online_pgdat(pgdat) {
+		unsigned long freed;
+
+		freed = balance_pgdat(pgdat, nr_to_free, 0);
+		ret += freed;
+		nr_to_free -= freed;
+		if ((long)nr_to_free <= 0)
+			break;
+	}
+	if (retry-- && ret < nr_pages) {
+		blk_congestion_wait(WRITE, HZ/5);
+		goto repeat;
+	}
+	current->reclaim_state = NULL;
+	return ret;
+}
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+/* It's optimal to keep kswapds on the same CPUs as their memory, but
+   not required for correctness.  So if the last cpu in a node goes
+   away, we get changed to run anywhere: as the first one comes back,
+   restore their cpu bindings. */
+static int __devinit cpu_callback(struct notifier_block *nfb,
+				  unsigned long action, void *hcpu)
+{
+	pg_data_t *pgdat;
+	cpumask_t mask;
+
+	if (action == CPU_ONLINE) {
+		for_each_online_pgdat(pgdat) {
+			mask = node_to_cpumask(pgdat->node_id);
+			if (any_online_cpu(mask) != NR_CPUS)
+				/* One of our CPUs online: restore mask */
+				set_cpus_allowed(pgdat->kswapd, mask);
+		}
+	}
+	return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static int __init kswapd_init(void)
+{
+	pg_data_t *pgdat;
+
+	swap_setup();
+	for_each_online_pgdat(pgdat) {
+		pid_t pid;
+
+		pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
+		BUG_ON(pid < 0);
+		pgdat->kswapd = find_task_by_pid(pid);
+	}
+	total_memory = nr_free_pagecache_pages();
+	hotcpu_notifier(cpu_callback, 0);
+	return 0;
+}
+
+module_init(kswapd_init)
+
+#ifdef CONFIG_NUMA
+/*
+ * Zone reclaim mode
+ *
+ * If non-zero call zone_reclaim when the number of free pages falls below
+ * the watermarks.
+ *
+ * In the future we may add flags to the mode. However, the page allocator
+ * should only have to check that zone_reclaim_mode != 0 before calling
+ * zone_reclaim().
+ */
+int zone_reclaim_mode __read_mostly;
+
+#define RECLAIM_OFF 0
+#define RECLAIM_ZONE (1<<0)	/* Run shrink_cache on the zone */
+#define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
+#define RECLAIM_SWAP (1<<2)	/* Swap pages out during reclaim */
+#define RECLAIM_SLAB (1<<3)	/* Do a global slab shrink if the zone is out of memory */
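
The RECLAIM_* values are bits of a single mask, so modes compose. A
hypothetical demonstration (values mirror the defines above):

	#include <stdio.h>

	int main(void)
	{
		int reclaim_zone = 1 << 0, reclaim_swap = 1 << 2;
		int mode = reclaim_zone | reclaim_swap;

		printf("zone_reclaim_mode = %d\n", mode);	/* prints 5 */
		return 0;
	}
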
+
+/*
+ * Minimum time between zone reclaim scans
+ */
+int zone_reclaim_interval __read_mostly = 30*HZ;
+
+/*
+ * Priority for ZONE_RECLAIM. This determines the fraction of a node's
+ * pages that is considered for each zone_reclaim call. A priority of 4
+ * scans 1/16th of a zone.
+ */
+#define ZONE_RECLAIM_PRIORITY 4
+
+/*
+ * Try to free up some pages from this zone through reclaim.
+ */
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+	/* Minimum pages needed in order to stay on node */
+	const unsigned long nr_pages = 1 << order;
+	struct task_struct *p = current;
+	struct reclaim_state reclaim_state;
+	cpumask_t mask;
+	int node_id;
+	int priority;
+	unsigned long nr_reclaimed = 0;
+	struct scan_control sc = {
+		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
+		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+		.nr_mapped = read_page_state(nr_mapped),
+		.swap_cluster_max = max_t(unsigned long, nr_pages,
+					SWAP_CLUSTER_MAX),
+		.gfp_mask = gfp_mask,
+	};
+
+	/*
+	 * Do not reclaim if there was a recent unsuccessful attempt at zone
+	 * reclaim.  In that case we let allocations go off node for the
+	 * zone_reclaim_interval.  Otherwise we would scan for each off-node
+	 * page allocation.
+	 */
+	if (time_before(jiffies,
+		zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
+			return 0;
+
+	/*
+	 * Avoid concurrent zone reclaims, do not reclaim in a zone that has
+	 * no reclaimable pages, and do not scan at all if the allocation
+	 * should not be delayed.
+	 */
+	if (!(gfp_mask & __GFP_WAIT) ||
+		zone->all_unreclaimable ||
+		atomic_read(&zone->reclaim_in_progress) > 0)
+			return 0;
+
+	node_id = zone->zone_pgdat->node_id;
+	mask = node_to_cpumask(node_id);
+	if (!cpus_empty(mask) && node_id != numa_node_id())
+		return 0;
+
+	disable_swap_token();
+
+	cond_resched();
+	/*
+	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
+	 * and we also need to be able to write out pages for RECLAIM_WRITE
+	 * and RECLAIM_SWAP.
+	 */
+	p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
+
+	/*
+	 * Free memory by calling shrink zone with increasing priorities
+	 * until we have enough memory freed.
+	 */
+	priority = ZONE_RECLAIM_PRIORITY;
+	do {
+		nr_reclaimed += shrink_zone(priority, zone, &sc);
+		priority--;
+	} while (priority >= 0 && nr_reclaimed < nr_pages);
+
+	if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
+		/*
+		 * shrink_slab() does not currently allow us to determine how
+		 * many pages were freed in this zone. So we just shake the slab
+		 * a bit and then go off node for this particular allocation
+		 * despite possibly having freed enough memory to allocate in
+		 * this zone.  If we freed local memory then the next
+		 * allocations will be local again.
+		 *
+		 * shrink_slab will free memory on all zones and may take
+		 * a long time.
+		 */
+		shrink_slab(sc.nr_scanned, gfp_mask, order);
+	}
+
+	p->reclaim_state = NULL;
+	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
+
+	if (nr_reclaimed == 0) {
+		/*
+		 * We were unable to reclaim enough pages to stay on node.  We
+		 * now allow off node accesses for a certain time period before
+		 * trying again to reclaim pages from the local zone.
+		 */
+		zone->last_unsuccessful_zone_reclaim = jiffies;
+	}
+
+	return nr_reclaimed >= nr_pages;
+}
+#endif
