From 6fd592daae2182adc47f405e20d07f34f52d07dd Mon Sep 17 00:00:00 2001
From: "Carlos R. Mafra" <crmafra2@gmail.com>
Date: Mon, 5 May 2008 20:11:22 -0300
Subject: x86: clean up computation of HPET .mult variables

While reading through the HPET code I realized that the
computation of .mult variables could be done with less
lines of code, resulting in a 1.6% text size saving
for hpet.o

So I propose the following patch, which applies against
today's Linus -git tree.

>From 0c6507e400e9ca5f7f14331e18f8c12baf75a9d3 Mon Sep 17 00:00:00 2001
From: Carlos R. Mafra <crmafra@ift.unesp.br>
Date: Mon, 5 May 2008 19:38:53 -0300

The computation of clocksource_hpet.mult

       tmp = (u64)hpet_period << HPET_SHIFT;
       do_div(tmp, FSEC_PER_NSEC);
       clocksource_hpet.mult = (u32)tmp;

can be streamlined if we note that it is equal to

       clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT);

Furthermore, the computation of hpet_clockevent.mult

       uint64_t hpet_freq;

       hpet_freq = 1000000000000000ULL;
       do_div(hpet_freq, hpet_period);
       hpet_clockevent.mult = div_sc((unsigned long) hpet_freq,
                                     NSEC_PER_SEC, hpet_clockevent.shift);

can also be streamlined with the observation that hpet_period and hpet_freq are
inverse to each other (in proper units).

So instead of computing hpet_freq and using (schematically)
div_sc(hpet_freq, 10^9, shift) we use the trick of calling with the
arguments in reverse order, div_sc(10^6, hpet_period, shift).

The different power of ten is due to frequency being in Hertz (1/sec)
and the period being in units of femtosecond. Explicitly,

mult = (hpet_freq * 2^shift)/10^9    (before)
mult = (10^6 * 2^shift)/hpet_period  (after)

because hpet_freq = 10^15/hpet_period.

The comments in the code are also updated to reflect the changes.

As a result,

   text    data     bss     dec     hex filename
   2957     425      92    3474     d92 arch/x86/kernel/hpet.o
   3006     425      92    3523     dc3 arch/x86/kernel/hpet.o.old

a 1.6% reduction in text size.

Signed-off-by: Carlos R. Mafra <crmafra@ift.unesp.br>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/hpet.c | 43 ++++++++++++++++++-------------------------
 1 file changed, 18 insertions(+), 25 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 9b5cfcdfc426..ea230ec69057 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -17,7 +17,7 @@
 
 /* FSEC = 10^-15
    NSEC = 10^-9 */
-#define FSEC_PER_NSEC	1000000
+#define FSEC_PER_NSEC	1000000L
 
 /*
  * HPET address is set in acpi/boot.c, when an ACPI entry exists
@@ -206,20 +206,19 @@ static void hpet_enable_legacy_int(void)
 
 static void hpet_legacy_clockevent_register(void)
 {
-	uint64_t hpet_freq;
-
 	/* Start HPET legacy interrupts */
 	hpet_enable_legacy_int();
 
 	/*
-	 * The period is a femto seconds value. We need to calculate the
-	 * scaled math multiplication factor for nanosecond to hpet tick
-	 * conversion.
+	 * The mult factor is defined as (include/linux/clockchips.h)
+	 *  mult/2^shift = cyc/ns (in contrast to ns/cyc in clocksource.h)
+	 * hpet_period is in units of femtoseconds (per cycle), so
+	 *  mult/2^shift = cyc/ns = 10^6/hpet_period
+	 *  mult = (10^6 * 2^shift)/hpet_period
+	 *  mult = (FSEC_PER_NSEC << hpet_clockevent.shift)/hpet_period
 	 */
-	hpet_freq = 1000000000000000ULL;
-	do_div(hpet_freq, hpet_period);
-	hpet_clockevent.mult = div_sc((unsigned long) hpet_freq,
-				      NSEC_PER_SEC, hpet_clockevent.shift);
+	hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC,
+				      hpet_period, hpet_clockevent.shift);
 	/* Calculate the min / max delta */
 	hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
 							   &hpet_clockevent);
@@ -324,7 +323,7 @@ static struct clocksource clocksource_hpet = {
 
 static int hpet_clocksource_register(void)
 {
-	u64 tmp, start, now;
+	u64 start, now;
 	cycle_t t1;
 
 	/* Start the counter */
@@ -351,21 +350,15 @@ static int hpet_clocksource_register(void)
 		return -ENODEV;
 	}
 
-	/* Initialize and register HPET clocksource
-	 *
-	 * hpet period is in femto seconds per cycle
-	 * so we need to convert this to ns/cyc units
-	 * approximated by mult/2^shift
-	 *
-	 *  fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
-	 *  fsec/cyc * 1ns/1000000fsec * 2^shift = mult
-	 *  fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
-	 *  (fsec/cyc << shift)/1000000 = mult
-	 *  (hpet_period << shift)/FSEC_PER_NSEC = mult
+	/*
+	 * The definition of mult is (include/linux/clocksource.h)
+	 * mult/2^shift = ns/cyc and hpet_period is in units of fsec/cyc
+	 * so we first need to convert hpet_period to ns/cyc units:
+	 *  mult/2^shift = ns/cyc = hpet_period/10^6
+	 *  mult = (hpet_period * 2^shift)/10^6
+	 *  mult = (hpet_period << shift)/FSEC_PER_NSEC
 	 */
-	tmp = (u64)hpet_period << HPET_SHIFT;
-	do_div(tmp, FSEC_PER_NSEC);
-	clocksource_hpet.mult = (u32)tmp;
+	clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT);
 
 	clocksource_register(&clocksource_hpet);
 
-- 
cgit v1.2.3


From e8aa4667baf74dfd85fbaab86861465acb811085 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Fri, 9 May 2008 11:49:11 +0200
Subject: x86: enable hpet=force for AMD SB400

Add quirk to allow forced usage of HPET on ATI SB400.
I stumbled over machines where HPET is enabled but not reported
by BIOS. This patch configures the HPET base address and makes
it known to the OS.

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/quirks.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index d89a648fe710..5fe6bd5cc4c7 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -65,6 +65,7 @@ static enum {
 	ICH_FORCE_HPET_RESUME,
 	VT8237_FORCE_HPET_RESUME,
 	NVIDIA_FORCE_HPET_RESUME,
+	ATI_FORCE_HPET_RESUME,
 } force_hpet_resume_type;
 
 static void __iomem *rcba_base;
@@ -330,6 +331,31 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
 			 vt8237_force_enable_hpet);
 
+static void ati_force_hpet_resume(void)
+{
+	pci_write_config_dword(cached_dev, 0x14, 0xfed00000);
+	printk(KERN_DEBUG "Force enabled HPET at resume\n");
+}
+
+static void ati_force_enable_hpet(struct pci_dev *dev)
+{
+	u32 uninitialized_var(val);
+
+	if (!hpet_force_user || hpet_address || force_hpet_address)
+		return;
+
+	pci_write_config_dword(dev, 0x14, 0xfed00000);
+	pci_read_config_dword(dev, 0x14, &val);
+	force_hpet_address = val;
+	force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
+	dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
+		   force_hpet_address);
+	cached_dev = dev;
+	return;
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
+			 ati_force_enable_hpet);
+
 /*
  * Undocumented chipset feature taken from LinuxBIOS.
  */
@@ -397,6 +423,9 @@ void force_hpet_resume(void)
 	case NVIDIA_FORCE_HPET_RESUME:
 		nvidia_force_hpet_resume();
 		return;
+	case ATI_FORCE_HPET_RESUME:
+		ati_force_hpet_resume();
+		return;
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From 7c4728f4a865067d96fb84f1d9c65e0ccd1f355d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 10 May 2008 21:42:14 +0200
Subject: x86: print info about available HPET quirk

We have a lot of HPET quirks available which might force enable HPET
even when the BIOS does not enable it. Some of those quirks depend on
the command line option "hpet=force".

Andrew pointed out that hoping that the user will find out about this
boot option is not really helpful.

Emit a kernel info which informs the user about the "hpet=force" boot
option when we enter a quirk which depends on this option and the user
did not provide it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/quirks.c | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 5fe6bd5cc4c7..ddbf34d16a01 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -175,6 +175,12 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
 
 static struct pci_dev *cached_dev;
 
+static void hpet_print_force_info(void)
+{
+	printk(KERN_INFO "HPET not enabled in BIOS. "
+	       "You might try hpet=force boot option\n");
+}
+
 static void old_ich_force_hpet_resume(void)
 {
 	u32 val;
@@ -254,6 +260,8 @@ static void old_ich_force_enable_hpet_user(struct pci_dev *dev)
 {
 	if (hpet_force_user)
 		old_ich_force_enable_hpet(dev);
+	else
+		hpet_print_force_info();
 }
 
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0,
@@ -291,8 +299,13 @@ static void vt8237_force_enable_hpet(struct pci_dev *dev)
 {
 	u32 uninitialized_var(val);
 
-	if (!hpet_force_user || hpet_address || force_hpet_address)
+	if (hpet_address || force_hpet_address)
+		return;
+
+	if (!hpet_force_user) {
+		hpet_print_force_info();
 		return;
+	}
 
 	pci_read_config_dword(dev, 0x68, &val);
 	/*
@@ -341,8 +354,13 @@ static void ati_force_enable_hpet(struct pci_dev *dev)
 {
 	u32 uninitialized_var(val);
 
-	if (!hpet_force_user || hpet_address || force_hpet_address)
+	if (hpet_address || force_hpet_address)
+		return;
+
+	if (!hpet_force_user) {
+		hpet_print_force_info();
 		return;
+	}
 
 	pci_write_config_dword(dev, 0x14, 0xfed00000);
 	pci_read_config_dword(dev, 0x14, &val);
@@ -369,8 +387,13 @@ static void nvidia_force_enable_hpet(struct pci_dev *dev)
 {
 	u32 uninitialized_var(val);
 
-	if (!hpet_force_user || hpet_address || force_hpet_address)
+	if (hpet_address || force_hpet_address)
+		return;
+
+	if (!hpet_force_user) {
+		hpet_print_force_info();
 		return;
+	}
 
 	pci_write_config_dword(dev, 0x44, 0xfed00001);
 	pci_read_config_dword(dev, 0x44, &val);
-- 
cgit v1.2.3


From 8edc5cc5ec880c96de8e6686fb0d7a5231e91c05 Mon Sep 17 00:00:00 2001
From: Venki Pallipadi <venkatesh.pallipadi@intel.com>
Date: Mon, 12 May 2008 15:43:34 +0200
Subject: x86: remove 6 bank limitation in 64 bit MCE reporting code

Eliminate the 6 bank restriction in 64 bit mce reporting code. This
restriction is artificial (due to static creation of sysfs files) and 32
bit code does not have any such restriction.

This change helps in reporting the details of machine checks on a
machine check exception with errors in bank 6 and above on CPUs that
support those banks. Without the patch, machine check errors in those
banks are not reported.

We still have 128 (MCE_EXTENDED_BANK) bank restriction instead of max
256 supported in hardware. That is not changed in the patch below as it
will have some user level mcelog utility dependency, with bank 128 being
used for thermal reporting currently.

The patch below does not create sysfs control (bankNctl) for banks
higher than 6 as well. That needs some pre-cleanup in /sysfs mce layout,
removal of per cpu /sysfs entries for bankctl as they are really global
system level control today. That change will follow. This basic change
is critical to report the detailed errors on banks higher than 6.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index e07e8c068ae0..f1f3f5e163b7 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -31,7 +31,7 @@
 #include <asm/idle.h>
 
 #define MISC_MCELOG_MINOR 227
-#define NR_BANKS 6
+#define NR_SYSFS_BANKS 6
 
 atomic_t mce_entry;
 
@@ -46,7 +46,7 @@ static int mce_dont_init;
  */
 static int tolerant = 1;
 static int banks;
-static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
+static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
 static unsigned long notify_user;
 static int rip_msr;
 static int mce_bootlog = -1;
@@ -209,7 +209,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	barrier();
 
 	for (i = 0; i < banks; i++) {
-		if (!bank[i])
+		if (i < NR_SYSFS_BANKS && !bank[i])
 			continue;
 
 		m.misc = 0;
@@ -444,9 +444,10 @@ static void mce_init(void *dummy)
 
 	rdmsrl(MSR_IA32_MCG_CAP, cap);
 	banks = cap & 0xff;
-	if (banks > NR_BANKS) {
-		printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
-		banks = NR_BANKS;
+	if (banks > MCE_EXTENDED_BANK) {
+		printk(KERN_INFO "MCE: warning: using only %d banks\n",
+		       MCE_EXTENDED_BANK);
+		banks = MCE_EXTENDED_BANK;
 	}
 	/* Use accurate RIP reporting if available. */
 	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
@@ -462,7 +463,7 @@ static void mce_init(void *dummy)
 		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 
 	for (i = 0; i < banks; i++) {
-		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
+		wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
 		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 	}
 }
@@ -766,7 +767,10 @@ DEFINE_PER_CPU(struct sys_device, device_mce);
 	}								\
 	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
 
-/* TBD should generate these dynamically based on number of available banks */
+/*
+ * TBD should generate these dynamically based on number of available banks.
+ * Have only 6 contol banks in /sysfs until then.
+ */
 ACCESSOR(bank0ctl,bank[0],mce_restart())
 ACCESSOR(bank1ctl,bank[1],mce_restart())
 ACCESSOR(bank2ctl,bank[2],mce_restart())
-- 
cgit v1.2.3


From 9b7dc567d03d74a1fbae84e88949b6a60d922d82 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 2 May 2008 20:10:09 +0200
Subject: x86: unify interrupt vector defines

The interrupt vector defines are copied 4 times around with minimal
differences. Move them all into asm-x86/irq_vectors.h

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_32.S                        |   2 +-
 arch/x86/kernel/vmiclock_32.c                     |   3 +-
 arch/x86/mach-visws/visws_apic.c                  |   3 +-
 include/asm-x86/hw_irq.h                          |  12 +-
 include/asm-x86/hw_irq_64.h                       |  71 ---------
 include/asm-x86/irq_32.h                          |   2 +-
 include/asm-x86/irq_64.h                          |  29 +---
 include/asm-x86/irq_vectors.h                     | 168 ++++++++++++++++++++++
 include/asm-x86/mach-default/irq_vectors.h        |  96 -------------
 include/asm-x86/mach-default/irq_vectors_limits.h |  16 ---
 include/asm-x86/mach-visws/irq_vectors.h          |  62 --------
 include/asm-x86/mach-voyager/irq_vectors.h        |  79 ----------
 12 files changed, 184 insertions(+), 359 deletions(-)
 create mode 100644 include/asm-x86/irq_vectors.h
 delete mode 100644 include/asm-x86/mach-default/irq_vectors.h
 delete mode 100644 include/asm-x86/mach-default/irq_vectors_limits.h
 delete mode 100644 include/asm-x86/mach-visws/irq_vectors.h
 delete mode 100644 include/asm-x86/mach-voyager/irq_vectors.h

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 2a609dc3271c..0c7f64b99e17 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -51,7 +51,7 @@
 #include <asm/percpu.h>
 #include <asm/dwarf2.h>
 #include <asm/processor-flags.h>
-#include "irq_vectors.h"
+#include <asm/irq_vectors.h>
 
 /*
  * We use macros for low-level operations which need to be overridden
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index a2b030780aa9..ba7d19e102b1 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -33,8 +33,7 @@
 #include <asm/apic.h>
 #include <asm/timer.h>
 #include <asm/i8253.h>
-
-#include <irq_vectors.h>
+#include <asm/irq_vectors.h>
 
 #define VMI_ONESHOT  (VMI_ALARM_IS_ONESHOT  | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
 #define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
diff --git a/arch/x86/mach-visws/visws_apic.c b/arch/x86/mach-visws/visws_apic.c
index cef9cb1d15ac..d8b2cfd85d92 100644
--- a/arch/x86/mach-visws/visws_apic.c
+++ b/arch/x86/mach-visws/visws_apic.c
@@ -21,10 +21,9 @@
 #include <asm/io.h>
 #include <asm/apic.h>
 #include <asm/i8259.h>
+#include <asm/irq_vectors.h>
 
 #include "cobalt.h"
-#include "irq_vectors.h"
-
 
 static DEFINE_SPINLOCK(cobalt_lock);
 
diff --git a/include/asm-x86/hw_irq.h b/include/asm-x86/hw_irq.h
index 38af08ed43cf..a8c5e8bdaa49 100644
--- a/include/asm-x86/hw_irq.h
+++ b/include/asm-x86/hw_irq.h
@@ -13,7 +13,7 @@
  * unified by tglx
  */
 
-#define NMI_VECTOR		0x02
+#include <asm/irq_vectors.h>
 
 #ifndef __ASSEMBLY__
 
@@ -75,6 +75,16 @@ extern void send_IPI(int dest, int vector);
 extern atomic_t irq_err_count;
 extern atomic_t irq_mis_count;
 
+/* Voyager functions */
+extern asmlinkage void vic_cpi_interrupt(void);
+extern asmlinkage void vic_sys_interrupt(void);
+extern asmlinkage void vic_cmn_interrupt(void);
+extern asmlinkage void qic_timer_interrupt(void);
+extern asmlinkage void qic_invalidate_interrupt(void);
+extern asmlinkage void qic_reschedule_interrupt(void);
+extern asmlinkage void qic_enable_irq_interrupt(void);
+extern asmlinkage void qic_call_function_interrupt(void);
+
 #endif /* !ASSEMBLY_ */
 
 #ifdef CONFIG_X86_32
diff --git a/include/asm-x86/hw_irq_64.h b/include/asm-x86/hw_irq_64.h
index 28674576e9fe..98c9d494a711 100644
--- a/include/asm-x86/hw_irq_64.h
+++ b/include/asm-x86/hw_irq_64.h
@@ -1,74 +1,3 @@
-/*
- * IDT vectors usable for external interrupt sources start
- * at 0x20:
- */
-#define FIRST_EXTERNAL_VECTOR	0x20
-
-#define IA32_SYSCALL_VECTOR	0x80
-
-
-/* Reserve the lowest usable priority level 0x20 - 0x2f for triggering
- * cleanup after irq migration.
- */
-#define IRQ_MOVE_CLEANUP_VECTOR	FIRST_EXTERNAL_VECTOR
-
-/*
- * Vectors 0x30-0x3f are used for ISA interrupts.
- */
-#define IRQ0_VECTOR		(FIRST_EXTERNAL_VECTOR + 0x10)
-#define IRQ1_VECTOR		(IRQ0_VECTOR + 1)
-#define IRQ2_VECTOR		(IRQ0_VECTOR + 2)
-#define IRQ3_VECTOR		(IRQ0_VECTOR + 3)
-#define IRQ4_VECTOR		(IRQ0_VECTOR + 4)
-#define IRQ5_VECTOR		(IRQ0_VECTOR + 5)
-#define IRQ6_VECTOR		(IRQ0_VECTOR + 6)
-#define IRQ7_VECTOR		(IRQ0_VECTOR + 7)
-#define IRQ8_VECTOR		(IRQ0_VECTOR + 8)
-#define IRQ9_VECTOR		(IRQ0_VECTOR + 9)
-#define IRQ10_VECTOR		(IRQ0_VECTOR + 10)
-#define IRQ11_VECTOR		(IRQ0_VECTOR + 11)
-#define IRQ12_VECTOR		(IRQ0_VECTOR + 12)
-#define IRQ13_VECTOR		(IRQ0_VECTOR + 13)
-#define IRQ14_VECTOR		(IRQ0_VECTOR + 14)
-#define IRQ15_VECTOR		(IRQ0_VECTOR + 15)
-
-/*
- * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
- *
- *  some of the following vectors are 'rare', they are merged
- *  into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
- *  TLB, reschedule and local APIC vectors are performance-critical.
- */
-#define SPURIOUS_APIC_VECTOR	0xff
-#define ERROR_APIC_VECTOR	0xfe
-#define RESCHEDULE_VECTOR	0xfd
-#define CALL_FUNCTION_VECTOR	0xfc
-/* fb free - please don't readd KDB here because it's useless
-   (hint - think what a NMI bit does to a vector) */
-#define THERMAL_APIC_VECTOR	0xfa
-#define THRESHOLD_APIC_VECTOR   0xf9
-/* f8 free */
-#define INVALIDATE_TLB_VECTOR_END	0xf7
-#define INVALIDATE_TLB_VECTOR_START	0xf0	/* f0-f7 used for TLB flush */
-
-#define NUM_INVALIDATE_TLB_VECTORS	8
-
-/*
- * Local APIC timer IRQ vector is on a different priority level,
- * to work around the 'lost local interrupt if more than 2 IRQ
- * sources per level' errata.
- */
-#define LOCAL_TIMER_VECTOR	0xef
-
-/*
- * First APIC vector available to drivers: (vectors 0x30-0xee)
- * we start at 0x41 to spread out vectors evenly between priority
- * levels. (0x80 is the syscall vector)
- */
-#define FIRST_DEVICE_VECTOR	(IRQ15_VECTOR + 2)
-#define FIRST_SYSTEM_VECTOR	0xef   /* duplicated in irq.h */
-
-
 #ifndef __ASSEMBLY__
 
 typedef int vector_irq_t[NR_VECTORS];
diff --git a/include/asm-x86/irq_32.h b/include/asm-x86/irq_32.h
index 0b79f3185243..c5c7542f79ad 100644
--- a/include/asm-x86/irq_32.h
+++ b/include/asm-x86/irq_32.h
@@ -12,7 +12,7 @@
 
 #include <linux/sched.h>
 /* include comes from machine specific directory */
-#include "irq_vectors.h"
+#include <asm/irq_vectors.h>
 #include <asm/thread_info.h>
 
 static inline int irq_canonicalize(int irq)
diff --git a/include/asm-x86/irq_64.h b/include/asm-x86/irq_64.h
index 7608176590b6..3037ec667bfb 100644
--- a/include/asm-x86/irq_64.h
+++ b/include/asm-x86/irq_64.h
@@ -11,34 +11,7 @@
  */
 
 #include <asm/apicdef.h>
-
-#define TIMER_IRQ 0
-
-/*
- * 16 8259A IRQ's, 208 potential APIC interrupt sources.
- * Right now the APIC is mostly only used for SMP.
- * 256 vectors is an architectural limit. (we can have
- * more than 256 devices theoretically, but they will
- * have to use shared interrupts)
- * Since vectors 0x00-0x1f are used/reserved for the CPU,
- * the usable vector space is 0x20-0xff (224 vectors)
- */
-
-/*
- * The maximum number of vectors supported by x86_64 processors
- * is limited to 256. For processors other than x86_64, NR_VECTORS
- * should be changed accordingly.
- */
-#define NR_VECTORS 256
-
-#define FIRST_SYSTEM_VECTOR	0xef   /* duplicated in hw_irq.h */
-
-#if NR_CPUS < MAX_IO_APICS
-#define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
-#else
-#define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
-#endif
-#define NR_IRQ_VECTORS NR_IRQS
+#include <asm/irq_vectors.h>
 
 static inline int irq_canonicalize(int irq)
 {
diff --git a/include/asm-x86/irq_vectors.h b/include/asm-x86/irq_vectors.h
new file mode 100644
index 000000000000..daceaaf0a3a4
--- /dev/null
+++ b/include/asm-x86/irq_vectors.h
@@ -0,0 +1,168 @@
+#ifndef _ASM_IRQ_VECTORS_H
+#define _ASM_IRQ_VECTORS_H
+
+#include <linux/threads.h>
+
+#define NMI_VECTOR		0x02
+
+/*
+ * IDT vectors usable for external interrupt sources start
+ * at 0x20:
+ */
+#define FIRST_EXTERNAL_VECTOR	0x20
+
+#ifdef CONFIG_X86_32
+# define SYSCALL_VECTOR		0x80
+#else
+# define IA32_SYSCALL_VECTOR	0x80
+#endif
+
+/*
+ * Vectors 0x20-0x2f are used for ISA interrupts on 32 bit.
+ *
+ * Reserve the lowest usable priority level 0x20 - 0x2f for triggering
+ * cleanup after irq migration on 64 bit.
+ */
+#define IRQ_MOVE_CLEANUP_VECTOR	FIRST_EXTERNAL_VECTOR
+
+/*
+ * Vectors 0x30-0x3f are used for ISA interrupts on 64 bit
+ */
+#define IRQ0_VECTOR		(FIRST_EXTERNAL_VECTOR + 0x10)
+#define IRQ1_VECTOR		(IRQ0_VECTOR + 1)
+#define IRQ2_VECTOR		(IRQ0_VECTOR + 2)
+#define IRQ3_VECTOR		(IRQ0_VECTOR + 3)
+#define IRQ4_VECTOR		(IRQ0_VECTOR + 4)
+#define IRQ5_VECTOR		(IRQ0_VECTOR + 5)
+#define IRQ6_VECTOR		(IRQ0_VECTOR + 6)
+#define IRQ7_VECTOR		(IRQ0_VECTOR + 7)
+#define IRQ8_VECTOR		(IRQ0_VECTOR + 8)
+#define IRQ9_VECTOR		(IRQ0_VECTOR + 9)
+#define IRQ10_VECTOR		(IRQ0_VECTOR + 10)
+#define IRQ11_VECTOR		(IRQ0_VECTOR + 11)
+#define IRQ12_VECTOR		(IRQ0_VECTOR + 12)
+#define IRQ13_VECTOR		(IRQ0_VECTOR + 13)
+#define IRQ14_VECTOR		(IRQ0_VECTOR + 14)
+#define IRQ15_VECTOR		(IRQ0_VECTOR + 15)
+
+/*
+ * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
+ *
+ *  some of the following vectors are 'rare', they are merged
+ *  into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
+ *  TLB, reschedule and local APIC vectors are performance-critical.
+ *
+ *  Vectors 0xf0-0xfa are free (reserved for future Linux use).
+ */
+#ifdef CONFIG_X86_32
+
+# define SPURIOUS_APIC_VECTOR		0xff
+# define ERROR_APIC_VECTOR		0xfe
+# define INVALIDATE_TLB_VECTOR		0xfd
+# define RESCHEDULE_VECTOR		0xfc
+# define CALL_FUNCTION_VECTOR		0xfb
+# define THERMAL_APIC_VECTOR		0xf0
+
+#else
+
+#define SPURIOUS_APIC_VECTOR		0xff
+#define ERROR_APIC_VECTOR		0xfe
+#define RESCHEDULE_VECTOR		0xfd
+#define CALL_FUNCTION_VECTOR		0xfc
+#define THERMAL_APIC_VECTOR		0xfa
+#define THRESHOLD_APIC_VECTOR		0xf9
+#define INVALIDATE_TLB_VECTOR_END	0xf7
+#define INVALIDATE_TLB_VECTOR_START	0xf0	/* f0-f7 used for TLB flush */
+
+#define NUM_INVALIDATE_TLB_VECTORS	8
+
+#endif
+
+/*
+ * Local APIC timer IRQ vector is on a different priority level,
+ * to work around the 'lost local interrupt if more than 2 IRQ
+ * sources per level' errata.
+ */
+#define LOCAL_TIMER_VECTOR	0xef
+
+/*
+ * First APIC vector available to drivers: (vectors 0x30-0xee) we
+ * start at 0x31(0x41) to spread out vectors evenly between priority
+ * levels. (0x80 is the syscall vector)
+ */
+#ifdef CONFIG_X86_32
+# define FIRST_DEVICE_VECTOR	0x31
+#else
+# define FIRST_DEVICE_VECTOR	(IRQ15_VECTOR + 2)
+#endif
+
+#define FIRST_SYSTEM_VECTOR	0xef
+
+#define NR_VECTORS		256
+
+#define FPU_IRQ			13
+
+#define	FIRST_VM86_IRQ		3
+#define LAST_VM86_IRQ		15
+#define invalid_vm86_irq(irq)	((irq) < 3 || (irq) > 15)
+
+#if !defined(CONFIG_X86_VISWS) && !defined(CONFIG_X86_VOYAGER)
+
+# if defined(CONFIG_X86_IO_APIC) || defined(CONFIG_PARAVIRT)
+
+#  define NR_IRQS		224
+
+#  if (224 >= 32 * NR_CPUS)
+#   define NR_IRQ_VECTORS	NR_IRQS
+#  else
+#   define NR_IRQ_VECTORS	(32 * NR_CPUS)
+#  endif
+
+# else /* IO_APIC || PARAVIRT */
+
+#  define NR_IRQS		16
+#  define NR_IRQ_VECTORS	NR_IRQS
+
+# endif
+
+#else /* !VISWS && !VOYAGER */
+
+# define NR_IRQS		224
+# define NR_IRQ_VECTORS		NR_IRQS
+
+#endif /* VISWS */
+
+/* Voyager specific defines */
+/* These define the CPIs we use in linux */
+#define VIC_CPI_LEVEL0			0
+#define VIC_CPI_LEVEL1			1
+/* now the fake CPIs */
+#define VIC_TIMER_CPI			2
+#define VIC_INVALIDATE_CPI		3
+#define VIC_RESCHEDULE_CPI		4
+#define VIC_ENABLE_IRQ_CPI		5
+#define VIC_CALL_FUNCTION_CPI		6
+
+/* Now the QIC CPIs:  Since we don't need the two initial levels,
+ * these are 2 less than the VIC CPIs */
+#define QIC_CPI_OFFSET			1
+#define QIC_TIMER_CPI			(VIC_TIMER_CPI - QIC_CPI_OFFSET)
+#define QIC_INVALIDATE_CPI		(VIC_INVALIDATE_CPI - QIC_CPI_OFFSET)
+#define QIC_RESCHEDULE_CPI		(VIC_RESCHEDULE_CPI - QIC_CPI_OFFSET)
+#define QIC_ENABLE_IRQ_CPI		(VIC_ENABLE_IRQ_CPI - QIC_CPI_OFFSET)
+#define QIC_CALL_FUNCTION_CPI		(VIC_CALL_FUNCTION_CPI - QIC_CPI_OFFSET)
+
+#define VIC_START_FAKE_CPI		VIC_TIMER_CPI
+#define VIC_END_FAKE_CPI		VIC_CALL_FUNCTION_CPI
+
+/* this is the SYS_INT CPI. */
+#define VIC_SYS_INT			8
+#define VIC_CMN_INT			15
+
+/* This is the boot CPI for alternate processors.  It gets overwritten
+ * by the above once the system has activated all available processors */
+#define VIC_CPU_BOOT_CPI		VIC_CPI_LEVEL0
+#define VIC_CPU_BOOT_ERRATA_CPI		(VIC_CPI_LEVEL0 + 8)
+
+
+#endif /* _ASM_IRQ_VECTORS_H */
diff --git a/include/asm-x86/mach-default/irq_vectors.h b/include/asm-x86/mach-default/irq_vectors.h
deleted file mode 100644
index 881c63ca61ad..000000000000
--- a/include/asm-x86/mach-default/irq_vectors.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * This file should contain #defines for all of the interrupt vector
- * numbers used by this architecture.
- *
- * In addition, there are some standard defines:
- *
- *	FIRST_EXTERNAL_VECTOR:
- *		The first free place for external interrupts
- *
- *	SYSCALL_VECTOR:
- *		The IRQ vector a syscall makes the user to kernel transition
- *		under.
- *
- *	TIMER_IRQ:
- *		The IRQ number the timer interrupt comes in at.
- *
- *	NR_IRQS:
- *		The total number of interrupt vectors (including all the
- *		architecture specific interrupts) needed.
- *
- */			
-#ifndef _ASM_IRQ_VECTORS_H
-#define _ASM_IRQ_VECTORS_H
-
-/*
- * IDT vectors usable for external interrupt sources start
- * at 0x20:
- */
-#define FIRST_EXTERNAL_VECTOR	0x20
-
-#define SYSCALL_VECTOR		0x80
-
-/*
- * Vectors 0x20-0x2f are used for ISA interrupts.
- */
-
-/*
- * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
- *
- *  some of the following vectors are 'rare', they are merged
- *  into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
- *  TLB, reschedule and local APIC vectors are performance-critical.
- *
- *  Vectors 0xf0-0xfa are free (reserved for future Linux use).
- */
-#define SPURIOUS_APIC_VECTOR	0xff
-#define ERROR_APIC_VECTOR	0xfe
-#define INVALIDATE_TLB_VECTOR	0xfd
-#define RESCHEDULE_VECTOR	0xfc
-#define CALL_FUNCTION_VECTOR	0xfb
-
-#define THERMAL_APIC_VECTOR	0xf0
-/*
- * Local APIC timer IRQ vector is on a different priority level,
- * to work around the 'lost local interrupt if more than 2 IRQ
- * sources per level' errata.
- */
-#define LOCAL_TIMER_VECTOR	0xef
-
-/*
- * First APIC vector available to drivers: (vectors 0x30-0xee)
- * we start at 0x31 to spread out vectors evenly between priority
- * levels. (0x80 is the syscall vector)
- */
-#define FIRST_DEVICE_VECTOR	0x31
-#define FIRST_SYSTEM_VECTOR	0xef
-
-#define TIMER_IRQ 0
-
-/*
- * 16 8259A IRQ's, 208 potential APIC interrupt sources.
- * Right now the APIC is mostly only used for SMP.
- * 256 vectors is an architectural limit. (we can have
- * more than 256 devices theoretically, but they will
- * have to use shared interrupts)
- * Since vectors 0x00-0x1f are used/reserved for the CPU,
- * the usable vector space is 0x20-0xff (224 vectors)
- */
-
-/*
- * The maximum number of vectors supported by i386 processors
- * is limited to 256. For processors other than i386, NR_VECTORS
- * should be changed accordingly.
- */
-#define NR_VECTORS 256
-
-#include "irq_vectors_limits.h"
-
-#define FPU_IRQ			13
-
-#define	FIRST_VM86_IRQ		3
-#define LAST_VM86_IRQ		15
-#define invalid_vm86_irq(irq)	((irq) < 3 || (irq) > 15)
-
-
-#endif /* _ASM_IRQ_VECTORS_H */
diff --git a/include/asm-x86/mach-default/irq_vectors_limits.h b/include/asm-x86/mach-default/irq_vectors_limits.h
deleted file mode 100644
index a90c7a60109f..000000000000
--- a/include/asm-x86/mach-default/irq_vectors_limits.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef _ASM_IRQ_VECTORS_LIMITS_H
-#define _ASM_IRQ_VECTORS_LIMITS_H
-
-#if defined(CONFIG_X86_IO_APIC) || defined(CONFIG_PARAVIRT)
-#define NR_IRQS 224
-# if (224 >= 32 * NR_CPUS)
-# define NR_IRQ_VECTORS NR_IRQS
-# else
-# define NR_IRQ_VECTORS (32 * NR_CPUS)
-# endif
-#else
-#define NR_IRQS 16
-#define NR_IRQ_VECTORS NR_IRQS
-#endif
-
-#endif /* _ASM_IRQ_VECTORS_LIMITS_H */
diff --git a/include/asm-x86/mach-visws/irq_vectors.h b/include/asm-x86/mach-visws/irq_vectors.h
deleted file mode 100644
index cb572d8db505..000000000000
--- a/include/asm-x86/mach-visws/irq_vectors.h
+++ /dev/null
@@ -1,62 +0,0 @@
-#ifndef _ASM_IRQ_VECTORS_H
-#define _ASM_IRQ_VECTORS_H
-
-/*
- * IDT vectors usable for external interrupt sources start
- * at 0x20:
- */
-#define FIRST_EXTERNAL_VECTOR	0x20
-
-#define SYSCALL_VECTOR		0x80
-
-/*
- * Vectors 0x20-0x2f are used for ISA interrupts.
- */
-
-/*
- * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
- *
- *  some of the following vectors are 'rare', they are merged
- *  into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
- *  TLB, reschedule and local APIC vectors are performance-critical.
- *
- *  Vectors 0xf0-0xfa are free (reserved for future Linux use).
- */
-#define SPURIOUS_APIC_VECTOR	0xff
-#define ERROR_APIC_VECTOR	0xfe
-#define INVALIDATE_TLB_VECTOR	0xfd
-#define RESCHEDULE_VECTOR	0xfc
-#define CALL_FUNCTION_VECTOR	0xfb
-
-#define THERMAL_APIC_VECTOR	0xf0
-/*
- * Local APIC timer IRQ vector is on a different priority level,
- * to work around the 'lost local interrupt if more than 2 IRQ
- * sources per level' errata.
- */
-#define LOCAL_TIMER_VECTOR	0xef
-
-/*
- * First APIC vector available to drivers: (vectors 0x30-0xee)
- * we start at 0x31 to spread out vectors evenly between priority
- * levels. (0x80 is the syscall vector)
- */
-#define FIRST_DEVICE_VECTOR	0x31
-#define FIRST_SYSTEM_VECTOR	0xef
-
-#define TIMER_IRQ 0
-
-/*
- * IRQ definitions
- */
-#define NR_VECTORS 256
-#define NR_IRQS 224
-#define NR_IRQ_VECTORS NR_IRQS
-
-#define FPU_IRQ			13
-
-#define	FIRST_VM86_IRQ		3
-#define LAST_VM86_IRQ		15
-#define invalid_vm86_irq(irq)	((irq) < 3 || (irq) > 15)
-
-#endif /* _ASM_IRQ_VECTORS_H */
diff --git a/include/asm-x86/mach-voyager/irq_vectors.h b/include/asm-x86/mach-voyager/irq_vectors.h
deleted file mode 100644
index 165421f5821c..000000000000
--- a/include/asm-x86/mach-voyager/irq_vectors.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8 -*- */
-
-/* Copyright (C) 2002
- *
- * Author: James.Bottomley@HansenPartnership.com
- *
- * linux/arch/i386/voyager/irq_vectors.h
- *
- * This file provides definitions for the VIC and QIC CPIs
- */
-
-#ifndef _ASM_IRQ_VECTORS_H
-#define _ASM_IRQ_VECTORS_H
-
-/*
- * IDT vectors usable for external interrupt sources start
- * at 0x20:
- */
-#define FIRST_EXTERNAL_VECTOR	0x20
-
-#define SYSCALL_VECTOR		0x80
-
-/*
- * Vectors 0x20-0x2f are used for ISA interrupts.
- */
-
-/* These define the CPIs we use in linux */
-#define VIC_CPI_LEVEL0			0
-#define VIC_CPI_LEVEL1			1
-/* now the fake CPIs */
-#define VIC_TIMER_CPI			2
-#define VIC_INVALIDATE_CPI		3
-#define VIC_RESCHEDULE_CPI		4
-#define VIC_ENABLE_IRQ_CPI		5
-#define VIC_CALL_FUNCTION_CPI		6
-
-/* Now the QIC CPIs:  Since we don't need the two initial levels,
- * these are 2 less than the VIC CPIs */
-#define QIC_CPI_OFFSET			1
-#define QIC_TIMER_CPI			(VIC_TIMER_CPI - QIC_CPI_OFFSET)
-#define QIC_INVALIDATE_CPI		(VIC_INVALIDATE_CPI - QIC_CPI_OFFSET)
-#define QIC_RESCHEDULE_CPI		(VIC_RESCHEDULE_CPI - QIC_CPI_OFFSET)
-#define QIC_ENABLE_IRQ_CPI		(VIC_ENABLE_IRQ_CPI - QIC_CPI_OFFSET)
-#define QIC_CALL_FUNCTION_CPI		(VIC_CALL_FUNCTION_CPI - QIC_CPI_OFFSET)
-
-#define VIC_START_FAKE_CPI		VIC_TIMER_CPI
-#define VIC_END_FAKE_CPI		VIC_CALL_FUNCTION_CPI
-
-/* this is the SYS_INT CPI. */
-#define VIC_SYS_INT			8
-#define VIC_CMN_INT			15
-
-/* This is the boot CPI for alternate processors.  It gets overwritten
- * by the above once the system has activated all available processors */
-#define VIC_CPU_BOOT_CPI		VIC_CPI_LEVEL0
-#define VIC_CPU_BOOT_ERRATA_CPI		(VIC_CPI_LEVEL0 + 8)
-
-#define NR_VECTORS 256
-#define NR_IRQS 224
-#define NR_IRQ_VECTORS NR_IRQS
-
-#define FPU_IRQ				13
-
-#define	FIRST_VM86_IRQ		3
-#define LAST_VM86_IRQ		15
-#define invalid_vm86_irq(irq)	((irq) < 3 || (irq) > 15)
-
-#ifndef __ASSEMBLY__
-extern asmlinkage void vic_cpi_interrupt(void);
-extern asmlinkage void vic_sys_interrupt(void);
-extern asmlinkage void vic_cmn_interrupt(void);
-extern asmlinkage void qic_timer_interrupt(void);
-extern asmlinkage void qic_invalidate_interrupt(void);
-extern asmlinkage void qic_reschedule_interrupt(void);
-extern asmlinkage void qic_enable_irq_interrupt(void);
-extern asmlinkage void qic_call_function_interrupt(void);
-#endif /* !__ASSEMBLY__ */
-
-#endif /* _ASM_IRQ_VECTORS_H */
-- 
cgit v1.2.3


From 0bc471d93051a19545257909bc2ed2ad3b389b54 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 2 May 2008 21:55:12 +0200
Subject: x86: move BUILD_IRQ macro magic to i8259_64.c

i8259_64.c is the only place which uses those macros.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/i8259_64.c  | 14 ++++++++++++++
 include/asm-x86/hw_irq_64.h | 14 --------------
 2 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c
index fa57a1568508..c4ae4769ce67 100644
--- a/arch/x86/kernel/i8259_64.c
+++ b/arch/x86/kernel/i8259_64.c
@@ -34,6 +34,20 @@
  * interrupt-controller happy.
  */
 
+#define IRQ_NAME2(nr) nr##_interrupt(void)
+#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
+
+/*
+ *	SMP has a few special interrupts for IPI messages
+ */
+
+#define BUILD_IRQ(nr)				\
+	asmlinkage void IRQ_NAME(nr);		\
+	asm("\n.p2align\n"			\
+	    "IRQ" #nr "_interrupt:\n\t"		\
+	    "push $~(" #nr ") ; "		\
+	    "jmp common_interrupt");
+
 #define BI(x,y) \
 	BUILD_IRQ(x##y)
 
diff --git a/include/asm-x86/hw_irq_64.h b/include/asm-x86/hw_irq_64.h
index 98c9d494a711..9305f7456a7f 100644
--- a/include/asm-x86/hw_irq_64.h
+++ b/include/asm-x86/hw_irq_64.h
@@ -17,18 +17,4 @@ extern void native_init_IRQ(void);
 
 #include <asm/ptrace.h>
 
-#define IRQ_NAME2(nr) nr##_interrupt(void)
-#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
-
-/*
- *	SMP has a few special interrupts for IPI messages
- */
-
-#define BUILD_IRQ(nr)				\
-	asmlinkage void IRQ_NAME(nr);		\
-	asm("\n.p2align\n"			\
-	    "IRQ" #nr "_interrupt:\n\t"		\
-	    "push $~(" #nr ") ; "		\
-	    "jmp common_interrupt");
-
 #endif
-- 
cgit v1.2.3


From 1a331957efd214fc3a84f70956dfaec65e70c031 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 3 May 2008 00:30:50 +0200
Subject: x86: move eisa_set_level_irq declaration to header

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/boot.c | 2 --
 include/asm-x86/hw_irq.h    | 3 +++
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index c49ebcc6c41e..671591fcc61d 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -507,8 +507,6 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity)
 	 * Make sure all (legacy) PCI IRQs are set as level-triggered.
 	 */
 	if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
-		extern void eisa_set_level_irq(unsigned int irq);
-
 		if (triggering == ACPI_LEVEL_SENSITIVE)
 			eisa_set_level_irq(gsi);
 	}
diff --git a/include/asm-x86/hw_irq.h b/include/asm-x86/hw_irq.h
index 1db2dff1ef49..1428b41dcbb9 100644
--- a/include/asm-x86/hw_irq.h
+++ b/include/asm-x86/hw_irq.h
@@ -79,6 +79,9 @@ extern void send_IPI(int dest, int vector);
 extern atomic_t irq_err_count;
 extern atomic_t irq_mis_count;
 
+/* EISA */
+extern void eisa_set_level_irq(unsigned int irq);
+
 /* Voyager functions */
 extern asmlinkage void vic_cpi_interrupt(void);
 extern asmlinkage void vic_sys_interrupt(void);
-- 
cgit v1.2.3


From 305b92a2323eeaa4b481f409d54f778dd7e21a46 Mon Sep 17 00:00:00 2001
From: Alan Mayer <ajm@sgi.com>
Date: Tue, 15 Apr 2008 15:36:56 -0500
Subject: x86: change FIRST_SYSTEM_VECTOR to a variable

The SGI UV system needs several more system vectors than a vanilla
x86_64 system.  Rather than burden the other archs with extra system
vectors that they don't use, change FIRST_SYSTEM_VECTOR to a variable,
so that it can be dynamic.

Signed-off-by: Alan Mayer <ajm@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/apic_32.c     | 14 +++++++-------
 arch/x86/kernel/i8259_64.c    | 30 +++++++++++++++---------------
 arch/x86/kernel/io_apic_32.c  |  8 ++++++--
 arch/x86/kernel/io_apic_64.c  |  6 +++++-
 include/asm-x86/desc.h        | 22 ++++++++++++++++++++++
 include/asm-x86/irq_vectors.h |  2 --
 6 files changed, 55 insertions(+), 27 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 4b99b1bdeb6c..807158e4b5d0 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -1351,13 +1351,13 @@ void __init smp_intr_init(void)
 	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
 	 * IPI, driven by wakeup.
 	 */
-	set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
 
 	/* IPI for invalidation */
-	set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
 
 	/* IPI for generic function call */
-	set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
 }
 #endif
 
@@ -1370,15 +1370,15 @@ void __init apic_intr_init(void)
 	smp_intr_init();
 #endif
 	/* self generated IPI for local APIC timer */
-	set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
 	/* IPI vectors for APIC spurious and error interrupts */
-	set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
-	set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 
 	/* thermal monitor LVT interrupt */
 #ifdef CONFIG_X86_MCE_P4THERMAL
-	set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
 #endif
 }
 
diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c
index c4ae4769ce67..1870e0e86559 100644
--- a/arch/x86/kernel/i8259_64.c
+++ b/arch/x86/kernel/i8259_64.c
@@ -493,33 +493,33 @@ void __init native_init_IRQ(void)
 	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
 	 * IPI, driven by wakeup.
 	 */
-	set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
 
 	/* IPIs for invalidation */
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
 
 	/* IPI for generic function call */
-	set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
 
 	/* Low priority IPI to cleanup after moving an irq */
 	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
 #endif
-	set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-	set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
+	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
 
 	/* self generated IPI for local APIC timer */
-	set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
 	/* IPI vectors for APIC spurious and error interrupts */
-	set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
-	set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 
 	if (!acpi_ioapic)
 		setup_irq(2, &irq2);
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index a40d54fc1fdd..0ae4a9d00ce8 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -83,6 +83,10 @@ int mp_irq_entries;
 
 static int disable_timer_pin_1 __initdata;
 
+int first_system_vector = 0xfe;
+
+char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
+
 /*
  * Rough estimation of how many shared IRQs there are, can
  * be changed anytime.
@@ -1176,7 +1180,7 @@ static int __assign_irq_vector(int irq)
 	offset = current_offset;
 next:
 	vector += 8;
-	if (vector >= FIRST_SYSTEM_VECTOR) {
+	if (vector >= first_system_vector) {
 		offset = (offset + 1) % 8;
 		vector = FIRST_DEVICE_VECTOR + offset;
 	}
@@ -2269,7 +2273,7 @@ void __init setup_IO_APIC(void)
 	int i;
 
 	/* Reserve all the system vectors. */
-	for (i = FIRST_SYSTEM_VECTOR; i < NR_VECTORS; i++)
+	for (i = first_system_vector; i < NR_VECTORS; i++)
 		set_bit(i, used_vectors);
 
 	enable_IO_APIC();
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index ef1a8dfcc529..f1e1ae3e5c7d 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -82,6 +82,10 @@ struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
 
 static int assign_irq_vector(int irq, cpumask_t mask);
 
+int first_system_vector = 0xfe;
+
+char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
+
 #define __apicdebuginit  __init
 
 int sis_apic_bug; /* not actually supported, dummy for compile */
@@ -730,7 +734,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
 		offset = current_offset;
 next:
 		vector += 8;
-		if (vector >= FIRST_SYSTEM_VECTOR) {
+		if (vector >= first_system_vector) {
 			/* If we run out of vectors on large boxen, must share them. */
 			offset = (offset + 1) % 8;
 			vector = FIRST_DEVICE_VECTOR + offset;
diff --git a/include/asm-x86/desc.h b/include/asm-x86/desc.h
index 268a012bcd79..b3875d4b4fab 100644
--- a/include/asm-x86/desc.h
+++ b/include/asm-x86/desc.h
@@ -311,6 +311,28 @@ static inline void set_intr_gate(unsigned int n, void *addr)
 	_set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
 }
 
+#define SYS_VECTOR_FREE		0
+#define SYS_VECTOR_ALLOCED	1
+
+extern int first_system_vector;
+extern char system_vectors[];
+
+static inline void alloc_system_vector(int vector)
+{
+	if (system_vectors[vector] == SYS_VECTOR_FREE) {
+		system_vectors[vector] = SYS_VECTOR_ALLOCED;
+		if (first_system_vector > vector)
+			first_system_vector = vector;
+	} else
+		BUG();
+}
+
+static inline void alloc_intr_gate(unsigned int n, void *addr)
+{
+	alloc_system_vector(n);
+	set_intr_gate(n, addr);
+}
+
 /*
  * This routine sets up an interrupt gate at directory privilege level 3.
  */
diff --git a/include/asm-x86/irq_vectors.h b/include/asm-x86/irq_vectors.h
index daceaaf0a3a4..3cb6d8c77b39 100644
--- a/include/asm-x86/irq_vectors.h
+++ b/include/asm-x86/irq_vectors.h
@@ -96,8 +96,6 @@
 # define FIRST_DEVICE_VECTOR	(IRQ15_VECTOR + 2)
 #endif
 
-#define FIRST_SYSTEM_VECTOR	0xef
-
 #define NR_VECTORS		256
 
 #define FPU_IRQ			13
-- 
cgit v1.2.3


From ce17833183bf0a08ce3d174a2088eff0a06f2080 Mon Sep 17 00:00:00 2001
From: Alan Mayer <ajm@sgi.com>
Date: Wed, 16 Apr 2008 15:17:20 -0500
Subject: x86: change FIRST_SYSTEM_VECTOR to a variable, fix

Fixes the build error introduced by my FIRST_SYSTEM_VECTOR patch

Signed-off-by: Alan Mayer <ajm@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/apic_32.c    | 4 ++++
 arch/x86/kernel/io_apic_32.c | 4 ----
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 807158e4b5d0..d5767cb19d56 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -71,6 +71,10 @@ int local_apic_timer_disabled;
 int local_apic_timer_c2_ok;
 EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
 
+int first_system_vector = 0xfe;
+
+char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
+
 /*
  * Debug level, exported for io_apic.c
  */
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 0ae4a9d00ce8..76769ccb1425 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -83,10 +83,6 @@ int mp_irq_entries;
 
 static int disable_timer_pin_1 __initdata;
 
-int first_system_vector = 0xfe;
-
-char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
-
 /*
  * Rough estimation of how many shared IRQs there are, can
  * be changed anytime.
-- 
cgit v1.2.3


From 04b361abfdc522239e3a071f3afdebf5787d9f03 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Mon, 5 May 2008 12:36:38 +0200
Subject: i386: Execute stack overflow warning on interrupt stack v2

Previously the reporting printk would run on the process stack, which risks
overflow an already low stack. Instead execute it on the interrupt stack.
This makes it more likely for the printk to make it actually out.

It adds one not taken test/branch more to the interrupt path when
stack overflow checking is enabled. We could avoid that by duplicating
more code, but that seemed not worth it.

Based on an observation by Eric Sandeen.

v2: Fix warnings in some configs

Signed-off-by: Andi Kleen <andi@firstfloor.org>
Cc: mingo@elte.hu
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irq_32.c | 49 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 33 insertions(+), 16 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 147352df28b9..3f76561da815 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -61,6 +61,26 @@ static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
 static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
 #endif
 
+static void stack_overflow(void)
+{
+	printk("low stack detected by irq handler\n");
+	dump_stack();
+}
+
+static inline void call_on_stack2(void *func, void *stack,
+			   unsigned long arg1, unsigned long arg2)
+{
+	unsigned long bx;
+	asm volatile(
+			"	xchgl  %%ebx,%%esp    \n"
+			"	call   *%%edi	      \n"
+			"	movl   %%ebx,%%esp    \n"
+			: "=a" (arg1), "=d" (arg2), "=b" (bx)
+			:  "0" (arg1),	 "1" (arg2),  "2" (stack),
+			   "D" (func)
+			: "memory", "cc", "ecx");
+}
+
 /*
  * do_IRQ handles all normal device IRQ's (the special
  * SMP cross-CPU interrupts have their own specific
@@ -76,6 +96,7 @@ unsigned int do_IRQ(struct pt_regs *regs)
 	union irq_ctx *curctx, *irqctx;
 	u32 *isp;
 #endif
+	int overflow = 0;
 
 	if (unlikely((unsigned)irq >= NR_IRQS)) {
 		printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
@@ -92,11 +113,8 @@ unsigned int do_IRQ(struct pt_regs *regs)
 
 		__asm__ __volatile__("andl %%esp,%0" :
 					"=r" (sp) : "0" (THREAD_SIZE - 1));
-		if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
-			printk("do_IRQ: stack overflow: %ld\n",
-				sp - sizeof(struct thread_info));
-			dump_stack();
-		}
+		if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN)))
+			overflow = 1;
 	}
 #endif
 
@@ -112,8 +130,6 @@ unsigned int do_IRQ(struct pt_regs *regs)
 	 * current stack (which is the irq stack already after all)
 	 */
 	if (curctx != irqctx) {
-		int arg1, arg2, bx;
-
 		/* build the stack frame on the IRQ stack */
 		isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
 		irqctx->tinfo.task = curctx->tinfo.task;
@@ -127,18 +143,19 @@ unsigned int do_IRQ(struct pt_regs *regs)
 			(irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
 			(curctx->tinfo.preempt_count & SOFTIRQ_MASK);
 
-		asm volatile(
-			"       xchgl  %%ebx,%%esp    \n"
-			"       call   *%%edi         \n"
-			"       movl   %%ebx,%%esp    \n"
-			: "=a" (arg1), "=d" (arg2), "=b" (bx)
-			:  "0" (irq),   "1" (desc),  "2" (isp),
-			   "D" (desc->handle_irq)
-			: "memory", "cc", "ecx"
-		);
+		/* Execute warning on interrupt stack */
+		if (unlikely(overflow))
+			call_on_stack2(stack_overflow, isp, 0, 0);
+
+		call_on_stack2(desc->handle_irq, isp, irq, (unsigned long)desc);
 	} else
 #endif
+	{
+		/* AK: Slightly bogus here */
+		if (overflow)
+			stack_overflow();
 		desc->handle_irq(irq, desc);
+	}
 
 	irq_exit();
 	set_irq_regs(old_regs);
-- 
cgit v1.2.3


From de9b10af1287bf25b9c0433de53a2e95ef611aa7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 5 May 2008 15:58:15 +0200
Subject: x86: janitor stack overflow warning patch

Add KERN_WARNING to the printk as this could not be done in the
original patch, which allegedly only moves code around.

Un#ifdef do_IRQ.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irq_32.c | 136 ++++++++++++++++++++++++++---------------------
 1 file changed, 75 insertions(+), 61 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 3f76561da815..1c470d2e5af7 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -48,6 +48,29 @@ void ack_bad_irq(unsigned int irq)
 #endif
 }
 
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+/* Debugging check for stack overflow: is there less than 1KB free? */
+static int check_stack_overflow(void)
+{
+	long sp;
+
+	__asm__ __volatile__("andl %%esp,%0" :
+			     "=r" (sp) : "0" (THREAD_SIZE - 1));
+
+	return sp < (sizeof(struct thread_info) + STACK_WARN);
+}
+
+static void print_stack_overflow(void)
+{
+	printk(KERN_WARNING "low stack detected by irq handler\n");
+	dump_stack();
+}
+
+#else
+static inline int check_stack_overflow(void) { return 0; }
+static inline void print_stack_overflow(void) { }
+#endif
+
 #ifdef CONFIG_4KSTACKS
 /*
  * per-CPU IRQ handling contexts (thread information and stack)
@@ -59,18 +82,12 @@ union irq_ctx {
 
 static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
 static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
-#endif
-
-static void stack_overflow(void)
-{
-	printk("low stack detected by irq handler\n");
-	dump_stack();
-}
 
-static inline void call_on_stack2(void *func, void *stack,
-			   unsigned long arg1, unsigned long arg2)
+static inline void call_on_stack(void *func, void *stack,
+				 unsigned long arg1, void *arg2)
 {
 	unsigned long bx;
+
 	asm volatile(
 			"	xchgl  %%ebx,%%esp    \n"
 			"	call   *%%edi	      \n"
@@ -81,22 +98,61 @@ static inline void call_on_stack2(void *func, void *stack,
 			: "memory", "cc", "ecx");
 }
 
+static inline int
+execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
+{
+	union irq_ctx *curctx, *irqctx;
+	u32 *isp;
+
+	curctx = (union irq_ctx *) current_thread_info();
+	irqctx = hardirq_ctx[smp_processor_id()];
+
+	/*
+	 * this is where we switch to the IRQ stack. However, if we are
+	 * already using the IRQ stack (because we interrupted a hardirq
+	 * handler) we can't do that and just have to keep using the
+	 * current stack (which is the irq stack already after all)
+	 */
+	if (unlikely(curctx == irqctx))
+		return 0;
+
+	/* build the stack frame on the IRQ stack */
+	isp = (u32 *) ((char*)irqctx + sizeof(*irqctx));
+	irqctx->tinfo.task = curctx->tinfo.task;
+	irqctx->tinfo.previous_esp = current_stack_pointer;
+
+	/*
+	 * Copy the softirq bits in preempt_count so that the
+	 * softirq checks work in the hardirq context.
+	 */
+	irqctx->tinfo.preempt_count =
+		(irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
+		(curctx->tinfo.preempt_count & SOFTIRQ_MASK);
+
+	if (unlikely(overflow))
+		call_on_stack(print_stack_overflow, isp, 0, NULL);
+
+	call_on_stack(desc->handle_irq, isp, irq, desc);
+
+	return 1;
+}
+
+#else
+static inline int
+execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
+#endif
+
 /*
  * do_IRQ handles all normal device IRQ's (the special
  * SMP cross-CPU interrupts have their own specific
  * handlers).
  */
 unsigned int do_IRQ(struct pt_regs *regs)
-{	
+{
 	struct pt_regs *old_regs;
 	/* high bit used in ret_from_ code */
-	int irq = ~regs->orig_ax;
+	int overflow, irq = ~regs->orig_ax;
 	struct irq_desc *desc = irq_desc + irq;
-#ifdef CONFIG_4KSTACKS
-	union irq_ctx *curctx, *irqctx;
-	u32 *isp;
-#endif
-	int overflow = 0;
 
 	if (unlikely((unsigned)irq >= NR_IRQS)) {
 		printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
@@ -106,54 +162,12 @@ unsigned int do_IRQ(struct pt_regs *regs)
 
 	old_regs = set_irq_regs(regs);
 	irq_enter();
-#ifdef CONFIG_DEBUG_STACKOVERFLOW
-	/* Debugging check for stack overflow: is there less than 1KB free? */
-	{
-		long sp;
-
-		__asm__ __volatile__("andl %%esp,%0" :
-					"=r" (sp) : "0" (THREAD_SIZE - 1));
-		if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN)))
-			overflow = 1;
-	}
-#endif
-
-#ifdef CONFIG_4KSTACKS
 
-	curctx = (union irq_ctx *) current_thread_info();
-	irqctx = hardirq_ctx[smp_processor_id()];
+	overflow = check_stack_overflow();
 
-	/*
-	 * this is where we switch to the IRQ stack. However, if we are
-	 * already using the IRQ stack (because we interrupted a hardirq
-	 * handler) we can't do that and just have to keep using the
-	 * current stack (which is the irq stack already after all)
-	 */
-	if (curctx != irqctx) {
-		/* build the stack frame on the IRQ stack */
-		isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
-		irqctx->tinfo.task = curctx->tinfo.task;
-		irqctx->tinfo.previous_esp = current_stack_pointer;
-
-		/*
-		 * Copy the softirq bits in preempt_count so that the
-		 * softirq checks work in the hardirq context.
-		 */
-		irqctx->tinfo.preempt_count =
-			(irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
-			(curctx->tinfo.preempt_count & SOFTIRQ_MASK);
-
-		/* Execute warning on interrupt stack */
+	if (!execute_on_irq_stack(overflow, desc, irq)) {
 		if (unlikely(overflow))
-			call_on_stack2(stack_overflow, isp, 0, 0);
-
-		call_on_stack2(desc->handle_irq, isp, irq, (unsigned long)desc);
-	} else
-#endif
-	{
-		/* AK: Slightly bogus here */
-		if (overflow)
-			stack_overflow();
+			print_stack_overflow();
 		desc->handle_irq(irq, desc);
 	}
 
-- 
cgit v1.2.3


From 403d8efc94cd02ae36e7db13c4edf1d06d7b7fac Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 5 May 2008 18:13:50 +0200
Subject: x86: irq_32 move 4kstacks code to one place

Move the 4KSTACKS related code to one place. This allows to un#ifdef
do_IRQ() and share the executed on stack for the stack overflow printk
and the softirq call.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irq_32.c | 163 ++++++++++++++++++++++-------------------------
 1 file changed, 77 insertions(+), 86 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 1c470d2e5af7..4e3e8ec60276 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -83,26 +83,28 @@ union irq_ctx {
 static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
 static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
 
-static inline void call_on_stack(void *func, void *stack,
-				 unsigned long arg1, void *arg2)
+static char softirq_stack[NR_CPUS * THREAD_SIZE]
+		__attribute__((__section__(".bss.page_aligned")));
+
+static char hardirq_stack[NR_CPUS * THREAD_SIZE]
+		__attribute__((__section__(".bss.page_aligned")));
+
+static void call_on_stack(void *func, void *stack)
 {
-	unsigned long bx;
-
-	asm volatile(
-			"	xchgl  %%ebx,%%esp    \n"
-			"	call   *%%edi	      \n"
-			"	movl   %%ebx,%%esp    \n"
-			: "=a" (arg1), "=d" (arg2), "=b" (bx)
-			:  "0" (arg1),	 "1" (arg2),  "2" (stack),
-			   "D" (func)
-			: "memory", "cc", "ecx");
+	asm volatile("xchgl	%%ebx,%%esp	\n"
+		     "call	*%%edi		\n"
+		     "movl	%%ebx,%%esp	\n"
+		     : "=b" (stack)
+		     : "0" (stack),
+		       "D"(func)
+		     : "memory", "cc", "edx", "ecx", "eax");
 }
 
 static inline int
 execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
 {
 	union irq_ctx *curctx, *irqctx;
-	u32 *isp;
+	u32 *isp, arg1, arg2;
 
 	curctx = (union irq_ctx *) current_thread_info();
 	irqctx = hardirq_ctx[smp_processor_id()];
@@ -130,64 +132,22 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
 		(curctx->tinfo.preempt_count & SOFTIRQ_MASK);
 
 	if (unlikely(overflow))
-		call_on_stack(print_stack_overflow, isp, 0, NULL);
-
-	call_on_stack(desc->handle_irq, isp, irq, desc);
-
-	return 1;
-}
-
-#else
-static inline int
-execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
-#endif
-
-/*
- * do_IRQ handles all normal device IRQ's (the special
- * SMP cross-CPU interrupts have their own specific
- * handlers).
- */
-unsigned int do_IRQ(struct pt_regs *regs)
-{
-	struct pt_regs *old_regs;
-	/* high bit used in ret_from_ code */
-	int overflow, irq = ~regs->orig_ax;
-	struct irq_desc *desc = irq_desc + irq;
-
-	if (unlikely((unsigned)irq >= NR_IRQS)) {
-		printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
-					__func__, irq);
-		BUG();
-	}
-
-	old_regs = set_irq_regs(regs);
-	irq_enter();
-
-	overflow = check_stack_overflow();
-
-	if (!execute_on_irq_stack(overflow, desc, irq)) {
-		if (unlikely(overflow))
-			print_stack_overflow();
-		desc->handle_irq(irq, desc);
-	}
-
-	irq_exit();
-	set_irq_regs(old_regs);
+		call_on_stack(print_stack_overflow, isp);
+
+	asm volatile("xchgl	%%ebx,%%esp	\n"
+		     "call	*%%edi		\n"
+		     "movl	%%ebx,%%esp	\n"
+		     : "=a" (arg1), "=d" (arg2), "=b" (isp)
+		     :  "0" (irq),   "1" (desc),  "2" (isp),
+			"D" (desc->handle_irq)
+		     : "memory", "cc", "ecx");
 	return 1;
 }
 
-#ifdef CONFIG_4KSTACKS
-
-static char softirq_stack[NR_CPUS * THREAD_SIZE]
-		__attribute__((__section__(".bss.page_aligned")));
-
-static char hardirq_stack[NR_CPUS * THREAD_SIZE]
-		__attribute__((__section__(".bss.page_aligned")));
-
 /*
  * allocate per-cpu stacks for hardirq and for softirq processing
  */
-void irq_ctx_init(int cpu)
+void __cpuinit irq_ctx_init(int cpu)
 {
 	union irq_ctx *irqctx;
 
@@ -195,25 +155,25 @@ void irq_ctx_init(int cpu)
 		return;
 
 	irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
-	irqctx->tinfo.task              = NULL;
-	irqctx->tinfo.exec_domain       = NULL;
-	irqctx->tinfo.cpu               = cpu;
-	irqctx->tinfo.preempt_count     = HARDIRQ_OFFSET;
-	irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
+	irqctx->tinfo.task		= NULL;
+	irqctx->tinfo.exec_domain	= NULL;
+	irqctx->tinfo.cpu		= cpu;
+	irqctx->tinfo.preempt_count	= HARDIRQ_OFFSET;
+	irqctx->tinfo.addr_limit	= MAKE_MM_SEG(0);
 
 	hardirq_ctx[cpu] = irqctx;
 
 	irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
-	irqctx->tinfo.task              = NULL;
-	irqctx->tinfo.exec_domain       = NULL;
-	irqctx->tinfo.cpu               = cpu;
-	irqctx->tinfo.preempt_count     = 0;
-	irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
+	irqctx->tinfo.task		= NULL;
+	irqctx->tinfo.exec_domain	= NULL;
+	irqctx->tinfo.cpu		= cpu;
+	irqctx->tinfo.preempt_count	= 0;
+	irqctx->tinfo.addr_limit	= MAKE_MM_SEG(0);
 
 	softirq_ctx[cpu] = irqctx;
 
-	printk("CPU %u irqstacks, hard=%p soft=%p\n",
-		cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
+	printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
+	       cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
 }
 
 void irq_ctx_exit(int cpu)
@@ -242,24 +202,55 @@ asmlinkage void do_softirq(void)
 		/* build the stack frame on the softirq stack */
 		isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
 
-		asm volatile(
-			"       xchgl   %%ebx,%%esp     \n"
-			"       call    __do_softirq    \n"
-			"       movl    %%ebx,%%esp     \n"
-			: "=b"(isp)
-			: "0"(isp)
-			: "memory", "cc", "edx", "ecx", "eax"
-		);
+		call_on_stack(__do_softirq, isp);
 		/*
 		 * Shouldnt happen, we returned above if in_interrupt():
-	 	 */
+		 */
 		WARN_ON_ONCE(softirq_count());
 	}
 
 	local_irq_restore(flags);
 }
+
+#else
+static inline int
+execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
 #endif
 
+/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ */
+unsigned int do_IRQ(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs;
+	/* high bit used in ret_from_ code */
+	int overflow, irq = ~regs->orig_ax;
+	struct irq_desc *desc = irq_desc + irq;
+
+	if (unlikely((unsigned)irq >= NR_IRQS)) {
+		printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
+					__func__, irq);
+		BUG();
+	}
+
+	old_regs = set_irq_regs(regs);
+	irq_enter();
+
+	overflow = check_stack_overflow();
+
+	if (!execute_on_irq_stack(overflow, desc, irq)) {
+		if (unlikely(overflow))
+			print_stack_overflow();
+		desc->handle_irq(irq, desc);
+	}
+
+	irq_exit();
+	set_irq_regs(old_regs);
+	return 1;
+}
+
 /*
  * Interrupt statistics:
  */
-- 
cgit v1.2.3


From aa134f1b09df6beaa4d031a50d5fda1f3cebce6c Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Tue, 8 Apr 2008 10:49:03 +0200
Subject: x86: iommu: use symbolic constants, not hardcoded numbers

Move symbolic constants into gart.h, and use them instead of hardcoded
constant.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/pci-gart_64.c | 10 +++++-----
 drivers/char/agp/amd64-agp.c  | 25 +++----------------------
 include/asm-x86/gart.h        | 21 +++++++++++++++++++++
 3 files changed, 29 insertions(+), 27 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index c07455d1695f..bffcf455c857 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -598,13 +598,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 		dev = k8_northbridges[i];
 		gatt_reg = __pa(gatt) >> 12;
 		gatt_reg <<= 4;
-		pci_write_config_dword(dev, 0x98, gatt_reg);
-		pci_read_config_dword(dev, 0x90, &ctl);
+		pci_write_config_dword(dev, AMD64_GARTTABLEBASE, gatt_reg);
+		pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
 
-		ctl |= 1;
-		ctl &= ~((1<<4) | (1<<5));
+		ctl |= GARTEN;
+		ctl &= ~(DISGARTCPU | DISGARTIO);
 
-		pci_write_config_dword(dev, 0x90, ctl);
+		pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
 	}
 	flush_gart();
 
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c
index d8200ac8f8cb..25d64224cdbb 100644
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -16,28 +16,9 @@
 #include <asm/page.h>		/* PAGE_SIZE */
 #include <asm/e820.h>
 #include <asm/k8.h>
+#include <asm/gart.h>
 #include "agp.h"
 
-/* PTE bits. */
-#define GPTE_VALID	1
-#define GPTE_COHERENT	2
-
-/* Aperture control register bits. */
-#define GARTEN		(1<<0)
-#define DISGARTCPU	(1<<4)
-#define DISGARTIO	(1<<5)
-
-/* GART cache control register bits. */
-#define INVGART		(1<<0)
-#define GARTPTEERR	(1<<1)
-
-/* K8 On-cpu GART registers */
-#define AMD64_GARTAPERTURECTL	0x90
-#define AMD64_GARTAPERTUREBASE	0x94
-#define AMD64_GARTTABLEBASE	0x98
-#define AMD64_GARTCACHECTL	0x9c
-#define AMD64_GARTEN		(1<<0)
-
 /* NVIDIA K8 registers */
 #define NVIDIA_X86_64_0_APBASE		0x10
 #define NVIDIA_X86_64_1_APBASE1		0x50
@@ -165,7 +146,7 @@ static int amd64_fetch_size(void)
  * In a multiprocessor x86-64 system, this function gets
  * called once for each CPU.
  */
-static u64 amd64_configure (struct pci_dev *hammer, u64 gatt_table)
+static u64 amd64_configure(struct pci_dev *hammer, u64 gatt_table)
 {
 	u64 aperturebase;
 	u32 tmp;
@@ -181,7 +162,7 @@ static u64 amd64_configure (struct pci_dev *hammer, u64 gatt_table)
 	addr >>= 12;
 	tmp = (u32) addr<<4;
 	tmp &= ~0xf;
-	pci_write_config_dword (hammer, AMD64_GARTTABLEBASE, tmp);
+	pci_write_config_dword(hammer, AMD64_GARTTABLEBASE, tmp);
 
 	/* Enable GART translation for this hammer. */
 	pci_read_config_dword(hammer, AMD64_GARTAPERTURECTL, &tmp);
diff --git a/include/asm-x86/gart.h b/include/asm-x86/gart.h
index 90958ed993fa..248e5778e928 100644
--- a/include/asm-x86/gart.h
+++ b/include/asm-x86/gart.h
@@ -5,6 +5,7 @@ extern void pci_iommu_shutdown(void);
 extern void no_iommu_init(void);
 extern int force_iommu, no_iommu;
 extern int iommu_detected;
+extern int agp_amd64_init(void);
 #ifdef CONFIG_GART_IOMMU
 extern void gart_iommu_init(void);
 extern void gart_iommu_shutdown(void);
@@ -31,4 +32,24 @@ static inline void gart_iommu_shutdown(void)
 
 #endif
 
+/* PTE bits. */
+#define GPTE_VALID	1
+#define GPTE_COHERENT	2
+
+/* Aperture control register bits. */
+#define GARTEN		(1<<0)
+#define DISGARTCPU	(1<<4)
+#define DISGARTIO	(1<<5)
+
+/* GART cache control register bits. */
+#define INVGART		(1<<0)
+#define GARTPTEERR	(1<<1)
+
+/* K8 On-cpu GART registers */
+#define AMD64_GARTAPERTURECTL	0x90
+#define AMD64_GARTAPERTUREBASE	0x94
+#define AMD64_GARTTABLEBASE	0x98
+#define AMD64_GARTCACHECTL	0x9c
+#define AMD64_GARTEN		(1<<0)
+
 #endif
-- 
cgit v1.2.3


From 1edc1ab3f68168ec6815e6d630f38948a6da005a Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel.send@gmail.com>
Date: Sun, 13 Apr 2008 01:11:41 -0700
Subject: x86: agp_gart size checking for buggy device

while looking at Rafael J. Wysocki's system boot log,

I found a funny printout:

	Node 0: aperture @ de000000 size 32 MB
	Aperture too small (32 MB)
	AGP bridge at 00:04:00
	Aperture from AGP @ de000000 size 4096 MB (APSIZE 0)
	Aperture too small (0 MB)
	Your BIOS doesn't leave a aperture memory hole
	Please enable the IOMMU option in the BIOS setup
	This costs you 64 MB of RAM
	Mapping aperture over 65536 KB of RAM @ 4000000

	...

	agpgart: Detected AGP bridge 20
	agpgart: Aperture pointing to RAM
	agpgart: Aperture from AGP @ de000000 size 4096 MB
	agpgart: Aperture too small (0 MB)
	agpgart: No usable aperture found.
	agpgart: Consider rebooting with iommu=memaper=2 to get a good aperture.

it means BIOS allocated the correct gart on the NB and AGP bridge, but
because a bug in the silicon (the agp bridge reports the wrong order,
it wants 4G instead) the kernel will reject that allocation.

Also, because the size is only 32MB, and we try to get another 64M for gart,
late fix_northbridge can not revert that change because it still reads
the wrong size from agp bridge.

So try to double check the order value from the agp bridge, before calling
aperture_valid().

[ mingo@elte.hu: 32-bit fix. ]

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/aperture_64.c | 14 ++++++++++++++
 drivers/char/agp/amd64-agp.c  | 11 +++++++++++
 2 files changed, 25 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 479926d9e004..2e93b3132dff 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -138,6 +138,7 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
 	int nbits;
 	u32 aper_low, aper_hi;
 	u64 aper;
+	u32 old_order;
 
 	printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", num, slot, func);
 	apsizereg = read_pci_config_16(num, slot, func, cap + 0x14);
@@ -146,6 +147,9 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
 		return 0;
 	}
 
+	/* old_order could be the value from NB gart setting */
+	old_order = *order;
+
 	apsize = apsizereg & 0xfff;
 	/* Some BIOS use weird encodings not in the AGPv3 table. */
 	if (apsize & 0xff)
@@ -159,6 +163,16 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
 	aper_hi = read_pci_config(num, slot, func, 0x14);
 	aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
 
+	/*
+	 * On some sick chips, APSIZE is 0. It means it wants 4G
+	 * so let double check that order, and lets trust AMD NB settings:
+	 */
+	if (aper + (32UL<<(20 + *order)) > 0x100000000UL) {
+		printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n",
+				32 << *order, apsizereg);
+		*order = old_order;
+	}
+
 	printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
 			aper, 32 << *order, apsizereg);
 
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c
index 25d64224cdbb..9e3939db76e8 100644
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -312,6 +312,17 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp,
 	pci_read_config_dword(agp, 0x10, &aper_low);
 	pci_read_config_dword(agp, 0x14, &aper_hi);
 	aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
+
+	/*
+	 * On some sick chips APSIZE is 0. This means it wants 4G
+	 * so let double check that order, and lets trust the AMD NB settings
+	 */
+	if (aper + (32ULL<<(20 + order)) > 0x100000000ULL) {
+		printk(KERN_INFO "Aperture size %u MB is not right, using settings from NB\n",
+				  32 << order);
+		order = nb_order;
+	}
+
 	printk(KERN_INFO PFX "Aperture from AGP @ %Lx size %u MB\n", aper, 32 << order);
 	if (order < 0 || !aperture_valid(aper, (32*1024*1024)<<order))
 		return -1;
-- 
cgit v1.2.3


From 8c9fd91a0dc503f085169d44f4360be025f75224 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 13 Apr 2008 18:42:31 -0700
Subject: x86: checking aperture size order

some systems are using 32M for gart and agp when memory is less than 4G.
Kernel will reject and try to allcate another 64M that is not needed,
and we will waste 64M of perfectly good RAM.

this patch adds a workaround by checking aper_base/order between NB and
agp bridge. If they are the same, and memory size is less than 4G, it
will allow it.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/aperture_64.c | 47 ++++++++++++++++++++++++++++++++-----------
 drivers/char/agp/amd64-agp.c  | 12 +++++------
 2 files changed, 41 insertions(+), 18 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 2e93b3132dff..8c325b7f2d9b 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -83,7 +83,7 @@ static u32 __init allocate_aperture(void)
 	return (u32)__pa(p);
 }
 
-static int __init aperture_valid(u64 aper_base, u32 aper_size)
+static int __init aperture_valid(u64 aper_base, u32 aper_size, u32 min_size)
 {
 	if (!aper_base)
 		return 0;
@@ -96,8 +96,9 @@ static int __init aperture_valid(u64 aper_base, u32 aper_size)
 		printk(KERN_ERR "Aperture pointing to e820 RAM. Ignoring.\n");
 		return 0;
 	}
-	if (aper_size < 64*1024*1024) {
-		printk(KERN_ERR "Aperture too small (%d MB)\n", aper_size>>20);
+	if (aper_size < min_size) {
+		printk(KERN_ERR "Aperture too small (%d MB) than (%d MB)\n",
+				 aper_size>>20, min_size>>20);
 		return 0;
 	}
 
@@ -167,7 +168,9 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
 	 * On some sick chips, APSIZE is 0. It means it wants 4G
 	 * so let double check that order, and lets trust AMD NB settings:
 	 */
-	if (aper + (32UL<<(20 + *order)) > 0x100000000UL) {
+	printk(KERN_INFO "Aperture from AGP @ %Lx old size %u MB\n",
+			aper, 32 << old_order);
+	if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) {
 		printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n",
 				32 << *order, apsizereg);
 		*order = old_order;
@@ -176,7 +179,7 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
 	printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
 			aper, 32 << *order, apsizereg);
 
-	if (!aperture_valid(aper, (32*1024*1024) << *order))
+	if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20))
 		return 0;
 	return (u32)aper;
 }
@@ -302,8 +305,8 @@ void __init early_gart_iommu_check(void)
 		fix = 1;
 
 	if (gart_fix_e820 && !fix && aper_enabled) {
-		if (e820_any_mapped(aper_base, aper_base + aper_size,
-				    E820_RAM)) {
+		if (!e820_all_mapped(aper_base, aper_base + aper_size,
+				    E820_RESERVED)) {
 			/* reserved it, so we can resuse it in second kernel */
 			printk(KERN_INFO "update e820 for GART\n");
 			add_memory_region(aper_base, aper_size, E820_RESERVED);
@@ -324,8 +327,11 @@ void __init early_gart_iommu_check(void)
 
 }
 
+static int __initdata printed_gart_size_msg;
+
 void __init gart_iommu_hole_init(void)
 {
+	u32 agp_aper_base = 0, agp_aper_order = 0;
 	u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
 	u64 aper_base, last_aper_base = 0;
 	int fix, num, valid_agp = 0;
@@ -337,6 +343,9 @@ void __init gart_iommu_hole_init(void)
 
 	printk(KERN_INFO  "Checking aperture...\n");
 
+	if (!fallback_aper_force)
+		agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
+
 	fix = 0;
 	node = 0;
 	for (num = 24; num < 32; num++) {
@@ -355,9 +364,21 @@ void __init gart_iommu_hole_init(void)
 				node, aper_base, aper_size >> 20);
 		node++;
 
-		if (!aperture_valid(aper_base, aper_size)) {
-			fix = 1;
-			break;
+		if (!aperture_valid(aper_base, aper_size, 64<<20)) {
+			if (valid_agp && agp_aper_base &&
+			    agp_aper_base == aper_base &&
+			    agp_aper_order == aper_order) {
+				/* the same between two setting from NB and agp */
+				if (!no_iommu && end_pfn > MAX_DMA32_PFN && !printed_gart_size_msg) {
+					printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n");
+					printk(KERN_ERR "please increase GART size in your BIOS setup\n");
+					printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n");
+					printed_gart_size_msg = 1;
+				}
+			} else {
+				fix = 1;
+				break;
+			}
 		}
 
 		if ((last_aper_order && aper_order != last_aper_order) ||
@@ -378,8 +399,10 @@ void __init gart_iommu_hole_init(void)
 		return;
 	}
 
-	if (!fallback_aper_force)
-		aper_alloc = search_agp_bridge(&aper_order, &valid_agp);
+	if (!fallback_aper_force) {
+		aper_alloc = agp_aper_base;
+		aper_order = agp_aper_order;
+	}
 
 	if (aper_alloc) {
 		/* Got the aperture from the AGP bridge */
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c
index 9e3939db76e8..9c24470a8252 100644
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -245,11 +245,7 @@ static int __devinit aperture_valid(u64 aper, u32 size)
 		printk(KERN_ERR PFX "No aperture\n");
 		return 0;
 	}
-	if (size < 32*1024*1024) {
-		printk(KERN_ERR PFX "Aperture too small (%d MB)\n", size>>20);
-		return 0;
-	}
-       if ((u64)aper + size > 0x100000000ULL) {
+	if ((u64)aper + size > 0x100000000ULL) {
 		printk(KERN_ERR PFX "Aperture out of bounds\n");
 		return 0;
 	}
@@ -257,6 +253,10 @@ static int __devinit aperture_valid(u64 aper, u32 size)
 		printk(KERN_ERR PFX "Aperture pointing to RAM\n");
 		return 0;
 	}
+	if (size < 32*1024*1024) {
+		printk(KERN_ERR PFX "Aperture too small (%d MB)\n", size>>20);
+		return 0;
+	}
 
 	/* Request the Aperture. This catches cases when someone else
 	   already put a mapping in there - happens with some very broken BIOS
@@ -317,7 +317,7 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp,
 	 * On some sick chips APSIZE is 0. This means it wants 4G
 	 * so let double check that order, and lets trust the AMD NB settings
 	 */
-	if (aper + (32ULL<<(20 + order)) > 0x100000000ULL) {
+	if (order >=0 && aper + (32ULL<<(20 + order)) > 0x100000000ULL) {
 		printk(KERN_INFO "Aperture size %u MB is not right, using settings from NB\n",
 				  32 << order);
 		order = nb_order;
-- 
cgit v1.2.3


From 7677b2ef6c0c4fddc84f6473f3863f40eb71821b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel.send@gmail.com>
Date: Mon, 14 Apr 2008 20:40:37 -0700
Subject: x86_64: allocate gart aperture from 512M

because we try to reserve dma32 early, so we have chance to get aperture
from 64M.

with some sequence aperture allocated from RAM, could become E820_RESERVED.

and then if doing a kexec with a big kernel that uncompressed size is above
64M we could have a range conflict with still using gart.

So allocate gart aperture from 512M instead.

Also change the fallback_aper_order to 5, because we don't have chance to get
2G or 4G aperture.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/aperture_64.c | 20 +++++++++++++++++---
 arch/x86/kernel/pci-dma.c     |  6 +++++-
 2 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 8c325b7f2d9b..c63f8d9fad3e 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -55,8 +55,9 @@ static u32 __init allocate_aperture(void)
 	u32 aper_size;
 	void *p;
 
-	if (fallback_aper_order > 7)
-		fallback_aper_order = 7;
+	/* aper_size should <= 1G */
+	if (fallback_aper_order > 5)
+		fallback_aper_order = 5;
 	aper_size = (32 * 1024 * 1024) << fallback_aper_order;
 
 	/*
@@ -65,7 +66,20 @@ static u32 __init allocate_aperture(void)
 	 * memory. Unfortunately we cannot move it up because that would
 	 * make the IOMMU useless.
 	 */
-	p = __alloc_bootmem_nopanic(aper_size, aper_size, 0);
+	/*
+	 * using 512M as goal, in case kexec will load kernel_big
+	 * that will do the on position decompress, and  could overlap with
+	 * that positon with gart that is used.
+	 * sequende:
+	 * kernel_small
+	 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
+	 * ==> kernel_small(gart area become e820_reserved)
+	 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
+	 * ==> kerne_big (uncompressed size will be big than 64M or 128M)
+	 * so don't use 512M below as gart iommu, leave the space for kernel
+	 * code for safe
+	 */
+	p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20);
 	if (!p || __pa(p)+aper_size > 0xffffffff) {
 		printk(KERN_ERR
 			"Cannot allocate aperture memory hole (%p,%uK)\n",
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 0c37f16b6950..1a017f00e867 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -77,10 +77,14 @@ void __init dma32_reserve_bootmem(void)
 	if (end_pfn <= MAX_DMA32_PFN)
 		return;
 
+	/*
+	 * check aperture_64.c allocate_aperture() for reason about
+	 * using 512M as goal
+	 */
 	align = 64ULL<<20;
 	size = round_up(dma32_bootmem_size, align);
 	dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
-				 __pa(MAX_DMA_ADDRESS));
+				 512ULL<<20);
 	if (dma32_bootmem_ptr)
 		dma32_bootmem_size = size;
 	else
-- 
cgit v1.2.3


From 55c0d721df80dcc505dc888e85d4ca51ea150ce9 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel.send@gmail.com>
Date: Sat, 19 Apr 2008 01:31:11 -0700
Subject: x86: clean up aperture_64.c

1. use symbolic register names where appropriate.
2. num to bus or slot changing
3. handle for new opteron for bus other than 0

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/aperture_64.c | 230 +++++++++++++++++++++++++-----------------
 1 file changed, 139 insertions(+), 91 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index c63f8d9fad3e..02f4dbaa4df4 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -35,6 +35,18 @@ int fallback_aper_force __initdata;
 
 int fix_aperture __initdata = 1;
 
+struct bus_dev_range {
+	int bus;
+	int dev_base;
+	int dev_limit;
+};
+
+static struct bus_dev_range bus_dev_ranges[] __initdata = {
+	{ 0x00, 0x18, 0x20},
+	{ 0xff, 0x00, 0x20},
+	{ 0xfe, 0x00, 0x20}
+};
+
 static struct resource gart_resource = {
 	.name	= "GART",
 	.flags	= IORESOURCE_MEM,
@@ -120,33 +132,33 @@ static int __init aperture_valid(u64 aper_base, u32 aper_size, u32 min_size)
 }
 
 /* Find a PCI capability */
-static __u32 __init find_cap(int num, int slot, int func, int cap)
+static __u32 __init find_cap(int bus, int slot, int func, int cap)
 {
 	int bytes;
 	u8 pos;
 
-	if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
+	if (!(read_pci_config_16(bus, slot, func, PCI_STATUS) &
 						PCI_STATUS_CAP_LIST))
 		return 0;
 
-	pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
+	pos = read_pci_config_byte(bus, slot, func, PCI_CAPABILITY_LIST);
 	for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
 		u8 id;
 
 		pos &= ~3;
-		id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
+		id = read_pci_config_byte(bus, slot, func, pos+PCI_CAP_LIST_ID);
 		if (id == 0xff)
 			break;
 		if (id == cap)
 			return pos;
-		pos = read_pci_config_byte(num, slot, func,
+		pos = read_pci_config_byte(bus, slot, func,
 						pos+PCI_CAP_LIST_NEXT);
 	}
 	return 0;
 }
 
 /* Read a standard AGPv3 bridge header */
-static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
+static __u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
 {
 	u32 apsize;
 	u32 apsizereg;
@@ -155,8 +167,8 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
 	u64 aper;
 	u32 old_order;
 
-	printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", num, slot, func);
-	apsizereg = read_pci_config_16(num, slot, func, cap + 0x14);
+	printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", bus, slot, func);
+	apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14);
 	if (apsizereg == 0xffffffff) {
 		printk(KERN_ERR "APSIZE in AGP bridge unreadable\n");
 		return 0;
@@ -174,8 +186,8 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
 	if ((int)*order < 0) /* < 32MB */
 		*order = 0;
 
-	aper_low = read_pci_config(num, slot, func, 0x10);
-	aper_hi = read_pci_config(num, slot, func, 0x14);
+	aper_low = read_pci_config(bus, slot, func, 0x10);
+	aper_hi = read_pci_config(bus, slot, func, 0x14);
 	aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
 
 	/*
@@ -213,15 +225,15 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
  */
 static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
 {
-	int num, slot, func;
+	int bus, slot, func;
 
 	/* Poor man's PCI discovery */
-	for (num = 0; num < 256; num++) {
+	for (bus = 0; bus < 256; bus++) {
 		for (slot = 0; slot < 32; slot++) {
 			for (func = 0; func < 8; func++) {
 				u32 class, cap;
 				u8 type;
-				class = read_pci_config(num, slot, func,
+				class = read_pci_config(bus, slot, func,
 							PCI_CLASS_REVISION);
 				if (class == 0xffffffff)
 					break;
@@ -230,17 +242,17 @@ static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
 				case PCI_CLASS_BRIDGE_HOST:
 				case PCI_CLASS_BRIDGE_OTHER: /* needed? */
 					/* AGP bridge? */
-					cap = find_cap(num, slot, func,
+					cap = find_cap(bus, slot, func,
 							PCI_CAP_ID_AGP);
 					if (!cap)
 						break;
 					*valid_agp = 1;
-					return read_agp(num, slot, func, cap,
+					return read_agp(bus, slot, func, cap,
 							order);
 				}
 
 				/* No multi-function device? */
-				type = read_pci_config_byte(num, slot, func,
+				type = read_pci_config_byte(bus, slot, func,
 							       PCI_HEADER_TYPE);
 				if (!(type & 0x80))
 					break;
@@ -280,38 +292,49 @@ void __init early_gart_iommu_check(void)
 	 * or BIOS forget to put that in reserved.
 	 * try to update e820 to make that region as reserved.
 	 */
-	int fix, num;
+	int fix, slot;
 	u32 ctl;
 	u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
 	u64 aper_base = 0, last_aper_base = 0;
 	int aper_enabled = 0, last_aper_enabled = 0;
+	int i;
 
 	if (!early_pci_allowed())
 		return;
 
 	fix = 0;
-	for (num = 24; num < 32; num++) {
-		if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
-			continue;
-
-		ctl = read_pci_config(0, num, 3, 0x90);
-		aper_enabled = ctl & 1;
-		aper_order = (ctl >> 1) & 7;
-		aper_size = (32 * 1024 * 1024) << aper_order;
-		aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
-		aper_base <<= 25;
-
-		if ((last_aper_order && aper_order != last_aper_order) ||
-		    (last_aper_base && aper_base != last_aper_base) ||
-		    (last_aper_enabled && aper_enabled != last_aper_enabled)) {
-			fix = 1;
-			break;
+	for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+		int bus;
+		int dev_base, dev_limit;
+
+		bus = bus_dev_ranges[i].bus;
+		dev_base = bus_dev_ranges[i].dev_base;
+		dev_limit = bus_dev_ranges[i].dev_limit;
+
+		for (slot = dev_base; slot < dev_limit; slot++) {
+			if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+				continue;
+
+			ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
+			aper_enabled = ctl & AMD64_GARTEN;
+			aper_order = (ctl >> 1) & 7;
+			aper_size = (32 * 1024 * 1024) << aper_order;
+			aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
+			aper_base <<= 25;
+
+			if ((last_aper_order && aper_order != last_aper_order) ||
+			    (last_aper_base && aper_base != last_aper_base) ||
+			    (last_aper_enabled && aper_enabled != last_aper_enabled)) {
+				fix = 1;
+				goto out;
+			}
+			last_aper_order = aper_order;
+			last_aper_base = aper_base;
+			last_aper_enabled = aper_enabled;
 		}
-		last_aper_order = aper_order;
-		last_aper_base = aper_base;
-		last_aper_enabled = aper_enabled;
 	}
 
+out:
 	if (!fix && !aper_enabled)
 		return;
 
@@ -330,13 +353,22 @@ void __init early_gart_iommu_check(void)
 	}
 
 	/* different nodes have different setting, disable them all at first*/
-	for (num = 24; num < 32; num++) {
-		if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
-			continue;
+	for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+		int bus;
+		int dev_base, dev_limit;
+
+		bus = bus_dev_ranges[i].bus;
+		dev_base = bus_dev_ranges[i].dev_base;
+		dev_limit = bus_dev_ranges[i].dev_limit;
 
-		ctl = read_pci_config(0, num, 3, 0x90);
-		ctl &= ~1;
-		write_pci_config(0, num, 3, 0x90, ctl);
+		for (slot = dev_base; slot < dev_limit; slot++) {
+			if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+				continue;
+
+			ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
+			ctl &= ~AMD64_GARTEN;
+			write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
+		}
 	}
 
 }
@@ -348,8 +380,8 @@ void __init gart_iommu_hole_init(void)
 	u32 agp_aper_base = 0, agp_aper_order = 0;
 	u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
 	u64 aper_base, last_aper_base = 0;
-	int fix, num, valid_agp = 0;
-	int node;
+	int fix, slot, valid_agp = 0;
+	int i, node;
 
 	if (gart_iommu_aperture_disabled || !fix_aperture ||
 	    !early_pci_allowed())
@@ -362,48 +394,58 @@ void __init gart_iommu_hole_init(void)
 
 	fix = 0;
 	node = 0;
-	for (num = 24; num < 32; num++) {
-		if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
-			continue;
-
-		iommu_detected = 1;
-		gart_iommu_aperture = 1;
-
-		aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7;
-		aper_size = (32 * 1024 * 1024) << aper_order;
-		aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
-		aper_base <<= 25;
-
-		printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n",
-				node, aper_base, aper_size >> 20);
-		node++;
-
-		if (!aperture_valid(aper_base, aper_size, 64<<20)) {
-			if (valid_agp && agp_aper_base &&
-			    agp_aper_base == aper_base &&
-			    agp_aper_order == aper_order) {
-				/* the same between two setting from NB and agp */
-				if (!no_iommu && end_pfn > MAX_DMA32_PFN && !printed_gart_size_msg) {
-					printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n");
-					printk(KERN_ERR "please increase GART size in your BIOS setup\n");
-					printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n");
-					printed_gart_size_msg = 1;
+	for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+		int bus;
+		int dev_base, dev_limit;
+
+		bus = bus_dev_ranges[i].bus;
+		dev_base = bus_dev_ranges[i].dev_base;
+		dev_limit = bus_dev_ranges[i].dev_limit;
+
+		for (slot = dev_base; slot < dev_limit; slot++) {
+			if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+				continue;
+
+			iommu_detected = 1;
+			gart_iommu_aperture = 1;
+
+			aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7;
+			aper_size = (32 * 1024 * 1024) << aper_order;
+			aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
+			aper_base <<= 25;
+
+			printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n",
+					node, aper_base, aper_size >> 20);
+			node++;
+
+			if (!aperture_valid(aper_base, aper_size, 64<<20)) {
+				if (valid_agp && agp_aper_base &&
+				    agp_aper_base == aper_base &&
+				    agp_aper_order == aper_order) {
+					/* the same between two setting from NB and agp */
+					if (!no_iommu && end_pfn > MAX_DMA32_PFN && !printed_gart_size_msg) {
+						printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n");
+						printk(KERN_ERR "please increase GART size in your BIOS setup\n");
+						printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n");
+						printed_gart_size_msg = 1;
+					}
+				} else {
+					fix = 1;
+					goto out;
 				}
-			} else {
-				fix = 1;
-				break;
 			}
-		}
 
-		if ((last_aper_order && aper_order != last_aper_order) ||
-		    (last_aper_base && aper_base != last_aper_base)) {
-			fix = 1;
-			break;
+			if ((last_aper_order && aper_order != last_aper_order) ||
+			    (last_aper_base && aper_base != last_aper_base)) {
+				fix = 1;
+				goto out;
+			}
+			last_aper_order = aper_order;
+			last_aper_base = aper_base;
 		}
-		last_aper_order = aper_order;
-		last_aper_base = aper_base;
 	}
 
+out:
 	if (!fix && !fallback_aper_force) {
 		if (last_aper_base) {
 			unsigned long n = (32 * 1024 * 1024) << last_aper_order;
@@ -452,16 +494,22 @@ void __init gart_iommu_hole_init(void)
 	}
 
 	/* Fix up the north bridges */
-	for (num = 24; num < 32; num++) {
-		if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
-			continue;
-
-		/*
-		 * Don't enable translation yet. That is done later.
-		 * Assume this BIOS didn't initialise the GART so
-		 * just overwrite all previous bits
-		 */
-		write_pci_config(0, num, 3, 0x90, aper_order<<1);
-		write_pci_config(0, num, 3, 0x94, aper_alloc>>25);
+	for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+		int bus;
+		int dev_base, dev_limit;
+
+		bus = bus_dev_ranges[i].bus;
+		dev_base = bus_dev_ranges[i].dev_base;
+		dev_limit = bus_dev_ranges[i].dev_limit;
+		for (slot = dev_base; slot < dev_limit; slot++) {
+			if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+				continue;
+
+			/* Don't enable translation yet. That is done later.
+			   Assume this BIOS didn't initialise the GART so
+			   just overwrite all previous bits */
+			write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1);
+			write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
+		}
 	}
 }
-- 
cgit v1.2.3


From 330fce23dab6e6a3d1979e55f27aba4c0c301331 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel.send@gmail.com>
Date: Sat, 19 Apr 2008 01:31:45 -0700
Subject: x86: reserve dma32 early for gart fix

we can use free_bootmem() directly.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/pci-dma.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 1a017f00e867..81862d0c7a9a 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -92,7 +92,6 @@ void __init dma32_reserve_bootmem(void)
 }
 static void __init dma32_free_bootmem(void)
 {
-	int node;
 
 	if (end_pfn <= MAX_DMA32_PFN)
 		return;
@@ -100,9 +99,7 @@ static void __init dma32_free_bootmem(void)
 	if (!dma32_bootmem_ptr)
 		return;
 
-	for_each_online_node(node)
-		free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
-				  dma32_bootmem_size);
+	free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
 
 	dma32_bootmem_ptr = NULL;
 	dma32_bootmem_size = 0;
-- 
cgit v1.2.3


From 3bb6fbf9969a8bbe4892968659239273d092e78a Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Tue, 15 Apr 2008 12:43:57 +0200
Subject: x86 gart: factor out common code

Cleanup gart handling on amd64 a bit: move common code into
enable_gart_translation , and use symbolic register names where
appropriate.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/pci-gart_64.c | 23 ++++++-----------------
 drivers/char/agp/amd64-agp.c  | 29 +++++++++--------------------
 include/asm-x86/gart.h        | 17 +++++++++++++++++
 3 files changed, 32 insertions(+), 37 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index bffcf455c857..1f99b62ff616 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -533,8 +533,8 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
 	unsigned aper_size = 0, aper_base_32, aper_order;
 	u64 aper_base;
 
-	pci_read_config_dword(dev, 0x94, &aper_base_32);
-	pci_read_config_dword(dev, 0x90, &aper_order);
+	pci_read_config_dword(dev, AMD64_GARTAPERTUREBASE, &aper_base_32);
+	pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &aper_order);
 	aper_order = (aper_order >> 1) & 7;
 
 	aper_base = aper_base_32 & 0x7fff;
@@ -592,19 +592,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 	agp_gatt_table = gatt;
 
 	for (i = 0; i < num_k8_northbridges; i++) {
-		u32 gatt_reg;
-		u32 ctl;
-
 		dev = k8_northbridges[i];
-		gatt_reg = __pa(gatt) >> 12;
-		gatt_reg <<= 4;
-		pci_write_config_dword(dev, AMD64_GARTTABLEBASE, gatt_reg);
-		pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
-
-		ctl |= GARTEN;
-		ctl &= ~(DISGARTCPU | DISGARTIO);
-
-		pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
+		enable_gart_translation(dev, __pa(gatt));
 	}
 	flush_gart();
 
@@ -648,11 +637,11 @@ void gart_iommu_shutdown(void)
 		u32 ctl;
 
 		dev = k8_northbridges[i];
-		pci_read_config_dword(dev, 0x90, &ctl);
+		pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
 
-		ctl &= ~1;
+		ctl &= ~GARTEN;
 
-		pci_write_config_dword(dev, 0x90, ctl);
+		pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
 	}
 }
 
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c
index 9c24470a8252..e3c7ea07f57c 100644
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -150,25 +150,14 @@ static u64 amd64_configure(struct pci_dev *hammer, u64 gatt_table)
 {
 	u64 aperturebase;
 	u32 tmp;
-	u64 addr, aper_base;
+	u64 aper_base;
 
 	/* Address to map to */
-	pci_read_config_dword (hammer, AMD64_GARTAPERTUREBASE, &tmp);
+	pci_read_config_dword(hammer, AMD64_GARTAPERTUREBASE, &tmp);
 	aperturebase = tmp << 25;
 	aper_base = (aperturebase & PCI_BASE_ADDRESS_MEM_MASK);
 
-	/* address of the mappings table */
-	addr = (u64) gatt_table;
-	addr >>= 12;
-	tmp = (u32) addr<<4;
-	tmp &= ~0xf;
-	pci_write_config_dword(hammer, AMD64_GARTTABLEBASE, tmp);
-
-	/* Enable GART translation for this hammer. */
-	pci_read_config_dword(hammer, AMD64_GARTAPERTURECTL, &tmp);
-	tmp |= GARTEN;
-	tmp &= ~(DISGARTCPU | DISGARTIO);
-	pci_write_config_dword(hammer, AMD64_GARTAPERTURECTL, tmp);
+	enable_gart_translation(hammer, gatt_table);
 
 	return aper_base;
 }
@@ -207,9 +196,9 @@ static void amd64_cleanup(void)
         for (i = 0; i < num_k8_northbridges; i++) {
 		struct pci_dev *dev = k8_northbridges[i];
 		/* disable gart translation */
-		pci_read_config_dword (dev, AMD64_GARTAPERTURECTL, &tmp);
+		pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp);
 		tmp &= ~AMD64_GARTEN;
-		pci_write_config_dword (dev, AMD64_GARTAPERTURECTL, tmp);
+		pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, tmp);
 	}
 }
 
@@ -289,9 +278,9 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp,
 	u32 nb_order, nb_base;
 	u16 apsize;
 
-	pci_read_config_dword(nb, 0x90, &nb_order);
+	pci_read_config_dword(nb, AMD64_GARTAPERTURECTL, &nb_order);
 	nb_order = (nb_order >> 1) & 7;
-	pci_read_config_dword(nb, 0x94, &nb_base);
+	pci_read_config_dword(nb, AMD64_GARTAPERTUREBASE, &nb_base);
 	nb_aper = nb_base << 25;
 	if (aperture_valid(nb_aper, (32*1024*1024)<<nb_order)) {
 		return 0;
@@ -327,8 +316,8 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp,
 	if (order < 0 || !aperture_valid(aper, (32*1024*1024)<<order))
 		return -1;
 
-	pci_write_config_dword(nb, 0x90, order << 1);
-	pci_write_config_dword(nb, 0x94, aper >> 25);
+	pci_write_config_dword(nb, AMD64_GARTAPERTURECTL, order << 1);
+	pci_write_config_dword(nb, AMD64_GARTAPERTUREBASE, aper >> 25);
 
 	return 0;
 }
diff --git a/include/asm-x86/gart.h b/include/asm-x86/gart.h
index 248e5778e928..6f22786d2f0c 100644
--- a/include/asm-x86/gart.h
+++ b/include/asm-x86/gart.h
@@ -52,4 +52,21 @@ static inline void gart_iommu_shutdown(void)
 #define AMD64_GARTCACHECTL	0x9c
 #define AMD64_GARTEN		(1<<0)
 
+static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
+{
+	u32 tmp, ctl;
+
+        /* address of the mappings table */
+        addr >>= 12;
+        tmp = (u32) addr<<4;
+        tmp &= ~0xf;
+        pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp);
+
+        /* Enable GART translation for this hammer. */
+        pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
+        ctl |= GARTEN;
+        ctl &= ~(DISGARTCPU | DISGARTIO);
+        pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
+}
+
 #endif
-- 
cgit v1.2.3


From f784946ded3e734524d1f48a6a8286b8a152c3b9 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Fri, 2 May 2008 16:45:08 -0700
Subject: x86: use per_cpu data in nmi_32

use per_cpu for per CPU data.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/nmi_32.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
index 11b14bbaa61e..69bdae555c18 100644
--- a/arch/x86/kernel/nmi_32.c
+++ b/arch/x86/kernel/nmi_32.c
@@ -293,9 +293,9 @@ void stop_apic_nmi_watchdog(void *unused)
  *  here too!]
  */
 
-static unsigned int
-	last_irq_sums [NR_CPUS],
-	alert_counter [NR_CPUS];
+static DEFINE_PER_CPU(unsigned, last_irq_sum);
+static DEFINE_PER_CPU(local_t, alert_counter);
+static DEFINE_PER_CPU(int, nmi_touch);
 
 void touch_nmi_watchdog(void)
 {
@@ -303,12 +303,13 @@ void touch_nmi_watchdog(void)
 		unsigned cpu;
 
 		/*
-		 * Just reset the alert counters, (other CPUs might be
-		 * spinning on locks we hold):
+		 * Tell other CPUs to reset their alert counters. We cannot
+		 * do it ourselves because the alert count increase is not
+		 * atomic.
 		 */
 		for_each_present_cpu(cpu) {
-			if (alert_counter[cpu])
-				alert_counter[cpu] = 0;
+			if (per_cpu(nmi_touch, cpu) != 1)
+				per_cpu(nmi_touch, cpu) = 1;
 		}
 	}
 
@@ -358,22 +359,26 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 	 */
 	sum = per_cpu(irq_stat, cpu).apic_timer_irqs +
 		per_cpu(irq_stat, cpu).irq0_irqs;
+	if (__get_cpu_var(nmi_touch)) {
+		__get_cpu_var(nmi_touch) = 0;
+		touched = 1;
+	}
 
 	/* if the none of the timers isn't firing, this cpu isn't doing much */
-	if (!touched && last_irq_sums[cpu] == sum) {
+	if (!touched && __get_cpu_var(last_irq_sum) == sum) {
 		/*
 		 * Ayiee, looks like this CPU is stuck ...
 		 * wait a few IRQs (5 seconds) before doing the oops ...
 		 */
-		alert_counter[cpu]++;
-		if (alert_counter[cpu] == 5*nmi_hz)
+		local_inc(&__get_cpu_var(alert_counter));
+		if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz)
 			/*
 			 * die_nmi will return ONLY if NOTIFY_STOP happens..
 			 */
 			die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
 	} else {
-		last_irq_sums[cpu] = sum;
-		alert_counter[cpu] = 0;
+		__get_cpu_var(last_irq_sum) = sum;
+		local_set(&__get_cpu_var(alert_counter), 0);
 	}
 	/* see if the nmi watchdog went off */
 	if (!__get_cpu_var(wd_enabled))
-- 
cgit v1.2.3


From f59a9310b97879159ef4d25227bc270278f1ee71 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Fri, 2 May 2008 16:44:52 -0700
Subject: x86: nmi and irq counters are unsigned in nmi_64.c

__nmi_count, apic_timer_irqs and irq0_irqs are unsigned.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/nmi_64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c
index 5a29ded994fa..33cb8ecbb6dd 100644
--- a/arch/x86/kernel/nmi_64.c
+++ b/arch/x86/kernel/nmi_64.c
@@ -79,7 +79,7 @@ static __init void nmi_cpu_busy(void *data)
 
 int __init check_nmi_watchdog(void)
 {
-	int *prev_nmi_count;
+	unsigned int *prev_nmi_count;
 	int cpu;
 
 	if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED))
@@ -316,7 +316,7 @@ EXPORT_SYMBOL(touch_nmi_watchdog);
 notrace __kprobes int
 nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 {
-	int sum;
+	unsigned int sum;
 	int touched = 0;
 	int cpu = smp_processor_id();
 	int rc = 0;
-- 
cgit v1.2.3


From 205f93288093df69f9ab5f6981aef27b91088b28 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@codemonkey.org.uk>
Date: Mon, 5 May 2008 17:52:52 -0400
Subject: x86: add new cache descriptor

The latest rev of Intel doc AP-485 details a new cache
descriptor that we don't yet support.
A 6MB 24-way assoc L2 cache.

Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 26d615dcb149..2c8afafa18e8 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -62,6 +62,7 @@ static struct _cache_table cache_table[] __cpuinitdata =
 	{ 0x4b, LVL_3,      8192 },	/* 16-way set assoc, 64 byte line size */
 	{ 0x4c, LVL_3,     12288 },	/* 12-way set assoc, 64 byte line size */
 	{ 0x4d, LVL_3,     16384 },	/* 16-way set assoc, 64 byte line size */
+	{ 0x4e, LVL_2,      6144 },	/* 24-way set assoc, 64 byte line size */
 	{ 0x60, LVL_1_DATA, 16 },	/* 8-way set assoc, sectored cache, 64 byte line size */
 	{ 0x66, LVL_1_DATA, 8 },	/* 4-way set assoc, sectored cache, 64 byte line size */
 	{ 0x67, LVL_1_DATA, 16 },	/* 4-way set assoc, sectored cache, 64 byte line size */
-- 
cgit v1.2.3


From 5119e92efc733d730b34f9605a5ae61fdc4bf649 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Thu, 15 May 2008 09:12:01 -0600
Subject: x86: cdev lock_kernel() pushdown

Push the cdev lock_kernel() call down into the x86 msr and cpuid drivers.

Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 arch/x86/kernel/cpuid.c | 25 +++++++++++++++++--------
 arch/x86/kernel/msr.c   | 16 ++++++++++++----
 2 files changed, 29 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index daff52a62248..71f1c2654bec 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -33,6 +33,7 @@
 #include <linux/init.h>
 #include <linux/poll.h>
 #include <linux/smp.h>
+#include <linux/smp_lock.h>
 #include <linux/major.h>
 #include <linux/fs.h>
 #include <linux/smp_lock.h>
@@ -107,15 +108,23 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
 
 static int cpuid_open(struct inode *inode, struct file *file)
 {
-	unsigned int cpu = iminor(file->f_path.dentry->d_inode);
-	struct cpuinfo_x86 *c = &cpu_data(cpu);
-
-	if (cpu >= NR_CPUS || !cpu_online(cpu))
-		return -ENXIO;	/* No such CPU */
+	unsigned int cpu;
+	struct cpuinfo_x86 *c;
+	int ret = 0;
+	
+	lock_kernel();
+
+	cpu = iminor(file->f_path.dentry->d_inode);
+	if (cpu >= NR_CPUS || !cpu_online(cpu)) {
+		ret = -ENXIO;	/* No such CPU */
+		goto out;
+	}
+	c = &cpu_data(cpu);
 	if (c->cpuid_level < 0)
-		return -EIO;	/* CPUID not supported */
-
-	return 0;
+		ret = -EIO;	/* CPUID not supported */
+out:
+	unlock_kernel();
+	return ret;
 }
 
 /*
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 1f3abe048e93..a153b3905f60 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -117,12 +117,20 @@ static int msr_open(struct inode *inode, struct file *file)
 {
 	unsigned int cpu = iminor(file->f_path.dentry->d_inode);
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	int ret = 0;
 
-	if (cpu >= NR_CPUS || !cpu_online(cpu))
-		return -ENXIO;	/* No such CPU */
-	if (!cpu_has(c, X86_FEATURE_MSR))
-		return -EIO;	/* MSR not supported */
+	lock_kernel();
+	cpu = iminor(file->f_path.dentry->d_inode);
 
+	if (cpu >= NR_CPUS || !cpu_online(cpu)) {
+		ret = -ENXIO;	/* No such CPU */
+		goto out;
+	}
+	c = &cpu_data(cpu);
+	if (!cpu_has(c, X86_FEATURE_MSR))
+		ret = -EIO;	/* MSR not supported */
+out:
+	unlock_kernel();
 	return 0;
 }
 
-- 
cgit v1.2.3


From 0abbc78a0137fee60ef092f0b20a3d3d7e7e0cc2 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Tue, 20 May 2008 16:27:17 +0200
Subject: x86, aperture_64: use symbolic constants

Factor-out common aperture_valid code.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/aperture_64.c | 23 +----------------------
 drivers/char/agp/amd64-agp.c  | 22 ++++------------------
 include/asm-x86/gart.h        | 24 ++++++++++++++++++++++++
 3 files changed, 29 insertions(+), 40 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 02f4dbaa4df4..5373f7834d8a 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -109,27 +109,6 @@ static u32 __init allocate_aperture(void)
 	return (u32)__pa(p);
 }
 
-static int __init aperture_valid(u64 aper_base, u32 aper_size, u32 min_size)
-{
-	if (!aper_base)
-		return 0;
-
-	if (aper_base + aper_size > 0x100000000UL) {
-		printk(KERN_ERR "Aperture beyond 4GB. Ignoring.\n");
-		return 0;
-	}
-	if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
-		printk(KERN_ERR "Aperture pointing to e820 RAM. Ignoring.\n");
-		return 0;
-	}
-	if (aper_size < min_size) {
-		printk(KERN_ERR "Aperture too small (%d MB) than (%d MB)\n",
-				 aper_size>>20, min_size>>20);
-		return 0;
-	}
-
-	return 1;
-}
 
 /* Find a PCI capability */
 static __u32 __init find_cap(int bus, int slot, int func, int cap)
@@ -344,7 +323,7 @@ out:
 	if (gart_fix_e820 && !fix && aper_enabled) {
 		if (!e820_all_mapped(aper_base, aper_base + aper_size,
 				    E820_RESERVED)) {
-			/* reserved it, so we can resuse it in second kernel */
+			/* reserve it, so we can reuse it in second kernel */
 			printk(KERN_INFO "update e820 for GART\n");
 			add_memory_region(aper_base, aper_size, E820_RESERVED);
 			update_e820();
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c
index e3c7ea07f57c..f5af65ac8c78 100644
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -228,24 +228,10 @@ static const struct agp_bridge_driver amd_8151_driver = {
 };
 
 /* Some basic sanity checks for the aperture. */
-static int __devinit aperture_valid(u64 aper, u32 size)
+static int __devinit agp_aperture_valid(u64 aper, u32 size)
 {
-	if (aper == 0) {
-		printk(KERN_ERR PFX "No aperture\n");
+	if (!aperture_valid(aper, size, 32*1024*1024))
 		return 0;
-	}
-	if ((u64)aper + size > 0x100000000ULL) {
-		printk(KERN_ERR PFX "Aperture out of bounds\n");
-		return 0;
-	}
-	if (e820_any_mapped(aper, aper + size, E820_RAM)) {
-		printk(KERN_ERR PFX "Aperture pointing to RAM\n");
-		return 0;
-	}
-	if (size < 32*1024*1024) {
-		printk(KERN_ERR PFX "Aperture too small (%d MB)\n", size>>20);
-		return 0;
-	}
 
 	/* Request the Aperture. This catches cases when someone else
 	   already put a mapping in there - happens with some very broken BIOS
@@ -282,7 +268,7 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp,
 	nb_order = (nb_order >> 1) & 7;
 	pci_read_config_dword(nb, AMD64_GARTAPERTUREBASE, &nb_base);
 	nb_aper = nb_base << 25;
-	if (aperture_valid(nb_aper, (32*1024*1024)<<nb_order)) {
+	if (agp_aperture_valid(nb_aper, (32*1024*1024)<<nb_order)) {
 		return 0;
 	}
 
@@ -313,7 +299,7 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp,
 	}
 
 	printk(KERN_INFO PFX "Aperture from AGP @ %Lx size %u MB\n", aper, 32 << order);
-	if (order < 0 || !aperture_valid(aper, (32*1024*1024)<<order))
+	if (order < 0 || !agp_aperture_valid(aper, (32*1024*1024)<<order))
 		return -1;
 
 	pci_write_config_dword(nb, AMD64_GARTAPERTURECTL, order << 1);
diff --git a/include/asm-x86/gart.h b/include/asm-x86/gart.h
index 6f22786d2f0c..c818b96f936b 100644
--- a/include/asm-x86/gart.h
+++ b/include/asm-x86/gart.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_X8664_IOMMU_H
 #define _ASM_X8664_IOMMU_H 1
 
+#include <asm/e820.h>
+
 extern void pci_iommu_shutdown(void);
 extern void no_iommu_init(void);
 extern int force_iommu, no_iommu;
@@ -69,4 +71,26 @@ static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
         pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
 }
 
+static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size)
+{
+	if (!aper_base)
+		return 0;
+
+	if (aper_base + aper_size > 0x100000000ULL) {
+		printk(KERN_ERR "Aperture beyond 4GB. Ignoring.\n");
+		return 0;
+	}
+	if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
+		printk(KERN_ERR "Aperture pointing to e820 RAM. Ignoring.\n");
+		return 0;
+	}
+	if (aper_size < min_size) {
+		printk(KERN_ERR "Aperture too small (%d MB) than (%d MB)\n",
+				 aper_size>>20, min_size>>20);
+		return 0;
+	}
+
+	return 1;
+}
+
 #endif
-- 
cgit v1.2.3


From 0748aca6f0af917591dcfd70dccac770a25d4f71 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 13 May 2008 10:34:20 +0200
Subject: x86: remove useless static current_tsc_khz variable

current_tsc_khz is just written by the init code and never used
again. Remove it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/tsc_32.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
index e4790728b224..d53b1040cbb7 100644
--- a/arch/x86/kernel/tsc_32.c
+++ b/arch/x86/kernel/tsc_32.c
@@ -283,7 +283,6 @@ core_initcall(cpufreq_tsc);
 
 /* clock source code */
 
-static unsigned long current_tsc_khz;
 static struct clocksource clocksource_tsc;
 
 /*
@@ -434,9 +433,8 @@ void __init tsc_init(void)
 
 	unsynchronized_tsc();
 	check_geode_tsc_reliable();
-	current_tsc_khz = tsc_khz;
-	clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
-							clocksource_tsc.shift);
+	clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
+						    clocksource_tsc.shift);
 	/* lower the rating if we already know its unstable: */
 	if (check_tsc_unstable()) {
 		clocksource_tsc.rating = 0;
-- 
cgit v1.2.3


From b478458aeebfc55fe409abec43794ac72a623c79 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Mon, 12 May 2008 15:44:39 +0200
Subject: x86: avoid re-loading LDT in unrelated address spaces

Performance optimization.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/ldt.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 0224c3637c73..21f2bae98c15 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -20,9 +20,9 @@
 #include <asm/mmu_context.h>
 
 #ifdef CONFIG_SMP
-static void flush_ldt(void *null)
+static void flush_ldt(void *current_mm)
 {
-	if (current->active_mm)
+	if (current->active_mm == current_mm)
 		load_LDT(&current->active_mm->context);
 }
 #endif
@@ -68,7 +68,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
 		load_LDT(pc);
 		mask = cpumask_of_cpu(smp_processor_id());
 		if (!cpus_equal(current->mm->cpu_vm_mask, mask))
-			smp_call_function(flush_ldt, NULL, 1, 1);
+			smp_call_function(flush_ldt, current->mm, 1, 1);
 		preempt_enable();
 #else
 		load_LDT(pc);
-- 
cgit v1.2.3


From 873b274a41c0cfe58b2eb0a7722424eb367b905b Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Thu, 22 May 2008 13:02:23 -0400
Subject: x86: Add Centaur and Transmeta CPUs to PAT whitelist

Unconditionally enable PAT support on Centaur and Transmeta CPUs.
All known models that advertise PAT have no known errata.

Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/addon_cpuid_features.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index c2e1ce33c7cb..d8b3e4a9d663 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -62,6 +62,9 @@ void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
 		if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15))
 			return;
 		break;
+	case X86_VENDOR_CENTAUR:
+	case X86_VENDOR_TRANSMETA:
+		return;
 	}
 
 	pat_disable(cpu_has_pat ?
-- 
cgit v1.2.3


From 23adec554a7648f99c8acc0caf49c66320cd2b84 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:41 +0200
Subject: x86: add notrace annotations to vsyscall.

Add the notrace annotations to the vsyscall functions - there we are
not in kernel context yet, so the tracer function cannot (and must not)
be called.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/vsyscall_64.c  |  3 ++-
 arch/x86/vdso/vclock_gettime.c | 15 ++++++++-------
 arch/x86/vdso/vgetcpu.c        |  3 ++-
 include/asm-x86/vsyscall.h     |  3 ++-
 4 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 61efa2f7d564..4063dfa2a02d 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -42,7 +42,8 @@
 #include <asm/topology.h>
 #include <asm/vgtod.h>
 
-#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+#define __vsyscall(nr) \
+		__attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
 #define __syscall_clobber "r11","cx","memory"
 
 /*
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 23476c2ebfc4..5cb8f754c52d 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -23,7 +23,7 @@
 
 #define gtod vdso_vsyscall_gtod_data
 
-static long vdso_fallback_gettime(long clock, struct timespec *ts)
+notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
 {
 	long ret;
 	asm("syscall" : "=a" (ret) :
@@ -31,7 +31,7 @@ static long vdso_fallback_gettime(long clock, struct timespec *ts)
 	return ret;
 }
 
-static inline long vgetns(void)
+notrace static inline long vgetns(void)
 {
 	long v;
 	cycles_t (*vread)(void);
@@ -40,7 +40,7 @@ static inline long vgetns(void)
 	return (v * gtod->clock.mult) >> gtod->clock.shift;
 }
 
-static noinline int do_realtime(struct timespec *ts)
+notrace static noinline int do_realtime(struct timespec *ts)
 {
 	unsigned long seq, ns;
 	do {
@@ -54,7 +54,8 @@ static noinline int do_realtime(struct timespec *ts)
 }
 
 /* Copy of the version in kernel/time.c which we cannot directly access */
-static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
+notrace static void
+vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
 {
 	while (nsec >= NSEC_PER_SEC) {
 		nsec -= NSEC_PER_SEC;
@@ -68,7 +69,7 @@ static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
 	ts->tv_nsec = nsec;
 }
 
-static noinline int do_monotonic(struct timespec *ts)
+notrace static noinline int do_monotonic(struct timespec *ts)
 {
 	unsigned long seq, ns, secs;
 	do {
@@ -82,7 +83,7 @@ static noinline int do_monotonic(struct timespec *ts)
 	return 0;
 }
 
-int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
+notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 {
 	if (likely(gtod->sysctl_enabled && gtod->clock.vread))
 		switch (clock) {
@@ -96,7 +97,7 @@ int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 int clock_gettime(clockid_t, struct timespec *)
 	__attribute__((weak, alias("__vdso_clock_gettime")));
 
-int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
+notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
 {
 	long ret;
 	if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c
index c8097f17f8a9..9fbc6b20026b 100644
--- a/arch/x86/vdso/vgetcpu.c
+++ b/arch/x86/vdso/vgetcpu.c
@@ -13,7 +13,8 @@
 #include <asm/vgtod.h>
 #include "vextern.h"
 
-long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
+notrace long
+__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
 {
 	unsigned int p;
 
diff --git a/include/asm-x86/vsyscall.h b/include/asm-x86/vsyscall.h
index 17b3700949bf..6b66ff905af0 100644
--- a/include/asm-x86/vsyscall.h
+++ b/include/asm-x86/vsyscall.h
@@ -24,7 +24,8 @@ enum vsyscall_num {
 	((unused, __section__ (".vsyscall_gtod_data"),aligned(16)))
 #define __section_vsyscall_clock __attribute__ \
 	((unused, __section__ (".vsyscall_clock"),aligned(16)))
-#define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn")))
+#define __vsyscall_fn \
+	__attribute__ ((unused, __section__(".vsyscall_fn"))) notrace
 
 #define VGETCPU_RDTSCP	1
 #define VGETCPU_LSL	2
-- 
cgit v1.2.3


From 16444a8a40d4c7b4f6de34af0cae1f76a4f6c901 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Date: Mon, 12 May 2008 21:20:42 +0200
Subject: ftrace: add basic support for gcc profiler instrumentation

If CONFIG_FTRACE is selected and /proc/sys/kernel/ftrace_enabled is
set to a non-zero value the ftrace routine will be called everytime
we enter a kernel function that is not marked with the "notrace"
attribute.

The ftrace routine will then call a registered function if a function
happens to be registered.

[ This code has been highly hacked by Steven Rostedt and Ingo Molnar,
  so don't blame Arnaldo for all of this ;-) ]

Update:
  It is now possible to register more than one ftrace function.
  If only one ftrace function is registered, that will be the
  function that ftrace calls directly. If more than one function
  is registered, then ftrace will call a function that will loop
  through the functions to call.

Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 Makefile                   |   4 ++
 arch/x86/Kconfig           |   1 +
 arch/x86/kernel/entry_32.S |  27 +++++++++
 arch/x86/kernel/entry_64.S |  37 ++++++++++++
 include/linux/ftrace.h     |  38 +++++++++++++
 kernel/Makefile            |   1 +
 kernel/trace/Kconfig       |   5 ++
 kernel/trace/Makefile      |   3 +
 kernel/trace/ftrace.c      | 138 +++++++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig.debug          |   2 +
 10 files changed, 256 insertions(+)
 create mode 100644 include/linux/ftrace.h
 create mode 100644 kernel/trace/Kconfig
 create mode 100644 kernel/trace/Makefile
 create mode 100644 kernel/trace/ftrace.c

(limited to 'arch/x86/kernel')

diff --git a/Makefile b/Makefile
index 20b32351906b..b4a273f19b52 100644
--- a/Makefile
+++ b/Makefile
@@ -528,6 +528,10 @@ KBUILD_CFLAGS	+= -g
 KBUILD_AFLAGS	+= -gdwarf-2
 endif
 
+ifdef CONFIG_FTRACE
+KBUILD_CFLAGS	+= -pg
+endif
+
 # We trigger additional mismatches with less inlining
 ifdef CONFIG_DEBUG_SECTION_MISMATCH
 KBUILD_CFLAGS += $(call cc-option, -fno-inline-functions-called-once)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fe361ae7ef2f..c742dfeb0dbe 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -23,6 +23,7 @@ config X86
 	select HAVE_OPROFILE
 	select HAVE_KPROBES
 	select HAVE_KRETPROBES
+	select HAVE_FTRACE
 	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
 	select HAVE_ARCH_KGDB if !X86_VOYAGER
 
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 2a609dc3271c..f47b9b5440d2 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1109,6 +1109,33 @@ ENDPROC(xen_failsafe_callback)
 
 #endif	/* CONFIG_XEN */
 
+#ifdef CONFIG_FTRACE
+ENTRY(mcount)
+	cmpl $ftrace_stub, ftrace_trace_function
+	jnz trace
+
+.globl ftrace_stub
+ftrace_stub:
+	ret
+
+	/* taken from glibc */
+trace:
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+	movl 0xc(%esp), %eax
+	movl 0x4(%ebp), %edx
+
+	call   *ftrace_trace_function
+
+	popl %edx
+	popl %ecx
+	popl %eax
+
+	jmp ftrace_stub
+END(mcount)
+#endif
+
 .section .rodata,"a"
 #include "syscall_table_32.S"
 
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 556a8df522a7..f046e0c64883 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -54,6 +54,43 @@
 
 	.code64
 
+#ifdef CONFIG_FTRACE
+ENTRY(mcount)
+	cmpq $ftrace_stub, ftrace_trace_function
+	jnz trace
+.globl ftrace_stub
+ftrace_stub:
+	retq
+
+trace:
+	/* taken from glibc */
+	subq $0x38, %rsp
+	movq %rax, (%rsp)
+	movq %rcx, 8(%rsp)
+	movq %rdx, 16(%rsp)
+	movq %rsi, 24(%rsp)
+	movq %rdi, 32(%rsp)
+	movq %r8, 40(%rsp)
+	movq %r9, 48(%rsp)
+
+	movq 0x38(%rsp), %rdi
+	movq 8(%rbp), %rsi
+
+	call   *ftrace_trace_function
+
+	movq 48(%rsp), %r9
+	movq 40(%rsp), %r8
+	movq 32(%rsp), %rdi
+	movq 24(%rsp), %rsi
+	movq 16(%rsp), %rdx
+	movq 8(%rsp), %rcx
+	movq (%rsp), %rax
+	addq $0x38, %rsp
+
+	jmp ftrace_stub
+END(mcount)
+#endif
+
 #ifndef CONFIG_PREEMPT
 #define retint_kernel retint_restore_args
 #endif	
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
new file mode 100644
index 000000000000..b96ef14c249a
--- /dev/null
+++ b/include/linux/ftrace.h
@@ -0,0 +1,38 @@
+#ifndef _LINUX_FTRACE_H
+#define _LINUX_FTRACE_H
+
+#ifdef CONFIG_FTRACE
+
+#include <linux/linkage.h>
+
+#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+#define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1))
+#define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2))
+
+typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip);
+
+struct ftrace_ops {
+	ftrace_func_t	  func;
+	struct ftrace_ops *next;
+};
+
+/*
+ * The ftrace_ops must be a static and should also
+ * be read_mostly.  These functions do modify read_mostly variables
+ * so use them sparely. Never free an ftrace_op or modify the
+ * next pointer after it has been registered. Even after unregistering
+ * it, the next pointer may still be used internally.
+ */
+int register_ftrace_function(struct ftrace_ops *ops);
+int unregister_ftrace_function(struct ftrace_ops *ops);
+void clear_ftrace_function(void);
+
+extern void ftrace_stub(unsigned long a0, unsigned long a1);
+extern void mcount(void);
+
+#else /* !CONFIG_FTRACE */
+# define register_ftrace_function(ops) do { } while (0)
+# define unregister_ftrace_function(ops) do { } while (0)
+# define clear_ftrace_function(ops) do { } while (0)
+#endif /* CONFIG_FTRACE */
+#endif /* _LINUX_FTRACE_H */
diff --git a/kernel/Makefile b/kernel/Makefile
index 1c9938addb9d..fa05f6d8bdbf 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_FTRACE) += trace/
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
new file mode 100644
index 000000000000..8185c91417bc
--- /dev/null
+++ b/kernel/trace/Kconfig
@@ -0,0 +1,5 @@
+#
+# Architectures that offer an FTRACE implementation should select HAVE_FTRACE:
+#
+config HAVE_FTRACE
+	bool
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
new file mode 100644
index 000000000000..bf4fd215a6a9
--- /dev/null
+++ b/kernel/trace/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_FTRACE) += libftrace.o
+
+libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
new file mode 100644
index 000000000000..b6a80b98a3fb
--- /dev/null
+++ b/kernel/trace/ftrace.c
@@ -0,0 +1,138 @@
+/*
+ * Infrastructure for profiling code inserted by 'gcc -pg'.
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2004-2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Originally ported from the -rt patch by:
+ *   Copyright (C) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Based on code in the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+
+#include <linux/module.h>
+#include <linux/ftrace.h>
+
+static DEFINE_SPINLOCK(ftrace_func_lock);
+static struct ftrace_ops ftrace_list_end __read_mostly =
+{
+	.func = ftrace_stub,
+};
+
+static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
+ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
+
+/* mcount is defined per arch in assembly */
+EXPORT_SYMBOL(mcount);
+
+notrace void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
+{
+	struct ftrace_ops *op = ftrace_list;
+
+	/* in case someone actually ports this to alpha! */
+	read_barrier_depends();
+
+	while (op != &ftrace_list_end) {
+		/* silly alpha */
+		read_barrier_depends();
+		op->func(ip, parent_ip);
+		op = op->next;
+	};
+}
+
+/**
+ * register_ftrace_function - register a function for profiling
+ * @ops - ops structure that holds the function for profiling.
+ *
+ * Register a function to be called by all functions in the
+ * kernel.
+ *
+ * Note: @ops->func and all the functions it calls must be labeled
+ *       with "notrace", otherwise it will go into a
+ *       recursive loop.
+ */
+int register_ftrace_function(struct ftrace_ops *ops)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ftrace_func_lock, flags);
+	ops->next = ftrace_list;
+	/*
+	 * We are entering ops into the ftrace_list but another
+	 * CPU might be walking that list. We need to make sure
+	 * the ops->next pointer is valid before another CPU sees
+	 * the ops pointer included into the ftrace_list.
+	 */
+	smp_wmb();
+	ftrace_list = ops;
+	/*
+	 * For one func, simply call it directly.
+	 * For more than one func, call the chain.
+	 */
+	if (ops->next == &ftrace_list_end)
+		ftrace_trace_function = ops->func;
+	else
+		ftrace_trace_function = ftrace_list_func;
+	spin_unlock_irqrestore(&ftrace_func_lock, flags);
+
+	return 0;
+}
+
+/**
+ * unregister_ftrace_function - unresgister a function for profiling.
+ * @ops - ops structure that holds the function to unregister
+ *
+ * Unregister a function that was added to be called by ftrace profiling.
+ */
+int unregister_ftrace_function(struct ftrace_ops *ops)
+{
+	unsigned long flags;
+	struct ftrace_ops **p;
+	int ret = 0;
+
+	spin_lock_irqsave(&ftrace_func_lock, flags);
+
+	/*
+	 * If we are the only function, then the ftrace pointer is
+	 * pointing directly to that function.
+	 */
+	if (ftrace_list == ops && ops->next == &ftrace_list_end) {
+		ftrace_trace_function = ftrace_stub;
+		ftrace_list = &ftrace_list_end;
+		goto out;
+	}
+
+	for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next)
+		if (*p == ops)
+			break;
+
+	if (*p != ops) {
+		ret = -1;
+		goto out;
+	}
+
+	*p = (*p)->next;
+
+	/* If we only have one func left, then call that directly */
+	if (ftrace_list->next == &ftrace_list_end)
+		ftrace_trace_function = ftrace_list->func;
+
+ out:
+	spin_unlock_irqrestore(&ftrace_func_lock, flags);
+
+	return 0;
+}
+
+/**
+ * clear_ftrace_function - reset the ftrace function
+ *
+ * This NULLs the ftrace function and in essence stops
+ * tracing.  There may be lag
+ */
+void clear_ftrace_function(void)
+{
+	ftrace_trace_function = ftrace_stub;
+}
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d2099f41aa1e..d8b6279a9b42 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -634,6 +634,8 @@ config LATENCYTOP
 	  Enable this option if you want to use the LatencyTOP tool
 	  to find out which userspace is blocking on what kernel operations.
 
+source kernel/trace/Kconfig
+
 config PROVIDE_OHCI1394_DMA_INIT
 	bool "Remote debugging over FireWire early on boot"
 	depends on PCI && X86
-- 
cgit v1.2.3


From 81d68a96a39844853b37f20cc8282d9b65b78ef3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:42 +0200
Subject: ftrace: trace irq disabled critical timings

This patch adds latency tracing for critical timings
(how long interrupts are disabled for).

 "irqsoff" is added to /debugfs/tracing/available_tracers

Note:
  tracing_max_latency
    also holds the max latency for irqsoff (in usecs).
   (default to large number so one must start latency tracing)

  tracing_thresh
    threshold (in usecs) to always print out if irqs off
    is detected to be longer than stated here.
    If irq_thresh is non-zero, then max_irq_latency
    is ignored.

Here's an example of a trace with ftrace_enabled = 0

=======
preemption latency trace v1.1.5 on 2.6.24-rc7
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------
 latency: 100 us, #3/3, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2)
    -----------------
    | task: swapper-0 (uid:0 nice:0 policy:0 rt_prio:0)
    -----------------
 => started at: _spin_lock_irqsave+0x2a/0xb7
 => ended at:   _spin_unlock_irqrestore+0x32/0x5f

                 _------=> CPU#
                / _-----=> irqs-off
               | / _----=> need-resched
               || / _---=> hardirq/softirq
               ||| / _--=> preempt-depth
               |||| /
               |||||     delay
   cmd     pid ||||| time  |   caller
      \   /    |||||   \   |   /
 swapper-0     1d.s3    0us+: _spin_lock_irqsave+0x2a/0xb7 (e1000_update_stats+0x47/0x64c [e1000])
 swapper-0     1d.s3  100us : _spin_unlock_irqrestore+0x32/0x5f (e1000_update_stats+0x641/0x64c [e1000])
 swapper-0     1d.s3  100us : trace_hardirqs_on_caller+0x75/0x89 (_spin_unlock_irqrestore+0x32/0x5f)

vim:ft=help
=======

And this is a trace with ftrace_enabled == 1

=======
preemption latency trace v1.1.5 on 2.6.24-rc7
--------------------------------------------------------------------
 latency: 102 us, #12/12, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2)
    -----------------
    | task: swapper-0 (uid:0 nice:0 policy:0 rt_prio:0)
    -----------------
 => started at: _spin_lock_irqsave+0x2a/0xb7
 => ended at:   _spin_unlock_irqrestore+0x32/0x5f

                 _------=> CPU#
                / _-----=> irqs-off
               | / _----=> need-resched
               || / _---=> hardirq/softirq
               ||| / _--=> preempt-depth
               |||| /
               |||||     delay
   cmd     pid ||||| time  |   caller
      \   /    |||||   \   |   /
 swapper-0     1dNs3    0us+: _spin_lock_irqsave+0x2a/0xb7 (e1000_update_stats+0x47/0x64c [e1000])
 swapper-0     1dNs3   46us : e1000_read_phy_reg+0x16/0x225 [e1000] (e1000_update_stats+0x5e2/0x64c [e1000])
 swapper-0     1dNs3   46us : e1000_swfw_sync_acquire+0x10/0x99 [e1000] (e1000_read_phy_reg+0x49/0x225 [e1000])
 swapper-0     1dNs3   46us : e1000_get_hw_eeprom_semaphore+0x12/0xa6 [e1000] (e1000_swfw_sync_acquire+0x36/0x99 [e1000])
 swapper-0     1dNs3   47us : __const_udelay+0x9/0x47 (e1000_read_phy_reg+0x116/0x225 [e1000])
 swapper-0     1dNs3   47us+: __delay+0x9/0x50 (__const_udelay+0x45/0x47)
 swapper-0     1dNs3   97us : preempt_schedule+0xc/0x84 (__delay+0x4e/0x50)
 swapper-0     1dNs3   98us : e1000_swfw_sync_release+0xc/0x55 [e1000] (e1000_read_phy_reg+0x211/0x225 [e1000])
 swapper-0     1dNs3   99us+: e1000_put_hw_eeprom_semaphore+0x9/0x35 [e1000] (e1000_swfw_sync_release+0x50/0x55 [e1000])
 swapper-0     1dNs3  101us : _spin_unlock_irqrestore+0xe/0x5f (e1000_update_stats+0x641/0x64c [e1000])
 swapper-0     1dNs3  102us : _spin_unlock_irqrestore+0x32/0x5f (e1000_update_stats+0x641/0x64c [e1000])
 swapper-0     1dNs3  102us : trace_hardirqs_on_caller+0x75/0x89 (_spin_unlock_irqrestore+0x32/0x5f)

vim:ft=help
=======

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/process_64.c |   3 +
 arch/x86/lib/Makefile        |   1 +
 arch/x86/lib/thunk_32.S      |  47 +++++
 arch/x86/lib/thunk_64.S      |  19 +-
 include/asm-x86/irqflags.h   |  24 +--
 include/linux/ftrace.h       |   8 +
 include/linux/irqflags.h     |  12 +-
 kernel/fork.c                |   2 +-
 kernel/lockdep.c             |  23 ++-
 kernel/printk.c              |   2 +
 kernel/trace/Kconfig         |  18 ++
 kernel/trace/Makefile        |   1 +
 kernel/trace/trace_irqsoff.c | 402 +++++++++++++++++++++++++++++++++++++++++++
 13 files changed, 531 insertions(+), 31 deletions(-)
 create mode 100644 arch/x86/lib/thunk_32.S
 create mode 100644 kernel/trace/trace_irqsoff.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index e2319f39988b..dd349c92f051 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -165,7 +165,10 @@ void cpu_idle(void)
 			 */
 			local_irq_disable();
 			enter_idle();
+			/* Don't trace irqs off for idle */
+			stop_critical_timings();
 			idle();
+			start_critical_timings();
 			/* In many cases the interrupt that ended idle
 			   has already called exit_idle. But some idle
 			   loops can be woken up without interrupt. */
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 76f60f52a885..84aa2883fe15 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -5,6 +5,7 @@
 obj-$(CONFIG_SMP) := msr-on-cpu.o
 
 lib-y := delay_$(BITS).o
+lib-y += thunk_$(BITS).o
 lib-y += usercopy_$(BITS).o getuser_$(BITS).o putuser_$(BITS).o
 lib-y += memcpy_$(BITS).o
 
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
new file mode 100644
index 000000000000..650b11e00ecc
--- /dev/null
+++ b/arch/x86/lib/thunk_32.S
@@ -0,0 +1,47 @@
+/*
+ * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash)
+ * Copyright 2008 by Steven Rostedt, Red Hat, Inc
+ *  (inspired by Andi Kleen's thunk_64.S)
+ * Subject to the GNU public license, v.2. No warranty of any kind.
+ */
+
+	#include <linux/linkage.h>
+
+#define ARCH_TRACE_IRQS_ON			\
+	pushl %eax;				\
+	pushl %ecx;				\
+	pushl %edx;				\
+	call trace_hardirqs_on;			\
+	popl %edx;				\
+	popl %ecx;				\
+	popl %eax;
+
+#define ARCH_TRACE_IRQS_OFF			\
+	pushl %eax;				\
+	pushl %ecx;				\
+	pushl %edx;				\
+	call trace_hardirqs_off;		\
+	popl %edx;				\
+	popl %ecx;				\
+	popl %eax;
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+	/* put return address in eax (arg1) */
+	.macro thunk_ra name,func
+	.globl \name
+\name:
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+	/* Place EIP in the arg1 */
+	movl 3*4(%esp), %eax
+	call \func
+	popl %edx
+	popl %ecx
+	popl %eax
+	ret
+	.endm
+
+	thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
+	thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
+#endif
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index e009251d4e9f..bf9a7d5a5428 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -2,6 +2,7 @@
  * Save registers before calling assembly functions. This avoids
  * disturbance of register allocation in some inline assembly constructs.
  * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
+ * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc.
  * Subject to the GNU public license, v.2. No warranty of any kind.
  */
 
@@ -42,8 +43,22 @@
 #endif	
 	
 #ifdef CONFIG_TRACE_IRQFLAGS
-	thunk trace_hardirqs_on_thunk,trace_hardirqs_on
-	thunk trace_hardirqs_off_thunk,trace_hardirqs_off
+	/* put return address in rdi (arg1) */
+	.macro thunk_ra name,func
+	.globl \name
+\name:
+	CFI_STARTPROC
+	SAVE_ARGS
+	/* SAVE_ARGS pushs 9 elements */
+	/* the next element would be the rip */
+	movq 9*8(%rsp), %rdi
+	call \func
+	jmp  restore
+	CFI_ENDPROC
+	.endm
+
+	thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
+	thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
 #endif
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/include/asm-x86/irqflags.h b/include/asm-x86/irqflags.h
index c242527f970e..24d71b1eb189 100644
--- a/include/asm-x86/irqflags.h
+++ b/include/asm-x86/irqflags.h
@@ -179,8 +179,6 @@ static inline void trace_hardirqs_fixup(void)
  * have a reliable stack. x86_64 only.
  */
 #define SWAPGS_UNSAFE_STACK	swapgs
-#define ARCH_TRACE_IRQS_ON		call trace_hardirqs_on_thunk
-#define ARCH_TRACE_IRQS_OFF		call trace_hardirqs_off_thunk
 #define ARCH_LOCKDEP_SYS_EXIT		call lockdep_sys_exit_thunk
 #define ARCH_LOCKDEP_SYS_EXIT_IRQ	\
 	TRACE_IRQS_ON; \
@@ -192,24 +190,6 @@ static inline void trace_hardirqs_fixup(void)
 	TRACE_IRQS_OFF;
 
 #else
-#define ARCH_TRACE_IRQS_ON			\
-	pushl %eax;				\
-	pushl %ecx;				\
-	pushl %edx;				\
-	call trace_hardirqs_on;			\
-	popl %edx;				\
-	popl %ecx;				\
-	popl %eax;
-
-#define ARCH_TRACE_IRQS_OFF			\
-	pushl %eax;				\
-	pushl %ecx;				\
-	pushl %edx;				\
-	call trace_hardirqs_off;		\
-	popl %edx;				\
-	popl %ecx;				\
-	popl %eax;
-
 #define ARCH_LOCKDEP_SYS_EXIT			\
 	pushl %eax;				\
 	pushl %ecx;				\
@@ -223,8 +203,8 @@ static inline void trace_hardirqs_fixup(void)
 #endif
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-#  define TRACE_IRQS_ON		ARCH_TRACE_IRQS_ON
-#  define TRACE_IRQS_OFF	ARCH_TRACE_IRQS_OFF
+#  define TRACE_IRQS_ON		call trace_hardirqs_on_thunk;
+#  define TRACE_IRQS_OFF	call trace_hardirqs_off_thunk;
 #else
 #  define TRACE_IRQS_ON
 #  define TRACE_IRQS_OFF
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index db8a5e7abe41..0a20445dcbcc 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -50,4 +50,12 @@ extern void mcount(void);
 # define CALLER_ADDR5 0UL
 #endif
 
+#ifdef CONFIG_IRQSOFF_TRACER
+  extern void notrace time_hardirqs_on(unsigned long a0, unsigned long a1);
+  extern void notrace time_hardirqs_off(unsigned long a0, unsigned long a1);
+#else
+# define time_hardirqs_on(a0, a1)		do { } while (0)
+# define time_hardirqs_off(a0, a1)		do { } while (0)
+#endif
+
 #endif /* _LINUX_FTRACE_H */
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index e600c4e9b8c5..5b711d4e9fd9 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -12,10 +12,10 @@
 #define _LINUX_TRACE_IRQFLAGS_H
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-  extern void trace_hardirqs_on(void);
-  extern void trace_hardirqs_off(void);
   extern void trace_softirqs_on(unsigned long ip);
   extern void trace_softirqs_off(unsigned long ip);
+  extern void trace_hardirqs_on(void);
+  extern void trace_hardirqs_off(void);
 # define trace_hardirq_context(p)	((p)->hardirq_context)
 # define trace_softirq_context(p)	((p)->softirq_context)
 # define trace_hardirqs_enabled(p)	((p)->hardirqs_enabled)
@@ -41,6 +41,14 @@
 # define INIT_TRACE_IRQFLAGS
 #endif
 
+#ifdef CONFIG_IRQSOFF_TRACER
+ extern void stop_critical_timings(void);
+ extern void start_critical_timings(void);
+#else
+# define stop_critical_timings() do { } while (0)
+# define start_critical_timings() do { } while (0)
+#endif
+
 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
 
 #include <asm/irqflags.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 19908b26cf80..d66d676dc362 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -909,7 +909,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	rt_mutex_init_task(p);
 
-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_LOCKDEP)
 	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
 	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 81a4e4a3f087..e21924365ea3 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -39,6 +39,7 @@
 #include <linux/irqflags.h>
 #include <linux/utsname.h>
 #include <linux/hash.h>
+#include <linux/ftrace.h>
 
 #include <asm/sections.h>
 
@@ -982,7 +983,7 @@ check_noncircular(struct lock_class *source, unsigned int depth)
 	return 1;
 }
 
-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
 /*
  * Forwards and backwards subgraph searching, for the purposes of
  * proving that two subgraphs can be connected by a new dependency
@@ -1680,7 +1681,7 @@ valid_state(struct task_struct *curr, struct held_lock *this,
 static int mark_lock(struct task_struct *curr, struct held_lock *this,
 		     enum lock_usage_bit new_bit);
 
-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
 
 /*
  * print irq inversion bug:
@@ -2013,11 +2014,13 @@ void early_boot_irqs_on(void)
 /*
  * Hardirqs will be enabled:
  */
-void trace_hardirqs_on(void)
+void notrace trace_hardirqs_on_caller(unsigned long a0)
 {
 	struct task_struct *curr = current;
 	unsigned long ip;
 
+	time_hardirqs_on(CALLER_ADDR0, a0);
+
 	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
 
@@ -2055,16 +2058,23 @@ void trace_hardirqs_on(void)
 	curr->hardirq_enable_event = ++curr->irq_events;
 	debug_atomic_inc(&hardirqs_on_events);
 }
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
 
+void notrace trace_hardirqs_on(void)
+{
+	trace_hardirqs_on_caller(CALLER_ADDR0);
+}
 EXPORT_SYMBOL(trace_hardirqs_on);
 
 /*
  * Hardirqs were disabled:
  */
-void trace_hardirqs_off(void)
+void notrace trace_hardirqs_off_caller(unsigned long a0)
 {
 	struct task_struct *curr = current;
 
+	time_hardirqs_off(CALLER_ADDR0, a0);
+
 	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
 
@@ -2082,7 +2092,12 @@ void trace_hardirqs_off(void)
 	} else
 		debug_atomic_inc(&redundant_hardirqs_off);
 }
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
 
+void notrace trace_hardirqs_off(void)
+{
+	trace_hardirqs_off_caller(CALLER_ADDR0);
+}
 EXPORT_SYMBOL(trace_hardirqs_off);
 
 /*
diff --git a/kernel/printk.c b/kernel/printk.c
index 8fb01c32aa3b..ae7d5b9e535d 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1041,7 +1041,9 @@ void release_console_sem(void)
 		_log_end = log_end;
 		con_start = log_end;		/* Flush */
 		spin_unlock(&logbuf_lock);
+		stop_critical_timings();	/* don't trace print latency */
 		call_console_drivers(_con_start, _log_end);
+		start_critical_timings();
 		local_irq_restore(flags);
 	}
 	console_locked = 0;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 892ecc94a82b..896df1cf6adc 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -26,6 +26,24 @@ config FTRACE
 	  (the bootup default), then the overhead of the instructions is very
 	  small and not measurable even in micro-benchmarks.
 
+config IRQSOFF_TRACER
+	bool "Interrupts-off Latency Tracer"
+	default n
+	depends on TRACE_IRQFLAGS_SUPPORT
+	depends on GENERIC_TIME
+	select TRACE_IRQFLAGS
+	select TRACING
+	select TRACER_MAX_TRACE
+	help
+	  This option measures the time spent in irqs-off critical
+	  sections, with microsecond accuracy.
+
+	  The default measurement method is a maximum search, which is
+	  disabled by default and can be runtime (re-)started
+	  via:
+
+	      echo 0 > /debugfs/tracing/tracing_max_latency
+
 config SCHED_TRACER
 	bool "Scheduling Latency Tracer"
 	depends on DEBUG_KERNEL
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 5508cdb19aea..46be8647fb65 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -3,6 +3,7 @@ obj-$(CONFIG_FTRACE) += libftrace.o
 obj-$(CONFIG_TRACING) += trace.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_FTRACE) += trace_functions.o
+obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
new file mode 100644
index 000000000000..a9131b0cf1a5
--- /dev/null
+++ b/kernel/trace/trace_irqsoff.c
@@ -0,0 +1,402 @@
+/*
+ * trace irqs off criticall timings
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * From code in the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/kallsyms.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+static struct trace_array		*irqsoff_trace __read_mostly;
+static int				tracer_enabled __read_mostly;
+
+/*
+ * Sequence count - we record it when starting a measurement and
+ * skip the latency if the sequence has changed - some other section
+ * did a maximum and could disturb our measurement with serial console
+ * printouts, etc. Truly coinciding maximum latencies should be rare
+ * and what happens together happens separately as well, so this doesnt
+ * decrease the validity of the maximum found:
+ */
+static __cacheline_aligned_in_smp	unsigned long max_sequence;
+
+#ifdef CONFIG_FTRACE
+/*
+ * irqsoff uses its own tracer function to keep the overhead down:
+ */
+static void notrace
+irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	if (likely(!tracer_enabled))
+		return;
+
+	local_save_flags(flags);
+
+	if (!irqs_disabled_flags(flags))
+		return;
+
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		ftrace(tr, data, ip, parent_ip, flags);
+
+	atomic_dec(&data->disabled);
+}
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+	.func = irqsoff_tracer_call,
+};
+#endif /* CONFIG_FTRACE */
+
+/*
+ * Should this new latency be reported/recorded?
+ */
+static int notrace report_latency(cycle_t delta)
+{
+	if (tracing_thresh) {
+		if (delta < tracing_thresh)
+			return 0;
+	} else {
+		if (delta <= tracing_max_latency)
+			return 0;
+	}
+	return 1;
+}
+
+static void notrace
+check_critical_timing(struct trace_array *tr,
+		      struct trace_array_cpu *data,
+		      unsigned long parent_ip,
+		      int cpu)
+{
+	unsigned long latency, t0, t1;
+	cycle_t T0, T1, T2, delta;
+	unsigned long flags;
+
+	/*
+	 * usecs conversion is slow so we try to delay the conversion
+	 * as long as possible:
+	 */
+	T0 = data->preempt_timestamp;
+	T1 = now(cpu);
+	delta = T1-T0;
+
+	local_save_flags(flags);
+
+	if (!report_latency(delta))
+		goto out;
+
+	ftrace(tr, data, CALLER_ADDR0, parent_ip, flags);
+	/*
+	 * Update the timestamp, because the trace entry above
+	 * might change it (it can only get larger so the latency
+	 * is fair to be reported):
+	 */
+	T2 = now(cpu);
+
+	delta = T2-T0;
+
+	latency = nsecs_to_usecs(delta);
+
+	if (data->critical_sequence != max_sequence)
+		goto out;
+
+	tracing_max_latency = delta;
+	t0 = nsecs_to_usecs(T0);
+	t1 = nsecs_to_usecs(T1);
+
+	data->critical_end = parent_ip;
+
+	update_max_tr_single(tr, current, cpu);
+
+	if (tracing_thresh)
+		printk(KERN_INFO "(%16s-%-5d|#%d): %lu us critical section "
+		       "violates %lu us threshold.\n"
+		       " => started at timestamp %lu: ",
+				current->comm, current->pid,
+				raw_smp_processor_id(),
+				latency, nsecs_to_usecs(tracing_thresh), t0);
+	else
+		printk(KERN_INFO "(%16s-%-5d|#%d):"
+		       " new %lu us maximum-latency "
+		       "critical section.\n => started at timestamp %lu: ",
+				current->comm, current->pid,
+				raw_smp_processor_id(),
+				latency, t0);
+
+	print_symbol(KERN_CONT "<%s>\n", data->critical_start);
+	printk(KERN_CONT " =>   ended at timestamp %lu: ", t1);
+	print_symbol(KERN_CONT "<%s>\n", data->critical_end);
+	dump_stack();
+	t1 = nsecs_to_usecs(now(cpu));
+	printk(KERN_CONT " =>   dump-end timestamp %lu\n\n", t1);
+
+	max_sequence++;
+
+out:
+	data->critical_sequence = max_sequence;
+	data->preempt_timestamp = now(cpu);
+	tracing_reset(data);
+	ftrace(tr, data, CALLER_ADDR0, parent_ip, flags);
+}
+
+static inline void notrace
+start_critical_timing(unsigned long ip, unsigned long parent_ip)
+{
+	int cpu;
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+
+	if (likely(!tracer_enabled))
+		return;
+
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+
+	if (unlikely(!data) || unlikely(!data->trace) ||
+	    data->critical_start || atomic_read(&data->disabled))
+		return;
+
+	atomic_inc(&data->disabled);
+
+	data->critical_sequence = max_sequence;
+	data->preempt_timestamp = now(cpu);
+	data->critical_start = parent_ip;
+	tracing_reset(data);
+
+	local_save_flags(flags);
+	ftrace(tr, data, ip, parent_ip, flags);
+
+	atomic_dec(&data->disabled);
+}
+
+static inline void notrace
+stop_critical_timing(unsigned long ip, unsigned long parent_ip)
+{
+	int cpu;
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+
+	if (likely(!tracer_enabled))
+		return;
+
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+
+	if (unlikely(!data) || unlikely(!data->trace) ||
+	    !data->critical_start || atomic_read(&data->disabled))
+		return;
+
+	atomic_inc(&data->disabled);
+	local_save_flags(flags);
+	ftrace(tr, data, ip, parent_ip, flags);
+	check_critical_timing(tr, data, parent_ip, cpu);
+	data->critical_start = 0;
+	atomic_dec(&data->disabled);
+}
+
+void notrace start_critical_timings(void)
+{
+	unsigned long flags;
+
+	local_save_flags(flags);
+
+	if (irqs_disabled_flags(flags))
+		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+
+void notrace stop_critical_timings(void)
+{
+	unsigned long flags;
+
+	local_save_flags(flags);
+
+	if (irqs_disabled_flags(flags))
+		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+
+#ifdef CONFIG_PROVE_LOCKING
+void notrace time_hardirqs_on(unsigned long a0, unsigned long a1)
+{
+	unsigned long flags;
+
+	local_save_flags(flags);
+
+	if (irqs_disabled_flags(flags))
+		stop_critical_timing(a0, a1);
+}
+
+void notrace time_hardirqs_off(unsigned long a0, unsigned long a1)
+{
+	unsigned long flags;
+
+	local_save_flags(flags);
+
+	if (irqs_disabled_flags(flags))
+		start_critical_timing(a0, a1);
+}
+
+#else /* !CONFIG_PROVE_LOCKING */
+
+/*
+ * Stubs:
+ */
+
+void early_boot_irqs_off(void)
+{
+}
+
+void early_boot_irqs_on(void)
+{
+}
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+
+/*
+ * We are only interested in hardirq on/off events:
+ */
+void notrace trace_hardirqs_on(void)
+{
+	unsigned long flags;
+
+	local_save_flags(flags);
+
+	if (irqs_disabled_flags(flags))
+		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void notrace trace_hardirqs_off(void)
+{
+	unsigned long flags;
+
+	local_save_flags(flags);
+
+	if (irqs_disabled_flags(flags))
+		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+void notrace trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+	unsigned long flags;
+
+	local_save_flags(flags);
+
+	if (irqs_disabled_flags(flags))
+		stop_critical_timing(CALLER_ADDR0, caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+void notrace trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+	unsigned long flags;
+
+	local_save_flags(flags);
+
+	if (irqs_disabled_flags(flags))
+		start_critical_timing(CALLER_ADDR0, caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+#endif /* CONFIG_PROVE_LOCKING */
+
+static void start_irqsoff_tracer(struct trace_array *tr)
+{
+	tracer_enabled = 1;
+	register_ftrace_function(&trace_ops);
+}
+
+static void stop_irqsoff_tracer(struct trace_array *tr)
+{
+	unregister_ftrace_function(&trace_ops);
+	tracer_enabled = 0;
+}
+
+static void irqsoff_tracer_init(struct trace_array *tr)
+{
+	irqsoff_trace = tr;
+	/* make sure that the tracer is visibel */
+	smp_wmb();
+
+	if (tr->ctrl)
+		start_irqsoff_tracer(tr);
+}
+
+static void irqsoff_tracer_reset(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		stop_irqsoff_tracer(tr);
+}
+
+static void irqsoff_tracer_ctrl_update(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		start_irqsoff_tracer(tr);
+	else
+		stop_irqsoff_tracer(tr);
+}
+
+static void notrace irqsoff_tracer_open(struct trace_iterator *iter)
+{
+	/* stop the trace while dumping */
+	if (iter->tr->ctrl)
+		stop_irqsoff_tracer(iter->tr);
+}
+
+static void notrace irqsoff_tracer_close(struct trace_iterator *iter)
+{
+	if (iter->tr->ctrl)
+		start_irqsoff_tracer(iter->tr);
+}
+
+static struct tracer irqsoff_tracer __read_mostly =
+{
+	.name		= "irqsoff",
+	.init		= irqsoff_tracer_init,
+	.reset		= irqsoff_tracer_reset,
+	.open		= irqsoff_tracer_open,
+	.close		= irqsoff_tracer_close,
+	.ctrl_update	= irqsoff_tracer_ctrl_update,
+	.print_max	= 1,
+};
+
+__init static int init_irqsoff_tracer(void)
+{
+	register_tracer(&irqsoff_tracer);
+
+	return 0;
+}
+device_initcall(init_irqsoff_tracer);
-- 
cgit v1.2.3


From 6cd8a4bb2f97527a9ceb30bc77ea4e959c6a95e3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:42 +0200
Subject: ftrace: trace preempt off critical timings

Add preempt off timings. A lot of kernel core code is taken from the RT patch
latency trace that was written by Ingo Molnar.

This adds "preemptoff" and "preemptirqsoff" to /debugfs/tracing/available_tracers

Now instead of just tracing irqs off, preemption off can be selected
to be recorded.

When this is selected, it shares the same files as irqs off timings.
One can either trace preemption off, irqs off, or one or the other off.

By echoing "preemptoff" into /debugfs/tracing/current_tracer, recording
of preempt off only is performed. "irqsoff" will only record the time
irqs are disabled, but "preemptirqsoff" will take the total time irqs
or preemption are disabled. Runtime switching of these options is now
supported by simpling echoing in the appropriate trace name into
/debugfs/tracing/current_tracer.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/process_32.c |   3 +
 include/linux/ftrace.h       |   8 ++
 include/linux/irqflags.h     |   3 +-
 include/linux/preempt.h      |   2 +-
 kernel/sched.c               |  24 +++++-
 kernel/trace/Kconfig         |  25 ++++++
 kernel/trace/Makefile        |   1 +
 kernel/trace/trace_irqsoff.c | 184 +++++++++++++++++++++++++++++++------------
 8 files changed, 197 insertions(+), 53 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f8476dfbb60d..a30aa1f2607a 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -185,7 +185,10 @@ void cpu_idle(void)
 
 			local_irq_disable();
 			__get_cpu_var(irq_stat).idle_timestamp = jiffies;
+			/* Don't trace irqs off for idle */
+			stop_critical_timings();
 			idle();
+			start_critical_timings();
 		}
 		tick_nohz_restart_sched_tick();
 		preempt_enable_no_resched();
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 0a20445dcbcc..740c97dcf9cb 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -58,4 +58,12 @@ extern void mcount(void);
 # define time_hardirqs_off(a0, a1)		do { } while (0)
 #endif
 
+#ifdef CONFIG_PREEMPT_TRACER
+  extern void notrace trace_preempt_on(unsigned long a0, unsigned long a1);
+  extern void notrace trace_preempt_off(unsigned long a0, unsigned long a1);
+#else
+# define trace_preempt_on(a0, a1)		do { } while (0)
+# define trace_preempt_off(a0, a1)		do { } while (0)
+#endif
+
 #endif /* _LINUX_FTRACE_H */
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 5b711d4e9fd9..2b1c2e58566e 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -41,7 +41,8 @@
 # define INIT_TRACE_IRQFLAGS
 #endif
 
-#ifdef CONFIG_IRQSOFF_TRACER
+#if defined(CONFIG_IRQSOFF_TRACER) || \
+	defined(CONFIG_PREEMPT_TRACER)
  extern void stop_critical_timings(void);
  extern void start_critical_timings(void);
 #else
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 36b03d50bf40..72b1a10a59b6 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -10,7 +10,7 @@
 #include <linux/linkage.h>
 #include <linux/list.h>
 
-#ifdef CONFIG_DEBUG_PREEMPT
+#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
   extern void add_preempt_count(int val);
   extern void sub_preempt_count(int val);
 #else
diff --git a/kernel/sched.c b/kernel/sched.c
index 73e600852365..328494e28df2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -70,6 +70,7 @@
 #include <linux/bootmem.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
+#include <linux/ftrace.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
@@ -4365,26 +4366,44 @@ void scheduler_tick(void)
 #endif
 }
 
-#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+				defined(CONFIG_PREEMPT_TRACER))
+
+static inline unsigned long get_parent_ip(unsigned long addr)
+{
+	if (in_lock_functions(addr)) {
+		addr = CALLER_ADDR2;
+		if (in_lock_functions(addr))
+			addr = CALLER_ADDR3;
+	}
+	return addr;
+}
 
 void __kprobes add_preempt_count(int val)
 {
+#ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Underflow?
 	 */
 	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
 		return;
+#endif
 	preempt_count() += val;
+#ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Spinlock count overflowing soon?
 	 */
 	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
 				PREEMPT_MASK - 10);
+#endif
+	if (preempt_count() == val)
+		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 }
 EXPORT_SYMBOL(add_preempt_count);
 
 void __kprobes sub_preempt_count(int val)
 {
+#ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Underflow?
 	 */
@@ -4396,7 +4415,10 @@ void __kprobes sub_preempt_count(int val)
 	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
 			!(preempt_count() & PREEMPT_MASK)))
 		return;
+#endif
 
+	if (preempt_count() == val)
+		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 	preempt_count() -= val;
 }
 EXPORT_SYMBOL(sub_preempt_count);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 896df1cf6adc..6430016b98e8 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -44,6 +44,31 @@ config IRQSOFF_TRACER
 
 	      echo 0 > /debugfs/tracing/tracing_max_latency
 
+	  (Note that kernel size and overhead increases with this option
+	  enabled. This option and the preempt-off timing option can be
+	  used together or separately.)
+
+config PREEMPT_TRACER
+	bool "Preemption-off Latency Tracer"
+	default n
+	depends on GENERIC_TIME
+	depends on PREEMPT
+	select TRACING
+	select TRACER_MAX_TRACE
+	help
+	  This option measures the time spent in preemption off critical
+	  sections, with microsecond accuracy.
+
+	  The default measurement method is a maximum search, which is
+	  disabled by default and can be runtime (re-)started
+	  via:
+
+	      echo 0 > /debugfs/tracing/tracing_max_latency
+
+	  (Note that kernel size and overhead increases with this option
+	  enabled. This option and the irqs-off timing option can be
+	  used together or separately.)
+
 config SCHED_TRACER
 	bool "Scheduling Latency Tracer"
 	depends on DEBUG_KERNEL
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 46be8647fb65..3fec653d6533 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_TRACING) += trace.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_FTRACE) += trace_functions.o
 obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
+obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index a9131b0cf1a5..8b1231633dc5 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -21,6 +21,36 @@
 static struct trace_array		*irqsoff_trace __read_mostly;
 static int				tracer_enabled __read_mostly;
 
+static DEFINE_PER_CPU(int, tracing_cpu);
+
+enum {
+	TRACER_IRQS_OFF		= (1 << 1),
+	TRACER_PREEMPT_OFF	= (1 << 2),
+};
+
+static int trace_type __read_mostly;
+
+#ifdef CONFIG_PREEMPT_TRACER
+static inline int notrace
+preempt_trace(void)
+{
+	return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count());
+}
+#else
+# define preempt_trace() (0)
+#endif
+
+#ifdef CONFIG_IRQSOFF_TRACER
+static inline int notrace
+irq_trace(void)
+{
+	return ((trace_type & TRACER_IRQS_OFF) &&
+		irqs_disabled());
+}
+#else
+# define irq_trace() (0)
+#endif
+
 /*
  * Sequence count - we record it when starting a measurement and
  * skip the latency if the sequence has changed - some other section
@@ -44,14 +74,11 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
 	long disabled;
 	int cpu;
 
-	if (likely(!tracer_enabled))
+	if (likely(!__get_cpu_var(tracing_cpu)))
 		return;
 
 	local_save_flags(flags);
 
-	if (!irqs_disabled_flags(flags))
-		return;
-
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 	disabled = atomic_inc_return(&data->disabled);
@@ -171,23 +198,29 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
 	if (likely(!tracer_enabled))
 		return;
 
+	if (__get_cpu_var(tracing_cpu))
+		return;
+
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 
 	if (unlikely(!data) || unlikely(!data->trace) ||
-	    data->critical_start || atomic_read(&data->disabled))
+	    atomic_read(&data->disabled))
 		return;
 
 	atomic_inc(&data->disabled);
 
 	data->critical_sequence = max_sequence;
 	data->preempt_timestamp = now(cpu);
-	data->critical_start = parent_ip;
+	data->critical_start = parent_ip ? : ip;
 	tracing_reset(data);
 
 	local_save_flags(flags);
+
 	ftrace(tr, data, ip, parent_ip, flags);
 
+	__get_cpu_var(tracing_cpu) = 1;
+
 	atomic_dec(&data->disabled);
 }
 
@@ -199,7 +232,13 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
 	struct trace_array_cpu *data;
 	unsigned long flags;
 
-	if (likely(!tracer_enabled))
+	/* Always clear the tracing cpu on stopping the trace */
+	if (unlikely(__get_cpu_var(tracing_cpu)))
+		__get_cpu_var(tracing_cpu) = 0;
+	else
+		return;
+
+	if (!tracer_enabled)
 		return;
 
 	cpu = raw_smp_processor_id();
@@ -212,49 +251,35 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
 	atomic_inc(&data->disabled);
 	local_save_flags(flags);
 	ftrace(tr, data, ip, parent_ip, flags);
-	check_critical_timing(tr, data, parent_ip, cpu);
+	check_critical_timing(tr, data, parent_ip ? : ip, cpu);
 	data->critical_start = 0;
 	atomic_dec(&data->disabled);
 }
 
+/* start and stop critical timings used to for stoppage (in idle) */
 void notrace start_critical_timings(void)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (preempt_trace() || irq_trace())
 		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
 
 void notrace stop_critical_timings(void)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (preempt_trace() || irq_trace())
 		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
 
+#ifdef CONFIG_IRQSOFF_TRACER
 #ifdef CONFIG_PROVE_LOCKING
 void notrace time_hardirqs_on(unsigned long a0, unsigned long a1)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (!preempt_trace() && irq_trace())
 		stop_critical_timing(a0, a1);
 }
 
 void notrace time_hardirqs_off(unsigned long a0, unsigned long a1)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (!preempt_trace() && irq_trace())
 		start_critical_timing(a0, a1);
 }
 
@@ -289,49 +314,46 @@ inline void print_irqtrace_events(struct task_struct *curr)
  */
 void notrace trace_hardirqs_on(void)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (!preempt_trace() && irq_trace())
 		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
 EXPORT_SYMBOL(trace_hardirqs_on);
 
 void notrace trace_hardirqs_off(void)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (!preempt_trace() && irq_trace())
 		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
 EXPORT_SYMBOL(trace_hardirqs_off);
 
 void notrace trace_hardirqs_on_caller(unsigned long caller_addr)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (!preempt_trace() && irq_trace())
 		stop_critical_timing(CALLER_ADDR0, caller_addr);
 }
 EXPORT_SYMBOL(trace_hardirqs_on_caller);
 
 void notrace trace_hardirqs_off_caller(unsigned long caller_addr)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (!preempt_trace() && irq_trace())
 		start_critical_timing(CALLER_ADDR0, caller_addr);
 }
 EXPORT_SYMBOL(trace_hardirqs_off_caller);
 
 #endif /* CONFIG_PROVE_LOCKING */
+#endif /*  CONFIG_IRQSOFF_TRACER */
+
+#ifdef CONFIG_PREEMPT_TRACER
+void notrace trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+	stop_critical_timing(a0, a1);
+}
+
+void notrace trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+	start_critical_timing(a0, a1);
+}
+#endif /* CONFIG_PREEMPT_TRACER */
 
 static void start_irqsoff_tracer(struct trace_array *tr)
 {
@@ -345,7 +367,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr)
 	tracer_enabled = 0;
 }
 
-static void irqsoff_tracer_init(struct trace_array *tr)
+static void __irqsoff_tracer_init(struct trace_array *tr)
 {
 	irqsoff_trace = tr;
 	/* make sure that the tracer is visibel */
@@ -382,6 +404,13 @@ static void notrace irqsoff_tracer_close(struct trace_iterator *iter)
 		start_irqsoff_tracer(iter->tr);
 }
 
+#ifdef CONFIG_IRQSOFF_TRACER
+static void irqsoff_tracer_init(struct trace_array *tr)
+{
+	trace_type = TRACER_IRQS_OFF;
+
+	__irqsoff_tracer_init(tr);
+}
 static struct tracer irqsoff_tracer __read_mostly =
 {
 	.name		= "irqsoff",
@@ -392,10 +421,65 @@ static struct tracer irqsoff_tracer __read_mostly =
 	.ctrl_update	= irqsoff_tracer_ctrl_update,
 	.print_max	= 1,
 };
+# define register_irqsoff(trace) register_tracer(&trace)
+#else
+# define register_irqsoff(trace) do { } while (0)
+#endif
+
+#ifdef CONFIG_PREEMPT_TRACER
+static void preemptoff_tracer_init(struct trace_array *tr)
+{
+	trace_type = TRACER_PREEMPT_OFF;
+
+	__irqsoff_tracer_init(tr);
+}
+
+static struct tracer preemptoff_tracer __read_mostly =
+{
+	.name		= "preemptoff",
+	.init		= preemptoff_tracer_init,
+	.reset		= irqsoff_tracer_reset,
+	.open		= irqsoff_tracer_open,
+	.close		= irqsoff_tracer_close,
+	.ctrl_update	= irqsoff_tracer_ctrl_update,
+	.print_max	= 1,
+};
+# define register_preemptoff(trace) register_tracer(&trace)
+#else
+# define register_preemptoff(trace) do { } while (0)
+#endif
+
+#if defined(CONFIG_IRQSOFF_TRACER) && \
+	defined(CONFIG_PREEMPT_TRACER)
+
+static void preemptirqsoff_tracer_init(struct trace_array *tr)
+{
+	trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
+
+	__irqsoff_tracer_init(tr);
+}
+
+static struct tracer preemptirqsoff_tracer __read_mostly =
+{
+	.name		= "preemptirqsoff",
+	.init		= preemptirqsoff_tracer_init,
+	.reset		= irqsoff_tracer_reset,
+	.open		= irqsoff_tracer_open,
+	.close		= irqsoff_tracer_close,
+	.ctrl_update	= irqsoff_tracer_ctrl_update,
+	.print_max	= 1,
+};
+
+# define register_preemptirqsoff(trace) register_tracer(&trace)
+#else
+# define register_preemptirqsoff(trace) do { } while (0)
+#endif
 
 __init static int init_irqsoff_tracer(void)
 {
-	register_tracer(&irqsoff_tracer);
+	register_irqsoff(irqsoff_tracer);
+	register_preemptoff(preemptoff_tracer);
+	register_preemptirqsoff(preemptirqsoff_tracer);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 3d0833953e1b98b79ddf491dd49229eef9baeac1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:42 +0200
Subject: ftrace: dynamic enabling/disabling of function calls

This patch adds a feature to dynamically replace the ftrace code
with the jmps to allow a kernel with ftrace configured to run
as fast as it can without it configured.

The way this works, is on bootup (if ftrace is enabled), a ftrace
function is registered to record the instruction pointer of all
places that call the function.

Later, if there's still any code to patch, a kthread is awoken
(rate limited to at most once a second) that performs a stop_machine,
and replaces all the code that was called with a jmp over the call
to ftrace. It only replaces what was found the previous time. Typically
the system reaches equilibrium quickly after bootup and there's no code
patching needed at all.

e.g.

  call ftrace  /* 5 bytes */

is replaced with

  jmp 3f  /* jmp is 2 bytes and we jump 3 forward */
3:

When we want to enable ftrace for function tracing, the IP recording
is removed, and stop_machine is called again to replace all the locations
of that were recorded back to the call of ftrace.  When it is disabled,
we replace the code back to the jmp.

Allocation is done by the kthread. If the ftrace recording function is
called, and we don't have any record slots available, then we simply
skip that call. Once a second a new page (if needed) is allocated for
recording new ftrace function calls.  A large batch is allocated at
boot up to get most of the calls there.

Because we do this via stop_machine, we don't have to worry about another
CPU executing a ftrace call as we modify it. But we do need to worry
about NMI's so all functions that might be called via nmi must be
annotated with notrace_nmi. When this code is configured in, the NMI code
will not call notrace.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/Makefile |   1 +
 arch/x86/kernel/ftrace.c | 237 +++++++++++++++++++++++++++++++
 include/linux/ftrace.h   |  18 +++
 kernel/trace/Kconfig     |  17 +++
 kernel/trace/ftrace.c    | 356 ++++++++++++++++++++++++++++++++++++++++++-----
 5 files changed, 597 insertions(+), 32 deletions(-)
 create mode 100644 arch/x86/kernel/ftrace.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5e618c3b4720..e142091524b0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -56,6 +56,7 @@ obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic_$(BITS).o nmi_$(BITS).o
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic_$(BITS).o
 obj-$(CONFIG_X86_REBOOTFIXUPS)	+= reboot_fixups_32.o
+obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC)		+= relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
new file mode 100644
index 000000000000..5dd58136ef02
--- /dev/null
+++ b/arch/x86/kernel/ftrace.c
@@ -0,0 +1,237 @@
+/*
+ * Code for replacing ftrace calls with jumps.
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ *
+ * Thanks goes to Ingo Molnar, for suggesting the idea.
+ * Mathieu Desnoyers, for suggesting postponing the modifications.
+ * Arjan van de Ven, for keeping me straight, and explaining to me
+ * the dangers of modifying code on the run.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include <linux/ftrace.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/list.h>
+
+#define CALL_BACK		5
+
+#define JMPFWD			0x03eb
+
+static unsigned short ftrace_jmp = JMPFWD;
+
+struct ftrace_record {
+	struct dyn_ftrace	rec;
+	int			failed;
+} __attribute__((packed));
+
+struct ftrace_page {
+	struct ftrace_page	*next;
+	int			index;
+	struct ftrace_record	records[];
+} __attribute__((packed));
+
+#define ENTRIES_PER_PAGE \
+  ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct ftrace_record))
+
+/* estimate from running different kernels */
+#define NR_TO_INIT		10000
+
+#define MCOUNT_ADDR ((long)(&mcount))
+
+union ftrace_code_union {
+	char code[5];
+	struct {
+		char e8;
+		int offset;
+	} __attribute__((packed));
+};
+
+static struct ftrace_page	*ftrace_pages_start;
+static struct ftrace_page	*ftrace_pages;
+
+notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip)
+{
+	struct ftrace_record *rec;
+	unsigned short save;
+
+	ip -= CALL_BACK;
+	save = *(short *)ip;
+
+	/* If this was already converted, skip it */
+	if (save == JMPFWD)
+		return NULL;
+
+	if (ftrace_pages->index == ENTRIES_PER_PAGE) {
+		if (!ftrace_pages->next)
+			return NULL;
+		ftrace_pages = ftrace_pages->next;
+	}
+
+	rec = &ftrace_pages->records[ftrace_pages->index++];
+
+	return &rec->rec;
+}
+
+static int notrace
+ftrace_modify_code(unsigned long ip, unsigned char *old_code,
+		   unsigned char *new_code)
+{
+	unsigned short old = *(unsigned short *)old_code;
+	unsigned short new = *(unsigned short *)new_code;
+	unsigned short replaced;
+	int faulted = 0;
+
+	/*
+	 * Note: Due to modules and __init, code can
+	 *  disappear and change, we need to protect against faulting
+	 *  as well as code changing.
+	 *
+	 * No real locking needed, this code is run through
+	 * kstop_machine.
+	 */
+	asm volatile (
+		"1: lock\n"
+		"   cmpxchg %w3, (%2)\n"
+		"2:\n"
+		".section .fixup, \"ax\"\n"
+		"	movl $1, %0\n"
+		"3:	jmp 2b\n"
+		".previous\n"
+		_ASM_EXTABLE(1b, 3b)
+		: "=r"(faulted), "=a"(replaced)
+		: "r"(ip), "r"(new), "0"(faulted), "a"(old)
+		: "memory");
+	sync_core();
+
+	if (replaced != old)
+		faulted = 2;
+
+	return faulted;
+}
+
+static int notrace ftrace_calc_offset(long ip)
+{
+	return (int)(MCOUNT_ADDR - ip);
+}
+
+notrace void ftrace_code_disable(struct dyn_ftrace *rec)
+{
+	unsigned long ip;
+	union ftrace_code_union save;
+	struct ftrace_record *r =
+		container_of(rec, struct ftrace_record, rec);
+
+	ip = rec->ip;
+
+	save.e8		= 0xe8;
+	save.offset 	= ftrace_calc_offset(ip);
+
+	/* move the IP back to the start of the call */
+	ip -= CALL_BACK;
+
+	r->failed = ftrace_modify_code(ip, save.code, (char *)&ftrace_jmp);
+}
+
+static void notrace ftrace_replace_code(int saved)
+{
+	unsigned char *new = NULL, *old = NULL;
+	struct ftrace_record *rec;
+	struct ftrace_page *pg;
+	unsigned long ip;
+	int i;
+
+	if (saved)
+		old = (char *)&ftrace_jmp;
+	else
+		new = (char *)&ftrace_jmp;
+
+	for (pg = ftrace_pages_start; pg; pg = pg->next) {
+		for (i = 0; i < pg->index; i++) {
+			union ftrace_code_union calc;
+			rec = &pg->records[i];
+
+			/* don't modify code that has already faulted */
+			if (rec->failed)
+				continue;
+
+			ip = rec->rec.ip;
+
+			calc.e8		= 0xe8;
+			calc.offset	= ftrace_calc_offset(ip);
+
+			if (saved)
+				new = calc.code;
+			else
+				old = calc.code;
+
+			ip -= CALL_BACK;
+
+			rec->failed = ftrace_modify_code(ip, old, new);
+		}
+	}
+
+}
+
+notrace void ftrace_startup_code(void)
+{
+	ftrace_replace_code(1);
+}
+
+notrace void ftrace_shutdown_code(void)
+{
+	ftrace_replace_code(0);
+}
+
+notrace void ftrace_shutdown_replenish(void)
+{
+	if (ftrace_pages->next)
+		return;
+
+	/* allocate another page */
+	ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
+}
+
+notrace int ftrace_shutdown_arch_init(void)
+{
+	struct ftrace_page *pg;
+	int cnt;
+	int i;
+
+	/* allocate a few pages */
+	ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!ftrace_pages_start)
+		return -1;
+
+	/*
+	 * Allocate a few more pages.
+	 *
+	 * TODO: have some parser search vmlinux before
+	 *   final linking to find all calls to ftrace.
+	 *   Then we can:
+	 *    a) know how many pages to allocate.
+	 *     and/or
+	 *    b) set up the table then.
+	 *
+	 *  The dynamic code is still necessary for
+	 *  modules.
+	 */
+
+	pg = ftrace_pages = ftrace_pages_start;
+
+	cnt = NR_TO_INIT / ENTRIES_PER_PAGE;
+
+	for (i = 0; i < cnt; i++) {
+		pg->next = (void *)get_zeroed_page(GFP_KERNEL);
+
+		/* If we fail, we'll try later anyway */
+		if (!pg->next)
+			break;
+
+		pg = pg->next;
+	}
+
+	return 0;
+}
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 740c97dcf9cb..90dbc0ee2046 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -32,6 +32,24 @@ extern void mcount(void);
 # define clear_ftrace_function(ops) do { } while (0)
 #endif /* CONFIG_FTRACE */
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+# define FTRACE_HASHBITS	10
+# define FTRACE_HASHSIZE	(1<<FTRACE_HASHBITS)
+
+struct dyn_ftrace {
+	struct hlist_node node;
+	unsigned long	  ip;
+};
+
+/* defined in arch */
+extern struct dyn_ftrace *
+ftrace_alloc_shutdown_node(unsigned long ip);
+extern int ftrace_shutdown_arch_init(void);
+extern void ftrace_code_disable(struct dyn_ftrace *rec);
+extern void ftrace_startup_code(void);
+extern void ftrace_shutdown_code(void);
+extern void ftrace_shutdown_replenish(void);
+#endif
 
 #ifdef CONFIG_FRAME_POINTER
 /* TODO: need to fix this for ARM */
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 6430016b98e8..cad9db1dee02 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -88,3 +88,20 @@ config CONTEXT_SWITCH_TRACER
 	  This tracer gets called from the context switch and records
 	  all switching of tasks.
 
+config DYNAMIC_FTRACE
+	bool "enable/disable ftrace tracepoints dynamically"
+	depends on FTRACE
+	default y
+	help
+         This option will modify all the calls to ftrace dynamically
+	 (will patch them out of the binary image and replaces them
+	 with a No-Op instruction) as they are called. A table is
+	 created to dynamically enable them again.
+
+	 This way a CONFIG_FTRACE kernel is slightly larger, but otherwise
+	 has native performance as long as no tracing is active.
+
+	 The changes to the code are done by a kernel thread that
+	 wakes up once a second and checks to see if any ftrace calls
+	 were made. If so, it runs stop_machine (stops all CPUS)
+	 and modifies the code to jump over the call to ftrace.
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b6a80b98a3fb..d1ae2ba25274 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -13,10 +13,19 @@
  *  Copyright (C) 2004 William Lee Irwin III
  */
 
-#include <linux/module.h>
+#include <linux/stop_machine.h>
+#include <linux/clocksource.h>
+#include <linux/kallsyms.h>
+#include <linux/kthread.h>
+#include <linux/hardirq.h>
 #include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+
+#include "trace.h"
 
-static DEFINE_SPINLOCK(ftrace_func_lock);
+static DEFINE_SPINLOCK(ftrace_lock);
 static struct ftrace_ops ftrace_list_end __read_mostly =
 {
 	.func = ftrace_stub,
@@ -44,21 +53,21 @@ notrace void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
 }
 
 /**
- * register_ftrace_function - register a function for profiling
- * @ops - ops structure that holds the function for profiling.
- *
- * Register a function to be called by all functions in the
- * kernel.
+ * clear_ftrace_function - reset the ftrace function
  *
- * Note: @ops->func and all the functions it calls must be labeled
- *       with "notrace", otherwise it will go into a
- *       recursive loop.
+ * This NULLs the ftrace function and in essence stops
+ * tracing.  There may be lag
  */
-int register_ftrace_function(struct ftrace_ops *ops)
+void clear_ftrace_function(void)
 {
-	unsigned long flags;
+	ftrace_trace_function = ftrace_stub;
+}
+
+static int notrace __register_ftrace_function(struct ftrace_ops *ops)
+{
+	/* Should never be called by interrupts */
+	spin_lock(&ftrace_lock);
 
-	spin_lock_irqsave(&ftrace_func_lock, flags);
 	ops->next = ftrace_list;
 	/*
 	 * We are entering ops into the ftrace_list but another
@@ -68,6 +77,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
 	 */
 	smp_wmb();
 	ftrace_list = ops;
+
 	/*
 	 * For one func, simply call it directly.
 	 * For more than one func, call the chain.
@@ -76,28 +86,22 @@ int register_ftrace_function(struct ftrace_ops *ops)
 		ftrace_trace_function = ops->func;
 	else
 		ftrace_trace_function = ftrace_list_func;
-	spin_unlock_irqrestore(&ftrace_func_lock, flags);
+
+	spin_unlock(&ftrace_lock);
 
 	return 0;
 }
 
-/**
- * unregister_ftrace_function - unresgister a function for profiling.
- * @ops - ops structure that holds the function to unregister
- *
- * Unregister a function that was added to be called by ftrace profiling.
- */
-int unregister_ftrace_function(struct ftrace_ops *ops)
+static int notrace __unregister_ftrace_function(struct ftrace_ops *ops)
 {
-	unsigned long flags;
 	struct ftrace_ops **p;
 	int ret = 0;
 
-	spin_lock_irqsave(&ftrace_func_lock, flags);
+	spin_lock(&ftrace_lock);
 
 	/*
-	 * If we are the only function, then the ftrace pointer is
-	 * pointing directly to that function.
+	 * If we are removing the last function, then simply point
+	 * to the ftrace_stub.
 	 */
 	if (ftrace_list == ops && ops->next == &ftrace_list_end) {
 		ftrace_trace_function = ftrace_stub;
@@ -117,22 +121,310 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
 	*p = (*p)->next;
 
 	/* If we only have one func left, then call that directly */
-	if (ftrace_list->next == &ftrace_list_end)
+	if (ftrace_list == &ftrace_list_end ||
+	    ftrace_list->next == &ftrace_list_end)
 		ftrace_trace_function = ftrace_list->func;
 
  out:
-	spin_unlock_irqrestore(&ftrace_func_lock, flags);
+	spin_unlock(&ftrace_lock);
+
+	return ret;
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+static struct hlist_head ftrace_hash[FTRACE_HASHSIZE];
+
+static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu);
+
+static DEFINE_SPINLOCK(ftrace_shutdown_lock);
+static DEFINE_MUTEX(ftraced_lock);
+
+static int ftraced_trigger;
+static int ftraced_suspend;
+
+static int ftrace_record_suspend;
+
+static inline int
+notrace ftrace_ip_in_hash(unsigned long ip, unsigned long key)
+{
+	struct dyn_ftrace *p;
+	struct hlist_node *t;
+	int found = 0;
+
+	hlist_for_each_entry(p, t, &ftrace_hash[key], node) {
+		if (p->ip == ip) {
+			found = 1;
+			break;
+		}
+	}
+
+	return found;
+}
+
+static inline void notrace
+ftrace_add_hash(struct dyn_ftrace *node, unsigned long key)
+{
+	hlist_add_head(&node->node, &ftrace_hash[key]);
+}
+
+static void notrace
+ftrace_record_ip(unsigned long ip, unsigned long parent_ip)
+{
+	struct dyn_ftrace *node;
+	unsigned long flags;
+	unsigned long key;
+	int resched;
+	int atomic;
+
+	resched = need_resched();
+	preempt_disable_notrace();
+
+	/* We simply need to protect against recursion */
+	__get_cpu_var(ftrace_shutdown_disable_cpu)++;
+	if (__get_cpu_var(ftrace_shutdown_disable_cpu) != 1)
+		goto out;
+
+	if (unlikely(ftrace_record_suspend))
+		goto out;
+
+	key = hash_long(ip, FTRACE_HASHBITS);
+
+	WARN_ON_ONCE(key >= FTRACE_HASHSIZE);
+
+	if (ftrace_ip_in_hash(ip, key))
+		goto out;
+
+	atomic = irqs_disabled();
+
+	spin_lock_irqsave(&ftrace_shutdown_lock, flags);
+
+	/* This ip may have hit the hash before the lock */
+	if (ftrace_ip_in_hash(ip, key))
+		goto out_unlock;
+
+	/*
+	 * There's a slight race that the ftraced will update the
+	 * hash and reset here. The arch alloc is responsible
+	 * for seeing if the IP has already changed, and if
+	 * it has, the alloc will fail.
+	 */
+	node = ftrace_alloc_shutdown_node(ip);
+	if (!node)
+		goto out_unlock;
+
+	node->ip = ip;
+
+	ftrace_add_hash(node, key);
+
+	ftraced_trigger = 1;
+
+ out_unlock:
+	spin_unlock_irqrestore(&ftrace_shutdown_lock, flags);
+ out:
+	__get_cpu_var(ftrace_shutdown_disable_cpu)--;
+
+	/* prevent recursion with scheduler */
+	if (resched)
+		preempt_enable_no_resched_notrace();
+	else
+		preempt_enable_notrace();
+}
+
+static struct ftrace_ops ftrace_shutdown_ops __read_mostly =
+{
+	.func = ftrace_record_ip,
+};
+
+
+static int notrace __ftrace_modify_code(void *data)
+{
+	void (*func)(void) = data;
+
+	func();
+	return 0;
+}
+
+static void notrace ftrace_run_startup_code(void)
+{
+	stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS);
+}
+
+static void notrace ftrace_run_shutdown_code(void)
+{
+	stop_machine_run(__ftrace_modify_code, ftrace_shutdown_code, NR_CPUS);
+}
+
+static void notrace ftrace_startup(void)
+{
+	mutex_lock(&ftraced_lock);
+	ftraced_suspend++;
+	if (ftraced_suspend != 1)
+		goto out;
+	__unregister_ftrace_function(&ftrace_shutdown_ops);
+
+	ftrace_run_startup_code();
+ out:
+	mutex_unlock(&ftraced_lock);
+}
+
+static void notrace ftrace_shutdown(void)
+{
+	mutex_lock(&ftraced_lock);
+	ftraced_suspend--;
+	if (ftraced_suspend)
+		goto out;
+
+	ftrace_run_shutdown_code();
+
+	__register_ftrace_function(&ftrace_shutdown_ops);
+ out:
+	mutex_unlock(&ftraced_lock);
+}
+
+static cycle_t		ftrace_update_time;
+static unsigned long	ftrace_update_cnt;
+unsigned long		ftrace_update_tot_cnt;
+
+static int notrace __ftrace_update_code(void *ignore)
+{
+	struct dyn_ftrace *p;
+	struct hlist_head head;
+	struct hlist_node *t;
+	cycle_t start, stop;
+	int i;
+
+	/* Don't be calling ftrace ops now */
+	__unregister_ftrace_function(&ftrace_shutdown_ops);
+
+	start = now(raw_smp_processor_id());
+	ftrace_update_cnt = 0;
+
+	/* No locks needed, the machine is stopped! */
+	for (i = 0; i < FTRACE_HASHSIZE; i++) {
+		if (hlist_empty(&ftrace_hash[i]))
+			continue;
+
+		head = ftrace_hash[i];
+		INIT_HLIST_HEAD(&ftrace_hash[i]);
+
+		/* all CPUS are stopped, we are safe to modify code */
+		hlist_for_each_entry(p, t, &head, node) {
+			ftrace_code_disable(p);
+			ftrace_update_cnt++;
+		}
+
+	}
+
+	stop = now(raw_smp_processor_id());
+	ftrace_update_time = stop - start;
+	ftrace_update_tot_cnt += ftrace_update_cnt;
+
+	__register_ftrace_function(&ftrace_shutdown_ops);
 
 	return 0;
 }
 
+static void notrace ftrace_update_code(void)
+{
+	stop_machine_run(__ftrace_update_code, NULL, NR_CPUS);
+}
+
+static int notrace ftraced(void *ignore)
+{
+	unsigned long usecs;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	while (!kthread_should_stop()) {
+
+		/* check once a second */
+		schedule_timeout(HZ);
+
+		mutex_lock(&ftraced_lock);
+		if (ftraced_trigger && !ftraced_suspend) {
+			ftrace_record_suspend++;
+			ftrace_update_code();
+			usecs = nsecs_to_usecs(ftrace_update_time);
+			if (ftrace_update_tot_cnt > 100000) {
+				ftrace_update_tot_cnt = 0;
+				pr_info("hm, dftrace overflow: %lu change%s"
+					 " (%lu total) in %lu usec%s\n",
+					ftrace_update_cnt,
+					ftrace_update_cnt != 1 ? "s" : "",
+					ftrace_update_tot_cnt,
+					usecs, usecs != 1 ? "s" : "");
+				WARN_ON_ONCE(1);
+			}
+			ftraced_trigger = 0;
+			ftrace_record_suspend--;
+		}
+		mutex_unlock(&ftraced_lock);
+
+		ftrace_shutdown_replenish();
+
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+static int __init notrace ftrace_shutdown_init(void)
+{
+	struct task_struct *p;
+	int ret;
+
+	ret = ftrace_shutdown_arch_init();
+	if (ret)
+		return ret;
+
+	p = kthread_run(ftraced, NULL, "ftraced");
+	if (IS_ERR(p))
+		return -1;
+
+	__register_ftrace_function(&ftrace_shutdown_ops);
+
+	return 0;
+}
+
+core_initcall(ftrace_shutdown_init);
+#else
+# define ftrace_startup()	do { } while (0)
+# define ftrace_shutdown()	do { } while (0)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
 /**
- * clear_ftrace_function - reset the ftrace function
+ * register_ftrace_function - register a function for profiling
+ * @ops - ops structure that holds the function for profiling.
  *
- * This NULLs the ftrace function and in essence stops
- * tracing.  There may be lag
+ * Register a function to be called by all functions in the
+ * kernel.
+ *
+ * Note: @ops->func and all the functions it calls must be labeled
+ *       with "notrace", otherwise it will go into a
+ *       recursive loop.
  */
-void clear_ftrace_function(void)
+int register_ftrace_function(struct ftrace_ops *ops)
 {
-	ftrace_trace_function = ftrace_stub;
+	ftrace_startup();
+
+	return __register_ftrace_function(ops);
+}
+
+/**
+ * unregister_ftrace_function - unresgister a function for profiling.
+ * @ops - ops structure that holds the function to unregister
+ *
+ * Unregister a function that was added to be called by ftrace profiling.
+ */
+int unregister_ftrace_function(struct ftrace_ops *ops)
+{
+	int ret;
+
+	ret = __unregister_ftrace_function(ops);
+
+	if (ftrace_list == &ftrace_list_end)
+		ftrace_shutdown();
+
+	return ret;
 }
-- 
cgit v1.2.3


From dfa60aba04dae7833d75b2e2be124bb7cfb8239f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:43 +0200
Subject: ftrace: use nops instead of jmp

This patch patches the call to mcount with nops instead
of a jmp over the mcount call.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/alternative.c |  4 ++--
 arch/x86/kernel/ftrace.c      | 40 ++++++++++++++++++++++++----------------
 include/asm-x86/alternative.h |  2 ++
 3 files changed, 28 insertions(+), 18 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 65c7857a90dd..de240ba2e288 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -143,7 +143,7 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = {
 #ifdef CONFIG_X86_64
 
 extern char __vsyscall_0;
-static inline const unsigned char*const * find_nop_table(void)
+const unsigned char *const *find_nop_table(void)
 {
 	return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
 	       boot_cpu_data.x86 < 6 ? k8_nops : p6_nops;
@@ -162,7 +162,7 @@ static const struct nop {
 	{ -1, NULL }
 };
 
-static const unsigned char*const * find_nop_table(void)
+const unsigned char *const *find_nop_table(void)
 {
 	const unsigned char *const *noptable = intel_nops;
 	int i;
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 5dd58136ef02..2e060c58b860 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -16,11 +16,12 @@
 #include <linux/init.h>
 #include <linux/list.h>
 
-#define CALL_BACK		5
+#include <asm/alternative.h>
 
-#define JMPFWD			0x03eb
+#define CALL_BACK		5
 
-static unsigned short ftrace_jmp = JMPFWD;
+/* Long is fine, even if it is only 4 bytes ;-) */
+static long *ftrace_nop;
 
 struct ftrace_record {
 	struct dyn_ftrace	rec;
@@ -55,13 +56,13 @@ static struct ftrace_page	*ftrace_pages;
 notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip)
 {
 	struct ftrace_record *rec;
-	unsigned short save;
+	unsigned long save;
 
 	ip -= CALL_BACK;
-	save = *(short *)ip;
+	save = *(long *)ip;
 
 	/* If this was already converted, skip it */
-	if (save == JMPFWD)
+	if (save == *ftrace_nop)
 		return NULL;
 
 	if (ftrace_pages->index == ENTRIES_PER_PAGE) {
@@ -79,9 +80,10 @@ static int notrace
 ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 		   unsigned char *new_code)
 {
-	unsigned short old = *(unsigned short *)old_code;
-	unsigned short new = *(unsigned short *)new_code;
-	unsigned short replaced;
+	unsigned replaced;
+	unsigned old = *(unsigned *)old_code; /* 4 bytes */
+	unsigned new = *(unsigned *)new_code; /* 4 bytes */
+	unsigned char newch = new_code[4];
 	int faulted = 0;
 
 	/*
@@ -94,7 +96,9 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 	 */
 	asm volatile (
 		"1: lock\n"
-		"   cmpxchg %w3, (%2)\n"
+		"   cmpxchg %3, (%2)\n"
+		"   jnz 2f\n"
+		"   movb %b4, 4(%2)\n"
 		"2:\n"
 		".section .fixup, \"ax\"\n"
 		"	movl $1, %0\n"
@@ -102,11 +106,12 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 		".previous\n"
 		_ASM_EXTABLE(1b, 3b)
 		: "=r"(faulted), "=a"(replaced)
-		: "r"(ip), "r"(new), "0"(faulted), "a"(old)
+		: "r"(ip), "r"(new), "r"(newch),
+		  "0"(faulted), "a"(old)
 		: "memory");
 	sync_core();
 
-	if (replaced != old)
+	if (replaced != old && replaced != new)
 		faulted = 2;
 
 	return faulted;
@@ -132,7 +137,7 @@ notrace void ftrace_code_disable(struct dyn_ftrace *rec)
 	/* move the IP back to the start of the call */
 	ip -= CALL_BACK;
 
-	r->failed = ftrace_modify_code(ip, save.code, (char *)&ftrace_jmp);
+	r->failed = ftrace_modify_code(ip, save.code, (char *)ftrace_nop);
 }
 
 static void notrace ftrace_replace_code(int saved)
@@ -144,9 +149,9 @@ static void notrace ftrace_replace_code(int saved)
 	int i;
 
 	if (saved)
-		old = (char *)&ftrace_jmp;
+		old = (char *)ftrace_nop;
 	else
-		new = (char *)&ftrace_jmp;
+		new = (char *)ftrace_nop;
 
 	for (pg = ftrace_pages_start; pg; pg = pg->next) {
 		for (i = 0; i < pg->index; i++) {
@@ -194,12 +199,15 @@ notrace void ftrace_shutdown_replenish(void)
 	ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
 }
 
-notrace int ftrace_shutdown_arch_init(void)
+notrace int __init ftrace_shutdown_arch_init(void)
 {
+	const unsigned char *const *noptable = find_nop_table();
 	struct ftrace_page *pg;
 	int cnt;
 	int i;
 
+	ftrace_nop = (unsigned long *)noptable[CALL_BACK];
+
 	/* allocate a few pages */
 	ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL);
 	if (!ftrace_pages_start)
diff --git a/include/asm-x86/alternative.h b/include/asm-x86/alternative.h
index 1f6a9ca10126..f6aa18eadf71 100644
--- a/include/asm-x86/alternative.h
+++ b/include/asm-x86/alternative.h
@@ -72,6 +72,8 @@ static inline void alternatives_smp_module_del(struct module *mod) {}
 static inline void alternatives_smp_switch(int smp) {}
 #endif	/* CONFIG_SMP */
 
+const unsigned char *const *find_nop_table(void);
+
 /*
  * Alternative instructions for different CPU types or capabilities.
  *
-- 
cgit v1.2.3


From 3c1720f00bb619302ba19d55986ab565e74d06db Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:43 +0200
Subject: ftrace: move memory management out of arch code

This patch moves the memory management of the ftrace
records out of the arch code and into the generic code
making the arch code simpler.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/ftrace.c | 183 ++++++++---------------------------------------
 include/linux/ftrace.h   |  18 +++--
 kernel/trace/ftrace.c    | 154 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 192 insertions(+), 163 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 2e060c58b860..b69795efa226 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -23,25 +23,6 @@
 /* Long is fine, even if it is only 4 bytes ;-) */
 static long *ftrace_nop;
 
-struct ftrace_record {
-	struct dyn_ftrace	rec;
-	int			failed;
-} __attribute__((packed));
-
-struct ftrace_page {
-	struct ftrace_page	*next;
-	int			index;
-	struct ftrace_record	records[];
-} __attribute__((packed));
-
-#define ENTRIES_PER_PAGE \
-  ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct ftrace_record))
-
-/* estimate from running different kernels */
-#define NR_TO_INIT		10000
-
-#define MCOUNT_ADDR ((long)(&mcount))
-
 union ftrace_code_union {
 	char code[5];
 	struct {
@@ -50,33 +31,41 @@ union ftrace_code_union {
 	} __attribute__((packed));
 };
 
-static struct ftrace_page	*ftrace_pages_start;
-static struct ftrace_page	*ftrace_pages;
-
-notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip)
+notrace int ftrace_ip_converted(unsigned long ip)
 {
-	struct ftrace_record *rec;
 	unsigned long save;
 
 	ip -= CALL_BACK;
 	save = *(long *)ip;
 
-	/* If this was already converted, skip it */
-	if (save == *ftrace_nop)
-		return NULL;
+	return save == *ftrace_nop;
+}
 
-	if (ftrace_pages->index == ENTRIES_PER_PAGE) {
-		if (!ftrace_pages->next)
-			return NULL;
-		ftrace_pages = ftrace_pages->next;
-	}
+static int notrace ftrace_calc_offset(long ip, long addr)
+{
+	return (int)(addr - ip);
+}
 
-	rec = &ftrace_pages->records[ftrace_pages->index++];
+notrace unsigned char *ftrace_nop_replace(void)
+{
+	return (char *)ftrace_nop;
+}
+
+notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+{
+	static union ftrace_code_union calc;
 
-	return &rec->rec;
+	calc.e8		= 0xe8;
+	calc.offset	= ftrace_calc_offset(ip, addr);
+
+	/*
+	 * No locking needed, this must be called via kstop_machine
+	 * which in essence is like running on a uniprocessor machine.
+	 */
+	return calc.code;
 }
 
-static int notrace
+notrace int
 ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 		   unsigned char *new_code)
 {
@@ -86,6 +75,9 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 	unsigned char newch = new_code[4];
 	int faulted = 0;
 
+	/* move the IP back to the start of the call */
+	ip -= CALL_BACK;
+
 	/*
 	 * Note: Due to modules and __init, code can
 	 *  disappear and change, we need to protect against faulting
@@ -117,129 +109,12 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 	return faulted;
 }
 
-static int notrace ftrace_calc_offset(long ip)
-{
-	return (int)(MCOUNT_ADDR - ip);
-}
-
-notrace void ftrace_code_disable(struct dyn_ftrace *rec)
-{
-	unsigned long ip;
-	union ftrace_code_union save;
-	struct ftrace_record *r =
-		container_of(rec, struct ftrace_record, rec);
-
-	ip = rec->ip;
-
-	save.e8		= 0xe8;
-	save.offset 	= ftrace_calc_offset(ip);
-
-	/* move the IP back to the start of the call */
-	ip -= CALL_BACK;
-
-	r->failed = ftrace_modify_code(ip, save.code, (char *)ftrace_nop);
-}
-
-static void notrace ftrace_replace_code(int saved)
-{
-	unsigned char *new = NULL, *old = NULL;
-	struct ftrace_record *rec;
-	struct ftrace_page *pg;
-	unsigned long ip;
-	int i;
-
-	if (saved)
-		old = (char *)ftrace_nop;
-	else
-		new = (char *)ftrace_nop;
-
-	for (pg = ftrace_pages_start; pg; pg = pg->next) {
-		for (i = 0; i < pg->index; i++) {
-			union ftrace_code_union calc;
-			rec = &pg->records[i];
-
-			/* don't modify code that has already faulted */
-			if (rec->failed)
-				continue;
-
-			ip = rec->rec.ip;
-
-			calc.e8		= 0xe8;
-			calc.offset	= ftrace_calc_offset(ip);
-
-			if (saved)
-				new = calc.code;
-			else
-				old = calc.code;
-
-			ip -= CALL_BACK;
-
-			rec->failed = ftrace_modify_code(ip, old, new);
-		}
-	}
-
-}
-
-notrace void ftrace_startup_code(void)
-{
-	ftrace_replace_code(1);
-}
-
-notrace void ftrace_shutdown_code(void)
-{
-	ftrace_replace_code(0);
-}
-
-notrace void ftrace_shutdown_replenish(void)
-{
-	if (ftrace_pages->next)
-		return;
-
-	/* allocate another page */
-	ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
-}
-
-notrace int __init ftrace_shutdown_arch_init(void)
+int __init ftrace_dyn_arch_init(void)
 {
 	const unsigned char *const *noptable = find_nop_table();
-	struct ftrace_page *pg;
-	int cnt;
-	int i;
 
 	ftrace_nop = (unsigned long *)noptable[CALL_BACK];
 
-	/* allocate a few pages */
-	ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL);
-	if (!ftrace_pages_start)
-		return -1;
-
-	/*
-	 * Allocate a few more pages.
-	 *
-	 * TODO: have some parser search vmlinux before
-	 *   final linking to find all calls to ftrace.
-	 *   Then we can:
-	 *    a) know how many pages to allocate.
-	 *     and/or
-	 *    b) set up the table then.
-	 *
-	 *  The dynamic code is still necessary for
-	 *  modules.
-	 */
-
-	pg = ftrace_pages = ftrace_pages_start;
-
-	cnt = NR_TO_INIT / ENTRIES_PER_PAGE;
-
-	for (i = 0; i < cnt; i++) {
-		pg->next = (void *)get_zeroed_page(GFP_KERNEL);
-
-		/* If we fail, we'll try later anyway */
-		if (!pg->next)
-			break;
-
-		pg = pg->next;
-	}
-
 	return 0;
 }
+
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index ccd8537dbdb7..d509ad6c9cb8 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -42,19 +42,23 @@ extern void mcount(void);
 # define FTRACE_HASHBITS	10
 # define FTRACE_HASHSIZE	(1<<FTRACE_HASHBITS)
 
+enum {
+	FTRACE_FL_FAILED	= (1<<0),
+};
+
 struct dyn_ftrace {
 	struct hlist_node node;
 	unsigned long	  ip;
+	unsigned long	  flags;
 };
 
 /* defined in arch */
-extern struct dyn_ftrace *
-ftrace_alloc_shutdown_node(unsigned long ip);
-extern int ftrace_shutdown_arch_init(void);
-extern void ftrace_code_disable(struct dyn_ftrace *rec);
-extern void ftrace_startup_code(void);
-extern void ftrace_shutdown_code(void);
-extern void ftrace_shutdown_replenish(void);
+extern int ftrace_ip_converted(unsigned long ip);
+extern unsigned char *ftrace_nop_replace(void);
+extern unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr);
+extern int ftrace_dyn_arch_init(void);
+extern int ftrace_modify_code(unsigned long ip, unsigned char *old_code,
+			      unsigned char *new_code);
 #endif
 
 #ifdef CONFIG_FRAME_POINTER
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d3de37299ba4..f6d9af3bf66b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -156,6 +156,21 @@ static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu);
 static DEFINE_SPINLOCK(ftrace_shutdown_lock);
 static DEFINE_MUTEX(ftraced_lock);
 
+struct ftrace_page {
+	struct ftrace_page	*next;
+	int			index;
+	struct dyn_ftrace	records[];
+} __attribute__((packed));
+
+#define ENTRIES_PER_PAGE \
+  ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
+
+/* estimate from running different kernels */
+#define NR_TO_INIT		10000
+
+static struct ftrace_page	*ftrace_pages_start;
+static struct ftrace_page	*ftrace_pages;
+
 static int ftraced_trigger;
 static int ftraced_suspend;
 
@@ -184,6 +199,21 @@ ftrace_add_hash(struct dyn_ftrace *node, unsigned long key)
 	hlist_add_head(&node->node, &ftrace_hash[key]);
 }
 
+static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip)
+{
+	/* If this was already converted, skip it */
+	if (ftrace_ip_converted(ip))
+		return NULL;
+
+	if (ftrace_pages->index == ENTRIES_PER_PAGE) {
+		if (!ftrace_pages->next)
+			return NULL;
+		ftrace_pages = ftrace_pages->next;
+	}
+
+	return &ftrace_pages->records[ftrace_pages->index++];
+}
+
 static void notrace
 ftrace_record_ip(unsigned long ip, unsigned long parent_ip)
 {
@@ -252,6 +282,62 @@ static struct ftrace_ops ftrace_shutdown_ops __read_mostly =
 	.func = ftrace_record_ip,
 };
 
+#define MCOUNT_ADDR ((long)(&mcount))
+
+static void notrace ftrace_replace_code(int saved)
+{
+	unsigned char *new = NULL, *old = NULL;
+	struct dyn_ftrace *rec;
+	struct ftrace_page *pg;
+	unsigned long ip;
+	int failed;
+	int i;
+
+	if (saved)
+		old = ftrace_nop_replace();
+	else
+		new = ftrace_nop_replace();
+
+	for (pg = ftrace_pages_start; pg; pg = pg->next) {
+		for (i = 0; i < pg->index; i++) {
+			rec = &pg->records[i];
+
+			/* don't modify code that has already faulted */
+			if (rec->flags & FTRACE_FL_FAILED)
+				continue;
+
+			ip = rec->ip;
+
+			if (saved)
+				new = ftrace_call_replace(ip, MCOUNT_ADDR);
+			else
+				old = ftrace_call_replace(ip, MCOUNT_ADDR);
+
+			failed = ftrace_modify_code(ip, old, new);
+			if (failed)
+				rec->flags |= FTRACE_FL_FAILED;
+		}
+	}
+}
+
+static notrace void ftrace_startup_code(void)
+{
+	ftrace_replace_code(1);
+}
+
+static notrace void ftrace_shutdown_code(void)
+{
+	ftrace_replace_code(0);
+}
+
+static notrace void ftrace_shutdown_replenish(void)
+{
+	if (ftrace_pages->next)
+		return;
+
+	/* allocate another page */
+	ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
+}
 
 static int notrace __ftrace_modify_code(void *data)
 {
@@ -261,6 +347,23 @@ static int notrace __ftrace_modify_code(void *data)
 	return 0;
 }
 
+static notrace void
+ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long ip;
+	unsigned char *nop, *call;
+	int failed;
+
+	ip = rec->ip;
+
+	nop = ftrace_nop_replace();
+	call = ftrace_call_replace(ip, addr);
+
+	failed = ftrace_modify_code(ip, call, nop);
+	if (failed)
+		rec->flags |= FTRACE_FL_FAILED;
+}
+
 static void notrace ftrace_run_startup_code(void)
 {
 	stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS);
@@ -346,7 +449,7 @@ static int notrace __ftrace_update_code(void *ignore)
 
 		/* all CPUS are stopped, we are safe to modify code */
 		hlist_for_each_entry(p, t, &head, node) {
-			ftrace_code_disable(p);
+			ftrace_code_disable(p, MCOUNT_ADDR);
 			ftrace_update_cnt++;
 		}
 
@@ -407,12 +510,59 @@ static int notrace ftraced(void *ignore)
 	return 0;
 }
 
+static int __init ftrace_dyn_table_alloc(void)
+{
+	struct ftrace_page *pg;
+	int cnt;
+	int i;
+	int ret;
+
+	ret = ftrace_dyn_arch_init();
+	if (ret)
+		return ret;
+
+	/* allocate a few pages */
+	ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!ftrace_pages_start)
+		return -1;
+
+	/*
+	 * Allocate a few more pages.
+	 *
+	 * TODO: have some parser search vmlinux before
+	 *   final linking to find all calls to ftrace.
+	 *   Then we can:
+	 *    a) know how many pages to allocate.
+	 *     and/or
+	 *    b) set up the table then.
+	 *
+	 *  The dynamic code is still necessary for
+	 *  modules.
+	 */
+
+	pg = ftrace_pages = ftrace_pages_start;
+
+	cnt = NR_TO_INIT / ENTRIES_PER_PAGE;
+
+	for (i = 0; i < cnt; i++) {
+		pg->next = (void *)get_zeroed_page(GFP_KERNEL);
+
+		/* If we fail, we'll try later anyway */
+		if (!pg->next)
+			break;
+
+		pg = pg->next;
+	}
+
+	return 0;
+}
+
 static int __init notrace ftrace_shutdown_init(void)
 {
 	struct task_struct *p;
 	int ret;
 
-	ret = ftrace_shutdown_arch_init();
+	ret = ftrace_dyn_table_alloc();
 	if (ret)
 		return ret;
 
-- 
cgit v1.2.3


From d61f82d06672f57fca410da6f7fffd15867db622 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:43 +0200
Subject: ftrace: use dynamic patching for updating mcount calls

This patch replaces the indirect call to the mcount function
pointer with a direct call that will be patched by the
dynamic ftrace routines.

On boot up, the mcount function calls the ftace_stub function.
When the dynamic ftrace code is initialized, the ftrace_stub
is replaced with a call to the ftrace_record_ip, which records
the instruction pointers of the locations that call it.

Later, the ftraced daemon will call kstop_machine and patch all
the locations to nops.

When a ftrace is enabled, the original calls to mcount will now
be set top call ftrace_caller, which will do a direct call
to the registered ftrace function. This direct call is also patched
when the function that should be called is updated.

All patching is performed by a kstop_machine routine to prevent any
type of race conditions that is associated with modifying code
on the fly.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/entry_32.S |  47 +++++++++++-
 arch/x86/kernel/entry_64.S |  67 ++++++++++++++++-
 arch/x86/kernel/ftrace.c   |  41 +++++++++-
 include/linux/ftrace.h     |   7 +-
 kernel/trace/ftrace.c      | 183 ++++++++++++++++++++++++++-------------------
 5 files changed, 261 insertions(+), 84 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index f47b9b5440d2..e6517ce0b824 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1110,10 +1110,50 @@ ENDPROC(xen_failsafe_callback)
 #endif	/* CONFIG_XEN */
 
 #ifdef CONFIG_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+ENTRY(mcount)
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+	movl 0xc(%esp), %eax
+
+.globl mcount_call
+mcount_call:
+	call ftrace_stub
+
+	popl %edx
+	popl %ecx
+	popl %eax
+
+	ret
+END(mcount)
+
+ENTRY(ftrace_caller)
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+	movl 0xc(%esp), %eax
+	movl 0x4(%ebp), %edx
+
+.globl ftrace_call
+ftrace_call:
+	call ftrace_stub
+
+	popl %edx
+	popl %ecx
+	popl %eax
+
+.globl ftrace_stub
+ftrace_stub:
+	ret
+END(ftrace_caller)
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
 ENTRY(mcount)
 	cmpl $ftrace_stub, ftrace_trace_function
 	jnz trace
-
 .globl ftrace_stub
 ftrace_stub:
 	ret
@@ -1126,7 +1166,7 @@ trace:
 	movl 0xc(%esp), %eax
 	movl 0x4(%ebp), %edx
 
-	call   *ftrace_trace_function
+	call *ftrace_trace_function
 
 	popl %edx
 	popl %ecx
@@ -1134,7 +1174,8 @@ trace:
 
 	jmp ftrace_stub
 END(mcount)
-#endif
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FTRACE */
 
 .section .rodata,"a"
 #include "syscall_table_32.S"
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index f046e0c64883..fe25e5febca3 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -55,6 +55,70 @@
 	.code64
 
 #ifdef CONFIG_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+ENTRY(mcount)
+
+	subq $0x38, %rsp
+	movq %rax, (%rsp)
+	movq %rcx, 8(%rsp)
+	movq %rdx, 16(%rsp)
+	movq %rsi, 24(%rsp)
+	movq %rdi, 32(%rsp)
+	movq %r8, 40(%rsp)
+	movq %r9, 48(%rsp)
+
+	movq 0x38(%rsp), %rdi
+
+.globl mcount_call
+mcount_call:
+	call ftrace_stub
+
+	movq 48(%rsp), %r9
+	movq 40(%rsp), %r8
+	movq 32(%rsp), %rdi
+	movq 24(%rsp), %rsi
+	movq 16(%rsp), %rdx
+	movq 8(%rsp), %rcx
+	movq (%rsp), %rax
+	addq $0x38, %rsp
+
+	retq
+END(mcount)
+
+ENTRY(ftrace_caller)
+
+	/* taken from glibc */
+	subq $0x38, %rsp
+	movq %rax, (%rsp)
+	movq %rcx, 8(%rsp)
+	movq %rdx, 16(%rsp)
+	movq %rsi, 24(%rsp)
+	movq %rdi, 32(%rsp)
+	movq %r8, 40(%rsp)
+	movq %r9, 48(%rsp)
+
+	movq 0x38(%rsp), %rdi
+	movq 8(%rbp), %rsi
+
+.globl ftrace_call
+ftrace_call:
+	call ftrace_stub
+
+	movq 48(%rsp), %r9
+	movq 40(%rsp), %r8
+	movq 32(%rsp), %rdi
+	movq 24(%rsp), %rsi
+	movq 16(%rsp), %rdx
+	movq 8(%rsp), %rcx
+	movq (%rsp), %rax
+	addq $0x38, %rsp
+
+.globl ftrace_stub
+ftrace_stub:
+	retq
+END(ftrace_caller)
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
 ENTRY(mcount)
 	cmpq $ftrace_stub, ftrace_trace_function
 	jnz trace
@@ -89,7 +153,8 @@ trace:
 
 	jmp ftrace_stub
 END(mcount)
-#endif
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FTRACE */
 
 #ifndef CONFIG_PREEMPT
 #define retint_kernel retint_restore_args
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index b69795efa226..9f44623e0072 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -109,10 +109,49 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 	return faulted;
 }
 
-int __init ftrace_dyn_arch_init(void)
+notrace int ftrace_update_ftrace_func(ftrace_func_t func)
+{
+	unsigned long ip = (unsigned long)(&ftrace_call);
+	unsigned char old[5], *new;
+	int ret;
+
+	ip += CALL_BACK;
+
+	memcpy(old, &ftrace_call, 5);
+	new = ftrace_call_replace(ip, (unsigned long)func);
+	ret = ftrace_modify_code(ip, old, new);
+
+	return ret;
+}
+
+notrace int ftrace_mcount_set(unsigned long *data)
+{
+	unsigned long ip = (long)(&mcount_call);
+	unsigned long *addr = data;
+	unsigned char old[5], *new;
+
+	/* ip is at the location, but modify code will subtact this */
+	ip += CALL_BACK;
+
+	/*
+	 * Replace the mcount stub with a pointer to the
+	 * ip recorder function.
+	 */
+	memcpy(old, &mcount_call, 5);
+	new = ftrace_call_replace(ip, *addr);
+	*addr = ftrace_modify_code(ip, old, new);
+
+	return 0;
+}
+
+int __init ftrace_dyn_arch_init(void *data)
 {
 	const unsigned char *const *noptable = find_nop_table();
 
+	/* This is running in kstop_machine */
+
+	ftrace_mcount_set(data);
+
 	ftrace_nop = (unsigned long *)noptable[CALL_BACK];
 
 	return 0;
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index d509ad6c9cb8..b0dd0093058f 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -56,9 +56,14 @@ struct dyn_ftrace {
 extern int ftrace_ip_converted(unsigned long ip);
 extern unsigned char *ftrace_nop_replace(void);
 extern unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr);
-extern int ftrace_dyn_arch_init(void);
+extern int ftrace_dyn_arch_init(void *data);
+extern int ftrace_mcount_set(unsigned long *data);
 extern int ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 			      unsigned char *new_code);
+extern int ftrace_update_ftrace_func(ftrace_func_t func);
+extern void ftrace_caller(void);
+extern void ftrace_call(void);
+extern void mcount_call(void);
 #endif
 
 #ifdef CONFIG_FRAME_POINTER
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f6d9af3bf66b..88544f9bc0ed 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -26,14 +26,8 @@
 
 #include "trace.h"
 
-#ifdef CONFIG_DYNAMIC_FTRACE
-# define FTRACE_ENABLED_INIT 1
-#else
-# define FTRACE_ENABLED_INIT 0
-#endif
-
-int ftrace_enabled = FTRACE_ENABLED_INIT;
-static int last_ftrace_enabled = FTRACE_ENABLED_INIT;
+int ftrace_enabled;
+static int last_ftrace_enabled;
 
 static DEFINE_SPINLOCK(ftrace_lock);
 static DEFINE_MUTEX(ftrace_sysctl_lock);
@@ -149,6 +143,14 @@ static int notrace __unregister_ftrace_function(struct ftrace_ops *ops)
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 
+enum {
+	FTRACE_ENABLE_CALLS		= (1 << 0),
+	FTRACE_DISABLE_CALLS		= (1 << 1),
+	FTRACE_UPDATE_TRACE_FUNC	= (1 << 2),
+	FTRACE_ENABLE_MCOUNT		= (1 << 3),
+	FTRACE_DISABLE_MCOUNT		= (1 << 4),
+};
+
 static struct hlist_head ftrace_hash[FTRACE_HASHSIZE];
 
 static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu);
@@ -199,12 +201,8 @@ ftrace_add_hash(struct dyn_ftrace *node, unsigned long key)
 	hlist_add_head(&node->node, &ftrace_hash[key]);
 }
 
-static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip)
+static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
 {
-	/* If this was already converted, skip it */
-	if (ftrace_ip_converted(ip))
-		return NULL;
-
 	if (ftrace_pages->index == ENTRIES_PER_PAGE) {
 		if (!ftrace_pages->next)
 			return NULL;
@@ -215,7 +213,7 @@ static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip)
 }
 
 static void notrace
-ftrace_record_ip(unsigned long ip, unsigned long parent_ip)
+ftrace_record_ip(unsigned long ip)
 {
 	struct dyn_ftrace *node;
 	unsigned long flags;
@@ -223,6 +221,9 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip)
 	int resched;
 	int atomic;
 
+	if (!ftrace_enabled)
+		return;
+
 	resched = need_resched();
 	preempt_disable_notrace();
 
@@ -251,11 +252,12 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip)
 
 	/*
 	 * There's a slight race that the ftraced will update the
-	 * hash and reset here. The arch alloc is responsible
-	 * for seeing if the IP has already changed, and if
-	 * it has, the alloc will fail.
+	 * hash and reset here. If it is already converted, skip it.
 	 */
-	node = ftrace_alloc_shutdown_node(ip);
+	if (ftrace_ip_converted(ip))
+		goto out_unlock;
+
+	node = ftrace_alloc_dyn_node(ip);
 	if (!node)
 		goto out_unlock;
 
@@ -277,11 +279,7 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip)
 		preempt_enable_notrace();
 }
 
-static struct ftrace_ops ftrace_shutdown_ops __read_mostly =
-{
-	.func = ftrace_record_ip,
-};
-
+#define FTRACE_ADDR ((long)(&ftrace_caller))
 #define MCOUNT_ADDR ((long)(&mcount))
 
 static void notrace ftrace_replace_code(int saved)
@@ -309,9 +307,9 @@ static void notrace ftrace_replace_code(int saved)
 			ip = rec->ip;
 
 			if (saved)
-				new = ftrace_call_replace(ip, MCOUNT_ADDR);
+				new = ftrace_call_replace(ip, FTRACE_ADDR);
 			else
-				old = ftrace_call_replace(ip, MCOUNT_ADDR);
+				old = ftrace_call_replace(ip, FTRACE_ADDR);
 
 			failed = ftrace_modify_code(ip, old, new);
 			if (failed)
@@ -320,16 +318,6 @@ static void notrace ftrace_replace_code(int saved)
 	}
 }
 
-static notrace void ftrace_startup_code(void)
-{
-	ftrace_replace_code(1);
-}
-
-static notrace void ftrace_shutdown_code(void)
-{
-	ftrace_replace_code(0);
-}
-
 static notrace void ftrace_shutdown_replenish(void)
 {
 	if (ftrace_pages->next)
@@ -339,16 +327,8 @@ static notrace void ftrace_shutdown_replenish(void)
 	ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
 }
 
-static int notrace __ftrace_modify_code(void *data)
-{
-	void (*func)(void) = data;
-
-	func();
-	return 0;
-}
-
 static notrace void
-ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr)
+ftrace_code_disable(struct dyn_ftrace *rec)
 {
 	unsigned long ip;
 	unsigned char *nop, *call;
@@ -357,67 +337,113 @@ ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr)
 	ip = rec->ip;
 
 	nop = ftrace_nop_replace();
-	call = ftrace_call_replace(ip, addr);
+	call = ftrace_call_replace(ip, MCOUNT_ADDR);
 
 	failed = ftrace_modify_code(ip, call, nop);
 	if (failed)
 		rec->flags |= FTRACE_FL_FAILED;
 }
 
-static void notrace ftrace_run_startup_code(void)
+static int notrace __ftrace_modify_code(void *data)
 {
-	stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS);
+	unsigned long addr;
+	int *command = data;
+
+	if (*command & FTRACE_ENABLE_CALLS)
+		ftrace_replace_code(1);
+	else if (*command & FTRACE_DISABLE_CALLS)
+		ftrace_replace_code(0);
+
+	if (*command & FTRACE_UPDATE_TRACE_FUNC)
+		ftrace_update_ftrace_func(ftrace_trace_function);
+
+	if (*command & FTRACE_ENABLE_MCOUNT) {
+		addr = (unsigned long)ftrace_record_ip;
+		ftrace_mcount_set(&addr);
+	} else if (*command & FTRACE_DISABLE_MCOUNT) {
+		addr = (unsigned long)ftrace_stub;
+		ftrace_mcount_set(&addr);
+	}
+
+	return 0;
 }
 
-static void notrace ftrace_run_shutdown_code(void)
+static void notrace ftrace_run_update_code(int command)
 {
-	stop_machine_run(__ftrace_modify_code, ftrace_shutdown_code, NR_CPUS);
+	stop_machine_run(__ftrace_modify_code, &command, NR_CPUS);
 }
 
+static ftrace_func_t saved_ftrace_func;
+
 static void notrace ftrace_startup(void)
 {
+	int command = 0;
+
 	mutex_lock(&ftraced_lock);
 	ftraced_suspend++;
-	if (ftraced_suspend != 1)
+	if (ftraced_suspend == 1)
+		command |= FTRACE_ENABLE_CALLS;
+
+	if (saved_ftrace_func != ftrace_trace_function) {
+		saved_ftrace_func = ftrace_trace_function;
+		command |= FTRACE_UPDATE_TRACE_FUNC;
+	}
+
+	if (!command || !ftrace_enabled)
 		goto out;
-	__unregister_ftrace_function(&ftrace_shutdown_ops);
 
-	if (ftrace_enabled)
-		ftrace_run_startup_code();
+	ftrace_run_update_code(command);
  out:
 	mutex_unlock(&ftraced_lock);
 }
 
 static void notrace ftrace_shutdown(void)
 {
+	int command = 0;
+
 	mutex_lock(&ftraced_lock);
 	ftraced_suspend--;
-	if (ftraced_suspend)
-		goto out;
+	if (!ftraced_suspend)
+		command |= FTRACE_DISABLE_CALLS;
 
-	if (ftrace_enabled)
-		ftrace_run_shutdown_code();
+	if (saved_ftrace_func != ftrace_trace_function) {
+		saved_ftrace_func = ftrace_trace_function;
+		command |= FTRACE_UPDATE_TRACE_FUNC;
+	}
 
-	__register_ftrace_function(&ftrace_shutdown_ops);
+	if (!command || !ftrace_enabled)
+		goto out;
+
+	ftrace_run_update_code(command);
  out:
 	mutex_unlock(&ftraced_lock);
 }
 
 static void notrace ftrace_startup_sysctl(void)
 {
+	int command = FTRACE_ENABLE_MCOUNT;
+
 	mutex_lock(&ftraced_lock);
+	/* Force update next time */
+	saved_ftrace_func = NULL;
 	/* ftraced_suspend is true if we want ftrace running */
 	if (ftraced_suspend)
-		ftrace_run_startup_code();
+		command |= FTRACE_ENABLE_CALLS;
+
+	ftrace_run_update_code(command);
 	mutex_unlock(&ftraced_lock);
 }
 
 static void notrace ftrace_shutdown_sysctl(void)
 {
+	int command = FTRACE_DISABLE_MCOUNT;
+
 	mutex_lock(&ftraced_lock);
 	/* ftraced_suspend is true if ftrace is running */
 	if (ftraced_suspend)
-		ftrace_run_shutdown_code();
+		command |= FTRACE_DISABLE_CALLS;
+
+	ftrace_run_update_code(command);
 	mutex_unlock(&ftraced_lock);
 }
 
@@ -430,11 +456,13 @@ static int notrace __ftrace_update_code(void *ignore)
 	struct dyn_ftrace *p;
 	struct hlist_head head;
 	struct hlist_node *t;
+	int save_ftrace_enabled;
 	cycle_t start, stop;
 	int i;
 
-	/* Don't be calling ftrace ops now */
-	__unregister_ftrace_function(&ftrace_shutdown_ops);
+	/* Don't be recording funcs now */
+	save_ftrace_enabled = ftrace_enabled;
+	ftrace_enabled = 0;
 
 	start = now(raw_smp_processor_id());
 	ftrace_update_cnt = 0;
@@ -449,7 +477,7 @@ static int notrace __ftrace_update_code(void *ignore)
 
 		/* all CPUS are stopped, we are safe to modify code */
 		hlist_for_each_entry(p, t, &head, node) {
-			ftrace_code_disable(p, MCOUNT_ADDR);
+			ftrace_code_disable(p);
 			ftrace_update_cnt++;
 		}
 
@@ -459,7 +487,7 @@ static int notrace __ftrace_update_code(void *ignore)
 	ftrace_update_time = stop - start;
 	ftrace_update_tot_cnt += ftrace_update_cnt;
 
-	__register_ftrace_function(&ftrace_shutdown_ops);
+	ftrace_enabled = save_ftrace_enabled;
 
 	return 0;
 }
@@ -515,11 +543,6 @@ static int __init ftrace_dyn_table_alloc(void)
 	struct ftrace_page *pg;
 	int cnt;
 	int i;
-	int ret;
-
-	ret = ftrace_dyn_arch_init();
-	if (ret)
-		return ret;
 
 	/* allocate a few pages */
 	ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL);
@@ -557,11 +580,19 @@ static int __init ftrace_dyn_table_alloc(void)
 	return 0;
 }
 
-static int __init notrace ftrace_shutdown_init(void)
+static int __init notrace ftrace_dynamic_init(void)
 {
 	struct task_struct *p;
+	unsigned long addr;
 	int ret;
 
+	addr = (unsigned long)ftrace_record_ip;
+	stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS);
+
+	/* ftrace_dyn_arch_init places the return code in addr */
+	if (addr)
+		return addr;
+
 	ret = ftrace_dyn_table_alloc();
 	if (ret)
 		return ret;
@@ -570,12 +601,12 @@ static int __init notrace ftrace_shutdown_init(void)
 	if (IS_ERR(p))
 		return -1;
 
-	__register_ftrace_function(&ftrace_shutdown_ops);
+	last_ftrace_enabled = ftrace_enabled = 1;
 
 	return 0;
 }
 
-core_initcall(ftrace_shutdown_init);
+core_initcall(ftrace_dynamic_init);
 #else
 # define ftrace_startup()	  do { } while (0)
 # define ftrace_shutdown()	  do { } while (0)
@@ -599,9 +630,8 @@ int register_ftrace_function(struct ftrace_ops *ops)
 	int ret;
 
 	mutex_lock(&ftrace_sysctl_lock);
-	ftrace_startup();
-
 	ret = __register_ftrace_function(ops);
+	ftrace_startup();
 	mutex_unlock(&ftrace_sysctl_lock);
 
 	return ret;
@@ -619,10 +649,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
 
 	mutex_lock(&ftrace_sysctl_lock);
 	ret = __unregister_ftrace_function(ops);
-
-	if (ftrace_list == &ftrace_list_end)
-		ftrace_shutdown();
-
+	ftrace_shutdown();
 	mutex_unlock(&ftrace_sysctl_lock);
 
 	return ret;
-- 
cgit v1.2.3


From f43fdad8627fec2d21df92799b254dceb66c9c3c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:43 +0200
Subject: ftrace: fix kexec

disable the tracer while kexec pulls the rug from under the old
kernel.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/machine_kexec_32.c | 4 ++++
 arch/x86/kernel/machine_kexec_64.c | 4 ++++
 include/linux/ftrace.h             | 7 +++++++
 3 files changed, 15 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index d0b234c9fc31..88923fd7a6fc 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -11,6 +11,8 @@
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/numa.h>
+#include <linux/ftrace.h>
+
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
@@ -107,6 +109,8 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 	unsigned long page_list[PAGES_NR];
 	void *control_page;
 
+	tracer_disable();
+
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
 
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 576a03db4511..1558fdc174f9 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -11,6 +11,8 @@
 #include <linux/string.h>
 #include <linux/reboot.h>
 #include <linux/numa.h>
+#include <linux/ftrace.h>
+
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -184,6 +186,8 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 	unsigned long page_list[PAGES_NR];
 	void *control_page;
 
+	tracer_disable();
+
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
 
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index f5911d2d42c3..a42390c1d6e1 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -68,6 +68,13 @@ extern void ftrace_call(void);
 extern void mcount_call(void);
 #endif
 
+static inline void tracer_disable(void)
+{
+#ifdef CONFIG_FTRACE
+	ftrace_enabled = 0;
+#endif
+}
+
 #ifdef CONFIG_FRAME_POINTER
 /* TODO: need to fix this for ARM */
 # define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
-- 
cgit v1.2.3


From a56be3fe2f65f9f776e727bfd382e35db75911d6 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:20:56 +0200
Subject: ftrace: fix the fault label in updating code

The fault label to jump to on fault of updating the code was misplaced
preventing the fault from being recorded.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/ftrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 9f44623e0072..498608c015fb 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -93,8 +93,8 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 		"   movb %b4, 4(%2)\n"
 		"2:\n"
 		".section .fixup, \"ax\"\n"
-		"	movl $1, %0\n"
-		"3:	jmp 2b\n"
+		"3:	movl $1, %0\n"
+		"	jmp 2b\n"
 		".previous\n"
 		_ASM_EXTABLE(1b, 3b)
 		: "=r"(faulted), "=a"(replaced)
-- 
cgit v1.2.3


From 2f1dafe50cc4e58a239fd81bd47f87f32042a1ee Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Mon, 12 May 2008 21:21:01 +0200
Subject: x86: fix SMP alternatives: use mutex instead of spinlock, text_poke
 is sleepable

text_poke is sleepable.
The original fix by Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>.

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/alternative.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index de240ba2e288..2763cb37b553 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1,6 +1,6 @@
 #include <linux/module.h>
 #include <linux/sched.h>
-#include <linux/spinlock.h>
+#include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/kprobes.h>
 #include <linux/mm.h>
@@ -279,7 +279,7 @@ struct smp_alt_module {
 	struct list_head next;
 };
 static LIST_HEAD(smp_alt_modules);
-static DEFINE_SPINLOCK(smp_alt);
+static DEFINE_MUTEX(smp_alt);
 static int smp_mode = 1;	/* protected by smp_alt */
 
 void alternatives_smp_module_add(struct module *mod, char *name,
@@ -312,12 +312,12 @@ void alternatives_smp_module_add(struct module *mod, char *name,
 		__func__, smp->locks, smp->locks_end,
 		smp->text, smp->text_end, smp->name);
 
-	spin_lock(&smp_alt);
+	mutex_lock(&smp_alt);
 	list_add_tail(&smp->next, &smp_alt_modules);
 	if (boot_cpu_has(X86_FEATURE_UP))
 		alternatives_smp_unlock(smp->locks, smp->locks_end,
 					smp->text, smp->text_end);
-	spin_unlock(&smp_alt);
+	mutex_unlock(&smp_alt);
 }
 
 void alternatives_smp_module_del(struct module *mod)
@@ -327,17 +327,17 @@ void alternatives_smp_module_del(struct module *mod)
 	if (smp_alt_once || noreplace_smp)
 		return;
 
-	spin_lock(&smp_alt);
+	mutex_lock(&smp_alt);
 	list_for_each_entry(item, &smp_alt_modules, next) {
 		if (mod != item->mod)
 			continue;
 		list_del(&item->next);
-		spin_unlock(&smp_alt);
+		mutex_unlock(&smp_alt);
 		DPRINTK("%s: %s\n", __func__, item->name);
 		kfree(item);
 		return;
 	}
-	spin_unlock(&smp_alt);
+	mutex_unlock(&smp_alt);
 }
 
 void alternatives_smp_switch(int smp)
@@ -359,7 +359,7 @@ void alternatives_smp_switch(int smp)
 		return;
 	BUG_ON(!smp && (num_online_cpus() > 1));
 
-	spin_lock(&smp_alt);
+	mutex_lock(&smp_alt);
 
 	/*
 	 * Avoid unnecessary switches because it forces JIT based VMs to
@@ -383,7 +383,7 @@ void alternatives_smp_switch(int smp)
 						mod->text, mod->text_end);
 	}
 	smp_mode = smp;
-	spin_unlock(&smp_alt);
+	mutex_unlock(&smp_alt);
 }
 
 #endif
-- 
cgit v1.2.3


From 37135677e653537ffc6e7def679443272a1c03c3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 14 May 2008 08:10:31 +0200
Subject: ftrace: fix mcount export bug

David S. Miller noticed the following bug: the -pg instrumentation
function callback is named differently on each platform. On x86 it
is mcount, on sparc it is _mcount. So the export does not make sense
in kernel/trace/ftrace.c - move it to x86.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/i386_ksyms_32.c  |  9 ++++++++-
 arch/x86/kernel/x8664_ksyms_64.c | 11 +++++++++--
 kernel/trace/ftrace.c            |  3 ---
 3 files changed, 17 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index deb43785e923..29999dbb754c 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -1,7 +1,14 @@
+#include <linux/ftrace.h>
 #include <linux/module.h>
+
 #include <asm/checksum.h>
-#include <asm/desc.h>
 #include <asm/pgtable.h>
+#include <asm/desc.h>
+
+#ifdef CONFIG_FTRACE
+/* mcount is defined in assembly */
+EXPORT_SYMBOL(mcount);
+#endif
 
 /* Networking helper routines. */
 EXPORT_SYMBOL(csum_partial_copy_generic);
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index f6c05d0410fb..122885bc5f3b 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -1,15 +1,22 @@
 /* Exports for assembly files.
    All C exports should go in the respective C files. */
 
+#include <linux/ftrace.h>
 #include <linux/module.h>
-#include <net/checksum.h>
 #include <linux/smp.h>
 
+#include <net/checksum.h>
+
 #include <asm/processor.h>
-#include <asm/uaccess.h>
 #include <asm/pgtable.h>
+#include <asm/uaccess.h>
 #include <asm/desc.h>
 
+#ifdef CONFIG_FTRACE
+/* mcount is defined in assembly */
+EXPORT_SYMBOL(mcount);
+#endif
+
 EXPORT_SYMBOL(kernel_thread);
 
 EXPORT_SYMBOL(__get_user_1);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 07b2a14943f8..a3e47f43f8a0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -50,9 +50,6 @@ static struct ftrace_ops ftrace_list_end __read_mostly =
 static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
 ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
 
-/* mcount is defined per arch in assembly */
-EXPORT_SYMBOL(mcount);
-
 void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
 {
 	struct ftrace_ops *op = ftrace_list;
-- 
cgit v1.2.3


From 7fa09f24b477ad41b821713eba757b3aa7a2864a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 14 May 2008 21:30:32 -0400
Subject: ftrace: use the new kbuild CFLAGS_REMOVE for x86/kernel directory

This patch removes the Makefile turd and uses the nice CFLAGS_REMOVE macro
in the x86/kernel directory.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/Makefile | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index e142091524b0..739d49acd2f1 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -6,6 +6,13 @@ extra-y                := head_$(BITS).o head$(BITS).o init_task.o vmlinux.lds
 
 CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 
+ifdef CONFIG_FTRACE
+# Do not profile debug utilities
+CFLAGS_REMOVE_tsc_64.o = -pg
+CFLAGS_REMOVE_tsc_32.o = -pg
+CFLAGS_REMOVE_rtc.o = -pg
+endif
+
 #
 # vsyscalls (which work on the user stack) should have
 # no stack-protector checks:
-- 
cgit v1.2.3


From 40bd21740012132afb62d78ac3e6b82372b2fbc2 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@suse.cz>
Date: Wed, 21 May 2008 11:44:02 +0200
Subject: x86: automatical unification of i8259.c

Make conversion of i8259 very mechanical -- i8259 was generated by
---
 arch/x86/kernel/Makefile   |   2 +-
 arch/x86/kernel/i8259.c    | 368 +++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/i8259_32.c | 295 ------------------------------------
 arch/x86/kernel/i8259_64.c | 309 -------------------------------------
 include/asm-x86/i8259.h    |   2 +
 5 files changed, 371 insertions(+), 605 deletions(-)
 create mode 100644 arch/x86/kernel/i8259.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5e618c3b4720..f71a76ef2596 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -18,7 +18,7 @@ CFLAGS_tsc_64.o		:= $(nostackp)
 obj-y			:= process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
 obj-y			+= traps_$(BITS).o irq_$(BITS).o
 obj-y			+= time_$(BITS).o ioport.o ldt.o
-obj-y			+= setup_$(BITS).o i8259_$(BITS).o setup.o
+obj-y			+= setup_$(BITS).o i8259.o i8259_$(BITS).o setup.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o setup64.o
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
new file mode 100644
index 000000000000..2decba6b0101
--- /dev/null
+++ b/arch/x86/kernel/i8259.c
@@ -0,0 +1,368 @@
+#ifdef CONFIG_X86_64
+#include <linux/linkage.h>
+#endif /* CONFIG_X86_64 */
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#ifdef CONFIG_X86_64
+#include <linux/timex.h>
+#endif /* CONFIG_X86_64 */
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+#include <linux/bitops.h>
+
+#ifdef CONFIG_X86_64
+#include <asm/acpi.h>
+#endif /* CONFIG_X86_64 */
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#ifndef CONFIG_X86_64
+#include <asm/timer.h>
+#else /* CONFIG_X86_64 */
+#include <asm/hw_irq.h>
+#endif /* CONFIG_X86_64 */
+#include <asm/pgtable.h>
+#include <asm/delay.h>
+#include <asm/desc.h>
+#include <asm/apic.h>
+#ifndef CONFIG_X86_64
+#include <asm/arch_hooks.h>
+#endif /* ! CONFIG_X86_64 */
+#include <asm/i8259.h>
+
+/*
+ * This is the 'legacy' 8259A Programmable Interrupt Controller,
+ * present in the majority of PC/AT boxes.
+ * plus some generic x86 specific things if generic specifics makes
+ * any sense at all.
+ */
+
+static int i8259A_auto_eoi;
+DEFINE_SPINLOCK(i8259A_lock);
+static void mask_and_ack_8259A(unsigned int);
+
+struct irq_chip i8259A_chip = {
+	.name		= "XT-PIC",
+	.mask		= disable_8259A_irq,
+	.disable	= disable_8259A_irq,
+	.unmask		= enable_8259A_irq,
+	.mask_ack	= mask_and_ack_8259A,
+};
+
+/*
+ * 8259A PIC functions to handle ISA devices:
+ */
+
+/*
+ * This contains the irq mask for both 8259A irq controllers,
+ */
+unsigned int cached_irq_mask = 0xffff;
+
+/*
+ * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
+ * boards the timer interrupt is not really connected to any IO-APIC pin,
+ * it's fed to the master 8259A's IR0 line only.
+ *
+ * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
+ * this 'mixed mode' IRQ handling costs nothing because it's only used
+ * at IRQ setup time.
+ */
+unsigned long io_apic_irqs;
+
+void disable_8259A_irq(unsigned int irq)
+{
+	unsigned int mask = 1 << irq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+	cached_irq_mask |= mask;
+	if (irq & 8)
+		outb(cached_slave_mask, PIC_SLAVE_IMR);
+	else
+		outb(cached_master_mask, PIC_MASTER_IMR);
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+void enable_8259A_irq(unsigned int irq)
+{
+	unsigned int mask = ~(1 << irq);
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+	cached_irq_mask &= mask;
+	if (irq & 8)
+		outb(cached_slave_mask, PIC_SLAVE_IMR);
+	else
+		outb(cached_master_mask, PIC_MASTER_IMR);
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+int i8259A_irq_pending(unsigned int irq)
+{
+	unsigned int mask = 1<<irq;
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+	if (irq < 8)
+		ret = inb(PIC_MASTER_CMD) & mask;
+	else
+		ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+
+	return ret;
+}
+
+void make_8259A_irq(unsigned int irq)
+{
+	disable_irq_nosync(irq);
+	io_apic_irqs &= ~(1<<irq);
+	set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
+				      "XT");
+	enable_irq(irq);
+}
+
+/*
+ * This function assumes to be called rarely. Switching between
+ * 8259A registers is slow.
+ * This has to be protected by the irq controller spinlock
+ * before being called.
+ */
+static inline int i8259A_irq_real(unsigned int irq)
+{
+	int value;
+	int irqmask = 1<<irq;
+
+	if (irq < 8) {
+		outb(0x0B,PIC_MASTER_CMD);	/* ISR register */
+		value = inb(PIC_MASTER_CMD) & irqmask;
+		outb(0x0A,PIC_MASTER_CMD);	/* back to the IRR register */
+		return value;
+	}
+	outb(0x0B,PIC_SLAVE_CMD);	/* ISR register */
+	value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
+	outb(0x0A,PIC_SLAVE_CMD);	/* back to the IRR register */
+	return value;
+}
+
+/*
+ * Careful! The 8259A is a fragile beast, it pretty
+ * much _has_ to be done exactly like this (mask it
+ * first, _then_ send the EOI, and the order of EOI
+ * to the two 8259s is important!
+ */
+static void mask_and_ack_8259A(unsigned int irq)
+{
+	unsigned int irqmask = 1 << irq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+	/*
+	 * Lightweight spurious IRQ detection. We do not want
+	 * to overdo spurious IRQ handling - it's usually a sign
+	 * of hardware problems, so we only do the checks we can
+	 * do without slowing down good hardware unnecessarily.
+	 *
+	 * Note that IRQ7 and IRQ15 (the two spurious IRQs
+	 * usually resulting from the 8259A-1|2 PICs) occur
+	 * even if the IRQ is masked in the 8259A. Thus we
+	 * can check spurious 8259A IRQs without doing the
+	 * quite slow i8259A_irq_real() call for every IRQ.
+	 * This does not cover 100% of spurious interrupts,
+	 * but should be enough to warn the user that there
+	 * is something bad going on ...
+	 */
+	if (cached_irq_mask & irqmask)
+		goto spurious_8259A_irq;
+	cached_irq_mask |= irqmask;
+
+handle_real_irq:
+	if (irq & 8) {
+		inb(PIC_SLAVE_IMR);	/* DUMMY - (do we need this?) */
+		outb(cached_slave_mask, PIC_SLAVE_IMR);
+#ifndef CONFIG_X86_64
+		outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */
+		outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */
+#else /* CONFIG_X86_64 */
+		/* 'Specific EOI' to slave */
+		outb(0x60+(irq&7),PIC_SLAVE_CMD);
+		 /* 'Specific EOI' to master-IRQ2 */
+		outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD);
+#endif /* CONFIG_X86_64 */
+	} else {
+		inb(PIC_MASTER_IMR);	/* DUMMY - (do we need this?) */
+		outb(cached_master_mask, PIC_MASTER_IMR);
+#ifndef CONFIG_X86_64
+		outb(0x60+irq,PIC_MASTER_CMD);	/* 'Specific EOI to master */
+#else /* CONFIG_X86_64 */
+		/* 'Specific EOI' to master */
+		outb(0x60+irq,PIC_MASTER_CMD);
+#endif /* CONFIG_X86_64 */
+	}
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+	return;
+
+spurious_8259A_irq:
+	/*
+	 * this is the slow path - should happen rarely.
+	 */
+	if (i8259A_irq_real(irq))
+		/*
+		 * oops, the IRQ _is_ in service according to the
+		 * 8259A - not spurious, go handle it.
+		 */
+		goto handle_real_irq;
+
+	{
+		static int spurious_irq_mask;
+		/*
+		 * At this point we can be sure the IRQ is spurious,
+		 * lets ACK and report it. [once per IRQ]
+		 */
+		if (!(spurious_irq_mask & irqmask)) {
+#ifndef CONFIG_X86_64
+			printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
+#else /* CONFIG_X86_64 */
+			printk(KERN_DEBUG
+			       "spurious 8259A interrupt: IRQ%d.\n", irq);
+#endif /* CONFIG_X86_64 */
+			spurious_irq_mask |= irqmask;
+		}
+		atomic_inc(&irq_err_count);
+		/*
+		 * Theoretically we do not have to handle this IRQ,
+		 * but in Linux this does not cause problems and is
+		 * simpler for us.
+		 */
+		goto handle_real_irq;
+	}
+}
+
+static char irq_trigger[2];
+/**
+ * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
+ */
+static void restore_ELCR(char *trigger)
+{
+	outb(trigger[0], 0x4d0);
+	outb(trigger[1], 0x4d1);
+}
+
+static void save_ELCR(char *trigger)
+{
+	/* IRQ 0,1,2,8,13 are marked as reserved */
+	trigger[0] = inb(0x4d0) & 0xF8;
+	trigger[1] = inb(0x4d1) & 0xDE;
+}
+
+static int i8259A_resume(struct sys_device *dev)
+{
+	init_8259A(i8259A_auto_eoi);
+	restore_ELCR(irq_trigger);
+	return 0;
+}
+
+static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
+{
+	save_ELCR(irq_trigger);
+	return 0;
+}
+
+static int i8259A_shutdown(struct sys_device *dev)
+{
+	/* Put the i8259A into a quiescent state that
+	 * the kernel initialization code can get it
+	 * out of.
+	 */
+	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */
+	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-1 */
+	return 0;
+}
+
+static struct sysdev_class i8259_sysdev_class = {
+	.name = "i8259",
+	.suspend = i8259A_suspend,
+	.resume = i8259A_resume,
+	.shutdown = i8259A_shutdown,
+};
+
+static struct sys_device device_i8259A = {
+	.id	= 0,
+	.cls	= &i8259_sysdev_class,
+};
+
+static int __init i8259A_init_sysfs(void)
+{
+	int error = sysdev_class_register(&i8259_sysdev_class);
+	if (!error)
+		error = sysdev_register(&device_i8259A);
+	return error;
+}
+
+device_initcall(i8259A_init_sysfs);
+
+void init_8259A(int auto_eoi)
+{
+	unsigned long flags;
+
+	i8259A_auto_eoi = auto_eoi;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+
+	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */
+	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-2 */
+
+	/*
+	 * outb_pic - this has to work on a wide range of PC hardware.
+	 */
+	outb_pic(0x11, PIC_MASTER_CMD);	/* ICW1: select 8259A-1 init */
+#ifndef CONFIG_X86_64
+	outb_pic(0x20 + 0, PIC_MASTER_IMR);	/* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
+	outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);	/* 8259A-1 (the master) has a slave on IR2 */
+#else /* CONFIG_X86_64 */
+	/* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
+	outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
+	/* 8259A-1 (the master) has a slave on IR2 */
+	outb_pic(0x04, PIC_MASTER_IMR);
+#endif /* CONFIG_X86_64 */
+	if (auto_eoi)	/* master does Auto EOI */
+		outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
+	else		/* master expects normal EOI */
+		outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
+
+	outb_pic(0x11, PIC_SLAVE_CMD);	/* ICW1: select 8259A-2 init */
+#ifndef CONFIG_X86_64
+	outb_pic(0x20 + 8, PIC_SLAVE_IMR);	/* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
+	outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);	/* 8259A-2 is a slave on master's IR2 */
+	outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */
+#else /* CONFIG_X86_64 */
+	/* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
+	outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
+	/* 8259A-2 is a slave on master's IR2 */
+	outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
+	/* (slave's support for AEOI in flat mode is to be investigated) */
+	outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR);
+
+#endif /* CONFIG_X86_64 */
+	if (auto_eoi)
+		/*
+		 * In AEOI mode we just have to mask the interrupt
+		 * when acking.
+		 */
+		i8259A_chip.mask_ack = disable_8259A_irq;
+	else
+		i8259A_chip.mask_ack = mask_and_ack_8259A;
+
+	udelay(100);		/* wait for 8259A to initialize */
+
+	outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
+	outb(cached_slave_mask, PIC_SLAVE_IMR);	  /* restore slave IRQ mask */
+
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+}
diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c
index fe631967d625..d66914287ee1 100644
--- a/arch/x86/kernel/i8259_32.c
+++ b/arch/x86/kernel/i8259_32.c
@@ -21,302 +21,7 @@
 #include <asm/arch_hooks.h>
 #include <asm/i8259.h>
 
-/*
- * This is the 'legacy' 8259A Programmable Interrupt Controller,
- * present in the majority of PC/AT boxes.
- * plus some generic x86 specific things if generic specifics makes
- * any sense at all.
- */
-
-static int i8259A_auto_eoi;
-DEFINE_SPINLOCK(i8259A_lock);
-static void mask_and_ack_8259A(unsigned int);
-
-static struct irq_chip i8259A_chip = {
-	.name		= "XT-PIC",
-	.mask		= disable_8259A_irq,
-	.disable	= disable_8259A_irq,
-	.unmask		= enable_8259A_irq,
-	.mask_ack	= mask_and_ack_8259A,
-};
-
-/*
- * 8259A PIC functions to handle ISA devices:
- */
-
-/*
- * This contains the irq mask for both 8259A irq controllers,
- */
-unsigned int cached_irq_mask = 0xffff;
-
-/*
- * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
- * boards the timer interrupt is not really connected to any IO-APIC pin,
- * it's fed to the master 8259A's IR0 line only.
- *
- * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
- * this 'mixed mode' IRQ handling costs nothing because it's only used
- * at IRQ setup time.
- */
-unsigned long io_apic_irqs;
-
-void disable_8259A_irq(unsigned int irq)
-{
-	unsigned int mask = 1 << irq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-	cached_irq_mask |= mask;
-	if (irq & 8)
-		outb(cached_slave_mask, PIC_SLAVE_IMR);
-	else
-		outb(cached_master_mask, PIC_MASTER_IMR);
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
-void enable_8259A_irq(unsigned int irq)
-{
-	unsigned int mask = ~(1 << irq);
-	unsigned long flags;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-	cached_irq_mask &= mask;
-	if (irq & 8)
-		outb(cached_slave_mask, PIC_SLAVE_IMR);
-	else
-		outb(cached_master_mask, PIC_MASTER_IMR);
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
-int i8259A_irq_pending(unsigned int irq)
-{
-	unsigned int mask = 1<<irq;
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-	if (irq < 8)
-		ret = inb(PIC_MASTER_CMD) & mask;
-	else
-		ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-
-	return ret;
-}
-
-void make_8259A_irq(unsigned int irq)
-{
-	disable_irq_nosync(irq);
-	io_apic_irqs &= ~(1<<irq);
-	set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
-				      "XT");
-	enable_irq(irq);
-}
-
-/*
- * This function assumes to be called rarely. Switching between
- * 8259A registers is slow.
- * This has to be protected by the irq controller spinlock
- * before being called.
- */
-static inline int i8259A_irq_real(unsigned int irq)
-{
-	int value;
-	int irqmask = 1<<irq;
-
-	if (irq < 8) {
-		outb(0x0B,PIC_MASTER_CMD);	/* ISR register */
-		value = inb(PIC_MASTER_CMD) & irqmask;
-		outb(0x0A,PIC_MASTER_CMD);	/* back to the IRR register */
-		return value;
-	}
-	outb(0x0B,PIC_SLAVE_CMD);	/* ISR register */
-	value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
-	outb(0x0A,PIC_SLAVE_CMD);	/* back to the IRR register */
-	return value;
-}
-
-/*
- * Careful! The 8259A is a fragile beast, it pretty
- * much _has_ to be done exactly like this (mask it
- * first, _then_ send the EOI, and the order of EOI
- * to the two 8259s is important!
- */
-static void mask_and_ack_8259A(unsigned int irq)
-{
-	unsigned int irqmask = 1 << irq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-	/*
-	 * Lightweight spurious IRQ detection. We do not want
-	 * to overdo spurious IRQ handling - it's usually a sign
-	 * of hardware problems, so we only do the checks we can
-	 * do without slowing down good hardware unnecessarily.
-	 *
-	 * Note that IRQ7 and IRQ15 (the two spurious IRQs
-	 * usually resulting from the 8259A-1|2 PICs) occur
-	 * even if the IRQ is masked in the 8259A. Thus we
-	 * can check spurious 8259A IRQs without doing the
-	 * quite slow i8259A_irq_real() call for every IRQ.
-	 * This does not cover 100% of spurious interrupts,
-	 * but should be enough to warn the user that there
-	 * is something bad going on ...
-	 */
-	if (cached_irq_mask & irqmask)
-		goto spurious_8259A_irq;
-	cached_irq_mask |= irqmask;
 
-handle_real_irq:
-	if (irq & 8) {
-		inb(PIC_SLAVE_IMR);	/* DUMMY - (do we need this?) */
-		outb(cached_slave_mask, PIC_SLAVE_IMR);
-		outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */
-		outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */
-	} else {
-		inb(PIC_MASTER_IMR);	/* DUMMY - (do we need this?) */
-		outb(cached_master_mask, PIC_MASTER_IMR);
-		outb(0x60+irq,PIC_MASTER_CMD);	/* 'Specific EOI to master */
-	}
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-	return;
-
-spurious_8259A_irq:
-	/*
-	 * this is the slow path - should happen rarely.
-	 */
-	if (i8259A_irq_real(irq))
-		/*
-		 * oops, the IRQ _is_ in service according to the
-		 * 8259A - not spurious, go handle it.
-		 */
-		goto handle_real_irq;
-
-	{
-		static int spurious_irq_mask;
-		/*
-		 * At this point we can be sure the IRQ is spurious,
-		 * lets ACK and report it. [once per IRQ]
-		 */
-		if (!(spurious_irq_mask & irqmask)) {
-			printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
-			spurious_irq_mask |= irqmask;
-		}
-		atomic_inc(&irq_err_count);
-		/*
-		 * Theoretically we do not have to handle this IRQ,
-		 * but in Linux this does not cause problems and is
-		 * simpler for us.
-		 */
-		goto handle_real_irq;
-	}
-}
-
-static char irq_trigger[2];
-/**
- * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
- */
-static void restore_ELCR(char *trigger)
-{
-	outb(trigger[0], 0x4d0);
-	outb(trigger[1], 0x4d1);
-}
-
-static void save_ELCR(char *trigger)
-{
-	/* IRQ 0,1,2,8,13 are marked as reserved */
-	trigger[0] = inb(0x4d0) & 0xF8;
-	trigger[1] = inb(0x4d1) & 0xDE;
-}
-
-static int i8259A_resume(struct sys_device *dev)
-{
-	init_8259A(i8259A_auto_eoi);
-	restore_ELCR(irq_trigger);
-	return 0;
-}
-
-static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
-{
-	save_ELCR(irq_trigger);
-	return 0;
-}
-
-static int i8259A_shutdown(struct sys_device *dev)
-{
-	/* Put the i8259A into a quiescent state that
-	 * the kernel initialization code can get it
-	 * out of.
-	 */
-	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */
-	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-1 */
-	return 0;
-}
-
-static struct sysdev_class i8259_sysdev_class = {
-	.name = "i8259",
-	.suspend = i8259A_suspend,
-	.resume = i8259A_resume,
-	.shutdown = i8259A_shutdown,
-};
-
-static struct sys_device device_i8259A = {
-	.id	= 0,
-	.cls	= &i8259_sysdev_class,
-};
-
-static int __init i8259A_init_sysfs(void)
-{
-	int error = sysdev_class_register(&i8259_sysdev_class);
-	if (!error)
-		error = sysdev_register(&device_i8259A);
-	return error;
-}
-
-device_initcall(i8259A_init_sysfs);
-
-void init_8259A(int auto_eoi)
-{
-	unsigned long flags;
-
-	i8259A_auto_eoi = auto_eoi;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-
-	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */
-	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-2 */
-
-	/*
-	 * outb_pic - this has to work on a wide range of PC hardware.
-	 */
-	outb_pic(0x11, PIC_MASTER_CMD);	/* ICW1: select 8259A-1 init */
-	outb_pic(0x20 + 0, PIC_MASTER_IMR);	/* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
-	outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);	/* 8259A-1 (the master) has a slave on IR2 */
-	if (auto_eoi)	/* master does Auto EOI */
-		outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
-	else		/* master expects normal EOI */
-		outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
-
-	outb_pic(0x11, PIC_SLAVE_CMD);	/* ICW1: select 8259A-2 init */
-	outb_pic(0x20 + 8, PIC_SLAVE_IMR);	/* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
-	outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);	/* 8259A-2 is a slave on master's IR2 */
-	outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */
-	if (auto_eoi)
-		/*
-		 * In AEOI mode we just have to mask the interrupt
-		 * when acking.
-		 */
-		i8259A_chip.mask_ack = disable_8259A_irq;
-	else
-		i8259A_chip.mask_ack = mask_and_ack_8259A;
-
-	udelay(100);		/* wait for 8259A to initialize */
-
-	outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
-	outb(cached_slave_mask, PIC_SLAVE_IMR);	  /* restore slave IRQ mask */
-
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-}
 
 /*
  * Note that on a 486, we don't want to do a SIGFPE on an irq13
diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c
index fa57a1568508..a95d2bd27e9d 100644
--- a/arch/x86/kernel/i8259_64.c
+++ b/arch/x86/kernel/i8259_64.c
@@ -87,315 +87,6 @@ static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) =
 #undef IRQ
 #undef IRQLIST_16
 
-/*
- * This is the 'legacy' 8259A Programmable Interrupt Controller,
- * present in the majority of PC/AT boxes.
- * plus some generic x86 specific things if generic specifics makes
- * any sense at all.
- * this file should become arch/i386/kernel/irq.c when the old irq.c
- * moves to arch independent land
- */
-
-static int i8259A_auto_eoi;
-DEFINE_SPINLOCK(i8259A_lock);
-static void mask_and_ack_8259A(unsigned int);
-
-static struct irq_chip i8259A_chip = {
-	.name		= "XT-PIC",
-	.mask		= disable_8259A_irq,
-	.disable	= disable_8259A_irq,
-	.unmask		= enable_8259A_irq,
-	.mask_ack	= mask_and_ack_8259A,
-};
-
-/*
- * 8259A PIC functions to handle ISA devices:
- */
-
-/*
- * This contains the irq mask for both 8259A irq controllers,
- */
-unsigned int cached_irq_mask = 0xffff;
-
-/*
- * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
- * boards the timer interrupt is not really connected to any IO-APIC pin,
- * it's fed to the master 8259A's IR0 line only.
- *
- * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
- * this 'mixed mode' IRQ handling costs nothing because it's only used
- * at IRQ setup time.
- */
-unsigned long io_apic_irqs;
-
-void disable_8259A_irq(unsigned int irq)
-{
-	unsigned int mask = 1 << irq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-	cached_irq_mask |= mask;
-	if (irq & 8)
-		outb(cached_slave_mask, PIC_SLAVE_IMR);
-	else
-		outb(cached_master_mask, PIC_MASTER_IMR);
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
-void enable_8259A_irq(unsigned int irq)
-{
-	unsigned int mask = ~(1 << irq);
-	unsigned long flags;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-	cached_irq_mask &= mask;
-	if (irq & 8)
-		outb(cached_slave_mask, PIC_SLAVE_IMR);
-	else
-		outb(cached_master_mask, PIC_MASTER_IMR);
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
-int i8259A_irq_pending(unsigned int irq)
-{
-	unsigned int mask = 1<<irq;
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-	if (irq < 8)
-		ret = inb(PIC_MASTER_CMD) & mask;
-	else
-		ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-
-	return ret;
-}
-
-void make_8259A_irq(unsigned int irq)
-{
-	disable_irq_nosync(irq);
-	io_apic_irqs &= ~(1<<irq);
-	set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
-				      "XT");
-	enable_irq(irq);
-}
-
-/*
- * This function assumes to be called rarely. Switching between
- * 8259A registers is slow.
- * This has to be protected by the irq controller spinlock
- * before being called.
- */
-static inline int i8259A_irq_real(unsigned int irq)
-{
-	int value;
-	int irqmask = 1<<irq;
-
-	if (irq < 8) {
-		outb(0x0B,PIC_MASTER_CMD);	/* ISR register */
-		value = inb(PIC_MASTER_CMD) & irqmask;
-		outb(0x0A,PIC_MASTER_CMD);	/* back to the IRR register */
-		return value;
-	}
-	outb(0x0B,PIC_SLAVE_CMD);	/* ISR register */
-	value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
-	outb(0x0A,PIC_SLAVE_CMD);	/* back to the IRR register */
-	return value;
-}
-
-/*
- * Careful! The 8259A is a fragile beast, it pretty
- * much _has_ to be done exactly like this (mask it
- * first, _then_ send the EOI, and the order of EOI
- * to the two 8259s is important!
- */
-static void mask_and_ack_8259A(unsigned int irq)
-{
-	unsigned int irqmask = 1 << irq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-	/*
-	 * Lightweight spurious IRQ detection. We do not want
-	 * to overdo spurious IRQ handling - it's usually a sign
-	 * of hardware problems, so we only do the checks we can
-	 * do without slowing down good hardware unnecessarily.
-	 *
-	 * Note that IRQ7 and IRQ15 (the two spurious IRQs
-	 * usually resulting from the 8259A-1|2 PICs) occur
-	 * even if the IRQ is masked in the 8259A. Thus we
-	 * can check spurious 8259A IRQs without doing the
-	 * quite slow i8259A_irq_real() call for every IRQ.
-	 * This does not cover 100% of spurious interrupts,
-	 * but should be enough to warn the user that there
-	 * is something bad going on ...
-	 */
-	if (cached_irq_mask & irqmask)
-		goto spurious_8259A_irq;
-	cached_irq_mask |= irqmask;
-
-handle_real_irq:
-	if (irq & 8) {
-		inb(PIC_SLAVE_IMR);	/* DUMMY - (do we need this?) */
-		outb(cached_slave_mask, PIC_SLAVE_IMR);
-		/* 'Specific EOI' to slave */
-		outb(0x60+(irq&7),PIC_SLAVE_CMD);
-		 /* 'Specific EOI' to master-IRQ2 */
-		outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD);
-	} else {
-		inb(PIC_MASTER_IMR);	/* DUMMY - (do we need this?) */
-		outb(cached_master_mask, PIC_MASTER_IMR);
-		/* 'Specific EOI' to master */
-		outb(0x60+irq,PIC_MASTER_CMD);
-	}
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-	return;
-
-spurious_8259A_irq:
-	/*
-	 * this is the slow path - should happen rarely.
-	 */
-	if (i8259A_irq_real(irq))
-		/*
-		 * oops, the IRQ _is_ in service according to the
-		 * 8259A - not spurious, go handle it.
-		 */
-		goto handle_real_irq;
-
-	{
-		static int spurious_irq_mask;
-		/*
-		 * At this point we can be sure the IRQ is spurious,
-		 * lets ACK and report it. [once per IRQ]
-		 */
-		if (!(spurious_irq_mask & irqmask)) {
-			printk(KERN_DEBUG
-			       "spurious 8259A interrupt: IRQ%d.\n", irq);
-			spurious_irq_mask |= irqmask;
-		}
-		atomic_inc(&irq_err_count);
-		/*
-		 * Theoretically we do not have to handle this IRQ,
-		 * but in Linux this does not cause problems and is
-		 * simpler for us.
-		 */
-		goto handle_real_irq;
-	}
-}
-
-static char irq_trigger[2];
-/**
- * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
- */
-static void restore_ELCR(char *trigger)
-{
-	outb(trigger[0], 0x4d0);
-	outb(trigger[1], 0x4d1);
-}
-
-static void save_ELCR(char *trigger)
-{
-	/* IRQ 0,1,2,8,13 are marked as reserved */
-	trigger[0] = inb(0x4d0) & 0xF8;
-	trigger[1] = inb(0x4d1) & 0xDE;
-}
-
-static int i8259A_resume(struct sys_device *dev)
-{
-	init_8259A(i8259A_auto_eoi);
-	restore_ELCR(irq_trigger);
-	return 0;
-}
-
-static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
-{
-	save_ELCR(irq_trigger);
-	return 0;
-}
-
-static int i8259A_shutdown(struct sys_device *dev)
-{
-	/* Put the i8259A into a quiescent state that
-	 * the kernel initialization code can get it
-	 * out of.
-	 */
-	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */
-	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-1 */
-	return 0;
-}
-
-static struct sysdev_class i8259_sysdev_class = {
-	.name = "i8259",
-	.suspend = i8259A_suspend,
-	.resume = i8259A_resume,
-	.shutdown = i8259A_shutdown,
-};
-
-static struct sys_device device_i8259A = {
-	.id	= 0,
-	.cls	= &i8259_sysdev_class,
-};
-
-static int __init i8259A_init_sysfs(void)
-{
-	int error = sysdev_class_register(&i8259_sysdev_class);
-	if (!error)
-		error = sysdev_register(&device_i8259A);
-	return error;
-}
-
-device_initcall(i8259A_init_sysfs);
-
-void init_8259A(int auto_eoi)
-{
-	unsigned long flags;
-
-	i8259A_auto_eoi = auto_eoi;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-
-	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */
-	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-2 */
-
-	/*
-	 * outb_pic - this has to work on a wide range of PC hardware.
-	 */
-	outb_pic(0x11, PIC_MASTER_CMD);	/* ICW1: select 8259A-1 init */
-	/* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
-	outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
-	/* 8259A-1 (the master) has a slave on IR2 */
-	outb_pic(0x04, PIC_MASTER_IMR);
-	if (auto_eoi)	/* master does Auto EOI */
-		outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
-	else		/* master expects normal EOI */
-		outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
-
-	outb_pic(0x11, PIC_SLAVE_CMD);	/* ICW1: select 8259A-2 init */
-	/* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
-	outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
-	/* 8259A-2 is a slave on master's IR2 */
-	outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
-	/* (slave's support for AEOI in flat mode is to be investigated) */
-	outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR);
-
-	if (auto_eoi)
-		/*
-		 * In AEOI mode we just have to mask the interrupt
-		 * when acking.
-		 */
-		i8259A_chip.mask_ack = disable_8259A_irq;
-	else
-		i8259A_chip.mask_ack = mask_and_ack_8259A;
-
-	udelay(100);		/* wait for 8259A to initialize */
-
-	outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
-	outb(cached_slave_mask, PIC_SLAVE_IMR);	  /* restore slave IRQ mask */
-
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
 
 
diff --git a/include/asm-x86/i8259.h b/include/asm-x86/i8259.h
index 45d4df3e51e6..2f98df91f1f2 100644
--- a/include/asm-x86/i8259.h
+++ b/include/asm-x86/i8259.h
@@ -55,4 +55,6 @@ static inline void outb_pic(unsigned char value, unsigned int port)
 	udelay(2);
 }
 
+extern struct irq_chip i8259A_chip;
+
 #endif	/* __ASM_I8259_H__ */
-- 
cgit v1.2.3


From 6b4d3afbe722ee71b8da346c2c4393938440c471 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@suse.cz>
Date: Wed, 21 May 2008 11:47:24 +0200
Subject: x86: i8259.c: remove #ifdefs around includes

Remove #ifdefs around includes; including too much should be always
safe.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Cc: macro@ds2.pg.gda.pl
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/i8259.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 2decba6b0101..8b7eb0cb2e7e 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -1,14 +1,10 @@
-#ifdef CONFIG_X86_64
 #include <linux/linkage.h>
-#endif /* CONFIG_X86_64 */
 #include <linux/errno.h>
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/ioport.h>
 #include <linux/interrupt.h>
-#ifdef CONFIG_X86_64
 #include <linux/timex.h>
-#endif /* CONFIG_X86_64 */
 #include <linux/slab.h>
 #include <linux/random.h>
 #include <linux/init.h>
@@ -16,24 +12,17 @@
 #include <linux/sysdev.h>
 #include <linux/bitops.h>
 
-#ifdef CONFIG_X86_64
 #include <asm/acpi.h>
-#endif /* CONFIG_X86_64 */
 #include <asm/atomic.h>
 #include <asm/system.h>
 #include <asm/io.h>
-#ifndef CONFIG_X86_64
 #include <asm/timer.h>
-#else /* CONFIG_X86_64 */
 #include <asm/hw_irq.h>
-#endif /* CONFIG_X86_64 */
 #include <asm/pgtable.h>
 #include <asm/delay.h>
 #include <asm/desc.h>
 #include <asm/apic.h>
-#ifndef CONFIG_X86_64
 #include <asm/arch_hooks.h>
-#endif /* ! CONFIG_X86_64 */
 #include <asm/i8259.h>
 
 /*
-- 
cgit v1.2.3


From 4f6f3bac18c80ff6cf072d90796755c69cc4c748 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@suse.cz>
Date: Wed, 21 May 2008 11:52:52 +0200
Subject: x86: i8259.c: remove trivial ifdefs

Remove #ifdefs where the only difference is formatting of comments.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Cc: macro@ds2.pg.gda.pl
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/i8259.c | 20 +++-----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 8b7eb0cb2e7e..433e49edfd4f 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -175,24 +175,14 @@ handle_real_irq:
 	if (irq & 8) {
 		inb(PIC_SLAVE_IMR);	/* DUMMY - (do we need this?) */
 		outb(cached_slave_mask, PIC_SLAVE_IMR);
-#ifndef CONFIG_X86_64
-		outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */
-		outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */
-#else /* CONFIG_X86_64 */
 		/* 'Specific EOI' to slave */
-		outb(0x60+(irq&7),PIC_SLAVE_CMD);
+		outb(0x60+(irq&7), PIC_SLAVE_CMD);
 		 /* 'Specific EOI' to master-IRQ2 */
-		outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD);
-#endif /* CONFIG_X86_64 */
+		outb(0x60+PIC_CASCADE_IR, PIC_MASTER_CMD);
 	} else {
 		inb(PIC_MASTER_IMR);	/* DUMMY - (do we need this?) */
 		outb(cached_master_mask, PIC_MASTER_IMR);
-#ifndef CONFIG_X86_64
-		outb(0x60+irq,PIC_MASTER_CMD);	/* 'Specific EOI to master */
-#else /* CONFIG_X86_64 */
-		/* 'Specific EOI' to master */
-		outb(0x60+irq,PIC_MASTER_CMD);
-#endif /* CONFIG_X86_64 */
+		outb(0x60+irq, PIC_MASTER_CMD);	/* 'Specific EOI to master */
 	}
 	spin_unlock_irqrestore(&i8259A_lock, flags);
 	return;
@@ -215,12 +205,8 @@ spurious_8259A_irq:
 		 * lets ACK and report it. [once per IRQ]
 		 */
 		if (!(spurious_irq_mask & irqmask)) {
-#ifndef CONFIG_X86_64
-			printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
-#else /* CONFIG_X86_64 */
 			printk(KERN_DEBUG
 			       "spurious 8259A interrupt: IRQ%d.\n", irq);
-#endif /* CONFIG_X86_64 */
 			spurious_irq_mask |= irqmask;
 		}
 		atomic_inc(&irq_err_count);
-- 
cgit v1.2.3


From 4d9a6e6128dd8ada0d2feed2a9bb6506043e4655 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@suse.cz>
Date: Wed, 21 May 2008 11:57:52 +0200
Subject: x86: i8259: cleanup codingstyle

Signed-off-by: Pavel Machek <pavel@suse.cz>
Cc: macro@ds2.pg.gda.pl
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/i8259.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 433e49edfd4f..7a0fda8f01b5 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -129,14 +129,14 @@ static inline int i8259A_irq_real(unsigned int irq)
 	int irqmask = 1<<irq;
 
 	if (irq < 8) {
-		outb(0x0B,PIC_MASTER_CMD);	/* ISR register */
+		outb(0x0B, PIC_MASTER_CMD);	/* ISR register */
 		value = inb(PIC_MASTER_CMD) & irqmask;
-		outb(0x0A,PIC_MASTER_CMD);	/* back to the IRR register */
+		outb(0x0A, PIC_MASTER_CMD);	/* back to the IRR register */
 		return value;
 	}
-	outb(0x0B,PIC_SLAVE_CMD);	/* ISR register */
+	outb(0x0B, PIC_SLAVE_CMD);	/* ISR register */
 	value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
-	outb(0x0A,PIC_SLAVE_CMD);	/* back to the IRR register */
+	outb(0x0A, PIC_SLAVE_CMD);	/* back to the IRR register */
 	return value;
 }
 
-- 
cgit v1.2.3


From f20b11e716936f85850654c49e06a1f955b9e5fc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 24 May 2008 15:59:58 +0200
Subject: x86: rename the i8259_32/64.c leftovers to initirq_32/64.c

The leftovers of the i8259 unification have nothing to do with i8259
at all. They contain interrupt init code and the i8259_xx name is just
misleading now.

Rename them to initirq_32/64.c

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/Makefile     |   2 +-
 arch/x86/kernel/i8259_32.c   | 114 ------------------------
 arch/x86/kernel/i8259_64.c   | 203 -------------------------------------------
 arch/x86/kernel/irqinit_32.c | 114 ++++++++++++++++++++++++
 arch/x86/kernel/irqinit_64.c | 203 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 318 insertions(+), 318 deletions(-)
 delete mode 100644 arch/x86/kernel/i8259_32.c
 delete mode 100644 arch/x86/kernel/i8259_64.c
 create mode 100644 arch/x86/kernel/irqinit_32.c
 create mode 100644 arch/x86/kernel/irqinit_64.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index f71a76ef2596..2a53ad2cb450 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -18,7 +18,7 @@ CFLAGS_tsc_64.o		:= $(nostackp)
 obj-y			:= process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
 obj-y			+= traps_$(BITS).o irq_$(BITS).o
 obj-y			+= time_$(BITS).o ioport.o ldt.o
-obj-y			+= setup_$(BITS).o i8259.o i8259_$(BITS).o setup.o
+obj-y			+= setup_$(BITS).o i8259.o irqinit_$(BITS).o setup.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o setup64.o
diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c
deleted file mode 100644
index d66914287ee1..000000000000
--- a/arch/x86/kernel/i8259_32.c
+++ /dev/null
@@ -1,114 +0,0 @@
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/init.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/bitops.h>
-
-#include <asm/atomic.h>
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/timer.h>
-#include <asm/pgtable.h>
-#include <asm/delay.h>
-#include <asm/desc.h>
-#include <asm/apic.h>
-#include <asm/arch_hooks.h>
-#include <asm/i8259.h>
-
-
-
-/*
- * Note that on a 486, we don't want to do a SIGFPE on an irq13
- * as the irq is unreliable, and exception 16 works correctly
- * (ie as explained in the intel literature). On a 386, you
- * can't use exception 16 due to bad IBM design, so we have to
- * rely on the less exact irq13.
- *
- * Careful.. Not only is IRQ13 unreliable, but it is also
- * leads to races. IBM designers who came up with it should
- * be shot.
- */
- 
-
-static irqreturn_t math_error_irq(int cpl, void *dev_id)
-{
-	extern void math_error(void __user *);
-	outb(0,0xF0);
-	if (ignore_fpu_irq || !boot_cpu_data.hard_math)
-		return IRQ_NONE;
-	math_error((void __user *)get_irq_regs()->ip);
-	return IRQ_HANDLED;
-}
-
-/*
- * New motherboards sometimes make IRQ 13 be a PCI interrupt,
- * so allow interrupt sharing.
- */
-static struct irqaction fpu_irq = {
-	.handler = math_error_irq,
-	.mask = CPU_MASK_NONE,
-	.name = "fpu",
-};
-
-void __init init_ISA_irqs (void)
-{
-	int i;
-
-#ifdef CONFIG_X86_LOCAL_APIC
-	init_bsp_APIC();
-#endif
-	init_8259A(0);
-
-	/*
-	 * 16 old-style INTA-cycle interrupts:
-	 */
-	for (i = 0; i < 16; i++) {
-		set_irq_chip_and_handler_name(i, &i8259A_chip,
-					      handle_level_irq, "XT");
-	}
-}
-
-/* Overridden in paravirt.c */
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-void __init native_init_IRQ(void)
-{
-	int i;
-
-	/* all the set up before the call gates are initialised */
-	pre_intr_init_hook();
-
-	/*
-	 * Cover the whole vector space, no vector can escape
-	 * us. (some of these will be overridden and become
-	 * 'special' SMP interrupts)
-	 */
-	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
-		int vector = FIRST_EXTERNAL_VECTOR + i;
-		if (i >= NR_IRQS)
-			break;
-		/* SYSCALL_VECTOR was reserved in trap_init. */
-		if (!test_bit(vector, used_vectors))
-			set_intr_gate(vector, interrupt[i]);
-	}
-
-	/* setup after call gates are initialised (usually add in
-	 * the architecture specific gates)
-	 */
-	intr_init_hook();
-
-	/*
-	 * External FPU? Set up irq13 if so, for
-	 * original braindamaged IBM FERR coupling.
-	 */
-	if (boot_cpu_data.hard_math && !cpu_has_fpu)
-		setup_irq(FPU_IRQ, &fpu_irq);
-
-	irq_ctx_init(smp_processor_id());
-}
diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c
deleted file mode 100644
index a95d2bd27e9d..000000000000
--- a/arch/x86/kernel/i8259_64.c
+++ /dev/null
@@ -1,203 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
-#include <linux/timex.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/init.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/bitops.h>
-
-#include <asm/acpi.h>
-#include <asm/atomic.h>
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/hw_irq.h>
-#include <asm/pgtable.h>
-#include <asm/delay.h>
-#include <asm/desc.h>
-#include <asm/apic.h>
-#include <asm/i8259.h>
-
-/*
- * Common place to define all x86 IRQ vectors
- *
- * This builds up the IRQ handler stubs using some ugly macros in irq.h
- *
- * These macros create the low-level assembly IRQ routines that save
- * register context and call do_IRQ(). do_IRQ() then does all the
- * operations that are needed to keep the AT (or SMP IOAPIC)
- * interrupt-controller happy.
- */
-
-#define BI(x,y) \
-	BUILD_IRQ(x##y)
-
-#define BUILD_16_IRQS(x) \
-	BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
-	BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
-	BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
-	BI(x,c) BI(x,d) BI(x,e) BI(x,f)
-
-/*
- * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
- * (these are usually mapped to vectors 0x30-0x3f)
- */
-
-/*
- * The IO-APIC gives us many more interrupt sources. Most of these
- * are unused but an SMP system is supposed to have enough memory ...
- * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
- * across the spectrum, so we really want to be prepared to get all
- * of these. Plus, more powerful systems might have more than 64
- * IO-APIC registers.
- *
- * (these are usually mapped into the 0x30-0xff vector range)
- */
-				      BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
-BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
-BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
-BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
-
-#undef BUILD_16_IRQS
-#undef BI
-
-
-#define IRQ(x,y) \
-	IRQ##x##y##_interrupt
-
-#define IRQLIST_16(x) \
-	IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
-	IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
-	IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
-	IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
-
-/* for the irq vectors */
-static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
-					  IRQLIST_16(0x2), IRQLIST_16(0x3),
-	IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
-	IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
-	IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
-};
-
-#undef IRQ
-#undef IRQLIST_16
-
-
-
-
-/*
- * IRQ2 is cascade interrupt to second interrupt controller
- */
-
-static struct irqaction irq2 = {
-	.handler = no_action,
-	.mask = CPU_MASK_NONE,
-	.name = "cascade",
-};
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
-	[0 ... IRQ0_VECTOR - 1] = -1,
-	[IRQ0_VECTOR] = 0,
-	[IRQ1_VECTOR] = 1,
-	[IRQ2_VECTOR] = 2,
-	[IRQ3_VECTOR] = 3,
-	[IRQ4_VECTOR] = 4,
-	[IRQ5_VECTOR] = 5,
-	[IRQ6_VECTOR] = 6,
-	[IRQ7_VECTOR] = 7,
-	[IRQ8_VECTOR] = 8,
-	[IRQ9_VECTOR] = 9,
-	[IRQ10_VECTOR] = 10,
-	[IRQ11_VECTOR] = 11,
-	[IRQ12_VECTOR] = 12,
-	[IRQ13_VECTOR] = 13,
-	[IRQ14_VECTOR] = 14,
-	[IRQ15_VECTOR] = 15,
-	[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
-};
-
-void __init init_ISA_irqs (void)
-{
-	int i;
-
-	init_bsp_APIC();
-	init_8259A(0);
-
-	for (i = 0; i < NR_IRQS; i++) {
-		irq_desc[i].status = IRQ_DISABLED;
-		irq_desc[i].action = NULL;
-		irq_desc[i].depth = 1;
-
-		if (i < 16) {
-			/*
-			 * 16 old-style INTA-cycle interrupts:
-			 */
-			set_irq_chip_and_handler_name(i, &i8259A_chip,
-						      handle_level_irq, "XT");
-		} else {
-			/*
-			 * 'high' PCI IRQs filled in on demand
-			 */
-			irq_desc[i].chip = &no_irq_chip;
-		}
-	}
-}
-
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-void __init native_init_IRQ(void)
-{
-	int i;
-
-	init_ISA_irqs();
-	/*
-	 * Cover the whole vector space, no vector can escape
-	 * us. (some of these will be overridden and become
-	 * 'special' SMP interrupts)
-	 */
-	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
-		int vector = FIRST_EXTERNAL_VECTOR + i;
-		if (vector != IA32_SYSCALL_VECTOR)
-			set_intr_gate(vector, interrupt[i]);
-	}
-
-#ifdef CONFIG_SMP
-	/*
-	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
-	 * IPI, driven by wakeup.
-	 */
-	set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
-	/* IPIs for invalidation */
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
-
-	/* IPI for generic function call */
-	set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
-	/* Low priority IPI to cleanup after moving an irq */
-	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
-#endif
-	set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-	set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
-
-	/* self generated IPI for local APIC timer */
-	set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
-	/* IPI vectors for APIC spurious and error interrupts */
-	set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
-	set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-
-	if (!acpi_ioapic)
-		setup_irq(2, &irq2);
-}
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
new file mode 100644
index 000000000000..d66914287ee1
--- /dev/null
+++ b/arch/x86/kernel/irqinit_32.c
@@ -0,0 +1,114 @@
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+#include <linux/bitops.h>
+
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/timer.h>
+#include <asm/pgtable.h>
+#include <asm/delay.h>
+#include <asm/desc.h>
+#include <asm/apic.h>
+#include <asm/arch_hooks.h>
+#include <asm/i8259.h>
+
+
+
+/*
+ * Note that on a 486, we don't want to do a SIGFPE on an irq13
+ * as the irq is unreliable, and exception 16 works correctly
+ * (ie as explained in the intel literature). On a 386, you
+ * can't use exception 16 due to bad IBM design, so we have to
+ * rely on the less exact irq13.
+ *
+ * Careful.. Not only is IRQ13 unreliable, but it is also
+ * leads to races. IBM designers who came up with it should
+ * be shot.
+ */
+ 
+
+static irqreturn_t math_error_irq(int cpl, void *dev_id)
+{
+	extern void math_error(void __user *);
+	outb(0,0xF0);
+	if (ignore_fpu_irq || !boot_cpu_data.hard_math)
+		return IRQ_NONE;
+	math_error((void __user *)get_irq_regs()->ip);
+	return IRQ_HANDLED;
+}
+
+/*
+ * New motherboards sometimes make IRQ 13 be a PCI interrupt,
+ * so allow interrupt sharing.
+ */
+static struct irqaction fpu_irq = {
+	.handler = math_error_irq,
+	.mask = CPU_MASK_NONE,
+	.name = "fpu",
+};
+
+void __init init_ISA_irqs (void)
+{
+	int i;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	init_bsp_APIC();
+#endif
+	init_8259A(0);
+
+	/*
+	 * 16 old-style INTA-cycle interrupts:
+	 */
+	for (i = 0; i < 16; i++) {
+		set_irq_chip_and_handler_name(i, &i8259A_chip,
+					      handle_level_irq, "XT");
+	}
+}
+
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+void __init native_init_IRQ(void)
+{
+	int i;
+
+	/* all the set up before the call gates are initialised */
+	pre_intr_init_hook();
+
+	/*
+	 * Cover the whole vector space, no vector can escape
+	 * us. (some of these will be overridden and become
+	 * 'special' SMP interrupts)
+	 */
+	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+		int vector = FIRST_EXTERNAL_VECTOR + i;
+		if (i >= NR_IRQS)
+			break;
+		/* SYSCALL_VECTOR was reserved in trap_init. */
+		if (!test_bit(vector, used_vectors))
+			set_intr_gate(vector, interrupt[i]);
+	}
+
+	/* setup after call gates are initialised (usually add in
+	 * the architecture specific gates)
+	 */
+	intr_init_hook();
+
+	/*
+	 * External FPU? Set up irq13 if so, for
+	 * original braindamaged IBM FERR coupling.
+	 */
+	if (boot_cpu_data.hard_math && !cpu_has_fpu)
+		setup_irq(FPU_IRQ, &fpu_irq);
+
+	irq_ctx_init(smp_processor_id());
+}
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
new file mode 100644
index 000000000000..a95d2bd27e9d
--- /dev/null
+++ b/arch/x86/kernel/irqinit_64.c
@@ -0,0 +1,203 @@
+#include <linux/linkage.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/timex.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+#include <linux/bitops.h>
+
+#include <asm/acpi.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/hw_irq.h>
+#include <asm/pgtable.h>
+#include <asm/delay.h>
+#include <asm/desc.h>
+#include <asm/apic.h>
+#include <asm/i8259.h>
+
+/*
+ * Common place to define all x86 IRQ vectors
+ *
+ * This builds up the IRQ handler stubs using some ugly macros in irq.h
+ *
+ * These macros create the low-level assembly IRQ routines that save
+ * register context and call do_IRQ(). do_IRQ() then does all the
+ * operations that are needed to keep the AT (or SMP IOAPIC)
+ * interrupt-controller happy.
+ */
+
+#define BI(x,y) \
+	BUILD_IRQ(x##y)
+
+#define BUILD_16_IRQS(x) \
+	BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
+	BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
+	BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
+	BI(x,c) BI(x,d) BI(x,e) BI(x,f)
+
+/*
+ * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
+ * (these are usually mapped to vectors 0x30-0x3f)
+ */
+
+/*
+ * The IO-APIC gives us many more interrupt sources. Most of these
+ * are unused but an SMP system is supposed to have enough memory ...
+ * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
+ * across the spectrum, so we really want to be prepared to get all
+ * of these. Plus, more powerful systems might have more than 64
+ * IO-APIC registers.
+ *
+ * (these are usually mapped into the 0x30-0xff vector range)
+ */
+				      BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
+BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
+BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
+BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
+
+#undef BUILD_16_IRQS
+#undef BI
+
+
+#define IRQ(x,y) \
+	IRQ##x##y##_interrupt
+
+#define IRQLIST_16(x) \
+	IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
+	IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
+	IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
+	IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
+
+/* for the irq vectors */
+static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
+					  IRQLIST_16(0x2), IRQLIST_16(0x3),
+	IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
+	IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
+	IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
+};
+
+#undef IRQ
+#undef IRQLIST_16
+
+
+
+
+/*
+ * IRQ2 is cascade interrupt to second interrupt controller
+ */
+
+static struct irqaction irq2 = {
+	.handler = no_action,
+	.mask = CPU_MASK_NONE,
+	.name = "cascade",
+};
+DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+	[0 ... IRQ0_VECTOR - 1] = -1,
+	[IRQ0_VECTOR] = 0,
+	[IRQ1_VECTOR] = 1,
+	[IRQ2_VECTOR] = 2,
+	[IRQ3_VECTOR] = 3,
+	[IRQ4_VECTOR] = 4,
+	[IRQ5_VECTOR] = 5,
+	[IRQ6_VECTOR] = 6,
+	[IRQ7_VECTOR] = 7,
+	[IRQ8_VECTOR] = 8,
+	[IRQ9_VECTOR] = 9,
+	[IRQ10_VECTOR] = 10,
+	[IRQ11_VECTOR] = 11,
+	[IRQ12_VECTOR] = 12,
+	[IRQ13_VECTOR] = 13,
+	[IRQ14_VECTOR] = 14,
+	[IRQ15_VECTOR] = 15,
+	[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
+};
+
+void __init init_ISA_irqs (void)
+{
+	int i;
+
+	init_bsp_APIC();
+	init_8259A(0);
+
+	for (i = 0; i < NR_IRQS; i++) {
+		irq_desc[i].status = IRQ_DISABLED;
+		irq_desc[i].action = NULL;
+		irq_desc[i].depth = 1;
+
+		if (i < 16) {
+			/*
+			 * 16 old-style INTA-cycle interrupts:
+			 */
+			set_irq_chip_and_handler_name(i, &i8259A_chip,
+						      handle_level_irq, "XT");
+		} else {
+			/*
+			 * 'high' PCI IRQs filled in on demand
+			 */
+			irq_desc[i].chip = &no_irq_chip;
+		}
+	}
+}
+
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+void __init native_init_IRQ(void)
+{
+	int i;
+
+	init_ISA_irqs();
+	/*
+	 * Cover the whole vector space, no vector can escape
+	 * us. (some of these will be overridden and become
+	 * 'special' SMP interrupts)
+	 */
+	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+		int vector = FIRST_EXTERNAL_VECTOR + i;
+		if (vector != IA32_SYSCALL_VECTOR)
+			set_intr_gate(vector, interrupt[i]);
+	}
+
+#ifdef CONFIG_SMP
+	/*
+	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
+	 * IPI, driven by wakeup.
+	 */
+	set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+
+	/* IPIs for invalidation */
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
+
+	/* IPI for generic function call */
+	set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+
+	/* Low priority IPI to cleanup after moving an irq */
+	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+#endif
+	set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+	set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
+
+	/* self generated IPI for local APIC timer */
+	set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+
+	/* IPI vectors for APIC spurious and error interrupts */
+	set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+	set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+
+	if (!acpi_ioapic)
+		setup_irq(2, &irq2);
+}
-- 
cgit v1.2.3


From fce39665abb71d01d74ac74eb13dd69a799dfc2f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 12 May 2008 15:43:36 +0200
Subject: x86: make init_ISA_irqs() static

Moved to i8259 branch to avoid conflicts.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irqinit_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index a95d2bd27e9d..64bc0f14285f 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -120,7 +120,7 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
 	[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
 };
 
-void __init init_ISA_irqs (void)
+static void __init init_ISA_irqs (void)
 {
 	int i;
 
-- 
cgit v1.2.3


From 21fd5132b223a10bdf17713dd0bf321cbd6471d2 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@suse.cz>
Date: Wed, 21 May 2008 11:44:02 +0200
Subject: x86: automatical unification of i8259.c

Make conversion of i8259 very mechanical -- i8259 was generated by
 diff -D, with too different parts left in i8259_32 and
i8259_64.c. Only "by hand" changes were removal of #ifdef from middle
of the comment (prevented compilation) and removal of one static to
allow splitting into files.

Of course, it will need some cleanups now, and those will follow.

Signed-of-by: Pavel Machek <pavel@suse.cz>
---
 arch/x86/kernel/Makefile   |   2 +-
 arch/x86/kernel/i8259.c    | 368 +++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/i8259_32.c | 295 ------------------------------------
 arch/x86/kernel/i8259_64.c | 309 -------------------------------------
 include/asm-x86/i8259.h    |   2 +
 5 files changed, 371 insertions(+), 605 deletions(-)
 create mode 100644 arch/x86/kernel/i8259.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5e618c3b4720..f71a76ef2596 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -18,7 +18,7 @@ CFLAGS_tsc_64.o		:= $(nostackp)
 obj-y			:= process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
 obj-y			+= traps_$(BITS).o irq_$(BITS).o
 obj-y			+= time_$(BITS).o ioport.o ldt.o
-obj-y			+= setup_$(BITS).o i8259_$(BITS).o setup.o
+obj-y			+= setup_$(BITS).o i8259.o i8259_$(BITS).o setup.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o setup64.o
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
new file mode 100644
index 000000000000..2decba6b0101
--- /dev/null
+++ b/arch/x86/kernel/i8259.c
@@ -0,0 +1,368 @@
+#ifdef CONFIG_X86_64
+#include <linux/linkage.h>
+#endif /* CONFIG_X86_64 */
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#ifdef CONFIG_X86_64
+#include <linux/timex.h>
+#endif /* CONFIG_X86_64 */
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+#include <linux/bitops.h>
+
+#ifdef CONFIG_X86_64
+#include <asm/acpi.h>
+#endif /* CONFIG_X86_64 */
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#ifndef CONFIG_X86_64
+#include <asm/timer.h>
+#else /* CONFIG_X86_64 */
+#include <asm/hw_irq.h>
+#endif /* CONFIG_X86_64 */
+#include <asm/pgtable.h>
+#include <asm/delay.h>
+#include <asm/desc.h>
+#include <asm/apic.h>
+#ifndef CONFIG_X86_64
+#include <asm/arch_hooks.h>
+#endif /* ! CONFIG_X86_64 */
+#include <asm/i8259.h>
+
+/*
+ * This is the 'legacy' 8259A Programmable Interrupt Controller,
+ * present in the majority of PC/AT boxes.
+ * plus some generic x86 specific things if generic specifics makes
+ * any sense at all.
+ */
+
+static int i8259A_auto_eoi;
+DEFINE_SPINLOCK(i8259A_lock);
+static void mask_and_ack_8259A(unsigned int);
+
+struct irq_chip i8259A_chip = {
+	.name		= "XT-PIC",
+	.mask		= disable_8259A_irq,
+	.disable	= disable_8259A_irq,
+	.unmask		= enable_8259A_irq,
+	.mask_ack	= mask_and_ack_8259A,
+};
+
+/*
+ * 8259A PIC functions to handle ISA devices:
+ */
+
+/*
+ * This contains the irq mask for both 8259A irq controllers,
+ */
+unsigned int cached_irq_mask = 0xffff;
+
+/*
+ * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
+ * boards the timer interrupt is not really connected to any IO-APIC pin,
+ * it's fed to the master 8259A's IR0 line only.
+ *
+ * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
+ * this 'mixed mode' IRQ handling costs nothing because it's only used
+ * at IRQ setup time.
+ */
+unsigned long io_apic_irqs;
+
+void disable_8259A_irq(unsigned int irq)
+{
+	unsigned int mask = 1 << irq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+	cached_irq_mask |= mask;
+	if (irq & 8)
+		outb(cached_slave_mask, PIC_SLAVE_IMR);
+	else
+		outb(cached_master_mask, PIC_MASTER_IMR);
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+void enable_8259A_irq(unsigned int irq)
+{
+	unsigned int mask = ~(1 << irq);
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+	cached_irq_mask &= mask;
+	if (irq & 8)
+		outb(cached_slave_mask, PIC_SLAVE_IMR);
+	else
+		outb(cached_master_mask, PIC_MASTER_IMR);
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+int i8259A_irq_pending(unsigned int irq)
+{
+	unsigned int mask = 1<<irq;
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+	if (irq < 8)
+		ret = inb(PIC_MASTER_CMD) & mask;
+	else
+		ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+
+	return ret;
+}
+
+void make_8259A_irq(unsigned int irq)
+{
+	disable_irq_nosync(irq);
+	io_apic_irqs &= ~(1<<irq);
+	set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
+				      "XT");
+	enable_irq(irq);
+}
+
+/*
+ * This function assumes to be called rarely. Switching between
+ * 8259A registers is slow.
+ * This has to be protected by the irq controller spinlock
+ * before being called.
+ */
+static inline int i8259A_irq_real(unsigned int irq)
+{
+	int value;
+	int irqmask = 1<<irq;
+
+	if (irq < 8) {
+		outb(0x0B,PIC_MASTER_CMD);	/* ISR register */
+		value = inb(PIC_MASTER_CMD) & irqmask;
+		outb(0x0A,PIC_MASTER_CMD);	/* back to the IRR register */
+		return value;
+	}
+	outb(0x0B,PIC_SLAVE_CMD);	/* ISR register */
+	value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
+	outb(0x0A,PIC_SLAVE_CMD);	/* back to the IRR register */
+	return value;
+}
+
+/*
+ * Careful! The 8259A is a fragile beast, it pretty
+ * much _has_ to be done exactly like this (mask it
+ * first, _then_ send the EOI, and the order of EOI
+ * to the two 8259s is important!
+ */
+static void mask_and_ack_8259A(unsigned int irq)
+{
+	unsigned int irqmask = 1 << irq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+	/*
+	 * Lightweight spurious IRQ detection. We do not want
+	 * to overdo spurious IRQ handling - it's usually a sign
+	 * of hardware problems, so we only do the checks we can
+	 * do without slowing down good hardware unnecessarily.
+	 *
+	 * Note that IRQ7 and IRQ15 (the two spurious IRQs
+	 * usually resulting from the 8259A-1|2 PICs) occur
+	 * even if the IRQ is masked in the 8259A. Thus we
+	 * can check spurious 8259A IRQs without doing the
+	 * quite slow i8259A_irq_real() call for every IRQ.
+	 * This does not cover 100% of spurious interrupts,
+	 * but should be enough to warn the user that there
+	 * is something bad going on ...
+	 */
+	if (cached_irq_mask & irqmask)
+		goto spurious_8259A_irq;
+	cached_irq_mask |= irqmask;
+
+handle_real_irq:
+	if (irq & 8) {
+		inb(PIC_SLAVE_IMR);	/* DUMMY - (do we need this?) */
+		outb(cached_slave_mask, PIC_SLAVE_IMR);
+#ifndef CONFIG_X86_64
+		outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */
+		outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */
+#else /* CONFIG_X86_64 */
+		/* 'Specific EOI' to slave */
+		outb(0x60+(irq&7),PIC_SLAVE_CMD);
+		 /* 'Specific EOI' to master-IRQ2 */
+		outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD);
+#endif /* CONFIG_X86_64 */
+	} else {
+		inb(PIC_MASTER_IMR);	/* DUMMY - (do we need this?) */
+		outb(cached_master_mask, PIC_MASTER_IMR);
+#ifndef CONFIG_X86_64
+		outb(0x60+irq,PIC_MASTER_CMD);	/* 'Specific EOI to master */
+#else /* CONFIG_X86_64 */
+		/* 'Specific EOI' to master */
+		outb(0x60+irq,PIC_MASTER_CMD);
+#endif /* CONFIG_X86_64 */
+	}
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+	return;
+
+spurious_8259A_irq:
+	/*
+	 * this is the slow path - should happen rarely.
+	 */
+	if (i8259A_irq_real(irq))
+		/*
+		 * oops, the IRQ _is_ in service according to the
+		 * 8259A - not spurious, go handle it.
+		 */
+		goto handle_real_irq;
+
+	{
+		static int spurious_irq_mask;
+		/*
+		 * At this point we can be sure the IRQ is spurious,
+		 * lets ACK and report it. [once per IRQ]
+		 */
+		if (!(spurious_irq_mask & irqmask)) {
+#ifndef CONFIG_X86_64
+			printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
+#else /* CONFIG_X86_64 */
+			printk(KERN_DEBUG
+			       "spurious 8259A interrupt: IRQ%d.\n", irq);
+#endif /* CONFIG_X86_64 */
+			spurious_irq_mask |= irqmask;
+		}
+		atomic_inc(&irq_err_count);
+		/*
+		 * Theoretically we do not have to handle this IRQ,
+		 * but in Linux this does not cause problems and is
+		 * simpler for us.
+		 */
+		goto handle_real_irq;
+	}
+}
+
+static char irq_trigger[2];
+/**
+ * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
+ */
+static void restore_ELCR(char *trigger)
+{
+	outb(trigger[0], 0x4d0);
+	outb(trigger[1], 0x4d1);
+}
+
+static void save_ELCR(char *trigger)
+{
+	/* IRQ 0,1,2,8,13 are marked as reserved */
+	trigger[0] = inb(0x4d0) & 0xF8;
+	trigger[1] = inb(0x4d1) & 0xDE;
+}
+
+static int i8259A_resume(struct sys_device *dev)
+{
+	init_8259A(i8259A_auto_eoi);
+	restore_ELCR(irq_trigger);
+	return 0;
+}
+
+static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
+{
+	save_ELCR(irq_trigger);
+	return 0;
+}
+
+static int i8259A_shutdown(struct sys_device *dev)
+{
+	/* Put the i8259A into a quiescent state that
+	 * the kernel initialization code can get it
+	 * out of.
+	 */
+	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */
+	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-1 */
+	return 0;
+}
+
+static struct sysdev_class i8259_sysdev_class = {
+	.name = "i8259",
+	.suspend = i8259A_suspend,
+	.resume = i8259A_resume,
+	.shutdown = i8259A_shutdown,
+};
+
+static struct sys_device device_i8259A = {
+	.id	= 0,
+	.cls	= &i8259_sysdev_class,
+};
+
+static int __init i8259A_init_sysfs(void)
+{
+	int error = sysdev_class_register(&i8259_sysdev_class);
+	if (!error)
+		error = sysdev_register(&device_i8259A);
+	return error;
+}
+
+device_initcall(i8259A_init_sysfs);
+
+void init_8259A(int auto_eoi)
+{
+	unsigned long flags;
+
+	i8259A_auto_eoi = auto_eoi;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+
+	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */
+	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-2 */
+
+	/*
+	 * outb_pic - this has to work on a wide range of PC hardware.
+	 */
+	outb_pic(0x11, PIC_MASTER_CMD);	/* ICW1: select 8259A-1 init */
+#ifndef CONFIG_X86_64
+	outb_pic(0x20 + 0, PIC_MASTER_IMR);	/* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
+	outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);	/* 8259A-1 (the master) has a slave on IR2 */
+#else /* CONFIG_X86_64 */
+	/* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
+	outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
+	/* 8259A-1 (the master) has a slave on IR2 */
+	outb_pic(0x04, PIC_MASTER_IMR);
+#endif /* CONFIG_X86_64 */
+	if (auto_eoi)	/* master does Auto EOI */
+		outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
+	else		/* master expects normal EOI */
+		outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
+
+	outb_pic(0x11, PIC_SLAVE_CMD);	/* ICW1: select 8259A-2 init */
+#ifndef CONFIG_X86_64
+	outb_pic(0x20 + 8, PIC_SLAVE_IMR);	/* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
+	outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);	/* 8259A-2 is a slave on master's IR2 */
+	outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */
+#else /* CONFIG_X86_64 */
+	/* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
+	outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
+	/* 8259A-2 is a slave on master's IR2 */
+	outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
+	/* (slave's support for AEOI in flat mode is to be investigated) */
+	outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR);
+
+#endif /* CONFIG_X86_64 */
+	if (auto_eoi)
+		/*
+		 * In AEOI mode we just have to mask the interrupt
+		 * when acking.
+		 */
+		i8259A_chip.mask_ack = disable_8259A_irq;
+	else
+		i8259A_chip.mask_ack = mask_and_ack_8259A;
+
+	udelay(100);		/* wait for 8259A to initialize */
+
+	outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
+	outb(cached_slave_mask, PIC_SLAVE_IMR);	  /* restore slave IRQ mask */
+
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+}
diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c
index fe631967d625..d66914287ee1 100644
--- a/arch/x86/kernel/i8259_32.c
+++ b/arch/x86/kernel/i8259_32.c
@@ -21,302 +21,7 @@
 #include <asm/arch_hooks.h>
 #include <asm/i8259.h>
 
-/*
- * This is the 'legacy' 8259A Programmable Interrupt Controller,
- * present in the majority of PC/AT boxes.
- * plus some generic x86 specific things if generic specifics makes
- * any sense at all.
- */
-
-static int i8259A_auto_eoi;
-DEFINE_SPINLOCK(i8259A_lock);
-static void mask_and_ack_8259A(unsigned int);
-
-static struct irq_chip i8259A_chip = {
-	.name		= "XT-PIC",
-	.mask		= disable_8259A_irq,
-	.disable	= disable_8259A_irq,
-	.unmask		= enable_8259A_irq,
-	.mask_ack	= mask_and_ack_8259A,
-};
-
-/*
- * 8259A PIC functions to handle ISA devices:
- */
-
-/*
- * This contains the irq mask for both 8259A irq controllers,
- */
-unsigned int cached_irq_mask = 0xffff;
-
-/*
- * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
- * boards the timer interrupt is not really connected to any IO-APIC pin,
- * it's fed to the master 8259A's IR0 line only.
- *
- * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
- * this 'mixed mode' IRQ handling costs nothing because it's only used
- * at IRQ setup time.
- */
-unsigned long io_apic_irqs;
-
-void disable_8259A_irq(unsigned int irq)
-{
-	unsigned int mask = 1 << irq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-	cached_irq_mask |= mask;
-	if (irq & 8)
-		outb(cached_slave_mask, PIC_SLAVE_IMR);
-	else
-		outb(cached_master_mask, PIC_MASTER_IMR);
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
-void enable_8259A_irq(unsigned int irq)
-{
-	unsigned int mask = ~(1 << irq);
-	unsigned long flags;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-	cached_irq_mask &= mask;
-	if (irq & 8)
-		outb(cached_slave_mask, PIC_SLAVE_IMR);
-	else
-		outb(cached_master_mask, PIC_MASTER_IMR);
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
-int i8259A_irq_pending(unsigned int irq)
-{
-	unsigned int mask = 1<<irq;
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-	if (irq < 8)
-		ret = inb(PIC_MASTER_CMD) & mask;
-	else
-		ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-
-	return ret;
-}
-
-void make_8259A_irq(unsigned int irq)
-{
-	disable_irq_nosync(irq);
-	io_apic_irqs &= ~(1<<irq);
-	set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
-				      "XT");
-	enable_irq(irq);
-}
-
-/*
- * This function assumes to be called rarely. Switching between
- * 8259A registers is slow.
- * This has to be protected by the irq controller spinlock
- * before being called.
- */
-static inline int i8259A_irq_real(unsigned int irq)
-{
-	int value;
-	int irqmask = 1<<irq;
-
-	if (irq < 8) {
-		outb(0x0B,PIC_MASTER_CMD);	/* ISR register */
-		value = inb(PIC_MASTER_CMD) & irqmask;
-		outb(0x0A,PIC_MASTER_CMD);	/* back to the IRR register */
-		return value;
-	}
-	outb(0x0B,PIC_SLAVE_CMD);	/* ISR register */
-	value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
-	outb(0x0A,PIC_SLAVE_CMD);	/* back to the IRR register */
-	return value;
-}
-
-/*
- * Careful! The 8259A is a fragile beast, it pretty
- * much _has_ to be done exactly like this (mask it
- * first, _then_ send the EOI, and the order of EOI
- * to the two 8259s is important!
- */
-static void mask_and_ack_8259A(unsigned int irq)
-{
-	unsigned int irqmask = 1 << irq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-	/*
-	 * Lightweight spurious IRQ detection. We do not want
-	 * to overdo spurious IRQ handling - it's usually a sign
-	 * of hardware problems, so we only do the checks we can
-	 * do without slowing down good hardware unnecessarily.
-	 *
-	 * Note that IRQ7 and IRQ15 (the two spurious IRQs
-	 * usually resulting from the 8259A-1|2 PICs) occur
-	 * even if the IRQ is masked in the 8259A. Thus we
-	 * can check spurious 8259A IRQs without doing the
-	 * quite slow i8259A_irq_real() call for every IRQ.
-	 * This does not cover 100% of spurious interrupts,
-	 * but should be enough to warn the user that there
-	 * is something bad going on ...
-	 */
-	if (cached_irq_mask & irqmask)
-		goto spurious_8259A_irq;
-	cached_irq_mask |= irqmask;
 
-handle_real_irq:
-	if (irq & 8) {
-		inb(PIC_SLAVE_IMR);	/* DUMMY - (do we need this?) */
-		outb(cached_slave_mask, PIC_SLAVE_IMR);
-		outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */
-		outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */
-	} else {
-		inb(PIC_MASTER_IMR);	/* DUMMY - (do we need this?) */
-		outb(cached_master_mask, PIC_MASTER_IMR);
-		outb(0x60+irq,PIC_MASTER_CMD);	/* 'Specific EOI to master */
-	}
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-	return;
-
-spurious_8259A_irq:
-	/*
-	 * this is the slow path - should happen rarely.
-	 */
-	if (i8259A_irq_real(irq))
-		/*
-		 * oops, the IRQ _is_ in service according to the
-		 * 8259A - not spurious, go handle it.
-		 */
-		goto handle_real_irq;
-
-	{
-		static int spurious_irq_mask;
-		/*
-		 * At this point we can be sure the IRQ is spurious,
-		 * lets ACK and report it. [once per IRQ]
-		 */
-		if (!(spurious_irq_mask & irqmask)) {
-			printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
-			spurious_irq_mask |= irqmask;
-		}
-		atomic_inc(&irq_err_count);
-		/*
-		 * Theoretically we do not have to handle this IRQ,
-		 * but in Linux this does not cause problems and is
-		 * simpler for us.
-		 */
-		goto handle_real_irq;
-	}
-}
-
-static char irq_trigger[2];
-/**
- * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
- */
-static void restore_ELCR(char *trigger)
-{
-	outb(trigger[0], 0x4d0);
-	outb(trigger[1], 0x4d1);
-}
-
-static void save_ELCR(char *trigger)
-{
-	/* IRQ 0,1,2,8,13 are marked as reserved */
-	trigger[0] = inb(0x4d0) & 0xF8;
-	trigger[1] = inb(0x4d1) & 0xDE;
-}
-
-static int i8259A_resume(struct sys_device *dev)
-{
-	init_8259A(i8259A_auto_eoi);
-	restore_ELCR(irq_trigger);
-	return 0;
-}
-
-static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
-{
-	save_ELCR(irq_trigger);
-	return 0;
-}
-
-static int i8259A_shutdown(struct sys_device *dev)
-{
-	/* Put the i8259A into a quiescent state that
-	 * the kernel initialization code can get it
-	 * out of.
-	 */
-	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */
-	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-1 */
-	return 0;
-}
-
-static struct sysdev_class i8259_sysdev_class = {
-	.name = "i8259",
-	.suspend = i8259A_suspend,
-	.resume = i8259A_resume,
-	.shutdown = i8259A_shutdown,
-};
-
-static struct sys_device device_i8259A = {
-	.id	= 0,
-	.cls	= &i8259_sysdev_class,
-};
-
-static int __init i8259A_init_sysfs(void)
-{
-	int error = sysdev_class_register(&i8259_sysdev_class);
-	if (!error)
-		error = sysdev_register(&device_i8259A);
-	return error;
-}
-
-device_initcall(i8259A_init_sysfs);
-
-void init_8259A(int auto_eoi)
-{
-	unsigned long flags;
-
-	i8259A_auto_eoi = auto_eoi;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-
-	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */
-	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-2 */
-
-	/*
-	 * outb_pic - this has to work on a wide range of PC hardware.
-	 */
-	outb_pic(0x11, PIC_MASTER_CMD);	/* ICW1: select 8259A-1 init */
-	outb_pic(0x20 + 0, PIC_MASTER_IMR);	/* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
-	outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);	/* 8259A-1 (the master) has a slave on IR2 */
-	if (auto_eoi)	/* master does Auto EOI */
-		outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
-	else		/* master expects normal EOI */
-		outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
-
-	outb_pic(0x11, PIC_SLAVE_CMD);	/* ICW1: select 8259A-2 init */
-	outb_pic(0x20 + 8, PIC_SLAVE_IMR);	/* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
-	outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);	/* 8259A-2 is a slave on master's IR2 */
-	outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */
-	if (auto_eoi)
-		/*
-		 * In AEOI mode we just have to mask the interrupt
-		 * when acking.
-		 */
-		i8259A_chip.mask_ack = disable_8259A_irq;
-	else
-		i8259A_chip.mask_ack = mask_and_ack_8259A;
-
-	udelay(100);		/* wait for 8259A to initialize */
-
-	outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
-	outb(cached_slave_mask, PIC_SLAVE_IMR);	  /* restore slave IRQ mask */
-
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-}
 
 /*
  * Note that on a 486, we don't want to do a SIGFPE on an irq13
diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c
index 1870e0e86559..b44095efcf83 100644
--- a/arch/x86/kernel/i8259_64.c
+++ b/arch/x86/kernel/i8259_64.c
@@ -101,315 +101,6 @@ static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) =
 #undef IRQ
 #undef IRQLIST_16
 
-/*
- * This is the 'legacy' 8259A Programmable Interrupt Controller,
- * present in the majority of PC/AT boxes.
- * plus some generic x86 specific things if generic specifics makes
- * any sense at all.
- * this file should become arch/i386/kernel/irq.c when the old irq.c
- * moves to arch independent land
- */
-
-static int i8259A_auto_eoi;
-DEFINE_SPINLOCK(i8259A_lock);
-static void mask_and_ack_8259A(unsigned int);
-
-static struct irq_chip i8259A_chip = {
-	.name		= "XT-PIC",
-	.mask		= disable_8259A_irq,
-	.disable	= disable_8259A_irq,
-	.unmask		= enable_8259A_irq,
-	.mask_ack	= mask_and_ack_8259A,
-};
-
-/*
- * 8259A PIC functions to handle ISA devices:
- */
-
-/*
- * This contains the irq mask for both 8259A irq controllers,
- */
-unsigned int cached_irq_mask = 0xffff;
-
-/*
- * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
- * boards the timer interrupt is not really connected to any IO-APIC pin,
- * it's fed to the master 8259A's IR0 line only.
- *
- * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
- * this 'mixed mode' IRQ handling costs nothing because it's only used
- * at IRQ setup time.
- */
-unsigned long io_apic_irqs;
-
-void disable_8259A_irq(unsigned int irq)
-{
-	unsigned int mask = 1 << irq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-	cached_irq_mask |= mask;
-	if (irq & 8)
-		outb(cached_slave_mask, PIC_SLAVE_IMR);
-	else
-		outb(cached_master_mask, PIC_MASTER_IMR);
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
-void enable_8259A_irq(unsigned int irq)
-{
-	unsigned int mask = ~(1 << irq);
-	unsigned long flags;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-	cached_irq_mask &= mask;
-	if (irq & 8)
-		outb(cached_slave_mask, PIC_SLAVE_IMR);
-	else
-		outb(cached_master_mask, PIC_MASTER_IMR);
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
-int i8259A_irq_pending(unsigned int irq)
-{
-	unsigned int mask = 1<<irq;
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-	if (irq < 8)
-		ret = inb(PIC_MASTER_CMD) & mask;
-	else
-		ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-
-	return ret;
-}
-
-void make_8259A_irq(unsigned int irq)
-{
-	disable_irq_nosync(irq);
-	io_apic_irqs &= ~(1<<irq);
-	set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
-				      "XT");
-	enable_irq(irq);
-}
-
-/*
- * This function assumes to be called rarely. Switching between
- * 8259A registers is slow.
- * This has to be protected by the irq controller spinlock
- * before being called.
- */
-static inline int i8259A_irq_real(unsigned int irq)
-{
-	int value;
-	int irqmask = 1<<irq;
-
-	if (irq < 8) {
-		outb(0x0B,PIC_MASTER_CMD);	/* ISR register */
-		value = inb(PIC_MASTER_CMD) & irqmask;
-		outb(0x0A,PIC_MASTER_CMD);	/* back to the IRR register */
-		return value;
-	}
-	outb(0x0B,PIC_SLAVE_CMD);	/* ISR register */
-	value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
-	outb(0x0A,PIC_SLAVE_CMD);	/* back to the IRR register */
-	return value;
-}
-
-/*
- * Careful! The 8259A is a fragile beast, it pretty
- * much _has_ to be done exactly like this (mask it
- * first, _then_ send the EOI, and the order of EOI
- * to the two 8259s is important!
- */
-static void mask_and_ack_8259A(unsigned int irq)
-{
-	unsigned int irqmask = 1 << irq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-	/*
-	 * Lightweight spurious IRQ detection. We do not want
-	 * to overdo spurious IRQ handling - it's usually a sign
-	 * of hardware problems, so we only do the checks we can
-	 * do without slowing down good hardware unnecessarily.
-	 *
-	 * Note that IRQ7 and IRQ15 (the two spurious IRQs
-	 * usually resulting from the 8259A-1|2 PICs) occur
-	 * even if the IRQ is masked in the 8259A. Thus we
-	 * can check spurious 8259A IRQs without doing the
-	 * quite slow i8259A_irq_real() call for every IRQ.
-	 * This does not cover 100% of spurious interrupts,
-	 * but should be enough to warn the user that there
-	 * is something bad going on ...
-	 */
-	if (cached_irq_mask & irqmask)
-		goto spurious_8259A_irq;
-	cached_irq_mask |= irqmask;
-
-handle_real_irq:
-	if (irq & 8) {
-		inb(PIC_SLAVE_IMR);	/* DUMMY - (do we need this?) */
-		outb(cached_slave_mask, PIC_SLAVE_IMR);
-		/* 'Specific EOI' to slave */
-		outb(0x60+(irq&7),PIC_SLAVE_CMD);
-		 /* 'Specific EOI' to master-IRQ2 */
-		outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD);
-	} else {
-		inb(PIC_MASTER_IMR);	/* DUMMY - (do we need this?) */
-		outb(cached_master_mask, PIC_MASTER_IMR);
-		/* 'Specific EOI' to master */
-		outb(0x60+irq,PIC_MASTER_CMD);
-	}
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-	return;
-
-spurious_8259A_irq:
-	/*
-	 * this is the slow path - should happen rarely.
-	 */
-	if (i8259A_irq_real(irq))
-		/*
-		 * oops, the IRQ _is_ in service according to the
-		 * 8259A - not spurious, go handle it.
-		 */
-		goto handle_real_irq;
-
-	{
-		static int spurious_irq_mask;
-		/*
-		 * At this point we can be sure the IRQ is spurious,
-		 * lets ACK and report it. [once per IRQ]
-		 */
-		if (!(spurious_irq_mask & irqmask)) {
-			printk(KERN_DEBUG
-			       "spurious 8259A interrupt: IRQ%d.\n", irq);
-			spurious_irq_mask |= irqmask;
-		}
-		atomic_inc(&irq_err_count);
-		/*
-		 * Theoretically we do not have to handle this IRQ,
-		 * but in Linux this does not cause problems and is
-		 * simpler for us.
-		 */
-		goto handle_real_irq;
-	}
-}
-
-static char irq_trigger[2];
-/**
- * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
- */
-static void restore_ELCR(char *trigger)
-{
-	outb(trigger[0], 0x4d0);
-	outb(trigger[1], 0x4d1);
-}
-
-static void save_ELCR(char *trigger)
-{
-	/* IRQ 0,1,2,8,13 are marked as reserved */
-	trigger[0] = inb(0x4d0) & 0xF8;
-	trigger[1] = inb(0x4d1) & 0xDE;
-}
-
-static int i8259A_resume(struct sys_device *dev)
-{
-	init_8259A(i8259A_auto_eoi);
-	restore_ELCR(irq_trigger);
-	return 0;
-}
-
-static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
-{
-	save_ELCR(irq_trigger);
-	return 0;
-}
-
-static int i8259A_shutdown(struct sys_device *dev)
-{
-	/* Put the i8259A into a quiescent state that
-	 * the kernel initialization code can get it
-	 * out of.
-	 */
-	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */
-	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-1 */
-	return 0;
-}
-
-static struct sysdev_class i8259_sysdev_class = {
-	.name = "i8259",
-	.suspend = i8259A_suspend,
-	.resume = i8259A_resume,
-	.shutdown = i8259A_shutdown,
-};
-
-static struct sys_device device_i8259A = {
-	.id	= 0,
-	.cls	= &i8259_sysdev_class,
-};
-
-static int __init i8259A_init_sysfs(void)
-{
-	int error = sysdev_class_register(&i8259_sysdev_class);
-	if (!error)
-		error = sysdev_register(&device_i8259A);
-	return error;
-}
-
-device_initcall(i8259A_init_sysfs);
-
-void init_8259A(int auto_eoi)
-{
-	unsigned long flags;
-
-	i8259A_auto_eoi = auto_eoi;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-
-	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */
-	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-2 */
-
-	/*
-	 * outb_pic - this has to work on a wide range of PC hardware.
-	 */
-	outb_pic(0x11, PIC_MASTER_CMD);	/* ICW1: select 8259A-1 init */
-	/* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
-	outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
-	/* 8259A-1 (the master) has a slave on IR2 */
-	outb_pic(0x04, PIC_MASTER_IMR);
-	if (auto_eoi)	/* master does Auto EOI */
-		outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
-	else		/* master expects normal EOI */
-		outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
-
-	outb_pic(0x11, PIC_SLAVE_CMD);	/* ICW1: select 8259A-2 init */
-	/* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
-	outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
-	/* 8259A-2 is a slave on master's IR2 */
-	outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
-	/* (slave's support for AEOI in flat mode is to be investigated) */
-	outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR);
-
-	if (auto_eoi)
-		/*
-		 * In AEOI mode we just have to mask the interrupt
-		 * when acking.
-		 */
-		i8259A_chip.mask_ack = disable_8259A_irq;
-	else
-		i8259A_chip.mask_ack = mask_and_ack_8259A;
-
-	udelay(100);		/* wait for 8259A to initialize */
-
-	outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
-	outb(cached_slave_mask, PIC_SLAVE_IMR);	  /* restore slave IRQ mask */
-
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
 
 
diff --git a/include/asm-x86/i8259.h b/include/asm-x86/i8259.h
index 45d4df3e51e6..2f98df91f1f2 100644
--- a/include/asm-x86/i8259.h
+++ b/include/asm-x86/i8259.h
@@ -55,4 +55,6 @@ static inline void outb_pic(unsigned char value, unsigned int port)
 	udelay(2);
 }
 
+extern struct irq_chip i8259A_chip;
+
 #endif	/* __ASM_I8259_H__ */
-- 
cgit v1.2.3


From 15d613cb25efd978dd55592d011a6ffc487b3432 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@suse.cz>
Date: Wed, 21 May 2008 11:47:24 +0200
Subject: x86: i8259.c: remove #ifdefs around includes

Remove #ifdefs around includes; including too much should be always
safe.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Cc: macro@ds2.pg.gda.pl
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/i8259.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 2decba6b0101..8b7eb0cb2e7e 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -1,14 +1,10 @@
-#ifdef CONFIG_X86_64
 #include <linux/linkage.h>
-#endif /* CONFIG_X86_64 */
 #include <linux/errno.h>
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/ioport.h>
 #include <linux/interrupt.h>
-#ifdef CONFIG_X86_64
 #include <linux/timex.h>
-#endif /* CONFIG_X86_64 */
 #include <linux/slab.h>
 #include <linux/random.h>
 #include <linux/init.h>
@@ -16,24 +12,17 @@
 #include <linux/sysdev.h>
 #include <linux/bitops.h>
 
-#ifdef CONFIG_X86_64
 #include <asm/acpi.h>
-#endif /* CONFIG_X86_64 */
 #include <asm/atomic.h>
 #include <asm/system.h>
 #include <asm/io.h>
-#ifndef CONFIG_X86_64
 #include <asm/timer.h>
-#else /* CONFIG_X86_64 */
 #include <asm/hw_irq.h>
-#endif /* CONFIG_X86_64 */
 #include <asm/pgtable.h>
 #include <asm/delay.h>
 #include <asm/desc.h>
 #include <asm/apic.h>
-#ifndef CONFIG_X86_64
 #include <asm/arch_hooks.h>
-#endif /* ! CONFIG_X86_64 */
 #include <asm/i8259.h>
 
 /*
-- 
cgit v1.2.3


From 3e8631d27088f394b6788829e238a60bf07d47ab Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@suse.cz>
Date: Wed, 21 May 2008 11:52:52 +0200
Subject: x86: i8259.c: remove trivial ifdefs

Remove #ifdefs where the only difference is formatting of comments.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Cc: macro@ds2.pg.gda.pl
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/i8259.c | 20 +++-----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 8b7eb0cb2e7e..433e49edfd4f 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -175,24 +175,14 @@ handle_real_irq:
 	if (irq & 8) {
 		inb(PIC_SLAVE_IMR);	/* DUMMY - (do we need this?) */
 		outb(cached_slave_mask, PIC_SLAVE_IMR);
-#ifndef CONFIG_X86_64
-		outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */
-		outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */
-#else /* CONFIG_X86_64 */
 		/* 'Specific EOI' to slave */
-		outb(0x60+(irq&7),PIC_SLAVE_CMD);
+		outb(0x60+(irq&7), PIC_SLAVE_CMD);
 		 /* 'Specific EOI' to master-IRQ2 */
-		outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD);
-#endif /* CONFIG_X86_64 */
+		outb(0x60+PIC_CASCADE_IR, PIC_MASTER_CMD);
 	} else {
 		inb(PIC_MASTER_IMR);	/* DUMMY - (do we need this?) */
 		outb(cached_master_mask, PIC_MASTER_IMR);
-#ifndef CONFIG_X86_64
-		outb(0x60+irq,PIC_MASTER_CMD);	/* 'Specific EOI to master */
-#else /* CONFIG_X86_64 */
-		/* 'Specific EOI' to master */
-		outb(0x60+irq,PIC_MASTER_CMD);
-#endif /* CONFIG_X86_64 */
+		outb(0x60+irq, PIC_MASTER_CMD);	/* 'Specific EOI to master */
 	}
 	spin_unlock_irqrestore(&i8259A_lock, flags);
 	return;
@@ -215,12 +205,8 @@ spurious_8259A_irq:
 		 * lets ACK and report it. [once per IRQ]
 		 */
 		if (!(spurious_irq_mask & irqmask)) {
-#ifndef CONFIG_X86_64
-			printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
-#else /* CONFIG_X86_64 */
 			printk(KERN_DEBUG
 			       "spurious 8259A interrupt: IRQ%d.\n", irq);
-#endif /* CONFIG_X86_64 */
 			spurious_irq_mask |= irqmask;
 		}
 		atomic_inc(&irq_err_count);
-- 
cgit v1.2.3


From 680afbf989d697b9ba1382017a73c8cfa53b3f89 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@suse.cz>
Date: Wed, 21 May 2008 11:57:52 +0200
Subject: x86: i8259: cleanup codingstyle

Signed-off-by: Pavel Machek <pavel@suse.cz>
Cc: macro@ds2.pg.gda.pl
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/i8259.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 433e49edfd4f..7a0fda8f01b5 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -129,14 +129,14 @@ static inline int i8259A_irq_real(unsigned int irq)
 	int irqmask = 1<<irq;
 
 	if (irq < 8) {
-		outb(0x0B,PIC_MASTER_CMD);	/* ISR register */
+		outb(0x0B, PIC_MASTER_CMD);	/* ISR register */
 		value = inb(PIC_MASTER_CMD) & irqmask;
-		outb(0x0A,PIC_MASTER_CMD);	/* back to the IRR register */
+		outb(0x0A, PIC_MASTER_CMD);	/* back to the IRR register */
 		return value;
 	}
-	outb(0x0B,PIC_SLAVE_CMD);	/* ISR register */
+	outb(0x0B, PIC_SLAVE_CMD);	/* ISR register */
 	value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
-	outb(0x0A,PIC_SLAVE_CMD);	/* back to the IRR register */
+	outb(0x0A, PIC_SLAVE_CMD);	/* back to the IRR register */
 	return value;
 }
 
-- 
cgit v1.2.3


From d23b200a75f78e62744c3c25d078855eec8a689b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 12 May 2008 15:43:36 +0200
Subject: x86: make init_ISA_irqs() static

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/i8259_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c
index b44095efcf83..31f49e8f46a7 100644
--- a/arch/x86/kernel/i8259_64.c
+++ b/arch/x86/kernel/i8259_64.c
@@ -134,7 +134,7 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
 	[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
 };
 
-void __init init_ISA_irqs (void)
+static void __init init_ISA_irqs (void)
 {
 	int i;
 
-- 
cgit v1.2.3


From ec42418f1973e1f77da1fcca3be59c0acb878a9d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 24 May 2008 15:59:58 +0200
Subject: x86: rename the i8259_32/64.c leftovers to irqinit_32/64.c

The leftovers of the i8259 unification have nothing to do with i8259
at all. They contain interrupt init code and the i8259_xx name is just
misleading now.

Rename them to irqinit_32/64.c

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/Makefile     |   2 +-
 arch/x86/kernel/i8259_32.c   | 114 -----------------------
 arch/x86/kernel/i8259_64.c   | 217 -------------------------------------------
 arch/x86/kernel/irqinit_32.c | 114 +++++++++++++++++++++++
 arch/x86/kernel/irqinit_64.c | 217 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 332 insertions(+), 332 deletions(-)
 delete mode 100644 arch/x86/kernel/i8259_32.c
 delete mode 100644 arch/x86/kernel/i8259_64.c
 create mode 100644 arch/x86/kernel/irqinit_32.c
 create mode 100644 arch/x86/kernel/irqinit_64.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index f71a76ef2596..2a53ad2cb450 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -18,7 +18,7 @@ CFLAGS_tsc_64.o		:= $(nostackp)
 obj-y			:= process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
 obj-y			+= traps_$(BITS).o irq_$(BITS).o
 obj-y			+= time_$(BITS).o ioport.o ldt.o
-obj-y			+= setup_$(BITS).o i8259.o i8259_$(BITS).o setup.o
+obj-y			+= setup_$(BITS).o i8259.o irqinit_$(BITS).o setup.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o setup64.o
diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c
deleted file mode 100644
index d66914287ee1..000000000000
--- a/arch/x86/kernel/i8259_32.c
+++ /dev/null
@@ -1,114 +0,0 @@
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/init.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/bitops.h>
-
-#include <asm/atomic.h>
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/timer.h>
-#include <asm/pgtable.h>
-#include <asm/delay.h>
-#include <asm/desc.h>
-#include <asm/apic.h>
-#include <asm/arch_hooks.h>
-#include <asm/i8259.h>
-
-
-
-/*
- * Note that on a 486, we don't want to do a SIGFPE on an irq13
- * as the irq is unreliable, and exception 16 works correctly
- * (ie as explained in the intel literature). On a 386, you
- * can't use exception 16 due to bad IBM design, so we have to
- * rely on the less exact irq13.
- *
- * Careful.. Not only is IRQ13 unreliable, but it is also
- * leads to races. IBM designers who came up with it should
- * be shot.
- */
- 
-
-static irqreturn_t math_error_irq(int cpl, void *dev_id)
-{
-	extern void math_error(void __user *);
-	outb(0,0xF0);
-	if (ignore_fpu_irq || !boot_cpu_data.hard_math)
-		return IRQ_NONE;
-	math_error((void __user *)get_irq_regs()->ip);
-	return IRQ_HANDLED;
-}
-
-/*
- * New motherboards sometimes make IRQ 13 be a PCI interrupt,
- * so allow interrupt sharing.
- */
-static struct irqaction fpu_irq = {
-	.handler = math_error_irq,
-	.mask = CPU_MASK_NONE,
-	.name = "fpu",
-};
-
-void __init init_ISA_irqs (void)
-{
-	int i;
-
-#ifdef CONFIG_X86_LOCAL_APIC
-	init_bsp_APIC();
-#endif
-	init_8259A(0);
-
-	/*
-	 * 16 old-style INTA-cycle interrupts:
-	 */
-	for (i = 0; i < 16; i++) {
-		set_irq_chip_and_handler_name(i, &i8259A_chip,
-					      handle_level_irq, "XT");
-	}
-}
-
-/* Overridden in paravirt.c */
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-void __init native_init_IRQ(void)
-{
-	int i;
-
-	/* all the set up before the call gates are initialised */
-	pre_intr_init_hook();
-
-	/*
-	 * Cover the whole vector space, no vector can escape
-	 * us. (some of these will be overridden and become
-	 * 'special' SMP interrupts)
-	 */
-	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
-		int vector = FIRST_EXTERNAL_VECTOR + i;
-		if (i >= NR_IRQS)
-			break;
-		/* SYSCALL_VECTOR was reserved in trap_init. */
-		if (!test_bit(vector, used_vectors))
-			set_intr_gate(vector, interrupt[i]);
-	}
-
-	/* setup after call gates are initialised (usually add in
-	 * the architecture specific gates)
-	 */
-	intr_init_hook();
-
-	/*
-	 * External FPU? Set up irq13 if so, for
-	 * original braindamaged IBM FERR coupling.
-	 */
-	if (boot_cpu_data.hard_math && !cpu_has_fpu)
-		setup_irq(FPU_IRQ, &fpu_irq);
-
-	irq_ctx_init(smp_processor_id());
-}
diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c
deleted file mode 100644
index 31f49e8f46a7..000000000000
--- a/arch/x86/kernel/i8259_64.c
+++ /dev/null
@@ -1,217 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
-#include <linux/timex.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/init.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/bitops.h>
-
-#include <asm/acpi.h>
-#include <asm/atomic.h>
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/hw_irq.h>
-#include <asm/pgtable.h>
-#include <asm/delay.h>
-#include <asm/desc.h>
-#include <asm/apic.h>
-#include <asm/i8259.h>
-
-/*
- * Common place to define all x86 IRQ vectors
- *
- * This builds up the IRQ handler stubs using some ugly macros in irq.h
- *
- * These macros create the low-level assembly IRQ routines that save
- * register context and call do_IRQ(). do_IRQ() then does all the
- * operations that are needed to keep the AT (or SMP IOAPIC)
- * interrupt-controller happy.
- */
-
-#define IRQ_NAME2(nr) nr##_interrupt(void)
-#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
-
-/*
- *	SMP has a few special interrupts for IPI messages
- */
-
-#define BUILD_IRQ(nr)				\
-	asmlinkage void IRQ_NAME(nr);		\
-	asm("\n.p2align\n"			\
-	    "IRQ" #nr "_interrupt:\n\t"		\
-	    "push $~(" #nr ") ; "		\
-	    "jmp common_interrupt");
-
-#define BI(x,y) \
-	BUILD_IRQ(x##y)
-
-#define BUILD_16_IRQS(x) \
-	BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
-	BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
-	BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
-	BI(x,c) BI(x,d) BI(x,e) BI(x,f)
-
-/*
- * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
- * (these are usually mapped to vectors 0x30-0x3f)
- */
-
-/*
- * The IO-APIC gives us many more interrupt sources. Most of these
- * are unused but an SMP system is supposed to have enough memory ...
- * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
- * across the spectrum, so we really want to be prepared to get all
- * of these. Plus, more powerful systems might have more than 64
- * IO-APIC registers.
- *
- * (these are usually mapped into the 0x30-0xff vector range)
- */
-				      BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
-BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
-BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
-BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
-
-#undef BUILD_16_IRQS
-#undef BI
-
-
-#define IRQ(x,y) \
-	IRQ##x##y##_interrupt
-
-#define IRQLIST_16(x) \
-	IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
-	IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
-	IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
-	IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
-
-/* for the irq vectors */
-static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
-					  IRQLIST_16(0x2), IRQLIST_16(0x3),
-	IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
-	IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
-	IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
-};
-
-#undef IRQ
-#undef IRQLIST_16
-
-
-
-
-/*
- * IRQ2 is cascade interrupt to second interrupt controller
- */
-
-static struct irqaction irq2 = {
-	.handler = no_action,
-	.mask = CPU_MASK_NONE,
-	.name = "cascade",
-};
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
-	[0 ... IRQ0_VECTOR - 1] = -1,
-	[IRQ0_VECTOR] = 0,
-	[IRQ1_VECTOR] = 1,
-	[IRQ2_VECTOR] = 2,
-	[IRQ3_VECTOR] = 3,
-	[IRQ4_VECTOR] = 4,
-	[IRQ5_VECTOR] = 5,
-	[IRQ6_VECTOR] = 6,
-	[IRQ7_VECTOR] = 7,
-	[IRQ8_VECTOR] = 8,
-	[IRQ9_VECTOR] = 9,
-	[IRQ10_VECTOR] = 10,
-	[IRQ11_VECTOR] = 11,
-	[IRQ12_VECTOR] = 12,
-	[IRQ13_VECTOR] = 13,
-	[IRQ14_VECTOR] = 14,
-	[IRQ15_VECTOR] = 15,
-	[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
-};
-
-static void __init init_ISA_irqs (void)
-{
-	int i;
-
-	init_bsp_APIC();
-	init_8259A(0);
-
-	for (i = 0; i < NR_IRQS; i++) {
-		irq_desc[i].status = IRQ_DISABLED;
-		irq_desc[i].action = NULL;
-		irq_desc[i].depth = 1;
-
-		if (i < 16) {
-			/*
-			 * 16 old-style INTA-cycle interrupts:
-			 */
-			set_irq_chip_and_handler_name(i, &i8259A_chip,
-						      handle_level_irq, "XT");
-		} else {
-			/*
-			 * 'high' PCI IRQs filled in on demand
-			 */
-			irq_desc[i].chip = &no_irq_chip;
-		}
-	}
-}
-
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-void __init native_init_IRQ(void)
-{
-	int i;
-
-	init_ISA_irqs();
-	/*
-	 * Cover the whole vector space, no vector can escape
-	 * us. (some of these will be overridden and become
-	 * 'special' SMP interrupts)
-	 */
-	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
-		int vector = FIRST_EXTERNAL_VECTOR + i;
-		if (vector != IA32_SYSCALL_VECTOR)
-			set_intr_gate(vector, interrupt[i]);
-	}
-
-#ifdef CONFIG_SMP
-	/*
-	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
-	 * IPI, driven by wakeup.
-	 */
-	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
-	/* IPIs for invalidation */
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
-
-	/* IPI for generic function call */
-	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
-	/* Low priority IPI to cleanup after moving an irq */
-	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
-#endif
-	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
-
-	/* self generated IPI for local APIC timer */
-	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
-	/* IPI vectors for APIC spurious and error interrupts */
-	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
-	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-
-	if (!acpi_ioapic)
-		setup_irq(2, &irq2);
-}
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
new file mode 100644
index 000000000000..d66914287ee1
--- /dev/null
+++ b/arch/x86/kernel/irqinit_32.c
@@ -0,0 +1,114 @@
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+#include <linux/bitops.h>
+
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/timer.h>
+#include <asm/pgtable.h>
+#include <asm/delay.h>
+#include <asm/desc.h>
+#include <asm/apic.h>
+#include <asm/arch_hooks.h>
+#include <asm/i8259.h>
+
+
+
+/*
+ * Note that on a 486, we don't want to do a SIGFPE on an irq13
+ * as the irq is unreliable, and exception 16 works correctly
+ * (ie as explained in the intel literature). On a 386, you
+ * can't use exception 16 due to bad IBM design, so we have to
+ * rely on the less exact irq13.
+ *
+ * Careful.. Not only is IRQ13 unreliable, but it is also
+ * leads to races. IBM designers who came up with it should
+ * be shot.
+ */
+ 
+
+static irqreturn_t math_error_irq(int cpl, void *dev_id)
+{
+	extern void math_error(void __user *);
+	outb(0,0xF0);
+	if (ignore_fpu_irq || !boot_cpu_data.hard_math)
+		return IRQ_NONE;
+	math_error((void __user *)get_irq_regs()->ip);
+	return IRQ_HANDLED;
+}
+
+/*
+ * New motherboards sometimes make IRQ 13 be a PCI interrupt,
+ * so allow interrupt sharing.
+ */
+static struct irqaction fpu_irq = {
+	.handler = math_error_irq,
+	.mask = CPU_MASK_NONE,
+	.name = "fpu",
+};
+
+void __init init_ISA_irqs (void)
+{
+	int i;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	init_bsp_APIC();
+#endif
+	init_8259A(0);
+
+	/*
+	 * 16 old-style INTA-cycle interrupts:
+	 */
+	for (i = 0; i < 16; i++) {
+		set_irq_chip_and_handler_name(i, &i8259A_chip,
+					      handle_level_irq, "XT");
+	}
+}
+
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+void __init native_init_IRQ(void)
+{
+	int i;
+
+	/* all the set up before the call gates are initialised */
+	pre_intr_init_hook();
+
+	/*
+	 * Cover the whole vector space, no vector can escape
+	 * us. (some of these will be overridden and become
+	 * 'special' SMP interrupts)
+	 */
+	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+		int vector = FIRST_EXTERNAL_VECTOR + i;
+		if (i >= NR_IRQS)
+			break;
+		/* SYSCALL_VECTOR was reserved in trap_init. */
+		if (!test_bit(vector, used_vectors))
+			set_intr_gate(vector, interrupt[i]);
+	}
+
+	/* setup after call gates are initialised (usually add in
+	 * the architecture specific gates)
+	 */
+	intr_init_hook();
+
+	/*
+	 * External FPU? Set up irq13 if so, for
+	 * original braindamaged IBM FERR coupling.
+	 */
+	if (boot_cpu_data.hard_math && !cpu_has_fpu)
+		setup_irq(FPU_IRQ, &fpu_irq);
+
+	irq_ctx_init(smp_processor_id());
+}
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
new file mode 100644
index 000000000000..31f49e8f46a7
--- /dev/null
+++ b/arch/x86/kernel/irqinit_64.c
@@ -0,0 +1,217 @@
+#include <linux/linkage.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/timex.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+#include <linux/bitops.h>
+
+#include <asm/acpi.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/hw_irq.h>
+#include <asm/pgtable.h>
+#include <asm/delay.h>
+#include <asm/desc.h>
+#include <asm/apic.h>
+#include <asm/i8259.h>
+
+/*
+ * Common place to define all x86 IRQ vectors
+ *
+ * This builds up the IRQ handler stubs using some ugly macros in irq.h
+ *
+ * These macros create the low-level assembly IRQ routines that save
+ * register context and call do_IRQ(). do_IRQ() then does all the
+ * operations that are needed to keep the AT (or SMP IOAPIC)
+ * interrupt-controller happy.
+ */
+
+#define IRQ_NAME2(nr) nr##_interrupt(void)
+#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
+
+/*
+ *	SMP has a few special interrupts for IPI messages
+ */
+
+#define BUILD_IRQ(nr)				\
+	asmlinkage void IRQ_NAME(nr);		\
+	asm("\n.p2align\n"			\
+	    "IRQ" #nr "_interrupt:\n\t"		\
+	    "push $~(" #nr ") ; "		\
+	    "jmp common_interrupt");
+
+#define BI(x,y) \
+	BUILD_IRQ(x##y)
+
+#define BUILD_16_IRQS(x) \
+	BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
+	BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
+	BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
+	BI(x,c) BI(x,d) BI(x,e) BI(x,f)
+
+/*
+ * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
+ * (these are usually mapped to vectors 0x30-0x3f)
+ */
+
+/*
+ * The IO-APIC gives us many more interrupt sources. Most of these
+ * are unused but an SMP system is supposed to have enough memory ...
+ * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
+ * across the spectrum, so we really want to be prepared to get all
+ * of these. Plus, more powerful systems might have more than 64
+ * IO-APIC registers.
+ *
+ * (these are usually mapped into the 0x30-0xff vector range)
+ */
+				      BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
+BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
+BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
+BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
+
+#undef BUILD_16_IRQS
+#undef BI
+
+
+#define IRQ(x,y) \
+	IRQ##x##y##_interrupt
+
+#define IRQLIST_16(x) \
+	IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
+	IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
+	IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
+	IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
+
+/* for the irq vectors */
+static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
+					  IRQLIST_16(0x2), IRQLIST_16(0x3),
+	IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
+	IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
+	IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
+};
+
+#undef IRQ
+#undef IRQLIST_16
+
+
+
+
+/*
+ * IRQ2 is cascade interrupt to second interrupt controller
+ */
+
+static struct irqaction irq2 = {
+	.handler = no_action,
+	.mask = CPU_MASK_NONE,
+	.name = "cascade",
+};
+DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+	[0 ... IRQ0_VECTOR - 1] = -1,
+	[IRQ0_VECTOR] = 0,
+	[IRQ1_VECTOR] = 1,
+	[IRQ2_VECTOR] = 2,
+	[IRQ3_VECTOR] = 3,
+	[IRQ4_VECTOR] = 4,
+	[IRQ5_VECTOR] = 5,
+	[IRQ6_VECTOR] = 6,
+	[IRQ7_VECTOR] = 7,
+	[IRQ8_VECTOR] = 8,
+	[IRQ9_VECTOR] = 9,
+	[IRQ10_VECTOR] = 10,
+	[IRQ11_VECTOR] = 11,
+	[IRQ12_VECTOR] = 12,
+	[IRQ13_VECTOR] = 13,
+	[IRQ14_VECTOR] = 14,
+	[IRQ15_VECTOR] = 15,
+	[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
+};
+
+static void __init init_ISA_irqs (void)
+{
+	int i;
+
+	init_bsp_APIC();
+	init_8259A(0);
+
+	for (i = 0; i < NR_IRQS; i++) {
+		irq_desc[i].status = IRQ_DISABLED;
+		irq_desc[i].action = NULL;
+		irq_desc[i].depth = 1;
+
+		if (i < 16) {
+			/*
+			 * 16 old-style INTA-cycle interrupts:
+			 */
+			set_irq_chip_and_handler_name(i, &i8259A_chip,
+						      handle_level_irq, "XT");
+		} else {
+			/*
+			 * 'high' PCI IRQs filled in on demand
+			 */
+			irq_desc[i].chip = &no_irq_chip;
+		}
+	}
+}
+
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+void __init native_init_IRQ(void)
+{
+	int i;
+
+	init_ISA_irqs();
+	/*
+	 * Cover the whole vector space, no vector can escape
+	 * us. (some of these will be overridden and become
+	 * 'special' SMP interrupts)
+	 */
+	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+		int vector = FIRST_EXTERNAL_VECTOR + i;
+		if (vector != IA32_SYSCALL_VECTOR)
+			set_intr_gate(vector, interrupt[i]);
+	}
+
+#ifdef CONFIG_SMP
+	/*
+	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
+	 * IPI, driven by wakeup.
+	 */
+	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+
+	/* IPIs for invalidation */
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
+
+	/* IPI for generic function call */
+	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+
+	/* Low priority IPI to cleanup after moving an irq */
+	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+#endif
+	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
+
+	/* self generated IPI for local APIC timer */
+	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+
+	/* IPI vectors for APIC spurious and error interrupts */
+	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+
+	if (!acpi_ioapic)
+		setup_irq(2, &irq2);
+}
-- 
cgit v1.2.3


From 6360b1fbb4a939efd34fc770c2ebd927c55506e0 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Mon, 12 May 2008 15:44:41 +0200
Subject: move BUG_TABLE into RODATA

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/avr32/kernel/vmlinux.lds.S   |  2 --
 arch/parisc/kernel/vmlinux.lds.S  |  1 -
 arch/powerpc/kernel/vmlinux.lds.S |  2 --
 arch/s390/kernel/vmlinux.lds.S    |  1 -
 arch/sh/kernel/vmlinux_32.lds.S   |  1 -
 arch/sh/kernel/vmlinux_64.lds.S   |  1 -
 arch/x86/kernel/vmlinux_32.lds.S  |  8 +++-----
 arch/x86/kernel/vmlinux_64.lds.S  | 10 ++++------
 include/asm-generic/vmlinux.lds.h |  6 ++++++
 9 files changed, 13 insertions(+), 19 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/avr32/kernel/vmlinux.lds.S b/arch/avr32/kernel/vmlinux.lds.S
index 481cfd40c053..bc932c9b4272 100644
--- a/arch/avr32/kernel/vmlinux.lds.S
+++ b/arch/avr32/kernel/vmlinux.lds.S
@@ -93,8 +93,6 @@ SECTIONS
 		__stop___ex_table = .;
 	}
 
-	BUG_TABLE
-
 	RODATA
 
 	. = ALIGN(THREAD_SIZE);
diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S
index 50b4a3a25d0a..ff7d4ff4675a 100644
--- a/arch/parisc/kernel/vmlinux.lds.S
+++ b/arch/parisc/kernel/vmlinux.lds.S
@@ -66,7 +66,6 @@ SECTIONS
 	_etext = .;
 
 	RODATA
-	BUG_TABLE
 
 	/* writeable */
 	/* Make sure this is page aligned so
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 0c3000bf8d75..53d57d17a894 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -64,8 +64,6 @@ SECTIONS
 
 	NOTES
 
-	BUG_TABLE
-
 /*
  * Init sections discarded at runtime
  */
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index b4607155e8d0..76c1e60c92f3 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -40,7 +40,6 @@ SECTIONS
 	_etext = .;		/* End of text section */
 
 	NOTES :text :note
-	BUG_TABLE :text
 
 	RODATA
 
diff --git a/arch/sh/kernel/vmlinux_32.lds.S b/arch/sh/kernel/vmlinux_32.lds.S
index c7113786ecd4..7b4b82bd1156 100644
--- a/arch/sh/kernel/vmlinux_32.lds.S
+++ b/arch/sh/kernel/vmlinux_32.lds.S
@@ -44,7 +44,6 @@ SECTIONS
 
 	_etext = .;			/* End of text section */
 
-	BUG_TABLE
 	NOTES
 	RO_DATA(PAGE_SIZE)
 
diff --git a/arch/sh/kernel/vmlinux_64.lds.S b/arch/sh/kernel/vmlinux_64.lds.S
index d1e177009a41..33fa46451406 100644
--- a/arch/sh/kernel/vmlinux_64.lds.S
+++ b/arch/sh/kernel/vmlinux_64.lds.S
@@ -65,7 +65,6 @@ SECTIONS
 
 	_etext = .;			/* End of text section */
 
-	BUG_TABLE
 	NOTES 
 	RO_DATA(PAGE_SIZE)
 
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index ce5ed083a1e9..aa0855471c79 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -49,16 +49,14 @@ SECTIONS
   	_etext = .;			/* End of text section */
   } :text = 0x9090
 
+  NOTES :text :note
+
   . = ALIGN(16);		/* Exception table */
   __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
   	__start___ex_table = .;
 	 *(__ex_table)
   	__stop___ex_table = .;
-  }
-
-  NOTES :text :note
-
-  BUG_TABLE :text
+  } :text = 0x9090
 
   . = ALIGN(4);
   .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index fad3674b06a5..d123747af1e4 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -19,7 +19,7 @@ PHDRS {
 	data PT_LOAD FLAGS(7);	/* RWE */
 	user PT_LOAD FLAGS(7);	/* RWE */
 	data.init PT_LOAD FLAGS(7);	/* RWE */
-	note PT_NOTE FLAGS(4);	/* R__ */
+	note PT_NOTE FLAGS(0);	/* ___ */
 }
 SECTIONS
 {
@@ -40,16 +40,14 @@ SECTIONS
 	_etext = .;		/* End of text section */
   } :text = 0x9090
 
+  NOTES :text :note
+
   . = ALIGN(16);		/* Exception table */
   __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
   	__start___ex_table = .;
 	 *(__ex_table)
   	__stop___ex_table = .;
-  }
-
-  NOTES :text :note
-
-  BUG_TABLE :text
+  } :text = 0x9090
 
   RODATA
 
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index f054778e916c..dd2cc8122ad8 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -67,6 +67,8 @@
 		*(.rodata1)						\
 	}								\
 									\
+	BUG_TABLE							\
+									\
 	/* PCI quirks */						\
 	.pci_fixup        : AT(ADDR(.pci_fixup) - LOAD_OFFSET) {	\
 		VMLINUX_SYMBOL(__start_pci_fixups_early) = .;		\
@@ -310,6 +312,7 @@
 		.stab.indexstr 0 : { *(.stab.indexstr) }		\
 		.comment 0 : { *(.comment) }
 
+#ifdef CONFIG_GENERIC_BUG
 #define BUG_TABLE							\
 	. = ALIGN(8);							\
 	__bug_table : AT(ADDR(__bug_table) - LOAD_OFFSET) {		\
@@ -317,6 +320,9 @@
 		*(__bug_table)						\
 		__stop___bug_table = .;					\
 	}
+#else
+#define BUG_TABLE
+#endif
 
 #define NOTES								\
 	.notes : AT(ADDR(.notes) - LOAD_OFFSET) {			\
-- 
cgit v1.2.3


From 63687a528c39a67c1a213cdffa09feb0e6af9dbe Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Mon, 12 May 2008 15:44:41 +0200
Subject: x86: move tracedata to RODATA

.. allowing it to be write-protected just as other read-only data
under CONFIG_DEBUG_RODATA.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/vmlinux_32.lds.S  |  7 -------
 arch/x86/kernel/vmlinux_64.lds.S  |  7 -------
 drivers/base/power/trace.c        |  2 +-
 include/asm-generic/vmlinux.lds.h | 14 ++++++++++++++
 include/asm-x86/resume-trace.h    |  2 +-
 include/linux/resume-trace.h      |  2 +-
 6 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index ce5ed083a1e9..2674f5796275 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -60,13 +60,6 @@ SECTIONS
 
   BUG_TABLE :text
 
-  . = ALIGN(4);
-  .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
-  	__tracedata_start = .;
-	*(.tracedata)
-  	__tracedata_end = .;
-  }
-
   RODATA
 
   /* writeable */
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index fad3674b06a5..687041bfbae5 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -53,13 +53,6 @@ SECTIONS
 
   RODATA
 
-  . = ALIGN(4);
-  .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
-  	__tracedata_start = .;
-	*(.tracedata)
-  	__tracedata_end = .;
-  }
-
   . = ALIGN(PAGE_SIZE);		/* Align data segment to page size boundary */
 				/* Data */
   .data : AT(ADDR(.data) - LOAD_OFFSET) {
diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c
index 2b4b392dcbc1..87a7f1d02578 100644
--- a/drivers/base/power/trace.c
+++ b/drivers/base/power/trace.c
@@ -153,7 +153,7 @@ EXPORT_SYMBOL(set_trace_device);
  * it's not any guarantee, but it's a high _likelihood_ that
  * the match is valid).
  */
-void generate_resume_trace(void *tracedata, unsigned int user)
+void generate_resume_trace(const void *tracedata, unsigned int user)
 {
 	unsigned short lineno = *(unsigned short *)tracedata;
 	const char *file = *(const char **)(tracedata + 2);
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index f054778e916c..f1992dc5c424 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -93,6 +93,8 @@
 		VMLINUX_SYMBOL(__end_rio_route_ops) = .;		\
 	}								\
 									\
+	TRACEDATA							\
+									\
 	/* Kernel symbol table: Normal symbols */			\
 	__ksymtab         : AT(ADDR(__ksymtab) - LOAD_OFFSET) {		\
 		VMLINUX_SYMBOL(__start___ksymtab) = .;			\
@@ -318,6 +320,18 @@
 		__stop___bug_table = .;					\
 	}
 
+#ifdef CONFIG_PM_TRACE
+#define TRACEDATA							\
+	. = ALIGN(4);							\
+	.tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {		\
+	  	__tracedata_start = .;					\
+		*(.tracedata)						\
+	  	__tracedata_end = .;					\
+	}
+#else
+#define TRACEDATA
+#endif
+
 #define NOTES								\
 	.notes : AT(ADDR(.notes) - LOAD_OFFSET) {			\
 		VMLINUX_SYMBOL(__start_notes) = .;			\
diff --git a/include/asm-x86/resume-trace.h b/include/asm-x86/resume-trace.h
index 2557514d7ef6..8d9f0b41ee86 100644
--- a/include/asm-x86/resume-trace.h
+++ b/include/asm-x86/resume-trace.h
@@ -6,7 +6,7 @@
 #define TRACE_RESUME(user)					\
 do {								\
 	if (pm_trace_enabled) {					\
-		void *tracedata;				\
+		const void *tracedata;				\
 		asm volatile(_ASM_MOV_UL " $1f,%0\n"		\
 			     ".section .tracedata,\"a\"\n"	\
 			     "1:\t.word %c1\n\t"		\
diff --git a/include/linux/resume-trace.h b/include/linux/resume-trace.h
index f3f4f28c6960..c9ba2fdf807d 100644
--- a/include/linux/resume-trace.h
+++ b/include/linux/resume-trace.h
@@ -8,7 +8,7 @@ extern int pm_trace_enabled;
 
 struct device;
 extern void set_trace_device(struct device *);
-extern void generate_resume_trace(void *tracedata, unsigned int user);
+extern void generate_resume_trace(const void *tracedata, unsigned int user);
 
 #define TRACE_DEVICE(dev) do { \
 	if (pm_trace_enabled) \
-- 
cgit v1.2.3


From a2eddfa95919a730e0e5ed17e9c303fe5ba249cd Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Mon, 12 May 2008 15:44:41 +0200
Subject: x86: make /proc/stat account for all interrupts

LAPIC interrupts, which don't go through the generic interrupt handling
code, aren't accounted for in /proc/stat. Hence this patch adds a
mechanism architectures can use to accordingly adjust the statistics.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/irq_32.c  | 38 ++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/irq_64.c  | 28 ++++++++++++++++++++++++++++
 fs/proc/proc_misc.c       |  9 +++++++++
 include/asm-x86/hardirq.h |  6 ++++++
 4 files changed, 81 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 147352df28b9..468acd04aa2e 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -313,16 +313,20 @@ skip:
 				per_cpu(irq_stat,j).irq_tlb_count);
 		seq_printf(p, "  TLB shootdowns\n");
 #endif
+#ifdef CONFIG_X86_MCE
 		seq_printf(p, "TRM: ");
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ",
 				per_cpu(irq_stat,j).irq_thermal_count);
 		seq_printf(p, "  Thermal event interrupts\n");
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
 		seq_printf(p, "SPU: ");
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ",
 				per_cpu(irq_stat,j).irq_spurious_count);
 		seq_printf(p, "  Spurious interrupts\n");
+#endif
 		seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
 #if defined(CONFIG_X86_IO_APIC)
 		seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
@@ -331,6 +335,40 @@ skip:
 	return 0;
 }
 
+/*
+ * /proc/stat helpers
+ */
+u64 arch_irq_stat_cpu(unsigned int cpu)
+{
+	u64 sum = nmi_count(cpu);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	sum += per_cpu(irq_stat, cpu).apic_timer_irqs;
+#endif
+#ifdef CONFIG_SMP
+	sum += per_cpu(irq_stat, cpu).irq_resched_count;
+	sum += per_cpu(irq_stat, cpu).irq_call_count;
+	sum += per_cpu(irq_stat, cpu).irq_tlb_count;
+#endif
+#ifdef CONFIG_X86_MCE
+	sum += per_cpu(irq_stat, cpu).irq_thermal_count;
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+	sum += per_cpu(irq_stat, cpu).irq_spurious_count;
+#endif
+	return sum;
+}
+
+u64 arch_irq_stat(void)
+{
+	u64 sum = atomic_read(&irq_err_count);
+
+#ifdef CONFIG_X86_IO_APIC
+	sum += atomic_read(&irq_mis_count);
+#endif
+	return sum;
+}
+
 #ifdef CONFIG_HOTPLUG_CPU
 #include <mach_apic.h>
 
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 3aac15466a91..1f78b238d8d2 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -135,6 +135,7 @@ skip:
 			seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
 		seq_printf(p, "  TLB shootdowns\n");
 #endif
+#ifdef CONFIG_X86_MCE
 		seq_printf(p, "TRM: ");
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count);
@@ -143,6 +144,7 @@ skip:
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count);
 		seq_printf(p, "  Threshold APIC interrupts\n");
+#endif
 		seq_printf(p, "SPU: ");
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count);
@@ -152,6 +154,32 @@ skip:
 	return 0;
 }
 
+/*
+ * /proc/stat helpers
+ */
+u64 arch_irq_stat_cpu(unsigned int cpu)
+{
+	u64 sum = cpu_pda(cpu)->__nmi_count;
+
+	sum += cpu_pda(cpu)->apic_timer_irqs;
+#ifdef CONFIG_SMP
+	sum += cpu_pda(cpu)->irq_resched_count;
+	sum += cpu_pda(cpu)->irq_call_count;
+	sum += cpu_pda(cpu)->irq_tlb_count;
+#endif
+#ifdef CONFIG_X86_MCE
+	sum += cpu_pda(cpu)->irq_thermal_count;
+	sum += cpu_pda(cpu)->irq_threshold_count;
+#endif
+	sum += cpu_pda(cpu)->irq_spurious_count;
+	return sum;
+}
+
+u64 arch_irq_stat(void)
+{
+	return atomic_read(&irq_err_count);
+}
+
 /*
  * do_IRQ handles all normal device IRQ's (the special
  * SMP cross-CPU interrupts have their own specific
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 74a323d2b850..903e617bec58 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -472,6 +472,13 @@ static const struct file_operations proc_vmalloc_operations = {
 };
 #endif
 
+#ifndef arch_irq_stat_cpu
+#define arch_irq_stat_cpu(cpu) 0
+#endif
+#ifndef arch_irq_stat
+#define arch_irq_stat() 0
+#endif
+
 static int show_stat(struct seq_file *p, void *v)
 {
 	int i;
@@ -509,7 +516,9 @@ static int show_stat(struct seq_file *p, void *v)
 			sum += temp;
 			per_irq_sum[j] += temp;
 		}
+		sum += arch_irq_stat_cpu(i);
 	}
+	sum += arch_irq_stat();
 
 	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
 		(unsigned long long)cputime64_to_clock_t(user),
diff --git a/include/asm-x86/hardirq.h b/include/asm-x86/hardirq.h
index 314434d664e7..000787df66e6 100644
--- a/include/asm-x86/hardirq.h
+++ b/include/asm-x86/hardirq.h
@@ -3,3 +3,9 @@
 #else
 # include "hardirq_64.h"
 #endif
+
+extern u64 arch_irq_stat_cpu(unsigned int cpu);
+#define arch_irq_stat_cpu	arch_irq_stat_cpu
+
+extern u64 arch_irq_stat(void);
+#define arch_irq_stat		arch_irq_stat
-- 
cgit v1.2.3


From c2c14fb7afcc67b48724571506e095fccc65a670 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 12 May 2008 15:43:36 +0200
Subject: x86: tsc_64.c make constant UL

arch/x86/kernel/tsc_64.c:245:13: warning: constant 0x100000000 is so big it is long

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tsc_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
index fcc16e58609e..c19b0e273ef1 100644
--- a/arch/x86/kernel/tsc_64.c
+++ b/arch/x86/kernel/tsc_64.c
@@ -242,7 +242,7 @@ void __init tsc_calibrate(void)
 	if (hpet) {
 		printk(KERN_INFO "TSC calibrated against HPET\n");
 		if (hpet2 < hpet1)
-			hpet2 += 0x100000000;
+			hpet2 += 0x100000000UL;
 		hpet2 -= hpet1;
 		tsc1 = (hpet2 * hpet_readl(HPET_PERIOD)) / 1000000;
 	} else {
-- 
cgit v1.2.3


From eef8f871d84b5df1a24902a4e4700188be1aff2c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 12 May 2008 15:43:34 +0200
Subject: x86: vsmp_64 add missing includes

sparse mutters:
arch/x86/kernel/vsmp_64.c:126:5: warning: symbol 'is_vsmp_box' was not declared. Should it be static?
arch/x86/kernel/vsmp_64.c:145:13: warning: symbol 'vsmp_init' was not declared. Should it be static?

Include the appropriate headers.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vsmp_64.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index ba8c0b75ab0a..0c029e8959c7 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -15,9 +15,12 @@
 #include <linux/init.h>
 #include <linux/pci_ids.h>
 #include <linux/pci_regs.h>
+
+#include <asm/apic.h>
 #include <asm/pci-direct.h>
 #include <asm/io.h>
 #include <asm/paravirt.h>
+#include <asm/setup.h>
 
 #if defined CONFIG_PCI && defined CONFIG_PARAVIRT
 /*
-- 
cgit v1.2.3


From d1097635deed7196c2a859f285c29267779ca45a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 2 May 2008 23:42:01 +0200
Subject: x86: move mmconfig declarations to header

arch/x86/kernel/mmconf-fam10h_64.c is missing the prototypes, which
are decalred in arch/x86/kernel/setup_64.c. Move the prototypes and
the inline stubs to the appropriate header file.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/mmconf-fam10h_64.c |  1 +
 arch/x86/kernel/setup_64.c         | 13 +------------
 include/asm-x86/mmconfig.h         | 12 ++++++++++++
 3 files changed, 14 insertions(+), 12 deletions(-)
 create mode 100644 include/asm-x86/mmconfig.h

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index edc5fbfe85c0..fdfdc550b366 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -12,6 +12,7 @@
 #include <asm/io.h>
 #include <asm/msr.h>
 #include <asm/acpi.h>
+#include <asm/mmconfig.h>
 
 #include "../pci/pci.h"
 
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 6dff1286ad8a..341230db74e1 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -71,6 +71,7 @@
 #include <asm/topology.h>
 #include <asm/trampoline.h>
 #include <asm/pat.h>
+#include <asm/mmconfig.h>
 
 #include <mach_apic.h>
 #ifdef CONFIG_PARAVIRT
@@ -293,18 +294,6 @@ static void __init parse_setup_data(void)
 	}
 }
 
-#ifdef CONFIG_PCI_MMCONFIG
-extern void __cpuinit fam10h_check_enable_mmcfg(void);
-extern void __init check_enable_amd_mmconf_dmi(void);
-#else
-void __cpuinit fam10h_check_enable_mmcfg(void)
-{
-}
-void __init check_enable_amd_mmconf_dmi(void)
-{
-}
-#endif
-
 /*
  * setup_arch - architecture-specific boot-time initializations
  *
diff --git a/include/asm-x86/mmconfig.h b/include/asm-x86/mmconfig.h
new file mode 100644
index 000000000000..95beda07c6fa
--- /dev/null
+++ b/include/asm-x86/mmconfig.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_MMCONFIG_H
+#define _ASM_MMCONFIG_H
+
+#ifdef CONFIG_PCI_MMCONFIG
+extern void __cpuinit fam10h_check_enable_mmcfg(void);
+extern void __init check_enable_amd_mmconf_dmi(void);
+#else
+static inline void fam10h_check_enable_mmcfg(void) { }
+static inline void check_enable_amd_mmconf_dmi(void) { }
+#endif
+
+#endif
-- 
cgit v1.2.3


From e0b32d768cad17af07af3aced67498bde98296c9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 12 May 2008 15:43:37 +0200
Subject: x86: make command_line static in setup_64.c

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 341230db74e1..78c1be69c6d3 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -119,7 +119,7 @@ EXPORT_SYMBOL_GPL(edid_info);
 
 extern int root_mountflags;
 
-char __initdata command_line[COMMAND_LINE_SIZE];
+static char __initdata command_line[COMMAND_LINE_SIZE];
 
 static struct resource standard_io_resources[] = {
 	{ .name = "dma1", .start = 0x00, .end = 0x1f,
-- 
cgit v1.2.3


From 311f83494830fec17f086401a5b43984bb447cd2 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Mon, 12 May 2008 15:43:37 +0200
Subject: x86: kernel/pci-dma.c cleanups

This patch contains the following cleanups:
- make the following needlessly global code static:
  - dma_alloc_pages()

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/pci-dma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index c5ef1af8e79d..5bc29ca4196e 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -357,7 +357,7 @@ int dma_supported(struct device *dev, u64 mask)
 EXPORT_SYMBOL(dma_supported);
 
 /* Allocate DMA memory on node near device */
-noinline struct page *
+static noinline struct page *
 dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
 {
 	int node;
-- 
cgit v1.2.3


From ebdd561a19d7917ac49fff9c355aa4833c504bf1 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Mon, 12 May 2008 15:43:38 +0200
Subject: x86: constify data in reboot.c

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/reboot.c           | 18 +++++++++---------
 arch/x86/kernel/reboot_fixups_32.c |  4 ++--
 include/asm-x86/reboot.h           |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f6be7d5f82f8..f8a62160e151 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -27,7 +27,7 @@
 void (*pm_power_off)(void);
 EXPORT_SYMBOL(pm_power_off);
 
-static long no_idt[3];
+static const struct desc_ptr no_idt = {};
 static int reboot_mode;
 enum reboot_type reboot_type = BOOT_KBD;
 int reboot_force;
@@ -201,15 +201,15 @@ core_initcall(reboot_init);
    controller to pulse the CPU reset line, which is more thorough, but
    doesn't work with at least one type of 486 motherboard.  It is easy
    to stop this code working; hence the copious comments. */
-static unsigned long long
+static const unsigned long long
 real_mode_gdt_entries [3] =
 {
 	0x0000000000000000ULL,	/* Null descriptor */
-	0x00009a000000ffffULL,	/* 16-bit real-mode 64k code at 0x00000000 */
-	0x000092000100ffffULL	/* 16-bit real-mode 64k data at 0x00000100 */
+	0x00009b000000ffffULL,	/* 16-bit real-mode 64k code at 0x00000000 */
+	0x000093000100ffffULL	/* 16-bit real-mode 64k data at 0x00000100 */
 };
 
-static struct desc_ptr
+static const struct desc_ptr
 real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
 real_mode_idt = { 0x3ff, 0 };
 
@@ -231,7 +231,7 @@ real_mode_idt = { 0x3ff, 0 };
 
    More could be done here to set up the registers as if a CPU reset had
    occurred; hopefully real BIOSs don't assume much. */
-static unsigned char real_mode_switch [] =
+static const unsigned char real_mode_switch [] =
 {
 	0x66, 0x0f, 0x20, 0xc0,			/*    movl  %cr0,%eax        */
 	0x66, 0x83, 0xe0, 0x11,			/*    andl  $0x00000011,%eax */
@@ -245,7 +245,7 @@ static unsigned char real_mode_switch [] =
 	0x24, 0x10,				/* f: andb  $0x10,al         */
 	0x66, 0x0f, 0x22, 0xc0			/*    movl  %eax,%cr0        */
 };
-static unsigned char jump_to_bios [] =
+static const unsigned char jump_to_bios [] =
 {
 	0xea, 0x00, 0x00, 0xff, 0xff		/*    ljmp  $0xffff,$0x0000  */
 };
@@ -255,7 +255,7 @@ static unsigned char jump_to_bios [] =
  * specified by the code and length parameters.
  * We assume that length will aways be less that 100!
  */
-void machine_real_restart(unsigned char *code, int length)
+void machine_real_restart(const unsigned char *code, int length)
 {
 	local_irq_disable();
 
@@ -368,7 +368,7 @@ static void native_machine_emergency_restart(void)
 			}
 
 		case BOOT_TRIPLE:
-			load_idt((const struct desc_ptr *)&no_idt);
+			load_idt(&no_idt);
 			__asm__ __volatile__("int3");
 
 			reboot_type = BOOT_KBD;
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index dec0b5ec25c2..61a837743fe5 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -49,7 +49,7 @@ struct device_fixup {
 	void (*reboot_fixup)(struct pci_dev *);
 };
 
-static struct device_fixup fixups_table[] = {
+static const struct device_fixup fixups_table[] = {
 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
 { PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset },
@@ -64,7 +64,7 @@ static struct device_fixup fixups_table[] = {
  */
 void mach_reboot_fixups(void)
 {
-	struct device_fixup *cur;
+	const struct device_fixup *cur;
 	struct pci_dev *dev;
 	int i;
 
diff --git a/include/asm-x86/reboot.h b/include/asm-x86/reboot.h
index e63741f19392..206f355786dc 100644
--- a/include/asm-x86/reboot.h
+++ b/include/asm-x86/reboot.h
@@ -14,8 +14,8 @@ struct machine_ops {
 
 extern struct machine_ops machine_ops;
 
-void machine_real_restart(unsigned char *code, int length);
 void native_machine_crash_shutdown(struct pt_regs *regs);
 void native_machine_shutdown(void);
+void machine_real_restart(const unsigned char *code, int length);
 
 #endif	/* _ASM_REBOOT_H */
-- 
cgit v1.2.3


From 369101da7e3f6f971b7303922d2978b8483241c6 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Mon, 12 May 2008 15:43:38 +0200
Subject: x86: head_64.S cleanup - use predefined flags from processor-flags.h

We should better use already defined flags from processor-flags.h instead
of defining own ones

[>>> object code check >>>]

original
md5sum: 9cfa6dbf045a046bb5dfb85f8bcfe8c4  arch/x86/kernel/head_64.o
   text    data     bss     dec     hex filename
  37361    4432    8192   49985    c341 arch/x86/kernel/head_64.o

patched
md5sum: 9cfa6dbf045a046bb5dfb85f8bcfe8c4  arch/x86/kernel/head_64.o
   text    data     bss     dec     hex filename
  37361    4432    8192   49985    c341 arch/x86/kernel/head_64.o

[<<< object code check <<<]

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/head_64.S | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 10a1955bb1d1..2f59ca8bd16c 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -18,6 +18,7 @@
 #include <asm/page.h>
 #include <asm/msr.h>
 #include <asm/cache.h>
+#include <asm/processor-flags.h>
 
 #ifdef CONFIG_PARAVIRT
 #include <asm/asm-offsets.h>
@@ -184,14 +185,10 @@ ENTRY(secondary_startup_64)
 1:	wrmsr				/* Make changes effective */
 
 	/* Setup cr0 */
-#define CR0_PM				1		/* protected mode */
-#define CR0_MP				(1<<1)
-#define CR0_ET				(1<<4)
-#define CR0_NE				(1<<5)
-#define CR0_WP				(1<<16)
-#define CR0_AM				(1<<18)
-#define CR0_PAGING 			(1<<31)
-	movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax
+#define CR0_STATE	(X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
+			 X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
+			 X86_CR0_PG)
+	movl	$CR0_STATE, %eax
 	/* Make changes effective */
 	movq	%rax, %cr0
 
-- 
cgit v1.2.3


From 2cc74111c78dbc7b26c41b4e87cfebfc3aed49c4 Mon Sep 17 00:00:00 2001
From: Huang Weiyi <weiyi.huang@gmail.com>
Date: Sun, 11 May 2008 19:35:54 +0800
Subject: x86: ipi.c: removed duplicated include

Removed duplicated include <linux/interrupt.h> in
arch/x86/kernel/ipi.c.

Signed-off-by: Huang Weiyi <weiyi.huang@gmail.com>
Cc: mingo@redhat.com
Cc: hpa@zytor.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/ipi.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index c0df7b89ca23..9d98cda39ad9 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -8,7 +8,6 @@
 #include <linux/kernel_stat.h>
 #include <linux/mc146818rtc.h>
 #include <linux/cache.h>
-#include <linux/interrupt.h>
 #include <linux/cpu.h>
 #include <linux/module.h>
 
-- 
cgit v1.2.3


From 883b7af932b4435eb4798cfa4fec0848639c2a87 Mon Sep 17 00:00:00 2001
From: Huang Weiyi <weiyi.huang@gmail.com>
Date: Sun, 11 May 2008 19:36:57 +0800
Subject: x86: smpboot.c: removed duplicated include

Removed duplicated include <asm/nmi.h> in
arch/x86/kernel/smpboot.c.

Signed-off-by: Huang Weiyi <weiyi.huang@gmail.com>
Cc: mingo@redhat.com
Cc: hpa@zytor.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/smpboot.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 38988491c622..0974fc0997b9 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -59,7 +59,6 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mtrr.h>
-#include <asm/nmi.h>
 #include <asm/vmi.h>
 #include <asm/genapic.h>
 #include <linux/mc146818rtc.h>
-- 
cgit v1.2.3


From 05139d8fb445d9a3569071af1c5920fc46191b7b Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Tue, 13 May 2008 21:14:22 +0400
Subject: x86: head_64.S cleanup - use straight move to CR4 register

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/head_64.S | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 2f59ca8bd16c..7475b4c91f05 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -155,9 +155,7 @@ ENTRY(secondary_startup_64)
 	 */
 
 	/* Enable PAE mode and PGE */
-	xorq	%rax, %rax
-	btsq	$5, %rax
-	btsq	$7, %rax
+	movl	$(X86_CR4_PAE | X86_CR4_PGE), %eax
 	movq	%rax, %cr4
 
 	/* Setup early boot stage 4 level pagetables. */
-- 
cgit v1.2.3


From 0e192b99d7d09f08b445c31ee7c7f3d0bfb27345 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Tue, 13 May 2008 20:55:40 +0400
Subject: x86: head_64.S cleanup - use PMD_SHIFT instead of numeric constant

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/head_64.S | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 7475b4c91f05..d8ed3259e6e4 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -322,11 +322,11 @@ early_idt_ripmsg:
 ENTRY(name)
 
 /* Automate the creation of 1 to 1 mapping pmd entries */
-#define PMDS(START, PERM, COUNT)		\
-	i = 0 ;					\
-	.rept (COUNT) ;				\
-	.quad	(START) + (i << 21) + (PERM) ;	\
-	i = i + 1 ;				\
+#define PMDS(START, PERM, COUNT)			\
+	i = 0 ;						\
+	.rept (COUNT) ;					\
+	.quad	(START) + (i << PMD_SHIFT) + (PERM) ;	\
+	i = i + 1 ;					\
 	.endr
 
 	/*
-- 
cgit v1.2.3


From bfe4bb1526945e446d2912bef2e1e2cbd2c7349e Mon Sep 17 00:00:00 2001
From: Miklos Vajna <vmiklos@frugalware.org>
Date: Sat, 17 May 2008 22:48:13 +0200
Subject: x86: janitor work in bugs.c

Just moved trailing statements to the next line, removed space before
open/close parenthesis, wrapped long lines.

Signed-off-by: Miklos Vajna <vmiklos@frugalware.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/bugs.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 170d2f5523b2..1b1c56bb338f 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -59,8 +59,12 @@ static void __init check_fpu(void)
 		return;
 	}
 
-/* trap_init() enabled FXSR and company _before_ testing for FP problems here. */
-	/* Test for the divl bug.. */
+	/*
+	 * trap_init() enabled FXSR and company _before_ testing for FP
+	 * problems here.
+	 *
+	 * Test for the divl bug..
+	 */
 	__asm__("fninit\n\t"
 		"fldl %1\n\t"
 		"fdivl %2\n\t"
@@ -108,10 +112,15 @@ static void __init check_popad(void)
 	  "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx "
 	  : "=&a" (res)
 	  : "d" (inp)
-	  : "ecx", "edi" );
-	/* If this fails, it means that any user program may lock the CPU hard. Too bad. */
-	if (res != 12345678) printk( "Buggy.\n" );
-		        else printk( "OK.\n" );
+	  : "ecx", "edi");
+	/*
+	 * If this fails, it means that any user program may lock the
+	 * CPU hard. Too bad.
+	 */
+	if (res != 12345678)
+		printk("Buggy.\n");
+	else
+		printk("OK.\n");
 #endif
 }
 
@@ -137,7 +146,8 @@ static void __init check_config(void)
  * i486+ only features! (WP works in supervisor mode and the
  * new "invlpg" and "bswap" instructions)
  */
-#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_BSWAP)
+#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || \
+	defined(CONFIG_X86_BSWAP)
 	if (boot_cpu_data.x86 == 3)
 		panic("Kernel requires i486+ for 'invlpg' and other features");
 #endif
@@ -170,6 +180,7 @@ void __init check_bugs(void)
 	check_fpu();
 	check_hlt();
 	check_popad();
-	init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
+	init_utsname()->machine[1] =
+		'0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
 	alternative_instructions();
 }
-- 
cgit v1.2.3


From 83cd1daa1dc3400e5ca2255818641233ebb30680 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Mon, 12 May 2008 15:44:39 +0200
Subject: x86: eliminate dead code in x86_64 entry.S

Remove the not longer used handlers for reserved vectors.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/entry_64.S | 4 ----
 arch/x86/kernel/traps_64.c | 3 ---
 2 files changed, 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 556a8df522a7..1f1751dfb2f3 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1120,10 +1120,6 @@ ENTRY(coprocessor_segment_overrun)
 	zeroentry do_coprocessor_segment_overrun
 END(coprocessor_segment_overrun)
 
-ENTRY(reserved)
-	zeroentry do_reserved
-END(reserved)
-
 	/* runs on exception stack */
 ENTRY(double_fault)
 	XCPT_FRAME
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index adff76ea97c4..ec6d3b2130c4 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -71,7 +71,6 @@ asmlinkage void general_protection(void);
 asmlinkage void page_fault(void);
 asmlinkage void coprocessor_error(void);
 asmlinkage void simd_coprocessor_error(void);
-asmlinkage void reserved(void);
 asmlinkage void alignment_check(void);
 asmlinkage void machine_check(void);
 asmlinkage void spurious_interrupt_bug(void);
@@ -702,12 +701,10 @@ DO_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->ip)
 DO_ERROR( 4, SIGSEGV, "overflow", overflow)
 DO_ERROR( 5, SIGSEGV, "bounds", bounds)
 DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
-DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
 DO_ERROR( 9, SIGFPE,  "coprocessor segment overrun", coprocessor_segment_overrun)
 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
 DO_ERROR(11, SIGBUS,  "segment not present", segment_not_present)
 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
-DO_ERROR(18, SIGSEGV, "reserved", reserved)
 
 /* Runs on IST stack */
 asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
-- 
cgit v1.2.3


From 0da72a4aeb4482c64c1142a2e36b556d13374937 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 30 Apr 2008 20:11:51 +0200
Subject: x86: fix sparse warning in mtrr/generic.c

arch/x86/kernel/cpu/mtrr/generic.c:216:12: warning: symbol 'lo' shadows an earlier one

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mtrr/generic.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 5d241ce94a44..0625d4158e58 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -213,12 +213,12 @@ void __init get_mtrr_state(void)
 	mtrr_state.enabled = (lo & 0xc00) >> 10;
 
 	if (amd_special_default_mtrr()) {
-		unsigned lo, hi;
+		unsigned low, high;
 		/* TOP_MEM2 */
-		rdmsr(MSR_K8_TOP_MEM2, lo, hi);
-		tom2 = hi;
+		rdmsr(MSR_K8_TOP_MEM2, low, high);
+		tom2 = high;
 		tom2 <<= 32;
-		tom2 |= lo;
+		tom2 |= low;
 		tom2 &= 0xffffff8000000ULL;
 	}
 	if (mtrr_show) {
-- 
cgit v1.2.3


From 0dbfafa5fcd4dd189e2adc7b6ed9e0405e846d79 Mon Sep 17 00:00:00 2001
From: Alexander van Heukelum <heukelum@mailshack.com>
Date: Wed, 23 Apr 2008 15:09:05 +0200
Subject: x86: move i386 memory setup code to e820_32.c

The x86_64 code has centralized the memory setup code in
e820_64.c. This patch copies that approach to i386:

- early_param("mem", ...) parsing is moved from
setup_32.c to e820_32.c.

- setup_memory_map() and finish_e820_parsing() are
factored out from setup_arch(), and declarations
are added to e820_32.h.

- print_memory_map() is made static and removed from
e820_32.h.

- user_defined_memmap is marked as __initdata.

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/e820_32.c  | 60 ++++++++++++++++++++++++++++++++++++++++++++--
 arch/x86/kernel/setup_32.c | 50 ++------------------------------------
 include/asm-x86/e820_32.h  |  4 +++-
 3 files changed, 63 insertions(+), 51 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
index ed733e7cf4e6..31ea2bb8c91a 100644
--- a/arch/x86/kernel/e820_32.c
+++ b/arch/x86/kernel/e820_32.c
@@ -30,7 +30,6 @@ unsigned long pci_mem_start = 0x10000000;
 #ifdef CONFIG_PCI
 EXPORT_SYMBOL(pci_mem_start);
 #endif
-extern int user_defined_memmap;
 
 static struct resource system_rom_resource = {
 	.name	= "System ROM",
@@ -584,7 +583,7 @@ void __init e820_register_memory(void)
 		pci_mem_start, gapstart, gapsize);
 }
 
-void __init print_memory_map(char *who)
+static void __init print_memory_map(char *who)
 {
 	int i;
 
@@ -692,6 +691,54 @@ e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
 	return 0;
 }
 
+/* Overridden in paravirt.c if CONFIG_PARAVIRT */
+char * __init __attribute__((weak)) memory_setup(void)
+{
+	return machine_specific_memory_setup();
+}
+
+void __init setup_memory_map(void)
+{
+	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+	print_memory_map(memory_setup());
+}
+
+static int __initdata user_defined_memmap;
+
+/*
+ * "mem=nopentium" disables the 4MB page tables.
+ * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
+ * to <mem>, overriding the bios size.
+ * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
+ * <start> to <start>+<mem>, overriding the bios size.
+ *
+ * HPA tells me bootloaders need to parse mem=, so no new
+ * option should be mem=  [also see Documentation/i386/boot.txt]
+ */
+static int __init parse_mem(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+
+	if (strcmp(arg, "nopentium") == 0) {
+		setup_clear_cpu_cap(X86_FEATURE_PSE);
+	} else {
+		/* If the user specifies memory size, we
+		 * limit the BIOS-provided memory map to
+		 * that size. exactmap can be used to specify
+		 * the exact map. mem=number can be used to
+		 * trim the existing memory map.
+		 */
+		unsigned long long mem_size;
+
+		mem_size = memparse(arg, &arg);
+		limit_regions(mem_size);
+		user_defined_memmap = 1;
+	}
+	return 0;
+}
+early_param("mem", parse_mem);
+
 static int __init parse_memmap(char *arg)
 {
 	if (!arg)
@@ -762,6 +809,15 @@ void __init update_memory_range(u64 start, u64 size, unsigned old_type,
 					 new_type);
 	}
 }
+
+void __init finish_e820_parsing(void)
+{
+	if (user_defined_memmap) {
+		printk(KERN_INFO "user-defined physical RAM map:\n");
+		print_memory_map("user");
+	}
+}
+
 void __init update_e820(void)
 {
 	u8 nr_map;
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 2c5f8b213e86..b54c79c91efd 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -237,42 +237,6 @@ static inline void copy_edd(void)
 }
 #endif
 
-int __initdata user_defined_memmap;
-
-/*
- * "mem=nopentium" disables the 4MB page tables.
- * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
- * to <mem>, overriding the bios size.
- * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
- * <start> to <start>+<mem>, overriding the bios size.
- *
- * HPA tells me bootloaders need to parse mem=, so no new
- * option should be mem=  [also see Documentation/i386/boot.txt]
- */
-static int __init parse_mem(char *arg)
-{
-	if (!arg)
-		return -EINVAL;
-
-	if (strcmp(arg, "nopentium") == 0) {
-		setup_clear_cpu_cap(X86_FEATURE_PSE);
-	} else {
-		/* If the user specifies memory size, we
-		 * limit the BIOS-provided memory map to
-		 * that size. exactmap can be used to specify
-		 * the exact map. mem=number can be used to
-		 * trim the existing memory map.
-		 */
-		unsigned long long mem_size;
-
-		mem_size = memparse(arg, &arg);
-		limit_regions(mem_size);
-		user_defined_memmap = 1;
-	}
-	return 0;
-}
-early_param("mem", parse_mem);
-
 #ifdef CONFIG_PROC_VMCORE
 /* elfcorehdr= specifies the location of elf core header
  * stored by the crashed kernel.
@@ -725,12 +689,6 @@ static void set_mca_bus(int x)
 static void set_mca_bus(int x) { }
 #endif
 
-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
-char * __init __attribute__((weak)) memory_setup(void)
-{
-	return machine_specific_memory_setup();
-}
-
 #ifdef CONFIG_NUMA
 /*
  * In the golden day, when everything among i386 and x86_64 will be
@@ -786,8 +744,7 @@ void __init setup_arch(char **cmdline_p)
 #endif
 	ARCH_SETUP
 
-	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
-	print_memory_map(memory_setup());
+	setup_memory_map();
 
 	copy_edd();
 
@@ -807,10 +764,7 @@ void __init setup_arch(char **cmdline_p)
 
 	parse_early_param();
 
-	if (user_defined_memmap) {
-		printk(KERN_INFO "user-defined physical RAM map:\n");
-		print_memory_map("user");
-	}
+	finish_e820_parsing();
 
 	strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
 	*cmdline_p = command_line;
diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h
index a9f7c6ec32bf..e1f10c60901f 100644
--- a/include/asm-x86/e820_32.h
+++ b/include/asm-x86/e820_32.h
@@ -18,6 +18,9 @@
 
 #ifndef __ASSEMBLY__
 
+extern void setup_memory_map(void);
+extern void finish_e820_parsing(void);
+
 extern struct e820map e820;
 extern void update_e820(void);
 
@@ -32,7 +35,6 @@ extern void update_memory_range(u64 start, u64 size, unsigned old_type,
 				unsigned new_type);
 extern void e820_register_memory(void);
 extern void limit_regions(unsigned long long size);
-extern void print_memory_map(char *who);
 extern void init_iomem_resources(struct resource *code_resource,
 				 struct resource *data_resource,
 				 struct resource *bss_resource);
-- 
cgit v1.2.3


From 95ffa2438d0e9c48779f0106b1c0eb36165e759c Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel.send@gmail.com>
Date: Tue, 29 Apr 2008 03:52:33 -0700
Subject: x86: mtrr cleanup for converting continuous to discrete layout, v8

some BIOS like to use continus MTRR layout, and X driver can not add
WB entries for graphical cards when 4g or more RAM installed.

the patch will change MTRR to discrete.

mtrr_chunk_size= could be used to have smaller continuous block to hold holes.
default is 256m, could be set according to size of graphics card memory.

mtrr_gran_size= could be used to send smallest mtrr block to avoid run out of MTRRs

v2: fix -1 for UC checking
v3: default to disable, and need use enable_mtrr_cleanup to enable this feature
    skip the var state change warning.
    remove next_basek in range_to_mtrr()
v4: correct warning mask.
v5: CONFIG_MTRR_SANITIZER
v6: fix 1g, 2g, 512 aligment with extra hole
v7: gran_sizek to prevent running out of MTRRs.
v8: fix hole_basek caculation caused when removing next_basek
    gran_sizek using when basek is 0.

need to apply
	[PATCH] x86: fix trimming e820 with MTRR holes.
right after this one.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 Documentation/kernel-parameters.txt |  14 ++
 arch/x86/Kconfig                    |  26 ++
 arch/x86/kernel/cpu/mtrr/generic.c  |  32 ++-
 arch/x86/kernel/cpu/mtrr/main.c     | 467 +++++++++++++++++++++++++++++++++++-
 arch/x86/kernel/cpu/mtrr/mtrr.h     |   3 +
 5 files changed, 528 insertions(+), 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e07c432c731f..44427607b7c6 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -599,6 +599,20 @@ and is between 256 and 4096 characters. It is defined in the file
 			See drivers/char/README.epca and
 			Documentation/digiepca.txt.
 
+	disable_mtrr_cleanup [X86]
+	enable_mtrr_cleanup [X86]
+			The kernel tries to adjust MTRR layout from continuous
+			to discrete, to make X server driver able to add WB
+			entry later. This parameter enables/disables that.
+
+	mtrr_chunk_size=nn[KMG] [X86]
+			used for mtrr cleanup. It is largest continous chunk
+			that could hold holes aka. UC entries.
+
+	mtrr_gran_size=nn[KMG] [X86]
+			used for mtrr cleanup. It is granity of mtrr block.
+			Big value could prevent small alignment use up MTRRs.
+
 	disable_mtrr_trim [X86, Intel and AMD only]
 			By default the kernel will trim any uncacheable
 			memory out of your available memory pool based on
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fe361ae7ef2f..97a1764a3d20 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1092,6 +1092,32 @@ config MTRR
 
 	  See <file:Documentation/mtrr.txt> for more information.
 
+config MTRR_SANITIZER
+	def_bool y
+	prompt "MTRR cleanup support"
+	depends on MTRR
+	help
+	  Convert MTRR layout from continuous to discrete, so some X driver
+	  could add WB entries.
+
+	  Say N here if you see bootup problems (boot crash, boot hang,
+	  spontaneous reboots).
+
+	  Could be disabled with disable_mtrr_cleanup. Also mtrr_chunk_size
+	  could be used to send largest mtrr entry size for continuous block
+	  to hold holes (aka. UC entries)
+
+	  If unsure, say Y.
+
+config MTRR_SANITIZER_ENABLE_DEFAULT
+	def_bool y
+	prompt "Enable MTRR cleanup by default"
+	depends on MTRR_SANITIZER
+	help
+	  Enable mtrr cleanup by default
+
+	  If unsure, say Y.
+
 config X86_PAT
 	bool
 	prompt "x86 PAT support"
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0625d4158e58..5aae648600bf 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -37,7 +37,7 @@ static struct fixed_range_block fixed_range_blocks[] = {
 static unsigned long smp_changes_mask;
 static struct mtrr_state mtrr_state = {};
 static int mtrr_state_set;
-static u64 tom2;
+u64 mtrr_tom2;
 
 #undef MODULE_PARAM_PREFIX
 #define MODULE_PARAM_PREFIX "mtrr."
@@ -139,8 +139,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
 		}
 	}
 
-	if (tom2) {
-		if (start >= (1ULL<<32) && (end < tom2))
+	if (mtrr_tom2) {
+		if (start >= (1ULL<<32) && (end < mtrr_tom2))
 			return MTRR_TYPE_WRBACK;
 	}
 
@@ -158,6 +158,20 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
 	rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
 }
 
+/*  fill the MSR pair relating to a var range  */
+void fill_mtrr_var_range(unsigned int index,
+		u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi)
+{
+	struct mtrr_var_range *vr;
+
+	vr = mtrr_state.var_ranges;
+
+	vr[index].base_lo = base_lo;
+	vr[index].base_hi = base_hi;
+	vr[index].mask_lo = mask_lo;
+	vr[index].mask_hi = mask_hi;
+}
+
 static void
 get_fixed_ranges(mtrr_type * frs)
 {
@@ -216,10 +230,10 @@ void __init get_mtrr_state(void)
 		unsigned low, high;
 		/* TOP_MEM2 */
 		rdmsr(MSR_K8_TOP_MEM2, low, high);
-		tom2 = high;
-		tom2 <<= 32;
-		tom2 |= low;
-		tom2 &= 0xffffff8000000ULL;
+		mtrr_tom2 = high;
+		mtrr_tom2 <<= 32;
+		mtrr_tom2 |= low;
+		mtrr_tom2 &= 0xffffff8000000ULL;
 	}
 	if (mtrr_show) {
 		int high_width;
@@ -251,9 +265,9 @@ void __init get_mtrr_state(void)
 			else
 				printk(KERN_INFO "MTRR %u disabled\n", i);
 		}
-		if (tom2) {
+		if (mtrr_tom2) {
 			printk(KERN_INFO "TOM2: %016llx aka %lldM\n",
-					  tom2, tom2>>20);
+					  mtrr_tom2, mtrr_tom2>>20);
 		}
 	}
 	mtrr_state_set = 1;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 6a1e278d9323..8a6f68b45e3e 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -37,6 +37,7 @@
 #include <linux/smp.h>
 #include <linux/cpu.h>
 #include <linux/mutex.h>
+#include <linux/sort.h>
 
 #include <asm/e820.h>
 #include <asm/mtrr.h>
@@ -609,6 +610,452 @@ static struct sysdev_driver mtrr_sysdev_driver = {
 	.resume		= mtrr_restore,
 };
 
+#ifdef CONFIG_MTRR_SANITIZER
+
+#ifdef CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT
+static int enable_mtrr_cleanup __initdata = 1;
+#else
+static int enable_mtrr_cleanup __initdata;
+#endif
+
+#else
+
+static int enable_mtrr_cleanup __initdata = -1;
+
+#endif
+
+static int __init disable_mtrr_cleanup_setup(char *str)
+{
+	if (enable_mtrr_cleanup != -1)
+		enable_mtrr_cleanup = 0;
+	return 0;
+}
+early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
+
+static int __init enable_mtrr_cleanup_setup(char *str)
+{
+	if (enable_mtrr_cleanup != -1)
+		enable_mtrr_cleanup = 1;
+	return 0;
+}
+early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup);
+
+#define RANGE_NUM 256
+
+struct res_range {
+	unsigned long start;
+	unsigned long end;
+};
+
+static int __init add_range(struct res_range *range, int nr_range, unsigned long start,
+			      unsigned long end, int merge)
+{
+	int i;
+
+	if (!merge)
+		goto addit;
+
+	/* try to merge it with old one */
+	for (i = 0; i < nr_range; i++) {
+		unsigned long final_start, final_end;
+		unsigned long common_start, common_end;
+
+		if (!range[i].end)
+			continue;
+
+		common_start = max(range[i].start, start);
+		common_end = min(range[i].end, end);
+		if (common_start > common_end + 1)
+			continue;
+
+		final_start = min(range[i].start, start);
+		final_end = max(range[i].end, end);
+
+		range[i].start = final_start;
+		range[i].end =  final_end;
+		return nr_range;
+	}
+
+addit:
+	/* need to add that */
+	if (nr_range >= RANGE_NUM)
+		return nr_range;
+
+	range[nr_range].start = start;
+	range[nr_range].end = end;
+
+	nr_range++;
+
+	return nr_range;
+
+}
+static void __init subtract_range(struct res_range *range, unsigned long start,
+				unsigned long end)
+{
+	int i;
+	int j;
+
+	for (j = 0; j < RANGE_NUM; j++) {
+		if (!range[j].end)
+			continue;
+
+		if (start <= range[j].start && end >= range[j].end) {
+			range[j].start = 0;
+			range[j].end = 0;
+			continue;
+		}
+
+		if (start <= range[j].start && end < range[j].end && range[j].start < end + 1) {
+			range[j].start = end + 1;
+			continue;
+		}
+
+
+		if (start > range[j].start && end >= range[j].end && range[j].end > start - 1) {
+			range[j].end = start - 1;
+			continue;
+		}
+
+		if (start > range[j].start && end < range[j].end) {
+			/* find the new spare */
+			for (i = 0; i < RANGE_NUM; i++) {
+				if (range[i].end == 0)
+					break;
+			}
+			if (i < RANGE_NUM) {
+				range[i].end = range[j].end;
+				range[i].start = end + 1;
+			} else {
+				printk(KERN_ERR "run of slot in ranges\n");
+			}
+			range[j].end = start - 1;
+			continue;
+		}
+	}
+}
+
+static int __init cmp_range(const void *x1, const void *x2)
+{
+	const struct res_range *r1 = x1;
+	const struct res_range *r2 = x2;
+	long start1, start2;
+
+	start1 = r1->start;
+	start2 = r2->start;
+
+	return start1 - start2;
+}
+
+struct var_mtrr_state {
+	unsigned long range_startk, range_sizek;
+	unsigned long chunk_sizek;
+	unsigned long gran_sizek;
+	unsigned int reg;
+	unsigned address_bits;
+};
+
+static void __init set_var_mtrr(
+	unsigned int reg, unsigned long basek, unsigned long sizek,
+	unsigned char type, unsigned address_bits)
+{
+	u32 base_lo, base_hi, mask_lo, mask_hi;
+	unsigned address_mask_high;
+
+	if (!sizek) {
+		fill_mtrr_var_range(reg, 0, 0, 0, 0);
+		return;
+	}
+
+	address_mask_high = ((1u << (address_bits - 32u)) - 1u);
+
+	base_hi = basek >> 22;
+	base_lo  = basek << 10;
+
+	if (sizek < 4*1024*1024) {
+		mask_hi = address_mask_high;
+		mask_lo = ~((sizek << 10) - 1);
+	} else {
+		mask_hi = address_mask_high & (~((sizek >> 22) - 1));
+		mask_lo = 0;
+	}
+
+	base_lo |= type;
+	mask_lo |= 0x800;
+	fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
+}
+
+static unsigned int __init range_to_mtrr(unsigned int reg,
+	unsigned long range_startk, unsigned long range_sizek,
+	unsigned char type, unsigned address_bits)
+{
+	if (!range_sizek || (reg >= num_var_ranges))
+		return reg;
+
+	while (range_sizek) {
+		unsigned long max_align, align;
+		unsigned long sizek;
+		/* Compute the maximum size I can make a range */
+		if (range_startk)
+			max_align = ffs(range_startk) - 1;
+		else
+			max_align = 32;
+		align = fls(range_sizek) - 1;
+		if (align > max_align)
+			align = max_align;
+
+		sizek = 1 << align;
+		printk(KERN_INFO "Setting variable MTRR %d, base: %ldMB, range: %ldMB, type %s\n",
+			reg, range_startk >> 10, sizek >> 10,
+			(type == MTRR_TYPE_UNCACHABLE)?"UC":
+			    ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
+			);
+		set_var_mtrr(reg++, range_startk, sizek, type, address_bits);
+		range_startk += sizek;
+		range_sizek -= sizek;
+		if (reg >= num_var_ranges)
+			break;
+	}
+	return reg;
+}
+
+static void __init range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek)
+{
+	unsigned long hole_basek, hole_sizek;
+	unsigned long range0_basek, range0_sizek;
+	unsigned long range_basek, range_sizek;
+	unsigned long chunk_sizek;
+	unsigned long gran_sizek;
+
+	hole_basek = 0;
+	hole_sizek = 0;
+	chunk_sizek = state->chunk_sizek;
+	gran_sizek = state->gran_sizek;
+
+	/* align with gran size, prevent small block used up MTRRs */
+	range_basek = ALIGN(state->range_startk, gran_sizek);
+	if ((range_basek > basek) && basek)
+		return;
+	range_sizek = ALIGN(state->range_sizek - (range_basek - state->range_startk), gran_sizek);
+
+	while (range_basek + range_sizek > (state->range_startk + state->range_sizek)) {
+		range_sizek -= gran_sizek;
+		if (!range_sizek)
+			return;
+	}
+	state->range_startk = range_basek;
+	state->range_sizek = range_sizek;
+
+	/* try to append some small hole */
+	range0_basek = state->range_startk;
+	range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
+	if ((range0_sizek == state->range_sizek) ||
+	    ((range0_basek + range0_sizek - chunk_sizek > basek) && basek)) {
+			printk(KERN_INFO "rangeX: %016lx - %016lx\n", range0_basek<<10, (range0_basek + state->range_sizek)<<10);
+			state->reg = range_to_mtrr(state->reg, range0_basek,
+				state->range_sizek, MTRR_TYPE_WRBACK, state->address_bits);
+		return;
+	}
+
+
+	range0_sizek -= chunk_sizek;
+	printk(KERN_INFO "range0: %016lx - %016lx\n", range0_basek<<10, (range0_basek + range0_sizek)<<10);
+	state->reg = range_to_mtrr(state->reg, range0_basek,
+			range0_sizek, MTRR_TYPE_WRBACK, state->address_bits);
+
+	range_basek = range0_basek + range0_sizek;
+	range_sizek = chunk_sizek;
+	if (range_sizek - (state->range_sizek - range0_sizek) < (chunk_sizek >> 1)) {
+		hole_sizek = range_sizek - (state->range_sizek - range0_sizek);
+		hole_basek = range_basek + range_sizek - hole_sizek;
+	} else
+		range_sizek = state->range_sizek - range0_sizek;
+
+	printk(KERN_INFO "range: %016lx - %016lx\n", range_basek<<10, (range_basek + range_sizek)<<10);
+	state->reg = range_to_mtrr(state->reg, range_basek,
+			range_sizek, MTRR_TYPE_WRBACK, state->address_bits);
+	if (hole_sizek) {
+		printk(KERN_INFO "hole: %016lx - %016lx\n", hole_basek<<10, (hole_basek + hole_sizek)<<10);
+		state->reg = range_to_mtrr(state->reg, hole_basek,
+				hole_sizek, MTRR_TYPE_UNCACHABLE, state->address_bits);
+	}
+}
+
+static void __init set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, unsigned long size_pfn)
+{
+	unsigned long basek, sizek;
+
+	if (state->reg >= num_var_ranges)
+		return;
+
+	basek = base_pfn << (PAGE_SHIFT - 10);
+	sizek = size_pfn << (PAGE_SHIFT - 10);
+
+	/* See if I can merge with the last range */
+	if ((basek <= 1024) || (state->range_startk + state->range_sizek == basek)) {
+		unsigned long endk = basek + sizek;
+		state->range_sizek = endk - state->range_startk;
+		return;
+	}
+	/* Write the range mtrrs */
+	if (state->range_sizek != 0) {
+		range_to_mtrr_with_hole(state, basek);
+
+		state->range_startk = 0;
+		state->range_sizek = 0;
+	}
+	/* Allocate an msr */
+	state->range_startk = basek;
+	state->range_sizek  = sizek;
+}
+
+/* mininum size of mtrr block that can take hole */
+static u64 mtrr_chunk_size __initdata = (256ULL<<20);
+
+static int __init parse_mtrr_chunk_size_opt(char *p)
+{
+	if (!p)
+		return -EINVAL;
+	mtrr_chunk_size = memparse(p, &p);
+	return 0;
+}
+early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
+
+/* granity of mtrr of block */
+static u64 mtrr_gran_size __initdata = (64ULL<<20);
+
+static int __init parse_mtrr_gran_size_opt(char *p)
+{
+	if (!p)
+		return -EINVAL;
+	mtrr_gran_size = memparse(p, &p);
+	return 0;
+}
+early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
+
+static void __init x86_setup_var_mtrrs(struct res_range *range, int nr_range, unsigned address_bits)
+{
+	struct var_mtrr_state var_state;
+	int i;
+
+	var_state.range_startk = 0;
+	var_state.range_sizek = 0;
+	var_state.reg = 0;
+	var_state.address_bits = address_bits;
+	var_state.chunk_sizek = mtrr_chunk_size >> 10;
+	var_state.gran_sizek = mtrr_gran_size >> 10;
+
+	/* Write the range etc */
+	for (i = 0; i < nr_range; i++)
+		set_var_mtrr_range(&var_state, range[i].start, range[i].end - range[i].start + 1);
+
+	/* Write the last range */
+	range_to_mtrr_with_hole(&var_state, 0);
+	printk(KERN_INFO "DONE variable MTRRs\n");
+	/* Clear out the extra MTRR's */
+	while (var_state.reg < num_var_ranges)
+		set_var_mtrr(var_state.reg++, 0, 0, 0, var_state.address_bits);
+}
+
+static int __init x86_get_mtrr_mem_range(struct res_range *range, int nr_range, unsigned long extra_remove_base, unsigned long extra_remove_size)
+{
+	unsigned long i, base, size;
+	mtrr_type type;
+
+	for (i = 0; i < num_var_ranges; i++) {
+		mtrr_if->get(i, &base, &size, &type);
+		if (type != MTRR_TYPE_WRBACK)
+			continue;
+		nr_range = add_range(range, nr_range, base, base + size - 1, 1);
+	}
+	printk(KERN_INFO "After WB checking\n");
+	for (i = 0; i < nr_range; i++)
+		printk(KERN_INFO "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1);
+
+	/* take out UC ranges */
+	for (i = 0; i < num_var_ranges; i++) {
+		mtrr_if->get(i, &base, &size, &type);
+		if (type != MTRR_TYPE_UNCACHABLE)
+			continue;
+		if (!size)
+			continue;
+		subtract_range(range, base, base + size - 1);
+	}
+	if (extra_remove_size)
+		subtract_range(range, extra_remove_base,  extra_remove_base + extra_remove_size  - 1);
+
+	/* get new range num */
+	nr_range = 0;
+	for (i = 0; i < RANGE_NUM; i++) {
+		if (!range[i].end)
+			continue;
+		nr_range++;
+	}
+	printk(KERN_INFO "After UC checking\n");
+	for (i = 0; i < nr_range; i++)
+		printk(KERN_INFO "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1);
+
+	/* sort the ranges */
+	sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+	printk(KERN_INFO "After sorting\n");
+	for (i = 0; i < nr_range; i++)
+		printk(KERN_INFO "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1);
+
+	return nr_range;
+}
+
+static int __init mtrr_cleanup(unsigned address_bits)
+{
+	unsigned long i, base, size, def, dummy;
+	mtrr_type type;
+	struct res_range range[RANGE_NUM];
+	int nr_range;
+	unsigned long extra_remove_base, extra_remove_size;
+
+	/* extra one for all 0 */
+	int num[MTRR_NUM_TYPES + 1];
+
+	if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
+		return 0;
+	rdmsr(MTRRdefType_MSR, def, dummy);
+	def &= 0xff;
+	if (def != MTRR_TYPE_UNCACHABLE)
+		return 0;
+
+	/* check entries number */
+	memset(num, 0, sizeof(num));
+	for (i = 0; i < num_var_ranges; i++) {
+		mtrr_if->get(i, &base, &size, &type);
+		if (type >= MTRR_NUM_TYPES)
+			continue;
+		if (!size)
+			type = MTRR_NUM_TYPES;
+		num[type]++;
+	}
+
+	/* check if we got UC entries */
+	if (!num[MTRR_TYPE_UNCACHABLE])
+		return 0;
+
+	/* check if we only had WB and UC */
+	if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+		num_var_ranges - num[MTRR_NUM_TYPES])
+		return 0;
+
+	memset(range, 0, sizeof(range));
+	extra_remove_size = 0;
+	if (mtrr_tom2) {
+		extra_remove_base = 1 << (32 - PAGE_SHIFT);
+		extra_remove_size = (mtrr_tom2>>PAGE_SHIFT) - extra_remove_base;
+	}
+	nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, extra_remove_size);
+
+	/* convert ranges to var ranges state */
+	x86_setup_var_mtrrs(range, nr_range, address_bits);
+
+	return 1;
+
+}
+
 static int disable_mtrr_trim;
 
 static int __init disable_mtrr_trim_setup(char *str)
@@ -729,18 +1176,21 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
  */
 void __init mtrr_bp_init(void)
 {
+	u32 phys_addr;
 	init_ifs();
 
+	phys_addr = 32;
+
 	if (cpu_has_mtrr) {
 		mtrr_if = &generic_mtrr_ops;
 		size_or_mask = 0xff000000;	/* 36 bits */
 		size_and_mask = 0x00f00000;
+		phys_addr = 36;
 
 		/* This is an AMD specific MSR, but we assume(hope?) that
 		   Intel will implement it to when they extend the address
 		   bus of the Xeon. */
 		if (cpuid_eax(0x80000000) >= 0x80000008) {
-			u32 phys_addr;
 			phys_addr = cpuid_eax(0x80000008) & 0xff;
 			/* CPUID workaround for Intel 0F33/0F34 CPU */
 			if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
@@ -758,6 +1208,7 @@ void __init mtrr_bp_init(void)
 			   don't support PAE */
 			size_or_mask = 0xfff00000;	/* 32 bits */
 			size_and_mask = 0;
+			phys_addr = 32;
 		}
 	} else {
 		switch (boot_cpu_data.x86_vendor) {
@@ -791,8 +1242,13 @@ void __init mtrr_bp_init(void)
 	if (mtrr_if) {
 		set_num_var_ranges();
 		init_table();
-		if (use_intel())
+		if (use_intel()) {
 			get_mtrr_state();
+
+			if (mtrr_cleanup(phys_addr))
+				mtrr_if->set_all();
+
+		}
 	}
 }
 
@@ -829,9 +1285,10 @@ static int __init mtrr_init_finialize(void)
 {
 	if (!mtrr_if)
 		return 0;
-	if (use_intel())
-		mtrr_state_warn();
-	else {
+	if (use_intel()) {
+		if (enable_mtrr_cleanup < 1)
+			mtrr_state_warn();
+	} else {
 		/* The CPUs haven't MTRR and seem to not support SMP. They have
 		 * specific drivers, we use a tricky method to support
 		 * suspend/resume for them.
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 2cc77eb6fea3..2dc4ec656b23 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -81,6 +81,8 @@ void set_mtrr_done(struct set_mtrr_context *ctxt);
 void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
 void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
 
+void fill_mtrr_var_range(unsigned int index,
+		u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
 void get_mtrr_state(void);
 
 extern void set_mtrr_ops(struct mtrr_ops * ops);
@@ -92,6 +94,7 @@ extern struct mtrr_ops * mtrr_if;
 #define use_intel()	(mtrr_if && mtrr_if->use_intel_if == 1)
 
 extern unsigned int num_var_ranges;
+extern u64 mtrr_tom2;
 
 void mtrr_state_warn(void);
 const char *mtrr_attrib_to_str(int x);
-- 
cgit v1.2.3


From 42651f15824d003e8357693ab72c4dbb3e280836 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel.send@gmail.com>
Date: Tue, 29 Apr 2008 01:59:49 -0700
Subject: x86: fix trimming e820 with MTRR holes.

converting MTRR layout from continous to discrete, some time could run out of
MTRRs. So add gran_sizek to prevent that by dumpping small RAM piece less than
gran_sizek.

previous trimming only can handle highest_pfn from mtrr to end_pfn from e820.
when have more than 4g RAM installed, there will be holes below 4g. so need to
check ram below 4g is coverred well.

need to be applied after
	[PATCH] x86: mtrr cleanup for converting continuous to discrete layout v7

Signed-off-by: Yinghai Lu <yinghai.lu@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/mtrr/main.c | 101 +++++++++++++++++++++++++++++++++-------
 arch/x86/kernel/e820_32.c       |   7 ++-
 arch/x86/kernel/e820_64.c       |   6 ++-
 include/asm-x86/e820_32.h       |   2 +-
 include/asm-x86/e820_64.h       |   2 +-
 5 files changed, 97 insertions(+), 21 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 8a6f68b45e3e..9ab5c16b0d52 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -1095,6 +1095,17 @@ int __init amd_special_default_mtrr(void)
 	return 0;
 }
 
+static u64 __init real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn)
+{
+	u64 trim_start, trim_size;
+	trim_start =  start_pfn;
+	trim_start <<= PAGE_SHIFT;
+	trim_size = limit_pfn;
+	trim_size <<= PAGE_SHIFT;
+	trim_size -= trim_start;
+	return update_memory_range(trim_start, trim_size, E820_RAM,
+				E820_RESERVED);
+}
 /**
  * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
  * @end_pfn: ending page frame number
@@ -1110,8 +1121,13 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 {
 	unsigned long i, base, size, highest_pfn = 0, def, dummy;
 	mtrr_type type;
-	u64 trim_start, trim_size;
+	struct res_range range[RANGE_NUM];
+	int nr_range;
+	u64 total_real_trim_size;
+	int changed;
 
+	/* extra one for all 0 */
+	int num[MTRR_NUM_TYPES + 1];
 	/*
 	 * Make sure we only trim uncachable memory on machines that
 	 * support the Intel MTRR architecture:
@@ -1123,9 +1139,6 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 	if (def != MTRR_TYPE_UNCACHABLE)
 		return 0;
 
-	if (amd_special_default_mtrr())
-		return 0;
-
 	/* Find highest cached pfn */
 	for (i = 0; i < num_var_ranges; i++) {
 		mtrr_if->get(i, &base, &size, &type);
@@ -1145,26 +1158,80 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 		return 0;
 	}
 
-	if (highest_pfn < end_pfn) {
+	/* check entries number */
+	memset(num, 0, sizeof(num));
+	for (i = 0; i < num_var_ranges; i++) {
+		mtrr_if->get(i, &base, &size, &type);
+		if (type >= MTRR_NUM_TYPES)
+			continue;
+		if (!size)
+			type = MTRR_NUM_TYPES;
+		num[type]++;
+	}
+
+	/* no entry for WB? */
+	if (!num[MTRR_TYPE_WRBACK])
+		return 0;
+
+	/* check if we only had WB and UC */
+	if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+		num_var_ranges - num[MTRR_NUM_TYPES])
+		return 0;
+
+	memset(range, 0, sizeof(range));
+	nr_range = 0;
+	if (mtrr_tom2) {
+		range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
+		range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
+		if (highest_pfn < range[nr_range].end + 1)
+			highest_pfn = range[nr_range].end + 1;
+		nr_range++;
+	}
+	nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
+
+	changed = 0;
+	total_real_trim_size = 0;
+
+	/* check the top at first */
+	i = nr_range - 1;
+	if (range[i].end + 1 < end_pfn) {
+			total_real_trim_size += real_trim_memory(range[i].end + 1, end_pfn);
+	}
+
+	if (total_real_trim_size) {
 		printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
-			" all of memory, losing %luMB of RAM.\n",
-			(end_pfn - highest_pfn) >> (20 - PAGE_SHIFT));
+			" all of memory, losing %lluMB of RAM.\n",
+			total_real_trim_size >> 20);
 
 		WARN_ON(1);
 
-		printk(KERN_INFO "update e820 for mtrr\n");
-		trim_start = highest_pfn;
-		trim_start <<= PAGE_SHIFT;
-		trim_size = end_pfn;
-		trim_size <<= PAGE_SHIFT;
-		trim_size -= trim_start;
-		update_memory_range(trim_start, trim_size, E820_RAM,
-					E820_RESERVED);
+		printk(KERN_INFO "update e820 for mtrr -- end_pfn\n");
 		update_e820();
-		return 1;
+		changed = 1;
 	}
 
-	return 0;
+	total_real_trim_size = 0;
+	if (range[0].start)
+		total_real_trim_size += real_trim_memory(0, range[0].start);
+
+	for (i = 0; i < nr_range - 1; i--) {
+		if (range[i].end + 1 < range[i+1].start)
+			total_real_trim_size += real_trim_memory(range[i].end + 1, range[i+1].start);
+	}
+
+	if (total_real_trim_size) {
+		printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
+			" all of memory, losing %lluMB of RAM.\n",
+			total_real_trim_size >> 20);
+
+		WARN_ON(1);
+
+		printk(KERN_INFO "update e820 for mtrr -- holes\n");
+		update_e820();
+		changed = 1;
+	}
+
+	return changed;
 }
 
 /**
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
index 31ea2bb8c91a..857f706273a8 100644
--- a/arch/x86/kernel/e820_32.c
+++ b/arch/x86/kernel/e820_32.c
@@ -783,10 +783,11 @@ static int __init parse_memmap(char *arg)
 	return 0;
 }
 early_param("memmap", parse_memmap);
-void __init update_memory_range(u64 start, u64 size, unsigned old_type,
+u64 __init update_memory_range(u64 start, u64 size, unsigned old_type,
 				unsigned new_type)
 {
 	int i;
+	u64 real_updated_size = 0;
 
 	BUG_ON(old_type == new_type);
 
@@ -798,6 +799,7 @@ void __init update_memory_range(u64 start, u64 size, unsigned old_type,
 		/* totally covered? */
 		if (ei->addr >= start && ei->size <= size) {
 			ei->type = new_type;
+			real_updated_size += ei->size;
 			continue;
 		}
 		/* partially covered */
@@ -807,7 +809,10 @@ void __init update_memory_range(u64 start, u64 size, unsigned old_type,
 			continue;
 		add_memory_region(final_start, final_end - final_start,
 					 new_type);
+		real_updated_size += final_end - final_start;
 	}
+
+	return real_updated_size;
 }
 
 void __init finish_e820_parsing(void)
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index 124480c0008d..848b2cd2d1dd 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -829,10 +829,11 @@ void __init finish_e820_parsing(void)
 	}
 }
 
-void __init update_memory_range(u64 start, u64 size, unsigned old_type,
+u64 __init update_memory_range(u64 start, u64 size, unsigned old_type,
 				unsigned new_type)
 {
 	int i;
+	u64 real_updated_size = 0;
 
 	BUG_ON(old_type == new_type);
 
@@ -844,6 +845,7 @@ void __init update_memory_range(u64 start, u64 size, unsigned old_type,
 		/* totally covered? */
 		if (ei->addr >= start && ei->size <= size) {
 			ei->type = new_type;
+			real_updated_size += ei->size;
 			continue;
 		}
 		/* partially covered */
@@ -853,7 +855,9 @@ void __init update_memory_range(u64 start, u64 size, unsigned old_type,
 			continue;
 		add_memory_region(final_start, final_end - final_start,
 					 new_type);
+		real_updated_size += final_end - final_start;
 	}
+	return real_updated_size;
 }
 
 void __init update_e820(void)
diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h
index e1f10c60901f..af0711b220df 100644
--- a/include/asm-x86/e820_32.h
+++ b/include/asm-x86/e820_32.h
@@ -31,7 +31,7 @@ extern void propagate_e820_map(void);
 extern void register_bootmem_low_pages(unsigned long max_low_pfn);
 extern void add_memory_region(unsigned long long start,
 			      unsigned long long size, int type);
-extern void update_memory_range(u64 start, u64 size, unsigned old_type,
+extern u64 update_memory_range(u64 start, u64 size, unsigned old_type,
 				unsigned new_type);
 extern void e820_register_memory(void);
 extern void limit_regions(unsigned long long size);
diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h
index 71c4d685d30d..6ae3e2803286 100644
--- a/include/asm-x86/e820_64.h
+++ b/include/asm-x86/e820_64.h
@@ -21,7 +21,7 @@ extern unsigned long find_e820_area_size(unsigned long start,
 					 unsigned long align);
 extern void add_memory_region(unsigned long start, unsigned long size,
 			      int type);
-extern void update_memory_range(u64 start, u64 size, unsigned old_type,
+extern u64 update_memory_range(u64 start, u64 size, unsigned old_type,
 				unsigned new_type);
 extern void setup_memory_region(void);
 extern void contig_e820_setup(void);
-- 
cgit v1.2.3


From 8a374026c265476b1acfc3c186a66d59ebdb2cda Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel.send@gmail.com>
Date: Tue, 29 Apr 2008 20:25:16 -0700
Subject: x86: fix trimming e820 with MTRR holes. - fix

v2: process hole then end_pfn
    fix update_memory_range with whole cover comparing

Signed-off-by: Yinghai Lu <yinghai.lu@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/mtrr/main.c | 44 ++++++++++++++---------------------------
 arch/x86/kernel/e820_32.c       |  3 ++-
 arch/x86/kernel/e820_64.c       |  3 ++-
 3 files changed, 19 insertions(+), 31 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 9ab5c16b0d52..5a2a4c146333 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -1098,11 +1098,12 @@ int __init amd_special_default_mtrr(void)
 static u64 __init real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn)
 {
 	u64 trim_start, trim_size;
-	trim_start =  start_pfn;
+	trim_start = start_pfn;
 	trim_start <<= PAGE_SHIFT;
 	trim_size = limit_pfn;
 	trim_size <<= PAGE_SHIFT;
 	trim_size -= trim_start;
+
 	return update_memory_range(trim_start, trim_size, E820_RAM,
 				E820_RESERVED);
 }
@@ -1124,7 +1125,6 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 	struct res_range range[RANGE_NUM];
 	int nr_range;
 	u64 total_real_trim_size;
-	int changed;
 
 	/* extra one for all 0 */
 	int num[MTRR_NUM_TYPES + 1];
@@ -1189,49 +1189,35 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 	}
 	nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
 
-	changed = 0;
-	total_real_trim_size = 0;
-
-	/* check the top at first */
-	i = nr_range - 1;
-	if (range[i].end + 1 < end_pfn) {
-			total_real_trim_size += real_trim_memory(range[i].end + 1, end_pfn);
-	}
-
-	if (total_real_trim_size) {
-		printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
-			" all of memory, losing %lluMB of RAM.\n",
-			total_real_trim_size >> 20);
-
-		WARN_ON(1);
-
-		printk(KERN_INFO "update e820 for mtrr -- end_pfn\n");
-		update_e820();
-		changed = 1;
-	}
-
 	total_real_trim_size = 0;
+	/* check the head */
 	if (range[0].start)
 		total_real_trim_size += real_trim_memory(0, range[0].start);
-
-	for (i = 0; i < nr_range - 1; i--) {
+	/* check the holes */
+	for (i = 0; i < nr_range - 1; i++) {
 		if (range[i].end + 1 < range[i+1].start)
 			total_real_trim_size += real_trim_memory(range[i].end + 1, range[i+1].start);
 	}
+	/* check the top */
+	i = nr_range - 1;
+	if (range[i].end + 1 < end_pfn)
+		total_real_trim_size += real_trim_memory(range[i].end + 1, end_pfn);
 
 	if (total_real_trim_size) {
 		printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
 			" all of memory, losing %lluMB of RAM.\n",
 			total_real_trim_size >> 20);
 
-		WARN_ON(1);
+		if (enable_mtrr_cleanup < 1)
+			WARN_ON(1);
 
-		printk(KERN_INFO "update e820 for mtrr -- holes\n");
+		printk(KERN_INFO "update e820 for mtrr\n");
 		update_e820();
-		changed = 1;
+
+		return 1;
 	}
 
-	return changed;
+	return 0;
 }
 
 /**
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
index 857f706273a8..751d517d802c 100644
--- a/arch/x86/kernel/e820_32.c
+++ b/arch/x86/kernel/e820_32.c
@@ -797,7 +797,8 @@ u64 __init update_memory_range(u64 start, u64 size, unsigned old_type,
 		if (ei->type != old_type)
 			continue;
 		/* totally covered? */
-		if (ei->addr >= start && ei->size <= size) {
+		if (ei->addr >= start &&
+		    (ei->addr + ei->size) <= (start + size)) {
 			ei->type = new_type;
 			real_updated_size += ei->size;
 			continue;
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index 848b2cd2d1dd..c45b4dea4055 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -843,7 +843,8 @@ u64 __init update_memory_range(u64 start, u64 size, unsigned old_type,
 		if (ei->type != old_type)
 			continue;
 		/* totally covered? */
-		if (ei->addr >= start && ei->size <= size) {
+		if (ei->addr >= start &&
+		    (ei->addr + ei->size) <= (start + size)) {
 			ei->type = new_type;
 			real_updated_size += ei->size;
 			continue;
-- 
cgit v1.2.3


From f5098d62c1d1cede8ff23d01bbf50a421f110562 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel.send@gmail.com>
Date: Tue, 29 Apr 2008 20:25:58 -0700
Subject: x86: mtrr cleanup for converting continuous to discrete layout v8 -
 fix

v9: address format change requests by Ingo
    more case handling in range_to_var_with_hole

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/Kconfig                |   9 +--
 arch/x86/kernel/cpu/mtrr/main.c | 164 ++++++++++++++++++++++------------------
 2 files changed, 94 insertions(+), 79 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 97a1764a3d20..f417fe33d852 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1110,13 +1110,12 @@ config MTRR_SANITIZER
 	  If unsure, say Y.
 
 config MTRR_SANITIZER_ENABLE_DEFAULT
-	def_bool y
-	prompt "Enable MTRR cleanup by default"
+	int "MTRR cleanup enable value (0-1)"
+	range 0 1
+	default "0"
 	depends on MTRR_SANITIZER
 	help
-	  Enable mtrr cleanup by default
-
-	  If unsure, say Y.
+	  Enable mtrr cleanup default value
 
 config X86_PAT
 	bool
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 5a2a4c146333..79597d3c3431 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -611,17 +611,9 @@ static struct sysdev_driver mtrr_sysdev_driver = {
 };
 
 #ifdef CONFIG_MTRR_SANITIZER
-
-#ifdef CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT
-static int enable_mtrr_cleanup __initdata = 1;
-#else
-static int enable_mtrr_cleanup __initdata;
-#endif
-
+static int enable_mtrr_cleanup __initdata = CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
 #else
-
 static int enable_mtrr_cleanup __initdata = -1;
-
 #endif
 
 static int __init disable_mtrr_cleanup_setup(char *str)
@@ -640,6 +632,7 @@ static int __init enable_mtrr_cleanup_setup(char *str)
 }
 early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup);
 
+/* should be related to MTRR_VAR_RANGES nums */
 #define RANGE_NUM 256
 
 struct res_range {
@@ -647,13 +640,27 @@ struct res_range {
 	unsigned long end;
 };
 
-static int __init add_range(struct res_range *range, int nr_range, unsigned long start,
-			      unsigned long end, int merge)
+static int __init
+add_range(struct res_range *range, int nr_range, unsigned long start,
+			      unsigned long end)
 {
-	int i;
+	/* out of slots */
+	if (nr_range >= RANGE_NUM)
+		return nr_range;
 
-	if (!merge)
-		goto addit;
+	range[nr_range].start = start;
+	range[nr_range].end = end;
+
+	nr_range++;
+
+	return nr_range;
+}
+
+static int __init
+add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
+			      unsigned long end)
+{
+	int i;
 
 	/* try to merge it with old one */
 	for (i = 0; i < nr_range; i++) {
@@ -676,24 +683,14 @@ static int __init add_range(struct res_range *range, int nr_range, unsigned long
 		return nr_range;
 	}
 
-addit:
 	/* need to add that */
-	if (nr_range >= RANGE_NUM)
-		return nr_range;
-
-	range[nr_range].start = start;
-	range[nr_range].end = end;
-
-	nr_range++;
-
-	return nr_range;
-
+	return add_range(range, nr_range, start, end);
 }
-static void __init subtract_range(struct res_range *range, unsigned long start,
-				unsigned long end)
+
+static void __init
+subtract_range(struct res_range *range, unsigned long start, unsigned long end)
 {
-	int i;
-	int j;
+	int i, j;
 
 	for (j = 0; j < RANGE_NUM; j++) {
 		if (!range[j].end)
@@ -747,46 +744,47 @@ static int __init cmp_range(const void *x1, const void *x2)
 }
 
 struct var_mtrr_state {
-	unsigned long range_startk, range_sizek;
-	unsigned long chunk_sizek;
-	unsigned long gran_sizek;
-	unsigned int reg;
-	unsigned address_bits;
+	unsigned long	range_startk;
+	unsigned long	range_sizek;
+	unsigned long	chunk_sizek;
+	unsigned long	gran_sizek;
+	unsigned int	reg;
+	unsigned int	address_bits;
 };
 
-static void __init set_var_mtrr(
-	unsigned int reg, unsigned long basek, unsigned long sizek,
-	unsigned char type, unsigned address_bits)
+static void __init
+set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+		unsigned char type, unsigned address_bits)
 {
 	u32 base_lo, base_hi, mask_lo, mask_hi;
-	unsigned address_mask_high;
+	u64 base, mask;
 
 	if (!sizek) {
 		fill_mtrr_var_range(reg, 0, 0, 0, 0);
 		return;
 	}
 
-	address_mask_high = ((1u << (address_bits - 32u)) - 1u);
+	mask = (1ULL << address_bits) - 1;
+	mask &= ~((((u64)sizek) << 10) - 1);
 
-	base_hi = basek >> 22;
-	base_lo  = basek << 10;
+	base  = ((u64)basek) << 10;
 
-	if (sizek < 4*1024*1024) {
-		mask_hi = address_mask_high;
-		mask_lo = ~((sizek << 10) - 1);
-	} else {
-		mask_hi = address_mask_high & (~((sizek >> 22) - 1));
-		mask_lo = 0;
-	}
+	base |= type;
+	mask |= 0x800;
+
+	base_lo = base & ((1ULL<<32) - 1);
+	base_hi = base >> 32;
+
+	mask_lo = mask & ((1ULL<<32) - 1);
+	mask_hi = mask >> 32;
 
-	base_lo |= type;
-	mask_lo |= 0x800;
 	fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
 }
 
-static unsigned int __init range_to_mtrr(unsigned int reg,
-	unsigned long range_startk, unsigned long range_sizek,
-	unsigned char type, unsigned address_bits)
+static unsigned int __init
+range_to_mtrr(unsigned int reg, unsigned long range_startk,
+	      unsigned long range_sizek, unsigned char type,
+	      unsigned address_bits)
 {
 	if (!range_sizek || (reg >= num_var_ranges))
 		return reg;
@@ -794,6 +792,7 @@ static unsigned int __init range_to_mtrr(unsigned int reg,
 	while (range_sizek) {
 		unsigned long max_align, align;
 		unsigned long sizek;
+
 		/* Compute the maximum size I can make a range */
 		if (range_startk)
 			max_align = ffs(range_startk) - 1;
@@ -818,7 +817,8 @@ static unsigned int __init range_to_mtrr(unsigned int reg,
 	return reg;
 }
 
-static void __init range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek)
+static void __init
+range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek)
 {
 	unsigned long hole_basek, hole_sizek;
 	unsigned long range0_basek, range0_sizek;
@@ -848,23 +848,31 @@ static void __init range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigne
 	/* try to append some small hole */
 	range0_basek = state->range_startk;
 	range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
-	if ((range0_sizek == state->range_sizek) ||
-	    ((range0_basek + range0_sizek - chunk_sizek > basek) && basek)) {
+	if (range0_sizek == state->range_sizek) {
 			printk(KERN_INFO "rangeX: %016lx - %016lx\n", range0_basek<<10, (range0_basek + state->range_sizek)<<10);
 			state->reg = range_to_mtrr(state->reg, range0_basek,
 				state->range_sizek, MTRR_TYPE_WRBACK, state->address_bits);
 		return;
+	} else if (basek) {
+	    while (range0_basek + range0_sizek - chunk_sizek > basek) {
+		range0_sizek -= chunk_sizek;
+		if (!range0_sizek)
+			break;
+	    }
 	}
 
 
-	range0_sizek -= chunk_sizek;
+	if (range0_sizek > chunk_sizek)
+		range0_sizek -= chunk_sizek;
 	printk(KERN_INFO "range0: %016lx - %016lx\n", range0_basek<<10, (range0_basek + range0_sizek)<<10);
 	state->reg = range_to_mtrr(state->reg, range0_basek,
 			range0_sizek, MTRR_TYPE_WRBACK, state->address_bits);
 
 	range_basek = range0_basek + range0_sizek;
 	range_sizek = chunk_sizek;
-	if (range_sizek - (state->range_sizek - range0_sizek) < (chunk_sizek >> 1)) {
+
+	if ((range_sizek - (state->range_sizek - range0_sizek) < (chunk_sizek >> 1)) &&
+	    (range_basek + range_sizek <= basek)) {
 		hole_sizek = range_sizek - (state->range_sizek - range0_sizek);
 		hole_basek = range_basek + range_sizek - hole_sizek;
 	} else
@@ -880,7 +888,9 @@ static void __init range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigne
 	}
 }
 
-static void __init set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, unsigned long size_pfn)
+static void __init
+set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
+		   unsigned long size_pfn)
 {
 	unsigned long basek, sizek;
 
@@ -921,7 +931,7 @@ static int __init parse_mtrr_chunk_size_opt(char *p)
 early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
 
 /* granity of mtrr of block */
-static u64 mtrr_gran_size __initdata = (64ULL<<20);
+static u64 mtrr_gran_size __initdata = (1ULL<<20);
 
 static int __init parse_mtrr_gran_size_opt(char *p)
 {
@@ -932,17 +942,19 @@ static int __init parse_mtrr_gran_size_opt(char *p)
 }
 early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
 
-static void __init x86_setup_var_mtrrs(struct res_range *range, int nr_range, unsigned address_bits)
+static void __init
+x86_setup_var_mtrrs(struct res_range *range, int nr_range,
+		    unsigned address_bits)
 {
 	struct var_mtrr_state var_state;
 	int i;
 
-	var_state.range_startk = 0;
-	var_state.range_sizek = 0;
-	var_state.reg = 0;
-	var_state.address_bits = address_bits;
-	var_state.chunk_sizek = mtrr_chunk_size >> 10;
-	var_state.gran_sizek = mtrr_gran_size >> 10;
+	var_state.range_startk	= 0;
+	var_state.range_sizek	= 0;
+	var_state.reg		= 0;
+	var_state.address_bits	= address_bits;
+	var_state.chunk_sizek	= mtrr_chunk_size >> 10;
+	var_state.gran_sizek	= mtrr_gran_size >> 10;
 
 	/* Write the range etc */
 	for (i = 0; i < nr_range; i++)
@@ -952,11 +964,16 @@ static void __init x86_setup_var_mtrrs(struct res_range *range, int nr_range, un
 	range_to_mtrr_with_hole(&var_state, 0);
 	printk(KERN_INFO "DONE variable MTRRs\n");
 	/* Clear out the extra MTRR's */
-	while (var_state.reg < num_var_ranges)
-		set_var_mtrr(var_state.reg++, 0, 0, 0, var_state.address_bits);
+	while (var_state.reg < num_var_ranges) {
+		set_var_mtrr(var_state.reg, 0, 0, 0, var_state.address_bits);
+		var_state.reg++;
+	}
 }
 
-static int __init x86_get_mtrr_mem_range(struct res_range *range, int nr_range, unsigned long extra_remove_base, unsigned long extra_remove_size)
+static int __init
+x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
+		       unsigned long extra_remove_base,
+		       unsigned long extra_remove_size)
 {
 	unsigned long i, base, size;
 	mtrr_type type;
@@ -965,7 +982,7 @@ static int __init x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
 		mtrr_if->get(i, &base, &size, &type);
 		if (type != MTRR_TYPE_WRBACK)
 			continue;
-		nr_range = add_range(range, nr_range, base, base + size - 1, 1);
+		nr_range = add_range_with_merge(range, nr_range, base, base + size - 1);
 	}
 	printk(KERN_INFO "After WB checking\n");
 	for (i = 0; i < nr_range; i++)
@@ -1005,11 +1022,11 @@ static int __init x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
 
 static int __init mtrr_cleanup(unsigned address_bits)
 {
+	unsigned long extra_remove_base, extra_remove_size;
 	unsigned long i, base, size, def, dummy;
-	mtrr_type type;
 	struct res_range range[RANGE_NUM];
+	mtrr_type type;
 	int nr_range;
-	unsigned long extra_remove_base, extra_remove_size;
 
 	/* extra one for all 0 */
 	int num[MTRR_NUM_TYPES + 1];
@@ -1053,7 +1070,6 @@ static int __init mtrr_cleanup(unsigned address_bits)
 	x86_setup_var_mtrrs(range, nr_range, address_bits);
 
 	return 1;
-
 }
 
 static int disable_mtrr_trim;
-- 
cgit v1.2.3


From 12031a624af7816ec7660b82be648aa3703b4ebe Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel.send@gmail.com>
Date: Fri, 2 May 2008 02:40:22 -0700
Subject: x86: mtrr cleanup for converting continuous to discrete - auto detect
 v4

Loop through mtrr chunk_size and gran_size from 1M to 2G to find out
the optimal value so user does not need to add mtrr_chunk_size and
mtrr_gran_size to the kernel command line.

If optimal value is not found, print out all list to help select less
optimal value.

Add mtrr_spare_reg_nr= so user could set 2 instead of 1, if the card
need more entries.

v2: find the one with more spare entries
v3: fix hole_basek offset
v4: tight the compare between range and range_new
    loop stop with 4g

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Gabriel C <nix.or.die@googlemail.com>
Cc: Mika Fischer <mika.fischer@zoopnet.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/kernel-parameters.txt |  13 +-
 arch/x86/Kconfig                    |   9 +
 arch/x86/kernel/cpu/mtrr/main.c     | 610 +++++++++++++++++++++++++++---------
 3 files changed, 484 insertions(+), 148 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 44427607b7c6..2000977af776 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -610,8 +610,17 @@ and is between 256 and 4096 characters. It is defined in the file
 			that could hold holes aka. UC entries.
 
 	mtrr_gran_size=nn[KMG] [X86]
-			used for mtrr cleanup. It is granity of mtrr block.
-			Big value could prevent small alignment use up MTRRs.
+			Used for mtrr cleanup. It is granularity of mtrr block.
+			Default is 1.
+			Large value could prevent small alignment from
+			using up MTRRs.
+
+	mtrr_spare_reg_nr=n [X86]
+			Format: <integer>
+			Range: 0,7 : spare reg number
+			Default : 1
+			Used for mtrr cleanup. It is spare mtrr entries number.
+			Set to 2 or more if your graphical card needs more.
 
 	disable_mtrr_trim [X86, Intel and AMD only]
 			By default the kernel will trim any uncacheable
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f417fe33d852..9e9761504422 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1117,6 +1117,15 @@ config MTRR_SANITIZER_ENABLE_DEFAULT
 	help
 	  Enable mtrr cleanup default value
 
+config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
+	int "MTRR cleanup spare reg num (0-7)"
+	range 0 7
+	default "1"
+	depends on MTRR_SANITIZER
+	help
+	  mtrr cleanup spare entries default, it can be changed via
+	  mtrr_spare_reg_nr=
+
 config X86_PAT
 	bool
 	prompt "x86 PAT support"
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 79597d3c3431..e6c162c379ac 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -610,28 +610,6 @@ static struct sysdev_driver mtrr_sysdev_driver = {
 	.resume		= mtrr_restore,
 };
 
-#ifdef CONFIG_MTRR_SANITIZER
-static int enable_mtrr_cleanup __initdata = CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
-#else
-static int enable_mtrr_cleanup __initdata = -1;
-#endif
-
-static int __init disable_mtrr_cleanup_setup(char *str)
-{
-	if (enable_mtrr_cleanup != -1)
-		enable_mtrr_cleanup = 0;
-	return 0;
-}
-early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
-
-static int __init enable_mtrr_cleanup_setup(char *str)
-{
-	if (enable_mtrr_cleanup != -1)
-		enable_mtrr_cleanup = 1;
-	return 0;
-}
-early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup);
-
 /* should be related to MTRR_VAR_RANGES nums */
 #define RANGE_NUM 256
 
@@ -702,13 +680,15 @@ subtract_range(struct res_range *range, unsigned long start, unsigned long end)
 			continue;
 		}
 
-		if (start <= range[j].start && end < range[j].end && range[j].start < end + 1) {
+		if (start <= range[j].start && end < range[j].end &&
+		    range[j].start < end + 1) {
 			range[j].start = end + 1;
 			continue;
 		}
 
 
-		if (start > range[j].start && end >= range[j].end && range[j].end > start - 1) {
+		if (start > range[j].start && end >= range[j].end &&
+		    range[j].end > start - 1) {
 			range[j].end = start - 1;
 			continue;
 		}
@@ -743,18 +723,123 @@ static int __init cmp_range(const void *x1, const void *x2)
 	return start1 - start2;
 }
 
+struct var_mtrr_range_state {
+	unsigned long base_pfn;
+	unsigned long size_pfn;
+	mtrr_type type;
+};
+
+struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
+
+static int __init
+x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
+		       unsigned long extra_remove_base,
+		       unsigned long extra_remove_size)
+{
+	unsigned long i, base, size;
+	mtrr_type type;
+
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
+		if (type != MTRR_TYPE_WRBACK)
+			continue;
+		base = range_state[i].base_pfn;
+		size = range_state[i].size_pfn;
+		nr_range = add_range_with_merge(range, nr_range, base,
+						base + size - 1);
+	}
+	printk(KERN_DEBUG "After WB checking\n");
+	for (i = 0; i < nr_range; i++)
+		printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+				 range[i].start, range[i].end + 1);
+
+	/* take out UC ranges */
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
+		if (type != MTRR_TYPE_UNCACHABLE)
+			continue;
+		size = range_state[i].size_pfn;
+		if (!size)
+			continue;
+		base = range_state[i].base_pfn;
+		subtract_range(range, base, base + size - 1);
+	}
+	if (extra_remove_size)
+		subtract_range(range, extra_remove_base,
+				 extra_remove_base + extra_remove_size  - 1);
+
+	/* get new range num */
+	nr_range = 0;
+	for (i = 0; i < RANGE_NUM; i++) {
+		if (!range[i].end)
+			continue;
+		nr_range++;
+	}
+	printk(KERN_DEBUG "After UC checking\n");
+	for (i = 0; i < nr_range; i++)
+		printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+			 range[i].start, range[i].end + 1);
+
+	/* sort the ranges */
+	sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+	printk(KERN_DEBUG "After sorting\n");
+	for (i = 0; i < nr_range; i++)
+		printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+				 range[i].start, range[i].end + 1);
+
+	/* clear those is not used */
+	for (i = nr_range; i < RANGE_NUM; i++)
+		memset(&range[i], 0, sizeof(range[i]));
+
+	return nr_range;
+}
+
+static struct res_range __initdata range[RANGE_NUM];
+
+#ifdef CONFIG_MTRR_SANITIZER
+
+static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
+{
+	unsigned long sum;
+	int i;
+
+	sum = 0;
+	for (i = 0; i < nr_range; i++)
+		sum += range[i].end + 1 - range[i].start;
+
+	return sum;
+}
+
+static int enable_mtrr_cleanup __initdata =
+	CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
+
+static int __init disable_mtrr_cleanup_setup(char *str)
+{
+	if (enable_mtrr_cleanup != -1)
+		enable_mtrr_cleanup = 0;
+	return 0;
+}
+early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
+
+static int __init enable_mtrr_cleanup_setup(char *str)
+{
+	if (enable_mtrr_cleanup != -1)
+		enable_mtrr_cleanup = 1;
+	return 0;
+}
+early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup);
+
 struct var_mtrr_state {
 	unsigned long	range_startk;
 	unsigned long	range_sizek;
 	unsigned long	chunk_sizek;
 	unsigned long	gran_sizek;
 	unsigned int	reg;
-	unsigned int	address_bits;
 };
 
 static void __init
 set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
-		unsigned char type, unsigned address_bits)
+		unsigned char type, unsigned int address_bits)
 {
 	u32 base_lo, base_hi, mask_lo, mask_hi;
 	u64 base, mask;
@@ -781,10 +866,34 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
 	fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
 }
 
+static void __init
+save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+		unsigned char type)
+{
+	range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
+	range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
+	range_state[reg].type = type;
+}
+
+static void __init
+set_var_mtrr_all(unsigned int address_bits)
+{
+	unsigned long basek, sizek;
+	unsigned char type;
+	unsigned int reg;
+
+	for (reg = 0; reg < num_var_ranges; reg++) {
+		basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
+		sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
+		type = range_state[reg].type;
+
+		set_var_mtrr(reg, basek, sizek, type, address_bits);
+	}
+}
+
 static unsigned int __init
 range_to_mtrr(unsigned int reg, unsigned long range_startk,
-	      unsigned long range_sizek, unsigned char type,
-	      unsigned address_bits)
+	      unsigned long range_sizek, unsigned char type)
 {
 	if (!range_sizek || (reg >= num_var_ranges))
 		return reg;
@@ -803,12 +912,13 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
 			align = max_align;
 
 		sizek = 1 << align;
-		printk(KERN_INFO "Setting variable MTRR %d, base: %ldMB, range: %ldMB, type %s\n",
+		printk(KERN_DEBUG "Setting variable MTRR %d, base: %ldMB, "
+		       "range: %ldMB, type %s\n",
 			reg, range_startk >> 10, sizek >> 10,
 			(type == MTRR_TYPE_UNCACHABLE)?"UC":
 			    ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
 			);
-		set_var_mtrr(reg++, range_startk, sizek, type, address_bits);
+		save_var_mtrr(reg++, range_startk, sizek, type);
 		range_startk += sizek;
 		range_sizek -= sizek;
 		if (reg >= num_var_ranges)
@@ -817,10 +927,12 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
 	return reg;
 }
 
-static void __init
-range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek)
+static unsigned __init
+range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
+			unsigned long sizek)
 {
 	unsigned long hole_basek, hole_sizek;
+	unsigned long second_basek, second_sizek;
 	unsigned long range0_basek, range0_sizek;
 	unsigned long range_basek, range_sizek;
 	unsigned long chunk_sizek;
@@ -828,64 +940,95 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek)
 
 	hole_basek = 0;
 	hole_sizek = 0;
+	second_basek = 0;
+	second_sizek = 0;
 	chunk_sizek = state->chunk_sizek;
 	gran_sizek = state->gran_sizek;
 
 	/* align with gran size, prevent small block used up MTRRs */
 	range_basek = ALIGN(state->range_startk, gran_sizek);
 	if ((range_basek > basek) && basek)
-		return;
-	range_sizek = ALIGN(state->range_sizek - (range_basek - state->range_startk), gran_sizek);
+		return second_sizek;
+	state->range_sizek -= (range_basek - state->range_startk);
+	range_sizek = ALIGN(state->range_sizek, gran_sizek);
 
-	while (range_basek + range_sizek > (state->range_startk + state->range_sizek)) {
+	while (range_sizek > state->range_sizek) {
 		range_sizek -= gran_sizek;
 		if (!range_sizek)
-			return;
+			return 0;
 	}
-	state->range_startk = range_basek;
 	state->range_sizek = range_sizek;
 
 	/* try to append some small hole */
 	range0_basek = state->range_startk;
 	range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
 	if (range0_sizek == state->range_sizek) {
-			printk(KERN_INFO "rangeX: %016lx - %016lx\n", range0_basek<<10, (range0_basek + state->range_sizek)<<10);
-			state->reg = range_to_mtrr(state->reg, range0_basek,
-				state->range_sizek, MTRR_TYPE_WRBACK, state->address_bits);
-		return;
-	} else if (basek) {
-	    while (range0_basek + range0_sizek - chunk_sizek > basek) {
+		printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", range0_basek<<10,
+				(range0_basek + state->range_sizek)<<10);
+		state->reg = range_to_mtrr(state->reg, range0_basek,
+				state->range_sizek, MTRR_TYPE_WRBACK);
+		return 0;
+	}
+
+	range0_sizek -= chunk_sizek;
+	if (range0_sizek && sizek) {
+	    while (range0_basek + range0_sizek > (basek + sizek)) {
 		range0_sizek -= chunk_sizek;
 		if (!range0_sizek)
 			break;
 	    }
 	}
 
+	if (range0_sizek) {
+		printk(KERN_DEBUG "range0: %016lx - %016lx\n", range0_basek<<10,
+				(range0_basek + range0_sizek)<<10);
+		state->reg = range_to_mtrr(state->reg, range0_basek,
+				range0_sizek, MTRR_TYPE_WRBACK);
 
-	if (range0_sizek > chunk_sizek)
-		range0_sizek -= chunk_sizek;
-	printk(KERN_INFO "range0: %016lx - %016lx\n", range0_basek<<10, (range0_basek + range0_sizek)<<10);
-	state->reg = range_to_mtrr(state->reg, range0_basek,
-			range0_sizek, MTRR_TYPE_WRBACK, state->address_bits);
+	}
 
 	range_basek = range0_basek + range0_sizek;
 	range_sizek = chunk_sizek;
 
-	if ((range_sizek - (state->range_sizek - range0_sizek) < (chunk_sizek >> 1)) &&
-	    (range_basek + range_sizek <= basek)) {
-		hole_sizek = range_sizek - (state->range_sizek - range0_sizek);
-		hole_basek = range_basek + range_sizek - hole_sizek;
-	} else
+	if (range_basek + range_sizek > basek &&
+	    range_basek + range_sizek <= (basek + sizek)) {
+		/* one hole */
+		second_basek = basek;
+		second_sizek = range_basek + range_sizek - basek;
+	}
+
+	/* if last piece, only could one hole near end */
+	if ((second_basek || !basek) &&
+	    range_sizek - (state->range_sizek - range0_sizek) - second_sizek <
+	    (chunk_sizek >> 1)) {
+		/*
+		 * one hole in middle (second_sizek is 0) or at end
+		 * (second_sizek is 0 )
+		 */
+		hole_sizek = range_sizek - (state->range_sizek - range0_sizek)
+				 - second_sizek;
+		hole_basek = range_basek + range_sizek - hole_sizek
+				 - second_sizek;
+	} else {
+		/* fallback for big hole, or several holes */
 		range_sizek = state->range_sizek - range0_sizek;
+		second_basek = 0;
+		second_sizek = 0;
+	}
 
-	printk(KERN_INFO "range: %016lx - %016lx\n", range_basek<<10, (range_basek + range_sizek)<<10);
-	state->reg = range_to_mtrr(state->reg, range_basek,
-			range_sizek, MTRR_TYPE_WRBACK, state->address_bits);
+	printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10,
+			 (range_basek + range_sizek)<<10);
+	state->reg = range_to_mtrr(state->reg, range_basek, range_sizek,
+					 MTRR_TYPE_WRBACK);
 	if (hole_sizek) {
-		printk(KERN_INFO "hole: %016lx - %016lx\n", hole_basek<<10, (hole_basek + hole_sizek)<<10);
-		state->reg = range_to_mtrr(state->reg, hole_basek,
-				hole_sizek, MTRR_TYPE_UNCACHABLE, state->address_bits);
+		printk(KERN_DEBUG "hole: %016lx - %016lx\n", hole_basek<<10,
+				 (hole_basek + hole_sizek)<<10);
+		state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek,
+						 MTRR_TYPE_UNCACHABLE);
+
 	}
+
+	return second_sizek;
 }
 
 static void __init
@@ -893,6 +1036,7 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
 		   unsigned long size_pfn)
 {
 	unsigned long basek, sizek;
+	unsigned long second_sizek = 0;
 
 	if (state->reg >= num_var_ranges)
 		return;
@@ -901,21 +1045,19 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
 	sizek = size_pfn << (PAGE_SHIFT - 10);
 
 	/* See if I can merge with the last range */
-	if ((basek <= 1024) || (state->range_startk + state->range_sizek == basek)) {
+	if ((basek <= 1024) ||
+	    (state->range_startk + state->range_sizek == basek)) {
 		unsigned long endk = basek + sizek;
 		state->range_sizek = endk - state->range_startk;
 		return;
 	}
 	/* Write the range mtrrs */
-	if (state->range_sizek != 0) {
-		range_to_mtrr_with_hole(state, basek);
+	if (state->range_sizek != 0)
+		second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
 
-		state->range_startk = 0;
-		state->range_sizek = 0;
-	}
 	/* Allocate an msr */
-	state->range_startk = basek;
-	state->range_sizek  = sizek;
+	state->range_startk = basek + second_sizek;
+	state->range_sizek  = sizek - second_sizek;
 }
 
 /* mininum size of mtrr block that can take hole */
@@ -931,7 +1073,7 @@ static int __init parse_mtrr_chunk_size_opt(char *p)
 early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
 
 /* granity of mtrr of block */
-static u64 mtrr_gran_size __initdata = (1ULL<<20);
+static u64 mtrr_gran_size __initdata;
 
 static int __init parse_mtrr_gran_size_opt(char *p)
 {
@@ -942,91 +1084,84 @@ static int __init parse_mtrr_gran_size_opt(char *p)
 }
 early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
 
-static void __init
+static int nr_mtrr_spare_reg __initdata =
+				 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
+
+static int __init parse_mtrr_spare_reg(char *arg)
+{
+	if (arg)
+		nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
+	return 0;
+}
+
+early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
+
+static int __init
 x86_setup_var_mtrrs(struct res_range *range, int nr_range,
-		    unsigned address_bits)
+		    u64 chunk_size, u64 gran_size)
 {
 	struct var_mtrr_state var_state;
 	int i;
+	int num_reg;
 
 	var_state.range_startk	= 0;
 	var_state.range_sizek	= 0;
 	var_state.reg		= 0;
-	var_state.address_bits	= address_bits;
-	var_state.chunk_sizek	= mtrr_chunk_size >> 10;
-	var_state.gran_sizek	= mtrr_gran_size >> 10;
+	var_state.chunk_sizek	= chunk_size >> 10;
+	var_state.gran_sizek	= gran_size >> 10;
+
+	memset(range_state, 0, sizeof(range_state));
 
 	/* Write the range etc */
 	for (i = 0; i < nr_range; i++)
-		set_var_mtrr_range(&var_state, range[i].start, range[i].end - range[i].start + 1);
+		set_var_mtrr_range(&var_state, range[i].start,
+				   range[i].end - range[i].start + 1);
 
 	/* Write the last range */
-	range_to_mtrr_with_hole(&var_state, 0);
-	printk(KERN_INFO "DONE variable MTRRs\n");
+	if (var_state.range_sizek != 0)
+		range_to_mtrr_with_hole(&var_state, 0, 0);
+	printk(KERN_DEBUG "DONE variable MTRRs\n");
+
+	num_reg = var_state.reg;
 	/* Clear out the extra MTRR's */
 	while (var_state.reg < num_var_ranges) {
-		set_var_mtrr(var_state.reg, 0, 0, 0, var_state.address_bits);
+		save_var_mtrr(var_state.reg, 0, 0, 0);
 		var_state.reg++;
 	}
-}
-
-static int __init
-x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
-		       unsigned long extra_remove_base,
-		       unsigned long extra_remove_size)
-{
-	unsigned long i, base, size;
-	mtrr_type type;
-
-	for (i = 0; i < num_var_ranges; i++) {
-		mtrr_if->get(i, &base, &size, &type);
-		if (type != MTRR_TYPE_WRBACK)
-			continue;
-		nr_range = add_range_with_merge(range, nr_range, base, base + size - 1);
-	}
-	printk(KERN_INFO "After WB checking\n");
-	for (i = 0; i < nr_range; i++)
-		printk(KERN_INFO "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1);
 
-	/* take out UC ranges */
-	for (i = 0; i < num_var_ranges; i++) {
-		mtrr_if->get(i, &base, &size, &type);
-		if (type != MTRR_TYPE_UNCACHABLE)
-			continue;
-		if (!size)
-			continue;
-		subtract_range(range, base, base + size - 1);
-	}
-	if (extra_remove_size)
-		subtract_range(range, extra_remove_base,  extra_remove_base + extra_remove_size  - 1);
+	return num_reg;
+}
 
-	/* get new range num */
-	nr_range = 0;
-	for (i = 0; i < RANGE_NUM; i++) {
-		if (!range[i].end)
-			continue;
-		nr_range++;
-	}
-	printk(KERN_INFO "After UC checking\n");
-	for (i = 0; i < nr_range; i++)
-		printk(KERN_INFO "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1);
+struct mtrr_cleanup_result {
+	unsigned long gran_sizek;
+	unsigned long chunk_sizek;
+	unsigned long lose_cover_sizek;
+	unsigned int num_reg;
+	int bad;
+};
 
-	/* sort the ranges */
-	sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
-	printk(KERN_INFO "After sorting\n");
-	for (i = 0; i < nr_range; i++)
-		printk(KERN_INFO "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1);
+/*
+ * gran_size: 1M, 2M, ..., 2G
+ * chunk size: gran_size, ..., 4G
+ * so we need (2+13)*6
+ */
+#define NUM_RESULT	90
+#define PSHIFT		(PAGE_SHIFT - 10)
 
-	return nr_range;
-}
+static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
+static struct res_range __initdata range_new[RANGE_NUM];
+static unsigned long __initdata min_loss_pfn[RANGE_NUM];
 
 static int __init mtrr_cleanup(unsigned address_bits)
 {
 	unsigned long extra_remove_base, extra_remove_size;
 	unsigned long i, base, size, def, dummy;
-	struct res_range range[RANGE_NUM];
 	mtrr_type type;
-	int nr_range;
+	int nr_range, nr_range_new;
+	u64 chunk_size, gran_size;
+	unsigned long range_sums, range_sums_new;
+	int index_good;
+	int num_reg_good;
 
 	/* extra one for all 0 */
 	int num[MTRR_NUM_TYPES + 1];
@@ -1038,10 +1173,20 @@ static int __init mtrr_cleanup(unsigned address_bits)
 	if (def != MTRR_TYPE_UNCACHABLE)
 		return 0;
 
+	/* get it and store it aside */
+	memset(range_state, 0, sizeof(range_state));
+	for (i = 0; i < num_var_ranges; i++) {
+		mtrr_if->get(i, &base, &size, &type);
+		range_state[i].base_pfn = base;
+		range_state[i].size_pfn = size;
+		range_state[i].type = type;
+	}
+
 	/* check entries number */
 	memset(num, 0, sizeof(num));
 	for (i = 0; i < num_var_ranges; i++) {
-		mtrr_if->get(i, &base, &size, &type);
+		type = range_state[i].type;
+		size = range_state[i].size_pfn;
 		if (type >= MTRR_NUM_TYPES)
 			continue;
 		if (!size)
@@ -1062,15 +1207,172 @@ static int __init mtrr_cleanup(unsigned address_bits)
 	extra_remove_size = 0;
 	if (mtrr_tom2) {
 		extra_remove_base = 1 << (32 - PAGE_SHIFT);
-		extra_remove_size = (mtrr_tom2>>PAGE_SHIFT) - extra_remove_base;
+		extra_remove_size =
+			(mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
+	}
+	nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
+					  extra_remove_size);
+	range_sums = sum_ranges(range, nr_range);
+	printk(KERN_INFO "total RAM coverred: %ldM\n",
+	       range_sums >> (20 - PAGE_SHIFT));
+
+	if (mtrr_chunk_size && mtrr_gran_size) {
+		int num_reg;
+
+		/* convert ranges to var ranges state */
+		num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
+					      mtrr_gran_size);
+
+		/* we got new setting in range_state, check it */
+		memset(range_new, 0, sizeof(range_new));
+		nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
+						      extra_remove_base,
+						      extra_remove_size);
+		range_sums_new = sum_ranges(range_new, nr_range_new);
+
+		i = 0;
+		result[i].chunk_sizek = mtrr_chunk_size >> 10;
+		result[i].gran_sizek = mtrr_gran_size >> 10;
+		result[i].num_reg = num_reg;
+		if (range_sums < range_sums_new) {
+			result[i].lose_cover_sizek =
+				(range_sums_new - range_sums) << PSHIFT;
+			result[i].bad = 1;
+		} else
+			result[i].lose_cover_sizek =
+				(range_sums - range_sums_new) << PSHIFT;
+
+		printk(KERN_INFO " %sgran_size: %ldM  \tchunk_size: %ldM  \t",
+			 result[i].bad?" BAD ":"", result[i].gran_sizek >> 10,
+			 result[i].chunk_sizek >> 10);
+		printk(KERN_CONT "num_reg: %d  \tlose cover RAM: %s%ldM \n",
+			 result[i].num_reg, result[i].bad?"-":"",
+			 result[i].lose_cover_sizek >> 10);
+		if (!result[i].bad) {
+			set_var_mtrr_all(address_bits);
+			return 1;
+		}
+		printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
+		       "will find optimal one\n");
+		memset(result, 0, sizeof(result[0]));
+	}
+
+	i = 0;
+	memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
+	memset(result, 0, sizeof(result));
+	for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) {
+		for (chunk_size = gran_size; chunk_size < (1ULL<<33);
+		     chunk_size <<= 1) {
+			int num_reg;
+
+			printk(KERN_INFO
+			       "\ngran_size: %lldM   chunk_size_size: %lldM\n",
+			       gran_size >> 20, chunk_size >> 20);
+			if (i >= NUM_RESULT)
+				continue;
+
+			/* convert ranges to var ranges state */
+			num_reg = x86_setup_var_mtrrs(range, nr_range,
+							 chunk_size, gran_size);
+
+			/* we got new setting in range_state, check it */
+			memset(range_new, 0, sizeof(range_new));
+			nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
+					 extra_remove_base, extra_remove_size);
+			range_sums_new = sum_ranges(range_new, nr_range_new);
+
+			result[i].chunk_sizek = chunk_size >> 10;
+			result[i].gran_sizek = gran_size >> 10;
+			result[i].num_reg = num_reg;
+			if (range_sums < range_sums_new) {
+				result[i].lose_cover_sizek =
+					(range_sums_new - range_sums) << PSHIFT;
+				result[i].bad = 1;
+			} else
+				result[i].lose_cover_sizek =
+					(range_sums - range_sums_new) << PSHIFT;
+
+			/* double check it */
+			if (!result[i].bad && !result[i].lose_cover_sizek) {
+				if (nr_range_new != nr_range ||
+					memcmp(range, range_new, sizeof(range)))
+						result[i].bad = 1;
+			}
+
+			if (!result[i].bad && (range_sums - range_sums_new <
+					       min_loss_pfn[num_reg])) {
+				min_loss_pfn[num_reg] =
+					range_sums - range_sums_new;
+			}
+			i++;
+		}
 	}
-	nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, extra_remove_size);
 
-	/* convert ranges to var ranges state */
-	x86_setup_var_mtrrs(range, nr_range, address_bits);
+	/* print out all */
+	for (i = 0; i < NUM_RESULT; i++) {
+		printk(KERN_INFO "%sgran_size: %ldM  \tchunk_size: %ldM  \t",
+		       result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10,
+		       result[i].chunk_sizek >> 10);
+		printk(KERN_CONT "num_reg: %d  \tlose cover RAM: %s%ldM \n",
+		       result[i].num_reg, result[i].bad?"-":"",
+		       result[i].lose_cover_sizek >> 10);
+	}
 
-	return 1;
+	/* try to find the optimal index */
+	if (nr_mtrr_spare_reg >= num_var_ranges)
+		nr_mtrr_spare_reg = num_var_ranges - 1;
+	num_reg_good = -1;
+	for (i = 1; i < num_var_ranges + 1 - nr_mtrr_spare_reg; i++) {
+		if (!min_loss_pfn[i]) {
+			num_reg_good = i;
+			break;
+		}
+	}
+
+	index_good = -1;
+	if (num_reg_good != -1) {
+		for (i = 0; i < NUM_RESULT; i++) {
+			if (!result[i].bad &&
+			    result[i].num_reg == num_reg_good &&
+			    !result[i].lose_cover_sizek) {
+				index_good = i;
+				break;
+			}
+		}
+	}
+
+	if (index_good != -1) {
+		printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
+		i = index_good;
+		printk(KERN_INFO "gran_size: %ldM  \tchunk_size: %ldM  \t",
+				result[i].gran_sizek >> 10,
+				result[i].chunk_sizek >> 10);
+		printk(KERN_CONT "num_reg: %d  \tlose cover RAM: %ldM \n",
+				result[i].num_reg,
+				result[i].lose_cover_sizek >> 10);
+		/* convert ranges to var ranges state */
+		chunk_size = result[i].chunk_sizek;
+		chunk_size <<= 10;
+		gran_size = result[i].gran_sizek;
+		gran_size <<= 10;
+		x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
+		set_var_mtrr_all(address_bits);
+		return 1;
+	}
+
+	printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
+	printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
+
+	return 0;
 }
+#else
+static int __init mtrr_cleanup(unsigned address_bits)
+{
+	return 0;
+}
+#endif
+
+static int __initdata changed_by_mtrr_cleanup;
 
 static int disable_mtrr_trim;
 
@@ -1111,7 +1413,8 @@ int __init amd_special_default_mtrr(void)
 	return 0;
 }
 
-static u64 __init real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn)
+static u64 __init real_trim_memory(unsigned long start_pfn,
+				   unsigned long limit_pfn)
 {
 	u64 trim_start, trim_size;
 	trim_start = start_pfn;
@@ -1138,9 +1441,8 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 {
 	unsigned long i, base, size, highest_pfn = 0, def, dummy;
 	mtrr_type type;
-	struct res_range range[RANGE_NUM];
 	int nr_range;
-	u64 total_real_trim_size;
+	u64 total_trim_size;
 
 	/* extra one for all 0 */
 	int num[MTRR_NUM_TYPES + 1];
@@ -1155,11 +1457,22 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 	if (def != MTRR_TYPE_UNCACHABLE)
 		return 0;
 
-	/* Find highest cached pfn */
+	/* get it and store it aside */
+	memset(range_state, 0, sizeof(range_state));
 	for (i = 0; i < num_var_ranges; i++) {
 		mtrr_if->get(i, &base, &size, &type);
+		range_state[i].base_pfn = base;
+		range_state[i].size_pfn = size;
+		range_state[i].type = type;
+	}
+
+	/* Find highest cached pfn */
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
 		if (type != MTRR_TYPE_WRBACK)
 			continue;
+		base = range_state[i].base_pfn;
+		size = range_state[i].size_pfn;
 		if (highest_pfn < base + size)
 			highest_pfn = base + size;
 	}
@@ -1177,9 +1490,10 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 	/* check entries number */
 	memset(num, 0, sizeof(num));
 	for (i = 0; i < num_var_ranges; i++) {
-		mtrr_if->get(i, &base, &size, &type);
+		type = range_state[i].type;
 		if (type >= MTRR_NUM_TYPES)
 			continue;
+		size = range_state[i].size_pfn;
 		if (!size)
 			type = MTRR_NUM_TYPES;
 		num[type]++;
@@ -1205,26 +1519,28 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 	}
 	nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
 
-	total_real_trim_size = 0;
+	total_trim_size = 0;
 	/* check the head */
 	if (range[0].start)
-		total_real_trim_size += real_trim_memory(0, range[0].start);
+		total_trim_size += real_trim_memory(0, range[0].start);
 	/* check the holes */
 	for (i = 0; i < nr_range - 1; i++) {
 		if (range[i].end + 1 < range[i+1].start)
-			total_real_trim_size += real_trim_memory(range[i].end + 1, range[i+1].start);
+			total_trim_size += real_trim_memory(range[i].end + 1,
+							    range[i+1].start);
 	}
 	/* check the top */
 	i = nr_range - 1;
 	if (range[i].end + 1 < end_pfn)
-		total_real_trim_size += real_trim_memory(range[i].end + 1, end_pfn);
+		total_trim_size += real_trim_memory(range[i].end + 1,
+							 end_pfn);
 
-	if (total_real_trim_size) {
+	if (total_trim_size) {
 		printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
 			" all of memory, losing %lluMB of RAM.\n",
-			total_real_trim_size >> 20);
+			total_trim_size >> 20);
 
-		if (enable_mtrr_cleanup < 1)
+		if (!changed_by_mtrr_cleanup)
 			WARN_ON(1);
 
 		printk(KERN_INFO "update e820 for mtrr\n");
@@ -1314,8 +1630,10 @@ void __init mtrr_bp_init(void)
 		if (use_intel()) {
 			get_mtrr_state();
 
-			if (mtrr_cleanup(phys_addr))
+			if (mtrr_cleanup(phys_addr)) {
+				changed_by_mtrr_cleanup = 1;
 				mtrr_if->set_all();
+			}
 
 		}
 	}
@@ -1355,7 +1673,7 @@ static int __init mtrr_init_finialize(void)
 	if (!mtrr_if)
 		return 0;
 	if (use_intel()) {
-		if (enable_mtrr_cleanup < 1)
+		if (!changed_by_mtrr_cleanup)
 			mtrr_state_warn();
 	} else {
 		/* The CPUs haven't MTRR and seem to not support SMP. They have
-- 
cgit v1.2.3


From 833e78bfeeef628f0201349a0a05a54f48f07884 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel.send@gmail.com>
Date: Mon, 5 May 2008 15:57:38 -0700
Subject: x86: process fam 10h like k8 with fixed mtrr setting

otherwise fixed MTRR for family 10h may not be changed.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/mtrr/generic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 5aae648600bf..a83f5cd78885 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -342,7 +342,7 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
 
 	if (lo != msrwords[0] || hi != msrwords[1]) {
 		if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-		    boot_cpu_data.x86 == 15 &&
+		    (boot_cpu_data.x86 >= 0x0f && boot_cpu_data.x86 <= 0x11) &&
 		    ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
 			k8_enable_fixed_iorrs();
 		mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
-- 
cgit v1.2.3


From b79cd8f1268bab57ff85b19d131f7f23deab2dee Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 11 May 2008 00:30:15 -0700
Subject: x86: make e820.c to have common functions

remove the duplicated copy of these functions.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/Makefile   |   2 +-
 arch/x86/kernel/e820.c     | 475 +++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/e820_32.c  | 411 +--------------------------------------
 arch/x86/kernel/e820_64.c  | 444 ------------------------------------------
 arch/x86/kernel/setup_32.c |   2 +-
 include/asm-x86/e820.h     |  14 ++
 include/asm-x86/e820_32.h  |  12 --
 include/asm-x86/e820_64.h  |  12 --
 8 files changed, 496 insertions(+), 876 deletions(-)
 create mode 100644 arch/x86/kernel/e820.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5e618c3b4720..5369a4e36f17 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -22,7 +22,7 @@ obj-y			+= setup_$(BITS).o i8259_$(BITS).o setup.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o setup64.o
-obj-y			+= bootflag.o e820_$(BITS).o
+obj-y			+= bootflag.o e820_$(BITS).o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
 obj-y			+= alternative.o i8253.o pci-nommu.o
 obj-$(CONFIG_X86_64)	+= bugs_64.o
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
new file mode 100644
index 000000000000..2cb686f60d0d
--- /dev/null
+++ b/arch/x86/kernel/e820.c
@@ -0,0 +1,475 @@
+/*
+ * Handle the memory map.
+ * The functions here do the job until bootmem takes over.
+ *
+ *  Getting sanitize_e820_map() in sync with i386 version by applying change:
+ *  -  Provisions for empty E820 memory regions (reported by certain BIOSes).
+ *     Alex Achenbach <xela@slit.de>, December 2002.
+ *  Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
+#include <linux/string.h>
+#include <linux/kexec.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/pfn.h>
+
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/e820.h>
+#include <asm/setup.h>
+
+struct e820map e820;
+
+/* For PCI or other memory-mapped resources */
+unsigned long pci_mem_start = 0xaeedbabe;
+#ifdef CONFIG_PCI
+EXPORT_SYMBOL(pci_mem_start);
+#endif
+
+/*
+ * This function checks if any part of the range <start,end> is mapped
+ * with type.
+ */
+int
+e820_any_mapped(u64 start, u64 end, unsigned type)
+{
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+
+		if (type && ei->type != type)
+			continue;
+		if (ei->addr >= end || ei->addr + ei->size <= start)
+			continue;
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(e820_any_mapped);
+
+/*
+ * This function checks if the entire range <start,end> is mapped with type.
+ *
+ * Note: this function only works correct if the e820 table is sorted and
+ * not-overlapping, which is the case
+ */
+int __init e820_all_mapped(u64 start, u64 end, unsigned type)
+{
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+
+		if (type && ei->type != type)
+			continue;
+		/* is the region (part) in overlap with the current region ?*/
+		if (ei->addr >= end || ei->addr + ei->size <= start)
+			continue;
+
+		/* if the region is at the beginning of <start,end> we move
+		 * start to the end of the region since it's ok until there
+		 */
+		if (ei->addr <= start)
+			start = ei->addr + ei->size;
+		/*
+		 * if start is now at or beyond end, we're done, full
+		 * coverage
+		 */
+		if (start >= end)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Add a memory region to the kernel e820 map.
+ */
+void __init add_memory_region(u64 start, u64 size, int type)
+{
+	int x = e820.nr_map;
+
+	if (x == E820MAX) {
+		printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+		return;
+	}
+
+	e820.map[x].addr = start;
+	e820.map[x].size = size;
+	e820.map[x].type = type;
+	e820.nr_map++;
+}
+
+void __init e820_print_map(char *who)
+{
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
+		       (unsigned long long) e820.map[i].addr,
+		       (unsigned long long)
+		       (e820.map[i].addr + e820.map[i].size));
+		switch (e820.map[i].type) {
+		case E820_RAM:
+			printk(KERN_CONT "(usable)\n");
+			break;
+		case E820_RESERVED:
+			printk(KERN_CONT "(reserved)\n");
+			break;
+		case E820_ACPI:
+			printk(KERN_CONT "(ACPI data)\n");
+			break;
+		case E820_NVS:
+			printk(KERN_CONT "(ACPI NVS)\n");
+			break;
+		default:
+			printk(KERN_CONT "type %u\n", e820.map[i].type);
+			break;
+		}
+	}
+}
+
+/*
+ * Sanitize the BIOS e820 map.
+ *
+ * Some e820 responses include overlapping entries. The following
+ * replaces the original e820 map with a new one, removing overlaps.
+ *
+ */
+int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
+{
+	struct change_member {
+		struct e820entry *pbios; /* pointer to original bios entry */
+		unsigned long long addr; /* address for this change point */
+	};
+	static struct change_member change_point_list[2*E820MAX] __initdata;
+	static struct change_member *change_point[2*E820MAX] __initdata;
+	static struct e820entry *overlap_list[E820MAX] __initdata;
+	static struct e820entry new_bios[E820MAX] __initdata;
+	struct change_member *change_tmp;
+	unsigned long current_type, last_type;
+	unsigned long long last_addr;
+	int chgidx, still_changing;
+	int overlap_entries;
+	int new_bios_entry;
+	int old_nr, new_nr, chg_nr;
+	int i;
+
+	/*
+		Visually we're performing the following
+		(1,2,3,4 = memory types)...
+
+		Sample memory map (w/overlaps):
+		   ____22__________________
+		   ______________________4_
+		   ____1111________________
+		   _44_____________________
+		   11111111________________
+		   ____________________33__
+		   ___________44___________
+		   __________33333_________
+		   ______________22________
+		   ___________________2222_
+		   _________111111111______
+		   _____________________11_
+		   _________________4______
+
+		Sanitized equivalent (no overlap):
+		   1_______________________
+		   _44_____________________
+		   ___1____________________
+		   ____22__________________
+		   ______11________________
+		   _________1______________
+		   __________3_____________
+		   ___________44___________
+		   _____________33_________
+		   _______________2________
+		   ________________1_______
+		   _________________4______
+		   ___________________2____
+		   ____________________33__
+		   ______________________4_
+	*/
+
+	/* if there's only one memory region, don't bother */
+	if (*pnr_map < 2)
+		return -1;
+
+	old_nr = *pnr_map;
+
+	/* bail out if we find any unreasonable addresses in bios map */
+	for (i = 0; i < old_nr; i++)
+		if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
+			return -1;
+
+	/* create pointers for initial change-point information (for sorting) */
+	for (i = 0; i < 2 * old_nr; i++)
+		change_point[i] = &change_point_list[i];
+
+	/* record all known change-points (starting and ending addresses),
+	   omitting those that are for empty memory regions */
+	chgidx = 0;
+	for (i = 0; i < old_nr; i++)	{
+		if (biosmap[i].size != 0) {
+			change_point[chgidx]->addr = biosmap[i].addr;
+			change_point[chgidx++]->pbios = &biosmap[i];
+			change_point[chgidx]->addr = biosmap[i].addr +
+				biosmap[i].size;
+			change_point[chgidx++]->pbios = &biosmap[i];
+		}
+	}
+	chg_nr = chgidx;
+
+	/* sort change-point list by memory addresses (low -> high) */
+	still_changing = 1;
+	while (still_changing)	{
+		still_changing = 0;
+		for (i = 1; i < chg_nr; i++)  {
+			unsigned long long curaddr, lastaddr;
+			unsigned long long curpbaddr, lastpbaddr;
+
+			curaddr = change_point[i]->addr;
+			lastaddr = change_point[i - 1]->addr;
+			curpbaddr = change_point[i]->pbios->addr;
+			lastpbaddr = change_point[i - 1]->pbios->addr;
+
+			/*
+			 * swap entries, when:
+			 *
+			 * curaddr > lastaddr or
+			 * curaddr == lastaddr and curaddr == curpbaddr and
+			 * lastaddr != lastpbaddr
+			 */
+			if (curaddr < lastaddr ||
+			    (curaddr == lastaddr && curaddr == curpbaddr &&
+			     lastaddr != lastpbaddr)) {
+				change_tmp = change_point[i];
+				change_point[i] = change_point[i-1];
+				change_point[i-1] = change_tmp;
+				still_changing = 1;
+			}
+		}
+	}
+
+	/* create a new bios memory map, removing overlaps */
+	overlap_entries = 0;	 /* number of entries in the overlap table */
+	new_bios_entry = 0;	 /* index for creating new bios map entries */
+	last_type = 0;		 /* start with undefined memory type */
+	last_addr = 0;		 /* start with 0 as last starting address */
+
+	/* loop through change-points, determining affect on the new bios map */
+	for (chgidx = 0; chgidx < chg_nr; chgidx++) {
+		/* keep track of all overlapping bios entries */
+		if (change_point[chgidx]->addr ==
+		    change_point[chgidx]->pbios->addr) {
+			/*
+			 * add map entry to overlap list (> 1 entry
+			 * implies an overlap)
+			 */
+			overlap_list[overlap_entries++] =
+				change_point[chgidx]->pbios;
+		} else {
+			/*
+			 * remove entry from list (order independent,
+			 * so swap with last)
+			 */
+			for (i = 0; i < overlap_entries; i++) {
+				if (overlap_list[i] ==
+				    change_point[chgidx]->pbios)
+					overlap_list[i] =
+						overlap_list[overlap_entries-1];
+			}
+			overlap_entries--;
+		}
+		/*
+		 * if there are overlapping entries, decide which
+		 * "type" to use (larger value takes precedence --
+		 * 1=usable, 2,3,4,4+=unusable)
+		 */
+		current_type = 0;
+		for (i = 0; i < overlap_entries; i++)
+			if (overlap_list[i]->type > current_type)
+				current_type = overlap_list[i]->type;
+		/*
+		 * continue building up new bios map based on this
+		 * information
+		 */
+		if (current_type != last_type)	{
+			if (last_type != 0)	 {
+				new_bios[new_bios_entry].size =
+					change_point[chgidx]->addr - last_addr;
+				/*
+				 * move forward only if the new size
+				 * was non-zero
+				 */
+				if (new_bios[new_bios_entry].size != 0)
+					/*
+					 * no more space left for new
+					 * bios entries ?
+					 */
+					if (++new_bios_entry >= E820MAX)
+						break;
+			}
+			if (current_type != 0)	{
+				new_bios[new_bios_entry].addr =
+					change_point[chgidx]->addr;
+				new_bios[new_bios_entry].type = current_type;
+				last_addr = change_point[chgidx]->addr;
+			}
+			last_type = current_type;
+		}
+	}
+	/* retain count for new bios entries */
+	new_nr = new_bios_entry;
+
+	/* copy new bios mapping into original location */
+	memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
+	*pnr_map = new_nr;
+
+	return 0;
+}
+
+/*
+ * Copy the BIOS e820 map into a safe place.
+ *
+ * Sanity-check it while we're at it..
+ *
+ * If we're lucky and live on a modern system, the setup code
+ * will have given us a memory map that we can use to properly
+ * set up memory.  If we aren't, we'll fake a memory map.
+ */
+int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
+{
+	/* Only one memory region (or negative)? Ignore it */
+	if (nr_map < 2)
+		return -1;
+
+	do {
+		u64 start = biosmap->addr;
+		u64 size = biosmap->size;
+		u64 end = start + size;
+		u32 type = biosmap->type;
+
+		/* Overflow in 64 bits? Ignore the memory map. */
+		if (start > end)
+			return -1;
+
+		add_memory_region(start, size, type);
+	} while (biosmap++, --nr_map);
+	return 0;
+}
+
+u64 __init update_memory_range(u64 start, u64 size, unsigned old_type,
+				unsigned new_type)
+{
+	int i;
+	u64 real_updated_size = 0;
+
+	BUG_ON(old_type == new_type);
+
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+		u64 final_start, final_end;
+		if (ei->type != old_type)
+			continue;
+		/* totally covered? */
+		if (ei->addr >= start &&
+		    (ei->addr + ei->size) <= (start + size)) {
+			ei->type = new_type;
+			real_updated_size += ei->size;
+			continue;
+		}
+		/* partially covered */
+		final_start = max(start, ei->addr);
+		final_end = min(start + size, ei->addr + ei->size);
+		if (final_start >= final_end)
+			continue;
+		add_memory_region(final_start, final_end - final_start,
+					 new_type);
+		real_updated_size += final_end - final_start;
+	}
+	return real_updated_size;
+}
+
+void __init update_e820(void)
+{
+	u8 nr_map;
+
+	nr_map = e820.nr_map;
+	if (sanitize_e820_map(e820.map, &nr_map))
+		return;
+	e820.nr_map = nr_map;
+	printk(KERN_INFO "modified physical RAM map:\n");
+	e820_print_map("modified");
+}
+
+/*
+ * Search for the biggest gap in the low 32 bits of the e820
+ * memory space.  We pass this space to PCI to assign MMIO resources
+ * for hotplug or unconfigured devices in.
+ * Hopefully the BIOS let enough space left.
+ */
+__init void e820_setup_gap(void)
+{
+	unsigned long gapstart, gapsize, round;
+	unsigned long long last;
+	int i;
+	int found = 0;
+
+	last = 0x100000000ull;
+	gapstart = 0x10000000;
+	gapsize = 0x400000;
+	i = e820.nr_map;
+	while (--i >= 0) {
+		unsigned long long start = e820.map[i].addr;
+		unsigned long long end = start + e820.map[i].size;
+
+		/*
+		 * Since "last" is at most 4GB, we know we'll
+		 * fit in 32 bits if this condition is true
+		 */
+		if (last > end) {
+			unsigned long gap = last - end;
+
+			if (gap > gapsize) {
+				gapsize = gap;
+				gapstart = end;
+				found = 1;
+			}
+		}
+		if (start < last)
+			last = start;
+	}
+
+#ifdef CONFIG_X86_64
+	if (!found) {
+		gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
+		printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
+		       "address range\n"
+		       KERN_ERR "PCI: Unassigned devices with 32bit resource "
+		       "registers may break!\n");
+	}
+#endif
+
+	/*
+	 * See how much we want to round up: start off with
+	 * rounding to the next 1MB area.
+	 */
+	round = 0x100000;
+	while ((gapsize >> 4) > round)
+		round += round;
+	/* Fun with two's complement */
+	pci_mem_start = (gapstart + round) & -round;
+
+	printk(KERN_INFO
+	       "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
+	       pci_mem_start, gapstart, gapsize);
+}
+
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
index 751d517d802c..dfe25751038b 100644
--- a/arch/x86/kernel/e820_32.c
+++ b/arch/x86/kernel/e820_32.c
@@ -16,21 +16,6 @@
 #include <asm/e820.h>
 #include <asm/setup.h>
 
-struct e820map e820;
-struct change_member {
-	struct e820entry *pbios; /* pointer to original bios entry */
-	unsigned long long addr; /* address for this change point */
-};
-static struct change_member change_point_list[2*E820MAX] __initdata;
-static struct change_member *change_point[2*E820MAX] __initdata;
-static struct e820entry *overlap_list[E820MAX] __initdata;
-static struct e820entry new_bios[E820MAX] __initdata;
-/* For PCI or other memory-mapped resources */
-unsigned long pci_mem_start = 0x10000000;
-#ifdef CONFIG_PCI
-EXPORT_SYMBOL(pci_mem_start);
-#endif
-
 static struct resource system_rom_resource = {
 	.name	= "System ROM",
 	.start	= 0xf0000,
@@ -254,223 +239,6 @@ void __init e820_mark_nosave_regions(void)
 }
 #endif
 
-void __init add_memory_region(unsigned long long start,
-			      unsigned long long size, int type)
-{
-	int x;
-
-	x = e820.nr_map;
-
-	if (x == E820MAX) {
-		printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
-		return;
-	}
-
-	e820.map[x].addr = start;
-	e820.map[x].size = size;
-	e820.map[x].type = type;
-	e820.nr_map++;
-} /* add_memory_region */
-
-/*
- * Sanitize the BIOS e820 map.
- *
- * Some e820 responses include overlapping entries.  The following
- * replaces the original e820 map with a new one, removing overlaps.
- *
- */
-int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
-{
-	struct change_member *change_tmp;
-	unsigned long current_type, last_type;
-	unsigned long long last_addr;
-	int chgidx, still_changing;
-	int overlap_entries;
-	int new_bios_entry;
-	int old_nr, new_nr, chg_nr;
-	int i;
-
-	/*
-		Visually we're performing the following (1,2,3,4 = memory types)...
-
-		Sample memory map (w/overlaps):
-		   ____22__________________
-		   ______________________4_
-		   ____1111________________
-		   _44_____________________
-		   11111111________________
-		   ____________________33__
-		   ___________44___________
-		   __________33333_________
-		   ______________22________
-		   ___________________2222_
-		   _________111111111______
-		   _____________________11_
-		   _________________4______
-
-		Sanitized equivalent (no overlap):
-		   1_______________________
-		   _44_____________________
-		   ___1____________________
-		   ____22__________________
-		   ______11________________
-		   _________1______________
-		   __________3_____________
-		   ___________44___________
-		   _____________33_________
-		   _______________2________
-		   ________________1_______
-		   _________________4______
-		   ___________________2____
-		   ____________________33__
-		   ______________________4_
-	*/
-	/* if there's only one memory region, don't bother */
-	if (*pnr_map < 2) {
-		return -1;
-	}
-
-	old_nr = *pnr_map;
-
-	/* bail out if we find any unreasonable addresses in bios map */
-	for (i=0; i<old_nr; i++)
-		if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
-			return -1;
-		}
-
-	/* create pointers for initial change-point information (for sorting) */
-	for (i=0; i < 2*old_nr; i++)
-		change_point[i] = &change_point_list[i];
-
-	/* record all known change-points (starting and ending addresses),
-	   omitting those that are for empty memory regions */
-	chgidx = 0;
-	for (i=0; i < old_nr; i++)	{
-		if (biosmap[i].size != 0) {
-			change_point[chgidx]->addr = biosmap[i].addr;
-			change_point[chgidx++]->pbios = &biosmap[i];
-			change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
-			change_point[chgidx++]->pbios = &biosmap[i];
-		}
-	}
-	chg_nr = chgidx;    	/* true number of change-points */
-
-	/* sort change-point list by memory addresses (low -> high) */
-	still_changing = 1;
-	while (still_changing)	{
-		still_changing = 0;
-		for (i=1; i < chg_nr; i++)  {
-			/* if <current_addr> > <last_addr>, swap */
-			/* or, if current=<start_addr> & last=<end_addr>, swap */
-			if ((change_point[i]->addr < change_point[i-1]->addr) ||
-				((change_point[i]->addr == change_point[i-1]->addr) &&
-				 (change_point[i]->addr == change_point[i]->pbios->addr) &&
-				 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
-			   )
-			{
-				change_tmp = change_point[i];
-				change_point[i] = change_point[i-1];
-				change_point[i-1] = change_tmp;
-				still_changing=1;
-			}
-		}
-	}
-
-	/* create a new bios memory map, removing overlaps */
-	overlap_entries=0;	 /* number of entries in the overlap table */
-	new_bios_entry=0;	 /* index for creating new bios map entries */
-	last_type = 0;		 /* start with undefined memory type */
-	last_addr = 0;		 /* start with 0 as last starting address */
-	/* loop through change-points, determining affect on the new bios map */
-	for (chgidx=0; chgidx < chg_nr; chgidx++)
-	{
-		/* keep track of all overlapping bios entries */
-		if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
-		{
-			/* add map entry to overlap list (> 1 entry implies an overlap) */
-			overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
-		}
-		else
-		{
-			/* remove entry from list (order independent, so swap with last) */
-			for (i=0; i<overlap_entries; i++)
-			{
-				if (overlap_list[i] == change_point[chgidx]->pbios)
-					overlap_list[i] = overlap_list[overlap_entries-1];
-			}
-			overlap_entries--;
-		}
-		/* if there are overlapping entries, decide which "type" to use */
-		/* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
-		current_type = 0;
-		for (i=0; i<overlap_entries; i++)
-			if (overlap_list[i]->type > current_type)
-				current_type = overlap_list[i]->type;
-		/* continue building up new bios map based on this information */
-		if (current_type != last_type)	{
-			if (last_type != 0)	 {
-				new_bios[new_bios_entry].size =
-					change_point[chgidx]->addr - last_addr;
-				/* move forward only if the new size was non-zero */
-				if (new_bios[new_bios_entry].size != 0)
-					if (++new_bios_entry >= E820MAX)
-						break; 	/* no more space left for new bios entries */
-			}
-			if (current_type != 0)	{
-				new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
-				new_bios[new_bios_entry].type = current_type;
-				last_addr=change_point[chgidx]->addr;
-			}
-			last_type = current_type;
-		}
-	}
-	new_nr = new_bios_entry;   /* retain count for new bios entries */
-
-	/* copy new bios mapping into original location */
-	memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
-	*pnr_map = new_nr;
-
-	return 0;
-}
-
-/*
- * Copy the BIOS e820 map into a safe place.
- *
- * Sanity-check it while we're at it..
- *
- * If we're lucky and live on a modern system, the setup code
- * will have given us a memory map that we can use to properly
- * set up memory.  If we aren't, we'll fake a memory map.
- *
- * We check to see that the memory map contains at least 2 elements
- * before we'll use it, because the detection code in setup.S may
- * not be perfect and most every PC known to man has two memory
- * regions: one from 0 to 640k, and one from 1mb up.  (The IBM
- * thinkpad 560x, for example, does not cooperate with the memory
- * detection code.)
- */
-int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
-{
-	/* Only one memory region (or negative)? Ignore it */
-	if (nr_map < 2)
-		return -1;
-
-	do {
-		u64 start = biosmap->addr;
-		u64 size = biosmap->size;
-		u64 end = start + size;
-		u32 type = biosmap->type;
-
-		/* Overflow in 64 bits? Ignore the memory map. */
-		if (start > end)
-			return -1;
-
-		add_memory_region(start, size, type);
-	} while (biosmap++, --nr_map);
-
-	return 0;
-}
-
 /*
  * Find the highest page frame number we have available
  */
@@ -535,86 +303,12 @@ void __init register_bootmem_low_pages(unsigned long max_low_pfn)
 	}
 }
 
-void __init e820_register_memory(void)
-{
-	unsigned long gapstart, gapsize, round;
-	unsigned long long last;
-	int i;
-
-	/*
-	 * Search for the biggest gap in the low 32 bits of the e820
-	 * memory space.
-	 */
-	last = 0x100000000ull;
-	gapstart = 0x10000000;
-	gapsize = 0x400000;
-	i = e820.nr_map;
-	while (--i >= 0) {
-		unsigned long long start = e820.map[i].addr;
-		unsigned long long end = start + e820.map[i].size;
-
-		/*
-		 * Since "last" is at most 4GB, we know we'll
-		 * fit in 32 bits if this condition is true
-		 */
-		if (last > end) {
-			unsigned long gap = last - end;
-
-			if (gap > gapsize) {
-				gapsize = gap;
-				gapstart = end;
-			}
-		}
-		if (start < last)
-			last = start;
-	}
-
-	/*
-	 * See how much we want to round up: start off with
-	 * rounding to the next 1MB area.
-	 */
-	round = 0x100000;
-	while ((gapsize >> 4) > round)
-		round += round;
-	/* Fun with two's complement */
-	pci_mem_start = (gapstart + round) & -round;
-
-	printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
-		pci_mem_start, gapstart, gapsize);
-}
-
-static void __init print_memory_map(char *who)
-{
-	int i;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		printk(" %s: %016Lx - %016Lx ", who,
-			e820.map[i].addr,
-			e820.map[i].addr + e820.map[i].size);
-		switch (e820.map[i].type) {
-		case E820_RAM:	printk("(usable)\n");
-				break;
-		case E820_RESERVED:
-				printk("(reserved)\n");
-				break;
-		case E820_ACPI:
-				printk("(ACPI data)\n");
-				break;
-		case E820_NVS:
-				printk("(ACPI NVS)\n");
-				break;
-		default:	printk("type %u\n", e820.map[i].type);
-				break;
-		}
-	}
-}
-
 void __init limit_regions(unsigned long long size)
 {
 	unsigned long long current_addr;
 	int i;
 
-	print_memory_map("limit_regions start");
+	e820_print_map("limit_regions start");
 	for (i = 0; i < e820.nr_map; i++) {
 		current_addr = e820.map[i].addr + e820.map[i].size;
 		if (current_addr < size)
@@ -633,62 +327,10 @@ void __init limit_regions(unsigned long long size)
 			e820.nr_map = i + 1;
 			e820.map[i].size -= current_addr - size;
 		}
-		print_memory_map("limit_regions endfor");
+		e820_print_map("limit_regions endfor");
 		return;
 	}
-	print_memory_map("limit_regions endfunc");
-}
-
-/*
- * This function checks if any part of the range <start,end> is mapped
- * with type.
- */
-int
-e820_any_mapped(u64 start, u64 end, unsigned type)
-{
-	int i;
-	for (i = 0; i < e820.nr_map; i++) {
-		const struct e820entry *ei = &e820.map[i];
-		if (type && ei->type != type)
-			continue;
-		if (ei->addr >= end || ei->addr + ei->size <= start)
-			continue;
-		return 1;
-	}
-	return 0;
-}
-EXPORT_SYMBOL_GPL(e820_any_mapped);
-
- /*
-  * This function checks if the entire range <start,end> is mapped with type.
-  *
-  * Note: this function only works correct if the e820 table is sorted and
-  * not-overlapping, which is the case
-  */
-int __init
-e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
-{
-	u64 start = s;
-	u64 end = e;
-	int i;
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		if (type && ei->type != type)
-			continue;
-		/* is the region (part) in overlap with the current region ?*/
-		if (ei->addr >= end || ei->addr + ei->size <= start)
-			continue;
-		/* if the region is at the beginning of <start,end> we move
-		 * start to the end of the region since it's ok until there
-		 */
-		if (ei->addr <= start)
-			start = ei->addr + ei->size;
-		/* if start is now at or beyond end, we're done, full
-		 * coverage */
-		if (start >= end)
-			return 1; /* we're done */
-	}
-	return 0;
+	e820_print_map("limit_regions endfunc");
 }
 
 /* Overridden in paravirt.c if CONFIG_PARAVIRT */
@@ -700,7 +342,7 @@ char * __init __attribute__((weak)) memory_setup(void)
 void __init setup_memory_map(void)
 {
 	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
-	print_memory_map(memory_setup());
+	e820_print_map(memory_setup());
 }
 
 static int __initdata user_defined_memmap;
@@ -783,55 +425,12 @@ static int __init parse_memmap(char *arg)
 	return 0;
 }
 early_param("memmap", parse_memmap);
-u64 __init update_memory_range(u64 start, u64 size, unsigned old_type,
-				unsigned new_type)
-{
-	int i;
-	u64 real_updated_size = 0;
-
-	BUG_ON(old_type == new_type);
-
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		u64 final_start, final_end;
-		if (ei->type != old_type)
-			continue;
-		/* totally covered? */
-		if (ei->addr >= start &&
-		    (ei->addr + ei->size) <= (start + size)) {
-			ei->type = new_type;
-			real_updated_size += ei->size;
-			continue;
-		}
-		/* partially covered */
-		final_start = max(start, ei->addr);
-		final_end = min(start + size, ei->addr + ei->size);
-		if (final_start >= final_end)
-			continue;
-		add_memory_region(final_start, final_end - final_start,
-					 new_type);
-		real_updated_size += final_end - final_start;
-	}
-
-	return real_updated_size;
-}
 
 void __init finish_e820_parsing(void)
 {
 	if (user_defined_memmap) {
 		printk(KERN_INFO "user-defined physical RAM map:\n");
-		print_memory_map("user");
+		e820_print_map("user");
 	}
 }
 
-void __init update_e820(void)
-{
-	u8 nr_map;
-
-	nr_map = e820.nr_map;
-	if (sanitize_e820_map(e820.map, &nr_map))
-		return;
-	e820.nr_map = nr_map;
-	printk(KERN_INFO "modified physical RAM map:\n");
-	print_memory_map("modified");
-}
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index c45b4dea4055..28a14eb65d3d 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -29,8 +29,6 @@
 #include <asm/kdebug.h>
 #include <asm/trampoline.h>
 
-struct e820map e820;
-
 /*
  * PFN of last memory page.
  */
@@ -176,62 +174,6 @@ again:
 	}
 	return changed;
 }
-/*
- * This function checks if any part of the range <start,end> is mapped
- * with type.
- */
-int
-e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
-{
-	int i;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-
-		if (type && ei->type != type)
-			continue;
-		if (ei->addr >= end || ei->addr + ei->size <= start)
-			continue;
-		return 1;
-	}
-	return 0;
-}
-EXPORT_SYMBOL_GPL(e820_any_mapped);
-
-/*
- * This function checks if the entire range <start,end> is mapped with type.
- *
- * Note: this function only works correct if the e820 table is sorted and
- * not-overlapping, which is the case
- */
-int __init e820_all_mapped(unsigned long start, unsigned long end,
-			   unsigned type)
-{
-	int i;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-
-		if (type && ei->type != type)
-			continue;
-		/* is the region (part) in overlap with the current region ?*/
-		if (ei->addr >= end || ei->addr + ei->size <= start)
-			continue;
-
-		/* if the region is at the beginning of <start,end> we move
-		 * start to the end of the region since it's ok until there
-		 */
-		if (ei->addr <= start)
-			start = ei->addr + ei->size;
-		/*
-		 * if start is now at or beyond end, we're done, full
-		 * coverage
-		 */
-		if (start >= end)
-			return 1;
-	}
-	return 0;
-}
 
 /*
  * Find a free area with specified alignment in a specific range.
@@ -434,24 +376,6 @@ e820_register_active_regions(int nid, unsigned long start_pfn,
 			add_active_range(nid, ei_startpfn, ei_endpfn);
 }
 
-/*
- * Add a memory region to the kernel e820 map.
- */
-void __init add_memory_region(unsigned long start, unsigned long size, int type)
-{
-	int x = e820.nr_map;
-
-	if (x == E820MAX) {
-		printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
-		return;
-	}
-
-	e820.map[x].addr = start;
-	e820.map[x].size = size;
-	e820.map[x].type = type;
-	e820.nr_map++;
-}
-
 /*
  * Find the hole size (in bytes) in the memory range.
  * @start: starting address of the memory range to scan
@@ -473,266 +397,6 @@ unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
 	return end - start - (ram << PAGE_SHIFT);
 }
 
-static void __init e820_print_map(char *who)
-{
-	int i;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
-		       (unsigned long long) e820.map[i].addr,
-		       (unsigned long long)
-		       (e820.map[i].addr + e820.map[i].size));
-		switch (e820.map[i].type) {
-		case E820_RAM:
-			printk(KERN_CONT "(usable)\n");
-			break;
-		case E820_RESERVED:
-			printk(KERN_CONT "(reserved)\n");
-			break;
-		case E820_ACPI:
-			printk(KERN_CONT "(ACPI data)\n");
-			break;
-		case E820_NVS:
-			printk(KERN_CONT "(ACPI NVS)\n");
-			break;
-		default:
-			printk(KERN_CONT "type %u\n", e820.map[i].type);
-			break;
-		}
-	}
-}
-
-/*
- * Sanitize the BIOS e820 map.
- *
- * Some e820 responses include overlapping entries. The following
- * replaces the original e820 map with a new one, removing overlaps.
- *
- */
-static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
-{
-	struct change_member {
-		struct e820entry *pbios; /* pointer to original bios entry */
-		unsigned long long addr; /* address for this change point */
-	};
-	static struct change_member change_point_list[2*E820MAX] __initdata;
-	static struct change_member *change_point[2*E820MAX] __initdata;
-	static struct e820entry *overlap_list[E820MAX] __initdata;
-	static struct e820entry new_bios[E820MAX] __initdata;
-	struct change_member *change_tmp;
-	unsigned long current_type, last_type;
-	unsigned long long last_addr;
-	int chgidx, still_changing;
-	int overlap_entries;
-	int new_bios_entry;
-	int old_nr, new_nr, chg_nr;
-	int i;
-
-	/*
-		Visually we're performing the following
-		(1,2,3,4 = memory types)...
-
-		Sample memory map (w/overlaps):
-		   ____22__________________
-		   ______________________4_
-		   ____1111________________
-		   _44_____________________
-		   11111111________________
-		   ____________________33__
-		   ___________44___________
-		   __________33333_________
-		   ______________22________
-		   ___________________2222_
-		   _________111111111______
-		   _____________________11_
-		   _________________4______
-
-		Sanitized equivalent (no overlap):
-		   1_______________________
-		   _44_____________________
-		   ___1____________________
-		   ____22__________________
-		   ______11________________
-		   _________1______________
-		   __________3_____________
-		   ___________44___________
-		   _____________33_________
-		   _______________2________
-		   ________________1_______
-		   _________________4______
-		   ___________________2____
-		   ____________________33__
-		   ______________________4_
-	*/
-
-	/* if there's only one memory region, don't bother */
-	if (*pnr_map < 2)
-		return -1;
-
-	old_nr = *pnr_map;
-
-	/* bail out if we find any unreasonable addresses in bios map */
-	for (i = 0; i < old_nr; i++)
-		if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
-			return -1;
-
-	/* create pointers for initial change-point information (for sorting) */
-	for (i = 0; i < 2 * old_nr; i++)
-		change_point[i] = &change_point_list[i];
-
-	/* record all known change-points (starting and ending addresses),
-	   omitting those that are for empty memory regions */
-	chgidx = 0;
-	for (i = 0; i < old_nr; i++)	{
-		if (biosmap[i].size != 0) {
-			change_point[chgidx]->addr = biosmap[i].addr;
-			change_point[chgidx++]->pbios = &biosmap[i];
-			change_point[chgidx]->addr = biosmap[i].addr +
-				biosmap[i].size;
-			change_point[chgidx++]->pbios = &biosmap[i];
-		}
-	}
-	chg_nr = chgidx;
-
-	/* sort change-point list by memory addresses (low -> high) */
-	still_changing = 1;
-	while (still_changing)	{
-		still_changing = 0;
-		for (i = 1; i < chg_nr; i++)  {
-			unsigned long long curaddr, lastaddr;
-			unsigned long long curpbaddr, lastpbaddr;
-
-			curaddr = change_point[i]->addr;
-			lastaddr = change_point[i - 1]->addr;
-			curpbaddr = change_point[i]->pbios->addr;
-			lastpbaddr = change_point[i - 1]->pbios->addr;
-
-			/*
-			 * swap entries, when:
-			 *
-			 * curaddr > lastaddr or
-			 * curaddr == lastaddr and curaddr == curpbaddr and
-			 * lastaddr != lastpbaddr
-			 */
-			if (curaddr < lastaddr ||
-			    (curaddr == lastaddr && curaddr == curpbaddr &&
-			     lastaddr != lastpbaddr)) {
-				change_tmp = change_point[i];
-				change_point[i] = change_point[i-1];
-				change_point[i-1] = change_tmp;
-				still_changing = 1;
-			}
-		}
-	}
-
-	/* create a new bios memory map, removing overlaps */
-	overlap_entries = 0;	 /* number of entries in the overlap table */
-	new_bios_entry = 0;	 /* index for creating new bios map entries */
-	last_type = 0;		 /* start with undefined memory type */
-	last_addr = 0;		 /* start with 0 as last starting address */
-
-	/* loop through change-points, determining affect on the new bios map */
-	for (chgidx = 0; chgidx < chg_nr; chgidx++) {
-		/* keep track of all overlapping bios entries */
-		if (change_point[chgidx]->addr ==
-		    change_point[chgidx]->pbios->addr) {
-			/*
-			 * add map entry to overlap list (> 1 entry
-			 * implies an overlap)
-			 */
-			overlap_list[overlap_entries++] =
-				change_point[chgidx]->pbios;
-		} else {
-			/*
-			 * remove entry from list (order independent,
-			 * so swap with last)
-			 */
-			for (i = 0; i < overlap_entries; i++) {
-				if (overlap_list[i] ==
-				    change_point[chgidx]->pbios)
-					overlap_list[i] =
-						overlap_list[overlap_entries-1];
-			}
-			overlap_entries--;
-		}
-		/*
-		 * if there are overlapping entries, decide which
-		 * "type" to use (larger value takes precedence --
-		 * 1=usable, 2,3,4,4+=unusable)
-		 */
-		current_type = 0;
-		for (i = 0; i < overlap_entries; i++)
-			if (overlap_list[i]->type > current_type)
-				current_type = overlap_list[i]->type;
-		/*
-		 * continue building up new bios map based on this
-		 * information
-		 */
-		if (current_type != last_type)	{
-			if (last_type != 0)	 {
-				new_bios[new_bios_entry].size =
-					change_point[chgidx]->addr - last_addr;
-				/*
-				 * move forward only if the new size
-				 * was non-zero
-				 */
-				if (new_bios[new_bios_entry].size != 0)
-					/*
-					 * no more space left for new
-					 * bios entries ?
-					 */
-					if (++new_bios_entry >= E820MAX)
-						break;
-			}
-			if (current_type != 0)	{
-				new_bios[new_bios_entry].addr =
-					change_point[chgidx]->addr;
-				new_bios[new_bios_entry].type = current_type;
-				last_addr = change_point[chgidx]->addr;
-			}
-			last_type = current_type;
-		}
-	}
-	/* retain count for new bios entries */
-	new_nr = new_bios_entry;
-
-	/* copy new bios mapping into original location */
-	memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
-	*pnr_map = new_nr;
-
-	return 0;
-}
-
-/*
- * Copy the BIOS e820 map into a safe place.
- *
- * Sanity-check it while we're at it..
- *
- * If we're lucky and live on a modern system, the setup code
- * will have given us a memory map that we can use to properly
- * set up memory.  If we aren't, we'll fake a memory map.
- */
-static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
-{
-	/* Only one memory region (or negative)? Ignore it */
-	if (nr_map < 2)
-		return -1;
-
-	do {
-		u64 start = biosmap->addr;
-		u64 size = biosmap->size;
-		u64 end = start + size;
-		u32 type = biosmap->type;
-
-		/* Overflow in 64 bits? Ignore the memory map. */
-		if (start > end)
-			return -1;
-
-		add_memory_region(start, size, type);
-	} while (biosmap++, --nr_map);
-	return 0;
-}
-
 static void early_panic(char *msg)
 {
 	early_printk(msg);
@@ -829,114 +493,6 @@ void __init finish_e820_parsing(void)
 	}
 }
 
-u64 __init update_memory_range(u64 start, u64 size, unsigned old_type,
-				unsigned new_type)
-{
-	int i;
-	u64 real_updated_size = 0;
-
-	BUG_ON(old_type == new_type);
-
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		u64 final_start, final_end;
-		if (ei->type != old_type)
-			continue;
-		/* totally covered? */
-		if (ei->addr >= start &&
-		    (ei->addr + ei->size) <= (start + size)) {
-			ei->type = new_type;
-			real_updated_size += ei->size;
-			continue;
-		}
-		/* partially covered */
-		final_start = max(start, ei->addr);
-		final_end = min(start + size, ei->addr + ei->size);
-		if (final_start >= final_end)
-			continue;
-		add_memory_region(final_start, final_end - final_start,
-					 new_type);
-		real_updated_size += final_end - final_start;
-	}
-	return real_updated_size;
-}
-
-void __init update_e820(void)
-{
-	u8 nr_map;
-
-	nr_map = e820.nr_map;
-	if (sanitize_e820_map(e820.map, &nr_map))
-		return;
-	e820.nr_map = nr_map;
-	printk(KERN_INFO "modified physical RAM map:\n");
-	e820_print_map("modified");
-}
-
-unsigned long pci_mem_start = 0xaeedbabe;
-EXPORT_SYMBOL(pci_mem_start);
-
-/*
- * Search for the biggest gap in the low 32 bits of the e820
- * memory space.  We pass this space to PCI to assign MMIO resources
- * for hotplug or unconfigured devices in.
- * Hopefully the BIOS let enough space left.
- */
-__init void e820_setup_gap(void)
-{
-	unsigned long gapstart, gapsize, round;
-	unsigned long last;
-	int i;
-	int found = 0;
-
-	last = 0x100000000ull;
-	gapstart = 0x10000000;
-	gapsize = 0x400000;
-	i = e820.nr_map;
-	while (--i >= 0) {
-		unsigned long long start = e820.map[i].addr;
-		unsigned long long end = start + e820.map[i].size;
-
-		/*
-		 * Since "last" is at most 4GB, we know we'll
-		 * fit in 32 bits if this condition is true
-		 */
-		if (last > end) {
-			unsigned long gap = last - end;
-
-			if (gap > gapsize) {
-				gapsize = gap;
-				gapstart = end;
-				found = 1;
-			}
-		}
-		if (start < last)
-			last = start;
-	}
-
-	if (!found) {
-		gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
-		printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
-		       "address range\n"
-		       KERN_ERR "PCI: Unassigned devices with 32bit resource "
-		       "registers may break!\n");
-	}
-
-	/*
-	 * See how much we want to round up: start off with
-	 * rounding to the next 1MB area.
-	 */
-	round = 0x100000;
-	while ((gapsize >> 4) > round)
-		round += round;
-	/* Fun with two's complement */
-	pci_mem_start = (gapstart + round) & -round;
-
-	printk(KERN_INFO
-	       "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
-	       pci_mem_start, gapstart, gapsize);
-}
-
 int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
 {
 	int i;
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index b54c79c91efd..5faeab69edd9 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -875,7 +875,7 @@ void __init setup_arch(char **cmdline_p)
 		get_smp_config();
 #endif
 
-	e820_register_memory();
+	e820_setup_gap();
 	e820_mark_nosave_regions();
 
 #ifdef CONFIG_VT
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index 7004251fc66b..b5b519feba1d 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -20,6 +20,20 @@ struct e820map {
 	__u32 nr_map;
 	struct e820entry map[E820MAX];
 };
+
+extern struct e820map e820;
+
+extern int e820_any_mapped(u64 start, u64 end, unsigned type);
+extern int e820_all_mapped(u64 start, u64 end, unsigned type);
+extern void add_memory_region(u64 start, u64 size, int type);
+extern void e820_print_map(char *who);
+extern int sanitize_e820_map(struct e820entry *biosmap, char *pnr_map);
+extern int copy_e820_map(struct e820entry *biosmap, int nr_map);
+extern u64 update_memory_range(u64 start, u64 size, unsigned old_type,
+			       unsigned new_type);
+extern void update_e820(void);
+extern void e820_setup_gap(void);
+
 #endif /* __ASSEMBLY__ */
 
 #define ISA_START_ADDRESS	0xa0000
diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h
index af0711b220df..9576b438fbd9 100644
--- a/include/asm-x86/e820_32.h
+++ b/include/asm-x86/e820_32.h
@@ -21,19 +21,8 @@
 extern void setup_memory_map(void);
 extern void finish_e820_parsing(void);
 
-extern struct e820map e820;
-extern void update_e820(void);
-
-extern int e820_all_mapped(unsigned long start, unsigned long end,
-			   unsigned type);
-extern int e820_any_mapped(u64 start, u64 end, unsigned type);
 extern void propagate_e820_map(void);
 extern void register_bootmem_low_pages(unsigned long max_low_pfn);
-extern void add_memory_region(unsigned long long start,
-			      unsigned long long size, int type);
-extern u64 update_memory_range(u64 start, u64 size, unsigned old_type,
-				unsigned new_type);
-extern void e820_register_memory(void);
 extern void limit_regions(unsigned long long size);
 extern void init_iomem_resources(struct resource *code_resource,
 				 struct resource *data_resource,
@@ -47,6 +36,5 @@ static inline void e820_mark_nosave_regions(void)
 }
 #endif
 
-
 #endif/*!__ASSEMBLY__*/
 #endif/*__E820_HEADER*/
diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h
index 6ae3e2803286..9fac77e01441 100644
--- a/include/asm-x86/e820_64.h
+++ b/include/asm-x86/e820_64.h
@@ -19,34 +19,22 @@ extern unsigned long find_e820_area(unsigned long start, unsigned long end,
 extern unsigned long find_e820_area_size(unsigned long start,
 					 unsigned long *sizep,
 					 unsigned long align);
-extern void add_memory_region(unsigned long start, unsigned long size,
-			      int type);
-extern u64 update_memory_range(u64 start, u64 size, unsigned old_type,
-				unsigned new_type);
 extern void setup_memory_region(void);
 extern void contig_e820_setup(void);
 extern unsigned long e820_end_of_ram(void);
 extern void e820_reserve_resources(void);
 extern void e820_mark_nosave_regions(void);
-extern int e820_any_mapped(unsigned long start, unsigned long end,
-			   unsigned type);
-extern int e820_all_mapped(unsigned long start, unsigned long end,
-			   unsigned type);
 extern int e820_any_non_reserved(unsigned long start, unsigned long end);
 extern int is_memory_any_valid(unsigned long start, unsigned long end);
 extern int e820_all_non_reserved(unsigned long start, unsigned long end);
 extern int is_memory_all_valid(unsigned long start, unsigned long end);
 extern unsigned long e820_hole_size(unsigned long start, unsigned long end);
 
-extern void e820_setup_gap(void);
 extern void e820_register_active_regions(int nid, unsigned long start_pfn,
 					 unsigned long end_pfn);
 
 extern void finish_e820_parsing(void);
 
-extern struct e820map e820;
-extern void update_e820(void);
-
 extern void reserve_early(unsigned long start, unsigned long end, char *name);
 extern void free_early(unsigned long start, unsigned long end);
 extern void early_res_to_bootmem(unsigned long start, unsigned long end);
-- 
cgit v1.2.3


From 8004dd965b13b01a96def054d420f6df7ff22d53 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Mon, 12 May 2008 17:40:39 -0700
Subject: x86: amd opteron TOM2 mask val fix

there is a typo in the mask value, need to remove that extra 0,
to avoid 4bit clearing.

Signed-off-by: Yinghal Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mtrr/generic.c | 2 +-
 arch/x86/pci/k8-bus_64.c           | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index a83f5cd78885..509bd3d9eacd 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -233,7 +233,7 @@ void __init get_mtrr_state(void)
 		mtrr_tom2 = high;
 		mtrr_tom2 <<= 32;
 		mtrr_tom2 |= low;
-		mtrr_tom2 &= 0xffffff8000000ULL;
+		mtrr_tom2 &= 0xffffff800000ULL;
 	}
 	if (mtrr_show) {
 		int high_width;
diff --git a/arch/x86/pci/k8-bus_64.c b/arch/x86/pci/k8-bus_64.c
index 5c2799c20e47..bfefdf0f40d4 100644
--- a/arch/x86/pci/k8-bus_64.c
+++ b/arch/x86/pci/k8-bus_64.c
@@ -384,7 +384,7 @@ static int __init early_fill_mp_bus_info(void)
 	/* need to take out [0, TOM) for RAM*/
 	address = MSR_K8_TOP_MEM1;
 	rdmsrl(address, val);
-	end = (val & 0xffffff8000000ULL);
+	end = (val & 0xffffff800000ULL);
 	printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20);
 	if (end < (1ULL<<32))
 		update_range(range, 0, end - 1);
@@ -478,7 +478,7 @@ static int __init early_fill_mp_bus_info(void)
 		/* TOP_MEM2 */
 		address = MSR_K8_TOP_MEM2;
 		rdmsrl(address, val);
-		end = (val & 0xffffff8000000ULL);
+		end = (val & 0xffffff800000ULL);
 		printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20);
 		update_range(range, 1ULL<<32, end - 1);
 	}
-- 
cgit v1.2.3


From 3f03c54a34fa3da680b3341dd3ba965ef3394bd1 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Fri, 9 May 2008 22:40:52 -0700
Subject: x86: mtrr cleanup for converting continuous to discrete layout - fix
 #2

disable the noisy print out.
also use the one the less spare mtrr reg.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mtrr/main.c | 79 +++++++++++++++++++++++++----------------
 1 file changed, 48 insertions(+), 31 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index e6c162c379ac..0642201784e0 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -730,6 +730,7 @@ struct var_mtrr_range_state {
 };
 
 struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
+static int __initdata debug_print;
 
 static int __init
 x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
@@ -748,10 +749,12 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
 		nr_range = add_range_with_merge(range, nr_range, base,
 						base + size - 1);
 	}
-	printk(KERN_DEBUG "After WB checking\n");
-	for (i = 0; i < nr_range; i++)
-		printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+	if (debug_print) {
+		printk(KERN_DEBUG "After WB checking\n");
+		for (i = 0; i < nr_range; i++)
+			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
 				 range[i].start, range[i].end + 1);
+	}
 
 	/* take out UC ranges */
 	for (i = 0; i < num_var_ranges; i++) {
@@ -775,17 +778,21 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
 			continue;
 		nr_range++;
 	}
-	printk(KERN_DEBUG "After UC checking\n");
-	for (i = 0; i < nr_range; i++)
-		printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
-			 range[i].start, range[i].end + 1);
+	if  (debug_print) {
+		printk(KERN_DEBUG "After UC checking\n");
+		for (i = 0; i < nr_range; i++)
+			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+				 range[i].start, range[i].end + 1);
+	}
 
 	/* sort the ranges */
 	sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
-	printk(KERN_DEBUG "After sorting\n");
-	for (i = 0; i < nr_range; i++)
-		printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+	if  (debug_print) {
+		printk(KERN_DEBUG "After sorting\n");
+		for (i = 0; i < nr_range; i++)
+			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
 				 range[i].start, range[i].end + 1);
+	}
 
 	/* clear those is not used */
 	for (i = nr_range; i < RANGE_NUM; i++)
@@ -912,12 +919,13 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
 			align = max_align;
 
 		sizek = 1 << align;
-		printk(KERN_DEBUG "Setting variable MTRR %d, base: %ldMB, "
-		       "range: %ldMB, type %s\n",
-			reg, range_startk >> 10, sizek >> 10,
-			(type == MTRR_TYPE_UNCACHABLE)?"UC":
-			    ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
-			);
+		if (debug_print)
+			printk(KERN_DEBUG "Setting variable MTRR %d, "
+				"base: %ldMB, range: %ldMB, type %s\n",
+				reg, range_startk >> 10, sizek >> 10,
+				(type == MTRR_TYPE_UNCACHABLE)?"UC":
+				    ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
+				);
 		save_var_mtrr(reg++, range_startk, sizek, type);
 		range_startk += sizek;
 		range_sizek -= sizek;
@@ -963,7 +971,9 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
 	range0_basek = state->range_startk;
 	range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
 	if (range0_sizek == state->range_sizek) {
-		printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", range0_basek<<10,
+		if (debug_print)
+			printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
+				range0_basek<<10,
 				(range0_basek + state->range_sizek)<<10);
 		state->reg = range_to_mtrr(state->reg, range0_basek,
 				state->range_sizek, MTRR_TYPE_WRBACK);
@@ -980,7 +990,9 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
 	}
 
 	if (range0_sizek) {
-		printk(KERN_DEBUG "range0: %016lx - %016lx\n", range0_basek<<10,
+		if (debug_print)
+			printk(KERN_DEBUG "range0: %016lx - %016lx\n",
+				range0_basek<<10,
 				(range0_basek + range0_sizek)<<10);
 		state->reg = range_to_mtrr(state->reg, range0_basek,
 				range0_sizek, MTRR_TYPE_WRBACK);
@@ -1016,13 +1028,15 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
 		second_sizek = 0;
 	}
 
-	printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10,
+	if (debug_print)
+		printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10,
 			 (range_basek + range_sizek)<<10);
 	state->reg = range_to_mtrr(state->reg, range_basek, range_sizek,
 					 MTRR_TYPE_WRBACK);
 	if (hole_sizek) {
-		printk(KERN_DEBUG "hole: %016lx - %016lx\n", hole_basek<<10,
-				 (hole_basek + hole_sizek)<<10);
+		if (debug_print)
+			printk(KERN_DEBUG "hole: %016lx - %016lx\n",
+				 hole_basek<<10, (hole_basek + hole_sizek)<<10);
 		state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek,
 						 MTRR_TYPE_UNCACHABLE);
 
@@ -1120,7 +1134,6 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range,
 	/* Write the last range */
 	if (var_state.range_sizek != 0)
 		range_to_mtrr_with_hole(&var_state, 0, 0);
-	printk(KERN_DEBUG "DONE variable MTRRs\n");
 
 	num_reg = var_state.reg;
 	/* Clear out the extra MTRR's */
@@ -1219,6 +1232,7 @@ static int __init mtrr_cleanup(unsigned address_bits)
 	if (mtrr_chunk_size && mtrr_gran_size) {
 		int num_reg;
 
+		debug_print = 1;
 		/* convert ranges to var ranges state */
 		num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
 					      mtrr_gran_size);
@@ -1242,8 +1256,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
 			result[i].lose_cover_sizek =
 				(range_sums - range_sums_new) << PSHIFT;
 
-		printk(KERN_INFO " %sgran_size: %ldM  \tchunk_size: %ldM  \t",
-			 result[i].bad?" BAD ":"", result[i].gran_sizek >> 10,
+		printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
+			 result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10,
 			 result[i].chunk_sizek >> 10);
 		printk(KERN_CONT "num_reg: %d  \tlose cover RAM: %s%ldM \n",
 			 result[i].num_reg, result[i].bad?"-":"",
@@ -1254,6 +1268,7 @@ static int __init mtrr_cleanup(unsigned address_bits)
 		}
 		printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
 		       "will find optimal one\n");
+		debug_print = 0;
 		memset(result, 0, sizeof(result[0]));
 	}
 
@@ -1265,9 +1280,10 @@ static int __init mtrr_cleanup(unsigned address_bits)
 		     chunk_size <<= 1) {
 			int num_reg;
 
-			printk(KERN_INFO
+			if (debug_print)
+				printk(KERN_INFO
 			       "\ngran_size: %lldM   chunk_size_size: %lldM\n",
-			       gran_size >> 20, chunk_size >> 20);
+				       gran_size >> 20, chunk_size >> 20);
 			if (i >= NUM_RESULT)
 				continue;
 
@@ -1310,10 +1326,10 @@ static int __init mtrr_cleanup(unsigned address_bits)
 
 	/* print out all */
 	for (i = 0; i < NUM_RESULT; i++) {
-		printk(KERN_INFO "%sgran_size: %ldM  \tchunk_size: %ldM  \t",
+		printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
 		       result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10,
 		       result[i].chunk_sizek >> 10);
-		printk(KERN_CONT "num_reg: %d  \tlose cover RAM: %s%ldM \n",
+		printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n",
 		       result[i].num_reg, result[i].bad?"-":"",
 		       result[i].lose_cover_sizek >> 10);
 	}
@@ -1322,7 +1338,7 @@ static int __init mtrr_cleanup(unsigned address_bits)
 	if (nr_mtrr_spare_reg >= num_var_ranges)
 		nr_mtrr_spare_reg = num_var_ranges - 1;
 	num_reg_good = -1;
-	for (i = 1; i < num_var_ranges + 1 - nr_mtrr_spare_reg; i++) {
+	for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
 		if (!min_loss_pfn[i]) {
 			num_reg_good = i;
 			break;
@@ -1344,10 +1360,10 @@ static int __init mtrr_cleanup(unsigned address_bits)
 	if (index_good != -1) {
 		printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
 		i = index_good;
-		printk(KERN_INFO "gran_size: %ldM  \tchunk_size: %ldM  \t",
+		printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t",
 				result[i].gran_sizek >> 10,
 				result[i].chunk_sizek >> 10);
-		printk(KERN_CONT "num_reg: %d  \tlose cover RAM: %ldM \n",
+		printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n",
 				result[i].num_reg,
 				result[i].lose_cover_sizek >> 10);
 		/* convert ranges to var ranges state */
@@ -1355,6 +1371,7 @@ static int __init mtrr_cleanup(unsigned address_bits)
 		chunk_size <<= 10;
 		gran_size = result[i].gran_sizek;
 		gran_size <<= 10;
+		debug_print = 1;
 		x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
 		set_var_mtrr_all(address_bits);
 		return 1;
-- 
cgit v1.2.3


From b25e31cec7788ccba5749d10f8f4343fffff4750 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Wed, 14 May 2008 08:15:28 -0700
Subject: x86 boot: minor code format fixes in e820 and efi routines

Standardize a few pointer declarations to not have the
extra space after the '*' character.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820_64.c | 2 +-
 arch/x86/kernel/efi_64.c  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index 28a14eb65d3d..c10b4aece5e4 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -404,7 +404,7 @@ static void early_panic(char *msg)
 }
 
 /* We're not void only for x86 32-bit compat */
-char * __init machine_specific_memory_setup(void)
+char *__init machine_specific_memory_setup(void)
 {
 	char *who = "BIOS-e820";
 	/*
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index d0060fdcccac..21a7b759687a 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -103,7 +103,7 @@ void __init efi_reserve_bootmem(void)
 				memmap.nr_map * memmap.desc_size);
 }
 
-void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size)
+void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size)
 {
 	static unsigned pages_mapped __initdata;
 	unsigned i, pages;
-- 
cgit v1.2.3


From c3965bd15118742d72b4bc1a290d37b3f081eb98 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Wed, 14 May 2008 08:15:34 -0700
Subject: x86 boot: proper use of ARRAY_SIZE instead of repeated E820MAX
 constant

This patch is motivated by a subsequent patch which will allow for more
memory map entries on EFI supported systems than can be passed via the x86
legacy BIOS E820 interface.  The legacy interface is limited to E820MAX ==
128 memory entries, and that "E820MAX" manifest constant was used as the
size for several arrays and loops over those arrays.

The primary change in this patch is to change code loop sizes over those
arrays from using the constant E820MAX, to using the ARRAY_SIZE() macro
evaluated for the array being looped.  That way, a subsequent patch can
change the size of some of these arrays, without breaking this code.

This patch also adds a parameter to the sanitize_e820_map() routine,
which had an implicit size for the array passed it of E820MAX entries.
This new parameter explicitly passes the size of said array.  Once again,
this will allow a subsequent patch to change that array size for some
calls to sanitize_e820_map() without breaking the code.

As part of enhancing the sanitize_e820_map() interface this way, I further
combined the unnecessarily distinct x86_32 and x86_64 declarations for
this routine into a single, commonly used, declaration.

This patch in itself should make no difference to the resulting kernel
binary.

[ mingo@elte.hu: merged to -tip ]

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/boot/memory.c        | 3 ++-
 arch/x86/kernel/e820.c        | 9 +++++----
 arch/x86/kernel/e820_64.c     | 6 ++++--
 arch/x86/mach-default/setup.c | 4 +++-
 arch/x86/mach-voyager/setup.c | 4 +++-
 include/asm-x86/e820.h        | 3 ++-
 include/asm-x86/setup.h       | 1 -
 7 files changed, 19 insertions(+), 11 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index acad32eb4290..53165c97336b 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -13,6 +13,7 @@
  */
 
 #include "boot.h"
+#include <linux/kernel.h>
 
 #define SMAP	0x534d4150	/* ASCII "SMAP" */
 
@@ -53,7 +54,7 @@ static int detect_memory_e820(void)
 
 		count++;
 		desc++;
-	} while (next && count < E820MAX);
+	} while (next && count < ARRAY_SIZE(boot_params.e820_map));
 
 	return boot_params.e820_entries = count;
 }
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 2cb686f60d0d..2396b9da8027 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -95,7 +95,7 @@ void __init add_memory_region(u64 start, u64 size, int type)
 {
 	int x = e820.nr_map;
 
-	if (x == E820MAX) {
+	if (x == ARRAY_SIZE(e820.map)) {
 		printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
 		return;
 	}
@@ -142,7 +142,8 @@ void __init e820_print_map(char *who)
  * replaces the original e820 map with a new one, removing overlaps.
  *
  */
-int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
+int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
+				char *pnr_map)
 {
 	struct change_member {
 		struct e820entry *pbios; /* pointer to original bios entry */
@@ -314,7 +315,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
 					 * no more space left for new
 					 * bios entries ?
 					 */
-					if (++new_bios_entry >= E820MAX)
+					if (++new_bios_entry >= max_nr_map)
 						break;
 			}
 			if (current_type != 0)	{
@@ -403,7 +404,7 @@ void __init update_e820(void)
 	u8 nr_map;
 
 	nr_map = e820.nr_map;
-	if (sanitize_e820_map(e820.map, &nr_map))
+	if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
 		return;
 	e820.nr_map = nr_map;
 	printk(KERN_INFO "modified physical RAM map:\n");
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index c10b4aece5e4..58dc6eee4d4f 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -413,7 +413,9 @@ char *__init machine_specific_memory_setup(void)
 	 * Otherwise fake a memory map; one section from 0k->640k,
 	 * the next section from 1mb->appropriate_mem_k
 	 */
-	sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
+	sanitize_e820_map(boot_params.e820_map,
+			ARRAY_SIZE(boot_params.e820_map),
+			&boot_params.e820_entries);
 	if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
 		early_panic("Cannot find a valid memory map");
 	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
@@ -484,7 +486,7 @@ void __init finish_e820_parsing(void)
 	if (userdef) {
 		char nr = e820.nr_map;
 
-		if (sanitize_e820_map(e820.map, &nr) < 0)
+		if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
 			early_panic("Invalid user supplied memory map");
 		e820.nr_map = nr;
 
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
index 0c28a071824c..38a856c4fc07 100644
--- a/arch/x86/mach-default/setup.c
+++ b/arch/x86/mach-default/setup.c
@@ -163,7 +163,9 @@ char * __init machine_specific_memory_setup(void)
 	 * Otherwise fake a memory map; one section from 0k->640k,
 	 * the next section from 1mb->appropriate_mem_k
 	 */
-	sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
+	sanitize_e820_map(boot_params.e820_map,
+			ARRAY_SIZE(boot_params.e820_map),
+			&boot_params.e820_entries);
 	if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries)
 	    < 0) {
 		unsigned long mem_size;
diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c
index 5ae5466b9eb9..662b5c0a77d6 100644
--- a/arch/x86/mach-voyager/setup.c
+++ b/arch/x86/mach-voyager/setup.c
@@ -111,7 +111,9 @@ char *__init machine_specific_memory_setup(void)
 	 * Otherwise fake a memory map; one section from 0k->640k,
 	 * the next section from 1mb->appropriate_mem_k
 	 */
-	sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
+	sanitize_e820_map(boot_params.e820_map,
+			ARRAY_SIZE(boot_params.e820_map),
+			&boot_params.e820_entries);
 	if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries)
 	    < 0) {
 		unsigned long mem_size;
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index b5b519feba1d..65c891d2a4cb 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -27,7 +27,8 @@ extern int e820_any_mapped(u64 start, u64 end, unsigned type);
 extern int e820_all_mapped(u64 start, u64 end, unsigned type);
 extern void add_memory_region(u64 start, u64 size, int type);
 extern void e820_print_map(char *who);
-extern int sanitize_e820_map(struct e820entry *biosmap, char *pnr_map);
+extern int
+sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, char *pnr_map);
 extern int copy_e820_map(struct e820entry *biosmap, int nr_map);
 extern u64 update_memory_range(u64 start, u64 size, unsigned old_type,
 			       unsigned new_type);
diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h
index fa6763af8d26..ffa0f540fc7f 100644
--- a/include/asm-x86/setup.h
+++ b/include/asm-x86/setup.h
@@ -56,7 +56,6 @@ char * __init machine_specific_memory_setup(void);
 char *memory_setup(void);
 
 int __init copy_e820_map(struct e820entry *biosmap, int nr_map);
-int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map);
 void __init add_memory_region(unsigned long long start,
 			      unsigned long long size, int type);
 
-- 
cgit v1.2.3


From 028b785888c523baccdf27af0cdbf1deb92edec0 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Wed, 14 May 2008 08:15:40 -0700
Subject: x86 boot: extend some internal memory map arrays to handle larger EFI
 input

Extend internal boot time memory tables to allow for up to
three entries per node, which may be larger than the 128 E820MAX
entries handled by the legacy BIOS E820 interface.  The EFI
interface, if present, is capable of passing memory map
entries for these larger node counts.

This patch requires an earlier patch that rewrote code depending
on these array sizes from using E820MAX explicitly to size loops,
to instead using ARRAY_SIZE() of the applicable array.

Another patch following this one will provide the code to pick
up additional memory entries passed via the EFI interface from
the BIOS and insert them in the following, now enlarged, arrays.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c |  8 ++++----
 include/asm-x86/e820.h | 37 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 40 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 2396b9da8027..3f7777b255aa 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -149,10 +149,10 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
 		struct e820entry *pbios; /* pointer to original bios entry */
 		unsigned long long addr; /* address for this change point */
 	};
-	static struct change_member change_point_list[2*E820MAX] __initdata;
-	static struct change_member *change_point[2*E820MAX] __initdata;
-	static struct e820entry *overlap_list[E820MAX] __initdata;
-	static struct e820entry new_bios[E820MAX] __initdata;
+static struct change_member change_point_list[2*E820_X_MAX] __initdata;
+static struct change_member *change_point[2*E820_X_MAX] __initdata;
+static struct e820entry *overlap_list[E820_X_MAX] __initdata;
+static struct e820entry new_bios[E820_X_MAX] __initdata;
 	struct change_member *change_tmp;
 	unsigned long current_type, last_type;
 	unsigned long long last_addr;
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index 65c891d2a4cb..ab18457a95c0 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -2,6 +2,41 @@
 #define __ASM_E820_H
 #define E820MAP	0x2d0		/* our map */
 #define E820MAX	128		/* number of entries in E820MAP */
+
+/*
+ * Legacy E820 BIOS limits us to 128 (E820MAX) nodes due to the
+ * constrained space in the zeropage.  If we have more nodes than
+ * that, and if we've booted off EFI firmware, then the EFI tables
+ * passed us from the EFI firmware can list more nodes.  Size our
+ * internal memory map tables to have room for these additional
+ * nodes, based on up to three entries per node for which the
+ * kernel was built: MAX_NUMNODES == (1 << CONFIG_NODES_SHIFT),
+ * plus E820MAX, allowing space for the possible duplicate E820
+ * entries that might need room in the same arrays, prior to the
+ * call to sanitize_e820_map() to remove duplicates.  The allowance
+ * of three memory map entries per node is "enough" entries for
+ * the initial hardware platform motivating this mechanism to make
+ * use of additional EFI map entries.  Future platforms may want
+ * to allow more than three entries per node or otherwise refine
+ * this size.
+ */
+
+/*
+ * Odd: 'make headers_check' complains about numa.h if I try
+ * to collapse the next two #ifdef lines to a single line:
+ *	#if defined(__KERNEL__) && defined(CONFIG_EFI)
+ */
+#ifdef __KERNEL__
+#ifdef CONFIG_EFI
+#include <linux/numa.h>
+#define E820_X_MAX (E820MAX + 3 * MAX_NUMNODES)
+#else	/* ! CONFIG_EFI */
+#define E820_X_MAX E820MAX
+#endif
+#else	/* ! __KERNEL__ */
+#define E820_X_MAX E820MAX
+#endif
+
 #define E820NR	0x1e8		/* # entries in E820MAP */
 
 #define E820_RAM	1
@@ -18,7 +53,7 @@ struct e820entry {
 
 struct e820map {
 	__u32 nr_map;
-	struct e820entry map[E820MAX];
+	struct e820entry map[E820_X_MAX];
 };
 
 extern struct e820map e820;
-- 
cgit v1.2.3


From 6e9bcc796b120d17b08dde7ab958b82ddb899889 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Wed, 14 May 2008 08:15:46 -0700
Subject: x86 boot: change sanitize_e820_map parameter from byte to int to
 allow bigger memory maps

The map size counter passed into, and back out of, sanitize_e820_map(),
was an eight bit type (char or u8), as derived from its origins in
legacy BIOS E820 structures.  This patch changes that type to an 'int',
to allow this sanitize routine to also be used on larger maps (larger
than the 256 count that fits in a char).  The legacy BIOS E820 interface
of course does not change; that remains at 8 bits for this count, holding
up to E820MAX == 128 entries.  But the kernel internals can handle more
when those additional memory map entries are passed from the BIOS via
EFI interfaces.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c        | 5 +++--
 arch/x86/kernel/e820_64.c     | 7 +++++--
 arch/x86/mach-default/setup.c | 5 ++++-
 arch/x86/mach-voyager/setup.c | 5 ++++-
 include/asm-x86/e820.h        | 2 +-
 5 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 3f7777b255aa..91abf5b2fb94 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -143,7 +143,7 @@ void __init e820_print_map(char *who)
  *
  */
 int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
-				char *pnr_map)
+				int *pnr_map)
 {
 	struct change_member {
 		struct e820entry *pbios; /* pointer to original bios entry */
@@ -204,6 +204,7 @@ static struct e820entry new_bios[E820_X_MAX] __initdata;
 		return -1;
 
 	old_nr = *pnr_map;
+	BUG_ON(old_nr > max_nr_map);
 
 	/* bail out if we find any unreasonable addresses in bios map */
 	for (i = 0; i < old_nr; i++)
@@ -401,7 +402,7 @@ u64 __init update_memory_range(u64 start, u64 size, unsigned old_type,
 
 void __init update_e820(void)
 {
-	u8 nr_map;
+	int nr_map;
 
 	nr_map = e820.nr_map;
 	if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index 58dc6eee4d4f..354fbb221709 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -407,15 +407,18 @@ static void early_panic(char *msg)
 char *__init machine_specific_memory_setup(void)
 {
 	char *who = "BIOS-e820";
+	int new_nr;
 	/*
 	 * Try to copy the BIOS-supplied E820-map.
 	 *
 	 * Otherwise fake a memory map; one section from 0k->640k,
 	 * the next section from 1mb->appropriate_mem_k
 	 */
+	new_nr = boot_params.e820_entries;
 	sanitize_e820_map(boot_params.e820_map,
 			ARRAY_SIZE(boot_params.e820_map),
-			&boot_params.e820_entries);
+			&new_nr);
+	boot_params.e820_entries = new_nr;
 	if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
 		early_panic("Cannot find a valid memory map");
 	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
@@ -484,7 +487,7 @@ early_param("memmap", parse_memmap_opt);
 void __init finish_e820_parsing(void)
 {
 	if (userdef) {
-		char nr = e820.nr_map;
+		int nr = e820.nr_map;
 
 		if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
 			early_panic("Invalid user supplied memory map");
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
index 38a856c4fc07..56b4c39cb7fa 100644
--- a/arch/x86/mach-default/setup.c
+++ b/arch/x86/mach-default/setup.c
@@ -153,6 +153,7 @@ late_initcall(print_ipi_mode);
 char * __init machine_specific_memory_setup(void)
 {
 	char *who;
+	int new_nr;
 
 
 	who = "BIOS-e820";
@@ -163,9 +164,11 @@ char * __init machine_specific_memory_setup(void)
 	 * Otherwise fake a memory map; one section from 0k->640k,
 	 * the next section from 1mb->appropriate_mem_k
 	 */
+	new_nr = boot_params.e820_entries;
 	sanitize_e820_map(boot_params.e820_map,
 			ARRAY_SIZE(boot_params.e820_map),
-			&boot_params.e820_entries);
+			&new_nr);
+	boot_params.e820_entries = new_nr;
 	if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries)
 	    < 0) {
 		unsigned long mem_size;
diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c
index 662b5c0a77d6..f4aca9fa9546 100644
--- a/arch/x86/mach-voyager/setup.c
+++ b/arch/x86/mach-voyager/setup.c
@@ -62,6 +62,7 @@ void __init time_init_hook(void)
 char *__init machine_specific_memory_setup(void)
 {
 	char *who;
+	int new_nr;
 
 	who = "NOT VOYAGER";
 
@@ -111,9 +112,11 @@ char *__init machine_specific_memory_setup(void)
 	 * Otherwise fake a memory map; one section from 0k->640k,
 	 * the next section from 1mb->appropriate_mem_k
 	 */
+	new_nr = boot_params.e820_entries;
 	sanitize_e820_map(boot_params.e820_map,
 			ARRAY_SIZE(boot_params.e820_map),
-			&boot_params.e820_entries);
+			&new_nr);
+	boot_params.e820_entries = new_nr;
 	if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries)
 	    < 0) {
 		unsigned long mem_size;
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index ab18457a95c0..1eb13b881437 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -63,7 +63,7 @@ extern int e820_all_mapped(u64 start, u64 end, unsigned type);
 extern void add_memory_region(u64 start, u64 size, int type);
 extern void e820_print_map(char *who);
 extern int
-sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, char *pnr_map);
+sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, int *pnr_map);
 extern int copy_e820_map(struct e820entry *biosmap, int nr_map);
 extern u64 update_memory_range(u64 start, u64 size, unsigned old_type,
 			       unsigned new_type);
-- 
cgit v1.2.3


From 5b7eb2e9ef4e467a1248537b47a63bab265be3cc Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Wed, 14 May 2008 08:15:52 -0700
Subject: x86 boot: longer comment explaining sanitize_e820_map routine

Elaborate on the comment for sanitize_e820_map(), epxlaining more what
it does, what it inputs, and what it returns.  Rearrange the placement of
this comment to fit kernel conventions, before the routine's code rather
than buried inside it.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 94 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 56 insertions(+), 38 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 91abf5b2fb94..41c480ae47df 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -139,9 +139,64 @@ void __init e820_print_map(char *who)
  * Sanitize the BIOS e820 map.
  *
  * Some e820 responses include overlapping entries. The following
- * replaces the original e820 map with a new one, removing overlaps.
+ * replaces the original e820 map with a new one, removing overlaps,
+ * and resolving conflicting memory types in favor of highest
+ * numbered type.
  *
+ * The input parameter biosmap points to an array of 'struct
+ * e820entry' which on entry has elements in the range [0, *pnr_map)
+ * valid, and which has space for up to max_nr_map entries.
+ * On return, the resulting sanitized e820 map entries will be in
+ * overwritten in the same location, starting at biosmap.
+ *
+ * The integer pointed to by pnr_map must be valid on entry (the
+ * current number of valid entries located at biosmap) and will
+ * be updated on return, with the new number of valid entries
+ * (something no more than max_nr_map.)
+ *
+ * The return value from sanitize_e820_map() is zero if it
+ * successfully 'sanitized' the map entries passed in, and is -1
+ * if it did nothing, which can happen if either of (1) it was
+ * only passed one map entry, or (2) any of the input map entries
+ * were invalid (start + size < start, meaning that the size was
+ * so big the described memory range wrapped around through zero.)
+ *
+ *	Visually we're performing the following
+ *	(1,2,3,4 = memory types)...
+ *
+ *	Sample memory map (w/overlaps):
+ *	   ____22__________________
+ *	   ______________________4_
+ *	   ____1111________________
+ *	   _44_____________________
+ *	   11111111________________
+ *	   ____________________33__
+ *	   ___________44___________
+ *	   __________33333_________
+ *	   ______________22________
+ *	   ___________________2222_
+ *	   _________111111111______
+ *	   _____________________11_
+ *	   _________________4______
+ *
+ *	Sanitized equivalent (no overlap):
+ *	   1_______________________
+ *	   _44_____________________
+ *	   ___1____________________
+ *	   ____22__________________
+ *	   ______11________________
+ *	   _________1______________
+ *	   __________3_____________
+ *	   ___________44___________
+ *	   _____________33_________
+ *	   _______________2________
+ *	   ________________1_______
+ *	   _________________4______
+ *	   ___________________2____
+ *	   ____________________33__
+ *	   ______________________4_
  */
+
 int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
 				int *pnr_map)
 {
@@ -162,43 +217,6 @@ static struct e820entry new_bios[E820_X_MAX] __initdata;
 	int old_nr, new_nr, chg_nr;
 	int i;
 
-	/*
-		Visually we're performing the following
-		(1,2,3,4 = memory types)...
-
-		Sample memory map (w/overlaps):
-		   ____22__________________
-		   ______________________4_
-		   ____1111________________
-		   _44_____________________
-		   11111111________________
-		   ____________________33__
-		   ___________44___________
-		   __________33333_________
-		   ______________22________
-		   ___________________2222_
-		   _________111111111______
-		   _____________________11_
-		   _________________4______
-
-		Sanitized equivalent (no overlap):
-		   1_______________________
-		   _44_____________________
-		   ___1____________________
-		   ____22__________________
-		   ______11________________
-		   _________1______________
-		   __________3_____________
-		   ___________44___________
-		   _____________33_________
-		   _______________2________
-		   ________________1_______
-		   _________________4______
-		   ___________________2____
-		   ____________________33__
-		   ______________________4_
-	*/
-
 	/* if there's only one memory region, don't bother */
 	if (*pnr_map < 2)
 		return -1;
-- 
cgit v1.2.3


From 69c9189320c46b14e5ae3ad4b3a0d35cc63cba20 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Wed, 14 May 2008 08:15:58 -0700
Subject: x86 boot: add code to add BIOS provided EFI memory entries to kernel

Add to the kernels boot memory map 'memmap' entries found in
the EFI memory descriptors passed in from the BIOS.

On EFI systems, up to E820MAX == 128 memory map entries can
be passed via the legacy E820 interface (limited by the size
of the 'zeropage').  These entries can be duplicated in the
EFI descriptors also passed from the BIOS, and possibly more
entries passed by the EFI interface, which does not have the
E820MAX limit on number of memory map entries.

This code doesn't worry about the likely duplicate, overlapping
or (unlikely) conflicting entries between the EFI map and the
E820 map.  It just dumps all the EFI entries into the memmap[]
array (which already has the E820 entries) and lets the existing
routine sanitize_e820_map() sort the mess out.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/efi.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 77d424cf68b3..4a1a26d5931f 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -213,6 +213,31 @@ unsigned long efi_get_time(void)
 		      eft.minute, eft.second);
 }
 
+/*
+ * Tell the kernel about the EFI memory map.  This might include
+ * more than the max 128 entries that can fit in the e820 legacy
+ * (zeropage) memory map.
+ */
+
+static void __init add_efi_memmap(void)
+{
+	void *p;
+
+	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+		efi_memory_desc_t *md = p;
+		unsigned long long start = md->phys_addr;
+		unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+		int e820_type;
+
+		if (md->attribute & EFI_MEMORY_WB)
+			e820_type = E820_RAM;
+		else
+			e820_type = E820_RESERVED;
+		add_memory_region(start, size, e820_type);
+	}
+	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+}
+
 #if EFI_DEBUG
 static void __init print_efi_memmap(void)
 {
@@ -370,6 +395,7 @@ void __init efi_init(void)
 	if (memmap.desc_size != sizeof(efi_memory_desc_t))
 		printk(KERN_WARNING "Kernel-defined memdesc"
 		       "doesn't match the one from EFI!\n");
+	add_efi_memmap();
 
 	/* Setup for EFI runtime service */
 	reboot_type = BOOT_EFI;
-- 
cgit v1.2.3


From a4c81cf684350797939416c99effb9d3ae46bca6 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 18 May 2008 01:18:57 -0700
Subject: x86: extend e820 ealy_res support 32bit

move early_res related from e820_64.c to e820.c
make edba detection to be done in head32.c
remove smp_alloc_memory, because we have fixed trampoline address now.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>

 arch/x86/kernel/e820.c              |  214 ++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/e820_64.c           |  196 --------------------------------
 arch/x86/kernel/head32.c            |   76 ++++++++++++
 arch/x86/kernel/setup_32.c          |  109 +++---------------
 arch/x86/kernel/smpboot.c           |   17 --
 arch/x86/kernel/trampoline.c        |    2
 arch/x86/mach-voyager/voyager_smp.c |    9 -
 include/asm-x86/e820.h              |    6 +
 include/asm-x86/e820_64.h           |    9 -
 include/asm-x86/smp.h               |    1
 arch/x86/kernel/e820.c              |  214 ++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/e820_64.c           |  196 --------------------------------
 arch/x86/kernel/head32.c            |   76 ++++++++++++
 arch/x86/kernel/setup_32.c          |  109 +++---------------
 arch/x86/kernel/smpboot.c           |   17 --
 arch/x86/kernel/trampoline.c        |    2
 arch/x86/mach-voyager/voyager_smp.c |    9 -
 include/asm-x86/e820.h              |    6 +
 include/asm-x86/e820_64.h           |    9 -
 include/asm-x86/smp.h               |    1
 arch/x86/kernel/e820.c              |  214 ++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/e820_64.c           |  196 --------------------------------
 arch/x86/kernel/head32.c            |   76 ++++++++++++
 arch/x86/kernel/setup_32.c          |  109 +++---------------
 arch/x86/kernel/smpboot.c           |   17 --
 arch/x86/kernel/trampoline.c        |    2
 arch/x86/mach-voyager/voyager_smp.c |    9 -
 include/asm-x86/e820.h              |    6 +
 include/asm-x86/e820_64.h           |    9 -
 include/asm-x86/smp.h               |    1
 10 files changed, 320 insertions(+), 319 deletions(-)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c              | 214 ++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/e820_64.c           | 196 ---------------------------------
 arch/x86/kernel/head32.c            |  76 +++++++++++++
 arch/x86/kernel/setup_32.c          | 109 ++++--------------
 arch/x86/kernel/smpboot.c           |  17 ---
 arch/x86/kernel/trampoline.c        |   2 +-
 arch/x86/mach-voyager/voyager_smp.c |   9 --
 include/asm-x86/e820.h              |   6 +
 include/asm-x86/e820_64.h           |   9 --
 include/asm-x86/smp.h               |   1 -
 10 files changed, 320 insertions(+), 319 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 41c480ae47df..35da8cdbe5e6 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -22,7 +22,9 @@
 #include <asm/pgtable.h>
 #include <asm/page.h>
 #include <asm/e820.h>
+#include <asm/proto.h>
 #include <asm/setup.h>
+#include <asm/trampoline.h>
 
 struct e820map e820;
 
@@ -493,3 +495,215 @@ __init void e820_setup_gap(void)
 	       pci_mem_start, gapstart, gapsize);
 }
 
+
+/*
+ * Early reserved memory areas.
+ */
+#define MAX_EARLY_RES 20
+
+struct early_res {
+	u64 start, end;
+	char name[16];
+};
+static struct early_res early_res[MAX_EARLY_RES] __initdata = {
+	{ 0, PAGE_SIZE, "BIOS data page" },	/* BIOS data page */
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
+	{ TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
+#endif
+#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
+	/*
+	 * But first pinch a few for the stack/trampoline stuff
+	 * FIXME: Don't need the extra page at 4K, but need to fix
+	 * trampoline before removing it. (see the GDT stuff)
+	 */
+	{ PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
+	/*
+	 * Has to be in very low memory so we can execute
+	 * real-mode AP code.
+	 */
+	{ TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
+#endif
+	{}
+};
+
+void __init reserve_early(u64 start, u64 end, char *name)
+{
+	int i;
+	struct early_res *r;
+	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+		r = &early_res[i];
+		if (end > r->start && start < r->end)
+			panic("Overlapping early reservations %llx-%llx %s to %llx-%llx %s\n",
+			      start, end - 1, name?name:"", r->start,
+			      r->end - 1, r->name);
+	}
+	if (i >= MAX_EARLY_RES)
+		panic("Too many early reservations");
+	r = &early_res[i];
+	r->start = start;
+	r->end = end;
+	if (name)
+		strncpy(r->name, name, sizeof(r->name) - 1);
+}
+
+void __init free_early(u64 start, u64 end)
+{
+	struct early_res *r;
+	int i, j;
+
+	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+		r = &early_res[i];
+		if (start == r->start && end == r->end)
+			break;
+	}
+	if (i >= MAX_EARLY_RES || !early_res[i].end)
+		panic("free_early on not reserved area: %llx-%llx!",
+			 start, end);
+
+	for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
+		;
+
+	memmove(&early_res[i], &early_res[i + 1],
+	       (j - 1 - i) * sizeof(struct early_res));
+
+	early_res[j - 1].end = 0;
+}
+
+void __init early_res_to_bootmem(u64 start, u64 end)
+{
+	int i;
+	u64 final_start, final_end;
+	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+		struct early_res *r = &early_res[i];
+		final_start = max(start, r->start);
+		final_end = min(end, r->end);
+		if (final_start >= final_end)
+			continue;
+		printk(KERN_INFO "  early res: %d [%llx-%llx] %s\n", i,
+			final_start, final_end - 1, r->name);
+#ifdef CONFIG_X86_64
+		reserve_bootmem_generic(final_start, final_end - final_start);
+#else
+		reserve_bootmem(final_start, final_end - final_start,
+				BOOTMEM_DEFAULT);
+#endif
+	}
+}
+
+/* Check for already reserved areas */
+static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
+{
+	int i;
+	u64 addr = *addrp, last;
+	int changed = 0;
+again:
+	last = addr + size;
+	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+		struct early_res *r = &early_res[i];
+		if (last >= r->start && addr < r->end) {
+			*addrp = addr = round_up(r->end, align);
+			changed = 1;
+			goto again;
+		}
+	}
+	return changed;
+}
+
+/* Check for already reserved areas */
+static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
+{
+	int i;
+	u64 addr = *addrp, last;
+	u64 size = *sizep;
+	int changed = 0;
+again:
+	last = addr + size;
+	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+		struct early_res *r = &early_res[i];
+		if (last > r->start && addr < r->start) {
+			size = r->start - addr;
+			changed = 1;
+			goto again;
+		}
+		if (last > r->end && addr < r->end) {
+			addr = round_up(r->end, align);
+			size = last - addr;
+			changed = 1;
+			goto again;
+		}
+		if (last <= r->end && addr >= r->start) {
+			(*sizep)++;
+			return 0;
+		}
+	}
+	if (changed) {
+		*addrp = addr;
+		*sizep = size;
+	}
+	return changed;
+}
+
+/*
+ * Find a free area with specified alignment in a specific range.
+ */
+u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
+{
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+		u64 addr, last;
+		u64 ei_last;
+
+		if (ei->type != E820_RAM)
+			continue;
+		addr = round_up(ei->addr, align);
+		ei_last = ei->addr + ei->size;
+		if (addr < start)
+			addr = round_up(start, align);
+		if (addr >= ei_last)
+			continue;
+		while (bad_addr(&addr, size, align) && addr+size <= ei_last)
+			;
+		last = addr + size;
+		if (last > ei_last)
+			continue;
+		if (last > end)
+			continue;
+		return addr;
+	}
+	return -1ULL;
+}
+
+/*
+ * Find next free range after *start
+ */
+u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
+{
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+		u64 addr, last;
+		u64 ei_last;
+
+		if (ei->type != E820_RAM)
+			continue;
+		addr = round_up(ei->addr, align);
+		ei_last = ei->addr + ei->size;
+		if (addr < start)
+			addr = round_up(start, align);
+		if (addr >= ei_last)
+			continue;
+		*sizep = ei_last - addr;
+		while (bad_addr_size(&addr, sizep, align) &&
+			addr + *sizep <= ei_last)
+			;
+		last = addr + *sizep;
+		if (last > ei_last)
+			continue;
+		return addr;
+	}
+	return -1UL;
+
+}
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index 354fbb221709..07941b554519 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -46,202 +46,6 @@ unsigned long max_pfn_mapped;
  */
 static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
 
-/*
- * Early reserved memory areas.
- */
-#define MAX_EARLY_RES 20
-
-struct early_res {
-	unsigned long start, end;
-	char name[16];
-};
-static struct early_res early_res[MAX_EARLY_RES] __initdata = {
-	{ 0, PAGE_SIZE, "BIOS data page" },			/* BIOS data page */
-#ifdef CONFIG_X86_TRAMPOLINE
-	{ TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
-#endif
-	{}
-};
-
-void __init reserve_early(unsigned long start, unsigned long end, char *name)
-{
-	int i;
-	struct early_res *r;
-	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
-		r = &early_res[i];
-		if (end > r->start && start < r->end)
-			panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
-			      start, end - 1, name?name:"", r->start, r->end - 1, r->name);
-	}
-	if (i >= MAX_EARLY_RES)
-		panic("Too many early reservations");
-	r = &early_res[i];
-	r->start = start;
-	r->end = end;
-	if (name)
-		strncpy(r->name, name, sizeof(r->name) - 1);
-}
-
-void __init free_early(unsigned long start, unsigned long end)
-{
-	struct early_res *r;
-	int i, j;
-
-	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
-		r = &early_res[i];
-		if (start == r->start && end == r->end)
-			break;
-	}
-	if (i >= MAX_EARLY_RES || !early_res[i].end)
-		panic("free_early on not reserved area: %lx-%lx!", start, end);
-
-	for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
-		;
-
-	memmove(&early_res[i], &early_res[i + 1],
-	       (j - 1 - i) * sizeof(struct early_res));
-
-	early_res[j - 1].end = 0;
-}
-
-void __init early_res_to_bootmem(unsigned long start, unsigned long end)
-{
-	int i;
-	unsigned long final_start, final_end;
-	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
-		struct early_res *r = &early_res[i];
-		final_start = max(start, r->start);
-		final_end = min(end, r->end);
-		if (final_start >= final_end)
-			continue;
-		printk(KERN_INFO "  early res: %d [%lx-%lx] %s\n", i,
-			final_start, final_end - 1, r->name);
-		reserve_bootmem_generic(final_start, final_end - final_start);
-	}
-}
-
-/* Check for already reserved areas */
-static inline int __init
-bad_addr(unsigned long *addrp, unsigned long size, unsigned long align)
-{
-	int i;
-	unsigned long addr = *addrp, last;
-	int changed = 0;
-again:
-	last = addr + size;
-	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
-		struct early_res *r = &early_res[i];
-		if (last >= r->start && addr < r->end) {
-			*addrp = addr = round_up(r->end, align);
-			changed = 1;
-			goto again;
-		}
-	}
-	return changed;
-}
-
-/* Check for already reserved areas */
-static inline int __init
-bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align)
-{
-	int i;
-	unsigned long addr = *addrp, last;
-	unsigned long size = *sizep;
-	int changed = 0;
-again:
-	last = addr + size;
-	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
-		struct early_res *r = &early_res[i];
-		if (last > r->start && addr < r->start) {
-			size = r->start - addr;
-			changed = 1;
-			goto again;
-		}
-		if (last > r->end && addr < r->end) {
-			addr = round_up(r->end, align);
-			size = last - addr;
-			changed = 1;
-			goto again;
-		}
-		if (last <= r->end && addr >= r->start) {
-			(*sizep)++;
-			return 0;
-		}
-	}
-	if (changed) {
-		*addrp = addr;
-		*sizep = size;
-	}
-	return changed;
-}
-
-/*
- * Find a free area with specified alignment in a specific range.
- */
-unsigned long __init find_e820_area(unsigned long start, unsigned long end,
-				    unsigned long size, unsigned long align)
-{
-	int i;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		unsigned long addr, last;
-		unsigned long ei_last;
-
-		if (ei->type != E820_RAM)
-			continue;
-		addr = round_up(ei->addr, align);
-		ei_last = ei->addr + ei->size;
-		if (addr < start)
-			addr = round_up(start, align);
-		if (addr >= ei_last)
-			continue;
-		while (bad_addr(&addr, size, align) && addr+size <= ei_last)
-			;
-		last = addr + size;
-		if (last > ei_last)
-			continue;
-		if (last > end)
-			continue;
-		return addr;
-	}
-	return -1UL;
-}
-
-/*
- * Find next free range after *start
- */
-unsigned long __init find_e820_area_size(unsigned long start,
-					 unsigned long *sizep,
-					 unsigned long align)
-{
-	int i;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		unsigned long addr, last;
-		unsigned long ei_last;
-
-		if (ei->type != E820_RAM)
-			continue;
-		addr = round_up(ei->addr, align);
-		ei_last = ei->addr + ei->size;
-		if (addr < start)
-			addr = round_up(start, align);
-		if (addr >= ei_last)
-			continue;
-		*sizep = ei_last - addr;
-		while (bad_addr_size(&addr, sizep, align) &&
-			addr + *sizep <= ei_last)
-			;
-		last = addr + *sizep;
-		if (last > ei_last)
-			continue;
-		return addr;
-	}
-	return -1UL;
-
-}
 /*
  * Find the highest page frame number we have available
  */
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 3db059058927..c216d3c2a991 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -8,7 +8,83 @@
 #include <linux/init.h>
 #include <linux/start_kernel.h>
 
+#include <asm/setup.h>
+#include <asm/sections.h>
+#include <asm/e820.h>
+#include <asm/bios_ebda.h>
+
+#define BIOS_LOWMEM_KILOBYTES 0x413
+
+/*
+ * The BIOS places the EBDA/XBDA at the top of conventional
+ * memory, and usually decreases the reported amount of
+ * conventional memory (int 0x12) too. This also contains a
+ * workaround for Dell systems that neglect to reserve EBDA.
+ * The same workaround also avoids a problem with the AMD768MPX
+ * chipset: reserve a page before VGA to prevent PCI prefetch
+ * into it (errata #56). Usually the page is reserved anyways,
+ * unless you have no PS/2 mouse plugged in.
+ */
+static void __init reserve_ebda_region(void)
+{
+	unsigned int lowmem, ebda_addr;
+
+	/* To determine the position of the EBDA and the */
+	/* end of conventional memory, we need to look at */
+	/* the BIOS data area. In a paravirtual environment */
+	/* that area is absent. We'll just have to assume */
+	/* that the paravirt case can handle memory setup */
+	/* correctly, without our help. */
+	if (paravirt_enabled())
+		return;
+
+	/* end of low (conventional) memory */
+	lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
+	lowmem <<= 10;
+
+	/* start of EBDA area */
+	ebda_addr = get_bios_ebda();
+
+	/* Fixup: bios puts an EBDA in the top 64K segment */
+	/* of conventional memory, but does not adjust lowmem. */
+	if ((lowmem - ebda_addr) <= 0x10000)
+		lowmem = ebda_addr;
+
+	/* Fixup: bios does not report an EBDA at all. */
+	/* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
+	if ((ebda_addr == 0) && (lowmem >= 0x9f000))
+		lowmem = 0x9f000;
+
+	/* Paranoia: should never happen, but... */
+	if ((lowmem == 0) || (lowmem >= 0x100000))
+		lowmem = 0x9f000;
+
+	/* reserve all memory between lowmem and the 1MB mark */
+	reserve_early(lowmem, 0x100000, "BIOS reserved");
+}
+
 void __init i386_start_kernel(void)
 {
+	reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+
+#ifdef CONFIG_BLK_DEV_INITRD
+	/* Reserve INITRD */
+	if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
+		u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+		u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
+		u64 ramdisk_end   = ramdisk_image + ramdisk_size;
+		reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
+	}
+#endif
+	reserve_early(__pa_symbol(&_end), init_pg_tables_end, "INIT_PG_TABLE");
+
+	reserve_ebda_region();
+
+	/*
+	 * At this point everything still needed from the boot loader
+	 * or BIOS or kernel text should be early reserved or marked not
+	 * RAM in e820. All other memory is free game.
+	 */
+
 	start_kernel();
 }
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 5faeab69edd9..fed482c62450 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -359,56 +359,6 @@ unsigned long __init find_max_low_pfn(void)
 	return max_low_pfn;
 }
 
-#define BIOS_LOWMEM_KILOBYTES 0x413
-
-/*
- * The BIOS places the EBDA/XBDA at the top of conventional
- * memory, and usually decreases the reported amount of
- * conventional memory (int 0x12) too. This also contains a
- * workaround for Dell systems that neglect to reserve EBDA.
- * The same workaround also avoids a problem with the AMD768MPX
- * chipset: reserve a page before VGA to prevent PCI prefetch
- * into it (errata #56). Usually the page is reserved anyways,
- * unless you have no PS/2 mouse plugged in.
- */
-static void __init reserve_ebda_region(void)
-{
-	unsigned int lowmem, ebda_addr;
-
-	/* To determine the position of the EBDA and the */
-	/* end of conventional memory, we need to look at */
-	/* the BIOS data area. In a paravirtual environment */
-	/* that area is absent. We'll just have to assume */
-	/* that the paravirt case can handle memory setup */
-	/* correctly, without our help. */
-	if (paravirt_enabled())
-		return;
-
-	/* end of low (conventional) memory */
-	lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
-	lowmem <<= 10;
-
-	/* start of EBDA area */
-	ebda_addr = get_bios_ebda();
-
-	/* Fixup: bios puts an EBDA in the top 64K segment */
-	/* of conventional memory, but does not adjust lowmem. */
-	if ((lowmem - ebda_addr) <= 0x10000)
-		lowmem = ebda_addr;
-
-	/* Fixup: bios does not report an EBDA at all. */
-	/* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
-	if ((ebda_addr == 0) && (lowmem >= 0x9f000))
-		lowmem = 0x9f000;
-
-	/* Paranoia: should never happen, but... */
-	if ((lowmem == 0) || (lowmem >= 0x100000))
-		lowmem = 0x9f000;
-
-	/* reserve all memory between lowmem and the 1MB mark */
-	reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
-}
-
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 static void __init setup_bootmem_allocator(void);
 static unsigned long __init setup_memory(void)
@@ -522,25 +472,32 @@ static void __init reserve_initrd(void)
 	unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
 	unsigned long ramdisk_here;
 
-	initrd_start = 0;
-
 	if (!boot_params.hdr.type_of_loader ||
 	    !ramdisk_image || !ramdisk_size)
 		return;		/* No initrd provided by bootloader */
 
+	initrd_start = 0;
+
 	if (ramdisk_end < ramdisk_image) {
+		free_bootmem(ramdisk_image, ramdisk_size);
 		printk(KERN_ERR "initrd wraps around end of memory, "
 		       "disabling initrd\n");
 		return;
 	}
 	if (ramdisk_size >= end_of_lowmem/2) {
+		free_bootmem(ramdisk_image, ramdisk_size);
 		printk(KERN_ERR "initrd too large to handle, "
 		       "disabling initrd\n");
 		return;
 	}
+
 	if (ramdisk_end <= end_of_lowmem) {
 		/* All in lowmem, easy case */
-		reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
+		/*
+		 * don't need to reserve again, already reserved early
+		 * in i386_start_kernel, and early_res_to_bootmem
+		 * convert that to reserved in bootmem
+		 */
 		initrd_start = ramdisk_image + PAGE_OFFSET;
 		initrd_end = initrd_start+ramdisk_size;
 		return;
@@ -582,6 +539,8 @@ static void __init relocate_initrd(void)
 		p = (char *)__va(ramdisk_image);
 		memcpy(q, p, clen);
 		q += clen;
+		/* need to free these low pages...*/
+		free_bootmem(ramdisk_image, clen);
 		ramdisk_image += clen;
 		ramdisk_size  -= clen;
 	}
@@ -600,47 +559,28 @@ static void __init relocate_initrd(void)
 		ramdisk_image += clen;
 		ramdisk_size  -= clen;
 	}
+	/* high pages is not converted by early_res_to_bootmem */
 }
 
 #endif /* CONFIG_BLK_DEV_INITRD */
 
 void __init setup_bootmem_allocator(void)
 {
-	unsigned long bootmap_size;
+	unsigned long bootmap_size, bootmap;
 	/*
 	 * Initialize the boot-time allocator (with low memory only):
 	 */
-	bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
-
+	bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
+	bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+				 max_low_pfn<<PAGE_SHIFT, bootmap_size,
+				 PAGE_SIZE);
+	if (bootmap == -1L)
+		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
+	bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn);
 	register_bootmem_low_pages(max_low_pfn);
+	early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
+	reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
 
-	/*
-	 * Reserve the bootmem bitmap itself as well. We do this in two
-	 * steps (first step was init_bootmem()) because this catches
-	 * the (very unlikely) case of us accidentally initializing the
-	 * bootmem allocator with an invalid RAM area.
-	 */
-	reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
-			 bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
-			 BOOTMEM_DEFAULT);
-
-	/*
-	 * reserve physical page 0 - it's a special BIOS page on many boxes,
-	 * enabling clean reboots, SMP operation, laptop functions.
-	 */
-	reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
-
-	/* reserve EBDA region */
-	reserve_ebda_region();
-
-#ifdef CONFIG_SMP
-	/*
-	 * But first pinch a few for the stack/trampoline stuff
-	 * FIXME: Don't need the extra page at 4K, but need to fix
-	 * trampoline before removing it. (see the GDT stuff)
-	 */
-	reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
-#endif
 #ifdef CONFIG_ACPI_SLEEP
 	/*
 	 * Reserve low memory region for sleep support.
@@ -803,9 +743,6 @@ void __init setup_arch(char **cmdline_p)
 	 * not to exceed the 8Mb limit.
 	 */
 
-#ifdef CONFIG_SMP
-	smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
-#endif
 	paging_init();
 
 	/*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 38988491c622..843722e2b79e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -555,23 +555,6 @@ cpumask_t cpu_coregroup_map(int cpu)
 		return c->llc_shared_map;
 }
 
-#ifdef CONFIG_X86_32
-/*
- * We are called very early to get the low memory for the
- * SMP bootup trampoline page.
- */
-void __init smp_alloc_memory(void)
-{
-	trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
-	/*
-	 * Has to be in very low memory so we can execute
-	 * real-mode AP code.
-	 */
-	if (__pa(trampoline_base) >= 0x9F000)
-		BUG();
-}
-#endif
-
 static void impress_friends(void)
 {
 	int cpu;
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index abbf199adebb..1106fac6024d 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -2,7 +2,7 @@
 
 #include <asm/trampoline.h>
 
-/* ready for x86_64, no harm for x86, since it will overwrite after alloc */
+/* ready for x86_64 and x86 */
 unsigned char *trampoline_base = __va(TRAMPOLINE_BASE);
 
 /*
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 8acbf0cdf1a5..7bbebbfe8c4e 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -1137,15 +1137,6 @@ void flush_tlb_all(void)
 	on_each_cpu(do_flush_tlb_all, 0, 1, 1);
 }
 
-/* used to set up the trampoline for other CPUs when the memory manager
- * is sorted out */
-void __init smp_alloc_memory(void)
-{
-	trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
-	if (__pa(trampoline_base) >= 0x93000)
-		BUG();
-}
-
 /* send a reschedule CPI to one CPU by physical CPU number*/
 static void voyager_smp_send_reschedule(int cpu)
 {
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index 1eb13b881437..5a58e2bb1d78 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -70,6 +70,12 @@ extern u64 update_memory_range(u64 start, u64 size, unsigned old_type,
 extern void update_e820(void);
 extern void e820_setup_gap(void);
 
+extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
+extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
+extern void reserve_early(u64 start, u64 end, char *name);
+extern void free_early(u64 start, u64 end);
+extern void early_res_to_bootmem(u64 start, u64 end);
+
 #endif /* __ASSEMBLY__ */
 
 #define ISA_START_ADDRESS	0xa0000
diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h
index 9fac77e01441..37f176a02bc6 100644
--- a/include/asm-x86/e820_64.h
+++ b/include/asm-x86/e820_64.h
@@ -14,11 +14,6 @@
 #include <linux/ioport.h>
 
 #ifndef __ASSEMBLY__
-extern unsigned long find_e820_area(unsigned long start, unsigned long end,
-				    unsigned long size, unsigned long align);
-extern unsigned long find_e820_area_size(unsigned long start,
-					 unsigned long *sizep,
-					 unsigned long align);
 extern void setup_memory_region(void);
 extern void contig_e820_setup(void);
 extern unsigned long e820_end_of_ram(void);
@@ -35,10 +30,6 @@ extern void e820_register_active_regions(int nid, unsigned long start_pfn,
 
 extern void finish_e820_parsing(void);
 
-extern void reserve_early(unsigned long start, unsigned long end, char *name);
-extern void free_early(unsigned long start, unsigned long end);
-extern void early_res_to_bootmem(unsigned long start, unsigned long end);
-
 #endif/*!__ASSEMBLY__*/
 
 #endif/*__E820_HEADER*/
diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h
index 1ebaa5cd3112..514e52b95cef 100644
--- a/include/asm-x86/smp.h
+++ b/include/asm-x86/smp.h
@@ -201,7 +201,6 @@ extern void cpu_exit_clear(void);
 extern void cpu_uninit(void);
 #endif
 
-extern void smp_alloc_memory(void);
 extern void lock_ipi_call_lock(void);
 extern void unlock_ipi_call_lock(void);
 #endif /* __ASSEMBLY__ */
-- 
cgit v1.2.3


From ba5b14cc0311af6ed20dc25aa98a1d5ea6f2e6c0 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 21 May 2008 18:40:18 -0700
Subject: x86: extend e820 ealy_res support 32bit - fix

use find_e820_area to find addess for new RAMDISK, instead of using ram blindly

also print out low ram and bootmap info

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/setup_32.c | 55 ++++++++++++++++++++++++++--------------------
 1 file changed, 31 insertions(+), 24 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index fed482c62450..3c451d143eba 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -466,11 +466,11 @@ static bool do_relocate_initrd = false;
 
 static void __init reserve_initrd(void)
 {
-	unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
-	unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;
-	unsigned long ramdisk_end   = ramdisk_image + ramdisk_size;
-	unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
-	unsigned long ramdisk_here;
+	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+	u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
+	u64 ramdisk_end   = ramdisk_image + ramdisk_size;
+	u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+	u64 ramdisk_here;
 
 	if (!boot_params.hdr.type_of_loader ||
 	    !ramdisk_image || !ramdisk_size)
@@ -478,14 +478,8 @@ static void __init reserve_initrd(void)
 
 	initrd_start = 0;
 
-	if (ramdisk_end < ramdisk_image) {
-		free_bootmem(ramdisk_image, ramdisk_size);
-		printk(KERN_ERR "initrd wraps around end of memory, "
-		       "disabling initrd\n");
-		return;
-	}
 	if (ramdisk_size >= end_of_lowmem/2) {
-		free_bootmem(ramdisk_image, ramdisk_size);
+		free_early(ramdisk_image, ramdisk_image + ramdisk_size - 1);
 		printk(KERN_ERR "initrd too large to handle, "
 		       "disabling initrd\n");
 		return;
@@ -495,8 +489,7 @@ static void __init reserve_initrd(void)
 		/* All in lowmem, easy case */
 		/*
 		 * don't need to reserve again, already reserved early
-		 * in i386_start_kernel, and early_res_to_bootmem
-		 * convert that to reserved in bootmem
+		 * in i386_start_kernel
 		 */
 		initrd_start = ramdisk_image + PAGE_OFFSET;
 		initrd_end = initrd_start+ramdisk_size;
@@ -504,11 +497,14 @@ static void __init reserve_initrd(void)
 	}
 
 	/* We need to move the initrd down into lowmem */
-	ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
+	ramdisk_here = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+				 end_of_lowmem, ramdisk_size,
+				 PAGE_SIZE);
 
 	/* Note: this includes all the lowmem currently occupied by
 	   the initrd, we rely on that fact to keep the data intact. */
-	reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
+	reserve_early(ramdisk_here, ramdisk_here + ramdisk_size - 1,
+			 "NEW RAMDISK");
 	initrd_start = ramdisk_here + PAGE_OFFSET;
 	initrd_end   = initrd_start + ramdisk_size;
 
@@ -519,10 +515,10 @@ static void __init reserve_initrd(void)
 
 static void __init relocate_initrd(void)
 {
-	unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
-	unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;
-	unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
-	unsigned long ramdisk_here;
+	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+	u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
+	u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+	u64 ramdisk_here;
 	unsigned long slop, clen, mapaddr;
 	char *p, *q;
 
@@ -540,6 +536,8 @@ static void __init relocate_initrd(void)
 		memcpy(q, p, clen);
 		q += clen;
 		/* need to free these low pages...*/
+		printk(KERN_INFO "Freeing old partial RAMDISK %08llx-%08llx\n",
+			 ramdisk_image, ramdisk_image + clen - 1);
 		free_bootmem(ramdisk_image, clen);
 		ramdisk_image += clen;
 		ramdisk_size  -= clen;
@@ -560,6 +558,11 @@ static void __init relocate_initrd(void)
 		ramdisk_size  -= clen;
 	}
 	/* high pages is not converted by early_res_to_bootmem */
+	ramdisk_image = boot_params.hdr.ramdisk_image;
+	ramdisk_size  = boot_params.hdr.ramdisk_size;
+	printk(KERN_INFO "Copied RAMDISK from %016llx - %016llx to %08llx - %08llx\n",
+		ramdisk_image, ramdisk_image + ramdisk_size - 1,
+		ramdisk_here, ramdisk_here + ramdisk_size - 1);
 }
 
 #endif /* CONFIG_BLK_DEV_INITRD */
@@ -576,10 +579,17 @@ void __init setup_bootmem_allocator(void)
 				 PAGE_SIZE);
 	if (bootmap == -1L)
 		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
+	reserve_early(bootmap, bootmap + bootmap_size - 1, "BOOTMAP");
+#ifdef CONFIG_BLK_DEV_INITRD
+	reserve_initrd();
+#endif
 	bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn);
+	printk(KERN_INFO "  low ram: %08lx - %08lx\n",
+		 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
+	printk(KERN_INFO "  bootmap [%08lx -  %08lx]\n",
+		 bootmap, bootmap + bootmap_size - 1);
 	register_bootmem_low_pages(max_low_pfn);
 	early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
-	reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
 
 #ifdef CONFIG_ACPI_SLEEP
 	/*
@@ -592,9 +602,6 @@ void __init setup_bootmem_allocator(void)
 	 * Find and reserve possible boot-time SMP configuration:
 	 */
 	find_smp_config();
-#endif
-#ifdef CONFIG_BLK_DEV_INITRD
-	reserve_initrd();
 #endif
 	numa_kva_reserve();
 	reserve_crashkernel();
-- 
cgit v1.2.3


From 7f028bc0fd119a4fc65aedc07728ce85c8813d33 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 12 May 2008 15:43:35 +0200
Subject: x86: move mp_ioapic_routing to mpparse and make it static

mpparse is the only user of mp_ioapic_routing.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/boot.c | 2 --
 arch/x86/kernel/mpparse.c   | 4 ++--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index c49ebcc6c41e..6f20fa92bbaf 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -331,8 +331,6 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e
 
 #ifdef CONFIG_X86_IO_APIC
 
-struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
-
 static int __init
 acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
 {
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 404683b94e79..bb7213335991 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -1,5 +1,5 @@
 /*
- *	Intel Multiprocessor Specification 1.1 and 1.4
+2 *	Intel Multiprocessor Specification 1.1 and 1.4
  *	compliant MP-table parsing routines.
  *
  *	(c) 1995 Alan Cox, Building #3 <alan@redhat.com>
@@ -805,7 +805,7 @@ int es7000_plat;
 
 #define MP_ISA_BUS		0
 
-extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
+static struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
 
 static int mp_find_ioapic(int gsi)
 {
-- 
cgit v1.2.3


From a91eea6df383f26fc4fcbefcbdb5aee8facd17fd Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 12 May 2008 15:43:37 +0200
Subject: x86: fix shadow variables of global end_pnf in e820_64.c

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820_64.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index 07941b554519..5a9bc9244dc6 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -51,21 +51,21 @@ static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
  */
 unsigned long __init e820_end_of_ram(void)
 {
-	unsigned long end_pfn;
+	unsigned long last_pfn;
 
-	end_pfn = find_max_pfn_with_active_regions();
+	last_pfn = find_max_pfn_with_active_regions();
 
-	if (end_pfn > max_pfn_mapped)
-		max_pfn_mapped = end_pfn;
+	if (last_pfn > max_pfn_mapped)
+		max_pfn_mapped = last_pfn;
 	if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
 		max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
-	if (end_pfn > end_user_pfn)
-		end_pfn = end_user_pfn;
-	if (end_pfn > max_pfn_mapped)
-		end_pfn = max_pfn_mapped;
+	if (last_pfn > end_user_pfn)
+		last_pfn = end_user_pfn;
+	if (last_pfn > max_pfn_mapped)
+		last_pfn = max_pfn_mapped;
 
 	printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
-	return end_pfn;
+	return last_pfn;
 }
 
 /*
@@ -124,12 +124,12 @@ void __init e820_mark_nosave_regions(void)
 }
 
 /*
- * Finds an active region in the address range from start_pfn to end_pfn and
+ * Finds an active region in the address range from start_pfn to last_pfn and
  * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
  */
 static int __init e820_find_active_region(const struct e820entry *ei,
 					  unsigned long start_pfn,
-					  unsigned long end_pfn,
+					  unsigned long last_pfn,
 					  unsigned long *ei_startpfn,
 					  unsigned long *ei_endpfn)
 {
@@ -146,14 +146,14 @@ static int __init e820_find_active_region(const struct e820entry *ei,
 
 	/* Skip if map is outside the node */
 	if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
-				    *ei_startpfn >= end_pfn)
+				    *ei_startpfn >= last_pfn)
 		return 0;
 
 	/* Check for overlaps */
 	if (*ei_startpfn < start_pfn)
 		*ei_startpfn = start_pfn;
-	if (*ei_endpfn > end_pfn)
-		*ei_endpfn = end_pfn;
+	if (*ei_endpfn > last_pfn)
+		*ei_endpfn = last_pfn;
 
 	/* Obey end_user_pfn to save on memmap */
 	if (*ei_startpfn >= end_user_pfn)
@@ -167,7 +167,7 @@ static int __init e820_find_active_region(const struct e820entry *ei,
 /* Walk the e820 map and register active regions within a node */
 void __init
 e820_register_active_regions(int nid, unsigned long start_pfn,
-							unsigned long end_pfn)
+							unsigned long last_pfn)
 {
 	unsigned long ei_startpfn;
 	unsigned long ei_endpfn;
@@ -175,7 +175,7 @@ e820_register_active_regions(int nid, unsigned long start_pfn,
 
 	for (i = 0; i < e820.nr_map; i++)
 		if (e820_find_active_region(&e820.map[i],
-					    start_pfn, end_pfn,
+					    start_pfn, last_pfn,
 					    &ei_startpfn, &ei_endpfn))
 			add_active_range(nid, ei_startpfn, ei_endpfn);
 }
@@ -188,13 +188,13 @@ e820_register_active_regions(int nid, unsigned long start_pfn,
 unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
-	unsigned long end_pfn = end >> PAGE_SHIFT;
+	unsigned long last_pfn = end >> PAGE_SHIFT;
 	unsigned long ei_startpfn, ei_endpfn, ram = 0;
 	int i;
 
 	for (i = 0; i < e820.nr_map; i++) {
 		if (e820_find_active_region(&e820.map[i],
-					    start_pfn, end_pfn,
+					    start_pfn, last_pfn,
 					    &ei_startpfn, &ei_endpfn))
 			ram += ei_endpfn - ei_startpfn;
 	}
-- 
cgit v1.2.3


From 4a139a7fdec8964cecf7a3564ee7ae327141d495 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 12 May 2008 15:43:37 +0200
Subject: x86: include pci.h in e820_64.c

global pci_mem_start needs a declaration. include pci.h

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820_64.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index 5a9bc9244dc6..3b2e0b1bb49b 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -19,6 +19,7 @@
 #include <linux/mm.h>
 #include <linux/suspend.h>
 #include <linux/pfn.h>
+#include <linux/pci.h>
 
 #include <asm/pgtable.h>
 #include <asm/page.h>
-- 
cgit v1.2.3


From 11a62a056093a7f25f1595fbd8bd5f93559572b6 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel.send@gmail.com>
Date: Mon, 12 May 2008 15:43:38 +0200
Subject: x86: cleanup print out for mptable

the new output is:

 MPTABLE: OEM ID: SUN
 MPTABLE: Product ID: 4600 M2
 MPTABLE: APIC at: 0x

instead of it all in one line with <6> and double Product ID...

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/mpparse.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index bb7213335991..5a18b2b9852e 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -113,7 +113,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
 #ifdef CONFIG_X86_NUMAQ
 	mpc_oem_bus_info(m, str, translation_table[mpc_record]);
 #else
-	Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
+	printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
 #endif
 
 #if MAX_MP_BUSSES < 256
@@ -183,7 +183,7 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
 static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
 {
 	mp_irqs[mp_irq_entries] = *m;
-	Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
+	printk(KERN_INFO "Int: type %d, pol %d, trig %d, bus %02x,"
 		" IRQ %02x, APIC ID %x, APIC INT %02x\n",
 		m->mpc_irqtype, m->mpc_irqflag & 3,
 		(m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
@@ -196,7 +196,7 @@ static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
 
 static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
 {
-	Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
+	printk(KERN_INFO "Lint: type %d, pol %d, trig %d, bus %02x,"
 		" IRQ %02x, APIC ID %x, APIC LINT %02x\n",
 		m->mpc_irqtype, m->mpc_irqflag & 3,
 		(m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
@@ -309,16 +309,15 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
 	}
 	memcpy(oem, mpc->mpc_oem, 8);
 	oem[8] = 0;
-	printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem);
+	printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
 
 	memcpy(str, mpc->mpc_productid, 12);
 	str[12] = 0;
-	printk("Product ID: %s ", str);
 
 #ifdef CONFIG_X86_32
 	mps_oem_check(mpc, oem, str);
 #endif
-	printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
+	printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
 
 	printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
 
@@ -689,7 +688,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 	unsigned int *bp = phys_to_virt(base);
 	struct intel_mp_floating *mpf;
 
-	Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
+	printk(KERN_DEBUG "Scan SMP from %p for %ld bytes.\n", bp, length);
 	BUILD_BUG_ON(sizeof(*mpf) != 16);
 
 	while (length > 0) {
-- 
cgit v1.2.3


From 32c5061265caf201d6a2c0d02181e2b68769c22c Mon Sep 17 00:00:00 2001
From: Alexey Starikovskiy <astarikovskiy@suse.de>
Date: Wed, 14 May 2008 19:02:51 +0400
Subject: x86: move es7000_plat out of mpparse.c

Signed-off-by: Alexey Starikovskiy <astarikovskiy@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/mpparse.c         | 11 ++++++-----
 arch/x86/mach-es7000/es7000plat.c |  2 ++
 include/asm-x86/system.h          |  1 -
 3 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 5a18b2b9852e..ff1342325efc 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -793,15 +793,14 @@ void __init find_smp_config(void)
                             ACPI-based MP Configuration
    -------------------------------------------------------------------------- */
 
-/*
- * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
- */
-int es7000_plat;
-
 #ifdef CONFIG_ACPI
 
 #ifdef	CONFIG_X86_IO_APIC
 
+#if defined(CONFIG_X86_ES7000) || defined(CONFIG_X86_GENERICARCH)
+extern int es7000_plat;
+#endif
+
 #define MP_ISA_BUS		0
 
 static struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
@@ -928,11 +927,13 @@ void __init mp_config_acpi_legacy_irqs(void)
 	set_bit(MP_ISA_BUS, mp_bus_not_pci);
 	Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
 
+#if defined(CONFIG_X86_ES7000) || defined(CONFIG_X86_GENERICARCH)
 	/*
 	 * Older generations of ES7000 have no legacy identity mappings
 	 */
 	if (es7000_plat == 1)
 		return;
+#endif
 
 	/*
 	 * Locate the IOAPIC that manages the ISA IRQs (0-15).
diff --git a/arch/x86/mach-es7000/es7000plat.c b/arch/x86/mach-es7000/es7000plat.c
index f5d6f7d8b86e..a41c77a47227 100644
--- a/arch/x86/mach-es7000/es7000plat.c
+++ b/arch/x86/mach-es7000/es7000plat.c
@@ -52,6 +52,8 @@ static struct mip_reg		*host_reg;
 static int 			mip_port;
 static unsigned long		mip_addr, host_addr;
 
+int es7000_plat;
+
 /*
  * GSI override for ES7000 platforms.
  */
diff --git a/include/asm-x86/system.h b/include/asm-x86/system.h
index a2f04cd79b29..9f7f63ba0042 100644
--- a/include/asm-x86/system.h
+++ b/include/asm-x86/system.h
@@ -303,7 +303,6 @@ static inline void clflush(volatile void *__p)
 void disable_hlt(void);
 void enable_hlt(void);
 
-extern int es7000_plat;
 void cpu_idle_wait(void);
 
 extern unsigned long arch_align_stack(unsigned long sp);
-- 
cgit v1.2.3


From 11113f84c72bd832dc4e76122e613b0e623e2346 Mon Sep 17 00:00:00 2001
From: Alexey Starikovskiy <astarikovskiy@suse.de>
Date: Wed, 14 May 2008 19:02:57 +0400
Subject: x86: complete move ACPI from mpparse.c

Signed-off-by: Alexey Starikovskiy <astarikovskiy@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/boot.c | 304 ++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/mpparse.c   | 299 +------------------------------------------
 2 files changed, 305 insertions(+), 298 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 6f20fa92bbaf..b2ad09c4c6ae 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -846,6 +846,310 @@ static int __init acpi_parse_madt_lapic_entries(void)
 #endif				/* CONFIG_X86_LOCAL_APIC */
 
 #ifdef	CONFIG_X86_IO_APIC
+#define MP_ISA_BUS		0
+
+#if defined(CONFIG_X86_ES7000) || defined(CONFIG_X86_GENERICARCH)
+extern int es7000_plat;
+#endif
+
+static struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
+
+static int mp_find_ioapic(int gsi)
+{
+	int i = 0;
+
+	/* Find the IOAPIC that manages this GSI. */
+	for (i = 0; i < nr_ioapics; i++) {
+		if ((gsi >= mp_ioapic_routing[i].gsi_base)
+		    && (gsi <= mp_ioapic_routing[i].gsi_end))
+			return i;
+	}
+
+	printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+	return -1;
+}
+
+static u8 __init uniq_ioapic_id(u8 id)
+{
+#ifdef CONFIG_X86_32
+	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+	    !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+		return io_apic_get_unique_id(nr_ioapics, id);
+	else
+		return id;
+#else
+	int i;
+	DECLARE_BITMAP(used, 256);
+	bitmap_zero(used, 256);
+	for (i = 0; i < nr_ioapics; i++) {
+		struct mpc_config_ioapic *ia = &mp_ioapics[i];
+		__set_bit(ia->mpc_apicid, used);
+	}
+	if (!test_bit(id, used))
+		return id;
+	return find_first_zero_bit(used, 256);
+#endif
+}
+
+static int bad_ioapic(unsigned long address)
+{
+	if (nr_ioapics >= MAX_IO_APICS) {
+		printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
+		       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
+		panic("Recompile kernel with bigger MAX_IO_APICS!\n");
+	}
+	if (!address) {
+		printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
+		       " found in table, skipping!\n");
+		return 1;
+	}
+	return 0;
+}
+
+void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+{
+	int idx = 0;
+
+	if (bad_ioapic(address))
+		return;
+
+	idx = nr_ioapics;
+
+	mp_ioapics[idx].mpc_type = MP_IOAPIC;
+	mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
+	mp_ioapics[idx].mpc_apicaddr = address;
+
+	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+	mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
+#ifdef CONFIG_X86_32
+	mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
+#else
+	mp_ioapics[idx].mpc_apicver = 0;
+#endif
+	/*
+	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
+	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
+	 */
+	mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
+	mp_ioapic_routing[idx].gsi_base = gsi_base;
+	mp_ioapic_routing[idx].gsi_end = gsi_base +
+	    io_apic_get_redir_entries(idx);
+
+	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
+	       "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
+	       mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
+	       mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
+
+	nr_ioapics++;
+}
+
+void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
+{
+	int ioapic = -1;
+	int pin = -1;
+
+	/*
+	 * Convert 'gsi' to 'ioapic.pin'.
+	 */
+	ioapic = mp_find_ioapic(gsi);
+	if (ioapic < 0)
+		return;
+	pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+	/*
+	 * TBD: This check is for faulty timer entries, where the override
+	 *      erroneously sets the trigger to level, resulting in a HUGE
+	 *      increase of timer interrupts!
+	 */
+	if ((bus_irq == 0) && (trigger == 3))
+		trigger = 1;
+
+	mp_irqs[mp_irq_entries].mpc_type = MP_INTSRC;
+	mp_irqs[mp_irq_entries].mpc_irqtype = mp_INT;
+	mp_irqs[mp_irq_entries].mpc_irqflag = (trigger << 2) | polarity;
+	mp_irqs[mp_irq_entries].mpc_srcbus = MP_ISA_BUS;
+	mp_irqs[mp_irq_entries].mpc_srcbusirq = bus_irq;	/* IRQ */
+	mp_irqs[mp_irq_entries].mpc_dstapic =
+			mp_ioapics[ioapic].mpc_apicid;	/* APIC ID */
+	mp_irqs[mp_irq_entries].mpc_dstirq = pin;	/* INTIN# */
+
+	if (++mp_irq_entries == MAX_IRQ_SOURCES)
+		panic("Max # of irq sources exceeded!!\n");
+
+}
+
+void __init mp_config_acpi_legacy_irqs(void)
+{
+	int i = 0;
+	int ioapic = -1;
+
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+	/*
+	 * Fabricate the legacy ISA bus (bus #31).
+	 */
+	mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
+#endif
+	set_bit(MP_ISA_BUS, mp_bus_not_pci);
+	Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+
+#if defined(CONFIG_X86_ES7000) || defined(CONFIG_X86_GENERICARCH)
+	/*
+	 * Older generations of ES7000 have no legacy identity mappings
+	 */
+	if (es7000_plat == 1)
+		return;
+#endif
+
+	/*
+	 * Locate the IOAPIC that manages the ISA IRQs (0-15).
+	 */
+	ioapic = mp_find_ioapic(0);
+	if (ioapic < 0)
+		return;
+
+	mp_irqs[mp_irq_entries].mpc_type = MP_INTSRC;
+	mp_irqs[mp_irq_entries].mpc_irqflag = 0;	/* Conforming */
+	mp_irqs[mp_irq_entries].mpc_srcbus = MP_ISA_BUS;
+#ifdef CONFIG_X86_IO_APIC
+	mp_irqs[mp_irq_entries].mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
+#endif
+	/*
+	 * Use the default configuration for the IRQs 0-15.  Unless
+	 * overridden by (MADT) interrupt source override entries.
+	 */
+	for (i = 0; i < 16; i++) {
+		int idx;
+
+		for (idx = 0; idx < mp_irq_entries; idx++) {
+			struct mpc_config_intsrc *irq = mp_irqs + idx;
+
+			/* Do we already have a mapping for this ISA IRQ? */
+			if (irq->mpc_srcbus == MP_ISA_BUS
+			    && irq->mpc_srcbusirq == i)
+				break;
+
+			/* Do we already have a mapping for this IOAPIC pin */
+			if ((irq->mpc_dstapic ==
+				mp_irqs[mp_irq_entries].mpc_dstapic) &&
+			    (irq->mpc_dstirq == i))
+				break;
+		}
+
+		if (idx != mp_irq_entries) {
+			printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
+			continue;	/* IRQ already used */
+		}
+
+		mp_irqs[mp_irq_entries].mpc_irqtype = mp_INT;
+		mp_irqs[mp_irq_entries].mpc_srcbusirq = i;	/* Identity mapped */
+		mp_irqs[mp_irq_entries].mpc_dstirq = i;
+
+		if (++mp_irq_entries == MAX_IRQ_SOURCES)
+			panic("Max # of irq sources exceeded!!\n");
+	}
+}
+
+int mp_register_gsi(u32 gsi, int triggering, int polarity)
+{
+	int ioapic;
+	int ioapic_pin;
+#ifdef CONFIG_X86_32
+#define MAX_GSI_NUM	4096
+#define IRQ_COMPRESSION_START	64
+
+	static int pci_irq = IRQ_COMPRESSION_START;
+	/*
+	 * Mapping between Global System Interrupts, which
+	 * represent all possible interrupts, and IRQs
+	 * assigned to actual devices.
+	 */
+	static int gsi_to_irq[MAX_GSI_NUM];
+#else
+
+	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
+		return gsi;
+#endif
+
+	/* Don't set up the ACPI SCI because it's already set up */
+	if (acpi_gbl_FADT.sci_interrupt == gsi)
+		return gsi;
+
+	ioapic = mp_find_ioapic(gsi);
+	if (ioapic < 0) {
+		printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
+		return gsi;
+	}
+
+	ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+#ifdef CONFIG_X86_32
+	if (ioapic_renumber_irq)
+		gsi = ioapic_renumber_irq(ioapic, gsi);
+#endif
+
+	/*
+	 * Avoid pin reprogramming.  PRTs typically include entries
+	 * with redundant pin->gsi mappings (but unique PCI devices);
+	 * we only program the IOAPIC on the first.
+	 */
+	if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
+		printk(KERN_ERR "Invalid reference to IOAPIC pin "
+		       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
+		       ioapic_pin);
+		return gsi;
+	}
+	if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
+		Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
+			mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
+#ifdef CONFIG_X86_32
+		return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
+#else
+		return gsi;
+#endif
+	}
+
+	set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
+#ifdef CONFIG_X86_32
+	/*
+	 * For GSI >= 64, use IRQ compression
+	 */
+	if ((gsi >= IRQ_COMPRESSION_START)
+	    && (triggering == ACPI_LEVEL_SENSITIVE)) {
+		/*
+		 * For PCI devices assign IRQs in order, avoiding gaps
+		 * due to unused I/O APIC pins.
+		 */
+		int irq = gsi;
+		if (gsi < MAX_GSI_NUM) {
+			/*
+			 * Retain the VIA chipset work-around (gsi > 15), but
+			 * avoid a problem where the 8254 timer (IRQ0) is setup
+			 * via an override (so it's not on pin 0 of the ioapic),
+			 * and at the same time, the pin 0 interrupt is a PCI
+			 * type.  The gsi > 15 test could cause these two pins
+			 * to be shared as IRQ0, and they are not shareable.
+			 * So test for this condition, and if necessary, avoid
+			 * the pin collision.
+			 */
+			gsi = pci_irq++;
+			/*
+			 * Don't assign IRQ used by ACPI SCI
+			 */
+			if (gsi == acpi_gbl_FADT.sci_interrupt)
+				gsi = pci_irq++;
+			gsi_to_irq[irq] = gsi;
+		} else {
+			printk(KERN_ERR "GSI %u is too high\n", gsi);
+			return gsi;
+		}
+	}
+#endif
+	io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
+				triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
+				polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+	return gsi;
+}
+
 /*
  * Parse IOAPIC related entries in MADT
  * returns 0 on success, < 0 on error
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index ff1342325efc..d05b70c329c4 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -1,5 +1,5 @@
 /*
-2 *	Intel Multiprocessor Specification 1.1 and 1.4
+ *	Intel Multiprocessor Specification 1.1 and 1.4
  *	compliant MP-table parsing routines.
  *
  *	(c) 1995 Alan Cox, Building #3 <alan@redhat.com>
@@ -788,300 +788,3 @@ void __init find_smp_config(void)
 {
 	__find_smp_config(1);
 }
-
-/* --------------------------------------------------------------------------
-                            ACPI-based MP Configuration
-   -------------------------------------------------------------------------- */
-
-#ifdef CONFIG_ACPI
-
-#ifdef	CONFIG_X86_IO_APIC
-
-#if defined(CONFIG_X86_ES7000) || defined(CONFIG_X86_GENERICARCH)
-extern int es7000_plat;
-#endif
-
-#define MP_ISA_BUS		0
-
-static struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
-
-static int mp_find_ioapic(int gsi)
-{
-	int i = 0;
-
-	/* Find the IOAPIC that manages this GSI. */
-	for (i = 0; i < nr_ioapics; i++) {
-		if ((gsi >= mp_ioapic_routing[i].gsi_base)
-		    && (gsi <= mp_ioapic_routing[i].gsi_end))
-			return i;
-	}
-
-	printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
-	return -1;
-}
-
-static u8 __init uniq_ioapic_id(u8 id)
-{
-#ifdef CONFIG_X86_32
-	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
-	    !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
-		return io_apic_get_unique_id(nr_ioapics, id);
-	else
-		return id;
-#else
-	int i;
-	DECLARE_BITMAP(used, 256);
-	bitmap_zero(used, 256);
-	for (i = 0; i < nr_ioapics; i++) {
-		struct mpc_config_ioapic *ia = &mp_ioapics[i];
-		__set_bit(ia->mpc_apicid, used);
-	}
-	if (!test_bit(id, used))
-		return id;
-	return find_first_zero_bit(used, 256);
-#endif
-}
-
-void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
-{
-	int idx = 0;
-
-	if (bad_ioapic(address))
-		return;
-
-	idx = nr_ioapics;
-
-	mp_ioapics[idx].mpc_type = MP_IOAPIC;
-	mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
-	mp_ioapics[idx].mpc_apicaddr = address;
-
-	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
-	mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
-#ifdef CONFIG_X86_32
-	mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
-#else
-	mp_ioapics[idx].mpc_apicver = 0;
-#endif
-	/*
-	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
-	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
-	 */
-	mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
-	mp_ioapic_routing[idx].gsi_base = gsi_base;
-	mp_ioapic_routing[idx].gsi_end = gsi_base +
-	    io_apic_get_redir_entries(idx);
-
-	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
-	       "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
-	       mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
-	       mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
-
-	nr_ioapics++;
-}
-
-void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
-{
-	struct mpc_config_intsrc intsrc;
-	int ioapic = -1;
-	int pin = -1;
-
-	/*
-	 * Convert 'gsi' to 'ioapic.pin'.
-	 */
-	ioapic = mp_find_ioapic(gsi);
-	if (ioapic < 0)
-		return;
-	pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
-
-	/*
-	 * TBD: This check is for faulty timer entries, where the override
-	 *      erroneously sets the trigger to level, resulting in a HUGE
-	 *      increase of timer interrupts!
-	 */
-	if ((bus_irq == 0) && (trigger == 3))
-		trigger = 1;
-
-	intsrc.mpc_type = MP_INTSRC;
-	intsrc.mpc_irqtype = mp_INT;
-	intsrc.mpc_irqflag = (trigger << 2) | polarity;
-	intsrc.mpc_srcbus = MP_ISA_BUS;
-	intsrc.mpc_srcbusirq = bus_irq;	/* IRQ */
-	intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;	/* APIC ID */
-	intsrc.mpc_dstirq = pin;	/* INTIN# */
-
-	MP_intsrc_info(&intsrc);
-}
-
-void __init mp_config_acpi_legacy_irqs(void)
-{
-	struct mpc_config_intsrc intsrc;
-	int i = 0;
-	int ioapic = -1;
-
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
-	/*
-	 * Fabricate the legacy ISA bus (bus #31).
-	 */
-	mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
-#endif
-	set_bit(MP_ISA_BUS, mp_bus_not_pci);
-	Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
-
-#if defined(CONFIG_X86_ES7000) || defined(CONFIG_X86_GENERICARCH)
-	/*
-	 * Older generations of ES7000 have no legacy identity mappings
-	 */
-	if (es7000_plat == 1)
-		return;
-#endif
-
-	/*
-	 * Locate the IOAPIC that manages the ISA IRQs (0-15).
-	 */
-	ioapic = mp_find_ioapic(0);
-	if (ioapic < 0)
-		return;
-
-	intsrc.mpc_type = MP_INTSRC;
-	intsrc.mpc_irqflag = 0;	/* Conforming */
-	intsrc.mpc_srcbus = MP_ISA_BUS;
-#ifdef CONFIG_X86_IO_APIC
-	intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
-#endif
-	/*
-	 * Use the default configuration for the IRQs 0-15.  Unless
-	 * overridden by (MADT) interrupt source override entries.
-	 */
-	for (i = 0; i < 16; i++) {
-		int idx;
-
-		for (idx = 0; idx < mp_irq_entries; idx++) {
-			struct mpc_config_intsrc *irq = mp_irqs + idx;
-
-			/* Do we already have a mapping for this ISA IRQ? */
-			if (irq->mpc_srcbus == MP_ISA_BUS
-			    && irq->mpc_srcbusirq == i)
-				break;
-
-			/* Do we already have a mapping for this IOAPIC pin */
-			if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
-			    (irq->mpc_dstirq == i))
-				break;
-		}
-
-		if (idx != mp_irq_entries) {
-			printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
-			continue;	/* IRQ already used */
-		}
-
-		intsrc.mpc_irqtype = mp_INT;
-		intsrc.mpc_srcbusirq = i;	/* Identity mapped */
-		intsrc.mpc_dstirq = i;
-
-		MP_intsrc_info(&intsrc);
-	}
-}
-
-int mp_register_gsi(u32 gsi, int triggering, int polarity)
-{
-	int ioapic;
-	int ioapic_pin;
-#ifdef CONFIG_X86_32
-#define MAX_GSI_NUM	4096
-#define IRQ_COMPRESSION_START	64
-
-	static int pci_irq = IRQ_COMPRESSION_START;
-	/*
-	 * Mapping between Global System Interrupts, which
-	 * represent all possible interrupts, and IRQs
-	 * assigned to actual devices.
-	 */
-	static int gsi_to_irq[MAX_GSI_NUM];
-#else
-
-	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
-		return gsi;
-#endif
-
-	/* Don't set up the ACPI SCI because it's already set up */
-	if (acpi_gbl_FADT.sci_interrupt == gsi)
-		return gsi;
-
-	ioapic = mp_find_ioapic(gsi);
-	if (ioapic < 0) {
-		printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
-		return gsi;
-	}
-
-	ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
-
-#ifdef CONFIG_X86_32
-	if (ioapic_renumber_irq)
-		gsi = ioapic_renumber_irq(ioapic, gsi);
-#endif
-
-	/*
-	 * Avoid pin reprogramming.  PRTs typically include entries
-	 * with redundant pin->gsi mappings (but unique PCI devices);
-	 * we only program the IOAPIC on the first.
-	 */
-	if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
-		printk(KERN_ERR "Invalid reference to IOAPIC pin "
-		       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
-		       ioapic_pin);
-		return gsi;
-	}
-	if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
-		Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
-			mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-#ifdef CONFIG_X86_32
-		return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
-#else
-		return gsi;
-#endif
-	}
-
-	set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
-#ifdef CONFIG_X86_32
-	/*
-	 * For GSI >= 64, use IRQ compression
-	 */
-	if ((gsi >= IRQ_COMPRESSION_START)
-	    && (triggering == ACPI_LEVEL_SENSITIVE)) {
-		/*
-		 * For PCI devices assign IRQs in order, avoiding gaps
-		 * due to unused I/O APIC pins.
-		 */
-		int irq = gsi;
-		if (gsi < MAX_GSI_NUM) {
-			/*
-			 * Retain the VIA chipset work-around (gsi > 15), but
-			 * avoid a problem where the 8254 timer (IRQ0) is setup
-			 * via an override (so it's not on pin 0 of the ioapic),
-			 * and at the same time, the pin 0 interrupt is a PCI
-			 * type.  The gsi > 15 test could cause these two pins
-			 * to be shared as IRQ0, and they are not shareable.
-			 * So test for this condition, and if necessary, avoid
-			 * the pin collision.
-			 */
-			gsi = pci_irq++;
-			/*
-			 * Don't assign IRQ used by ACPI SCI
-			 */
-			if (gsi == acpi_gbl_FADT.sci_interrupt)
-				gsi = pci_irq++;
-			gsi_to_irq[irq] = gsi;
-		} else {
-			printk(KERN_ERR "GSI %u is too high\n", gsi);
-			return gsi;
-		}
-	}
-#endif
-	io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
-				triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
-				polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-	return gsi;
-}
-
-#endif /* CONFIG_X86_IO_APIC */
-#endif /* CONFIG_ACPI */
-- 
cgit v1.2.3


From 5f8951487ddbacbc949e9ffae574f94791f9b4dd Mon Sep 17 00:00:00 2001
From: Alexey Starikovskiy <astarikovskiy@suse.de>
Date: Wed, 14 May 2008 19:03:04 +0400
Subject: x86: make mp_ioapic_routing definition local

Signed-off-by: Alexey Starikovskiy <astarikovskiy@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/boot.c | 7 ++++++-
 include/asm-x86/io_apic.h   | 7 -------
 2 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index b2ad09c4c6ae..f75c2030f4b5 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -852,7 +852,12 @@ static int __init acpi_parse_madt_lapic_entries(void)
 extern int es7000_plat;
 #endif
 
-static struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
+static struct {
+	int apic_id;
+	int gsi_base;
+	int gsi_end;
+	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} mp_ioapic_routing[MAX_IO_APICS];
 
 static int mp_find_ioapic(int gsi)
 {
diff --git a/include/asm-x86/io_apic.h b/include/asm-x86/io_apic.h
index d593e14f0341..2ef96f0596e2 100644
--- a/include/asm-x86/io_apic.h
+++ b/include/asm-x86/io_apic.h
@@ -112,13 +112,6 @@ extern int nr_ioapic_registers[MAX_IO_APICS];
 
 #define MP_MAX_IOAPIC_PIN 127
 
-struct mp_ioapic_routing {
-	int apic_id;
-	int gsi_base;
-	int gsi_end;
-	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
-};
-
 /* I/O APIC entries */
 extern struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
 
-- 
cgit v1.2.3


From ec2cd0a22e2715f776a934e01c4f8ea098324fe1 Mon Sep 17 00:00:00 2001
From: Alexey Starikovskiy <astarikovskiy@suse.de>
Date: Wed, 14 May 2008 19:03:10 +0400
Subject: x86: make struct config_ioapic not MPspec specific

Signed-off-by: Alexey Starikovskiy <astarikovskiy@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/boot.c  | 28 ++++++++++++------------
 arch/x86/kernel/apic_32.c    |  2 +-
 arch/x86/kernel/io_apic_32.c | 52 ++++++++++++++++++++++----------------------
 arch/x86/kernel/io_apic_64.c | 26 +++++++++++-----------
 arch/x86/kernel/mpparse.c    |  8 +++++--
 include/asm-x86/io_apic.h    | 10 ++++++++-
 6 files changed, 69 insertions(+), 57 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index f75c2030f4b5..a26dd91faf0f 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -887,8 +887,8 @@ static u8 __init uniq_ioapic_id(u8 id)
 	DECLARE_BITMAP(used, 256);
 	bitmap_zero(used, 256);
 	for (i = 0; i < nr_ioapics; i++) {
-		struct mpc_config_ioapic *ia = &mp_ioapics[i];
-		__set_bit(ia->mpc_apicid, used);
+		struct mp_config_ioapic *ia = &mp_ioapics[i];
+		__set_bit(ia->mp_apicid, used);
 	}
 	if (!test_bit(id, used))
 		return id;
@@ -920,29 +920,29 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 
 	idx = nr_ioapics;
 
-	mp_ioapics[idx].mpc_type = MP_IOAPIC;
-	mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
-	mp_ioapics[idx].mpc_apicaddr = address;
+	mp_ioapics[idx].mp_type = MP_IOAPIC;
+	mp_ioapics[idx].mp_flags = MPC_APIC_USABLE;
+	mp_ioapics[idx].mp_apicaddr = address;
 
 	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
-	mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
+	mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id);
 #ifdef CONFIG_X86_32
-	mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
+	mp_ioapics[idx].mp_apicver = io_apic_get_version(idx);
 #else
-	mp_ioapics[idx].mpc_apicver = 0;
+	mp_ioapics[idx].mp_apicver = 0;
 #endif
 	/*
 	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
 	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
 	 */
-	mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
+	mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mp_apicid;
 	mp_ioapic_routing[idx].gsi_base = gsi_base;
 	mp_ioapic_routing[idx].gsi_end = gsi_base +
 	    io_apic_get_redir_entries(idx);
 
-	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
-	       "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
-	       mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
+	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
+	       "GSI %d-%d\n", idx, mp_ioapics[idx].mp_apicid,
+	       mp_ioapics[idx].mp_apicver, mp_ioapics[idx].mp_apicaddr,
 	       mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
 
 	nr_ioapics++;
@@ -975,7 +975,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 	mp_irqs[mp_irq_entries].mpc_srcbus = MP_ISA_BUS;
 	mp_irqs[mp_irq_entries].mpc_srcbusirq = bus_irq;	/* IRQ */
 	mp_irqs[mp_irq_entries].mpc_dstapic =
-			mp_ioapics[ioapic].mpc_apicid;	/* APIC ID */
+			mp_ioapics[ioapic].mp_apicid;	/* APIC ID */
 	mp_irqs[mp_irq_entries].mpc_dstirq = pin;	/* INTIN# */
 
 	if (++mp_irq_entries == MAX_IRQ_SOURCES)
@@ -1016,7 +1016,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 	mp_irqs[mp_irq_entries].mpc_irqflag = 0;	/* Conforming */
 	mp_irqs[mp_irq_entries].mpc_srcbus = MP_ISA_BUS;
 #ifdef CONFIG_X86_IO_APIC
-	mp_irqs[mp_irq_entries].mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
+	mp_irqs[mp_irq_entries].mpc_dstapic = mp_ioapics[ioapic].mp_apicid;
 #endif
 	/*
 	 * Use the default configuration for the IRQs 0-15.  Unless
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 4b99b1bdeb6c..fa1be1d62916 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -1202,7 +1202,7 @@ void __init init_apic_mappings(void)
 
 		for (i = 0; i < nr_ioapics; i++) {
 			if (smp_found_config) {
-				ioapic_phys = mp_ioapics[i].mpc_apicaddr;
+				ioapic_phys = mp_ioapics[i].mp_apicaddr;
 				if (!ioapic_phys) {
 					printk(KERN_ERR
 					       "WARNING: bogus zero IO-APIC "
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index a40d54fc1fdd..5af1b717236c 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -72,7 +72,7 @@ int sis_apic_bug = -1;
 int nr_ioapic_registers[MAX_IO_APICS];
 
 /* I/O APIC entries */
-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
 int nr_ioapics;
 
 /* MP IRQ source entries */
@@ -110,7 +110,7 @@ struct io_apic {
 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
 {
 	return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
-		+ (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+		+ (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
 }
 
 static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@ -802,7 +802,7 @@ static int find_irq_entry(int apic, int pin, int type)
 
 	for (i = 0; i < mp_irq_entries; i++)
 		if (mp_irqs[i].mpc_irqtype == type &&
-		    (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
+		    (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mp_apicid ||
 		     mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
 		    mp_irqs[i].mpc_dstirq == pin)
 			return i;
@@ -844,7 +844,7 @@ static int __init find_isa_irq_apic(int irq, int type)
 	if (i < mp_irq_entries) {
 		int apic;
 		for(apic = 0; apic < nr_ioapics; apic++) {
-			if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic)
 				return apic;
 		}
 	}
@@ -872,7 +872,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
 		int lbus = mp_irqs[i].mpc_srcbus;
 
 		for (apic = 0; apic < nr_ioapics; apic++)
-			if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
+			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic ||
 			    mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
 				break;
 
@@ -1250,12 +1250,12 @@ static void __init setup_IO_APIC_irqs(void)
 			if (first_notcon) {
 				apic_printk(APIC_VERBOSE, KERN_DEBUG
 						" IO-APIC (apicid-pin) %d-%d",
-						mp_ioapics[apic].mpc_apicid,
+						mp_ioapics[apic].mp_apicid,
 						pin);
 				first_notcon = 0;
 			} else
 				apic_printk(APIC_VERBOSE, ", %d-%d",
-					mp_ioapics[apic].mpc_apicid, pin);
+					mp_ioapics[apic].mp_apicid, pin);
 			continue;
 		}
 
@@ -1357,7 +1357,7 @@ void __init print_IO_APIC(void)
  	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
 	for (i = 0; i < nr_ioapics; i++)
 		printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
-		       mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+		       mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
 
 	/*
 	 * We are a bit conservative about what we expect.  We have to
@@ -1376,7 +1376,7 @@ void __init print_IO_APIC(void)
 		reg_03.raw = io_apic_read(apic, 3);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
-	printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+	printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
 	printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
 	printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
 	printk(KERN_DEBUG ".......    : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -1749,14 +1749,14 @@ static void __init setup_ioapic_ids_from_mpc(void)
 		reg_00.raw = io_apic_read(apic, 0);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 		
-		old_id = mp_ioapics[apic].mpc_apicid;
+		old_id = mp_ioapics[apic].mp_apicid;
 
-		if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
+		if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
 			printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
-				apic, mp_ioapics[apic].mpc_apicid);
+				apic, mp_ioapics[apic].mp_apicid);
 			printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
 				reg_00.bits.ID);
-			mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
+			mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
 		}
 
 		/*
@@ -1765,9 +1765,9 @@ static void __init setup_ioapic_ids_from_mpc(void)
 		 * 'stuck on smp_invalidate_needed IPI wait' messages.
 		 */
 		if (check_apicid_used(phys_id_present_map,
-					mp_ioapics[apic].mpc_apicid)) {
+					mp_ioapics[apic].mp_apicid)) {
 			printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
-				apic, mp_ioapics[apic].mpc_apicid);
+				apic, mp_ioapics[apic].mp_apicid);
 			for (i = 0; i < get_physical_broadcast(); i++)
 				if (!physid_isset(i, phys_id_present_map))
 					break;
@@ -1776,13 +1776,13 @@ static void __init setup_ioapic_ids_from_mpc(void)
 			printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
 				i);
 			physid_set(i, phys_id_present_map);
-			mp_ioapics[apic].mpc_apicid = i;
+			mp_ioapics[apic].mp_apicid = i;
 		} else {
 			physid_mask_t tmp;
-			tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
+			tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
 			apic_printk(APIC_VERBOSE, "Setting %d in the "
 					"phys_id_present_map\n",
-					mp_ioapics[apic].mpc_apicid);
+					mp_ioapics[apic].mp_apicid);
 			physids_or(phys_id_present_map, phys_id_present_map, tmp);
 		}
 
@@ -1791,11 +1791,11 @@ static void __init setup_ioapic_ids_from_mpc(void)
 		 * We need to adjust the IRQ routing table
 		 * if the ID changed.
 		 */
-		if (old_id != mp_ioapics[apic].mpc_apicid)
+		if (old_id != mp_ioapics[apic].mp_apicid)
 			for (i = 0; i < mp_irq_entries; i++)
 				if (mp_irqs[i].mpc_dstapic == old_id)
 					mp_irqs[i].mpc_dstapic
-						= mp_ioapics[apic].mpc_apicid;
+						= mp_ioapics[apic].mp_apicid;
 
 		/*
 		 * Read the right value from the MPC table and
@@ -1803,9 +1803,9 @@ static void __init setup_ioapic_ids_from_mpc(void)
 	 	 */
 		apic_printk(APIC_VERBOSE, KERN_INFO
 			"...changing IO-APIC physical APIC ID to %d ...",
-			mp_ioapics[apic].mpc_apicid);
+			mp_ioapics[apic].mp_apicid);
 
-		reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
+		reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
 		spin_lock_irqsave(&ioapic_lock, flags);
 		io_apic_write(apic, 0, reg_00.raw);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -1816,7 +1816,7 @@ static void __init setup_ioapic_ids_from_mpc(void)
 		spin_lock_irqsave(&ioapic_lock, flags);
 		reg_00.raw = io_apic_read(apic, 0);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
-		if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
+		if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
 			printk("could not set ID!\n");
 		else
 			apic_printk(APIC_VERBOSE, " ok.\n");
@@ -2355,8 +2355,8 @@ static int ioapic_resume(struct sys_device *dev)
 
 	spin_lock_irqsave(&ioapic_lock, flags);
 	reg_00.raw = io_apic_read(dev->id, 0);
-	if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
-		reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+	if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
+		reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
 		io_apic_write(dev->id, 0, reg_00.raw);
 	}
 	spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2789,7 +2789,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
 
 	apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
 		"(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
-		mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
+		mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
 		edge_level, active_high_low);
 
 	ioapic_register_intr(irq, entry.vector, edge_level);
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index ef1a8dfcc529..4555ad8c2070 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -104,7 +104,7 @@ DEFINE_SPINLOCK(vector_lock);
 int nr_ioapic_registers[MAX_IO_APICS];
 
 /* I/O APIC entries */
-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
 int nr_ioapics;
 
 /* MP IRQ source entries */
@@ -140,7 +140,7 @@ struct io_apic {
 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
 {
 	return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
-		+ (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+		+ (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
 }
 
 static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@ -454,7 +454,7 @@ static int find_irq_entry(int apic, int pin, int type)
 
 	for (i = 0; i < mp_irq_entries; i++)
 		if (mp_irqs[i].mpc_irqtype == type &&
-		    (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
+		    (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mp_apicid ||
 		     mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
 		    mp_irqs[i].mpc_dstirq == pin)
 			return i;
@@ -496,7 +496,7 @@ static int __init find_isa_irq_apic(int irq, int type)
 	if (i < mp_irq_entries) {
 		int apic;
 		for(apic = 0; apic < nr_ioapics; apic++) {
-			if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic)
 				return apic;
 		}
 	}
@@ -524,7 +524,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
 		int lbus = mp_irqs[i].mpc_srcbus;
 
 		for (apic = 0; apic < nr_ioapics; apic++)
-			if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
+			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic ||
 			    mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
 				break;
 
@@ -846,7 +846,7 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
 	apic_printk(APIC_VERBOSE,KERN_DEBUG
 		    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
 		    "IRQ %d Mode:%i Active:%i)\n",
-		    apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
+		    apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
 		    irq, trigger, polarity);
 
 	/*
@@ -887,10 +887,10 @@ static void __init setup_IO_APIC_irqs(void)
 		idx = find_irq_entry(apic,pin,mp_INT);
 		if (idx == -1) {
 			if (first_notcon) {
-				apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+				apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
 				first_notcon = 0;
 			} else
-				apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+				apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
 			continue;
 		}
 		if (!first_notcon) {
@@ -965,7 +965,7 @@ void __apicdebuginit print_IO_APIC(void)
 	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
 	for (i = 0; i < nr_ioapics; i++)
 		printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
-		       mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+		       mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
 
 	/*
 	 * We are a bit conservative about what we expect.  We have to
@@ -983,7 +983,7 @@ void __apicdebuginit print_IO_APIC(void)
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	printk("\n");
-	printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+	printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
 	printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
 	printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
 
@@ -1841,8 +1841,8 @@ static int ioapic_resume(struct sys_device *dev)
 
 	spin_lock_irqsave(&ioapic_lock, flags);
 	reg_00.raw = io_apic_read(dev->id, 0);
-	if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
-		reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+	if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
+		reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
 		io_apic_write(dev->id, 0, reg_00.raw);
 	}
 	spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2336,7 +2336,7 @@ void __init ioapic_init_mappings(void)
 	ioapic_res = ioapic_setup_resources();
 	for (i = 0; i < nr_ioapics; i++) {
 		if (smp_found_config) {
-			ioapic_phys = mp_ioapics[i].mpc_apicaddr;
+			ioapic_phys = mp_ioapics[i].mp_apicaddr;
 		} else {
 			ioapic_phys = (unsigned long)
 				alloc_bootmem_pages(PAGE_SIZE);
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index d05b70c329c4..9f1e5bf7f0fb 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -176,7 +176,11 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
 	if (bad_ioapic(m->mpc_apicaddr))
 		return;
 
-	mp_ioapics[nr_ioapics] = *m;
+	mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr;
+	mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid;
+	mp_ioapics[nr_ioapics].mp_type = m->mpc_type;
+	mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver;
+	mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags;
 	nr_ioapics++;
 }
 
@@ -426,7 +430,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
 	intsrc.mpc_type = MP_INTSRC;
 	intsrc.mpc_irqflag = 0;	/* conforming */
 	intsrc.mpc_srcbus = 0;
-	intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
+	intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid;
 
 	intsrc.mpc_irqtype = mp_INT;
 
diff --git a/include/asm-x86/io_apic.h b/include/asm-x86/io_apic.h
index 2ef96f0596e2..ade76c0d03ae 100644
--- a/include/asm-x86/io_apic.h
+++ b/include/asm-x86/io_apic.h
@@ -112,8 +112,16 @@ extern int nr_ioapic_registers[MAX_IO_APICS];
 
 #define MP_MAX_IOAPIC_PIN 127
 
+struct mp_config_ioapic {
+	unsigned long mp_apicaddr;
+	unsigned int mp_apicid;
+	unsigned char mp_type;
+	unsigned char mp_apicver;
+	unsigned char mp_flags;
+};
+
 /* I/O APIC entries */
-extern struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+extern struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
 
 /* # of MP IRQ source entries */
 extern int mp_irq_entries;
-- 
cgit v1.2.3


From 2fddb6e28e903a3ab1704cc5aac01be5a59dc05b Mon Sep 17 00:00:00 2001
From: Alexey Starikovskiy <astarikovskiy@suse.de>
Date: Wed, 14 May 2008 19:03:17 +0400
Subject: x86: make config_irqsrc not MPspec specific

Signed-off-by: Alexey Starikovskiy <astarikovskiy@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/boot.c  | 43 +++++++++++++++--------------
 arch/x86/kernel/io_apic_32.c | 64 ++++++++++++++++++++++----------------------
 arch/x86/kernel/io_apic_64.c | 58 +++++++++++++++++++--------------------
 arch/x86/kernel/mpparse.c    |  8 +++++-
 include/asm-x86/io_apic.h    | 12 ++++++++-
 5 files changed, 100 insertions(+), 85 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index a26dd91faf0f..276ec058f683 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -969,14 +969,14 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 	if ((bus_irq == 0) && (trigger == 3))
 		trigger = 1;
 
-	mp_irqs[mp_irq_entries].mpc_type = MP_INTSRC;
-	mp_irqs[mp_irq_entries].mpc_irqtype = mp_INT;
-	mp_irqs[mp_irq_entries].mpc_irqflag = (trigger << 2) | polarity;
-	mp_irqs[mp_irq_entries].mpc_srcbus = MP_ISA_BUS;
-	mp_irqs[mp_irq_entries].mpc_srcbusirq = bus_irq;	/* IRQ */
-	mp_irqs[mp_irq_entries].mpc_dstapic =
+	mp_irqs[mp_irq_entries].mp_type = MP_INTSRC;
+	mp_irqs[mp_irq_entries].mp_irqtype = mp_INT;
+	mp_irqs[mp_irq_entries].mp_irqflag = (trigger << 2) | polarity;
+	mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS;
+	mp_irqs[mp_irq_entries].mp_srcbusirq = bus_irq;	/* IRQ */
+	mp_irqs[mp_irq_entries].mp_dstapic =
 			mp_ioapics[ioapic].mp_apicid;	/* APIC ID */
-	mp_irqs[mp_irq_entries].mpc_dstirq = pin;	/* INTIN# */
+	mp_irqs[mp_irq_entries].mp_dstirq = pin;	/* INTIN# */
 
 	if (++mp_irq_entries == MAX_IRQ_SOURCES)
 		panic("Max # of irq sources exceeded!!\n");
@@ -1012,12 +1012,11 @@ void __init mp_config_acpi_legacy_irqs(void)
 	if (ioapic < 0)
 		return;
 
-	mp_irqs[mp_irq_entries].mpc_type = MP_INTSRC;
-	mp_irqs[mp_irq_entries].mpc_irqflag = 0;	/* Conforming */
-	mp_irqs[mp_irq_entries].mpc_srcbus = MP_ISA_BUS;
-#ifdef CONFIG_X86_IO_APIC
-	mp_irqs[mp_irq_entries].mpc_dstapic = mp_ioapics[ioapic].mp_apicid;
-#endif
+	mp_irqs[mp_irq_entries].mp_type = MP_INTSRC;
+	mp_irqs[mp_irq_entries].mp_irqflag = 0;	/* Conforming */
+	mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS;
+	mp_irqs[mp_irq_entries].mp_dstapic = mp_ioapics[ioapic].mp_apicid;
+
 	/*
 	 * Use the default configuration for the IRQs 0-15.  Unless
 	 * overridden by (MADT) interrupt source override entries.
@@ -1026,17 +1025,17 @@ void __init mp_config_acpi_legacy_irqs(void)
 		int idx;
 
 		for (idx = 0; idx < mp_irq_entries; idx++) {
-			struct mpc_config_intsrc *irq = mp_irqs + idx;
+			struct mp_config_intsrc *irq = mp_irqs + idx;
 
 			/* Do we already have a mapping for this ISA IRQ? */
-			if (irq->mpc_srcbus == MP_ISA_BUS
-			    && irq->mpc_srcbusirq == i)
+			if (irq->mp_srcbus == MP_ISA_BUS
+			    && irq->mp_srcbusirq == i)
 				break;
 
 			/* Do we already have a mapping for this IOAPIC pin */
-			if ((irq->mpc_dstapic ==
-				mp_irqs[mp_irq_entries].mpc_dstapic) &&
-			    (irq->mpc_dstirq == i))
+			if ((irq->mp_dstapic ==
+				mp_irqs[mp_irq_entries].mp_dstapic) &&
+			    (irq->mp_dstirq == i))
 				break;
 		}
 
@@ -1045,9 +1044,9 @@ void __init mp_config_acpi_legacy_irqs(void)
 			continue;	/* IRQ already used */
 		}
 
-		mp_irqs[mp_irq_entries].mpc_irqtype = mp_INT;
-		mp_irqs[mp_irq_entries].mpc_srcbusirq = i;	/* Identity mapped */
-		mp_irqs[mp_irq_entries].mpc_dstirq = i;
+		mp_irqs[mp_irq_entries].mp_irqtype = mp_INT;
+		mp_irqs[mp_irq_entries].mp_srcbusirq = i;	/* Identity mapped */
+		mp_irqs[mp_irq_entries].mp_dstirq = i;
 
 		if (++mp_irq_entries == MAX_IRQ_SOURCES)
 			panic("Max # of irq sources exceeded!!\n");
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 5af1b717236c..ea68c3e5ba1d 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -76,7 +76,7 @@ struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
 int nr_ioapics;
 
 /* MP IRQ source entries */
-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
 
 /* # of MP IRQ source entries */
 int mp_irq_entries;
@@ -801,10 +801,10 @@ static int find_irq_entry(int apic, int pin, int type)
 	int i;
 
 	for (i = 0; i < mp_irq_entries; i++)
-		if (mp_irqs[i].mpc_irqtype == type &&
-		    (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mp_apicid ||
-		     mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
-		    mp_irqs[i].mpc_dstirq == pin)
+		if (mp_irqs[i].mp_irqtype == type &&
+		    (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
+		     mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
+		    mp_irqs[i].mp_dstirq == pin)
 			return i;
 
 	return -1;
@@ -818,13 +818,13 @@ static int __init find_isa_irq_pin(int irq, int type)
 	int i;
 
 	for (i = 0; i < mp_irq_entries; i++) {
-		int lbus = mp_irqs[i].mpc_srcbus;
+		int lbus = mp_irqs[i].mp_srcbus;
 
 		if (test_bit(lbus, mp_bus_not_pci) &&
-		    (mp_irqs[i].mpc_irqtype == type) &&
-		    (mp_irqs[i].mpc_srcbusirq == irq))
+		    (mp_irqs[i].mp_irqtype == type) &&
+		    (mp_irqs[i].mp_srcbusirq == irq))
 
-			return mp_irqs[i].mpc_dstirq;
+			return mp_irqs[i].mp_dstirq;
 	}
 	return -1;
 }
@@ -834,17 +834,17 @@ static int __init find_isa_irq_apic(int irq, int type)
 	int i;
 
 	for (i = 0; i < mp_irq_entries; i++) {
-		int lbus = mp_irqs[i].mpc_srcbus;
+		int lbus = mp_irqs[i].mp_srcbus;
 
 		if (test_bit(lbus, mp_bus_not_pci) &&
-		    (mp_irqs[i].mpc_irqtype == type) &&
-		    (mp_irqs[i].mpc_srcbusirq == irq))
+		    (mp_irqs[i].mp_irqtype == type) &&
+		    (mp_irqs[i].mp_srcbusirq == irq))
 			break;
 	}
 	if (i < mp_irq_entries) {
 		int apic;
 		for(apic = 0; apic < nr_ioapics; apic++) {
-			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic)
+			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
 				return apic;
 		}
 	}
@@ -869,23 +869,23 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
 		return -1;
 	}
 	for (i = 0; i < mp_irq_entries; i++) {
-		int lbus = mp_irqs[i].mpc_srcbus;
+		int lbus = mp_irqs[i].mp_srcbus;
 
 		for (apic = 0; apic < nr_ioapics; apic++)
-			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic ||
-			    mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
+			    mp_irqs[i].mp_dstapic == MP_APIC_ALL)
 				break;
 
 		if (!test_bit(lbus, mp_bus_not_pci) &&
-		    !mp_irqs[i].mpc_irqtype &&
+		    !mp_irqs[i].mp_irqtype &&
 		    (bus == lbus) &&
-		    (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
-			int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+		    (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
+			int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
 
 			if (!(apic || IO_APIC_IRQ(irq)))
 				continue;
 
-			if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+			if (pin == (mp_irqs[i].mp_srcbusirq & 3))
 				return irq;
 			/*
 			 * Use the first all-but-pin matching entry as a
@@ -952,7 +952,7 @@ static int EISA_ELCR(unsigned int irq)
  * EISA conforming in the MP table, that means its trigger type must
  * be read in from the ELCR */
 
-#define default_EISA_trigger(idx)	(EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
+#define default_EISA_trigger(idx)	(EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
 #define default_EISA_polarity(idx)	default_ISA_polarity(idx)
 
 /* PCI interrupts are always polarity one level triggered,
@@ -969,13 +969,13 @@ static int EISA_ELCR(unsigned int irq)
 
 static int MPBIOS_polarity(int idx)
 {
-	int bus = mp_irqs[idx].mpc_srcbus;
+	int bus = mp_irqs[idx].mp_srcbus;
 	int polarity;
 
 	/*
 	 * Determine IRQ line polarity (high active or low active):
 	 */
-	switch (mp_irqs[idx].mpc_irqflag & 3)
+	switch (mp_irqs[idx].mp_irqflag & 3)
 	{
 		case 0: /* conforms, ie. bus-type dependent polarity */
 		{
@@ -1012,13 +1012,13 @@ static int MPBIOS_polarity(int idx)
 
 static int MPBIOS_trigger(int idx)
 {
-	int bus = mp_irqs[idx].mpc_srcbus;
+	int bus = mp_irqs[idx].mp_srcbus;
 	int trigger;
 
 	/*
 	 * Determine IRQ trigger mode (edge or level sensitive):
 	 */
-	switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+	switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
 	{
 		case 0: /* conforms, ie. bus-type dependent */
 		{
@@ -1097,16 +1097,16 @@ static inline int irq_trigger(int idx)
 static int pin_2_irq(int idx, int apic, int pin)
 {
 	int irq, i;
-	int bus = mp_irqs[idx].mpc_srcbus;
+	int bus = mp_irqs[idx].mp_srcbus;
 
 	/*
 	 * Debugging check, we are in big trouble if this message pops up!
 	 */
-	if (mp_irqs[idx].mpc_dstirq != pin)
+	if (mp_irqs[idx].mp_dstirq != pin)
 		printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
 
 	if (test_bit(bus, mp_bus_not_pci))
-		irq = mp_irqs[idx].mpc_srcbusirq;
+		irq = mp_irqs[idx].mp_srcbusirq;
 	else {
 		/*
 		 * PCI IRQs are mapped in order
@@ -1793,8 +1793,8 @@ static void __init setup_ioapic_ids_from_mpc(void)
 		 */
 		if (old_id != mp_ioapics[apic].mp_apicid)
 			for (i = 0; i < mp_irq_entries; i++)
-				if (mp_irqs[i].mpc_dstapic == old_id)
-					mp_irqs[i].mpc_dstapic
+				if (mp_irqs[i].mp_dstapic == old_id)
+					mp_irqs[i].mp_dstapic
 						= mp_ioapics[apic].mp_apicid;
 
 		/*
@@ -2810,8 +2810,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
 		return -1;
 
 	for (i = 0; i < mp_irq_entries; i++)
-		if (mp_irqs[i].mpc_irqtype == mp_INT &&
-		    mp_irqs[i].mpc_srcbusirq == bus_irq)
+		if (mp_irqs[i].mp_irqtype == mp_INT &&
+		    mp_irqs[i].mp_srcbusirq == bus_irq)
 			break;
 	if (i >= mp_irq_entries)
 		return -1;
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 4555ad8c2070..e7f1476ed537 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -108,7 +108,7 @@ struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
 int nr_ioapics;
 
 /* MP IRQ source entries */
-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
 
 /* # of MP IRQ source entries */
 int mp_irq_entries;
@@ -453,10 +453,10 @@ static int find_irq_entry(int apic, int pin, int type)
 	int i;
 
 	for (i = 0; i < mp_irq_entries; i++)
-		if (mp_irqs[i].mpc_irqtype == type &&
-		    (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mp_apicid ||
-		     mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
-		    mp_irqs[i].mpc_dstirq == pin)
+		if (mp_irqs[i].mp_irqtype == type &&
+		    (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
+		     mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
+		    mp_irqs[i].mp_dstirq == pin)
 			return i;
 
 	return -1;
@@ -470,13 +470,13 @@ static int __init find_isa_irq_pin(int irq, int type)
 	int i;
 
 	for (i = 0; i < mp_irq_entries; i++) {
-		int lbus = mp_irqs[i].mpc_srcbus;
+		int lbus = mp_irqs[i].mp_srcbus;
 
 		if (test_bit(lbus, mp_bus_not_pci) &&
-		    (mp_irqs[i].mpc_irqtype == type) &&
-		    (mp_irqs[i].mpc_srcbusirq == irq))
+		    (mp_irqs[i].mp_irqtype == type) &&
+		    (mp_irqs[i].mp_srcbusirq == irq))
 
-			return mp_irqs[i].mpc_dstirq;
+			return mp_irqs[i].mp_dstirq;
 	}
 	return -1;
 }
@@ -486,17 +486,17 @@ static int __init find_isa_irq_apic(int irq, int type)
 	int i;
 
 	for (i = 0; i < mp_irq_entries; i++) {
-		int lbus = mp_irqs[i].mpc_srcbus;
+		int lbus = mp_irqs[i].mp_srcbus;
 
 		if (test_bit(lbus, mp_bus_not_pci) &&
-		    (mp_irqs[i].mpc_irqtype == type) &&
-		    (mp_irqs[i].mpc_srcbusirq == irq))
+		    (mp_irqs[i].mp_irqtype == type) &&
+		    (mp_irqs[i].mp_srcbusirq == irq))
 			break;
 	}
 	if (i < mp_irq_entries) {
 		int apic;
 		for(apic = 0; apic < nr_ioapics; apic++) {
-			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic)
+			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
 				return apic;
 		}
 	}
@@ -521,23 +521,23 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
 		return -1;
 	}
 	for (i = 0; i < mp_irq_entries; i++) {
-		int lbus = mp_irqs[i].mpc_srcbus;
+		int lbus = mp_irqs[i].mp_srcbus;
 
 		for (apic = 0; apic < nr_ioapics; apic++)
-			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic ||
-			    mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
+			    mp_irqs[i].mp_dstapic == MP_APIC_ALL)
 				break;
 
 		if (!test_bit(lbus, mp_bus_not_pci) &&
-		    !mp_irqs[i].mpc_irqtype &&
+		    !mp_irqs[i].mp_irqtype &&
 		    (bus == lbus) &&
-		    (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
-			int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+		    (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
+			int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
 
 			if (!(apic || IO_APIC_IRQ(irq)))
 				continue;
 
-			if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+			if (pin == (mp_irqs[i].mp_srcbusirq & 3))
 				return irq;
 			/*
 			 * Use the first all-but-pin matching entry as a
@@ -565,13 +565,13 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
 
 static int MPBIOS_polarity(int idx)
 {
-	int bus = mp_irqs[idx].mpc_srcbus;
+	int bus = mp_irqs[idx].mp_srcbus;
 	int polarity;
 
 	/*
 	 * Determine IRQ line polarity (high active or low active):
 	 */
-	switch (mp_irqs[idx].mpc_irqflag & 3)
+	switch (mp_irqs[idx].mp_irqflag & 3)
 	{
 		case 0: /* conforms, ie. bus-type dependent polarity */
 			if (test_bit(bus, mp_bus_not_pci))
@@ -607,13 +607,13 @@ static int MPBIOS_polarity(int idx)
 
 static int MPBIOS_trigger(int idx)
 {
-	int bus = mp_irqs[idx].mpc_srcbus;
+	int bus = mp_irqs[idx].mp_srcbus;
 	int trigger;
 
 	/*
 	 * Determine IRQ trigger mode (edge or level sensitive):
 	 */
-	switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+	switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
 	{
 		case 0: /* conforms, ie. bus-type dependent */
 			if (test_bit(bus, mp_bus_not_pci))
@@ -660,16 +660,16 @@ static inline int irq_trigger(int idx)
 static int pin_2_irq(int idx, int apic, int pin)
 {
 	int irq, i;
-	int bus = mp_irqs[idx].mpc_srcbus;
+	int bus = mp_irqs[idx].mp_srcbus;
 
 	/*
 	 * Debugging check, we are in big trouble if this message pops up!
 	 */
-	if (mp_irqs[idx].mpc_dstirq != pin)
+	if (mp_irqs[idx].mp_dstirq != pin)
 		printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
 
 	if (test_bit(bus, mp_bus_not_pci)) {
-		irq = mp_irqs[idx].mpc_srcbusirq;
+		irq = mp_irqs[idx].mp_srcbusirq;
 	} else {
 		/*
 		 * PCI IRQs are mapped in order
@@ -2242,8 +2242,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
 		return -1;
 
 	for (i = 0; i < mp_irq_entries; i++)
-		if (mp_irqs[i].mpc_irqtype == mp_INT &&
-		    mp_irqs[i].mpc_srcbusirq == bus_irq)
+		if (mp_irqs[i].mp_irqtype == mp_INT &&
+		    mp_irqs[i].mp_srcbusirq == bus_irq)
 			break;
 	if (i >= mp_irq_entries)
 		return -1;
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 9f1e5bf7f0fb..59f051db236d 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -186,12 +186,18 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
 
 static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
 {
-	mp_irqs[mp_irq_entries] = *m;
 	printk(KERN_INFO "Int: type %d, pol %d, trig %d, bus %02x,"
 		" IRQ %02x, APIC ID %x, APIC INT %02x\n",
 		m->mpc_irqtype, m->mpc_irqflag & 3,
 		(m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
 		m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
+	mp_irqs[mp_irq_entries].mp_dstapic = m->mpc_dstapic;
+	mp_irqs[mp_irq_entries].mp_type = m->mpc_type;
+	mp_irqs[mp_irq_entries].mp_irqtype = m->mpc_irqtype;
+	mp_irqs[mp_irq_entries].mp_irqflag = m->mpc_irqflag;
+	mp_irqs[mp_irq_entries].mp_srcbus = m->mpc_srcbus;
+	mp_irqs[mp_irq_entries].mp_srcbusirq = m->mpc_srcbusirq;
+	mp_irqs[mp_irq_entries].mp_dstirq = m->mpc_dstirq;
 	if (++mp_irq_entries == MAX_IRQ_SOURCES)
 		panic("Max # of irq sources exceeded!!\n");
 }
diff --git a/include/asm-x86/io_apic.h b/include/asm-x86/io_apic.h
index ade76c0d03ae..86d8c3bdcca4 100644
--- a/include/asm-x86/io_apic.h
+++ b/include/asm-x86/io_apic.h
@@ -120,6 +120,16 @@ struct mp_config_ioapic {
 	unsigned char mp_flags;
 };
 
+struct mp_config_intsrc {
+	unsigned int mp_dstapic;
+	unsigned char mp_type;
+	unsigned char mp_irqtype;
+	unsigned short mp_irqflag;
+	unsigned char mp_srcbus;
+	unsigned char mp_srcbusirq;
+	unsigned char mp_dstirq;
+};
+
 /* I/O APIC entries */
 extern struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
 
@@ -127,7 +137,7 @@ extern struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
 extern int mp_irq_entries;
 
 /* MP IRQ source entries */
-extern struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+extern struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
 
 /* non-0 if default (table-less) MP configuration */
 extern int mpc_default_type;
-- 
cgit v1.2.3


From 59f4519ad7ade61123e90085b0dadb2ea197bd87 Mon Sep 17 00:00:00 2001
From: Soeren Sandmann <sandmann@daimi.au.dk>
Date: Sun, 18 May 2008 05:24:41 +0200
Subject: x86: initialize all fields of mp_irqs[mp_irq_entries]

Commit "x86: make config_irqsrc not MPspec specific" introduced some uses
of uninitialized fields in mp_config_acpi_legacy_irqs(). I need the
following patch to get sched-devel/master to boot.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/acpi/boot.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 276ec058f683..6170c6aeaf79 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1012,11 +1012,6 @@ void __init mp_config_acpi_legacy_irqs(void)
 	if (ioapic < 0)
 		return;
 
-	mp_irqs[mp_irq_entries].mp_type = MP_INTSRC;
-	mp_irqs[mp_irq_entries].mp_irqflag = 0;	/* Conforming */
-	mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS;
-	mp_irqs[mp_irq_entries].mp_dstapic = mp_ioapics[ioapic].mp_apicid;
-
 	/*
 	 * Use the default configuration for the IRQs 0-15.  Unless
 	 * overridden by (MADT) interrupt source override entries.
@@ -1044,6 +1039,10 @@ void __init mp_config_acpi_legacy_irqs(void)
 			continue;	/* IRQ already used */
 		}
 
+		mp_irqs[mp_irq_entries].mp_type = MP_INTSRC;
+		mp_irqs[mp_irq_entries].mp_irqflag = 0;	/* Conforming */
+		mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS;
+		mp_irqs[mp_irq_entries].mp_dstapic = mp_ioapics[ioapic].mp_apicid;
 		mp_irqs[mp_irq_entries].mp_irqtype = mp_INT;
 		mp_irqs[mp_irq_entries].mp_srcbusirq = i;	/* Identity mapped */
 		mp_irqs[mp_irq_entries].mp_dstirq = i;
-- 
cgit v1.2.3


From aafbdf71f1d3aeffd679b1a86e1b28f71515856c Mon Sep 17 00:00:00 2001
From: Alexey Starikovskiy <astarikovskiy@suse.de>
Date: Thu, 22 May 2008 12:26:15 +0400
Subject: x86: fix mpparse/acpi interaction

Sitsofe Wheeler reported boot problems on linux-next.

It looks like the same issue as found by Soeren Sandman in 7575217f656a93,
"x86: initialize all fields of mp_irqs[mp_irq_entries]".

But his fix is also not complete, as dstapic is used before it assigned.

Reported-by: Sitsofe Wheeler <sitsofe@yahoo.com>
Bisected-by: Sitsofe Wheeler <sitsofe@yahoo.com>
Signed-off-by: Alexey Starikovskiy <astarikovskiy@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/acpi/boot.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 6170c6aeaf79..2ddfabae382b 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1019,6 +1019,11 @@ void __init mp_config_acpi_legacy_irqs(void)
 	for (i = 0; i < 16; i++) {
 		int idx;
 
+		mp_irqs[mp_irq_entries].mp_type = MP_INTSRC;
+		mp_irqs[mp_irq_entries].mp_irqflag = 0;	/* Conforming */
+		mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS;
+		mp_irqs[mp_irq_entries].mp_dstapic = mp_ioapics[ioapic].mp_apicid;
+
 		for (idx = 0; idx < mp_irq_entries; idx++) {
 			struct mp_config_intsrc *irq = mp_irqs + idx;
 
@@ -1039,10 +1044,6 @@ void __init mp_config_acpi_legacy_irqs(void)
 			continue;	/* IRQ already used */
 		}
 
-		mp_irqs[mp_irq_entries].mp_type = MP_INTSRC;
-		mp_irqs[mp_irq_entries].mp_irqflag = 0;	/* Conforming */
-		mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS;
-		mp_irqs[mp_irq_entries].mp_dstapic = mp_ioapics[ioapic].mp_apicid;
 		mp_irqs[mp_irq_entries].mp_irqtype = mp_INT;
 		mp_irqs[mp_irq_entries].mp_srcbusirq = i;	/* Identity mapped */
 		mp_irqs[mp_irq_entries].mp_dstirq = i;
-- 
cgit v1.2.3


From bf62f3981c7076714e3b9f5fa6989a806cad02bf Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 20 May 2008 20:10:58 -0700
Subject: x86: move e820_mark_nosave_regions to e820.c

and make e820_mark_nosave_regions to take limit_pfn to use max_low_pfn
for 32bit and end_pfn for 64bit

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/e820.c     | 32 ++++++++++++++++++++++++++++++++
 arch/x86/kernel/e820_32.c  | 32 --------------------------------
 arch/x86/kernel/e820_64.c  | 32 --------------------------------
 arch/x86/kernel/setup_32.c |  2 +-
 arch/x86/kernel/setup_64.c |  2 +-
 include/asm-x86/e820.h     |  9 +++++++++
 include/asm-x86/e820_32.h  |  8 --------
 include/asm-x86/e820_64.h  |  1 -
 8 files changed, 43 insertions(+), 75 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 35da8cdbe5e6..0cd9132c9450 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -18,6 +18,7 @@
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/pfn.h>
+#include <linux/suspend.h>
 
 #include <asm/pgtable.h>
 #include <asm/page.h>
@@ -495,6 +496,37 @@ __init void e820_setup_gap(void)
 	       pci_mem_start, gapstart, gapsize);
 }
 
+#if defined(CONFIG_X86_64) || \
+	(defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
+/**
+ * Find the ranges of physical addresses that do not correspond to
+ * e820 RAM areas and mark the corresponding pages as nosave for
+ * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
+ *
+ * This function requires the e820 map to be sorted and without any
+ * overlapping entries and assumes the first e820 area to be RAM.
+ */
+void __init e820_mark_nosave_regions(unsigned long limit_pfn)
+{
+	int i;
+	unsigned long pfn;
+
+	pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
+	for (i = 1; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+
+		if (pfn < PFN_UP(ei->addr))
+			register_nosave_region(pfn, PFN_UP(ei->addr));
+
+		pfn = PFN_DOWN(ei->addr + ei->size);
+		if (ei->type != E820_RAM)
+			register_nosave_region(PFN_UP(ei->addr), pfn);
+
+		if (pfn >= limit_pfn)
+			break;
+	}
+}
+#endif
 
 /*
  * Early reserved memory areas.
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
index dfe25751038b..db760d4706af 100644
--- a/arch/x86/kernel/e820_32.c
+++ b/arch/x86/kernel/e820_32.c
@@ -9,7 +9,6 @@
 #include <linux/mm.h>
 #include <linux/pfn.h>
 #include <linux/uaccess.h>
-#include <linux/suspend.h>
 
 #include <asm/pgtable.h>
 #include <asm/page.h>
@@ -208,37 +207,6 @@ void __init init_iomem_resources(struct resource *code_resource,
 	}
 }
 
-#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
-/**
- * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
- * correspond to e820 RAM areas and mark the corresponding pages as nosave for
- * hibernation.
- *
- * This function requires the e820 map to be sorted and without any
- * overlapping entries and assumes the first e820 area to be RAM.
- */
-void __init e820_mark_nosave_regions(void)
-{
-	int i;
-	unsigned long pfn;
-
-	pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
-	for (i = 1; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-
-		if (pfn < PFN_UP(ei->addr))
-			register_nosave_region(pfn, PFN_UP(ei->addr));
-
-		pfn = PFN_DOWN(ei->addr + ei->size);
-		if (ei->type != E820_RAM)
-			register_nosave_region(PFN_UP(ei->addr), pfn);
-
-		if (pfn >= max_low_pfn)
-			break;
-	}
-}
-#endif
-
 /*
  * Find the highest page frame number we have available
  */
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index 3b2e0b1bb49b..5e063e72b24a 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -17,7 +17,6 @@
 #include <linux/kexec.h>
 #include <linux/module.h>
 #include <linux/mm.h>
-#include <linux/suspend.h>
 #include <linux/pfn.h>
 #include <linux/pci.h>
 
@@ -93,37 +92,6 @@ void __init e820_reserve_resources(void)
 	}
 }
 
-/*
- * Find the ranges of physical addresses that do not correspond to
- * e820 RAM areas and mark the corresponding pages as nosave for software
- * suspend and suspend to RAM.
- *
- * This function requires the e820 map to be sorted and without any
- * overlapping entries and assumes the first e820 area to be RAM.
- */
-void __init e820_mark_nosave_regions(void)
-{
-	int i;
-	unsigned long paddr;
-
-	paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
-	for (i = 1; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-
-		if (paddr < ei->addr)
-			register_nosave_region(PFN_DOWN(paddr),
-						PFN_UP(ei->addr));
-
-		paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
-		if (ei->type != E820_RAM)
-			register_nosave_region(PFN_UP(ei->addr),
-						PFN_DOWN(paddr));
-
-		if (paddr >= (end_pfn << PAGE_SHIFT))
-			break;
-	}
-}
-
 /*
  * Finds an active region in the address range from start_pfn to last_pfn and
  * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 3c451d143eba..e1173aecf69a 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -820,7 +820,7 @@ void __init setup_arch(char **cmdline_p)
 #endif
 
 	e820_setup_gap();
-	e820_mark_nosave_regions();
+	e820_mark_nosave_regions(max_low_pfn);
 
 #ifdef CONFIG_VT
 #if defined(CONFIG_VGA_CONSOLE)
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 6dff1286ad8a..975a5da46e49 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -516,7 +516,7 @@ void __init setup_arch(char **cmdline_p)
 	 * We trust e820 completely. No explicit ROM probing in memory.
 	 */
 	e820_reserve_resources();
-	e820_mark_nosave_regions();
+	e820_mark_nosave_regions(end_pfn);
 
 	/* request I/O space for devices used on all i[345]86 PCs */
 	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index 5a58e2bb1d78..4266a2c5f2e8 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -70,6 +70,15 @@ extern u64 update_memory_range(u64 start, u64 size, unsigned old_type,
 extern void update_e820(void);
 extern void e820_setup_gap(void);
 
+#if defined(CONFIG_X86_64) || \
+	(defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
+extern void e820_mark_nosave_regions(unsigned long limit_pfn);
+#else
+static inline void e820_mark_nosave_regions(unsigned long limit_pfn)
+{
+}
+#endif
+
 extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
 extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
 extern void reserve_early(u64 start, u64 end, char *name);
diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h
index 9576b438fbd9..7ace82570a36 100644
--- a/include/asm-x86/e820_32.h
+++ b/include/asm-x86/e820_32.h
@@ -28,13 +28,5 @@ extern void init_iomem_resources(struct resource *code_resource,
 				 struct resource *data_resource,
 				 struct resource *bss_resource);
 
-#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
-extern void e820_mark_nosave_regions(void);
-#else
-static inline void e820_mark_nosave_regions(void)
-{
-}
-#endif
-
 #endif/*!__ASSEMBLY__*/
 #endif/*__E820_HEADER*/
diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h
index 37f176a02bc6..917963ccab69 100644
--- a/include/asm-x86/e820_64.h
+++ b/include/asm-x86/e820_64.h
@@ -18,7 +18,6 @@ extern void setup_memory_region(void);
 extern void contig_e820_setup(void);
 extern unsigned long e820_end_of_ram(void);
 extern void e820_reserve_resources(void);
-extern void e820_mark_nosave_regions(void);
 extern int e820_any_non_reserved(unsigned long start, unsigned long end);
 extern int is_memory_any_valid(unsigned long start, unsigned long end);
 extern int e820_all_non_reserved(unsigned long start, unsigned long end);
-- 
cgit v1.2.3


From b3e2416465bc9140bfd209f247e6a789f68f0d19 Mon Sep 17 00:00:00 2001
From: Alexey Starikovskiy <astarikovskiy@suse.de>
Date: Fri, 23 May 2008 01:54:44 +0400
Subject: x86: Set pic_mode only if local apic code is present

---
 arch/x86/kernel/mpparse.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 59f051db236d..e61523a13ecd 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -609,7 +609,7 @@ static void __init __get_smp_config(unsigned early)
 
 	printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
 	       mpf->mpf_specification);
-#ifdef CONFIG_X86_32
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
 	if (mpf->mpf_feature2 & (1 << 7)) {
 		printk(KERN_INFO "    IMCR and PIC compatibility mode.\n");
 		pic_mode = 1;
-- 
cgit v1.2.3


From f3918352909f839a7b0dbf9b3f81d2e183c46f88 Mon Sep 17 00:00:00 2001
From: Alexey Starikovskiy <astarikovskiy@suse.de>
Date: Fri, 23 May 2008 01:54:51 +0400
Subject: x86: move pic_mode to apic_32.c

---
 arch/x86/kernel/apic_32.c     | 2 ++
 arch/x86/kernel/mpparse.c     | 2 --
 arch/x86/mach-visws/mpparse.c | 2 --
 3 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index fa1be1d62916..848c603457fe 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -76,6 +76,8 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
  */
 int apic_verbosity;
 
+int pic_mode;
+
 static unsigned int calibration_result;
 
 static int lapic_next_event(unsigned long delta,
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index e61523a13ecd..b72c04602ad2 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -48,8 +48,6 @@ int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
 
 static int mp_current_pci_id;
 
-int pic_mode;
-
 /*
  * Intel MP BIOS table parsing routines:
  */
diff --git a/arch/x86/mach-visws/mpparse.c b/arch/x86/mach-visws/mpparse.c
index 57484e91ab90..b7d063e7de1f 100644
--- a/arch/x86/mach-visws/mpparse.c
+++ b/arch/x86/mach-visws/mpparse.c
@@ -11,8 +11,6 @@
 /* Have we found an MP table */
 int smp_found_config;
 
-int pic_mode;
-
 extern unsigned int __cpuinitdata maxcpus;
 
 /*
-- 
cgit v1.2.3


From bab4b27c00c4880737c18bb91138b1a7dd94164c Mon Sep 17 00:00:00 2001
From: Alexey Starikovskiy <astarikovskiy@suse.de>
Date: Mon, 19 May 2008 19:47:03 +0400
Subject: x86: move smp_found_config

---
 arch/x86/kernel/apic_32.c           | 3 +++
 arch/x86/kernel/apic_64.c           | 3 +++
 arch/x86/kernel/mpparse.c           | 8 ++++----
 arch/x86/mach-visws/mpparse.c       | 5 ++---
 arch/x86/mach-voyager/voyager_smp.c | 5 -----
 5 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 848c603457fe..c304759f0834 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -78,6 +78,9 @@ int apic_verbosity;
 
 int pic_mode;
 
+/* Have we found an MP table */
+int smp_found_config;
+
 static unsigned int calibration_result;
 
 static int lapic_next_event(unsigned long delta,
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 5910020c3f24..54087f920f2f 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -56,6 +56,9 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
  */
 int apic_verbosity;
 
+/* Have we found an MP table */
+int smp_found_config;
+
 static struct resource lapic_resource = {
 	.name = "Local APIC",
 	.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index b72c04602ad2..d67cd7600a21 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -32,9 +32,6 @@
 #include <mach_mpparse.h>
 #endif
 
-/* Have we found an MP table */
-int smp_found_config;
-
 /*
  * Various Linux-internal data structures created from the
  * MP-table.
@@ -639,7 +636,9 @@ static void __init __get_smp_config(unsigned early)
 		 * override the defaults.
 		 */
 		if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) {
+#ifdef CONFIG_X86_LOCAL_APIC
 			smp_found_config = 0;
+#endif
 			printk(KERN_ERR
 			       "BIOS bug, MP table errors detected!...\n");
 			printk(KERN_ERR "... disabling SMP support. "
@@ -706,8 +705,9 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 		    !mpf_checksum((unsigned char *)bp, 16) &&
 		    ((mpf->mpf_specification == 1)
 		     || (mpf->mpf_specification == 4))) {
-
+#ifdef CONFIG_X86_LOCAL_APIC
 			smp_found_config = 1;
+#endif
 			mpf_found = mpf;
 #ifdef CONFIG_X86_32
 			printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
diff --git a/arch/x86/mach-visws/mpparse.c b/arch/x86/mach-visws/mpparse.c
index b7d063e7de1f..a2fb78c0d154 100644
--- a/arch/x86/mach-visws/mpparse.c
+++ b/arch/x86/mach-visws/mpparse.c
@@ -8,9 +8,6 @@
 #include "cobalt.h"
 #include "mach_apic.h"
 
-/* Have we found an MP table */
-int smp_found_config;
-
 extern unsigned int __cpuinitdata maxcpus;
 
 /*
@@ -74,7 +71,9 @@ void __init find_smp_config(void)
 	if (ncpus > maxcpus)
 		ncpus = maxcpus;
 
+#ifdef CONFIG_X86_LOCAL_APIC
 	smp_found_config = 1;
+#endif
 	while (ncpus--)
 		MP_processor_info(mp++);
 
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 7bbebbfe8c4e..8dedd01e909f 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -59,11 +59,6 @@ __u32 voyager_quad_processors = 0;
  * activity count.  Finally exported by i386_ksyms.c */
 static int voyager_extended_cpus = 1;
 
-/* Have we found an SMP box - used by time.c to do the profiling
-   interrupt for timeslicing; do not set to 1 until the per CPU timer
-   interrupt is active */
-int smp_found_config = 0;
-
 /* Used for the invalidate map that's also checked in the spinlock */
 static volatile unsigned long smp_invalidate_needed;
 
-- 
cgit v1.2.3


From ce6444d39fdea29dcf145d2d95fe9cdc850aa53c Mon Sep 17 00:00:00 2001
From: Alexey Starikovskiy <astarikovskiy@suse.de>
Date: Mon, 19 May 2008 19:47:09 +0400
Subject: x86: mp_bus_id_to_pci_bus is not needed

---
 arch/x86/kernel/io_apic_32.c |  2 +-
 arch/x86/kernel/io_apic_64.c |  2 +-
 arch/x86/kernel/mpparse.c    | 10 ----------
 include/asm-x86/mpspec.h     |  2 --
 4 files changed, 2 insertions(+), 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index ea68c3e5ba1d..17b2e8f862d3 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -864,7 +864,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
 
 	apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
 		"slot:%d, pin:%d.\n", bus, slot, pin);
-	if (mp_bus_id_to_pci_bus[bus] == -1) {
+	if (test_bit(bus, mp_bus_not_pci)) {
 		printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
 		return -1;
 	}
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index e7f1476ed537..9637675ea555 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -516,7 +516,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
 
 	apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
 		bus, slot, pin);
-	if (mp_bus_id_to_pci_bus[bus] == -1) {
+	if (test_bit(bus, mp_bus_not_pci)) {
 		apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
 		return -1;
 	}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index d67cd7600a21..83dfd6696c2a 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -41,13 +41,6 @@ int mp_bus_id_to_type[MAX_MP_BUSSES];
 #endif
 
 DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
-int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
-
-static int mp_current_pci_id;
-
-/*
- * Intel MP BIOS table parsing routines:
- */
 
 /*
  * Checksum an MP configuration block.
@@ -101,7 +94,6 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
 static void __init MP_bus_info(struct mpc_config_bus *m)
 {
 	char str[7];
-
 	memcpy(str, m->mpc_bustype, 6);
 	str[6] = 0;
 
@@ -130,8 +122,6 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
 		mpc_oem_pci_bus(m, translation_table[mpc_record]);
 #endif
 		clear_bit(m->mpc_busid, mp_bus_not_pci);
-		mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
-		mp_current_pci_id++;
 #if defined(CONFIG_EISA) || defined (CONFIG_MCA)
 		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
 	} else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h
index 57a991b9c053..b785ddd8d761 100644
--- a/include/asm-x86/mpspec.h
+++ b/include/asm-x86/mpspec.h
@@ -32,8 +32,6 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES];
 
 extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
 
-extern int mp_bus_id_to_pci_bus[MAX_MP_BUSSES];
-
 extern unsigned int boot_cpu_physical_apicid;
 extern int smp_found_config;
 extern int mpc_default_type;
-- 
cgit v1.2.3


From 8732fc4b237fca3bd3cb0ec87ca8fb90271b0baf Mon Sep 17 00:00:00 2001
From: Alexey Starikovskiy <astarikovskiy@suse.de>
Date: Mon, 19 May 2008 19:47:16 +0400
Subject: x86: move mp_bus_not_pci from mpparse.c

---
 arch/x86/kernel/io_apic_32.c |  6 ++++++
 arch/x86/kernel/io_apic_64.c |  2 ++
 arch/x86/kernel/mpparse.c    | 10 ----------
 3 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 17b2e8f862d3..2285b81ad1d5 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -81,6 +81,12 @@ struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
 /* # of MP IRQ source entries */
 int mp_irq_entries;
 
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
+
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
 static int disable_timer_pin_1 __initdata;
 
 /*
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 9637675ea555..339cf6f926dc 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -113,6 +113,8 @@ struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
 /* # of MP IRQ source entries */
 int mp_irq_entries;
 
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
 /*
  * Rough estimation of how many shared IRQs there are, can
  * be changed anytime.
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 83dfd6696c2a..e8e041ed84d6 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -32,16 +32,6 @@
 #include <mach_mpparse.h>
 #endif
 
-/*
- * Various Linux-internal data structures created from the
- * MP-table.
- */
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
-int mp_bus_id_to_type[MAX_MP_BUSSES];
-#endif
-
-DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
-
 /*
  * Checksum an MP configuration block.
  */
-- 
cgit v1.2.3


From 136ef671df04dc157afa0d4b96c7bd23ba072c9c Mon Sep 17 00:00:00 2001
From: Alexey Starikovskiy <astarikovskiy@suse.de>
Date: Tue, 20 May 2008 00:29:59 +0400
Subject: x86: allow MPPARSE to be deselected in SMP configs

---
 arch/x86/Kconfig.debug     | 2 +-
 arch/x86/kernel/setup_32.c | 2 +-
 arch/x86/kernel/setup_64.c | 4 ++++
 3 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index ac1e31ba4795..d5b364b43a02 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -127,7 +127,7 @@ config 4KSTACKS
 
 config X86_FIND_SMP_CONFIG
 	def_bool y
-	depends on X86_LOCAL_APIC || X86_VOYAGER
+	depends on X86_MPPARSE || X86_VOYAGER || X86_VISWS
 	depends on X86_32
 
 config X86_MPPARSE
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index e1173aecf69a..c04e8c91daf7 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -814,7 +814,7 @@ void __init setup_arch(char **cmdline_p)
 			"CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
 #endif
 #endif
-#ifdef CONFIG_X86_LOCAL_APIC
+#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
 	if (smp_found_config)
 		get_smp_config();
 #endif
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 975a5da46e49..89e6cca5d693 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -456,10 +456,12 @@ void __init setup_arch(char **cmdline_p)
 	if (efi_enabled)
 		efi_reserve_bootmem();
 
+#ifdef CONFIG_X86_MPPARSE
        /*
 	* Find and reserve possible boot-time SMP configuration:
 	*/
 	find_smp_config();
+#endif
 #ifdef CONFIG_BLK_DEV_INITRD
 	if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
 		unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
@@ -502,11 +504,13 @@ void __init setup_arch(char **cmdline_p)
 
 	init_cpu_to_node();
 
+#ifdef CONFIG_X86_MPPARSE
 	/*
 	 * get boot-time SMP configuration:
 	 */
 	if (smp_found_config)
 		get_smp_config();
+#endif
 	init_apic_mappings();
 	ioapic_init_mappings();
 
-- 
cgit v1.2.3


From b764a15f679942a7bc9d4f9645299e1defcc5b43 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Thu, 22 May 2008 21:22:04 +0100
Subject: x86: Switch apm to unlocked_kernel

This pushes the lock a fair way down and the final kill looks like it
should be an easy project for someone who wants to have a shot at it.

Signed-off-by: Alan Cox <alan@redhat.com>
Cc: mingo@redhat.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/apm_32.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index bf9290e29013..00e6d1370954 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -228,6 +228,7 @@
 #include <linux/suspend.h>
 #include <linux/kthread.h>
 #include <linux/jiffies.h>
+#include <linux/smp_lock.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -1149,7 +1150,7 @@ static void queue_event(apm_event_t event, struct apm_user *sender)
 				as->event_tail = 0;
 		}
 		as->events[as->event_head] = event;
-		if ((!as->suser) || (!as->writer))
+		if (!as->suser || !as->writer)
 			continue;
 		switch (event) {
 		case APM_SYS_SUSPEND:
@@ -1396,7 +1397,7 @@ static void apm_mainloop(void)
 
 static int check_apm_user(struct apm_user *as, const char *func)
 {
-	if ((as == NULL) || (as->magic != APM_BIOS_MAGIC)) {
+	if (as == NULL || as->magic != APM_BIOS_MAGIC) {
 		printk(KERN_ERR "apm: %s passed bad filp\n", func);
 		return 1;
 	}
@@ -1459,18 +1460,19 @@ static unsigned int do_poll(struct file *fp, poll_table *wait)
 	return 0;
 }
 
-static int do_ioctl(struct inode *inode, struct file *filp,
-		    u_int cmd, u_long arg)
+static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
 {
 	struct apm_user *as;
+	int ret;
 
 	as = filp->private_data;
 	if (check_apm_user(as, "ioctl"))
 		return -EIO;
-	if ((!as->suser) || (!as->writer))
+	if (!as->suser || !as->writer)
 		return -EPERM;
 	switch (cmd) {
 	case APM_IOC_STANDBY:
+		lock_kernel();
 		if (as->standbys_read > 0) {
 			as->standbys_read--;
 			as->standbys_pending--;
@@ -1479,8 +1481,10 @@ static int do_ioctl(struct inode *inode, struct file *filp,
 			queue_event(APM_USER_STANDBY, as);
 		if (standbys_pending <= 0)
 			standby();
+		unlock_kernel();
 		break;
 	case APM_IOC_SUSPEND:
+		lock_kernel();
 		if (as->suspends_read > 0) {
 			as->suspends_read--;
 			as->suspends_pending--;
@@ -1488,16 +1492,17 @@ static int do_ioctl(struct inode *inode, struct file *filp,
 		} else
 			queue_event(APM_USER_SUSPEND, as);
 		if (suspends_pending <= 0) {
-			return suspend(1);
+			ret = suspend(1);
 		} else {
 			as->suspend_wait = 1;
 			wait_event_interruptible(apm_suspend_waitqueue,
 					as->suspend_wait == 0);
-			return as->suspend_result;
+			ret = as->suspend_result;
 		}
-		break;
+		unlock_kernel();
+		return ret;
 	default:
-		return -EINVAL;
+		return -ENOTTY;
 	}
 	return 0;
 }
@@ -1860,7 +1865,7 @@ static const struct file_operations apm_bios_fops = {
 	.owner		= THIS_MODULE,
 	.read		= do_read,
 	.poll		= do_poll,
-	.ioctl		= do_ioctl,
+	.unlocked_ioctl	= do_ioctl,
 	.open		= do_open,
 	.release	= do_release,
 };
-- 
cgit v1.2.3


From 85cc35fa7255d113b5383a9c8536c363274bb475 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 25 May 2008 21:21:36 +0200
Subject: x86: fix mpparse fallout

UP builds with LOCAL_APIC=y and IO_APIC=n fail with a missing
reference to mp_bus_not_pci. Distangle the mpparse code some more and
move the ioapic specific bus check into a separate function.

This code needs sume urgent un#ifdef surgery all over the place.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/mpparse.c | 73 ++++++++++++++++++++++++++---------------------
 1 file changed, 41 insertions(+), 32 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index e8e041ed84d6..9f3792d55044 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -81,6 +81,7 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
 	generic_processor_info(apicid, m->mpc_apicver);
 }
 
+#ifdef CONFIG_X86_IO_APIC
 static void __init MP_bus_info(struct mpc_config_bus *m)
 {
 	char str[7];
@@ -122,6 +123,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
 	} else
 		printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
 }
+#endif
 
 #ifdef CONFIG_X86_IO_APIC
 
@@ -336,7 +338,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
 			{
 				struct mpc_config_bus *m =
 				    (struct mpc_config_bus *)mpt;
+#ifdef CONFIG_X86_IO_APIC
 				MP_bus_info(m);
+#endif
 				mpt += sizeof(*m);
 				count += sizeof(*m);
 				break;
@@ -472,40 +476,11 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
 	MP_intsrc_info(&intsrc);
 }
 
-#endif
 
-static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+static void construct_ioapic_table(int mpc_default_type)
 {
-	struct mpc_config_processor processor;
-	struct mpc_config_bus bus;
-#ifdef CONFIG_X86_IO_APIC
 	struct mpc_config_ioapic ioapic;
-#endif
-	struct mpc_config_lintsrc lintsrc;
-	int linttypes[2] = { mp_ExtINT, mp_NMI };
-	int i;
-
-	/*
-	 * local APIC has default address
-	 */
-	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
-	/*
-	 * 2 CPUs, numbered 0 & 1.
-	 */
-	processor.mpc_type = MP_PROCESSOR;
-	/* Either an integrated APIC or a discrete 82489DX. */
-	processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
-	processor.mpc_cpuflag = CPU_ENABLED;
-	processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
-	    (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
-	processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
-	processor.mpc_reserved[0] = 0;
-	processor.mpc_reserved[1] = 0;
-	for (i = 0; i < 2; i++) {
-		processor.mpc_apicid = i;
-		MP_processor_info(&processor);
-	}
+	struct mpc_config_bus bus;
 
 	bus.mpc_type = MP_BUS;
 	bus.mpc_busid = 0;
@@ -534,7 +509,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
 		MP_bus_info(&bus);
 	}
 
-#ifdef CONFIG_X86_IO_APIC
 	ioapic.mpc_type = MP_IOAPIC;
 	ioapic.mpc_apicid = 2;
 	ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
@@ -546,7 +520,42 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
 	 * We set up most of the low 16 IO-APIC pins according to MPS rules.
 	 */
 	construct_default_ioirq_mptable(mpc_default_type);
+}
+#else
+static inline void construct_ioapic_table(int mpc_default_type) { }
 #endif
+
+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+{
+	struct mpc_config_processor processor;
+	struct mpc_config_lintsrc lintsrc;
+	int linttypes[2] = { mp_ExtINT, mp_NMI };
+	int i;
+
+	/*
+	 * local APIC has default address
+	 */
+	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+	/*
+	 * 2 CPUs, numbered 0 & 1.
+	 */
+	processor.mpc_type = MP_PROCESSOR;
+	/* Either an integrated APIC or a discrete 82489DX. */
+	processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+	processor.mpc_cpuflag = CPU_ENABLED;
+	processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
+	    (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
+	processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+	processor.mpc_reserved[0] = 0;
+	processor.mpc_reserved[1] = 0;
+	for (i = 0; i < 2; i++) {
+		processor.mpc_apicid = i;
+		MP_processor_info(&processor);
+	}
+
+	construct_ioapic_table(mpc_default_type);
+
 	lintsrc.mpc_type = MP_LINTSRC;
 	lintsrc.mpc_irqflag = 0;	/* conforming */
 	lintsrc.mpc_srcbusid = 0;
-- 
cgit v1.2.3


From ddca03c98a7f7ad5ab09967ff52d4ed60358c896 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Sat, 24 May 2008 19:36:31 +0400
Subject: x86: nmi - unify die_nmi() interface

By slightly changing 32bit mode die_nmi() we may unify the
interface and make it common for both (32/64bit) modes

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: hpa@zytor.com
Cc: mingo@redhat.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/nmi_32.c   | 7 +++----
 arch/x86/kernel/traps_32.c | 8 +++++---
 include/asm-x86/nmi.h      | 2 +-
 3 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
index 69bdae555c18..bd04a28f7a5c 100644
--- a/arch/x86/kernel/nmi_32.c
+++ b/arch/x86/kernel/nmi_32.c
@@ -320,8 +320,6 @@ void touch_nmi_watchdog(void)
 }
 EXPORT_SYMBOL(touch_nmi_watchdog);
 
-extern void die_nmi(struct pt_regs *, const char *msg);
-
 notrace __kprobes int
 nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 {
@@ -375,7 +373,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 			/*
 			 * die_nmi will return ONLY if NOTIFY_STOP happens..
 			 */
-			die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
+			die_nmi("BUG: NMI Watchdog detected LOCKUP",
+				regs, 0);
 	} else {
 		__get_cpu_var(last_irq_sum) = sum;
 		local_set(&__get_cpu_var(alert_counter), 0);
@@ -406,7 +405,7 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
 	char buf[64];
 
 	sprintf(buf, "NMI received for unknown reason %02x\n", reason);
-	die_nmi(regs, buf);
+	die_nmi(buf, regs, 0);
 	return 0;
 }
 
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index bde6f63e15d5..f31e6651fa6f 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -755,9 +755,9 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
 
 static DEFINE_SPINLOCK(nmi_print_lock);
 
-void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
+void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
 {
-	if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP)
+	if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
 		return;
 
 	spin_lock(&nmi_print_lock);
@@ -766,10 +766,12 @@ void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
 	* to get a message out:
 	*/
 	bust_spinlocks(1);
-	printk(KERN_EMERG "%s", msg);
+	printk(KERN_EMERG "%s", str);
 	printk(" on CPU%d, ip %08lx, registers:\n",
 		smp_processor_id(), regs->ip);
 	show_registers(regs);
+	if (do_panic)
+		panic("Non maskable interrupt");
 	console_silent();
 	spin_unlock(&nmi_print_lock);
 	bust_spinlocks(0);
diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h
index 8455bf36d8df..7cd5b6a88138 100644
--- a/include/asm-x86/nmi.h
+++ b/include/asm-x86/nmi.h
@@ -38,12 +38,12 @@ static inline void unset_nmi_pm_callback(struct pm_dev *dev)
 
 #ifdef CONFIG_X86_64
 extern void default_do_nmi(struct pt_regs *);
-extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
 extern void nmi_watchdog_default(void);
 #else
 #define nmi_watchdog_default() do {} while (0)
 #endif
 
+extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
 extern int check_nmi_watchdog(void);
 extern int nmi_watchdog_enabled;
 extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
-- 
cgit v1.2.3


From e56b3a12c45e439169f2d1f7194be397667320a6 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Sat, 24 May 2008 19:36:32 +0400
Subject: x86: nmi - die_nmi() output message unification

Make 64bit die_nmi() to produce the same message as 32bit mode has

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: hpa@zytor.com
Cc: mingo@redhat.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/nmi_64.c   | 4 ++--
 arch/x86/kernel/traps_64.c | 4 +++-
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c
index 33cb8ecbb6dd..1a98705270a2 100644
--- a/arch/x86/kernel/nmi_64.c
+++ b/arch/x86/kernel/nmi_64.c
@@ -358,8 +358,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 		 */
 		local_inc(&__get_cpu_var(alert_counter));
 		if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz)
-			die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs,
-				panic_on_timeout);
+			die_nmi("NMI Watchdog detected LOCKUP",
+				regs, panic_on_timeout);
 	} else {
 		__get_cpu_var(last_irq_sum) = sum;
 		local_set(&__get_cpu_var(alert_counter), 0);
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index adff76ea97c4..6a048ce3c787 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -614,7 +614,9 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic)
 	 * We are in trouble anyway, lets at least try
 	 * to get a message out.
 	 */
-	printk(str, smp_processor_id());
+	printk(KERN_EMERG "%s", str);
+	printk(" on CPU%d, ip %08lx, registers:\n",
+		smp_processor_id(), regs->ip);
 	show_registers(regs);
 	if (kexec_should_crash(current))
 		crash_kexec(regs);
-- 
cgit v1.2.3


From c6425b9f143a75bbcd0a7684b4df40e20d0b2f32 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Sat, 24 May 2008 19:36:33 +0400
Subject: x86: move do_nmi(), stop_nmi() and restart_nmi() to traps_64.c

traps_32.c already holds these functions so do the same for traps_64.c

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: hpa@zytor.com
Cc: mingo@redhat.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/nmi_64.c   | 25 -------------------------
 arch/x86/kernel/traps_64.c | 24 ++++++++++++++++++++++++
 2 files changed, 24 insertions(+), 25 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c
index 1a98705270a2..c1ea671525e5 100644
--- a/arch/x86/kernel/nmi_64.c
+++ b/arch/x86/kernel/nmi_64.c
@@ -30,7 +30,6 @@
 
 int unknown_nmi_panic;
 int nmi_watchdog_enabled;
-int panic_on_unrecovered_nmi;
 
 static cpumask_t backtrace_mask = CPU_MASK_NONE;
 
@@ -383,30 +382,6 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 	return rc;
 }
 
-static unsigned ignore_nmis;
-
-asmlinkage notrace __kprobes void
-do_nmi(struct pt_regs *regs, long error_code)
-{
-	nmi_enter();
-	add_pda(__nmi_count,1);
-	if (!ignore_nmis)
-		default_do_nmi(regs);
-	nmi_exit();
-}
-
-void stop_nmi(void)
-{
-	acpi_nmi_disable();
-	ignore_nmis++;
-}
-
-void restart_nmi(void)
-{
-	ignore_nmis--;
-	acpi_nmi_enable();
-}
-
 #ifdef CONFIG_SYSCTL
 
 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 6a048ce3c787..e4a380797dc1 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -76,7 +76,9 @@ asmlinkage void alignment_check(void);
 asmlinkage void machine_check(void);
 asmlinkage void spurious_interrupt_bug(void);
 
+int panic_on_unrecovered_nmi;
 static unsigned int code_bytes = 64;
+static unsigned ignore_nmis;
 
 static inline void conditional_sti(struct pt_regs *regs)
 {
@@ -867,6 +869,28 @@ asmlinkage notrace  __kprobes void default_do_nmi(struct pt_regs *regs)
 		io_check_error(reason, regs);
 }
 
+asmlinkage notrace __kprobes void
+do_nmi(struct pt_regs *regs, long error_code)
+{
+	nmi_enter();
+	add_pda(__nmi_count, 1);
+	if (!ignore_nmis)
+		default_do_nmi(regs);
+	nmi_exit();
+}
+
+void stop_nmi(void)
+{
+	acpi_nmi_disable();
+	ignore_nmis++;
+}
+
+void restart_nmi(void)
+{
+	ignore_nmis--;
+	acpi_nmi_enable();
+}
+
 /* runs on IST stack. */
 asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
 {
-- 
cgit v1.2.3


From d1b946b97d71423f365fa797d1428e1847c0bec1 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Sat, 24 May 2008 19:36:34 +0400
Subject: x86: nmi_32.c - add "panic" option

Allow to pass "panic" option in 32bit mode

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: hpa@zytor.com
Cc: mingo@redhat.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/nmi_32.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
index bd04a28f7a5c..4437fe1edabc 100644
--- a/arch/x86/kernel/nmi_32.c
+++ b/arch/x86/kernel/nmi_32.c
@@ -42,6 +42,7 @@ static cpumask_t backtrace_mask = CPU_MASK_NONE;
  *  0: the lapic NMI watchdog is disabled, but can be enabled
  */
 atomic_t nmi_active = ATOMIC_INIT(0);		/* oprofile uses this */
+static int panic_on_timeout;
 
 unsigned int nmi_watchdog = NMI_DEFAULT;
 static unsigned int nmi_hz = HZ;
@@ -140,6 +141,14 @@ static int __init setup_nmi_watchdog(char *str)
 {
 	int nmi;
 
+	if (!strncmp(str, "panic", 5)) {
+		panic_on_timeout = 1;
+		str = strchr(str, ',');
+		if (!str)
+			return 1;
+		++str;
+	}
+
 	get_option(&str, &nmi);
 
 	if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
@@ -374,7 +383,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 			 * die_nmi will return ONLY if NOTIFY_STOP happens..
 			 */
 			die_nmi("BUG: NMI Watchdog detected LOCKUP",
-				regs, 0);
+				regs, panic_on_timeout);
 	} else {
 		__get_cpu_var(last_irq_sum) = sum;
 		local_set(&__get_cpu_var(alert_counter), 0);
-- 
cgit v1.2.3


From 4b82b277707a39b97271439c475f186f63ec4692 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Sat, 24 May 2008 19:36:35 +0400
Subject: x86: nmi_32.c - add nmi_watchdog_default helper

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: hpa@zytor.com
Cc: mingo@redhat.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/nmi_32.c | 19 +++++++++++++------
 include/asm-x86/nmi.h    |  4 +---
 2 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
index 4437fe1edabc..7714e8478213 100644
--- a/arch/x86/kernel/nmi_32.c
+++ b/arch/x86/kernel/nmi_32.c
@@ -51,6 +51,17 @@ static DEFINE_PER_CPU(short, wd_enabled);
 
 static int endflag __initdata = 0;
 
+/* Run after command line and cpu_init init, but before all other checks */
+void nmi_watchdog_default(void)
+{
+	if (nmi_watchdog != NMI_DEFAULT)
+		return;
+	if (lapic_watchdog_ok())
+		nmi_watchdog = NMI_LOCAL_APIC;
+	else
+		nmi_watchdog = NMI_IO_APIC;
+}
+
 #ifdef CONFIG_SMP
 /* The performance counters used by NMI_LOCAL_APIC don't trigger when
  * the CPU is idle. To make sure the NMI watchdog really ticks on all
@@ -437,12 +448,8 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
 		return -EIO;
 	}
 
-	if (nmi_watchdog == NMI_DEFAULT) {
-		if (lapic_watchdog_ok())
-			nmi_watchdog = NMI_LOCAL_APIC;
-		else
-			nmi_watchdog = NMI_IO_APIC;
-	}
+	/* if nmi_watchdog is not set yet, then set it */
+	nmi_watchdog_default();
 
 	if (nmi_watchdog == NMI_LOCAL_APIC) {
 		if (nmi_watchdog_enabled)
diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h
index 7cd5b6a88138..1e8f34d7ab68 100644
--- a/include/asm-x86/nmi.h
+++ b/include/asm-x86/nmi.h
@@ -38,11 +38,9 @@ static inline void unset_nmi_pm_callback(struct pm_dev *dev)
 
 #ifdef CONFIG_X86_64
 extern void default_do_nmi(struct pt_regs *);
-extern void nmi_watchdog_default(void);
-#else
-#define nmi_watchdog_default() do {} while (0)
 #endif
 
+extern void nmi_watchdog_default(void);
 extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
 extern int check_nmi_watchdog(void);
 extern int nmi_watchdog_enabled;
-- 
cgit v1.2.3


From ad63ba169d45ccb6594eb35a6c31c421fe70ff45 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Sat, 24 May 2008 19:36:36 +0400
Subject: x86: nmi_32/64.c - use apic_write_around instead of apic_write

apic_write_around will be expanded to apic_write in 64bit mode
anyway. Only a few CPUs (well, old CPUs to be precise) requires
such an action. In general it should not hurt and could be cleaned
up for apic_write (just in case)

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: hpa@zytor.com
Cc: mingo@redhat.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/nmi_32.c | 2 +-
 arch/x86/kernel/nmi_64.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
index 7714e8478213..ec5842b4dd68 100644
--- a/arch/x86/kernel/nmi_32.c
+++ b/arch/x86/kernel/nmi_32.c
@@ -248,7 +248,7 @@ void acpi_nmi_enable(void)
 
 static void __acpi_nmi_disable(void *__unused)
 {
-	apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
+	apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
 }
 
 /*
diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c
index c1ea671525e5..f546e3164a74 100644
--- a/arch/x86/kernel/nmi_64.c
+++ b/arch/x86/kernel/nmi_64.c
@@ -215,7 +215,7 @@ late_initcall(init_lapic_nmi_sysfs);
 
 static void __acpi_nmi_enable(void *__unused)
 {
-	apic_write(APIC_LVT0, APIC_DM_NMI);
+	apic_write_around(APIC_LVT0, APIC_DM_NMI);
 }
 
 /*
@@ -229,7 +229,7 @@ void acpi_nmi_enable(void)
 
 static void __acpi_nmi_disable(void *__unused)
 {
-	apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
+	apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
 }
 
 /*
-- 
cgit v1.2.3


From 6c8decdf14b17d42f6eb817d310642f4d6598a19 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Sat, 24 May 2008 19:36:37 +0400
Subject: x86: nmi_32.c - unknown_nmi_panic_callback should always panic

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: hpa@zytor.com
Cc: mingo@redhat.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/nmi_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
index ec5842b4dd68..d0b0684bde9e 100644
--- a/arch/x86/kernel/nmi_32.c
+++ b/arch/x86/kernel/nmi_32.c
@@ -425,7 +425,7 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
 	char buf[64];
 
 	sprintf(buf, "NMI received for unknown reason %02x\n", reason);
-	die_nmi(buf, regs, 0);
+	die_nmi(buf, regs, 1); /* Always panic here */
 	return 0;
 }
 
-- 
cgit v1.2.3


From 96f9dcb10755e96eae706b9e869c0dc25adb3d74 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Sat, 24 May 2008 19:36:38 +0400
Subject: x86: nmi_64.c - use for_each_possible_cpu helper

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: hpa@zytor.com
Cc: mingo@redhat.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/nmi_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c
index f546e3164a74..d98b21cd4928 100644
--- a/arch/x86/kernel/nmi_64.c
+++ b/arch/x86/kernel/nmi_64.c
@@ -98,7 +98,7 @@ int __init check_nmi_watchdog(void)
 		smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
 #endif
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
+	for_each_possible_cpu(cpu)
 		prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count;
 	local_irq_enable();
 	mdelay((20*1000)/nmi_hz); // wait 20 ticks
-- 
cgit v1.2.3


From 7c2ba83f9a479eee6f302147767a30f3187fbd4b Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Sat, 24 May 2008 19:36:39 +0400
Subject: x86: nmi_32.c cleanup - use for_each_online_cpu helper

Since cpu_online_map is touched (by for_each_online_cpu)
at moment when cpu_callin_map is already filled up we can
get rid of its checking at all

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: hpa@zytor.com
Cc: mingo@redhat.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/nmi_32.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
index d0b0684bde9e..c6d0777aaf90 100644
--- a/arch/x86/kernel/nmi_32.c
+++ b/arch/x86/kernel/nmi_32.c
@@ -108,13 +108,7 @@ int __init check_nmi_watchdog(void)
 	local_irq_enable();
 	mdelay((20*1000)/nmi_hz); // wait 20 ticks
 
-	for_each_possible_cpu(cpu) {
-#ifdef CONFIG_SMP
-		/* Check cpu_callin_map here because that is set
-		   after the timer is started. */
-		if (!cpu_isset(cpu, cpu_callin_map))
-			continue;
-#endif
+	for_each_online_cpu(cpu) {
 		if (!per_cpu(wd_enabled, cpu))
 			continue;
 		if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
-- 
cgit v1.2.3


From fd5cea02de100197a4c26d9e103508cf09b50a82 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Sat, 24 May 2008 19:36:40 +0400
Subject: x86: nmi_32/64.c - add helper functions to hide arch specific data

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: hpa@zytor.com
Cc: mingo@redhat.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/nmi_32.c | 44 ++++++++++++++++++++++++++++++++------------
 arch/x86/kernel/nmi_64.c | 42 +++++++++++++++++++++++++++++++-----------
 2 files changed, 63 insertions(+), 23 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
index c6d0777aaf90..a94dbedf2b74 100644
--- a/arch/x86/kernel/nmi_32.c
+++ b/arch/x86/kernel/nmi_32.c
@@ -51,6 +51,26 @@ static DEFINE_PER_CPU(short, wd_enabled);
 
 static int endflag __initdata = 0;
 
+static inline unsigned int get_nmi_count(int cpu)
+{
+	return nmi_count(cpu);
+}
+
+static inline int mce_in_progress(void)
+{
+	return 0;
+}
+
+/*
+ * Take the local apic timer and PIT/HPET into account. We don't
+ * know which one is active, when we have highres/dyntick on
+ */
+static inline unsigned int get_timer_irqs(int cpu)
+{
+	return per_cpu(irq_stat, cpu).apic_timer_irqs +
+		per_cpu(irq_stat, cpu).irq0_irqs;
+}
+
 /* Run after command line and cpu_init init, but before all other checks */
 void nmi_watchdog_default(void)
 {
@@ -104,19 +124,19 @@ int __init check_nmi_watchdog(void)
 #endif
 
 	for_each_possible_cpu(cpu)
-		prev_nmi_count[cpu] = nmi_count(cpu);
+		prev_nmi_count[cpu] = get_nmi_count(cpu);
 	local_irq_enable();
 	mdelay((20*1000)/nmi_hz); // wait 20 ticks
 
 	for_each_online_cpu(cpu) {
 		if (!per_cpu(wd_enabled, cpu))
 			continue;
-		if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
+		if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
 			printk(KERN_WARNING "WARNING: CPU#%d: NMI "
 				"appears to be stuck (%d->%d)!\n",
 				cpu,
 				prev_nmi_count[cpu],
-				nmi_count(cpu));
+				get_nmi_count(cpu));
 			per_cpu(wd_enabled, cpu) = 0;
 			atomic_dec(&nmi_active);
 		}
@@ -355,6 +375,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 		touched = 1;
 	}
 
+	sum = get_timer_irqs(cpu);
+
+	if (__get_cpu_var(nmi_touch)) {
+		__get_cpu_var(nmi_touch) = 0;
+		touched = 1;
+	}
+
 	if (cpu_isset(cpu, backtrace_mask)) {
 		static DEFINE_SPINLOCK(lock);	/* Serialise the printks */
 
@@ -365,16 +392,9 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 		cpu_clear(cpu, backtrace_mask);
 	}
 
-	/*
-	 * Take the local apic timer and PIT/HPET into account. We don't
-	 * know which one is active, when we have highres/dyntick on
-	 */
-	sum = per_cpu(irq_stat, cpu).apic_timer_irqs +
-		per_cpu(irq_stat, cpu).irq0_irqs;
-	if (__get_cpu_var(nmi_touch)) {
-		__get_cpu_var(nmi_touch) = 0;
+	/* Could check oops_in_progress here too, but it's safer not to */
+	if (mce_in_progress())
 		touched = 1;
-	}
 
 	/* if the none of the timers isn't firing, this cpu isn't doing much */
 	if (!touched && __get_cpu_var(last_irq_sum) == sum) {
diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c
index d98b21cd4928..c6802447262c 100644
--- a/arch/x86/kernel/nmi_64.c
+++ b/arch/x86/kernel/nmi_64.c
@@ -47,6 +47,30 @@ static unsigned int nmi_hz = HZ;
 
 static DEFINE_PER_CPU(short, wd_enabled);
 
+static int endflag __initdata = 0;
+
+static inline unsigned int get_nmi_count(int cpu)
+{
+	return cpu_pda(cpu)->__nmi_count;
+}
+
+static inline int mce_in_progress(void)
+{
+#ifdef CONFIG_X86_MCE
+	return atomic_read(&mce_entry) > 0;
+#endif
+	return 0;
+}
+
+/*
+ * Take the local apic timer and PIT/HPET into account. We don't
+ * know which one is active, when we have highres/dyntick on
+ */
+static inline unsigned int get_timer_irqs(int cpu)
+{
+	return read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
+}
+
 /* Run after command line and cpu_init init, but before all other checks */
 void nmi_watchdog_default(void)
 {
@@ -55,8 +79,6 @@ void nmi_watchdog_default(void)
 	nmi_watchdog = NMI_NONE;
 }
 
-static int endflag __initdata = 0;
-
 #ifdef CONFIG_SMP
 /* The performance counters used by NMI_LOCAL_APIC don't trigger when
  * the CPU is idle. To make sure the NMI watchdog really ticks on all
@@ -99,19 +121,19 @@ int __init check_nmi_watchdog(void)
 #endif
 
 	for_each_possible_cpu(cpu)
-		prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count;
+		prev_nmi_count[cpu] = get_nmi_count(cpu);
 	local_irq_enable();
 	mdelay((20*1000)/nmi_hz); // wait 20 ticks
 
 	for_each_online_cpu(cpu) {
 		if (!per_cpu(wd_enabled, cpu))
 			continue;
-		if (cpu_pda(cpu)->__nmi_count - prev_nmi_count[cpu] <= 5) {
+		if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
 			printk(KERN_WARNING "WARNING: CPU#%d: NMI "
 			       "appears to be stuck (%d->%d)!\n",
 				cpu,
 				prev_nmi_count[cpu],
-				cpu_pda(cpu)->__nmi_count);
+				get_nmi_count(cpu));
 			per_cpu(wd_enabled, cpu) = 0;
 			atomic_dec(&nmi_active);
 		}
@@ -327,7 +349,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 		touched = 1;
 	}
 
-	sum = read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
+	sum = get_timer_irqs(cpu);
+
 	if (__get_cpu_var(nmi_touch)) {
 		__get_cpu_var(nmi_touch) = 0;
 		touched = 1;
@@ -343,12 +366,9 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 		cpu_clear(cpu, backtrace_mask);
 	}
 
-#ifdef CONFIG_X86_MCE
-	/* Could check oops_in_progress here too, but it's safer
-	   not too */
-	if (atomic_read(&mce_entry) > 0)
+	if (mce_in_progress())
 		touched = 1;
-#endif
+
 	/* if the apic timer isn't firing, this cpu isn't doing much */
 	if (!touched && __get_cpu_var(last_irq_sum) == sum) {
 		/*
-- 
cgit v1.2.3


From 1798bc22b2790bf2a956588e6b17c36ef79ceff7 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Sat, 24 May 2008 19:36:41 +0400
Subject: x86: nmi_32/64.c - merge down nmi_32.c and nmi_64.c to nmi.c

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: hpa@zytor.com
Cc: mingo@redhat.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/Makefile |   2 +-
 arch/x86/kernel/nmi.c    | 533 +++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/nmi_32.c | 506 --------------------------------------------
 arch/x86/kernel/nmi_64.c | 477 ------------------------------------------
 4 files changed, 534 insertions(+), 984 deletions(-)
 create mode 100644 arch/x86/kernel/nmi.c
 delete mode 100644 arch/x86/kernel/nmi_32.c
 delete mode 100644 arch/x86/kernel/nmi_64.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5e618c3b4720..d5dd6668225f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -53,7 +53,7 @@ obj-$(CONFIG_X86_32_SMP)	+= smpcommon.o
 obj-$(CONFIG_X86_64_SMP)	+= tsc_sync.o smpcommon.o
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline_$(BITS).o
 obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
-obj-$(CONFIG_X86_LOCAL_APIC)	+= apic_$(BITS).o nmi_$(BITS).o
+obj-$(CONFIG_X86_LOCAL_APIC)	+= apic_$(BITS).o nmi.o
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic_$(BITS).o
 obj-$(CONFIG_X86_REBOOTFIXUPS)	+= reboot_fixups_32.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec_$(BITS).o
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
new file mode 100644
index 000000000000..69a839fc1eb0
--- /dev/null
+++ b/arch/x86/kernel/nmi.c
@@ -0,0 +1,533 @@
+/*
+ *  NMI watchdog support on APIC systems
+ *
+ *  Started by Ingo Molnar <mingo@redhat.com>
+ *
+ *  Fixes:
+ *  Mikael Pettersson	: AMD K7 support for local APIC NMI watchdog.
+ *  Mikael Pettersson	: Power Management for local APIC NMI watchdog.
+ *  Mikael Pettersson	: Pentium 4 support for local APIC NMI watchdog.
+ *  Pavel Machek and
+ *  Mikael Pettersson	: PM converted to driver model. Disable/enable API.
+ */
+
+#include <linux/nmi.h>
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/sysdev.h>
+#include <linux/sysctl.h>
+#include <linux/percpu.h>
+#include <linux/kprobes.h>
+#include <linux/cpumask.h>
+#include <linux/kernel_stat.h>
+#include <linux/kdebug.h>
+
+#include <asm/smp.h>
+#include <asm/nmi.h>
+#include <asm/proto.h>
+#include <asm/timer.h>
+
+#include <asm/mce.h>
+
+#include <mach_traps.h>
+
+int unknown_nmi_panic;
+int nmi_watchdog_enabled;
+
+static cpumask_t backtrace_mask = CPU_MASK_NONE;
+
+/* nmi_active:
+ * >0: the lapic NMI watchdog is active, but can be disabled
+ * <0: the lapic NMI watchdog has not been set up, and cannot
+ *     be enabled
+ *  0: the lapic NMI watchdog is disabled, but can be enabled
+ */
+atomic_t nmi_active = ATOMIC_INIT(0);		/* oprofile uses this */
+static int panic_on_timeout;
+
+unsigned int nmi_watchdog = NMI_DEFAULT;
+
+static unsigned int nmi_hz = HZ;
+static DEFINE_PER_CPU(short, wd_enabled);
+static int endflag __initdata = 0;
+
+static inline unsigned int get_nmi_count(int cpu)
+{
+#ifdef CONFIG_X86_64
+	return cpu_pda(cpu)->__nmi_count;
+#else
+	return nmi_count(cpu);
+#endif
+}
+
+static inline int mce_in_progress(void)
+{
+#if defined(CONFIX_X86_64) && defined(CONFIG_X86_MCE)
+	return atomic_read(&mce_entry) > 0;
+#endif
+	return 0;
+}
+
+/*
+ * Take the local apic timer and PIT/HPET into account. We don't
+ * know which one is active, when we have highres/dyntick on
+ */
+static inline unsigned int get_timer_irqs(int cpu)
+{
+#ifdef CONFIG_X86_64
+	return read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
+#else
+	return per_cpu(irq_stat, cpu).apic_timer_irqs +
+		per_cpu(irq_stat, cpu).irq0_irqs;
+#endif
+}
+
+/* Run after command line and cpu_init init, but before all other checks */
+void nmi_watchdog_default(void)
+{
+	if (nmi_watchdog != NMI_DEFAULT)
+		return;
+#ifdef CONFIG_X86_64
+	nmi_watchdog = NMI_NONE;
+#else
+	if (lapic_watchdog_ok())
+		nmi_watchdog = NMI_LOCAL_APIC;
+	else
+		nmi_watchdog = NMI_IO_APIC;
+#endif
+}
+
+#ifdef CONFIG_SMP
+/*
+ * The performance counters used by NMI_LOCAL_APIC don't trigger when
+ * the CPU is idle. To make sure the NMI watchdog really ticks on all
+ * CPUs during the test make them busy.
+ */
+static __init void nmi_cpu_busy(void *data)
+{
+	local_irq_enable_in_hardirq();
+	/*
+	 * Intentionally don't use cpu_relax here. This is
+	 * to make sure that the performance counter really ticks,
+	 * even if there is a simulator or similar that catches the
+	 * pause instruction. On a real HT machine this is fine because
+	 * all other CPUs are busy with "useless" delay loops and don't
+	 * care if they get somewhat less cycles.
+	 */
+	while (endflag == 0)
+		mb();
+}
+#endif
+
+int __init check_nmi_watchdog(void)
+{
+	unsigned int *prev_nmi_count;
+	int cpu;
+
+	if (nmi_watchdog == NMI_NONE || nmi_watchdog == NMI_DISABLED)
+		return 0;
+
+	if (!atomic_read(&nmi_active))
+		return 0;
+
+	prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
+	if (!prev_nmi_count)
+		goto error;
+
+	printk(KERN_INFO "Testing NMI watchdog ... ");
+
+#ifdef CONFIG_SMP
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
+#endif
+
+	for_each_possible_cpu(cpu)
+		prev_nmi_count[cpu] = get_nmi_count(cpu);
+	local_irq_enable();
+	mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */
+
+	for_each_online_cpu(cpu) {
+		if (!per_cpu(wd_enabled, cpu))
+			continue;
+		if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
+			printk(KERN_WARNING "WARNING: CPU#%d: NMI "
+				"appears to be stuck (%d->%d)!\n",
+				cpu,
+				prev_nmi_count[cpu],
+				get_nmi_count(cpu));
+			per_cpu(wd_enabled, cpu) = 0;
+			atomic_dec(&nmi_active);
+		}
+	}
+	endflag = 1;
+	if (!atomic_read(&nmi_active)) {
+		kfree(prev_nmi_count);
+		atomic_set(&nmi_active, -1);
+		goto error;
+	}
+	printk("OK.\n");
+
+	/*
+	 * now that we know it works we can reduce NMI frequency to
+	 * something more reasonable; makes a difference in some configs
+	 */
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		nmi_hz = lapic_adjust_nmi_hz(1);
+
+	kfree(prev_nmi_count);
+	return 0;
+
+error:
+#ifdef CONFIG_X86_32
+	timer_ack = !cpu_has_tsc;
+#endif
+	return -1;
+}
+
+static int __init setup_nmi_watchdog(char *str)
+{
+	int nmi;
+
+	if (!strncmp(str, "panic", 5)) {
+		panic_on_timeout = 1;
+		str = strchr(str, ',');
+		if (!str)
+			return 1;
+		++str;
+	}
+
+	get_option(&str, &nmi);
+
+	if (nmi >= NMI_INVALID || nmi < NMI_NONE)
+		return 0;
+
+	nmi_watchdog = nmi;
+	return 1;
+}
+__setup("nmi_watchdog=", setup_nmi_watchdog);
+
+/*
+ * Suspend/resume support
+ */
+#ifdef CONFIG_PM
+
+static int nmi_pm_active; /* nmi_active before suspend */
+
+static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
+{
+	/* only CPU0 goes here, other CPUs should be offline */
+	nmi_pm_active = atomic_read(&nmi_active);
+	stop_apic_nmi_watchdog(NULL);
+	BUG_ON(atomic_read(&nmi_active) != 0);
+	return 0;
+}
+
+static int lapic_nmi_resume(struct sys_device *dev)
+{
+	/* only CPU0 goes here, other CPUs should be offline */
+	if (nmi_pm_active > 0) {
+		setup_apic_nmi_watchdog(NULL);
+		touch_nmi_watchdog();
+	}
+	return 0;
+}
+
+static struct sysdev_class nmi_sysclass = {
+	.name		= "lapic_nmi",
+	.resume		= lapic_nmi_resume,
+	.suspend	= lapic_nmi_suspend,
+};
+
+static struct sys_device device_lapic_nmi = {
+	.id	= 0,
+	.cls	= &nmi_sysclass,
+};
+
+static int __init init_lapic_nmi_sysfs(void)
+{
+	int error;
+
+	/*
+	 * should really be a BUG_ON but b/c this is an
+	 * init call, it just doesn't work.  -dcz
+	 */
+	if (nmi_watchdog != NMI_LOCAL_APIC)
+		return 0;
+
+	if (atomic_read(&nmi_active) < 0)
+		return 0;
+
+	error = sysdev_class_register(&nmi_sysclass);
+	if (!error)
+		error = sysdev_register(&device_lapic_nmi);
+	return error;
+}
+
+/* must come after the local APIC's device_initcall() */
+late_initcall(init_lapic_nmi_sysfs);
+
+#endif	/* CONFIG_PM */
+
+static void __acpi_nmi_enable(void *__unused)
+{
+	apic_write_around(APIC_LVT0, APIC_DM_NMI);
+}
+
+/*
+ * Enable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_enable(void)
+{
+	if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
+		on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
+}
+
+static void __acpi_nmi_disable(void *__unused)
+{
+	apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
+}
+
+/*
+ * Disable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_disable(void)
+{
+	if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
+		on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
+}
+
+void setup_apic_nmi_watchdog(void *unused)
+{
+	if (__get_cpu_var(wd_enabled))
+		return;
+
+	/* cheap hack to support suspend/resume */
+	/* if cpu0 is not active neither should the other cpus */
+	if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0)
+		return;
+
+	switch (nmi_watchdog) {
+	case NMI_LOCAL_APIC:
+		 /* enable it before to avoid race with handler */
+		__get_cpu_var(wd_enabled) = 1;
+		if (lapic_watchdog_init(nmi_hz) < 0) {
+			__get_cpu_var(wd_enabled) = 0;
+			return;
+		}
+		/* FALL THROUGH */
+	case NMI_IO_APIC:
+		__get_cpu_var(wd_enabled) = 1;
+		atomic_inc(&nmi_active);
+	}
+}
+
+void stop_apic_nmi_watchdog(void *unused)
+{
+	/* only support LOCAL and IO APICs for now */
+	if (nmi_watchdog != NMI_LOCAL_APIC &&
+	    nmi_watchdog != NMI_IO_APIC)
+		return;
+	if (__get_cpu_var(wd_enabled) == 0)
+		return;
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		lapic_watchdog_stop();
+	__get_cpu_var(wd_enabled) = 0;
+	atomic_dec(&nmi_active);
+}
+
+/*
+ * the best way to detect whether a CPU has a 'hard lockup' problem
+ * is to check it's local APIC timer IRQ counts. If they are not
+ * changing then that CPU has some problem.
+ *
+ * as these watchdog NMI IRQs are generated on every CPU, we only
+ * have to check the current processor.
+ *
+ * since NMIs don't listen to _any_ locks, we have to be extremely
+ * careful not to rely on unsafe variables. The printk might lock
+ * up though, so we have to break up any console locks first ...
+ * [when there will be more tty-related locks, break them up here too!]
+ */
+
+static DEFINE_PER_CPU(unsigned, last_irq_sum);
+static DEFINE_PER_CPU(local_t, alert_counter);
+static DEFINE_PER_CPU(int, nmi_touch);
+
+void touch_nmi_watchdog(void)
+{
+	if (nmi_watchdog > 0) {
+		unsigned cpu;
+
+		/*
+		 * Tell other CPUs to reset their alert counters. We cannot
+		 * do it ourselves because the alert count increase is not
+		 * atomic.
+		 */
+		for_each_present_cpu(cpu) {
+			if (per_cpu(nmi_touch, cpu) != 1)
+				per_cpu(nmi_touch, cpu) = 1;
+		}
+	}
+
+	/*
+	 * Tickle the softlockup detector too:
+	 */
+	touch_softlockup_watchdog();
+}
+EXPORT_SYMBOL(touch_nmi_watchdog);
+
+notrace __kprobes int
+nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
+{
+	/*
+	 * Since current_thread_info()-> is always on the stack, and we
+	 * always switch the stack NMI-atomically, it's safe to use
+	 * smp_processor_id().
+	 */
+	unsigned int sum;
+	int touched = 0;
+	int cpu = smp_processor_id();
+	int rc = 0;
+
+	/* check for other users first */
+	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
+			== NOTIFY_STOP) {
+		rc = 1;
+		touched = 1;
+	}
+
+	sum = get_timer_irqs(cpu);
+
+	if (__get_cpu_var(nmi_touch)) {
+		__get_cpu_var(nmi_touch) = 0;
+		touched = 1;
+	}
+
+	if (cpu_isset(cpu, backtrace_mask)) {
+		static DEFINE_SPINLOCK(lock);	/* Serialise the printks */
+
+		spin_lock(&lock);
+		printk("NMI backtrace for cpu %d\n", cpu);
+		dump_stack();
+		spin_unlock(&lock);
+		cpu_clear(cpu, backtrace_mask);
+	}
+
+	/* Could check oops_in_progress here too, but it's safer not to */
+	if (mce_in_progress())
+		touched = 1;
+
+	/* if the none of the timers isn't firing, this cpu isn't doing much */
+	if (!touched && __get_cpu_var(last_irq_sum) == sum) {
+		/*
+		 * Ayiee, looks like this CPU is stuck ...
+		 * wait a few IRQs (5 seconds) before doing the oops ...
+		 */
+		local_inc(&__get_cpu_var(alert_counter));
+		if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz)
+			/*
+			 * die_nmi will return ONLY if NOTIFY_STOP happens..
+			 */
+			die_nmi("BUG: NMI Watchdog detected LOCKUP",
+				regs, panic_on_timeout);
+	} else {
+		__get_cpu_var(last_irq_sum) = sum;
+		local_set(&__get_cpu_var(alert_counter), 0);
+	}
+
+	/* see if the nmi watchdog went off */
+	if (!__get_cpu_var(wd_enabled))
+		return rc;
+	switch (nmi_watchdog) {
+	case NMI_LOCAL_APIC:
+		rc |= lapic_wd_event(nmi_hz);
+		break;
+	case NMI_IO_APIC:
+		/*
+		 * don't know how to accurately check for this.
+		 * just assume it was a watchdog timer interrupt
+		 * This matches the old behaviour.
+		 */
+		rc = 1;
+		break;
+	}
+	return rc;
+}
+
+#ifdef CONFIG_SYSCTL
+
+static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
+{
+	unsigned char reason = get_nmi_reason();
+	char buf[64];
+
+	sprintf(buf, "NMI received for unknown reason %02x\n", reason);
+	die_nmi(buf, regs, 1); /* Always panic here */
+	return 0;
+}
+
+/*
+ * proc handler for /proc/sys/kernel/nmi
+ */
+int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
+			void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int old_state;
+
+	nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
+	old_state = nmi_watchdog_enabled;
+	proc_dointvec(table, write, file, buffer, length, ppos);
+	if (!!old_state == !!nmi_watchdog_enabled)
+		return 0;
+
+	if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) {
+		printk(KERN_WARNING
+			"NMI watchdog is permanently disabled\n");
+		return -EIO;
+	}
+
+	/* if nmi_watchdog is not set yet, then set it */
+	nmi_watchdog_default();
+
+	if (nmi_watchdog == NMI_LOCAL_APIC) {
+		if (nmi_watchdog_enabled)
+			enable_lapic_nmi_watchdog();
+		else
+			disable_lapic_nmi_watchdog();
+	} else {
+		printk(KERN_WARNING
+			"NMI watchdog doesn't know what hardware to touch\n");
+		return -EIO;
+	}
+	return 0;
+}
+
+#endif /* CONFIG_SYSCTL */
+
+int do_nmi_callback(struct pt_regs *regs, int cpu)
+{
+#ifdef CONFIG_SYSCTL
+	if (unknown_nmi_panic)
+		return unknown_nmi_panic_callback(regs, cpu);
+#endif
+	return 0;
+}
+
+void __trigger_all_cpu_backtrace(void)
+{
+	int i;
+
+	backtrace_mask = cpu_online_map;
+	/* Wait for up to 10 seconds for all CPUs to do the backtrace */
+	for (i = 0; i < 10 * 1000; i++) {
+		if (cpus_empty(backtrace_mask))
+			break;
+		mdelay(1);
+	}
+}
+
+EXPORT_SYMBOL(nmi_active);
+EXPORT_SYMBOL(nmi_watchdog);
+
diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
deleted file mode 100644
index a94dbedf2b74..000000000000
--- a/arch/x86/kernel/nmi_32.c
+++ /dev/null
@@ -1,506 +0,0 @@
-/*
- *  NMI watchdog support on APIC systems
- *
- *  Started by Ingo Molnar <mingo@redhat.com>
- *
- *  Fixes:
- *  Mikael Pettersson	: AMD K7 support for local APIC NMI watchdog.
- *  Mikael Pettersson	: Power Management for local APIC NMI watchdog.
- *  Mikael Pettersson	: Pentium 4 support for local APIC NMI watchdog.
- *  Pavel Machek and
- *  Mikael Pettersson	: PM converted to driver model. Disable/enable API.
- */
-
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/nmi.h>
-#include <linux/sysdev.h>
-#include <linux/sysctl.h>
-#include <linux/percpu.h>
-#include <linux/kprobes.h>
-#include <linux/cpumask.h>
-#include <linux/kernel_stat.h>
-#include <linux/kdebug.h>
-#include <linux/slab.h>
-
-#include <asm/smp.h>
-#include <asm/nmi.h>
-#include <asm/timer.h>
-
-#include "mach_traps.h"
-
-int unknown_nmi_panic;
-int nmi_watchdog_enabled;
-
-static cpumask_t backtrace_mask = CPU_MASK_NONE;
-
-/* nmi_active:
- * >0: the lapic NMI watchdog is active, but can be disabled
- * <0: the lapic NMI watchdog has not been set up, and cannot
- *     be enabled
- *  0: the lapic NMI watchdog is disabled, but can be enabled
- */
-atomic_t nmi_active = ATOMIC_INIT(0);		/* oprofile uses this */
-static int panic_on_timeout;
-
-unsigned int nmi_watchdog = NMI_DEFAULT;
-static unsigned int nmi_hz = HZ;
-
-static DEFINE_PER_CPU(short, wd_enabled);
-
-static int endflag __initdata = 0;
-
-static inline unsigned int get_nmi_count(int cpu)
-{
-	return nmi_count(cpu);
-}
-
-static inline int mce_in_progress(void)
-{
-	return 0;
-}
-
-/*
- * Take the local apic timer and PIT/HPET into account. We don't
- * know which one is active, when we have highres/dyntick on
- */
-static inline unsigned int get_timer_irqs(int cpu)
-{
-	return per_cpu(irq_stat, cpu).apic_timer_irqs +
-		per_cpu(irq_stat, cpu).irq0_irqs;
-}
-
-/* Run after command line and cpu_init init, but before all other checks */
-void nmi_watchdog_default(void)
-{
-	if (nmi_watchdog != NMI_DEFAULT)
-		return;
-	if (lapic_watchdog_ok())
-		nmi_watchdog = NMI_LOCAL_APIC;
-	else
-		nmi_watchdog = NMI_IO_APIC;
-}
-
-#ifdef CONFIG_SMP
-/* The performance counters used by NMI_LOCAL_APIC don't trigger when
- * the CPU is idle. To make sure the NMI watchdog really ticks on all
- * CPUs during the test make them busy.
- */
-static __init void nmi_cpu_busy(void *data)
-{
-	local_irq_enable_in_hardirq();
-	/* Intentionally don't use cpu_relax here. This is
-	   to make sure that the performance counter really ticks,
-	   even if there is a simulator or similar that catches the
-	   pause instruction. On a real HT machine this is fine because
-	   all other CPUs are busy with "useless" delay loops and don't
-	   care if they get somewhat less cycles. */
-	while (endflag == 0)
-		mb();
-}
-#endif
-
-int __init check_nmi_watchdog(void)
-{
-	unsigned int *prev_nmi_count;
-	int cpu;
-
-	if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED))
-		return 0;
-
-	if (!atomic_read(&nmi_active))
-		return 0;
-
-	prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
-	if (!prev_nmi_count)
-		goto error;
-
-	printk(KERN_INFO "Testing NMI watchdog ... ");
-
-#ifdef CONFIG_SMP
-	if (nmi_watchdog == NMI_LOCAL_APIC)
-		smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
-#endif
-
-	for_each_possible_cpu(cpu)
-		prev_nmi_count[cpu] = get_nmi_count(cpu);
-	local_irq_enable();
-	mdelay((20*1000)/nmi_hz); // wait 20 ticks
-
-	for_each_online_cpu(cpu) {
-		if (!per_cpu(wd_enabled, cpu))
-			continue;
-		if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
-			printk(KERN_WARNING "WARNING: CPU#%d: NMI "
-				"appears to be stuck (%d->%d)!\n",
-				cpu,
-				prev_nmi_count[cpu],
-				get_nmi_count(cpu));
-			per_cpu(wd_enabled, cpu) = 0;
-			atomic_dec(&nmi_active);
-		}
-	}
-	endflag = 1;
-	if (!atomic_read(&nmi_active)) {
-		kfree(prev_nmi_count);
-		atomic_set(&nmi_active, -1);
-		goto error;
-	}
-	printk("OK.\n");
-
-	/* now that we know it works we can reduce NMI frequency to
-	   something more reasonable; makes a difference in some configs */
-	if (nmi_watchdog == NMI_LOCAL_APIC)
-		nmi_hz = lapic_adjust_nmi_hz(1);
-
-	kfree(prev_nmi_count);
-	return 0;
-error:
-	timer_ack = !cpu_has_tsc;
-
-	return -1;
-}
-
-static int __init setup_nmi_watchdog(char *str)
-{
-	int nmi;
-
-	if (!strncmp(str, "panic", 5)) {
-		panic_on_timeout = 1;
-		str = strchr(str, ',');
-		if (!str)
-			return 1;
-		++str;
-	}
-
-	get_option(&str, &nmi);
-
-	if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
-		return 0;
-
-	nmi_watchdog = nmi;
-	return 1;
-}
-
-__setup("nmi_watchdog=", setup_nmi_watchdog);
-
-
-/* Suspend/resume support */
-
-#ifdef CONFIG_PM
-
-static int nmi_pm_active; /* nmi_active before suspend */
-
-static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
-{
-	/* only CPU0 goes here, other CPUs should be offline */
-	nmi_pm_active = atomic_read(&nmi_active);
-	stop_apic_nmi_watchdog(NULL);
-	BUG_ON(atomic_read(&nmi_active) != 0);
-	return 0;
-}
-
-static int lapic_nmi_resume(struct sys_device *dev)
-{
-	/* only CPU0 goes here, other CPUs should be offline */
-	if (nmi_pm_active > 0) {
-		setup_apic_nmi_watchdog(NULL);
-		touch_nmi_watchdog();
-	}
-	return 0;
-}
-
-
-static struct sysdev_class nmi_sysclass = {
-	.name		= "lapic_nmi",
-	.resume		= lapic_nmi_resume,
-	.suspend	= lapic_nmi_suspend,
-};
-
-static struct sys_device device_lapic_nmi = {
-	.id	= 0,
-	.cls	= &nmi_sysclass,
-};
-
-static int __init init_lapic_nmi_sysfs(void)
-{
-	int error;
-
-	/* should really be a BUG_ON but b/c this is an
-	 * init call, it just doesn't work.  -dcz
-	 */
-	if (nmi_watchdog != NMI_LOCAL_APIC)
-		return 0;
-
-	if (atomic_read(&nmi_active) < 0)
-		return 0;
-
-	error = sysdev_class_register(&nmi_sysclass);
-	if (!error)
-		error = sysdev_register(&device_lapic_nmi);
-	return error;
-}
-/* must come after the local APIC's device_initcall() */
-late_initcall(init_lapic_nmi_sysfs);
-
-#endif	/* CONFIG_PM */
-
-static void __acpi_nmi_enable(void *__unused)
-{
-	apic_write_around(APIC_LVT0, APIC_DM_NMI);
-}
-
-/*
- * Enable timer based NMIs on all CPUs:
- */
-void acpi_nmi_enable(void)
-{
-	if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
-		on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
-}
-
-static void __acpi_nmi_disable(void *__unused)
-{
-	apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
-}
-
-/*
- * Disable timer based NMIs on all CPUs:
- */
-void acpi_nmi_disable(void)
-{
-	if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
-		on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
-}
-
-void setup_apic_nmi_watchdog(void *unused)
-{
-	if (__get_cpu_var(wd_enabled))
-		return;
-
-	/* cheap hack to support suspend/resume */
-	/* if cpu0 is not active neither should the other cpus */
-	if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
-		return;
-
-	switch (nmi_watchdog) {
-	case NMI_LOCAL_APIC:
-		__get_cpu_var(wd_enabled) = 1; /* enable it before to avoid race with handler */
-		if (lapic_watchdog_init(nmi_hz) < 0) {
-			__get_cpu_var(wd_enabled) = 0;
-			return;
-		}
-		/* FALL THROUGH */
-	case NMI_IO_APIC:
-		__get_cpu_var(wd_enabled) = 1;
-		atomic_inc(&nmi_active);
-	}
-}
-
-void stop_apic_nmi_watchdog(void *unused)
-{
-	/* only support LOCAL and IO APICs for now */
-	if ((nmi_watchdog != NMI_LOCAL_APIC) &&
-	    (nmi_watchdog != NMI_IO_APIC))
-	    	return;
-	if (__get_cpu_var(wd_enabled) == 0)
-		return;
-	if (nmi_watchdog == NMI_LOCAL_APIC)
-		lapic_watchdog_stop();
-	__get_cpu_var(wd_enabled) = 0;
-	atomic_dec(&nmi_active);
-}
-
-/*
- * the best way to detect whether a CPU has a 'hard lockup' problem
- * is to check it's local APIC timer IRQ counts. If they are not
- * changing then that CPU has some problem.
- *
- * as these watchdog NMI IRQs are generated on every CPU, we only
- * have to check the current processor.
- *
- * since NMIs don't listen to _any_ locks, we have to be extremely
- * careful not to rely on unsafe variables. The printk might lock
- * up though, so we have to break up any console locks first ...
- * [when there will be more tty-related locks, break them up
- *  here too!]
- */
-
-static DEFINE_PER_CPU(unsigned, last_irq_sum);
-static DEFINE_PER_CPU(local_t, alert_counter);
-static DEFINE_PER_CPU(int, nmi_touch);
-
-void touch_nmi_watchdog(void)
-{
-	if (nmi_watchdog > 0) {
-		unsigned cpu;
-
-		/*
-		 * Tell other CPUs to reset their alert counters. We cannot
-		 * do it ourselves because the alert count increase is not
-		 * atomic.
-		 */
-		for_each_present_cpu(cpu) {
-			if (per_cpu(nmi_touch, cpu) != 1)
-				per_cpu(nmi_touch, cpu) = 1;
-		}
-	}
-
-	/*
-	 * Tickle the softlockup detector too:
-	 */
-	touch_softlockup_watchdog();
-}
-EXPORT_SYMBOL(touch_nmi_watchdog);
-
-notrace __kprobes int
-nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
-{
-
-	/*
-	 * Since current_thread_info()-> is always on the stack, and we
-	 * always switch the stack NMI-atomically, it's safe to use
-	 * smp_processor_id().
-	 */
-	unsigned int sum;
-	int touched = 0;
-	int cpu = smp_processor_id();
-	int rc = 0;
-
-	/* check for other users first */
-	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
-			== NOTIFY_STOP) {
-		rc = 1;
-		touched = 1;
-	}
-
-	sum = get_timer_irqs(cpu);
-
-	if (__get_cpu_var(nmi_touch)) {
-		__get_cpu_var(nmi_touch) = 0;
-		touched = 1;
-	}
-
-	if (cpu_isset(cpu, backtrace_mask)) {
-		static DEFINE_SPINLOCK(lock);	/* Serialise the printks */
-
-		spin_lock(&lock);
-		printk("NMI backtrace for cpu %d\n", cpu);
-		dump_stack();
-		spin_unlock(&lock);
-		cpu_clear(cpu, backtrace_mask);
-	}
-
-	/* Could check oops_in_progress here too, but it's safer not to */
-	if (mce_in_progress())
-		touched = 1;
-
-	/* if the none of the timers isn't firing, this cpu isn't doing much */
-	if (!touched && __get_cpu_var(last_irq_sum) == sum) {
-		/*
-		 * Ayiee, looks like this CPU is stuck ...
-		 * wait a few IRQs (5 seconds) before doing the oops ...
-		 */
-		local_inc(&__get_cpu_var(alert_counter));
-		if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz)
-			/*
-			 * die_nmi will return ONLY if NOTIFY_STOP happens..
-			 */
-			die_nmi("BUG: NMI Watchdog detected LOCKUP",
-				regs, panic_on_timeout);
-	} else {
-		__get_cpu_var(last_irq_sum) = sum;
-		local_set(&__get_cpu_var(alert_counter), 0);
-	}
-	/* see if the nmi watchdog went off */
-	if (!__get_cpu_var(wd_enabled))
-		return rc;
-	switch (nmi_watchdog) {
-	case NMI_LOCAL_APIC:
-		rc |= lapic_wd_event(nmi_hz);
-		break;
-	case NMI_IO_APIC:
-		/* don't know how to accurately check for this.
-		 * just assume it was a watchdog timer interrupt
-		 * This matches the old behaviour.
-		 */
-		rc = 1;
-		break;
-	}
-	return rc;
-}
-
-#ifdef CONFIG_SYSCTL
-
-static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
-{
-	unsigned char reason = get_nmi_reason();
-	char buf[64];
-
-	sprintf(buf, "NMI received for unknown reason %02x\n", reason);
-	die_nmi(buf, regs, 1); /* Always panic here */
-	return 0;
-}
-
-/*
- * proc handler for /proc/sys/kernel/nmi
- */
-int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
-			void __user *buffer, size_t *length, loff_t *ppos)
-{
-	int old_state;
-
-	nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
-	old_state = nmi_watchdog_enabled;
-	proc_dointvec(table, write, file, buffer, length, ppos);
-	if (!!old_state == !!nmi_watchdog_enabled)
-		return 0;
-
-	if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) {
-		printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
-		return -EIO;
-	}
-
-	/* if nmi_watchdog is not set yet, then set it */
-	nmi_watchdog_default();
-
-	if (nmi_watchdog == NMI_LOCAL_APIC) {
-		if (nmi_watchdog_enabled)
-			enable_lapic_nmi_watchdog();
-		else
-			disable_lapic_nmi_watchdog();
-	} else {
-		printk( KERN_WARNING
-			"NMI watchdog doesn't know what hardware to touch\n");
-		return -EIO;
-	}
-	return 0;
-}
-
-#endif
-
-int do_nmi_callback(struct pt_regs *regs, int cpu)
-{
-#ifdef CONFIG_SYSCTL
-	if (unknown_nmi_panic)
-		return unknown_nmi_panic_callback(regs, cpu);
-#endif
-	return 0;
-}
-
-void __trigger_all_cpu_backtrace(void)
-{
-	int i;
-
-	backtrace_mask = cpu_online_map;
-	/* Wait for up to 10 seconds for all CPUs to do the backtrace */
-	for (i = 0; i < 10 * 1000; i++) {
-		if (cpus_empty(backtrace_mask))
-			break;
-		mdelay(1);
-	}
-}
-
-EXPORT_SYMBOL(nmi_active);
-EXPORT_SYMBOL(nmi_watchdog);
diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c
deleted file mode 100644
index c6802447262c..000000000000
--- a/arch/x86/kernel/nmi_64.c
+++ /dev/null
@@ -1,477 +0,0 @@
-/*
- *  NMI watchdog support on APIC systems
- *
- *  Started by Ingo Molnar <mingo@redhat.com>
- *
- *  Fixes:
- *  Mikael Pettersson	: AMD K7 support for local APIC NMI watchdog.
- *  Mikael Pettersson	: Power Management for local APIC NMI watchdog.
- *  Pavel Machek and
- *  Mikael Pettersson	: PM converted to driver model. Disable/enable API.
- */
-
-#include <linux/nmi.h>
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/sysdev.h>
-#include <linux/sysctl.h>
-#include <linux/kprobes.h>
-#include <linux/cpumask.h>
-#include <linux/kdebug.h>
-
-#include <asm/smp.h>
-#include <asm/nmi.h>
-#include <asm/proto.h>
-#include <asm/mce.h>
-
-#include <mach_traps.h>
-
-int unknown_nmi_panic;
-int nmi_watchdog_enabled;
-
-static cpumask_t backtrace_mask = CPU_MASK_NONE;
-
-/* nmi_active:
- * >0: the lapic NMI watchdog is active, but can be disabled
- * <0: the lapic NMI watchdog has not been set up, and cannot
- *     be enabled
- *  0: the lapic NMI watchdog is disabled, but can be enabled
- */
-atomic_t nmi_active = ATOMIC_INIT(0);		/* oprofile uses this */
-static int panic_on_timeout;
-
-unsigned int nmi_watchdog = NMI_DEFAULT;
-static unsigned int nmi_hz = HZ;
-
-static DEFINE_PER_CPU(short, wd_enabled);
-
-static int endflag __initdata = 0;
-
-static inline unsigned int get_nmi_count(int cpu)
-{
-	return cpu_pda(cpu)->__nmi_count;
-}
-
-static inline int mce_in_progress(void)
-{
-#ifdef CONFIG_X86_MCE
-	return atomic_read(&mce_entry) > 0;
-#endif
-	return 0;
-}
-
-/*
- * Take the local apic timer and PIT/HPET into account. We don't
- * know which one is active, when we have highres/dyntick on
- */
-static inline unsigned int get_timer_irqs(int cpu)
-{
-	return read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
-}
-
-/* Run after command line and cpu_init init, but before all other checks */
-void nmi_watchdog_default(void)
-{
-	if (nmi_watchdog != NMI_DEFAULT)
-		return;
-	nmi_watchdog = NMI_NONE;
-}
-
-#ifdef CONFIG_SMP
-/* The performance counters used by NMI_LOCAL_APIC don't trigger when
- * the CPU is idle. To make sure the NMI watchdog really ticks on all
- * CPUs during the test make them busy.
- */
-static __init void nmi_cpu_busy(void *data)
-{
-	local_irq_enable_in_hardirq();
-	/* Intentionally don't use cpu_relax here. This is
-	   to make sure that the performance counter really ticks,
-	   even if there is a simulator or similar that catches the
-	   pause instruction. On a real HT machine this is fine because
-	   all other CPUs are busy with "useless" delay loops and don't
-	   care if they get somewhat less cycles. */
-	while (endflag == 0)
-		mb();
-}
-#endif
-
-int __init check_nmi_watchdog(void)
-{
-	unsigned int *prev_nmi_count;
-	int cpu;
-
-	if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED))
-		return 0;
-
-	if (!atomic_read(&nmi_active))
-		return 0;
-
-	prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
-	if (!prev_nmi_count)
-		return -1;
-
-	printk(KERN_INFO "Testing NMI watchdog ... ");
-
-#ifdef CONFIG_SMP
-	if (nmi_watchdog == NMI_LOCAL_APIC)
-		smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
-#endif
-
-	for_each_possible_cpu(cpu)
-		prev_nmi_count[cpu] = get_nmi_count(cpu);
-	local_irq_enable();
-	mdelay((20*1000)/nmi_hz); // wait 20 ticks
-
-	for_each_online_cpu(cpu) {
-		if (!per_cpu(wd_enabled, cpu))
-			continue;
-		if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
-			printk(KERN_WARNING "WARNING: CPU#%d: NMI "
-			       "appears to be stuck (%d->%d)!\n",
-				cpu,
-				prev_nmi_count[cpu],
-				get_nmi_count(cpu));
-			per_cpu(wd_enabled, cpu) = 0;
-			atomic_dec(&nmi_active);
-		}
-	}
-	endflag = 1;
-	if (!atomic_read(&nmi_active)) {
-		kfree(prev_nmi_count);
-		atomic_set(&nmi_active, -1);
-		return -1;
-	}
-	printk("OK.\n");
-
-	/* now that we know it works we can reduce NMI frequency to
-	   something more reasonable; makes a difference in some configs */
-	if (nmi_watchdog == NMI_LOCAL_APIC)
-		nmi_hz = lapic_adjust_nmi_hz(1);
-
-	kfree(prev_nmi_count);
-	return 0;
-}
-
-static int __init setup_nmi_watchdog(char *str)
-{
-	int nmi;
-
-	if (!strncmp(str,"panic",5)) {
-		panic_on_timeout = 1;
-		str = strchr(str, ',');
-		if (!str)
-			return 1;
-		++str;
-	}
-
-	get_option(&str, &nmi);
-
-	if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
-		return 0;
-
-	nmi_watchdog = nmi;
-	return 1;
-}
-
-__setup("nmi_watchdog=", setup_nmi_watchdog);
-
-#ifdef CONFIG_PM
-
-static int nmi_pm_active; /* nmi_active before suspend */
-
-static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
-{
-	/* only CPU0 goes here, other CPUs should be offline */
-	nmi_pm_active = atomic_read(&nmi_active);
-	stop_apic_nmi_watchdog(NULL);
-	BUG_ON(atomic_read(&nmi_active) != 0);
-	return 0;
-}
-
-static int lapic_nmi_resume(struct sys_device *dev)
-{
-	/* only CPU0 goes here, other CPUs should be offline */
-	if (nmi_pm_active > 0) {
-		setup_apic_nmi_watchdog(NULL);
-		touch_nmi_watchdog();
-	}
-	return 0;
-}
-
-static struct sysdev_class nmi_sysclass = {
-	.name		= "lapic_nmi",
-	.resume		= lapic_nmi_resume,
-	.suspend	= lapic_nmi_suspend,
-};
-
-static struct sys_device device_lapic_nmi = {
-	.id	= 0,
-	.cls	= &nmi_sysclass,
-};
-
-static int __init init_lapic_nmi_sysfs(void)
-{
-	int error;
-
-	/* should really be a BUG_ON but b/c this is an
-	 * init call, it just doesn't work.  -dcz
-	 */
-	if (nmi_watchdog != NMI_LOCAL_APIC)
-		return 0;
-
-	if (atomic_read(&nmi_active) < 0)
-		return 0;
-
-	error = sysdev_class_register(&nmi_sysclass);
-	if (!error)
-		error = sysdev_register(&device_lapic_nmi);
-	return error;
-}
-/* must come after the local APIC's device_initcall() */
-late_initcall(init_lapic_nmi_sysfs);
-
-#endif	/* CONFIG_PM */
-
-static void __acpi_nmi_enable(void *__unused)
-{
-	apic_write_around(APIC_LVT0, APIC_DM_NMI);
-}
-
-/*
- * Enable timer based NMIs on all CPUs:
- */
-void acpi_nmi_enable(void)
-{
-	if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
-		on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
-}
-
-static void __acpi_nmi_disable(void *__unused)
-{
-	apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
-}
-
-/*
- * Disable timer based NMIs on all CPUs:
- */
-void acpi_nmi_disable(void)
-{
-	if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
-		on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
-}
-
-void setup_apic_nmi_watchdog(void *unused)
-{
-	if (__get_cpu_var(wd_enabled))
-		return;
-
-	/* cheap hack to support suspend/resume */
-	/* if cpu0 is not active neither should the other cpus */
-	if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
-		return;
-
-	switch (nmi_watchdog) {
-	case NMI_LOCAL_APIC:
-		__get_cpu_var(wd_enabled) = 1;
-		if (lapic_watchdog_init(nmi_hz) < 0) {
-			__get_cpu_var(wd_enabled) = 0;
-			return;
-		}
-		/* FALL THROUGH */
-	case NMI_IO_APIC:
-		__get_cpu_var(wd_enabled) = 1;
-		atomic_inc(&nmi_active);
-	}
-}
-
-void stop_apic_nmi_watchdog(void *unused)
-{
-	/* only support LOCAL and IO APICs for now */
-	if ((nmi_watchdog != NMI_LOCAL_APIC) &&
-	    (nmi_watchdog != NMI_IO_APIC))
-	    	return;
-	if (__get_cpu_var(wd_enabled) == 0)
-		return;
-	if (nmi_watchdog == NMI_LOCAL_APIC)
-		lapic_watchdog_stop();
-	__get_cpu_var(wd_enabled) = 0;
-	atomic_dec(&nmi_active);
-}
-
-/*
- * the best way to detect whether a CPU has a 'hard lockup' problem
- * is to check it's local APIC timer IRQ counts. If they are not
- * changing then that CPU has some problem.
- *
- * as these watchdog NMI IRQs are generated on every CPU, we only
- * have to check the current processor.
- */
-
-static DEFINE_PER_CPU(unsigned, last_irq_sum);
-static DEFINE_PER_CPU(local_t, alert_counter);
-static DEFINE_PER_CPU(int, nmi_touch);
-
-void touch_nmi_watchdog(void)
-{
-	if (nmi_watchdog > 0) {
-		unsigned cpu;
-
-		/*
- 		 * Tell other CPUs to reset their alert counters. We cannot
-		 * do it ourselves because the alert count increase is not
-		 * atomic.
-		 */
-		for_each_present_cpu(cpu) {
-			if (per_cpu(nmi_touch, cpu) != 1)
-				per_cpu(nmi_touch, cpu) = 1;
-		}
-	}
-
-	touch_softlockup_watchdog();
-}
-EXPORT_SYMBOL(touch_nmi_watchdog);
-
-notrace __kprobes int
-nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
-{
-	unsigned int sum;
-	int touched = 0;
-	int cpu = smp_processor_id();
-	int rc = 0;
-
-	/* check for other users first */
-	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
-			== NOTIFY_STOP) {
-		rc = 1;
-		touched = 1;
-	}
-
-	sum = get_timer_irqs(cpu);
-
-	if (__get_cpu_var(nmi_touch)) {
-		__get_cpu_var(nmi_touch) = 0;
-		touched = 1;
-	}
-
-	if (cpu_isset(cpu, backtrace_mask)) {
-		static DEFINE_SPINLOCK(lock);	/* Serialise the printks */
-
-		spin_lock(&lock);
-		printk("NMI backtrace for cpu %d\n", cpu);
-		dump_stack();
-		spin_unlock(&lock);
-		cpu_clear(cpu, backtrace_mask);
-	}
-
-	if (mce_in_progress())
-		touched = 1;
-
-	/* if the apic timer isn't firing, this cpu isn't doing much */
-	if (!touched && __get_cpu_var(last_irq_sum) == sum) {
-		/*
-		 * Ayiee, looks like this CPU is stuck ...
-		 * wait a few IRQs (5 seconds) before doing the oops ...
-		 */
-		local_inc(&__get_cpu_var(alert_counter));
-		if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz)
-			die_nmi("NMI Watchdog detected LOCKUP",
-				regs, panic_on_timeout);
-	} else {
-		__get_cpu_var(last_irq_sum) = sum;
-		local_set(&__get_cpu_var(alert_counter), 0);
-	}
-
-	/* see if the nmi watchdog went off */
-	if (!__get_cpu_var(wd_enabled))
-		return rc;
-	switch (nmi_watchdog) {
-	case NMI_LOCAL_APIC:
-		rc |= lapic_wd_event(nmi_hz);
-		break;
-	case NMI_IO_APIC:
-		/* don't know how to accurately check for this.
-		 * just assume it was a watchdog timer interrupt
-		 * This matches the old behaviour.
-		 */
-		rc = 1;
-		break;
-	}
-	return rc;
-}
-
-#ifdef CONFIG_SYSCTL
-
-static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
-{
-	unsigned char reason = get_nmi_reason();
-	char buf[64];
-
-	sprintf(buf, "NMI received for unknown reason %02x\n", reason);
-	die_nmi(buf, regs, 1);	/* Always panic here */
-	return 0;
-}
-
-/*
- * proc handler for /proc/sys/kernel/nmi
- */
-int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
-			void __user *buffer, size_t *length, loff_t *ppos)
-{
-	int old_state;
-
-	nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
-	old_state = nmi_watchdog_enabled;
-	proc_dointvec(table, write, file, buffer, length, ppos);
-	if (!!old_state == !!nmi_watchdog_enabled)
-		return 0;
-
-	if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) {
-		printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
-		return -EIO;
-	}
-
-	/* if nmi_watchdog is not set yet, then set it */
-	nmi_watchdog_default();
-
-	if (nmi_watchdog == NMI_LOCAL_APIC) {
-		if (nmi_watchdog_enabled)
-			enable_lapic_nmi_watchdog();
-		else
-			disable_lapic_nmi_watchdog();
-	} else {
-		printk( KERN_WARNING
-			"NMI watchdog doesn't know what hardware to touch\n");
-		return -EIO;
-	}
-	return 0;
-}
-
-#endif
-
-int do_nmi_callback(struct pt_regs *regs, int cpu)
-{
-#ifdef CONFIG_SYSCTL
-	if (unknown_nmi_panic)
-		return unknown_nmi_panic_callback(regs, cpu);
-#endif
-	return 0;
-}
-
-void __trigger_all_cpu_backtrace(void)
-{
-	int i;
-
-	backtrace_mask = cpu_online_map;
-	/* Wait for up to 10 seconds for all CPUs to do the backtrace */
-	for (i = 0; i < 10 * 1000; i++) {
-		if (cpus_empty(backtrace_mask))
-			break;
-		mdelay(1);
-	}
-}
-
-EXPORT_SYMBOL(nmi_active);
-EXPORT_SYMBOL(nmi_watchdog);
-- 
cgit v1.2.3


From a15af1c9ea2750a9ff01e51615c45950bad8221b Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Mon, 26 May 2008 23:31:06 +0100
Subject: x86/paravirt: add pte_flags to just get pte flags

Add pte_flags() to extract the flags from a pte.  This is a special
case of pte_val() which is only guaranteed to return the pte's flags
correctly; the page number may be corrupted or missing.

The intent is to allow paravirt implementations to return pte flags
without having to do any translation of the page number (most notably,
Xen).

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/paravirt.c |  1 +
 arch/x86/xen/enlighten.c   |  1 +
 drivers/lguest/lg.h        |  1 -
 include/asm-x86/page.h     |  1 +
 include/asm-x86/paravirt.h | 15 +++++++++++++++
 include/asm-x86/pgtable.h  | 16 ++++++++--------
 6 files changed, 26 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 74f0c5ea2a03..c98d54688180 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -403,6 +403,7 @@ struct pv_mmu_ops pv_mmu_ops = {
 #endif /* PAGETABLE_LEVELS >= 3 */
 
 	.pte_val = native_pte_val,
+	.pte_flags = native_pte_val,
 	.pgd_val = native_pgd_val,
 
 	.make_pte = native_make_pte,
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 4a372b71239d..1b4b5fa498b3 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1101,6 +1101,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
 	.set_pmd = xen_set_pmd,
 
 	.pte_val = xen_pte_val,
+	.pte_flags = native_pte_val,
 	.pgd_val = xen_pgd_val,
 
 	.make_pte = xen_make_pte,
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 005bd045d2eb..5faefeaf6790 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -136,7 +136,6 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
  * first step in the migration to the kernel types.  pte_pfn is already defined
  * in the kernel. */
 #define pgd_flags(x)	(pgd_val(x) & ~PAGE_MASK)
-#define pte_flags(x)	(pte_val(x) & ~PAGE_MASK)
 #define pgd_pfn(x)	(pgd_val(x) >> PAGE_SHIFT)
 
 /* interrupts_and_traps.c: */
diff --git a/include/asm-x86/page.h b/include/asm-x86/page.h
index dc936dddf161..a1e2b9470f25 100644
--- a/include/asm-x86/page.h
+++ b/include/asm-x86/page.h
@@ -160,6 +160,7 @@ static inline pteval_t native_pte_val(pte_t pte)
 #endif
 
 #define pte_val(x)	native_pte_val(x)
+#define pte_flags(x)	native_pte_val(x)
 #define __pte(x)	native_make_pte(x)
 
 #endif	/* CONFIG_PARAVIRT */
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
index 0f13b945e240..5ea37a48eecb 100644
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -239,6 +239,7 @@ struct pv_mmu_ops {
 				 unsigned long addr, pte_t *ptep);
 
 	pteval_t (*pte_val)(pte_t);
+	pteval_t (*pte_flags)(pte_t);
 	pte_t (*make_pte)(pteval_t pte);
 
 	pgdval_t (*pgd_val)(pgd_t);
@@ -996,6 +997,20 @@ static inline pteval_t pte_val(pte_t pte)
 	return ret;
 }
 
+static inline pteval_t pte_flags(pte_t pte)
+{
+	pteval_t ret;
+
+	if (sizeof(pteval_t) > sizeof(long))
+		ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_flags,
+				 pte.pte, (u64)pte.pte >> 32);
+	else
+		ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_flags,
+				 pte.pte);
+
+	return ret;
+}
+
 static inline pgd_t __pgd(pgdval_t val)
 {
 	pgdval_t ret;
diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h
index 97c271b2910b..47a852cb8c92 100644
--- a/include/asm-x86/pgtable.h
+++ b/include/asm-x86/pgtable.h
@@ -164,37 +164,37 @@ extern struct list_head pgd_list;
  */
 static inline int pte_dirty(pte_t pte)
 {
-	return pte_val(pte) & _PAGE_DIRTY;
+	return pte_flags(pte) & _PAGE_DIRTY;
 }
 
 static inline int pte_young(pte_t pte)
 {
-	return pte_val(pte) & _PAGE_ACCESSED;
+	return pte_flags(pte) & _PAGE_ACCESSED;
 }
 
 static inline int pte_write(pte_t pte)
 {
-	return pte_val(pte) & _PAGE_RW;
+	return pte_flags(pte) & _PAGE_RW;
 }
 
 static inline int pte_file(pte_t pte)
 {
-	return pte_val(pte) & _PAGE_FILE;
+	return pte_flags(pte) & _PAGE_FILE;
 }
 
 static inline int pte_huge(pte_t pte)
 {
-	return pte_val(pte) & _PAGE_PSE;
+	return pte_flags(pte) & _PAGE_PSE;
 }
 
 static inline int pte_global(pte_t pte)
 {
-	return pte_val(pte) & _PAGE_GLOBAL;
+	return pte_flags(pte) & _PAGE_GLOBAL;
 }
 
 static inline int pte_exec(pte_t pte)
 {
-	return !(pte_val(pte) & _PAGE_NX);
+	return !(pte_flags(pte) & _PAGE_NX);
 }
 
 static inline int pte_special(pte_t pte)
@@ -305,7 +305,7 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
 	return __pgprot(preservebits | addbits);
 }
 
-#define pte_pgprot(x) __pgprot(pte_val(x) & ~PTE_MASK)
+#define pte_pgprot(x) __pgprot(pte_flags(x) & ~PTE_MASK)
 
 #define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
 
-- 
cgit v1.2.3


From 3945e2c9abf8e00c2edc4aa29215ddfad1cd8cf7 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 25 May 2008 10:00:09 -0700
Subject: x86: extend e820 ealy_res support 32bit - fix #2

remove extra -1 in reseve_early calling
    panic if can not find space for new RAMDISK

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/setup_32.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index c04e8c91daf7..08b47a2f7af0 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -479,7 +479,7 @@ static void __init reserve_initrd(void)
 	initrd_start = 0;
 
 	if (ramdisk_size >= end_of_lowmem/2) {
-		free_early(ramdisk_image, ramdisk_image + ramdisk_size - 1);
+		free_early(ramdisk_image, ramdisk_end);
 		printk(KERN_ERR "initrd too large to handle, "
 		       "disabling initrd\n");
 		return;
@@ -501,9 +501,13 @@ static void __init reserve_initrd(void)
 				 end_of_lowmem, ramdisk_size,
 				 PAGE_SIZE);
 
+	if (ramdisk_here == -1ULL)
+		panic("Cannot find place for new RAMDISK of size %lld\n",
+			 ramdisk_size);
+
 	/* Note: this includes all the lowmem currently occupied by
 	   the initrd, we rely on that fact to keep the data intact. */
-	reserve_early(ramdisk_here, ramdisk_here + ramdisk_size - 1,
+	reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
 			 "NEW RAMDISK");
 	initrd_start = ramdisk_here + PAGE_OFFSET;
 	initrd_end   = initrd_start + ramdisk_size;
@@ -579,15 +583,15 @@ void __init setup_bootmem_allocator(void)
 				 PAGE_SIZE);
 	if (bootmap == -1L)
 		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
-	reserve_early(bootmap, bootmap + bootmap_size - 1, "BOOTMAP");
+	reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
 #ifdef CONFIG_BLK_DEV_INITRD
 	reserve_initrd();
 #endif
 	bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn);
 	printk(KERN_INFO "  low ram: %08lx - %08lx\n",
 		 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
-	printk(KERN_INFO "  bootmap [%08lx -  %08lx]\n",
-		 bootmap, bootmap + bootmap_size - 1);
+	printk(KERN_INFO "  bootmap %08lx - %08lx\n",
+		 bootmap, bootmap + bootmap_size);
 	register_bootmem_low_pages(max_low_pfn);
 	early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
 
-- 
cgit v1.2.3


From dd564d0cf08686cf0cc332bf9d48cba5b26a8171 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Tue, 27 May 2008 18:03:56 +0200
Subject: x86: aperture_64.c: cleanups

Some small cleanups for aperture_64.c; they should not really change
any code.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Cc: Dave Jones <davej@codemonkey.org.uk>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/aperture_64.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 5373f7834d8a..6eea42eb287b 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -111,7 +111,7 @@ static u32 __init allocate_aperture(void)
 
 
 /* Find a PCI capability */
-static __u32 __init find_cap(int bus, int slot, int func, int cap)
+static u32 __init find_cap(int bus, int slot, int func, int cap)
 {
 	int bytes;
 	u8 pos;
@@ -137,7 +137,7 @@ static __u32 __init find_cap(int bus, int slot, int func, int cap)
 }
 
 /* Read a standard AGPv3 bridge header */
-static __u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
+static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
 {
 	u32 apsize;
 	u32 apsizereg;
@@ -202,7 +202,7 @@ static __u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
  * the AGP bridges should be always an own bus on the HT hierarchy,
  * but do it here for future safety.
  */
-static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
+static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
 {
 	int bus, slot, func;
 
-- 
cgit v1.2.3


From 19ec673ced067316b9732bc6d1c4ff4052e5f795 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Wed, 28 May 2008 23:00:47 +0400
Subject: x86: nmi - fix incorrect NMI watchdog used by default

The commit

	commit 4b82b277707a39b97271439c475f186f63ec4692
	Author: Cyrill Gorcunov <gorcunov@gmail.com>
	Date:   Sat May 24 19:36:35 2008 +0400

set nmi_watchdog to NMI_IO_APIC as by default. This causes hangs on some
machines with buggy watchdogs. Fix it - i.e. restore old behaviour.

Thanks to Sitsofe Wheeler and Adrian Bunk for catching the problem
and Maciej W. Rozycki for explanation what is going on there.

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
CC: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/nmi.c | 16 +++++++++-------
 include/asm-x86/nmi.h |  4 +++-
 2 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 69a839fc1eb0..3671a9f3564b 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -84,20 +84,15 @@ static inline unsigned int get_timer_irqs(int cpu)
 #endif
 }
 
+#ifdef CONFIG_X86_64
 /* Run after command line and cpu_init init, but before all other checks */
 void nmi_watchdog_default(void)
 {
 	if (nmi_watchdog != NMI_DEFAULT)
 		return;
-#ifdef CONFIG_X86_64
 	nmi_watchdog = NMI_NONE;
-#else
-	if (lapic_watchdog_ok())
-		nmi_watchdog = NMI_LOCAL_APIC;
-	else
-		nmi_watchdog = NMI_IO_APIC;
-#endif
 }
+#endif
 
 #ifdef CONFIG_SMP
 /*
@@ -488,8 +483,15 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
 		return -EIO;
 	}
 
+#ifdef CONFIG_X86_64
 	/* if nmi_watchdog is not set yet, then set it */
 	nmi_watchdog_default();
+#else
+	if (lapic_watchdog_ok())
+		nmi_watchdog = NMI_LOCAL_APIC;
+	else
+		nmi_watchdog = NMI_IO_APIC;
+#endif
 
 	if (nmi_watchdog == NMI_LOCAL_APIC) {
 		if (nmi_watchdog_enabled)
diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h
index 1e8f34d7ab68..6f4d44fc051f 100644
--- a/include/asm-x86/nmi.h
+++ b/include/asm-x86/nmi.h
@@ -38,9 +38,11 @@ static inline void unset_nmi_pm_callback(struct pm_dev *dev)
 
 #ifdef CONFIG_X86_64
 extern void default_do_nmi(struct pt_regs *);
+extern void nmi_watchdog_default(void);
+#else
+#define nmi_watchdog_default(void) do {} while (0)
 #endif
 
-extern void nmi_watchdog_default(void);
 extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
 extern int check_nmi_watchdog(void);
 extern int nmi_watchdog_enabled;
-- 
cgit v1.2.3


From d364319b989967b74e57fef5c8017fd56a16c392 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 2 May 2008 23:42:01 +0200
Subject: x86: move mmconfig declarations to header

arch/x86/kernel/mmconf-fam10h_64.c is missing the prototypes, which
are decalred in arch/x86/kernel/setup_64.c. Move the prototypes and
the inline stubs to the appropriate header file.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/mmconf-fam10h_64.c |  1 +
 arch/x86/kernel/setup_64.c         | 13 +------------
 include/asm-x86/mmconfig.h         | 12 ++++++++++++
 3 files changed, 14 insertions(+), 12 deletions(-)
 create mode 100644 include/asm-x86/mmconfig.h

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index edc5fbfe85c0..fdfdc550b366 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -12,6 +12,7 @@
 #include <asm/io.h>
 #include <asm/msr.h>
 #include <asm/acpi.h>
+#include <asm/mmconfig.h>
 
 #include "../pci/pci.h"
 
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 6dff1286ad8a..341230db74e1 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -71,6 +71,7 @@
 #include <asm/topology.h>
 #include <asm/trampoline.h>
 #include <asm/pat.h>
+#include <asm/mmconfig.h>
 
 #include <mach_apic.h>
 #ifdef CONFIG_PARAVIRT
@@ -293,18 +294,6 @@ static void __init parse_setup_data(void)
 	}
 }
 
-#ifdef CONFIG_PCI_MMCONFIG
-extern void __cpuinit fam10h_check_enable_mmcfg(void);
-extern void __init check_enable_amd_mmconf_dmi(void);
-#else
-void __cpuinit fam10h_check_enable_mmcfg(void)
-{
-}
-void __init check_enable_amd_mmconf_dmi(void)
-{
-}
-#endif
-
 /*
  * setup_arch - architecture-specific boot-time initializations
  *
diff --git a/include/asm-x86/mmconfig.h b/include/asm-x86/mmconfig.h
new file mode 100644
index 000000000000..95beda07c6fa
--- /dev/null
+++ b/include/asm-x86/mmconfig.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_MMCONFIG_H
+#define _ASM_MMCONFIG_H
+
+#ifdef CONFIG_PCI_MMCONFIG
+extern void __cpuinit fam10h_check_enable_mmcfg(void);
+extern void __init check_enable_amd_mmconf_dmi(void);
+#else
+static inline void fam10h_check_enable_mmcfg(void) { }
+static inline void check_enable_amd_mmconf_dmi(void) { }
+#endif
+
+#endif
-- 
cgit v1.2.3


From 4d285878564bb46cf64e54be18eeffe33ca583a0 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Thu, 22 May 2008 18:48:32 -0400
Subject: x86: Move the AMD64 specific parts out of setup_64.c

Create a separate amd_64.c file in the cpu/ dir for
the useful parts to live in.

Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/Makefile |   1 +
 arch/x86/kernel/cpu/amd_64.c | 235 +++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/setup_64.c   | 232 ++----------------------------------------
 3 files changed, 242 insertions(+), 226 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/amd_64.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index a0c6f8190887..ef065c1a2e1a 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -7,6 +7,7 @@ obj-y			+= proc.o feature_names.o
 
 obj-$(CONFIG_X86_32)	+= common.o bugs.o
 obj-$(CONFIG_X86_32)	+= amd.o
+obj-$(CONFIG_X86_64)	+= amd_64.o
 obj-$(CONFIG_X86_32)	+= cyrix.o
 obj-$(CONFIG_X86_32)	+= centaur.o
 obj-$(CONFIG_X86_32)	+= transmeta.o
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
new file mode 100644
index 000000000000..1746f6f95726
--- /dev/null
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -0,0 +1,235 @@
+#include <linux/init.h>
+#include <linux/mm.h>
+
+#include <asm/numa_64.h>
+#include <asm/mmconfig.h>
+#include <asm/cacheflush.h>
+
+#include <mach_apic.h>
+
+extern int __cpuinit get_model_name(struct cpuinfo_x86 *c);
+extern void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c);
+
+int force_mwait __cpuinitdata;
+
+#ifdef CONFIG_NUMA
+static int __cpuinit nearby_node(int apicid)
+{
+	int i, node;
+
+	for (i = apicid - 1; i >= 0; i--) {
+		node = apicid_to_node[i];
+		if (node != NUMA_NO_NODE && node_online(node))
+			return node;
+	}
+	for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
+		node = apicid_to_node[i];
+		if (node != NUMA_NO_NODE && node_online(node))
+			return node;
+	}
+	return first_node(node_online_map); /* Shouldn't happen */
+}
+#endif
+
+/*
+ * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
+ * Assumes number of cores is a power of two.
+ */
+static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	unsigned bits;
+#ifdef CONFIG_NUMA
+	int cpu = smp_processor_id();
+	int node = 0;
+	unsigned apicid = hard_smp_processor_id();
+#endif
+	bits = c->x86_coreid_bits;
+
+	/* Low order bits define the core id (index of core in socket) */
+	c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
+	/* Convert the initial APIC ID into the socket ID */
+	c->phys_proc_id = c->initial_apicid >> bits;
+
+#ifdef CONFIG_NUMA
+	node = c->phys_proc_id;
+	if (apicid_to_node[apicid] != NUMA_NO_NODE)
+		node = apicid_to_node[apicid];
+	if (!node_online(node)) {
+		/* Two possibilities here:
+		   - The CPU is missing memory and no node was created.
+		   In that case try picking one from a nearby CPU
+		   - The APIC IDs differ from the HyperTransport node IDs
+		   which the K8 northbridge parsing fills in.
+		   Assume they are all increased by a constant offset,
+		   but in the same order as the HT nodeids.
+		   If that doesn't result in a usable node fall back to the
+		   path for the previous case.  */
+
+		int ht_nodeid = c->initial_apicid;
+
+		if (ht_nodeid >= 0 &&
+		    apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+			node = apicid_to_node[ht_nodeid];
+		/* Pick a nearby node */
+		if (!node_online(node))
+			node = nearby_node(apicid);
+	}
+	numa_set_node(cpu, node);
+
+	printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
+#endif
+#endif
+}
+
+static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	unsigned bits, ecx;
+
+	/* Multi core CPU? */
+	if (c->extended_cpuid_level < 0x80000008)
+		return;
+
+	ecx = cpuid_ecx(0x80000008);
+
+	c->x86_max_cores = (ecx & 0xff) + 1;
+
+	/* CPU telling us the core id bits shift? */
+	bits = (ecx >> 12) & 0xF;
+
+	/* Otherwise recompute */
+	if (bits == 0) {
+		while ((1 << bits) < c->x86_max_cores)
+			bits++;
+	}
+
+	c->x86_coreid_bits = bits;
+
+#endif
+}
+
+#define ENABLE_C1E_MASK		0x18000000
+#define CPUID_PROCESSOR_SIGNATURE	1
+#define CPUID_XFAM		0x0ff00000
+#define CPUID_XFAM_K8		0x00000000
+#define CPUID_XFAM_10H		0x00100000
+#define CPUID_XFAM_11H		0x00200000
+#define CPUID_XMOD		0x000f0000
+#define CPUID_XMOD_REV_F	0x00040000
+
+/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
+static __cpuinit int amd_apic_timer_broken(void)
+{
+	u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
+
+	switch (eax & CPUID_XFAM) {
+	case CPUID_XFAM_K8:
+		if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
+			break;
+	case CPUID_XFAM_10H:
+	case CPUID_XFAM_11H:
+		rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
+		if (lo & ENABLE_C1E_MASK)
+			return 1;
+		break;
+	default:
+		/* err on the side of caution */
+		return 1;
+	}
+	return 0;
+}
+
+void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
+{
+	early_init_amd_mc(c);
+
+	/* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
+	if (c->x86_power & (1<<8))
+		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+}
+
+void __cpuinit init_amd(struct cpuinfo_x86 *c)
+{
+	unsigned level;
+
+#ifdef CONFIG_SMP
+	unsigned long value;
+
+	/*
+	 * Disable TLB flush filter by setting HWCR.FFDIS on K8
+	 * bit 6 of msr C001_0015
+	 *
+	 * Errata 63 for SH-B3 steppings
+	 * Errata 122 for all steppings (F+ have it disabled by default)
+	 */
+	if (c->x86 == 15) {
+		rdmsrl(MSR_K8_HWCR, value);
+		value |= 1 << 6;
+		wrmsrl(MSR_K8_HWCR, value);
+	}
+#endif
+
+	/* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+	   3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
+	clear_cpu_cap(c, 0*32+31);
+
+	/* On C+ stepping K8 rep microcode works well for copy/memset */
+	level = cpuid_eax(1);
+	if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
+			     level >= 0x0f58))
+		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+	if (c->x86 == 0x10 || c->x86 == 0x11)
+		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+	/* Enable workaround for FXSAVE leak */
+	if (c->x86 >= 6)
+		set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
+
+	level = get_model_name(c);
+	if (!level) {
+		switch (c->x86) {
+		case 15:
+			/* Should distinguish Models here, but this is only
+			   a fallback anyways. */
+			strcpy(c->x86_model_id, "Hammer");
+			break;
+		}
+	}
+	display_cacheinfo(c);
+
+	/* Multi core CPU? */
+	if (c->extended_cpuid_level >= 0x80000008)
+		amd_detect_cmp(c);
+
+	if (c->extended_cpuid_level >= 0x80000006 &&
+		(cpuid_edx(0x80000006) & 0xf000))
+		num_cache_leaves = 4;
+	else
+		num_cache_leaves = 3;
+
+	if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
+		set_cpu_cap(c, X86_FEATURE_K8);
+
+	/* MFENCE stops RDTSC speculation */
+	set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+
+	if (c->x86 == 0x10)
+		fam10h_check_enable_mmcfg();
+
+	if (amd_apic_timer_broken())
+		disable_apic_timer = 1;
+
+	if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
+		unsigned long long tseg;
+
+		/*
+		 * Split up direct mapping around the TSEG SMM area.
+		 * Don't do it for gbpages because there seems very little
+		 * benefit in doing so.
+		 */
+		if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
+		(tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
+			set_memory_4k((unsigned long)__va(tseg), 1);
+	}
+}
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 341230db74e1..b07b1997ed97 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -96,8 +96,6 @@ int bootloader_type;
 
 unsigned long saved_video_mode;
 
-int force_mwait __cpuinitdata;
-
 /*
  * Early DMI memory
  */
@@ -526,7 +524,7 @@ void __init setup_arch(char **cmdline_p)
 	check_enable_amd_mmconf_dmi();
 }
 
-static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
+int __cpuinit get_model_name(struct cpuinfo_x86 *c)
 {
 	unsigned int *v;
 
@@ -542,7 +540,7 @@ static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
 }
 
 
-static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
+void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
 {
 	unsigned int n, dummy, eax, ebx, ecx, edx;
 
@@ -574,228 +572,6 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
 	}
 }
 
-#ifdef CONFIG_NUMA
-static int __cpuinit nearby_node(int apicid)
-{
-	int i, node;
-
-	for (i = apicid - 1; i >= 0; i--) {
-		node = apicid_to_node[i];
-		if (node != NUMA_NO_NODE && node_online(node))
-			return node;
-	}
-	for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
-		node = apicid_to_node[i];
-		if (node != NUMA_NO_NODE && node_online(node))
-			return node;
-	}
-	return first_node(node_online_map); /* Shouldn't happen */
-}
-#endif
-
-/*
- * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
- * Assumes number of cores is a power of two.
- */
-static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
-	unsigned bits;
-#ifdef CONFIG_NUMA
-	int cpu = smp_processor_id();
-	int node = 0;
-	unsigned apicid = hard_smp_processor_id();
-#endif
-	bits = c->x86_coreid_bits;
-
-	/* Low order bits define the core id (index of core in socket) */
-	c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
-	/* Convert the initial APIC ID into the socket ID */
-	c->phys_proc_id = c->initial_apicid >> bits;
-
-#ifdef CONFIG_NUMA
-	node = c->phys_proc_id;
-	if (apicid_to_node[apicid] != NUMA_NO_NODE)
-		node = apicid_to_node[apicid];
-	if (!node_online(node)) {
-		/* Two possibilities here:
-		   - The CPU is missing memory and no node was created.
-		   In that case try picking one from a nearby CPU
-		   - The APIC IDs differ from the HyperTransport node IDs
-		   which the K8 northbridge parsing fills in.
-		   Assume they are all increased by a constant offset,
-		   but in the same order as the HT nodeids.
-		   If that doesn't result in a usable node fall back to the
-		   path for the previous case.  */
-
-		int ht_nodeid = c->initial_apicid;
-
-		if (ht_nodeid >= 0 &&
-		    apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
-			node = apicid_to_node[ht_nodeid];
-		/* Pick a nearby node */
-		if (!node_online(node))
-			node = nearby_node(apicid);
-	}
-	numa_set_node(cpu, node);
-
-	printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
-#endif
-#endif
-}
-
-static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
-	unsigned bits, ecx;
-
-	/* Multi core CPU? */
-	if (c->extended_cpuid_level < 0x80000008)
-		return;
-
-	ecx = cpuid_ecx(0x80000008);
-
-	c->x86_max_cores = (ecx & 0xff) + 1;
-
-	/* CPU telling us the core id bits shift? */
-	bits = (ecx >> 12) & 0xF;
-
-	/* Otherwise recompute */
-	if (bits == 0) {
-		while ((1 << bits) < c->x86_max_cores)
-			bits++;
-	}
-
-	c->x86_coreid_bits = bits;
-
-#endif
-}
-
-#define ENABLE_C1E_MASK		0x18000000
-#define CPUID_PROCESSOR_SIGNATURE	1
-#define CPUID_XFAM		0x0ff00000
-#define CPUID_XFAM_K8		0x00000000
-#define CPUID_XFAM_10H		0x00100000
-#define CPUID_XFAM_11H		0x00200000
-#define CPUID_XMOD		0x000f0000
-#define CPUID_XMOD_REV_F	0x00040000
-
-/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
-static __cpuinit int amd_apic_timer_broken(void)
-{
-	u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
-
-	switch (eax & CPUID_XFAM) {
-	case CPUID_XFAM_K8:
-		if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
-			break;
-	case CPUID_XFAM_10H:
-	case CPUID_XFAM_11H:
-		rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
-		if (lo & ENABLE_C1E_MASK)
-			return 1;
-		break;
-	default:
-		/* err on the side of caution */
-		return 1;
-	}
-	return 0;
-}
-
-static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
-{
-	early_init_amd_mc(c);
-
- 	/* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
-	if (c->x86_power & (1<<8))
-		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-}
-
-static void __cpuinit init_amd(struct cpuinfo_x86 *c)
-{
-	unsigned level;
-
-#ifdef CONFIG_SMP
-	unsigned long value;
-
-	/*
-	 * Disable TLB flush filter by setting HWCR.FFDIS on K8
-	 * bit 6 of msr C001_0015
-	 *
-	 * Errata 63 for SH-B3 steppings
-	 * Errata 122 for all steppings (F+ have it disabled by default)
-	 */
-	if (c->x86 == 15) {
-		rdmsrl(MSR_K8_HWCR, value);
-		value |= 1 << 6;
-		wrmsrl(MSR_K8_HWCR, value);
-	}
-#endif
-
-	/* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
-	   3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
-	clear_cpu_cap(c, 0*32+31);
-
-	/* On C+ stepping K8 rep microcode works well for copy/memset */
-	level = cpuid_eax(1);
-	if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
-			     level >= 0x0f58))
-		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-	if (c->x86 == 0x10 || c->x86 == 0x11)
-		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-
-	/* Enable workaround for FXSAVE leak */
-	if (c->x86 >= 6)
-		set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
-
-	level = get_model_name(c);
-	if (!level) {
-		switch (c->x86) {
-		case 15:
-			/* Should distinguish Models here, but this is only
-			   a fallback anyways. */
-			strcpy(c->x86_model_id, "Hammer");
-			break;
-		}
-	}
-	display_cacheinfo(c);
-
-	/* Multi core CPU? */
-	if (c->extended_cpuid_level >= 0x80000008)
-		amd_detect_cmp(c);
-
-	if (c->extended_cpuid_level >= 0x80000006 &&
-		(cpuid_edx(0x80000006) & 0xf000))
-		num_cache_leaves = 4;
-	else
-		num_cache_leaves = 3;
-
-	if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
-		set_cpu_cap(c, X86_FEATURE_K8);
-
-	/* MFENCE stops RDTSC speculation */
-	set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
-
-	if (c->x86 == 0x10)
-		fam10h_check_enable_mmcfg();
-
-	if (amd_apic_timer_broken())
-		disable_apic_timer = 1;
-
-	if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
-		unsigned long long tseg;
-
-		/*
-		 * Split up direct mapping around the TSEG SMM area.
-		 * Don't do it for gbpages because there seems very little
-		 * benefit in doing so.
-		 */
-		if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
-		(tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
-			set_memory_4k((unsigned long)__va(tseg), 1);
-	}
-}
-
 void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
@@ -977,6 +753,10 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
 		c->x86_vendor = X86_VENDOR_UNKNOWN;
 }
 
+// FIXME: Needs to use cpu_vendor_dev_register
+extern void __cpuinit early_init_amd(struct cpuinfo_x86 *c);
+extern void __cpuinit init_amd(struct cpuinfo_x86 *c);
+
 /* Do some early cpuid on the boot CPU to get some parameter that are
    needed before check_bugs. Everything advanced is in identify_cpu
    below. */
-- 
cgit v1.2.3


From a82fbe31cb387bb246e2d3b3c177f551bb991135 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Thu, 22 May 2008 18:54:32 -0400
Subject: x86: Move the 64-bit Intel specific parts out of setup_64.c

Create a separate intel_64.c file in the cpu/ dir for
the useful parts to live in.

Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/Makefile   |  1 +
 arch/x86/kernel/cpu/intel_64.c | 97 ++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/setup_64.c     | 93 +---------------------------------------
 3 files changed, 100 insertions(+), 91 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/intel_64.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index ef065c1a2e1a..b7a11924fed7 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_X86_32)	+= cyrix.o
 obj-$(CONFIG_X86_32)	+= centaur.o
 obj-$(CONFIG_X86_32)	+= transmeta.o
 obj-$(CONFIG_X86_32)	+= intel.o
+obj-$(CONFIG_X86_64)	+= intel_64.o
 obj-$(CONFIG_X86_32)	+= umc.o
 
 obj-$(CONFIG_X86_MCE)	+= mcheck/
diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c
new file mode 100644
index 000000000000..e5f929f6c3d4
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_64.c
@@ -0,0 +1,97 @@
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <asm/processor.h>
+#include <asm/ptrace.h>
+#include <asm/topology.h>
+#include <asm/numa_64.h>
+
+void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
+{
+	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
+	    (c->x86 == 0x6 && c->x86_model >= 0x0e))
+		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+}
+
+/*
+ * find out the number of processor cores on the die
+ */
+static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
+{
+	unsigned int eax, t;
+
+	if (c->cpuid_level < 4)
+		return 1;
+
+	cpuid_count(4, 0, &eax, &t, &t, &t);
+
+	if (eax & 0x1f)
+		return ((eax >> 26) + 1);
+	else
+		return 1;
+}
+
+static void __cpuinit srat_detect_node(void)
+{
+#ifdef CONFIG_NUMA
+	unsigned node;
+	int cpu = smp_processor_id();
+	int apicid = hard_smp_processor_id();
+
+	/* Don't do the funky fallback heuristics the AMD version employs
+	   for now. */
+	node = apicid_to_node[apicid];
+	if (node == NUMA_NO_NODE || !node_online(node))
+		node = first_node(node_online_map);
+	numa_set_node(cpu, node);
+
+	printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
+#endif
+}
+
+void __cpuinit init_intel(struct cpuinfo_x86 *c)
+{
+	/* Cache sizes */
+	unsigned n;
+
+	init_intel_cacheinfo(c);
+	if (c->cpuid_level > 9) {
+		unsigned eax = cpuid_eax(10);
+		/* Check for version and the number of counters */
+		if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
+			set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+	}
+
+	if (cpu_has_ds) {
+		unsigned int l1, l2;
+		rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
+		if (!(l1 & (1<<11)))
+			set_cpu_cap(c, X86_FEATURE_BTS);
+		if (!(l1 & (1<<12)))
+			set_cpu_cap(c, X86_FEATURE_PEBS);
+	}
+
+
+	if (cpu_has_bts)
+		ds_init_intel(c);
+
+	n = c->extended_cpuid_level;
+	if (n >= 0x80000008) {
+		unsigned eax = cpuid_eax(0x80000008);
+		c->x86_virt_bits = (eax >> 8) & 0xff;
+		c->x86_phys_bits = eax & 0xff;
+		/* CPUID workaround for Intel 0F34 CPU */
+		if (c->x86_vendor == X86_VENDOR_INTEL &&
+		    c->x86 == 0xF && c->x86_model == 0x3 &&
+		    c->x86_mask == 0x4)
+			c->x86_phys_bits = 36;
+	}
+
+	if (c->x86 == 15)
+		c->x86_cache_alignment = c->x86_clflush_size * 2;
+	if (c->x86 == 6)
+		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+	set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+	c->x86_max_cores = intel_num_cpu_cores(c);
+
+	srat_detect_node();
+}
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index b07b1997ed97..c4e6a0b6c303 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -622,97 +622,6 @@ out:
 #endif
 }
 
-/*
- * find out the number of processor cores on the die
- */
-static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
-{
-	unsigned int eax, t;
-
-	if (c->cpuid_level < 4)
-		return 1;
-
-	cpuid_count(4, 0, &eax, &t, &t, &t);
-
-	if (eax & 0x1f)
-		return ((eax >> 26) + 1);
-	else
-		return 1;
-}
-
-static void __cpuinit srat_detect_node(void)
-{
-#ifdef CONFIG_NUMA
-	unsigned node;
-	int cpu = smp_processor_id();
-	int apicid = hard_smp_processor_id();
-
-	/* Don't do the funky fallback heuristics the AMD version employs
-	   for now. */
-	node = apicid_to_node[apicid];
-	if (node == NUMA_NO_NODE || !node_online(node))
-		node = first_node(node_online_map);
-	numa_set_node(cpu, node);
-
-	printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
-#endif
-}
-
-static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
-{
-	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
-	    (c->x86 == 0x6 && c->x86_model >= 0x0e))
-		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-}
-
-static void __cpuinit init_intel(struct cpuinfo_x86 *c)
-{
-	/* Cache sizes */
-	unsigned n;
-
-	init_intel_cacheinfo(c);
-	if (c->cpuid_level > 9) {
-		unsigned eax = cpuid_eax(10);
-		/* Check for version and the number of counters */
-		if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
-			set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
-	}
-
-	if (cpu_has_ds) {
-		unsigned int l1, l2;
-		rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
-		if (!(l1 & (1<<11)))
-			set_cpu_cap(c, X86_FEATURE_BTS);
-		if (!(l1 & (1<<12)))
-			set_cpu_cap(c, X86_FEATURE_PEBS);
-	}
-
-
-	if (cpu_has_bts)
-		ds_init_intel(c);
-
-	n = c->extended_cpuid_level;
-	if (n >= 0x80000008) {
-		unsigned eax = cpuid_eax(0x80000008);
-		c->x86_virt_bits = (eax >> 8) & 0xff;
-		c->x86_phys_bits = eax & 0xff;
-		/* CPUID workaround for Intel 0F34 CPU */
-		if (c->x86_vendor == X86_VENDOR_INTEL &&
-		    c->x86 == 0xF && c->x86_model == 0x3 &&
-		    c->x86_mask == 0x4)
-			c->x86_phys_bits = 36;
-	}
-
-	if (c->x86 == 15)
-		c->x86_cache_alignment = c->x86_clflush_size * 2;
-	if (c->x86 == 6)
-		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-	set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
-	c->x86_max_cores = intel_num_cpu_cores(c);
-
-	srat_detect_node();
-}
-
 static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
 {
 	if (c->x86 == 0x6 && c->x86_model >= 0xf)
@@ -756,6 +665,8 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
 // FIXME: Needs to use cpu_vendor_dev_register
 extern void __cpuinit early_init_amd(struct cpuinfo_x86 *c);
 extern void __cpuinit init_amd(struct cpuinfo_x86 *c);
+extern void __cpuinit early_init_intel(struct cpuinfo_x86 *c);
+extern void __cpuinit init_intel(struct cpuinfo_x86 *c);
 
 /* Do some early cpuid on the boot CPU to get some parameter that are
    needed before check_bugs. Everything advanced is in identify_cpu
-- 
cgit v1.2.3


From 7e2191127eb414d7d5a11df6552ab6e3845d17a1 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Thu, 22 May 2008 18:55:06 -0400
Subject: x86: Remove workaround for prescott (32bit P4) from 64-bit code.

Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/intel_64.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c
index e5f929f6c3d4..b33912199484 100644
--- a/arch/x86/kernel/cpu/intel_64.c
+++ b/arch/x86/kernel/cpu/intel_64.c
@@ -79,11 +79,6 @@ void __cpuinit init_intel(struct cpuinfo_x86 *c)
 		unsigned eax = cpuid_eax(0x80000008);
 		c->x86_virt_bits = (eax >> 8) & 0xff;
 		c->x86_phys_bits = eax & 0xff;
-		/* CPUID workaround for Intel 0F34 CPU */
-		if (c->x86_vendor == X86_VENDOR_INTEL &&
-		    c->x86 == 0xF && c->x86_model == 0x3 &&
-		    c->x86_mask == 0x4)
-			c->x86_phys_bits = 36;
 	}
 
 	if (c->x86 == 15)
-- 
cgit v1.2.3


From 30a713180b3d08fdec5ca572e5a1cd35253c5d8e Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Thu, 22 May 2008 18:57:25 -0400
Subject: x86: Move the 64-bit Centaur specific parts out of setup_64.c

Create a separate centaur_64.c file in the cpu/ dir for
the useful parts to live in.

Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/Makefile     |  1 +
 arch/x86/kernel/cpu/centaur_64.c | 31 +++++++++++++++++++++++++++++++
 arch/x86/kernel/setup_64.c       | 28 ++--------------------------
 3 files changed, 34 insertions(+), 26 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/centaur_64.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index b7a11924fed7..c77a1c50d944 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_X86_32)	+= amd.o
 obj-$(CONFIG_X86_64)	+= amd_64.o
 obj-$(CONFIG_X86_32)	+= cyrix.o
 obj-$(CONFIG_X86_32)	+= centaur.o
+obj-$(CONFIG_X86_64)	+= centaur_64.o
 obj-$(CONFIG_X86_32)	+= transmeta.o
 obj-$(CONFIG_X86_32)	+= intel.o
 obj-$(CONFIG_X86_64)	+= intel_64.o
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
new file mode 100644
index 000000000000..bac96d187d05
--- /dev/null
+++ b/arch/x86/kernel/cpu/centaur_64.c
@@ -0,0 +1,31 @@
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+
+void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
+{
+	if (c->x86 == 0x6 && c->x86_model >= 0xf)
+		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+}
+
+void __cpuinit init_centaur(struct cpuinfo_x86 *c)
+{
+	/* Cache sizes */
+	unsigned n;
+
+	n = c->extended_cpuid_level;
+	if (n >= 0x80000008) {
+		unsigned eax = cpuid_eax(0x80000008);
+		c->x86_virt_bits = (eax >> 8) & 0xff;
+		c->x86_phys_bits = eax & 0xff;
+	}
+
+	if (c->x86 == 0x6 && c->x86_model >= 0xf) {
+		c->x86_cache_alignment = c->x86_clflush_size * 2;
+		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+	}
+	set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+}
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index c4e6a0b6c303..25afdd80a3cf 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -622,32 +622,6 @@ out:
 #endif
 }
 
-static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
-{
-	if (c->x86 == 0x6 && c->x86_model >= 0xf)
-		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-}
-
-static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
-{
-	/* Cache sizes */
-	unsigned n;
-
-	n = c->extended_cpuid_level;
-	if (n >= 0x80000008) {
-		unsigned eax = cpuid_eax(0x80000008);
-		c->x86_virt_bits = (eax >> 8) & 0xff;
-		c->x86_phys_bits = eax & 0xff;
-	}
-
-	if (c->x86 == 0x6 && c->x86_model >= 0xf) {
-		c->x86_cache_alignment = c->x86_clflush_size * 2;
-		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-	}
-	set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
-}
-
 static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
 {
 	char *v = c->x86_vendor_id;
@@ -667,6 +641,8 @@ extern void __cpuinit early_init_amd(struct cpuinfo_x86 *c);
 extern void __cpuinit init_amd(struct cpuinfo_x86 *c);
 extern void __cpuinit early_init_intel(struct cpuinfo_x86 *c);
 extern void __cpuinit init_intel(struct cpuinfo_x86 *c);
+extern void __cpuinit early_init_centaur(struct cpuinfo_x86 *c);
+extern void __cpuinit init_centaur(struct cpuinfo_x86 *c);
 
 /* Do some early cpuid on the boot CPU to get some parameter that are
    needed before check_bugs. Everything advanced is in identify_cpu
-- 
cgit v1.2.3


From 1c47cd638e8302bc38be1f6d81067950e038ebd3 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 30 May 2008 15:42:45 -0700
Subject: x86: fix overlong line in arch/x86/kernel/cpu/amd_64.c

Clean up an overlong line in arch/x86/kernel/cpu/amd_64.c.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/amd_64.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
index 1746f6f95726..c815c2c0484b 100644
--- a/arch/x86/kernel/cpu/amd_64.c
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -229,7 +229,8 @@ void __cpuinit init_amd(struct cpuinfo_x86 *c)
 		 * benefit in doing so.
 		 */
 		if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
-		(tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
+		    (tseg >> PMD_SHIFT) <
+			(max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
 			set_memory_4k((unsigned long)__va(tseg), 1);
 	}
 }
-- 
cgit v1.2.3


From f0d43100f13be0fa5bf52741d7084bb27f00e621 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Thu, 29 May 2008 12:56:36 -0700
Subject: x86: extend e820 early_res support 32bit -fix #3

introduce init_pg_table_start, so xen PV could specify the value.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/head32.c   | 3 ++-
 arch/x86/kernel/head_32.S  | 2 ++
 arch/x86/kernel/setup_32.c | 7 +++++++
 arch/x86/lguest/boot.c     | 5 +++--
 arch/x86/xen/enlighten.c   | 3 ++-
 include/asm-x86/setup.h    | 4 +++-
 6 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index c216d3c2a991..0b6cb9fba71e 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -76,7 +76,8 @@ void __init i386_start_kernel(void)
 		reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
 	}
 #endif
-	reserve_early(__pa_symbol(&_end), init_pg_tables_end, "INIT_PG_TABLE");
+	reserve_early(init_pg_tables_start, init_pg_tables_end,
+			"INIT_PG_TABLE");
 
 	reserve_ebda_region();
 
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index b2cc73768a9d..bef4618feadb 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -194,6 +194,7 @@ default_entry:
 	xorl %ebx,%ebx				/* %ebx is kept at zero */
 
 	movl $pa(pg0), %edi
+	movl %edi, pa(init_pg_tables_start)
 	movl $pa(swapper_pg_pmd), %edx
 	movl $PTE_ATTR, %eax
 10:
@@ -228,6 +229,7 @@ default_entry:
 page_pde_offset = (__PAGE_OFFSET >> 20);
 
 	movl $pa(pg0), %edi
+	movl %edi, pa(init_pg_tables_start)
 	movl $pa(swapper_pg_dir), %edx
 	movl $PTE_ATTR, %eax
 10:
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 08b47a2f7af0..de9c5ee77d07 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -71,6 +71,7 @@
 /* This value is set up by the early boot code to point to the value
    immediately after the boot time page tables.  It contains a *physical*
    address, and must not be in the .bss segment! */
+unsigned long init_pg_tables_start __initdata = ~0UL;
 unsigned long init_pg_tables_end __initdata = ~0UL;
 
 /*
@@ -485,6 +486,10 @@ static void __init reserve_initrd(void)
 		return;
 	}
 
+	printk(KERN_INFO "old RAMDISK: %08llx - %08llx\n", ramdisk_image,
+			ramdisk_end);
+
+
 	if (ramdisk_end <= end_of_lowmem) {
 		/* All in lowmem, easy case */
 		/*
@@ -511,6 +516,8 @@ static void __init reserve_initrd(void)
 			 "NEW RAMDISK");
 	initrd_start = ramdisk_here + PAGE_OFFSET;
 	initrd_end   = initrd_start + ramdisk_size;
+	printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
+			 ramdisk_here, ramdisk_here + ramdisk_size);
 
 	do_relocate_initrd = true;
 }
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index af65b2da3ba0..bf7c34a3aaeb 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1011,6 +1011,7 @@ __init void lguest_init(void)
 	 * clobbered.  The Launcher places our initial pagetables somewhere at
 	 * the top of our physical memory, so we don't need extra space: set
 	 * init_pg_tables_end to the end of the kernel. */
+	init_pg_tables_start = __pa(pg0);
 	init_pg_tables_end = __pa(pg0);
 
 	/* Load the %fs segment register (the per-cpu segment register) with
@@ -1064,9 +1065,9 @@ __init void lguest_init(void)
 	pm_power_off = lguest_power_off;
 	machine_ops.restart = lguest_restart;
 
-	/* Now we're set up, call start_kernel() in init/main.c and we proceed
+	/* Now we're set up, call i386_start_kernel() in head32.c and we proceed
 	 * to boot as normal.  It never returns. */
-	start_kernel();
+	i386_start_kernel();
 }
 /*
  * This marks the end of stage II of our journey, The Guest.
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c8a56e457d61..ccc7b84ddabf 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1211,6 +1211,7 @@ asmlinkage void __init xen_start_kernel(void)
 
 	pgd = (pgd_t *)xen_start_info->pt_base;
 
+	init_pg_tables_start = __pa(pgd);
 	init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
 
 	init_mm.pgd = pgd; /* use the Xen pagetables to start */
@@ -1246,5 +1247,5 @@ asmlinkage void __init xen_start_kernel(void)
 		add_preferred_console("hvc", 0, NULL);
 
 	/* Start the world */
-	start_kernel();
+	i386_start_kernel();
 }
diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h
index ffa0f540fc7f..64f5f0c11d66 100644
--- a/include/asm-x86/setup.h
+++ b/include/asm-x86/setup.h
@@ -59,9 +59,11 @@ int __init copy_e820_map(struct e820entry *biosmap, int nr_map);
 void __init add_memory_region(unsigned long long start,
 			      unsigned long long size, int type);
 
-extern unsigned long init_pg_tables_end;
 
+void __init i386_start_kernel(void);
 
+extern unsigned long init_pg_tables_start;
+extern unsigned long init_pg_tables_end;
 
 #endif /* __i386__ */
 #endif /* _SETUP */
-- 
cgit v1.2.3


From a5481280b29b6a3db912ec100498bd31eaa6d2db Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Thu, 29 May 2008 12:58:37 -0700
Subject: x86: extend e820 early_res support 32bit -fix #5

reserve early numa kva, so it will not clash with new RAMDISK

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c  |  1 -
 arch/x86/mm/discontig_32.c  | 12 +++++-------
 include/asm-x86/mmzone_32.h |  4 ----
 3 files changed, 5 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index de9c5ee77d07..6f40cb560ea9 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -614,7 +614,6 @@ void __init setup_bootmem_allocator(void)
 	 */
 	find_smp_config();
 #endif
-	numa_kva_reserve();
 	reserve_crashkernel();
 
 	reserve_ibft_region();
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 47749727907e..55fdbab6b014 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -357,6 +357,11 @@ unsigned long __init setup_memory(void)
 	printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
 		kva_start_pfn, max_low_pfn);
 	printk("max_pfn = %ld\n", max_pfn);
+
+	/* avoid clash with initrd */
+	reserve_early(kva_start_pfn<<PAGE_SHIFT,
+		      (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
+		     "KVA PG");
 #ifdef CONFIG_HIGHMEM
 	highstart_pfn = highend_pfn = max_pfn;
 	if (max_pfn > system_max_low_pfn)
@@ -392,13 +397,6 @@ unsigned long __init setup_memory(void)
 	return max_low_pfn;
 }
 
-void __init numa_kva_reserve(void)
-{
-	if (kva_pages)
-		reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages),
-				BOOTMEM_DEFAULT);
-}
-
 void __init zone_sizes_init(void)
 {
 	int nid;
diff --git a/include/asm-x86/mmzone_32.h b/include/asm-x86/mmzone_32.h
index cb2cad0b65a7..faef751181b7 100644
--- a/include/asm-x86/mmzone_32.h
+++ b/include/asm-x86/mmzone_32.h
@@ -38,16 +38,12 @@ static inline void get_memcfg_numa(void)
 }
 
 extern int early_pfn_to_nid(unsigned long pfn);
-extern void numa_kva_reserve(void);
 
 #else /* !CONFIG_NUMA */
 
 #define get_memcfg_numa get_memcfg_numa_flat
 #define get_zholes_size(n) (0)
 
-static inline void numa_kva_reserve(void)
-{
-}
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_DISCONTIGMEM
-- 
cgit v1.2.3


From 9a73aa81ffb993382afed2ed404bc2b330d75427 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Thu, 29 May 2008 16:25:56 -0700
Subject: x86: 32bit numa srat fix early_ioremap leak

on two node system (16g RAM) with numa config I got this crash:

get_memcfg_from_srat: assigning address to rsdp
RSD PTR  v0 [ACPIAM]
ACPI: Too big length in RSDT: 92
failed to get NUMA memory information from SRAT table
NUMA - single node, flat memory mode
Node: 0, start_pfn: 0, end_pfn: 153
 Setting physnode_map array to node 0 for pfns:
 0
...
Pid: 0, comm: swapper Not tainted 2.6.26-rc4 #4
 [<80b41289>] hlt_loop+0x0/0x3
 [<8011efa0>] ? alloc_remap+0x50/0x70
 [<8079e32e>] alloc_node_mem_map+0x5e/0xa0
 [<8012e77b>] ? printk+0x1b/0x20
 [<80b590f6>] free_area_init_node+0xc6/0x470
 [<80b588fc>] ? __alloc_bootmem_node+0x2c/0x50
 [<80b58ad8>] ? find_min_pfn_for_node+0x38/0x70
 [<8012e77b>] ? printk+0x1b/0x20
 [<80b597c4>] free_area_init_nodes+0x254/0x2d0
 [<80b544d7>] zone_sizes_init+0x97/0xa0
 [<80b48a03>] setup_arch+0x383/0x530
 [<8012e77b>] ? printk+0x1b/0x20
 [<80b41aa4>] start_kernel+0x64/0x350
 [<80b412d8>] i386_start_kernel+0x8/0x10
 =======================

this patch increases the acpi table limit to 32.
Also match early_ioremap() with early_iounmap().

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/srat_32.c | 37 ++++++++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 11 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c
index 70e4a374b4e8..88971ee166d4 100644
--- a/arch/x86/kernel/srat_32.c
+++ b/arch/x86/kernel/srat_32.c
@@ -261,7 +261,7 @@ out_fail:
 
 struct acpi_static_rsdt {
 	struct acpi_table_rsdt table;
-	u32 padding[7]; /* Allow for 7 more table entries */
+	u32 padding[32]; /* Allow for 32 more table entries */
 };
 
 int __init get_memcfg_from_srat(void)
@@ -297,7 +297,7 @@ int __init get_memcfg_from_srat(void)
 	}
 
 	rsdt = (struct acpi_table_rsdt *)
-	    early_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt));
+	    early_ioremap(rsdp->rsdt_physical_address, sizeof(saved_rsdt));
 
 	if (!rsdt) {
 		printk(KERN_WARNING
@@ -310,6 +310,7 @@ int __init get_memcfg_from_srat(void)
 
 	if (strncmp(header->signature, ACPI_SIG_RSDT, strlen(ACPI_SIG_RSDT))) {
 		printk(KERN_WARNING "ACPI: RSDT signature incorrect\n");
+		early_iounmap(rsdt, sizeof(saved_rsdt));
 		goto out_err;
 	}
 
@@ -319,37 +320,51 @@ int __init get_memcfg_from_srat(void)
 	 * size of RSDT) divided by the size of each entry
 	 * (4-byte table pointers).
 	 */
-	tables = (header->length - sizeof(struct acpi_table_header)) / 4;
+	tables = (header->length - sizeof(struct acpi_table_header)) / sizeof(u32);
 
 	if (!tables)
 		goto out_err;
 
 	memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt));
-
+	early_iounmap(rsdt, sizeof(saved_rsdt));
 	if (saved_rsdt.table.header.length > sizeof(saved_rsdt)) {
 		printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n",
 		       saved_rsdt.table.header.length);
 		goto out_err;
 	}
 
-	printk("Begin SRAT table scan....\n");
+	printk("Begin SRAT table scan....%d\n", tables);
 
-	for (i = 0; i < tables; i++) {
+	for (i = 0; i < tables; i++){
+		int result;
+		u32 length;
 		/* Map in header, then map in full table length. */
 		header = (struct acpi_table_header *)
 			early_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header));
 		if (!header)
 			break;
+
+                printk(KERN_INFO "ACPI: %4.4s %08lX, %04X\n",
+                           header->signature,
+		   (unsigned long)saved_rsdt.table.table_offset_entry[i],
+                           header->length);
+
+		if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4)) {
+			early_iounmap(header, sizeof(struct acpi_table_header));
+			continue;
+		}
+
+		length = header->length;
+		early_iounmap(header, sizeof(struct acpi_table_header));
 		header = (struct acpi_table_header *)
-			early_ioremap(saved_rsdt.table.table_offset_entry[i], header->length);
+			early_ioremap(saved_rsdt.table.table_offset_entry[i], length);
 		if (!header)
 			break;
 
-		if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4))
-			continue;
-
 		/* we've found the srat table. don't need to look at any more tables */
-		return acpi20_parse_srat((struct acpi_table_srat *)header);
+		result = acpi20_parse_srat((struct acpi_table_srat *)header);
+		early_iounmap(header, length);
+		return result;
 	}
 out_err:
 	remove_all_active_ranges();
-- 
cgit v1.2.3


From 831d991821daedd4839073dbca55514432ef1768 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 3 Sep 2007 10:17:39 +0200
Subject: x86: add PCI extended config space access for AMD Barcelona

This patch implements PCI extended configuration space access for
AMD's Barcelona CPUs. It extends the method using CF8/CFC IO
addresses. An x86 capability bit has been introduced that is set for
CPUs supporting PCI extended config space accesses.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/amd.c    |  4 ++++
 arch/x86/kernel/cpu/amd_64.c |  3 +++
 arch/x86/kernel/setup.c      | 13 +++++++++++++
 arch/x86/kernel/setup.h      | 26 ++++++++++++++++++++++++++
 arch/x86/kernel/setup_64.c   |  2 ++
 arch/x86/pci/direct.c        | 21 +++++++++++++++------
 include/asm-x86/cpufeature.h |  2 ++
 7 files changed, 65 insertions(+), 6 deletions(-)
 create mode 100644 arch/x86/kernel/setup.h

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 245866828294..99221f9834e4 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -6,6 +6,7 @@
 #include <asm/apic.h>
 
 #include <mach_apic.h>
+#include "../setup.h"
 #include "cpu.h"
 
 /*
@@ -308,6 +309,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 
 	if (cpu_has_xmm2)
 		set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+
+	if (c->x86 == 0x10)
+		amd_enable_pci_ext_cfg(c);
 }
 
 static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
index c815c2c0484b..180097e99219 100644
--- a/arch/x86/kernel/cpu/amd_64.c
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -217,6 +217,9 @@ void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	if (c->x86 == 0x10)
 		fam10h_check_enable_mmcfg();
 
+	if (c->x86 == 0x10)
+		amd_enable_pci_ext_cfg(c);
+
 	if (amd_apic_timer_broken())
 		disable_apic_timer = 1;
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 6f80b852a196..d8f171234013 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -136,4 +136,17 @@ void __init setup_per_cpu_areas(void)
 	setup_cpumask_of_cpu();
 }
 
+#define ENABLE_CF8_EXT_CFG      (1ULL << 46)
+
+void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c)
+{
+	u64 reg;
+	rdmsrl(MSR_AMD64_NB_CFG, reg);
+	if (!(reg & ENABLE_CF8_EXT_CFG)) {
+		reg |= ENABLE_CF8_EXT_CFG;
+		wrmsrl(MSR_AMD64_NB_CFG, reg);
+	}
+	set_cpu_cap(c, X86_FEATURE_PCI_EXT_CFG);
+}
+
 #endif
diff --git a/arch/x86/kernel/setup.h b/arch/x86/kernel/setup.h
new file mode 100644
index 000000000000..66cc2c7f961d
--- /dev/null
+++ b/arch/x86/kernel/setup.h
@@ -0,0 +1,26 @@
+/*
+ * Internal declarations for shared x86 setup code.
+ *
+ * Copyright (c) 2008 Advanced Micro Devices, Inc.
+ * Contributed by Robert Richter <robert.richter@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+
+#ifndef _ARCH_X86_KERNEL_SETUP_H
+
+extern void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c);
+
+#endif	/* _ARCH_X86_KERNEL_SETUP_H */
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 25afdd80a3cf..215bd67e5b6c 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -73,6 +73,8 @@
 #include <asm/pat.h>
 #include <asm/mmconfig.h>
 
+#include "setup.h"
+
 #include <mach_apic.h>
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index 21d1e0e0d535..27d61b63def6 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -8,18 +8,21 @@
 #include "pci.h"
 
 /*
- * Functions for accessing PCI configuration space with type 1 accesses
+ * Functions for accessing PCI base (first 256 bytes) and extended
+ * (4096 bytes per PCI function) configuration space with type 1
+ * accesses.
  */
 
 #define PCI_CONF1_ADDRESS(bus, devfn, reg) \
-	(0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3))
+	(0x80000000 | ((reg & 0xF00) << 16) | (bus << 16) \
+	| (devfn << 8) | (reg & 0xFC))
 
 static int pci_conf1_read(unsigned int seg, unsigned int bus,
 			  unsigned int devfn, int reg, int len, u32 *value)
 {
 	unsigned long flags;
 
-	if ((bus > 255) || (devfn > 255) || (reg > 255)) {
+	if ((bus > 255) || (devfn > 255) || (reg > 4095)) {
 		*value = -1;
 		return -EINVAL;
 	}
@@ -50,7 +53,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus,
 {
 	unsigned long flags;
 
-	if ((bus > 255) || (devfn > 255) || (reg > 255)) 
+	if ((bus > 255) || (devfn > 255) || (reg > 4095))
 		return -EINVAL;
 
 	spin_lock_irqsave(&pci_config_lock, flags);
@@ -260,10 +263,16 @@ void __init pci_direct_init(int type)
 		return;
 	printk(KERN_INFO "PCI: Using configuration type %d for base access\n",
 		 type);
-	if (type == 1)
+	if (type == 1) {
 		raw_pci_ops = &pci_direct_conf1;
-	else
+		if (!raw_pci_ext_ops && cpu_has_pci_ext_cfg) {
+			printk(KERN_INFO "PCI: Using configuration type 1 "
+			       "for extended access\n");
+			raw_pci_ext_ops = &pci_direct_conf1;
+		}
+	} else {
 		raw_pci_ops = &pci_direct_conf2;
+	}
 }
 
 int __init pci_direct_probe(void)
diff --git a/include/asm-x86/cpufeature.h b/include/asm-x86/cpufeature.h
index 0d609c837a41..40fcbba00f15 100644
--- a/include/asm-x86/cpufeature.h
+++ b/include/asm-x86/cpufeature.h
@@ -79,6 +79,7 @@
 #define X86_FEATURE_REP_GOOD	(3*32+16) /* rep microcode works well on this CPU */
 #define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* Mfence synchronizes RDTSC */
 #define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* Lfence synchronizes RDTSC */
+#define X86_FEATURE_PCI_EXT_CFG	(3*32+19) /* PCI extended cfg access */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* Streaming SIMD Extensions-3 */
@@ -187,6 +188,7 @@ extern const char * const x86_power_flags[32];
 #define cpu_has_gbpages		boot_cpu_has(X86_FEATURE_GBPAGES)
 #define cpu_has_arch_perfmon	boot_cpu_has(X86_FEATURE_ARCH_PERFMON)
 #define cpu_has_pat		boot_cpu_has(X86_FEATURE_PAT)
+#define cpu_has_pci_ext_cfg	boot_cpu_has(X86_FEATURE_PCI_EXT_CFG)
 
 #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
 # define cpu_has_invlpg		1
-- 
cgit v1.2.3


From c46e62f73569d7ef42255bd6f31e35925b7f1492 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@suse.cz>
Date: Wed, 28 May 2008 12:42:57 +0200
Subject: i8259: fix final ugliness

Introduce IRQx_VECTOR on 32-bit, so that #ifdef noise is kept
down. There should be no object code change.

[ mingo@elte.hu: merged to x86/irq not x86/i8259 due to x86/irq having
  restructured the vector code into asm-x86/irq_vectors.h, which this
  patch touches. ]

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/i8259.c       | 22 ++++++++--------------
 include/asm-x86/irq_vectors.h |  9 ++++++---
 2 files changed, 14 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 7a0fda8f01b5..dc92b49d9204 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -297,34 +297,28 @@ void init_8259A(int auto_eoi)
 	 * outb_pic - this has to work on a wide range of PC hardware.
 	 */
 	outb_pic(0x11, PIC_MASTER_CMD);	/* ICW1: select 8259A-1 init */
-#ifndef CONFIG_X86_64
-	outb_pic(0x20 + 0, PIC_MASTER_IMR);	/* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
-	outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);	/* 8259A-1 (the master) has a slave on IR2 */
-#else /* CONFIG_X86_64 */
-	/* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
+
+	/* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 on x86-64,
+	                       to 0x20-0x27 on i386 */
 	outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
+
 	/* 8259A-1 (the master) has a slave on IR2 */
-	outb_pic(0x04, PIC_MASTER_IMR);
-#endif /* CONFIG_X86_64 */
+	outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);
+
 	if (auto_eoi)	/* master does Auto EOI */
 		outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
 	else		/* master expects normal EOI */
 		outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
 
 	outb_pic(0x11, PIC_SLAVE_CMD);	/* ICW1: select 8259A-2 init */
-#ifndef CONFIG_X86_64
-	outb_pic(0x20 + 8, PIC_SLAVE_IMR);	/* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
-	outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);	/* 8259A-2 is a slave on master's IR2 */
-	outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */
-#else /* CONFIG_X86_64 */
-	/* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
+
+	/* ICW2: 8259A-2 IR0-7 mapped to IRQ8_VECTOR */
 	outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
 	/* 8259A-2 is a slave on master's IR2 */
 	outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
 	/* (slave's support for AEOI in flat mode is to be investigated) */
 	outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR);
 
-#endif /* CONFIG_X86_64 */
 	if (auto_eoi)
 		/*
 		 * In AEOI mode we just have to mask the interrupt
diff --git a/include/asm-x86/irq_vectors.h b/include/asm-x86/irq_vectors.h
index 3cb6d8c77b39..b58581e2e24e 100644
--- a/include/asm-x86/irq_vectors.h
+++ b/include/asm-x86/irq_vectors.h
@@ -18,17 +18,20 @@
 #endif
 
 /*
- * Vectors 0x20-0x2f are used for ISA interrupts on 32 bit.
- *
  * Reserve the lowest usable priority level 0x20 - 0x2f for triggering
  * cleanup after irq migration on 64 bit.
  */
 #define IRQ_MOVE_CLEANUP_VECTOR	FIRST_EXTERNAL_VECTOR
 
 /*
- * Vectors 0x30-0x3f are used for ISA interrupts on 64 bit
+ * Vectors 0x20-0x2f are used for ISA interrupts on 32 bit.
+ * Vectors 0x30-0x3f are used for ISA interrupts on 64 bit.
  */
+#ifdef CONFIG_X86_32
+#define IRQ0_VECTOR		(FIRST_EXTERNAL_VECTOR)
+#else
 #define IRQ0_VECTOR		(FIRST_EXTERNAL_VECTOR + 0x10)
+#endif
 #define IRQ1_VECTOR		(IRQ0_VECTOR + 1)
 #define IRQ2_VECTOR		(IRQ0_VECTOR + 2)
 #define IRQ3_VECTOR		(IRQ0_VECTOR + 3)
-- 
cgit v1.2.3


From 1a5726528a70bb239bdd149aef7f2155cd2b1699 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 2 Jun 2008 12:21:36 +0200
Subject: fix build bug in "x86: add PCI extended config space access for AMD
 Barcelona"

---
 arch/x86/kernel/cpu/amd.c  | 1 +
 include/asm-x86/mmconfig.h | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 99221f9834e4..656b40aed647 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -4,6 +4,7 @@
 #include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
+#include <asm/mmconfig.h>
 
 #include <mach_apic.h>
 #include "../setup.h"
diff --git a/include/asm-x86/mmconfig.h b/include/asm-x86/mmconfig.h
index 46d6bb135df4..691798fbee10 100644
--- a/include/asm-x86/mmconfig.h
+++ b/include/asm-x86/mmconfig.h
@@ -9,6 +9,10 @@ static inline void fam10h_check_enable_mmcfg(void) { }
 static inline void check_enable_amd_mmconf_dmi(void) { }
 #endif
 
+#if defined(CONFIG_SMP) && defined(CONFIG_X86_64)
 extern void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c);
+#else
+static inline void amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c) { }
+#endif
 
 #endif
-- 
cgit v1.2.3


From 88ff0a474e98f869d8c321e29481f298320100d6 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Tue, 27 May 2008 18:49:39 -0700
Subject: x86: coding style fixes for nmi.c

before
	total: 1 errors, 6 warnings, 534 lines checked
after
	total: 0 errors, 1 warnings, 532 lines checked

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/nmi.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 3671a9f3564b..bf143f191fb1 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -23,9 +23,8 @@
 #include <linux/cpumask.h>
 #include <linux/kernel_stat.h>
 #include <linux/kdebug.h>
+#include <linux/smp.h>
 
-#include <asm/smp.h>
-#include <asm/nmi.h>
 #include <asm/proto.h>
 #include <asm/timer.h>
 
@@ -45,13 +44,16 @@ static cpumask_t backtrace_mask = CPU_MASK_NONE;
  *  0: the lapic NMI watchdog is disabled, but can be enabled
  */
 atomic_t nmi_active = ATOMIC_INIT(0);		/* oprofile uses this */
-static int panic_on_timeout;
+EXPORT_SYMBOL(nmi_active);
 
 unsigned int nmi_watchdog = NMI_DEFAULT;
+EXPORT_SYMBOL(nmi_watchdog);
+
+static int panic_on_timeout;
 
 static unsigned int nmi_hz = HZ;
 static DEFINE_PER_CPU(short, wd_enabled);
-static int endflag __initdata = 0;
+static int endflag __initdata;
 
 static inline unsigned int get_nmi_count(int cpu)
 {
@@ -404,7 +406,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 		static DEFINE_SPINLOCK(lock);	/* Serialise the printks */
 
 		spin_lock(&lock);
-		printk("NMI backtrace for cpu %d\n", cpu);
+		printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
 		dump_stack();
 		spin_unlock(&lock);
 		cpu_clear(cpu, backtrace_mask);
@@ -529,7 +531,3 @@ void __trigger_all_cpu_backtrace(void)
 		mdelay(1);
 	}
 }
-
-EXPORT_SYMBOL(nmi_active);
-EXPORT_SYMBOL(nmi_watchdog);
-
-- 
cgit v1.2.3


From 9f5314fb4d556d3132c784d0df47352b2830ca53 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Wed, 28 May 2008 09:51:18 -0500
Subject: x86, uv: update macros used by UV platform

Update the UV address macros to better describe the
fields of UV physical addresses. Improve comments
in the header files. Add additional MMR definitions.

Signed-off-by: Jack Steiner <steiner@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/genx2apic_uv_x.c | 141 +++++++----
 include/asm-x86/uv/uv_hub.h      | 188 ++++++++++-----
 include/asm-x86/uv/uv_mmrs.h     | 509 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 731 insertions(+), 107 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index ebf13908a743..45e84acca8a9 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -5,7 +5,7 @@
  *
  * SGI UV APIC functions (note: not an Intel compatible APIC)
  *
- * Copyright (C) 2007 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
  */
 
 #include <linux/threads.h>
@@ -55,37 +55,37 @@ static cpumask_t uv_vector_allocation_domain(int cpu)
 int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
 {
 	unsigned long val;
-	int nasid;
+	int pnode;
 
-	nasid = uv_apicid_to_nasid(phys_apicid);
+	pnode = uv_apicid_to_pnode(phys_apicid);
 	val = (1UL << UVH_IPI_INT_SEND_SHFT) |
 	    (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
 	    (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
 	    APIC_DM_INIT;
-	uv_write_global_mmr64(nasid, UVH_IPI_INT, val);
+	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
 	mdelay(10);
 
 	val = (1UL << UVH_IPI_INT_SEND_SHFT) |
 	    (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
 	    (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
 	    APIC_DM_STARTUP;
-	uv_write_global_mmr64(nasid, UVH_IPI_INT, val);
+	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
 	return 0;
 }
 
 static void uv_send_IPI_one(int cpu, int vector)
 {
 	unsigned long val, apicid, lapicid;
-	int nasid;
+	int pnode;
 
 	apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */
 	lapicid = apicid & 0x3f;		/* ZZZ macro needed */
-	nasid = uv_apicid_to_nasid(apicid);
+	pnode = uv_apicid_to_pnode(apicid);
 	val =
 	    (1UL << UVH_IPI_INT_SEND_SHFT) | (lapicid <<
 					      UVH_IPI_INT_APIC_ID_SHFT) |
 	    (vector << UVH_IPI_INT_VECTOR_SHFT);
-	uv_write_global_mmr64(nasid, UVH_IPI_INT, val);
+	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
 }
 
 static void uv_send_IPI_mask(cpumask_t mask, int vector)
@@ -159,39 +159,81 @@ struct genapic apic_x2apic_uv_x = {
 	.phys_pkg_id = phys_pkg_id,	/* Fixme ZZZ */
 };
 
-static __cpuinit void set_x2apic_extra_bits(int nasid)
+static __cpuinit void set_x2apic_extra_bits(int pnode)
 {
-	__get_cpu_var(x2apic_extra_bits) = ((nasid >> 1) << 6);
+	__get_cpu_var(x2apic_extra_bits) = (pnode << 6);
 }
 
 /*
  * Called on boot cpu.
  */
+static __init int boot_pnode_to_blade(int pnode)
+{
+	int blade;
+
+	for (blade = 0; blade < uv_num_possible_blades(); blade++)
+		if (pnode == uv_blade_info[blade].pnode)
+			return blade;
+	BUG();
+}
+
+struct redir_addr {
+	unsigned long redirect;
+	unsigned long alias;
+};
+
+#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT
+
+static __initdata struct redir_addr redir_addrs[] = {
+	{UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG},
+	{UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG},
+	{UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG},
+};
+
+static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
+{
+	union uvh_si_alias0_overlay_config_u alias;
+	union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) {
+		alias.v = uv_read_local_mmr(redir_addrs[i].alias);
+		if (alias.s.base == 0) {
+			*size = (1UL << alias.s.m_alias);
+			redirect.v = uv_read_local_mmr(redir_addrs[i].redirect);
+			*base = (unsigned long)redirect.s.dest_base << DEST_SHIFT;
+			return;
+		}
+	}
+	BUG();
+}
+
 static __init void uv_system_init(void)
 {
 	union uvh_si_addr_map_config_u m_n_config;
-	int bytes, nid, cpu, lcpu, nasid, last_nasid, blade;
-	unsigned long mmr_base;
+	union uvh_node_id_u node_id;
+	unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
+	int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
+	unsigned long mmr_base, present;
 
 	m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
+	m_val = m_n_config.s.m_skt;
+	n_val = m_n_config.s.n_skt;
 	mmr_base =
 	    uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
 	    ~UV_MMR_ENABLE;
 	printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
 
-	last_nasid = -1;
-	for_each_possible_cpu(cpu) {
-		nid = cpu_to_node(cpu);
-		nasid = uv_apicid_to_nasid(per_cpu(x86_cpu_to_apicid, cpu));
-		if (nasid != last_nasid)
-			uv_possible_blades++;
-		last_nasid = nasid;
-	}
+	for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
+		uv_possible_blades +=
+		  hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8));
 	printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades());
 
 	bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
 	uv_blade_info = alloc_bootmem_pages(bytes);
 
+	get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
+
 	bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();
 	uv_node_to_blade = alloc_bootmem_pages(bytes);
 	memset(uv_node_to_blade, 255, bytes);
@@ -200,43 +242,56 @@ static __init void uv_system_init(void)
 	uv_cpu_to_blade = alloc_bootmem_pages(bytes);
 	memset(uv_cpu_to_blade, 255, bytes);
 
-	last_nasid = -1;
-	blade = -1;
-	lcpu = -1;
-	for_each_possible_cpu(cpu) {
-		nid = cpu_to_node(cpu);
-		nasid = uv_apicid_to_nasid(per_cpu(x86_cpu_to_apicid, cpu));
-		if (nasid != last_nasid) {
-			blade++;
-			lcpu = -1;
-			uv_blade_info[blade].nr_posible_cpus = 0;
+	blade = 0;
+	for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) {
+		present = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8);
+		for (j = 0; j < 64; j++) {
+			if (!test_bit(j, &present))
+				continue;
+			uv_blade_info[blade].pnode = (i * 64 + j);
+			uv_blade_info[blade].nr_possible_cpus = 0;
 			uv_blade_info[blade].nr_online_cpus = 0;
+			blade++;
 		}
-		last_nasid = nasid;
-		lcpu++;
+	}
 
-		uv_cpu_hub_info(cpu)->m_val = m_n_config.s.m_skt;
-		uv_cpu_hub_info(cpu)->n_val = m_n_config.s.n_skt;
+	node_id.v = uv_read_local_mmr(UVH_NODE_ID);
+	gnode_upper = (((unsigned long)node_id.s.node_id) &
+		       ~((1 << n_val) - 1)) << m_val;
+
+	for_each_present_cpu(cpu) {
+		nid = cpu_to_node(cpu);
+		pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu));
+		blade = boot_pnode_to_blade(pnode);
+		lcpu = uv_blade_info[blade].nr_possible_cpus;
+		uv_blade_info[blade].nr_possible_cpus++;
+
+		uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
+		uv_cpu_hub_info(cpu)->lowmem_remap_top =
+					lowmem_redir_base + lowmem_redir_size;
+		uv_cpu_hub_info(cpu)->m_val = m_val;
+		uv_cpu_hub_info(cpu)->n_val = m_val;
 		uv_cpu_hub_info(cpu)->numa_blade_id = blade;
 		uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
-		uv_cpu_hub_info(cpu)->local_nasid = nasid;
-		uv_cpu_hub_info(cpu)->gnode_upper =
-		    nasid & ~((1 << uv_hub_info->n_val) - 1);
+		uv_cpu_hub_info(cpu)->pnode = pnode;
+		uv_cpu_hub_info(cpu)->pnode_mask = (1 << n_val) - 1;
+		uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
+		uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
 		uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
 		uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */
-		uv_blade_info[blade].nasid = nasid;
-		uv_blade_info[blade].nr_posible_cpus++;
 		uv_node_to_blade[nid] = blade;
 		uv_cpu_to_blade[cpu] = blade;
 
-		printk(KERN_DEBUG "UV cpu %d, apicid 0x%x, nasid %d, nid %d\n",
-		       cpu, per_cpu(x86_cpu_to_apicid, cpu), nasid, nid);
-		printk(KERN_DEBUG "UV   lcpu %d, blade %d\n", lcpu, blade);
+		printk(KERN_DEBUG "UV cpu %d, apicid 0x%x, pnode %d, nid %d, "
+			"lcpu %d, blade %d\n",
+			cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid,
+			lcpu, blade);
 	}
 }
 
 /*
  * Called on each cpu to initialize the per_cpu UV data area.
+ * 	ZZZ hotplug not supported yet
  */
 void __cpuinit uv_cpu_init(void)
 {
@@ -246,5 +301,5 @@ void __cpuinit uv_cpu_init(void)
 	uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
 
 	if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
-		set_x2apic_extra_bits(uv_hub_info->local_nasid);
+		set_x2apic_extra_bits(uv_hub_info->pnode);
 }
diff --git a/include/asm-x86/uv/uv_hub.h b/include/asm-x86/uv/uv_hub.h
index 26b9240d1e23..65004881de5f 100644
--- a/include/asm-x86/uv/uv_hub.h
+++ b/include/asm-x86/uv/uv_hub.h
@@ -5,7 +5,7 @@
  *
  * SGI UV architectural definitions
  *
- * Copyright (C) 2007 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
  */
 
 #ifndef __ASM_X86_UV_HUB_H__
@@ -20,26 +20,49 @@
 /*
  * Addressing Terminology
  *
- * 	NASID - network ID of a router, Mbrick or Cbrick. Nasid values of
- * 		routers always have low bit of 1, C/MBricks have low bit
- * 		equal to 0. Most addressing macros that target UV hub chips
- * 		right shift the NASID by 1 to exclude the always-zero bit.
+ *	M       - The low M bits of a physical address represent the offset
+ *		  into the blade local memory. RAM memory on a blade is physically
+ *		  contiguous (although various IO spaces may punch holes in
+ *		  it)..
  *
- *	SNASID - NASID right shifted by 1 bit.
+ * 	N	- Number of bits in the node portion of a socket physical
+ * 		  address.
+ *
+ * 	NASID   - network ID of a router, Mbrick or Cbrick. Nasid values of
+ * 	 	  routers always have low bit of 1, C/MBricks have low bit
+ * 		  equal to 0. Most addressing macros that target UV hub chips
+ * 		  right shift the NASID by 1 to exclude the always-zero bit.
+ * 		  NASIDs contain up to 15 bits.
+ *
+ *	GNODE   - NASID right shifted by 1 bit. Most mmrs contain gnodes instead
+ *		  of nasids.
+ *
+ * 	PNODE   - the low N bits of the GNODE. The PNODE is the most useful variant
+ * 		  of the nasid for socket usage.
+ *
+ *
+ *  NumaLink Global Physical Address Format:
+ *  +--------------------------------+---------------------+
+ *  |00..000|      GNODE             |      NodeOffset     |
+ *  +--------------------------------+---------------------+
+ *          |<-------53 - M bits --->|<--------M bits ----->
+ *
+ *	M - number of node offset bits (35 .. 40)
  *
  *
  *  Memory/UV-HUB Processor Socket Address Format:
- *  +--------+---------------+---------------------+
- *  |00..0000|    SNASID     |      NodeOffset     |
- *  +--------+---------------+---------------------+
- *           <--- N bits --->|<--------M bits ----->
+ *  +----------------+---------------+---------------------+
+ *  |00..000000000000|   PNODE       |      NodeOffset     |
+ *  +----------------+---------------+---------------------+
+ *                   <--- N bits --->|<--------M bits ----->
  *
- *	M number of node offset bits (35 .. 40)
- *	N number of SNASID bits (0 .. 10)
+ *	M - number of node offset bits (35 .. 40)
+ *	N - number of PNODE bits (0 .. 10)
  *
  *		Note: M + N cannot currently exceed 44 (x86_64) or 46 (IA64).
  *		The actual values are configuration dependent and are set at
- *		boot time
+ *		boot time. M & N values are set by the hardware/BIOS at boot.
+ *
  *
  * APICID format
  * 	NOTE!!!!!! This is the current format of the APICID. However, code
@@ -48,14 +71,14 @@
  *
  * 		1111110000000000
  * 		5432109876543210
- *		nnnnnnnnnnlc0cch
+ *		pppppppppplc0cch
  *		sssssssssss
  *
- *			n  = snasid bits
+ *			p  = pnode bits
  *			l =  socket number on board
  *			c  = core
  *			h  = hyperthread
- *			s  = bits that are in the socket CSR
+ *			s  = bits that are in the SOCKET_ID CSR
  *
  *	Note: Processor only supports 12 bits in the APICID register. The ACPI
  *	      tables hold all 16 bits. Software needs to be aware of this.
@@ -74,7 +97,7 @@
  * This value is also the value of the maximum number of non-router NASIDs
  * in the numalink fabric.
  *
- * NOTE: a brick may be 1 or 2 OS nodes. Don't get these confused.
+ * NOTE: a brick may contain 1 or 2 OS nodes. Don't get these confused.
  */
 #define UV_MAX_NUMALINK_BLADES	16384
 
@@ -96,8 +119,12 @@
  */
 struct uv_hub_info_s {
 	unsigned long	global_mmr_base;
-	unsigned short	local_nasid;
-	unsigned short	gnode_upper;
+	unsigned long	gpa_mask;
+	unsigned long	gnode_upper;
+	unsigned long	lowmem_remap_top;
+	unsigned long	lowmem_remap_base;
+	unsigned short	pnode;
+	unsigned short	pnode_mask;
 	unsigned short	coherency_domain_number;
 	unsigned short	numa_blade_id;
 	unsigned char	blade_processor_id;
@@ -112,83 +139,124 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
  * Local & Global MMR space macros.
  * 	Note: macros are intended to be used ONLY by inline functions
  * 	in this file - not by other kernel code.
+ * 		n -  NASID (full 15-bit global nasid)
+ * 		g -  GNODE (full 15-bit global nasid, right shifted 1)
+ * 		p -  PNODE (local part of nsids, right shifted 1)
  */
-#define UV_SNASID(n)			((n) >> 1)
-#define UV_NASID(n)			((n) << 1)
+#define UV_NASID_TO_PNODE(n)		(((n) >> 1) & uv_hub_info->pnode_mask)
+#define UV_PNODE_TO_NASID(p)		(((p) << 1) | uv_hub_info->gnode_upper)
 
 #define UV_LOCAL_MMR_BASE		0xf4000000UL
 #define UV_GLOBAL_MMR32_BASE		0xf8000000UL
 #define UV_GLOBAL_MMR64_BASE		(uv_hub_info->global_mmr_base)
 
-#define UV_GLOBAL_MMR32_SNASID_MASK	0x3ff
-#define UV_GLOBAL_MMR32_SNASID_SHIFT	15
-#define UV_GLOBAL_MMR64_SNASID_SHIFT	26
+#define UV_GLOBAL_MMR32_PNODE_SHIFT	15
+#define UV_GLOBAL_MMR64_PNODE_SHIFT	26
 
-#define UV_GLOBAL_MMR32_NASID_BITS(n)					\
-		(((UV_SNASID(n) & UV_GLOBAL_MMR32_SNASID_MASK)) <<	\
-		(UV_GLOBAL_MMR32_SNASID_SHIFT))
+#define UV_GLOBAL_MMR32_PNODE_BITS(p)	((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
 
-#define UV_GLOBAL_MMR64_NASID_BITS(n)					\
-	((unsigned long)UV_SNASID(n) << UV_GLOBAL_MMR64_SNASID_SHIFT)
+#define UV_GLOBAL_MMR64_PNODE_BITS(p)					\
+	((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT)
+
+#define UV_APIC_PNODE_SHIFT	6
+
+/*
+ * Macros for converting between kernel virtual addresses, socket local physical
+ * addresses, and UV global physical addresses.
+ * 	Note: use the standard __pa() & __va() macros for converting
+ * 	      between socket virtual and socket physical addresses.
+ */
+
+/* socket phys RAM --> UV global physical address */
+static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr)
+{
+	if (paddr < uv_hub_info->lowmem_remap_top)
+		paddr += uv_hub_info->lowmem_remap_base;
+	return paddr | uv_hub_info->gnode_upper;
+}
+
+
+/* socket virtual --> UV global physical address */
+static inline unsigned long uv_gpa(void *v)
+{
+	return __pa(v) | uv_hub_info->gnode_upper;
+}
+
+/* socket virtual --> UV global physical address */
+static inline void *uv_vgpa(void *v)
+{
+	return (void *)uv_gpa(v);
+}
+
+/* UV global physical address --> socket virtual */
+static inline void *uv_va(unsigned long gpa)
+{
+	return __va(gpa & uv_hub_info->gpa_mask);
+}
+
+/* pnode, offset --> socket virtual */
+static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset)
+{
+	return __va(((unsigned long)pnode << uv_hub_info->m_val) | offset);
+}
 
-#define UV_APIC_NASID_SHIFT	6
 
 /*
- * Extract a NASID from an APICID (full apicid, not processor subset)
+ * Extract a PNODE from an APICID (full apicid, not processor subset)
  */
-static inline int uv_apicid_to_nasid(int apicid)
+static inline int uv_apicid_to_pnode(int apicid)
 {
-	return (UV_NASID(apicid >> UV_APIC_NASID_SHIFT));
+	return (apicid >> UV_APIC_PNODE_SHIFT);
 }
 
 /*
  * Access global MMRs using the low memory MMR32 space. This region supports
  * faster MMR access but not all MMRs are accessible in this space.
  */
-static inline unsigned long *uv_global_mmr32_address(int nasid,
+static inline unsigned long *uv_global_mmr32_address(int pnode,
 				unsigned long offset)
 {
 	return __va(UV_GLOBAL_MMR32_BASE |
-		       UV_GLOBAL_MMR32_NASID_BITS(nasid) | offset);
+		       UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset);
 }
 
-static inline void uv_write_global_mmr32(int nasid, unsigned long offset,
+static inline void uv_write_global_mmr32(int pnode, unsigned long offset,
 				 unsigned long val)
 {
-	*uv_global_mmr32_address(nasid, offset) = val;
+	*uv_global_mmr32_address(pnode, offset) = val;
 }
 
-static inline unsigned long uv_read_global_mmr32(int nasid,
+static inline unsigned long uv_read_global_mmr32(int pnode,
 						 unsigned long offset)
 {
-	return *uv_global_mmr32_address(nasid, offset);
+	return *uv_global_mmr32_address(pnode, offset);
 }
 
 /*
  * Access Global MMR space using the MMR space located at the top of physical
  * memory.
  */
-static inline unsigned long *uv_global_mmr64_address(int nasid,
+static inline unsigned long *uv_global_mmr64_address(int pnode,
 				unsigned long offset)
 {
 	return __va(UV_GLOBAL_MMR64_BASE |
-		       UV_GLOBAL_MMR64_NASID_BITS(nasid) | offset);
+		    UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset);
 }
 
-static inline void uv_write_global_mmr64(int nasid, unsigned long offset,
+static inline void uv_write_global_mmr64(int pnode, unsigned long offset,
 				unsigned long val)
 {
-	*uv_global_mmr64_address(nasid, offset) = val;
+	*uv_global_mmr64_address(pnode, offset) = val;
 }
 
-static inline unsigned long uv_read_global_mmr64(int nasid,
+static inline unsigned long uv_read_global_mmr64(int pnode,
 						 unsigned long offset)
 {
-	return *uv_global_mmr64_address(nasid, offset);
+	return *uv_global_mmr64_address(pnode, offset);
 }
 
 /*
- * Access node local MMRs. Faster than using global space but only local MMRs
+ * Access hub local MMRs. Faster than using global space but only local MMRs
  * are accessible.
  */
 static inline unsigned long *uv_local_mmr_address(unsigned long offset)
@@ -207,15 +275,15 @@ static inline void uv_write_local_mmr(unsigned long offset, unsigned long val)
 }
 
 /*
- * Structures and definitions for converting between cpu, node, and blade
+ * Structures and definitions for converting between cpu, node, pnode, and blade
  * numbers.
  */
 struct uv_blade_info {
-	unsigned short	nr_posible_cpus;
+	unsigned short	nr_possible_cpus;
 	unsigned short	nr_online_cpus;
-	unsigned short	nasid;
+	unsigned short	pnode;
 };
-struct uv_blade_info *uv_blade_info;
+extern struct uv_blade_info *uv_blade_info;
 extern short *uv_node_to_blade;
 extern short *uv_cpu_to_blade;
 extern short uv_possible_blades;
@@ -244,16 +312,16 @@ static inline int uv_node_to_blade_id(int nid)
 	return uv_node_to_blade[nid];
 }
 
-/* Convert a blade id to the NASID of the blade */
-static inline int uv_blade_to_nasid(int bid)
+/* Convert a blade id to the PNODE of the blade */
+static inline int uv_blade_to_pnode(int bid)
 {
-	return uv_blade_info[bid].nasid;
+	return uv_blade_info[bid].pnode;
 }
 
 /* Determine the number of possible cpus on a blade */
 static inline int uv_blade_nr_possible_cpus(int bid)
 {
-	return uv_blade_info[bid].nr_posible_cpus;
+	return uv_blade_info[bid].nr_possible_cpus;
 }
 
 /* Determine the number of online cpus on a blade */
@@ -262,16 +330,16 @@ static inline int uv_blade_nr_online_cpus(int bid)
 	return uv_blade_info[bid].nr_online_cpus;
 }
 
-/* Convert a cpu id to the NASID of the blade containing the cpu */
-static inline int uv_cpu_to_nasid(int cpu)
+/* Convert a cpu id to the PNODE of the blade containing the cpu */
+static inline int uv_cpu_to_pnode(int cpu)
 {
-	return uv_blade_info[uv_cpu_to_blade_id(cpu)].nasid;
+	return uv_blade_info[uv_cpu_to_blade_id(cpu)].pnode;
 }
 
-/* Convert a node number to the NASID of the blade */
-static inline int uv_node_to_nasid(int nid)
+/* Convert a linux node number to the PNODE of the blade */
+static inline int uv_node_to_pnode(int nid)
 {
-	return uv_blade_info[uv_node_to_blade_id(nid)].nasid;
+	return uv_blade_info[uv_node_to_blade_id(nid)].pnode;
 }
 
 /* Maximum possible number of blades */
diff --git a/include/asm-x86/uv/uv_mmrs.h b/include/asm-x86/uv/uv_mmrs.h
index 3b69fe6b6376..ac9846076521 100644
--- a/include/asm-x86/uv/uv_mmrs.h
+++ b/include/asm-x86/uv/uv_mmrs.h
@@ -11,11 +11,46 @@
 #ifndef __ASM_X86_UV_MMRS__
 #define __ASM_X86_UV_MMRS__
 
-/*
- *       AUTO GENERATED - Do not edit
- */
+#define UV_MMR_ENABLE		(1UL << 63)
 
- #define UV_MMR_ENABLE		(1UL << 63)
+/* ========================================================================= */
+/*                           UVH_BAU_DATA_CONFIG                             */
+/* ========================================================================= */
+#define UVH_BAU_DATA_CONFIG 0x61680UL
+#define UVH_BAU_DATA_CONFIG_32 0x0450
+
+#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0
+#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_BAU_DATA_CONFIG_DM_SHFT 8
+#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11
+#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12
+#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_BAU_DATA_CONFIG_P_SHFT 13
+#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_BAU_DATA_CONFIG_T_SHFT 15
+#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_BAU_DATA_CONFIG_M_SHFT 16
+#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32
+#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
+union uvh_bau_data_config_u {
+    unsigned long	v;
+    struct uvh_bau_data_config_s {
+	unsigned long	vector_  :  8;  /* RW */
+	unsigned long	dm       :  3;  /* RW */
+	unsigned long	destmode :  1;  /* RW */
+	unsigned long	status   :  1;  /* RO */
+	unsigned long	p        :  1;  /* RO */
+	unsigned long	rsvd_14  :  1;  /*    */
+	unsigned long	t        :  1;  /* RO */
+	unsigned long	m        :  1;  /* RW */
+	unsigned long	rsvd_17_31: 15;  /*    */
+	unsigned long	apic_id  : 32;  /* RW */
+    } s;
+};
 
 /* ========================================================================= */
 /*                               UVH_IPI_INT                                 */
@@ -109,6 +144,7 @@ union uvh_lb_bau_intd_payload_queue_tail_u {
 /*                   UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE                    */
 /* ========================================================================= */
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0x0aa0
 
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL
@@ -169,6 +205,7 @@ union uvh_lb_bau_intd_software_acknowledge_u {
 /*                UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS                 */
 /* ========================================================================= */
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0x0aa8
 
 /* ========================================================================= */
 /*                     UVH_LB_BAU_SB_ACTIVATION_CONTROL                      */
@@ -247,6 +284,331 @@ union uvh_lb_bau_sb_descriptor_base_u {
     } s;
 };
 
+/* ========================================================================= */
+/*                      UVH_LB_MCAST_AOERR0_RPT_ENABLE                       */
+/* ========================================================================= */
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE 0x50b20UL
+
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_SHFT 0
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_MASK 0x0000000000000001UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_SHFT 1
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_MASK 0x0000000000000002UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_SHFT 2
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_MASK 0x0000000000000004UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_SHFT 3
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_MASK 0x0000000000000008UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_SHFT 4
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_MASK 0x0000000000000010UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_SHFT 5
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_MASK 0x0000000000000020UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_SHFT 6
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_MASK 0x0000000000000040UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_SHFT 7
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_MASK 0x0000000000000080UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_SHFT 8
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_MASK 0x0000000000000100UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_SHFT 9
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_MASK 0x0000000000000200UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_SHFT 10
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_MASK 0x0000000000000400UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_SHFT 11
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_MASK 0x0000000000000800UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_SHFT 12
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_MASK 0x0000000000001000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_SHFT 13
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_MASK 0x0000000000002000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_SHFT 14
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_MASK 0x0000000000004000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_SHFT 15
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_MASK 0x0000000000008000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_SHFT 16
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_MASK 0x0000000000010000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_SHFT 17
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_MASK 0x0000000000020000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_SHFT 18
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_MASK 0x0000000000040000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_SHFT 19
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_MASK 0x0000000000080000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_SHFT 20
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_MASK 0x0000000000100000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_SHFT 21
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_MASK 0x0000000000200000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_TIMEOUT_SHFT 22
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_TIMEOUT_MASK 0x0000000000400000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_SHFT 23
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_MASK 0x0000000000800000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_SHFT 24
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_MASK 0x0000000001000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_SHFT 25
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_MASK 0x0000000002000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_SHFT 26
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_MASK 0x0000000004000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_SHFT 27
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_MASK 0x0000000008000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_SHFT 28
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_MASK 0x0000000010000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_SHFT 29
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_MASK 0x0000000020000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_SHFT 30
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_MASK 0x0000000040000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_SHFT 31
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_MASK 0x0000000080000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_SHFT 32
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_MASK 0x0000000100000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_SHFT 33
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_MASK 0x0000000200000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_SHFT 34
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_MASK 0x0000000400000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_SHFT 35
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_MASK 0x0000000800000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_SHFT 36
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_MASK 0x0000001000000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_SHFT 37
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_MASK 0x0000002000000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_SHFT 38
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_MASK 0x0000004000000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_SHFT 39
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_MASK 0x0000008000000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_SHFT 40
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_MASK 0x0000010000000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_SHFT 41
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_MASK 0x0000020000000000UL
+
+union uvh_lb_mcast_aoerr0_rpt_enable_u {
+    unsigned long	v;
+    struct uvh_lb_mcast_aoerr0_rpt_enable_s {
+	unsigned long	mcast_obese_msg                         :  1;  /* RW */
+	unsigned long	mcast_data_sb_err                       :  1;  /* RW */
+	unsigned long	mcast_nack_buff_parity                  :  1;  /* RW */
+	unsigned long	mcast_timeout                           :  1;  /* RW */
+	unsigned long	mcast_inactive_reply                    :  1;  /* RW */
+	unsigned long	mcast_upgrade_error                     :  1;  /* RW */
+	unsigned long	mcast_reg_count_underflow               :  1;  /* RW */
+	unsigned long	mcast_rep_obese_msg                     :  1;  /* RW */
+	unsigned long	ucache_req_runt_msg                     :  1;  /* RW */
+	unsigned long	ucache_req_obese_msg                    :  1;  /* RW */
+	unsigned long	ucache_req_data_sb_err                  :  1;  /* RW */
+	unsigned long	ucache_rep_runt_msg                     :  1;  /* RW */
+	unsigned long	ucache_rep_obese_msg                    :  1;  /* RW */
+	unsigned long	ucache_rep_data_sb_err                  :  1;  /* RW */
+	unsigned long	ucache_rep_command_err                  :  1;  /* RW */
+	unsigned long	ucache_pend_timeout                     :  1;  /* RW */
+	unsigned long	macc_req_runt_msg                       :  1;  /* RW */
+	unsigned long	macc_req_obese_msg                      :  1;  /* RW */
+	unsigned long	macc_req_data_sb_err                    :  1;  /* RW */
+	unsigned long	macc_rep_runt_msg                       :  1;  /* RW */
+	unsigned long	macc_rep_obese_msg                      :  1;  /* RW */
+	unsigned long	macc_rep_data_sb_err                    :  1;  /* RW */
+	unsigned long	macc_timeout                            :  1;  /* RW */
+	unsigned long	macc_spurious_event                     :  1;  /* RW */
+	unsigned long	ioh_destination_table_parity            :  1;  /* RW */
+	unsigned long	get_had_error_reply                     :  1;  /* RW */
+	unsigned long	get_timeout                             :  1;  /* RW */
+	unsigned long	lock_manager_had_error_reply            :  1;  /* RW */
+	unsigned long	put_had_error_reply                     :  1;  /* RW */
+	unsigned long	put_timeout                             :  1;  /* RW */
+	unsigned long	sb_activation_overrun                   :  1;  /* RW */
+	unsigned long	completed_gb_activation_had_error_reply :  1;  /* RW */
+	unsigned long	completed_gb_activation_timeout         :  1;  /* RW */
+	unsigned long	descriptor_buffer_0_parity              :  1;  /* RW */
+	unsigned long	descriptor_buffer_1_parity              :  1;  /* RW */
+	unsigned long	socket_destination_table_parity         :  1;  /* RW */
+	unsigned long	bau_reply_payload_corruption            :  1;  /* RW */
+	unsigned long	io_port_destination_table_parity        :  1;  /* RW */
+	unsigned long	intd_soft_ack_timeout                   :  1;  /* RW */
+	unsigned long	int_rep_obese_msg                       :  1;  /* RW */
+	unsigned long	int_rep_command_err                     :  1;  /* RW */
+	unsigned long	int_timeout                             :  1;  /* RW */
+	unsigned long	rsvd_42_63                              : 22;  /*    */
+    } s;
+};
+
+/* ========================================================================= */
+/*                          UVH_LOCAL_INT0_CONFIG                            */
+/* ========================================================================= */
+#define UVH_LOCAL_INT0_CONFIG 0x61000UL
+
+#define UVH_LOCAL_INT0_CONFIG_VECTOR_SHFT 0
+#define UVH_LOCAL_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_LOCAL_INT0_CONFIG_DM_SHFT 8
+#define UVH_LOCAL_INT0_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_LOCAL_INT0_CONFIG_DESTMODE_SHFT 11
+#define UVH_LOCAL_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_LOCAL_INT0_CONFIG_STATUS_SHFT 12
+#define UVH_LOCAL_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_LOCAL_INT0_CONFIG_P_SHFT 13
+#define UVH_LOCAL_INT0_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_LOCAL_INT0_CONFIG_T_SHFT 15
+#define UVH_LOCAL_INT0_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_LOCAL_INT0_CONFIG_M_SHFT 16
+#define UVH_LOCAL_INT0_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_LOCAL_INT0_CONFIG_APIC_ID_SHFT 32
+#define UVH_LOCAL_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
+union uvh_local_int0_config_u {
+    unsigned long	v;
+    struct uvh_local_int0_config_s {
+	unsigned long	vector_  :  8;  /* RW */
+	unsigned long	dm       :  3;  /* RW */
+	unsigned long	destmode :  1;  /* RW */
+	unsigned long	status   :  1;  /* RO */
+	unsigned long	p        :  1;  /* RO */
+	unsigned long	rsvd_14  :  1;  /*    */
+	unsigned long	t        :  1;  /* RO */
+	unsigned long	m        :  1;  /* RW */
+	unsigned long	rsvd_17_31: 15;  /*    */
+	unsigned long	apic_id  : 32;  /* RW */
+    } s;
+};
+
+/* ========================================================================= */
+/*                          UVH_LOCAL_INT0_ENABLE                            */
+/* ========================================================================= */
+#define UVH_LOCAL_INT0_ENABLE 0x65000UL
+
+#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_SHFT 0
+#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_MASK 0x0000000000000001UL
+#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_SHFT 1
+#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_MASK 0x0000000000000002UL
+#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_SHFT 2
+#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_MASK 0x0000000000000004UL
+#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_SHFT 3
+#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_MASK 0x0000000000000008UL
+#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_SHFT 4
+#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_MASK 0x0000000000000010UL
+#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_SHFT 5
+#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_MASK 0x0000000000000020UL
+#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_SHFT 6
+#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_MASK 0x0000000000000040UL
+#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_SHFT 7
+#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_MASK 0x0000000000000080UL
+#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_SHFT 8
+#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_MASK 0x0000000000000100UL
+#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_SHFT 9
+#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_MASK 0x0000000000000200UL
+#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_SHFT 10
+#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_MASK 0x0000000000000400UL
+#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_SHFT 11
+#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_MASK 0x0000000000000800UL
+#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_SHFT 12
+#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_MASK 0x0000000000001000UL
+#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_SHFT 13
+#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_MASK 0x0000000000002000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_SHFT 14
+#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_MASK 0x0000000000004000UL
+#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_SHFT 15
+#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_MASK 0x0000000000008000UL
+#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_SHFT 16
+#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_MASK 0x0000000000010000UL
+#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_SHFT 17
+#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_MASK 0x0000000000020000UL
+#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_SHFT 18
+#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_MASK 0x0000000000040000UL
+#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_SHFT 19
+#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_MASK 0x0000000000080000UL
+#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_SHFT 20
+#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_MASK 0x0000000000100000UL
+#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_SHFT 21
+#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_MASK 0x0000000000200000UL
+#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_SHFT 22
+#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_SHFT 23
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_MASK 0x0000000000800000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_SHFT 24
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_MASK 0x0000000001000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_SHFT 25
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_MASK 0x0000000002000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_SHFT 26
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_MASK 0x0000000004000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_SHFT 27
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_MASK 0x0000000008000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_SHFT 28
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_MASK 0x0000000010000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_SHFT 29
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_MASK 0x0000000020000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_SHFT 30
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_MASK 0x0000000040000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_SHFT 31
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_MASK 0x0000000080000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_SHFT 32
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_MASK 0x0000000100000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_SHFT 33
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_MASK 0x0000000200000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_SHFT 34
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_MASK 0x0000000400000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_SHFT 35
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_MASK 0x0000000800000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_SHFT 36
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_MASK 0x0000001000000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_SHFT 37
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_MASK 0x0000002000000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_SHFT 38
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_MASK 0x0000004000000000UL
+#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_SHFT 39
+#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_MASK 0x0000008000000000UL
+#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_SHFT 40
+#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_MASK 0x0000010000000000UL
+#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_SHFT 41
+#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_MASK 0x0000020000000000UL
+#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_SHFT 42
+#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_MASK 0x0000040000000000UL
+#define UVH_LOCAL_INT0_ENABLE_LTC_INT_SHFT 43
+#define UVH_LOCAL_INT0_ENABLE_LTC_INT_MASK 0x0000080000000000UL
+#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_SHFT 44
+#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
+
+union uvh_local_int0_enable_u {
+    unsigned long	v;
+    struct uvh_local_int0_enable_s {
+	unsigned long	lb_hcerr            :  1;  /* RW */
+	unsigned long	gr0_hcerr           :  1;  /* RW */
+	unsigned long	gr1_hcerr           :  1;  /* RW */
+	unsigned long	lh_hcerr            :  1;  /* RW */
+	unsigned long	rh_hcerr            :  1;  /* RW */
+	unsigned long	xn_hcerr            :  1;  /* RW */
+	unsigned long	si_hcerr            :  1;  /* RW */
+	unsigned long	lb_aoerr0           :  1;  /* RW */
+	unsigned long	gr0_aoerr0          :  1;  /* RW */
+	unsigned long	gr1_aoerr0          :  1;  /* RW */
+	unsigned long	lh_aoerr0           :  1;  /* RW */
+	unsigned long	rh_aoerr0           :  1;  /* RW */
+	unsigned long	xn_aoerr0           :  1;  /* RW */
+	unsigned long	si_aoerr0           :  1;  /* RW */
+	unsigned long	lb_aoerr1           :  1;  /* RW */
+	unsigned long	gr0_aoerr1          :  1;  /* RW */
+	unsigned long	gr1_aoerr1          :  1;  /* RW */
+	unsigned long	lh_aoerr1           :  1;  /* RW */
+	unsigned long	rh_aoerr1           :  1;  /* RW */
+	unsigned long	xn_aoerr1           :  1;  /* RW */
+	unsigned long	si_aoerr1           :  1;  /* RW */
+	unsigned long	rh_vpi_int          :  1;  /* RW */
+	unsigned long	system_shutdown_int :  1;  /* RW */
+	unsigned long	lb_irq_int_0        :  1;  /* RW */
+	unsigned long	lb_irq_int_1        :  1;  /* RW */
+	unsigned long	lb_irq_int_2        :  1;  /* RW */
+	unsigned long	lb_irq_int_3        :  1;  /* RW */
+	unsigned long	lb_irq_int_4        :  1;  /* RW */
+	unsigned long	lb_irq_int_5        :  1;  /* RW */
+	unsigned long	lb_irq_int_6        :  1;  /* RW */
+	unsigned long	lb_irq_int_7        :  1;  /* RW */
+	unsigned long	lb_irq_int_8        :  1;  /* RW */
+	unsigned long	lb_irq_int_9        :  1;  /* RW */
+	unsigned long	lb_irq_int_10       :  1;  /* RW */
+	unsigned long	lb_irq_int_11       :  1;  /* RW */
+	unsigned long	lb_irq_int_12       :  1;  /* RW */
+	unsigned long	lb_irq_int_13       :  1;  /* RW */
+	unsigned long	lb_irq_int_14       :  1;  /* RW */
+	unsigned long	lb_irq_int_15       :  1;  /* RW */
+	unsigned long	l1_nmi_int          :  1;  /* RW */
+	unsigned long	stop_clock          :  1;  /* RW */
+	unsigned long	asic_to_l1          :  1;  /* RW */
+	unsigned long	l1_to_asic          :  1;  /* RW */
+	unsigned long	ltc_int             :  1;  /* RW */
+	unsigned long	la_seq_trigger      :  1;  /* RW */
+	unsigned long	rsvd_45_63          : 19;  /*    */
+    } s;
+};
+
 /* ========================================================================= */
 /*                               UVH_NODE_ID                                 */
 /* ========================================================================= */
@@ -283,6 +645,73 @@ union uvh_node_id_u {
     } s;
 };
 
+/* ========================================================================= */
+/*                          UVH_NODE_PRESENT_TABLE                           */
+/* ========================================================================= */
+#define UVH_NODE_PRESENT_TABLE 0x1400UL
+#define UVH_NODE_PRESENT_TABLE_DEPTH 16
+
+#define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0
+#define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL
+
+union uvh_node_present_table_u {
+    unsigned long	v;
+    struct uvh_node_present_table_s {
+	unsigned long	nodes : 64;  /* RW */
+    } s;
+};
+
+/* ========================================================================= */
+/*                UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR                  */
+/* ========================================================================= */
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL
+
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+union uvh_rh_gam_alias210_redirect_config_0_mmr_u {
+    unsigned long	v;
+    struct uvh_rh_gam_alias210_redirect_config_0_mmr_s {
+	unsigned long	rsvd_0_23 : 24;  /*    */
+	unsigned long	dest_base : 22;  /* RW */
+	unsigned long	rsvd_46_63: 18;  /*    */
+    } s;
+};
+
+/* ========================================================================= */
+/*                UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR                  */
+/* ========================================================================= */
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL
+
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+union uvh_rh_gam_alias210_redirect_config_1_mmr_u {
+    unsigned long	v;
+    struct uvh_rh_gam_alias210_redirect_config_1_mmr_s {
+	unsigned long	rsvd_0_23 : 24;  /*    */
+	unsigned long	dest_base : 22;  /* RW */
+	unsigned long	rsvd_46_63: 18;  /*    */
+    } s;
+};
+
+/* ========================================================================= */
+/*                UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR                  */
+/* ========================================================================= */
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL
+
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
+    unsigned long	v;
+    struct uvh_rh_gam_alias210_redirect_config_2_mmr_s {
+	unsigned long	rsvd_0_23 : 24;  /*    */
+	unsigned long	dest_base : 22;  /* RW */
+	unsigned long	rsvd_46_63: 18;  /*    */
+    } s;
+};
+
 /* ========================================================================= */
 /*                    UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR                      */
 /* ========================================================================= */
@@ -369,5 +798,77 @@ union uvh_si_addr_map_config_u {
     } s;
 };
 
+/* ========================================================================= */
+/*                       UVH_SI_ALIAS0_OVERLAY_CONFIG                        */
+/* ========================================================================= */
+#define UVH_SI_ALIAS0_OVERLAY_CONFIG 0xc80008UL
+
+#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_SHFT 24
+#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
+#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_SHFT 48
+#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
+#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_SHFT 63
+#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_si_alias0_overlay_config_u {
+    unsigned long	v;
+    struct uvh_si_alias0_overlay_config_s {
+	unsigned long	rsvd_0_23: 24;  /*    */
+	unsigned long	base    :  8;  /* RW */
+	unsigned long	rsvd_32_47: 16;  /*    */
+	unsigned long	m_alias :  5;  /* RW */
+	unsigned long	rsvd_53_62: 10;  /*    */
+	unsigned long	enable  :  1;  /* RW */
+    } s;
+};
+
+/* ========================================================================= */
+/*                       UVH_SI_ALIAS1_OVERLAY_CONFIG                        */
+/* ========================================================================= */
+#define UVH_SI_ALIAS1_OVERLAY_CONFIG 0xc80010UL
+
+#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_SHFT 24
+#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
+#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_SHFT 48
+#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
+#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_SHFT 63
+#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_si_alias1_overlay_config_u {
+    unsigned long	v;
+    struct uvh_si_alias1_overlay_config_s {
+	unsigned long	rsvd_0_23: 24;  /*    */
+	unsigned long	base    :  8;  /* RW */
+	unsigned long	rsvd_32_47: 16;  /*    */
+	unsigned long	m_alias :  5;  /* RW */
+	unsigned long	rsvd_53_62: 10;  /*    */
+	unsigned long	enable  :  1;  /* RW */
+    } s;
+};
+
+/* ========================================================================= */
+/*                       UVH_SI_ALIAS2_OVERLAY_CONFIG                        */
+/* ========================================================================= */
+#define UVH_SI_ALIAS2_OVERLAY_CONFIG 0xc80018UL
+
+#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_SHFT 24
+#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
+#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_SHFT 48
+#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
+#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_SHFT 63
+#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_si_alias2_overlay_config_u {
+    unsigned long	v;
+    struct uvh_si_alias2_overlay_config_s {
+	unsigned long	rsvd_0_23: 24;  /*    */
+	unsigned long	base    :  8;  /* RW */
+	unsigned long	rsvd_32_47: 16;  /*    */
+	unsigned long	m_alias :  5;  /* RW */
+	unsigned long	rsvd_53_62: 10;  /*    */
+	unsigned long	enable  :  1;  /* RW */
+    } s;
+};
+
 
 #endif /* __ASM_X86_UV_MMRS__ */
-- 
cgit v1.2.3


From 3d1ba1da2b4ff4ace7801e99fb9a3095b182d847 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 2 Jun 2008 13:50:10 +0200
Subject: x86: fix nmi.c build bug

apic.h needs to be included for the apic_write_around() definition.
---
 arch/x86/kernel/nmi.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index bf143f191fb1..cbd4fa3c475b 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -11,6 +11,8 @@
  *  Mikael Pettersson	: PM converted to driver model. Disable/enable API.
  */
 
+#include <asm/apic.h>
+
 #include <linux/nmi.h>
 #include <linux/mm.h>
 #include <linux/delay.h>
-- 
cgit v1.2.3


From f3294690979634ee10398bb0beadfe1d4edb881d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 3 Jun 2008 10:16:10 +0200
Subject: x86, numaq: add pci_acpi_scan_root() stub

allow 32-bit numaq build to succeed with ACPI enabled.
---
 arch/x86/kernel/numaq_32.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index e65281b1634b..992f53cb79b6 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -87,3 +87,14 @@ static int __init numaq_tsc_disable(void)
 	return 0;
 }
 arch_initcall(numaq_tsc_disable);
+
+#ifdef CONFIG_ACPI
+/*
+ * Dummy implementation:
+ */
+struct pci_bus * __devinit
+pci_acpi_scan_root(struct acpi_device *device, int domain, int busnum)
+{
+	return NULL;
+}
+#endif
-- 
cgit v1.2.3


From 0596152388e234efebce464355186ad9e16c8cb6 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sat, 31 May 2008 22:52:47 -0700
Subject: x86, 32-bit: change propagate_e820_map() back to find_max_pfn()

we don't need to call memory_present that early.
numa and sparse will call memory_present later and might
even fail, it will call memory_present for the full range.

also for sparse it will call alloc_bootmem ... before we set up bootmem.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820_32.c  | 5 ++---
 arch/x86/kernel/setup_32.c | 4 ++--
 arch/x86/mm/discontig_32.c | 2 +-
 include/asm-x86/e820_32.h  | 2 +-
 4 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
index db760d4706af..0c025d04fc2e 100644
--- a/arch/x86/kernel/e820_32.c
+++ b/arch/x86/kernel/e820_32.c
@@ -210,7 +210,7 @@ void __init init_iomem_resources(struct resource *code_resource,
 /*
  * Find the highest page frame number we have available
  */
-void __init propagate_e820_map(void)
+void __init find_max_pfn(void)
 {
 	int i;
 
@@ -227,7 +227,6 @@ void __init propagate_e820_map(void)
 			continue;
 		if (end > max_pfn)
 			max_pfn = end;
-		memory_present(0, start, end);
 	}
 }
 
@@ -361,7 +360,7 @@ static int __init parse_memmap(char *arg)
 		 * size before original memory map is
 		 * reset.
 		 */
-		propagate_e820_map();
+		find_max_pfn();
 		saved_max_pfn = max_pfn;
 #endif
 		e820.nr_map = 0;
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 6f40cb560ea9..29010420458d 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -730,10 +730,10 @@ void __init setup_arch(char **cmdline_p)
 		efi_init();
 
 	/* update e820 for memory not covered by WB MTRRs */
-	propagate_e820_map();
+	find_max_pfn();
 	mtrr_bp_init();
 	if (mtrr_trim_uncached_memory(max_pfn))
-		propagate_e820_map();
+		find_max_pfn();
 
 	max_low_pfn = setup_memory();
 
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 98b099eeab3a..ebbbba338150 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -120,7 +120,7 @@ int __init get_memcfg_numa_flat(void)
 	printk("NUMA - single node, flat memory mode\n");
 
 	/* Run the memory configuration and find the top of memory. */
-	propagate_e820_map();
+	find_max_pfn();
 	node_start_pfn[0] = 0;
 	node_end_pfn[0] = max_pfn;
 	memory_present(0, 0, max_pfn);
diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h
index 7ace82570a36..00fbc60b9d30 100644
--- a/include/asm-x86/e820_32.h
+++ b/include/asm-x86/e820_32.h
@@ -21,7 +21,7 @@
 extern void setup_memory_map(void);
 extern void finish_e820_parsing(void);
 
-extern void propagate_e820_map(void);
+extern void find_max_pfn(void);
 extern void register_bootmem_low_pages(unsigned long max_low_pfn);
 extern void limit_regions(unsigned long long size);
 extern void init_iomem_resources(struct resource *code_resource,
-- 
cgit v1.2.3


From 2944e16b25e7fb8b5ee0dd9dc7197a0f9e523cfd Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 1 Jun 2008 13:17:38 -0700
Subject: x86: update mptable

make mptable to be consistent with acpi routing, so we could:

1. kexec kernel with acpi=off
2. work around BIOSes where acpi routing is working, but mptable is
   not right, so can use kernel/kexec to start other OSes that don't have
   good acpi support.

command line: update_mptable

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/boot.c |  22 +++
 arch/x86/kernel/e820.c      |  25 +++
 arch/x86/kernel/mpparse.c   | 401 ++++++++++++++++++++++++++++++++++++++++++--
 arch/x86/kernel/setup_64.c  |   4 +
 drivers/acpi/pci_irq.c      |   5 +
 include/asm-x86/e820.h      |   1 +
 include/asm-x86/mpspec.h    |   4 +
 7 files changed, 445 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 2ddfabae382b..f226bdc19f69 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1154,6 +1154,28 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
 	return gsi;
 }
 
+int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
+			u32 gsi, int triggering, int polarity)
+{
+	struct mpc_config_intsrc intsrc;
+	int ioapic;
+
+	/* print the entry should happen on mptable identically */
+	intsrc.mpc_type = MP_INTSRC;
+	intsrc.mpc_irqtype = mp_INT;
+	intsrc.mpc_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+				(polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
+	intsrc.mpc_srcbus = number;
+	intsrc.mpc_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+	ioapic = mp_find_ioapic(gsi);
+	intsrc.mpc_dstapic = mp_ioapic_routing[ioapic].apic_id;
+	intsrc.mpc_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+	MP_intsrc_info(&intsrc);
+
+	return 0;
+}
+
 /*
  * Parse IOAPIC related entries in MADT
  * returns 0 on success, < 0 on error
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 0cd9132c9450..cd2b99e27d43 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -739,3 +739,28 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
 	return -1UL;
 
 }
+
+/*
+ * pre allocated 4k and reserved it in e820
+ */
+u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
+{
+	u64 size = 0;
+	u64 addr;
+	u64 start;
+
+	start = startt;
+	while (size < sizet)
+		start = find_e820_area_size(start, &size, align);
+
+	if (size < sizet)
+		return 0;
+
+	addr = round_down(start + size - sizet, align);
+	update_memory_range(addr, sizet, E820_RAM, E820_RESERVED);
+	printk(KERN_INFO "update e820 for early_reserve_e820\n");
+	update_e820();
+
+	return addr;
+}
+
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 9f3792d55044..8898aa49079d 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -25,6 +25,8 @@
 #include <asm/proto.h>
 #include <asm/acpi.h>
 #include <asm/bios_ebda.h>
+#include <asm/e820.h>
+#include <asm/trampoline.h>
 
 #include <mach_apic.h>
 #ifdef CONFIG_X86_32
@@ -161,20 +163,81 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
 	nr_ioapics++;
 }
 
-static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
+static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
 {
-	printk(KERN_INFO "Int: type %d, pol %d, trig %d, bus %02x,"
+	printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x,"
 		" IRQ %02x, APIC ID %x, APIC INT %02x\n",
 		m->mpc_irqtype, m->mpc_irqflag & 3,
 		(m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
 		m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
-	mp_irqs[mp_irq_entries].mp_dstapic = m->mpc_dstapic;
-	mp_irqs[mp_irq_entries].mp_type = m->mpc_type;
-	mp_irqs[mp_irq_entries].mp_irqtype = m->mpc_irqtype;
-	mp_irqs[mp_irq_entries].mp_irqflag = m->mpc_irqflag;
-	mp_irqs[mp_irq_entries].mp_srcbus = m->mpc_srcbus;
-	mp_irqs[mp_irq_entries].mp_srcbusirq = m->mpc_srcbusirq;
-	mp_irqs[mp_irq_entries].mp_dstirq = m->mpc_dstirq;
+}
+
+static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
+{
+	printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x,"
+		" IRQ %02x, APIC ID %x, APIC INT %02x\n",
+		mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
+		(mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
+		mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
+}
+
+static void assign_to_mp_irq(struct mpc_config_intsrc *m,
+				    struct mp_config_intsrc *mp_irq)
+{
+	mp_irq->mp_dstapic = m->mpc_dstapic;
+	mp_irq->mp_type = m->mpc_type;
+	mp_irq->mp_irqtype = m->mpc_irqtype;
+	mp_irq->mp_irqflag = m->mpc_irqflag;
+	mp_irq->mp_srcbus = m->mpc_srcbus;
+	mp_irq->mp_srcbusirq = m->mpc_srcbusirq;
+	mp_irq->mp_dstirq = m->mpc_dstirq;
+}
+
+static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
+					struct mpc_config_intsrc *m)
+{
+	m->mpc_dstapic = mp_irq->mp_dstapic;
+	m->mpc_type = mp_irq->mp_type;
+	m->mpc_irqtype = mp_irq->mp_irqtype;
+	m->mpc_irqflag = mp_irq->mp_irqflag;
+	m->mpc_srcbus = mp_irq->mp_srcbus;
+	m->mpc_srcbusirq = mp_irq->mp_srcbusirq;
+	m->mpc_dstirq = mp_irq->mp_dstirq;
+}
+
+static int mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
+					struct mpc_config_intsrc *m)
+{
+	if (mp_irq->mp_dstapic != m->mpc_dstapic)
+		return 1;
+	if (mp_irq->mp_type != m->mpc_type)
+		return 2;
+	if (mp_irq->mp_irqtype != m->mpc_irqtype)
+		return 3;
+	if (mp_irq->mp_irqflag != m->mpc_irqflag)
+		return 4;
+	if (mp_irq->mp_srcbus != m->mpc_srcbus)
+		return 5;
+	if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq)
+		return 6;
+	if (mp_irq->mp_dstirq != m->mpc_dstirq)
+		return 7;
+
+	return 0;
+}
+
+void MP_intsrc_info(struct mpc_config_intsrc *m)
+{
+	int i;
+
+	print_MP_intsrc_info(m);
+
+	for (i = 0; i < mp_irq_entries; i++) {
+		if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
+			return;
+	}
+
+	assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
 	if (++mp_irq_entries == MAX_IRQ_SOURCES)
 		panic("Max # of irq sources exceeded!!\n");
 }
@@ -268,12 +331,9 @@ static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
  * Read/parse the MPC
  */
 
-static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
+static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem,
+				char *str)
 {
-	char str[16];
-	char oem[10];
-	int count = sizeof(*mpc);
-	unsigned char *mpt = ((unsigned char *)mpc) + count;
 
 	if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
 		printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
@@ -301,13 +361,28 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
 	memcpy(str, mpc->mpc_productid, 12);
 	str[12] = 0;
 
-#ifdef CONFIG_X86_32
-	mps_oem_check(mpc, oem, str);
-#endif
 	printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
 
 	printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
 
+	return 1;
+}
+
+static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
+{
+	char str[16];
+	char oem[10];
+
+	int count = sizeof(*mpc);
+	unsigned char *mpt = ((unsigned char *)mpc) + count;
+
+	if (!smp_check_mpc(mpc, oem, str))
+		return 0;
+
+#ifdef CONFIG_X86_32
+	mps_oem_check(mpc, oem, str);
+#endif
+
 	/* save the local APIC address, it might be non-default */
 	if (!acpi_lapic)
 		mp_lapic_addr = mpc->mpc_lapic;
@@ -785,3 +860,295 @@ void __init find_smp_config(void)
 {
 	__find_smp_config(1);
 }
+
+#ifdef CONFIG_X86_IO_APIC
+static u8 __initdata irq_used[MAX_IRQ_SOURCES];
+
+static int  __init get_MP_intsrc_index(struct mpc_config_intsrc *m)
+{
+	int i;
+
+	if (m->mpc_irqtype != mp_INT)
+		return 0;
+
+	if (m->mpc_irqflag != 0x0f)
+		return 0;
+
+	/* not legacy */
+
+	for (i = 0; i < mp_irq_entries; i++) {
+		if (mp_irqs[i].mp_irqtype != mp_INT)
+			continue;
+
+		if (mp_irqs[i].mp_irqflag != 0x0f)
+			continue;
+
+		if (mp_irqs[i].mp_srcbus != m->mpc_srcbus)
+			continue;
+		if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq)
+			continue;
+		if (irq_used[i]) {
+			/* already claimed */
+			return -2;
+		}
+		irq_used[i] = 1;
+		return i;
+	}
+
+	/* not found */
+	return -1;
+}
+
+#define SPARE_SLOT_NUM 20
+
+static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
+#endif
+
+static int  __init replace_intsrc_all(struct mp_config_table *mpc,
+					unsigned long mpc_new_phys,
+					unsigned long mpc_new_length)
+{
+#ifdef CONFIG_X86_IO_APIC
+	int i;
+	int nr_m_spare = 0;
+#endif
+
+	int count = sizeof(*mpc);
+	unsigned char *mpt = ((unsigned char *)mpc) + count;
+
+	printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length);
+	while (count < mpc->mpc_length) {
+		switch (*mpt) {
+		case MP_PROCESSOR:
+			{
+				struct mpc_config_processor *m =
+				    (struct mpc_config_processor *)mpt;
+				mpt += sizeof(*m);
+				count += sizeof(*m);
+				break;
+			}
+		case MP_BUS:
+			{
+				struct mpc_config_bus *m =
+				    (struct mpc_config_bus *)mpt;
+				mpt += sizeof(*m);
+				count += sizeof(*m);
+				break;
+			}
+		case MP_IOAPIC:
+			{
+				mpt += sizeof(struct mpc_config_ioapic);
+				count += sizeof(struct mpc_config_ioapic);
+				break;
+			}
+		case MP_INTSRC:
+			{
+#ifdef CONFIG_X86_IO_APIC
+				struct mpc_config_intsrc *m =
+				    (struct mpc_config_intsrc *)mpt;
+
+				printk(KERN_INFO "OLD ");
+				print_MP_intsrc_info(m);
+				i = get_MP_intsrc_index(m);
+				if (i > 0) {
+					assign_to_mpc_intsrc(&mp_irqs[i], m);
+					printk(KERN_INFO "NEW ");
+					print_mp_irq_info(&mp_irqs[i]);
+				} else if (!i) {
+					/* legacy, do nothing */
+				} else if (nr_m_spare < SPARE_SLOT_NUM) {
+					/*
+					 * not found (-1), or duplicated (-2)
+					 * are invalid entries,
+					 * we need to use the slot  later
+					 */
+					m_spare[nr_m_spare] = m;
+					nr_m_spare++;
+				}
+#endif
+				mpt += sizeof(struct mpc_config_intsrc);
+				count += sizeof(struct mpc_config_intsrc);
+				break;
+			}
+		case MP_LINTSRC:
+			{
+				struct mpc_config_lintsrc *m =
+				    (struct mpc_config_lintsrc *)mpt;
+				mpt += sizeof(*m);
+				count += sizeof(*m);
+				break;
+			}
+		default:
+			/* wrong mptable */
+			printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
+			printk(KERN_ERR "type %x\n", *mpt);
+			print_hex_dump(KERN_ERR, "  ", DUMP_PREFIX_ADDRESS, 16,
+					1, mpc, mpc->mpc_length, 1);
+			goto out;
+		}
+	}
+
+#ifdef CONFIG_X86_IO_APIC
+	for (i = 0; i < mp_irq_entries; i++) {
+		if (irq_used[i])
+			continue;
+
+		if (mp_irqs[i].mp_irqtype != mp_INT)
+			continue;
+
+		if (mp_irqs[i].mp_irqflag != 0x0f)
+			continue;
+
+		if (nr_m_spare > 0) {
+			printk(KERN_INFO "*NEW* found ");
+			nr_m_spare--;
+			assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
+			m_spare[nr_m_spare] = NULL;
+		} else {
+			struct mpc_config_intsrc *m =
+			    (struct mpc_config_intsrc *)mpt;
+			count += sizeof(struct mpc_config_intsrc);
+			if (!mpc_new_phys) {
+				printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
+			} else {
+				if (count <= mpc_new_length)
+					printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
+				else {
+					printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
+					goto out;
+				}
+			}
+			assign_to_mpc_intsrc(&mp_irqs[i], m);
+			mpc->mpc_length = count;
+			mpt += sizeof(struct mpc_config_intsrc);
+		}
+		print_mp_irq_info(&mp_irqs[i]);
+	}
+#endif
+out:
+	/* update checksum */
+	mpc->mpc_checksum = 0;
+	mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc,
+					   mpc->mpc_length);
+
+	return 0;
+}
+
+int __initdata enable_update_mptable;
+
+static int __init update_mptable_setup(char *str)
+{
+	enable_update_mptable = 1;
+	return 0;
+}
+early_param("update_mptable", update_mptable_setup);
+
+static unsigned long __initdata mpc_new_phys;
+static unsigned long mpc_new_length __initdata = 4096;
+
+/* alloc_mptable or alloc_mptable=4k */
+static int __initdata alloc_mptable;
+static int __init parse_alloc_mptable_opt(char *p)
+{
+	enable_update_mptable = 1;
+	alloc_mptable = 1;
+	if (!p)
+		return 0;
+	mpc_new_length = memparse(p, &p);
+	return 0;
+}
+early_param("alloc_mptable", parse_alloc_mptable_opt);
+
+void __init early_reserve_e820_mpc_new(void)
+{
+	if (enable_update_mptable && alloc_mptable) {
+		u64 startt = 0;
+#ifdef CONFIG_X86_TRAMPOLINE
+		startt = TRAMPOLINE_BASE;
+#endif
+		mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
+	}
+}
+
+static int __init update_mp_table(void)
+{
+	char str[16];
+	char oem[10];
+	struct intel_mp_floating *mpf;
+	struct mp_config_table *mpc;
+	struct mp_config_table *mpc_new;
+
+	if (!enable_update_mptable)
+		return 0;
+
+	mpf = mpf_found;
+	if (!mpf)
+		return 0;
+
+	/*
+	 * Now see if we need to go further.
+	 */
+	if (mpf->mpf_feature1 != 0)
+		return 0;
+
+	if (!mpf->mpf_physptr)
+		return 0;
+
+	mpc = phys_to_virt(mpf->mpf_physptr);
+
+	if (!smp_check_mpc(mpc, oem, str))
+		return 0;
+
+	printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf));
+	printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
+
+	if (mpc_new_phys && mpc->mpc_length > mpc_new_length) {
+		mpc_new_phys = 0;
+		printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
+			 mpc_new_length);
+	}
+
+	if (!mpc_new_phys) {
+		unsigned char old, new;
+		/* check if we can change the postion */
+		mpc->mpc_checksum = 0;
+		old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
+		mpc->mpc_checksum = 0xff;
+		new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
+		if (old == new) {
+			printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
+			return 0;
+		}
+		printk(KERN_INFO "use in-positon replacing\n");
+	} else {
+		mpf->mpf_physptr = mpc_new_phys;
+		mpc_new = phys_to_virt(mpc_new_phys);
+		memcpy(mpc_new, mpc, mpc->mpc_length);
+		mpc = mpc_new;
+		/* check if we can modify that */
+		if (mpc_new_phys - mpf->mpf_physptr) {
+			struct intel_mp_floating *mpf_new;
+			/* steal 16 bytes from [0, 1k) */
+			printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
+			mpf_new = phys_to_virt(0x400 - 16);
+			memcpy(mpf_new, mpf, 16);
+			mpf = mpf_new;
+			mpf->mpf_physptr = mpc_new_phys;
+		}
+		mpf->mpf_checksum = 0;
+		mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16);
+		printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr);
+	}
+
+	/*
+	 * only replace the one with mp_INT and
+	 *	 MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW,
+	 * already in mp_irqs , stored by ... and mp_config_acpi_gsi,
+	 * may need pci=routeirq for all coverage
+	 */
+	replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
+
+	return 0;
+}
+
+late_initcall(update_mp_table);
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 89e6cca5d693..978a0d637f3f 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -56,6 +56,7 @@
 #include <asm/desc.h>
 #include <video/edid.h>
 #include <asm/e820.h>
+#include <asm/mpspec.h>
 #include <asm/dma.h>
 #include <asm/gart.h>
 #include <asm/mpspec.h>
@@ -381,6 +382,9 @@ void __init setup_arch(char **cmdline_p)
 	 * we are rounding upwards:
 	 */
 	end_pfn = e820_end_of_ram();
+
+	/* pre allocte 4k for mptable mpc */
+	early_reserve_e820_mpc_new();
 	/* update e820 for memory not covered by WB MTRRs */
 	mtrr_bp_init();
 	if (mtrr_trim_uncached_memory(end_pfn)) {
diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c
index 89022a74faee..e556f30c7c16 100644
--- a/drivers/acpi/pci_irq.c
+++ b/drivers/acpi/pci_irq.c
@@ -570,6 +570,11 @@ int acpi_pci_irq_enable(struct pci_dev *dev)
 	       (triggering == ACPI_LEVEL_SENSITIVE) ? "level" : "edge",
 	       (polarity == ACPI_ACTIVE_LOW) ? "low" : "high", dev->irq);
 
+#ifdef CONFIG_X86
+	mp_config_acpi_gsi(dev->bus->number, dev->devfn, dev->pin, irq,
+				 triggering, polarity);
+#endif
+
 	return 0;
 }
 
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index 4266a2c5f2e8..ee8fe4c5da41 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -84,6 +84,7 @@ extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
 extern void reserve_early(u64 start, u64 end, char *name);
 extern void free_early(u64 start, u64 end);
 extern void early_res_to_bootmem(u64 start, u64 end);
+extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h
index b785ddd8d761..6a34d6dfd042 100644
--- a/include/asm-x86/mpspec.h
+++ b/include/asm-x86/mpspec.h
@@ -39,6 +39,7 @@ extern unsigned long mp_lapic_addr;
 
 extern void find_smp_config(void);
 extern void get_smp_config(void);
+extern void early_reserve_e820_mpc_new(void);
 
 void __cpuinit generic_processor_info(int apicid, int version);
 #ifdef CONFIG_ACPI
@@ -47,6 +48,9 @@ extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
 				   u32 gsi);
 extern void mp_config_acpi_legacy_irqs(void);
 extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low);
+extern void MP_intsrc_info(struct mpc_config_intsrc *m);
+extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
+				u32 gsi, int triggering, int polarity);
 #endif /* CONFIG_ACPI */
 
 #define PHYSID_ARRAY_SIZE	BITS_TO_LONGS(MAX_APICS)
-- 
cgit v1.2.3


From 6af61a7614a306fe882a0c2b4ddc63b65aa66efc Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 1 Jun 2008 23:53:50 -0700
Subject: x86: clean up max_pfn_mapped usage - 32-bit

on 32-bit in head_32.S after initial page table is done, we get initial
max_pfn_mapped, and then kernel_physical_mapping_init will give us
a final one.

We need to use that to make sure find_e820_area will get valid addresses
for boot_map and for NODE_DATA(0) on numa32.

XEN PV and lguest may need to assign max_pfn_mapped too.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/head_32.S  | 4 ++++
 arch/x86/kernel/setup_32.c | 4 +++-
 arch/x86/mm/discontig_32.c | 3 ++-
 3 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index bef4618feadb..ac7002fdd637 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -220,6 +220,8 @@ default_entry:
 	jb 10b
 1:
 	movl %edi,pa(init_pg_tables_end)
+	shrl $12, %eax
+	movl %eax, pa(max_pfn_mapped)
 
 	/* Do early initialization of the fixmap area */
 	movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
@@ -251,6 +253,8 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
 	cmpl %ebp,%eax
 	jb 10b
 	movl %edi,pa(init_pg_tables_end)
+	shrl $12, %eax
+	movl %eax, pa(max_pfn_mapped)
 
 	/* Do early initialization of the fixmap area */
 	movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 29010420458d..c985a8c305e0 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -586,7 +586,7 @@ void __init setup_bootmem_allocator(void)
 	 */
 	bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
 	bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
-				 max_low_pfn<<PAGE_SHIFT, bootmap_size,
+				 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
 				 PAGE_SIZE);
 	if (bootmap == -1L)
 		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
@@ -595,6 +595,8 @@ void __init setup_bootmem_allocator(void)
 	reserve_initrd();
 #endif
 	bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn);
+	printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
+		 max_pfn_mapped<<PAGE_SHIFT);
 	printk(KERN_INFO "  low ram: %08lx - %08lx\n",
 		 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
 	printk(KERN_INFO "  bootmap %08lx - %08lx\n",
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 73a983489c60..914a81ee7855 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -163,7 +163,8 @@ static void __init allocate_pgdat(int nid)
 	else {
 		unsigned long pgdat_phys;
 		pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
-				 max_low_pfn<<PAGE_SHIFT, sizeof(pg_data_t),
+				 (nid ? max_low_pfn:max_pfn_mapped)<<PAGE_SHIFT,
+				 sizeof(pg_data_t),
 				 PAGE_SIZE);
 		NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
 		reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t),
-- 
cgit v1.2.3


From c8c034ce79418d2143c00c4cf751cfa51701f788 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 1 Jun 2008 23:55:37 -0700
Subject: x86: clean up max_pfn_mapped usage - 64-bit

on 64-bit we only get valid max_pfn_mapped after init_memory_mapping().

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820_64.c  | 15 +++------------
 arch/x86/kernel/setup_64.c |  2 +-
 2 files changed, 4 insertions(+), 13 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index 5e063e72b24a..a11bec20f65d 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -55,16 +55,12 @@ unsigned long __init e820_end_of_ram(void)
 
 	last_pfn = find_max_pfn_with_active_regions();
 
-	if (last_pfn > max_pfn_mapped)
-		max_pfn_mapped = last_pfn;
-	if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
-		max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
+	if (last_pfn > MAXMEM>>PAGE_SHIFT)
+		last_pfn = MAXMEM>>PAGE_SHIFT;
 	if (last_pfn > end_user_pfn)
 		last_pfn = end_user_pfn;
-	if (last_pfn > max_pfn_mapped)
-		last_pfn = max_pfn_mapped;
 
-	printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
+	printk(KERN_INFO "last_pfn = %lu\n", last_pfn);
 	return last_pfn;
 }
 
@@ -109,10 +105,6 @@ static int __init e820_find_active_region(const struct e820entry *ei,
 	if (*ei_startpfn >= *ei_endpfn)
 		return 0;
 
-	/* Check if max_pfn_mapped should be updated */
-	if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped)
-		max_pfn_mapped = *ei_endpfn;
-
 	/* Skip if map is outside the node */
 	if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
 				    *ei_startpfn >= last_pfn)
@@ -229,7 +221,6 @@ static int __init parse_memmap_opt(char *p)
 		saved_max_pfn = e820_end_of_ram();
 		remove_all_active_ranges();
 #endif
-		max_pfn_mapped = 0;
 		e820.nr_map = 0;
 		userdef = 1;
 		return 0;
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 978a0d637f3f..2599b2744207 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -396,7 +396,7 @@ void __init setup_arch(char **cmdline_p)
 
 	check_efer();
 
-	max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
+	max_pfn_mapped = init_memory_mapping(0, (end_pfn << PAGE_SHIFT));
 	if (efi_enabled)
 		efi_init();
 
-- 
cgit v1.2.3


From d44b9d17faf7bca165ce73a1acb936b65a3f0cc6 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Tue, 3 Jun 2008 13:06:07 -0700
Subject: x86: move bugs_64.c to cpu/bugs_64.c

It looks good to move bugs_64.c to cpu/bugs_64.c.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/Makefile      |  1 -
 arch/x86/kernel/bugs_64.c     | 33 ---------------------------------
 arch/x86/kernel/cpu/Makefile  |  1 +
 arch/x86/kernel/cpu/bugs_64.c | 33 +++++++++++++++++++++++++++++++++
 4 files changed, 34 insertions(+), 34 deletions(-)
 delete mode 100644 arch/x86/kernel/bugs_64.c
 create mode 100644 arch/x86/kernel/cpu/bugs_64.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5e618c3b4720..4934fec38c42 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -25,7 +25,6 @@ obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o setup64.o
 obj-y			+= bootflag.o e820_$(BITS).o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
 obj-y			+= alternative.o i8253.o pci-nommu.o
-obj-$(CONFIG_X86_64)	+= bugs_64.o
 obj-y			+= tsc_$(BITS).o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
diff --git a/arch/x86/kernel/bugs_64.c b/arch/x86/kernel/bugs_64.c
deleted file mode 100644
index 9a3ed0649d4e..000000000000
--- a/arch/x86/kernel/bugs_64.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- *  Copyright (C) 1994  Linus Torvalds
- *  Copyright (C) 2000  SuSE
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <asm/alternative.h>
-#include <asm/bugs.h>
-#include <asm/processor.h>
-#include <asm/mtrr.h>
-#include <asm/cacheflush.h>
-
-void __init check_bugs(void)
-{
-	identify_boot_cpu();
-#if !defined(CONFIG_SMP)
-	printk("CPU: ");
-	print_cpu_info(&boot_cpu_data);
-#endif
-	alternative_instructions();
-
-	/*
-	 * Make sure the first 2MB area is not mapped by huge pages
-	 * There are typically fixed size MTRRs in there and overlapping
-	 * MTRRs into large pages causes slow downs.
-	 *
-	 * Right now we don't do that with gbpages because there seems
-	 * very little benefit for that case.
-	 */
-	if (!direct_gbpages)
-		set_memory_4k((unsigned long)__va(0), 1);
-}
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c77a1c50d944..65b1be5fe9ce 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -6,6 +6,7 @@ obj-y			:= intel_cacheinfo.o addon_cpuid_features.o
 obj-y			+= proc.o feature_names.o
 
 obj-$(CONFIG_X86_32)	+= common.o bugs.o
+obj-$(CONFIG_X86_64)	+= bugs_64.o
 obj-$(CONFIG_X86_32)	+= amd.o
 obj-$(CONFIG_X86_64)	+= amd_64.o
 obj-$(CONFIG_X86_32)	+= cyrix.o
diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c
new file mode 100644
index 000000000000..9a3ed0649d4e
--- /dev/null
+++ b/arch/x86/kernel/cpu/bugs_64.c
@@ -0,0 +1,33 @@
+/*
+ *  Copyright (C) 1994  Linus Torvalds
+ *  Copyright (C) 2000  SuSE
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <asm/alternative.h>
+#include <asm/bugs.h>
+#include <asm/processor.h>
+#include <asm/mtrr.h>
+#include <asm/cacheflush.h>
+
+void __init check_bugs(void)
+{
+	identify_boot_cpu();
+#if !defined(CONFIG_SMP)
+	printk("CPU: ");
+	print_cpu_info(&boot_cpu_data);
+#endif
+	alternative_instructions();
+
+	/*
+	 * Make sure the first 2MB area is not mapped by huge pages
+	 * There are typically fixed size MTRRs in there and overlapping
+	 * MTRRs into large pages causes slow downs.
+	 *
+	 * Right now we don't do that with gbpages because there seems
+	 * very little benefit for that case.
+	 */
+	if (!direct_gbpages)
+		set_memory_4k((unsigned long)__va(0), 1);
+}
-- 
cgit v1.2.3


From f19dc2f22a180dde3f9e611b76c73f5390c11ecd Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 3 Jun 2008 10:24:49 -0700
Subject: x86: change propagate_e820_map() back to find_max_pfn(), 32-bit, fix

add memory_present() calls for sparse and non-numa.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index c985a8c305e0..ccf3595202fa 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -377,11 +377,13 @@ static unsigned long __init setup_memory(void)
 	if (max_pfn > max_low_pfn) {
 		highstart_pfn = max_low_pfn;
 	}
+	memory_present(0, 0, highend_pfn);
 	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
 		pages_to_mb(highend_pfn - highstart_pfn));
 	num_physpages = highend_pfn;
 	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
 #else
+	memory_present(0, 0, max_low_pfn);
 	num_physpages = max_low_pfn;
 	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
 #endif
-- 
cgit v1.2.3


From ab530e1f781da4d704892daab2bdd568f473687d Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 3 Jun 2008 10:25:54 -0700
Subject: x86: early check if a system is numaq

so we could fall back to one node numa.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/mpparse.c  | 14 +++++++++++---
 arch/x86/kernel/numaq_32.c | 24 +++++++++++++++++++-----
 include/asm-x86/mpspec.h   |  4 ++--
 include/asm-x86/numaq.h    |  1 +
 4 files changed, 33 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 8898aa49079d..b4a950da2f97 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -70,7 +70,10 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
 		return;
 	}
 #ifdef CONFIG_X86_NUMAQ
-	apicid = mpc_apic_id(m, translation_table[mpc_record]);
+	if (found_numaq)
+		apicid = mpc_apic_id(m, translation_table[mpc_record]);
+	else
+		apicid = m->mpc_apicid;
 #else
 	apicid = m->mpc_apicid;
 #endif
@@ -91,7 +94,8 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
 	str[6] = 0;
 
 #ifdef CONFIG_X86_NUMAQ
-	mpc_oem_bus_info(m, str, translation_table[mpc_record]);
+	if (found_numaq)
+		mpc_oem_bus_info(m, str, translation_table[mpc_record]);
 #else
 	printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
 #endif
@@ -112,7 +116,8 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
 #endif
 	} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
 #ifdef CONFIG_X86_NUMAQ
-		mpc_oem_pci_bus(m, translation_table[mpc_record]);
+		if (found_numaq)
+			mpc_oem_pci_bus(m, translation_table[mpc_record]);
 #endif
 		clear_bit(m->mpc_busid, mp_bus_not_pci);
 #if defined(CONFIG_EISA) || defined (CONFIG_MCA)
@@ -321,6 +326,9 @@ static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
 {
 	if (strncmp(oem, "IBM NUMA", 8))
 		printk("Warning!  May not be a NUMA-Q system!\n");
+	else
+		found_numaq = 1;
+
 	if (mpc->mpc_oemptr)
 		smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
 				 mpc->mpc_oemsize);
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index e65281b1634b..922be66668b8 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -31,9 +31,12 @@
 #include <asm/numaq.h>
 #include <asm/topology.h>
 #include <asm/processor.h>
+#include <asm/mpspec.h>
 
 #define	MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
 
+int found_numaq;
+
 /*
  * Function: smp_dump_qct()
  *
@@ -67,13 +70,24 @@ static void __init smp_dump_qct(void)
 	}
 }
 
-/*
- * Unlike Summit, we don't really care to let the NUMA-Q
- * fall back to flat mode.  Don't compile for NUMA-Q
- * unless you really need it!
- */
+static __init void early_check_numaq(void)
+{
+	/*
+	 * Find possible boot-time SMP configuration:
+	 */
+	early_find_smp_config();
+	/*
+	 * get boot-time SMP configuration:
+	 */
+	if (smp_found_config)
+		early_get_smp_config();
+}
+
 int __init get_memcfg_numaq(void)
 {
+	early_check_numaq();
+	if (!found_numaq)
+		return 0;
 	smp_dump_qct();
 	return 1;
 }
diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h
index dbbf363dafb5..f38b68e8de5a 100644
--- a/include/asm-x86/mpspec.h
+++ b/include/asm-x86/mpspec.h
@@ -21,11 +21,11 @@ extern int pic_mode;
 /* Each PCI slot may be a combo card with its own bus.  4 IRQ pins per slot. */
 #define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4)
 
+#endif
+
 extern void early_find_smp_config(void);
 extern void early_get_smp_config(void);
 
-#endif
-
 #if defined(CONFIG_MCA) || defined(CONFIG_EISA)
 extern int mp_bus_id_to_type[MAX_MP_BUSSES];
 #endif
diff --git a/include/asm-x86/numaq.h b/include/asm-x86/numaq.h
index 94b86c31239a..739d16484b8c 100644
--- a/include/asm-x86/numaq.h
+++ b/include/asm-x86/numaq.h
@@ -28,6 +28,7 @@
 
 #ifdef CONFIG_X86_NUMAQ
 
+extern int found_numaq;
 extern int get_memcfg_numaq(void);
 
 /*
-- 
cgit v1.2.3


From ee0c80fadfa56bf4f9d90c1c023429a6bd8edd69 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 3 Jun 2008 19:34:00 -0700
Subject: x86: move e820_register_active() to e820.c

to prepare 32-bit to use it.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c    | 109 ++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/e820_64.c |  97 -----------------------------------------
 include/asm-x86/e820.h    |  11 +++++
 include/asm-x86/e820_64.h |   5 ---
 4 files changed, 120 insertions(+), 102 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index cd2b99e27d43..c140f731743b 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -764,3 +764,112 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
 	return addr;
 }
 
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_X86_PAE
+#  define MAX_ARCH_PFN		(1ULL<<(36-PAGE_SHIFT))
+# else
+#  define MAX_ARCH_PFN		(1ULL<<(32-PAGE_SHIFT))
+# endif
+#else /* CONFIG_X86_32 */
+# define MAX_ARCH_PFN MAXMEM<<PAGE_SHIFT
+#endif
+
+/*
+ * Last pfn which the user wants to use.
+ */
+unsigned long __initdata end_user_pfn = MAX_ARCH_PFN;
+
+/*
+ * Find the highest page frame number we have available
+ */
+unsigned long __init e820_end_of_ram(void)
+{
+	unsigned long last_pfn;
+	unsigned long max_arch_pfn = MAX_ARCH_PFN;
+
+	last_pfn = find_max_pfn_with_active_regions();
+
+	if (last_pfn > max_arch_pfn)
+		last_pfn = max_arch_pfn;
+	if (last_pfn > end_user_pfn)
+		last_pfn = end_user_pfn;
+
+	printk(KERN_INFO "last_pfn = %lu max_arch_pfn = %lu\n",
+			 last_pfn, max_arch_pfn);
+	return last_pfn;
+}
+
+/*
+ * Finds an active region in the address range from start_pfn to last_pfn and
+ * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
+ */
+int __init e820_find_active_region(const struct e820entry *ei,
+				  unsigned long start_pfn,
+				  unsigned long last_pfn,
+				  unsigned long *ei_startpfn,
+				  unsigned long *ei_endpfn)
+{
+	u64 align = PAGE_SIZE;
+
+	*ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
+	*ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
+
+	/* Skip map entries smaller than a page */
+	if (*ei_startpfn >= *ei_endpfn)
+		return 0;
+
+	/* Skip if map is outside the node */
+	if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
+				    *ei_startpfn >= last_pfn)
+		return 0;
+
+	/* Check for overlaps */
+	if (*ei_startpfn < start_pfn)
+		*ei_startpfn = start_pfn;
+	if (*ei_endpfn > last_pfn)
+		*ei_endpfn = last_pfn;
+
+	/* Obey end_user_pfn to save on memmap */
+	if (*ei_startpfn >= end_user_pfn)
+		return 0;
+	if (*ei_endpfn > end_user_pfn)
+		*ei_endpfn = end_user_pfn;
+
+	return 1;
+}
+
+/* Walk the e820 map and register active regions within a node */
+void __init e820_register_active_regions(int nid, unsigned long start_pfn,
+					 unsigned long last_pfn)
+{
+	unsigned long ei_startpfn;
+	unsigned long ei_endpfn;
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++)
+		if (e820_find_active_region(&e820.map[i],
+					    start_pfn, last_pfn,
+					    &ei_startpfn, &ei_endpfn))
+			add_active_range(nid, ei_startpfn, ei_endpfn);
+}
+
+/*
+ * Find the hole size (in bytes) in the memory range.
+ * @start: starting address of the memory range to scan
+ * @end: ending address of the memory range to scan
+ */
+u64 __init e820_hole_size(u64 start, u64 end)
+{
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long last_pfn = end >> PAGE_SHIFT;
+	unsigned long ei_startpfn, ei_endpfn, ram = 0;
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		if (e820_find_active_region(&e820.map[i],
+					    start_pfn, last_pfn,
+					    &ei_startpfn, &ei_endpfn))
+			ram += ei_endpfn - ei_startpfn;
+	}
+	return end - start - ((u64)ram << PAGE_SHIFT);
+}
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index a11bec20f65d..0afee2ca0bf8 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -41,29 +41,6 @@ unsigned long end_pfn;
  */
 unsigned long max_pfn_mapped;
 
-/*
- * Last pfn which the user wants to use.
- */
-static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
-
-/*
- * Find the highest page frame number we have available
- */
-unsigned long __init e820_end_of_ram(void)
-{
-	unsigned long last_pfn;
-
-	last_pfn = find_max_pfn_with_active_regions();
-
-	if (last_pfn > MAXMEM>>PAGE_SHIFT)
-		last_pfn = MAXMEM>>PAGE_SHIFT;
-	if (last_pfn > end_user_pfn)
-		last_pfn = end_user_pfn;
-
-	printk(KERN_INFO "last_pfn = %lu\n", last_pfn);
-	return last_pfn;
-}
-
 /*
  * Mark e820 reserved areas as busy for the resource manager.
  */
@@ -88,80 +65,6 @@ void __init e820_reserve_resources(void)
 	}
 }
 
-/*
- * Finds an active region in the address range from start_pfn to last_pfn and
- * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
- */
-static int __init e820_find_active_region(const struct e820entry *ei,
-					  unsigned long start_pfn,
-					  unsigned long last_pfn,
-					  unsigned long *ei_startpfn,
-					  unsigned long *ei_endpfn)
-{
-	*ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
-	*ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
-
-	/* Skip map entries smaller than a page */
-	if (*ei_startpfn >= *ei_endpfn)
-		return 0;
-
-	/* Skip if map is outside the node */
-	if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
-				    *ei_startpfn >= last_pfn)
-		return 0;
-
-	/* Check for overlaps */
-	if (*ei_startpfn < start_pfn)
-		*ei_startpfn = start_pfn;
-	if (*ei_endpfn > last_pfn)
-		*ei_endpfn = last_pfn;
-
-	/* Obey end_user_pfn to save on memmap */
-	if (*ei_startpfn >= end_user_pfn)
-		return 0;
-	if (*ei_endpfn > end_user_pfn)
-		*ei_endpfn = end_user_pfn;
-
-	return 1;
-}
-
-/* Walk the e820 map and register active regions within a node */
-void __init
-e820_register_active_regions(int nid, unsigned long start_pfn,
-							unsigned long last_pfn)
-{
-	unsigned long ei_startpfn;
-	unsigned long ei_endpfn;
-	int i;
-
-	for (i = 0; i < e820.nr_map; i++)
-		if (e820_find_active_region(&e820.map[i],
-					    start_pfn, last_pfn,
-					    &ei_startpfn, &ei_endpfn))
-			add_active_range(nid, ei_startpfn, ei_endpfn);
-}
-
-/*
- * Find the hole size (in bytes) in the memory range.
- * @start: starting address of the memory range to scan
- * @end: ending address of the memory range to scan
- */
-unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
-{
-	unsigned long start_pfn = start >> PAGE_SHIFT;
-	unsigned long last_pfn = end >> PAGE_SHIFT;
-	unsigned long ei_startpfn, ei_endpfn, ram = 0;
-	int i;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		if (e820_find_active_region(&e820.map[i],
-					    start_pfn, last_pfn,
-					    &ei_startpfn, &ei_endpfn))
-			ram += ei_endpfn - ei_startpfn;
-	}
-	return end - start - (ram << PAGE_SHIFT);
-}
-
 static void early_panic(char *msg)
 {
 	early_printk(msg);
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index ee8fe4c5da41..44ed9c0a4dfd 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -79,6 +79,8 @@ static inline void e820_mark_nosave_regions(unsigned long limit_pfn)
 }
 #endif
 
+extern unsigned long end_user_pfn;
+
 extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
 extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
 extern void reserve_early(u64 start, u64 end, char *name);
@@ -86,6 +88,15 @@ extern void free_early(u64 start, u64 end);
 extern void early_res_to_bootmem(u64 start, u64 end);
 extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
 
+extern unsigned long e820_end_of_ram(void);
+extern int e820_find_active_region(const struct e820entry *ei,
+				  unsigned long start_pfn,
+				  unsigned long last_pfn,
+				  unsigned long *ei_startpfn,
+				  unsigned long *ei_endpfn);
+extern void e820_register_active_regions(int nid, unsigned long start_pfn,
+					 unsigned long end_pfn);
+extern u64 e820_hole_size(u64 start, u64 end);
 #endif /* __ASSEMBLY__ */
 
 #define ISA_START_ADDRESS	0xa0000
diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h
index 917963ccab69..368585daaa42 100644
--- a/include/asm-x86/e820_64.h
+++ b/include/asm-x86/e820_64.h
@@ -16,16 +16,11 @@
 #ifndef __ASSEMBLY__
 extern void setup_memory_region(void);
 extern void contig_e820_setup(void);
-extern unsigned long e820_end_of_ram(void);
 extern void e820_reserve_resources(void);
 extern int e820_any_non_reserved(unsigned long start, unsigned long end);
 extern int is_memory_any_valid(unsigned long start, unsigned long end);
 extern int e820_all_non_reserved(unsigned long start, unsigned long end);
 extern int is_memory_all_valid(unsigned long start, unsigned long end);
-extern unsigned long e820_hole_size(unsigned long start, unsigned long end);
-
-extern void e820_register_active_regions(int nid, unsigned long start_pfn,
-					 unsigned long end_pfn);
 
 extern void finish_e820_parsing(void);
 
-- 
cgit v1.2.3


From 7b2a0a6c4866cac146dcb0433e6984eb19a81335 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 3 Jun 2008 19:35:04 -0700
Subject: x86: make 32-bit use e820_register_active_regions()

this way 32-bit is more similar to 64-bit, and smarter e820 and numa.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820_32.c  | 68 ++--------------------------------------------
 arch/x86/kernel/numaq_32.c |  3 ++
 arch/x86/kernel/setup_32.c | 24 ++++++++++++----
 arch/x86/kernel/srat_32.c  |  4 ++-
 arch/x86/mm/discontig_32.c | 19 ++++---------
 include/asm-x86/e820_32.h  |  2 --
 6 files changed, 33 insertions(+), 87 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
index 0c025d04fc2e..e8a3b968c9fa 100644
--- a/arch/x86/kernel/e820_32.c
+++ b/arch/x86/kernel/e820_32.c
@@ -207,69 +207,6 @@ void __init init_iomem_resources(struct resource *code_resource,
 	}
 }
 
-/*
- * Find the highest page frame number we have available
- */
-void __init find_max_pfn(void)
-{
-	int i;
-
-	max_pfn = 0;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		unsigned long start, end;
-		/* RAM? */
-		if (e820.map[i].type != E820_RAM)
-			continue;
-		start = PFN_UP(e820.map[i].addr);
-		end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
-		if (start >= end)
-			continue;
-		if (end > max_pfn)
-			max_pfn = end;
-	}
-}
-
-/*
- * Register fully available low RAM pages with the bootmem allocator.
- */
-void __init register_bootmem_low_pages(unsigned long max_low_pfn)
-{
-	int i;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		unsigned long curr_pfn, last_pfn, size;
-		/*
-		 * Reserve usable low memory
-		 */
-		if (e820.map[i].type != E820_RAM)
-			continue;
-		/*
-		 * We are rounding up the start address of usable memory:
-		 */
-		curr_pfn = PFN_UP(e820.map[i].addr);
-		if (curr_pfn >= max_low_pfn)
-			continue;
-		/*
-		 * ... and at the end of the usable range downwards:
-		 */
-		last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
-
-		if (last_pfn > max_low_pfn)
-			last_pfn = max_low_pfn;
-
-		/*
-		 * .. finally, did all the rounding and playing
-		 * around just make the area go away?
-		 */
-		if (last_pfn <= curr_pfn)
-			continue;
-
-		size = last_pfn - curr_pfn;
-		free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
-	}
-}
-
 void __init limit_regions(unsigned long long size)
 {
 	unsigned long long current_addr;
@@ -360,8 +297,9 @@ static int __init parse_memmap(char *arg)
 		 * size before original memory map is
 		 * reset.
 		 */
-		find_max_pfn();
-		saved_max_pfn = max_pfn;
+		e820_register_active_regions(0, 0, -1UL);
+		saved_max_pfn = e820_end_of_ram();
+		remove_all_active_ranges();
 #endif
 		e820.nr_map = 0;
 		user_defined_memmap = 1;
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index 922be66668b8..27b908254fb0 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -32,6 +32,7 @@
 #include <asm/topology.h>
 #include <asm/processor.h>
 #include <asm/mpspec.h>
+#include <asm/e820.h>
 
 #define	MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
 
@@ -61,6 +62,8 @@ static void __init smp_dump_qct(void)
 			node_end_pfn[node] = MB_TO_PAGES(
 				eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
 
+			e820_register_active_regions(node, node_start_pfn[node],
+							node_end_pfn[node]);
 			memory_present(node,
 				node_start_pfn[node], node_end_pfn[node]);
 			node_remap_size[node] = node_memmap_size_bytes(node,
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index ccf3595202fa..0ec6480aaa27 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -405,11 +405,12 @@ static void __init zone_sizes_init(void)
 	max_zone_pfns[ZONE_DMA] =
 		virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
 	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+	remove_all_active_ranges();
 #ifdef CONFIG_HIGHMEM
 	max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
-	add_active_range(0, 0, highend_pfn);
+	e820_register_active_regions(0, 0, highend_pfn);
 #else
-	add_active_range(0, 0, max_low_pfn);
+	e820_register_active_regions(0, 0, max_low_pfn);
 #endif
 
 	free_area_init_nodes(max_zone_pfns);
@@ -582,6 +583,7 @@ static void __init relocate_initrd(void)
 
 void __init setup_bootmem_allocator(void)
 {
+	int i;
 	unsigned long bootmap_size, bootmap;
 	/*
 	 * Initialize the boot-time allocator (with low memory only):
@@ -603,7 +605,8 @@ void __init setup_bootmem_allocator(void)
 		 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
 	printk(KERN_INFO "  bootmap %08lx - %08lx\n",
 		 bootmap, bootmap + bootmap_size);
-	register_bootmem_low_pages(max_low_pfn);
+	for_each_online_node(i)
+		free_bootmem_with_active_regions(i, max_low_pfn);
 	early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
 
 #ifdef CONFIG_ACPI_SLEEP
@@ -733,11 +736,20 @@ void __init setup_arch(char **cmdline_p)
 	if (efi_enabled)
 		efi_init();
 
+	e820_register_active_regions(0, 0, -1UL);
+	/*
+	 * partially used pages are not usable - thus
+	 * we are rounding upwards:
+	 */
+	max_pfn = e820_end_of_ram();
+
 	/* update e820 for memory not covered by WB MTRRs */
-	find_max_pfn();
 	mtrr_bp_init();
-	if (mtrr_trim_uncached_memory(max_pfn))
-		find_max_pfn();
+	if (mtrr_trim_uncached_memory(max_pfn)) {
+		remove_all_active_ranges();
+		e820_register_active_regions(0, 0, -1UL);
+		max_pfn = e820_end_of_ram();
+	}
 
 	max_low_pfn = setup_memory();
 
diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c
index 88971ee166d4..32d8b1142938 100644
--- a/arch/x86/kernel/srat_32.c
+++ b/arch/x86/kernel/srat_32.c
@@ -31,6 +31,7 @@
 #include <asm/srat.h>
 #include <asm/topology.h>
 #include <asm/smp.h>
+#include <asm/e820.h>
 
 /*
  * proximity macros and definitions
@@ -244,7 +245,8 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
 		printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
 		       j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
 		node_read_chunk(chunk->nid, chunk);
-		add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn);
+		e820_register_active_regions(chunk->nid, chunk->start_pfn,
+					     min(chunk->end_pfn, max_pfn));
 	}
  
 	for_each_online_node(nid) {
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 7ced26ab9aec..a89ccf3d4c14 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -120,10 +120,9 @@ int __init get_memcfg_numa_flat(void)
 {
 	printk("NUMA - single node, flat memory mode\n");
 
-	/* Run the memory configuration and find the top of memory. */
-	find_max_pfn();
 	node_start_pfn[0] = 0;
 	node_end_pfn[0] = max_pfn;
+	e820_register_active_regions(0, 0, max_pfn);
 	memory_present(0, 0, max_pfn);
 	node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
 
@@ -337,6 +336,11 @@ unsigned long __init setup_memory(void)
 	 * this space and use it to adjust the boundary between ZONE_NORMAL
 	 * and ZONE_HIGHMEM.
 	 */
+
+	/* call find_max_low_pfn at first, it could update max_pfn */
+	system_max_low_pfn = max_low_pfn = find_max_low_pfn();
+
+	remove_all_active_ranges();
 	get_memcfg_numa();
 
 	kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE);
@@ -344,7 +348,6 @@ unsigned long __init setup_memory(void)
 	/* partially used pages are not usable - thus round upwards */
 	system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
 
-	system_max_low_pfn = max_low_pfn = find_max_low_pfn();
 	kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
 	do {
 		kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
@@ -402,7 +405,6 @@ unsigned long __init setup_memory(void)
 
 void __init zone_sizes_init(void)
 {
-	int nid;
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 	max_zone_pfns[ZONE_DMA] =
@@ -412,15 +414,6 @@ void __init zone_sizes_init(void)
 	max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
 #endif
 
-	/* If SRAT has not registered memory, register it now */
-	if (find_max_pfn_with_active_regions() == 0) {
-		for_each_online_node(nid) {
-			if (node_has_online_mem(nid))
-				add_active_range(nid, node_start_pfn[nid],
-							node_end_pfn[nid]);
-		}
-	}
-
 	free_area_init_nodes(max_zone_pfns);
 	return;
 }
diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h
index 00fbc60b9d30..212b74c10efc 100644
--- a/include/asm-x86/e820_32.h
+++ b/include/asm-x86/e820_32.h
@@ -21,8 +21,6 @@
 extern void setup_memory_map(void);
 extern void finish_e820_parsing(void);
 
-extern void find_max_pfn(void);
-extern void register_bootmem_low_pages(unsigned long max_low_pfn);
 extern void limit_regions(unsigned long long size);
 extern void init_iomem_resources(struct resource *code_resource,
 				 struct resource *data_resource,
-- 
cgit v1.2.3


From bd70e522afce2f7837d081dc52f261ecf9d4d2d5 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 4 Jun 2008 13:21:29 -0700
Subject: x86: e820 max_arch_pfn typo fix for 64 bit

should use right shift

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/e820.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index c140f731743b..5d33b9c08d1b 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -771,7 +771,7 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
 #  define MAX_ARCH_PFN		(1ULL<<(32-PAGE_SHIFT))
 # endif
 #else /* CONFIG_X86_32 */
-# define MAX_ARCH_PFN MAXMEM<<PAGE_SHIFT
+# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
 #endif
 
 /*
-- 
cgit v1.2.3


From fa5b8a30cf03520737e9a0ee2ee03a61b2eccf05 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@suse.cz>
Date: Mon, 26 May 2008 20:40:47 +0200
Subject: aperture_64.c: duplicated code, buggy?

Hi!

void __init early_gart_iommu_check(void)

contains

	for (num = 24; num < 32; num++) {
		if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
			continue;

loop, with very similar loop duplicated in

void __init gart_iommu_hole_init(void)

. First copy of a loop seems to be buggy, too. It uses 0 as a "nothing
set" value, which may actually bite us in last_aper_enabled case
(because it may be often zero).

(Beware, it is hard to test this patch, because this code has about
2^8 different code paths, depending on hardware and cmdline settings).

Plus, the second loop does not check for consistency of
aper_enabled. Should it?

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/aperture_64.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 6eea42eb287b..e5b17f910f8b 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -271,16 +271,16 @@ void __init early_gart_iommu_check(void)
 	 * or BIOS forget to put that in reserved.
 	 * try to update e820 to make that region as reserved.
 	 */
-	int fix, slot;
+	int i, fix, slot;
 	u32 ctl;
 	u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
 	u64 aper_base = 0, last_aper_base = 0;
-	int aper_enabled = 0, last_aper_enabled = 0;
-	int i;
+	int aper_enabled = 0, last_aper_enabled = 0, last_valid = 0;
 
 	if (!early_pci_allowed())
 		return;
 
+	/* This is mostly duplicate of iommu_hole_init */
 	fix = 0;
 	for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
 		int bus;
@@ -301,19 +301,22 @@ void __init early_gart_iommu_check(void)
 			aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
 			aper_base <<= 25;
 
-			if ((last_aper_order && aper_order != last_aper_order) ||
-			    (last_aper_base && aper_base != last_aper_base) ||
-			    (last_aper_enabled && aper_enabled != last_aper_enabled)) {
-				fix = 1;
-				goto out;
+			if (last_valid) {
+				if ((aper_order != last_aper_order) ||
+				    (aper_base != last_aper_base) ||
+				    (aper_enabled != last_aper_enabled)) {
+					fix = 1;
+					break;
+				}
 			}
+
 			last_aper_order = aper_order;
 			last_aper_base = aper_base;
 			last_aper_enabled = aper_enabled;
+			last_valid = 1;
 		}
 	}
 
-out:
 	if (!fix && !aper_enabled)
 		return;
 
-- 
cgit v1.2.3


From 4f384f8bcdb5d618a0a68fb84c809e602c798b8f Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@suse.cz>
Date: Mon, 26 May 2008 21:17:30 +0200
Subject: x86: aperture_64.c: corner case wrong

If

fix == 0, aper_enabled == 1, gart_fix_e820 == 0

	if (!fix && !aper_enabled)
		return;

	if (gart_fix_e820 && !fix && aper_enabled) {
		if (e820_any_mapped(aper_base, aper_base + aper_size,
				    E820_RAM)) {
			/* reserve it, so we can reuse it in second kernel */
			printk(KERN_INFO "update e820 for GART\n");
			add_memory_region(aper_base, aper_size, E820_RESERVED);
			update_e820();
		}
		return;
	}

	/* different nodes have different setting, disable them all atfirst*/

we'll fall back here and disable all the settings, even when they were
all consistent.

What about this? (I hope it compiles...)

Signed-off-by: Pavel Machek <pavel@suse.cz>
Cc: Dave Jones <davej@codemonkey.org.uk>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/aperture_64.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index e5b17f910f8b..eb20f168c0fd 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -331,9 +331,11 @@ void __init early_gart_iommu_check(void)
 			add_memory_region(aper_base, aper_size, E820_RESERVED);
 			update_e820();
 		}
-		return;
 	}
 
+	if (!fix)
+		return;
+
 	/* different nodes have different setting, disable them all at first*/
 	for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
 		int bus;
-- 
cgit v1.2.3


From d3fbe5ea9518b46a68e6b278974e92e2c3acef4a Mon Sep 17 00:00:00 2001
From: "Huang, Ying" <ying.huang@intel.com>
Date: Mon, 2 Jun 2008 14:26:14 +0800
Subject: x86: split out common code into find_overlapped_early()

This patch clean up reserve_early() family functions by extracting the
common part of reserve_early(), free_early() and bad_addr() into
find_overlapped_early().

Signed-off-by: Huang Ying <ying.huang@intel.com>
Cc: andi@firstfloor.org
Cc: mingo@redhat.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/e820.c | 50 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 30 insertions(+), 20 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 5d33b9c08d1b..ac5e9ebf70ea 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -558,20 +558,34 @@ static struct early_res early_res[MAX_EARLY_RES] __initdata = {
 	{}
 };
 
-void __init reserve_early(u64 start, u64 end, char *name)
+static int __init find_overlapped_early(u64 start, u64 end)
 {
 	int i;
 	struct early_res *r;
+
 	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
 		r = &early_res[i];
 		if (end > r->start && start < r->end)
-			panic("Overlapping early reservations %llx-%llx %s to %llx-%llx %s\n",
-			      start, end - 1, name?name:"", r->start,
-			      r->end - 1, r->name);
+			break;
 	}
+
+	return i;
+}
+
+void __init reserve_early(u64 start, u64 end, char *name)
+{
+	int i;
+	struct early_res *r;
+
+	i = find_overlapped_early(start, end);
 	if (i >= MAX_EARLY_RES)
 		panic("Too many early reservations");
 	r = &early_res[i];
+	if (r->end)
+		panic("Overlapping early reservations "
+		      "%llx-%llx %s to %llx-%llx %s\n",
+		      start, end - 1, name?name:"", r->start,
+		      r->end - 1, r->name);
 	r->start = start;
 	r->end = end;
 	if (name)
@@ -583,14 +597,11 @@ void __init free_early(u64 start, u64 end)
 	struct early_res *r;
 	int i, j;
 
-	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
-		r = &early_res[i];
-		if (start == r->start && end == r->end)
-			break;
-	}
-	if (i >= MAX_EARLY_RES || !early_res[i].end)
+	i = find_overlapped_early(start, end);
+	r = &early_res[i];
+	if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
 		panic("free_early on not reserved area: %llx-%llx!",
-			 start, end);
+			 start, end - 1);
 
 	for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
 		;
@@ -626,17 +637,16 @@ void __init early_res_to_bootmem(u64 start, u64 end)
 static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
 {
 	int i;
-	u64 addr = *addrp, last;
+	u64 addr = *addrp;
 	int changed = 0;
+	struct early_res *r;
 again:
-	last = addr + size;
-	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
-		struct early_res *r = &early_res[i];
-		if (last >= r->start && addr < r->end) {
-			*addrp = addr = round_up(r->end, align);
-			changed = 1;
-			goto again;
-		}
+	i = find_overlapped_early(addr, addr + size);
+	r = &early_res[i];
+	if (i < MAX_EARLY_RES && r->end) {
+		*addrp = addr = round_up(r->end, align);
+		changed = 1;
+		goto again;
 	}
 	return changed;
 }
-- 
cgit v1.2.3


From d0ec2c6f2c2f0478b34ae78b3e65f60a561ac807 Mon Sep 17 00:00:00 2001
From: "Huang, Ying" <ying.huang@intel.com>
Date: Mon, 2 Jun 2008 14:26:18 +0800
Subject: x86: reserve highmem pages via reserve_early

This patch makes early reserved highmem pages become reserved
pages. This can be used for highmem pages allocated by bootloader such
as EFI memory map, linked list of setup_data, etc.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Cc: andi@firstfloor.org
Cc: mingo@redhat.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/e820.c | 11 +++++++++++
 arch/x86/mm/init_32.c  |  3 ++-
 include/asm-x86/e820.h |  1 +
 3 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index ac5e9ebf70ea..a706e9057ba5 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -612,6 +612,17 @@ void __init free_early(u64 start, u64 end)
 	early_res[j - 1].end = 0;
 }
 
+int __init page_is_reserved_early(unsigned long pagenr)
+{
+	u64 start = (u64)pagenr << PAGE_SHIFT;
+	int i;
+	struct early_res *r;
+
+	i = find_overlapped_early(start, start + PAGE_SIZE);
+	r = &early_res[i];
+	return (i < MAX_EARLY_RES && r->end);
+}
+
 void __init early_res_to_bootmem(u64 start, u64 end)
 {
 	int i;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index ec30d10154b6..0e7bb5e81670 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -289,7 +289,8 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
 
 void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
 {
-	if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
+	if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn)) &&
+	    !page_is_reserved_early(pfn)) {
 		ClearPageReserved(page);
 		init_page_count(page);
 		__free_page(page);
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index 44ed9c0a4dfd..8aa32323a182 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -86,6 +86,7 @@ extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
 extern void reserve_early(u64 start, u64 end, char *name);
 extern void free_early(u64 start, u64 end);
 extern void early_res_to_bootmem(u64 start, u64 end);
+extern int page_is_reserved_early(unsigned long pagenr);
 extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
 
 extern unsigned long e820_end_of_ram(void);
-- 
cgit v1.2.3


From ecacf09f7d26b2317e8b1d59fa40f62081fad0bb Mon Sep 17 00:00:00 2001
From: "Huang, Ying" <ying.huang@intel.com>
Date: Mon, 2 Jun 2008 14:26:21 +0800
Subject: x86: reserve EFI memory map with reserve_early

This patch reserves the EFI memory map with reserve_early(). Because EFI
memory map is allocated by bootloader, if it is not reserved by
reserved_early(), it may be overwritten through address returned by
find_e820_area().

Signed-off-by: Huang Ying <ying.huang@intel.com>
Cc: andi@firstfloor.org
Cc: mingo@redhat.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/efi.c      | 33 ++++++++++++++++++++-------------
 arch/x86/kernel/efi_64.c   |  6 ------
 arch/x86/kernel/setup_32.c |  5 ++++-
 arch/x86/kernel/setup_64.c |  7 +++----
 include/asm-x86/efi.h      |  2 +-
 5 files changed, 28 insertions(+), 25 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 4a1a26d5931f..d5c7fcdd1861 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -238,6 +238,23 @@ static void __init add_efi_memmap(void)
 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
 }
 
+void __init efi_reserve_early(void)
+{
+	unsigned long pmap;
+
+	pmap = boot_params.efi_info.efi_memmap;
+#ifdef CONFIG_X86_64
+	pmap += (__u64)boot_params.efi_info.efi_memmap_hi << 32;
+#endif
+	memmap.phys_map = (void *)pmap;
+	memmap.nr_map = boot_params.efi_info.efi_memmap_size /
+		boot_params.efi_info.efi_memdesc_size;
+	memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
+	memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
+	reserve_early(pmap, pmap + memmap.nr_map * memmap.desc_size,
+		      "EFI memmap");
+}
+
 #if EFI_DEBUG
 static void __init print_efi_memmap(void)
 {
@@ -267,21 +284,11 @@ void __init efi_init(void)
 	int i = 0;
 	void *tmp;
 
-#ifdef CONFIG_X86_32
 	efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab;
-	memmap.phys_map = (void *)boot_params.efi_info.efi_memmap;
-#else
-	efi_phys.systab = (efi_system_table_t *)
-		(boot_params.efi_info.efi_systab |
-		 ((__u64)boot_params.efi_info.efi_systab_hi<<32));
-	memmap.phys_map = (void *)
-		(boot_params.efi_info.efi_memmap |
-		 ((__u64)boot_params.efi_info.efi_memmap_hi<<32));
+#ifdef CONFIG_X86_64
+	efi_phys.systab = (void *)efi_phys.systab +
+		((__u64)boot_params.efi_info.efi_systab_hi<<32);
 #endif
-	memmap.nr_map = boot_params.efi_info.efi_memmap_size /
-		boot_params.efi_info.efi_memdesc_size;
-	memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
-	memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
 
 	efi.systab = early_ioremap((unsigned long)efi_phys.systab,
 				   sizeof(efi_system_table_t));
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index 21a7b759687a..652c5287215f 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -97,12 +97,6 @@ void __init efi_call_phys_epilog(void)
 	early_runtime_code_mapping_set_exec(0);
 }
 
-void __init efi_reserve_bootmem(void)
-{
-	reserve_bootmem_generic((unsigned long)memmap.phys_map,
-				memmap.nr_map * memmap.desc_size);
-}
-
 void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size)
 {
 	static unsigned pages_mapped __initdata;
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 0ec6480aaa27..2960cbecfa5a 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -67,6 +67,7 @@
 #include <asm/bios_ebda.h>
 #include <asm/cacheflush.h>
 #include <asm/processor.h>
+#include <asm/efi.h>
 
 /* This value is set up by the early boot code to point to the value
    immediately after the boot time page tables.  It contains a *physical*
@@ -683,8 +684,10 @@ void __init setup_arch(char **cmdline_p)
 
 #ifdef CONFIG_EFI
 	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
-		     "EL32", 4))
+		     "EL32", 4)) {
 		efi_enabled = 1;
+		efi_reserve_early();
+	}
 #endif
 
 	ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 2599b2744207..078c02f6f5f9 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -330,8 +330,10 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #ifdef CONFIG_EFI
 	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
-		     "EL64", 4))
+		     "EL64", 4)) {
 		efi_enabled = 1;
+		efi_reserve_early();
+	}
 #endif
 
 	ARCH_SETUP
@@ -457,9 +459,6 @@ void __init setup_arch(char **cmdline_p)
        acpi_reserve_bootmem();
 #endif
 
-	if (efi_enabled)
-		efi_reserve_bootmem();
-
 #ifdef CONFIG_X86_MPPARSE
        /*
 	* Find and reserve possible boot-time SMP configuration:
diff --git a/include/asm-x86/efi.h b/include/asm-x86/efi.h
index d53004b855cc..7ed2bd7a7f51 100644
--- a/include/asm-x86/efi.h
+++ b/include/asm-x86/efi.h
@@ -90,7 +90,7 @@ extern void *efi_ioremap(unsigned long addr, unsigned long size);
 
 #endif /* CONFIG_X86_32 */
 
-extern void efi_reserve_bootmem(void);
+extern void efi_reserve_early(void);
 extern void efi_call_phys_prelog(void);
 extern void efi_call_phys_epilog(void);
 
-- 
cgit v1.2.3


From 0c51a965ed3c44dd50497e74492a015680e49899 Mon Sep 17 00:00:00 2001
From: "Huang, Ying" <ying.huang@intel.com>
Date: Mon, 2 Jun 2008 14:26:23 +0800
Subject: x86: extract common part of head32.c and head64.c into head.c

This patch extracts the common part of head32.c and head64.c into head.c.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Cc: andi@firstfloor.org
Cc: mingo@redhat.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/Makefile           |  1 +
 arch/x86/kernel/Makefile    |  2 +-
 arch/x86/kernel/head.c      | 55 +++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/head32.c    | 50 -----------------------------------------
 arch/x86/kernel/head64.c    | 50 -----------------------------------------
 include/asm-x86/bios_ebda.h |  2 ++
 6 files changed, 59 insertions(+), 101 deletions(-)
 create mode 100644 arch/x86/kernel/head.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 3cff3c894cf3..30b046bcccb6 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -160,6 +160,7 @@ KBUILD_AFLAGS += $(mflags-y)
 
 head-y := arch/x86/kernel/head_$(BITS).o
 head-y += arch/x86/kernel/head$(BITS).o
+head-y += arch/x86/kernel/head.o
 head-y += arch/x86/kernel/init_task.o
 
 libs-y  += arch/x86/lib/
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5369a4e36f17..7e01ce39c836 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the linux kernel.
 #
 
-extra-y                := head_$(BITS).o head$(BITS).o init_task.o vmlinux.lds
+extra-y                := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds
 
 CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
new file mode 100644
index 000000000000..e0d0ce58979e
--- /dev/null
+++ b/arch/x86/kernel/head.c
@@ -0,0 +1,55 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+
+#include <asm/setup.h>
+#include <asm/bios_ebda.h>
+
+#define BIOS_LOWMEM_KILOBYTES 0x413
+
+/*
+ * The BIOS places the EBDA/XBDA at the top of conventional
+ * memory, and usually decreases the reported amount of
+ * conventional memory (int 0x12) too. This also contains a
+ * workaround for Dell systems that neglect to reserve EBDA.
+ * The same workaround also avoids a problem with the AMD768MPX
+ * chipset: reserve a page before VGA to prevent PCI prefetch
+ * into it (errata #56). Usually the page is reserved anyways,
+ * unless you have no PS/2 mouse plugged in.
+ */
+void __init reserve_ebda_region(void)
+{
+	unsigned int lowmem, ebda_addr;
+
+	/* To determine the position of the EBDA and the */
+	/* end of conventional memory, we need to look at */
+	/* the BIOS data area. In a paravirtual environment */
+	/* that area is absent. We'll just have to assume */
+	/* that the paravirt case can handle memory setup */
+	/* correctly, without our help. */
+	if (paravirt_enabled())
+		return;
+
+	/* end of low (conventional) memory */
+	lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
+	lowmem <<= 10;
+
+	/* start of EBDA area */
+	ebda_addr = get_bios_ebda();
+
+	/* Fixup: bios puts an EBDA in the top 64K segment */
+	/* of conventional memory, but does not adjust lowmem. */
+	if ((lowmem - ebda_addr) <= 0x10000)
+		lowmem = ebda_addr;
+
+	/* Fixup: bios does not report an EBDA at all. */
+	/* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
+	if ((ebda_addr == 0) && (lowmem >= 0x9f000))
+		lowmem = 0x9f000;
+
+	/* Paranoia: should never happen, but... */
+	if ((lowmem == 0) || (lowmem >= 0x100000))
+		lowmem = 0x9f000;
+
+	/* reserve all memory between lowmem and the 1MB mark */
+	reserve_early(lowmem, 0x100000, "BIOS reserved");
+}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 0b6cb9fba71e..fa1d25dd83e3 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -13,56 +13,6 @@
 #include <asm/e820.h>
 #include <asm/bios_ebda.h>
 
-#define BIOS_LOWMEM_KILOBYTES 0x413
-
-/*
- * The BIOS places the EBDA/XBDA at the top of conventional
- * memory, and usually decreases the reported amount of
- * conventional memory (int 0x12) too. This also contains a
- * workaround for Dell systems that neglect to reserve EBDA.
- * The same workaround also avoids a problem with the AMD768MPX
- * chipset: reserve a page before VGA to prevent PCI prefetch
- * into it (errata #56). Usually the page is reserved anyways,
- * unless you have no PS/2 mouse plugged in.
- */
-static void __init reserve_ebda_region(void)
-{
-	unsigned int lowmem, ebda_addr;
-
-	/* To determine the position of the EBDA and the */
-	/* end of conventional memory, we need to look at */
-	/* the BIOS data area. In a paravirtual environment */
-	/* that area is absent. We'll just have to assume */
-	/* that the paravirt case can handle memory setup */
-	/* correctly, without our help. */
-	if (paravirt_enabled())
-		return;
-
-	/* end of low (conventional) memory */
-	lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
-	lowmem <<= 10;
-
-	/* start of EBDA area */
-	ebda_addr = get_bios_ebda();
-
-	/* Fixup: bios puts an EBDA in the top 64K segment */
-	/* of conventional memory, but does not adjust lowmem. */
-	if ((lowmem - ebda_addr) <= 0x10000)
-		lowmem = ebda_addr;
-
-	/* Fixup: bios does not report an EBDA at all. */
-	/* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
-	if ((ebda_addr == 0) && (lowmem >= 0x9f000))
-		lowmem = 0x9f000;
-
-	/* Paranoia: should never happen, but... */
-	if ((lowmem == 0) || (lowmem >= 0x100000))
-		lowmem = 0x9f000;
-
-	/* reserve all memory between lowmem and the 1MB mark */
-	reserve_early(lowmem, 0x100000, "BIOS reserved");
-}
-
 void __init i386_start_kernel(void)
 {
 	reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index e25c57b8aa84..f1773c8ee21e 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -51,56 +51,6 @@ static void __init copy_bootdata(char *real_mode_data)
 	}
 }
 
-#define BIOS_LOWMEM_KILOBYTES 0x413
-
-/*
- * The BIOS places the EBDA/XBDA at the top of conventional
- * memory, and usually decreases the reported amount of
- * conventional memory (int 0x12) too. This also contains a
- * workaround for Dell systems that neglect to reserve EBDA.
- * The same workaround also avoids a problem with the AMD768MPX
- * chipset: reserve a page before VGA to prevent PCI prefetch
- * into it (errata #56). Usually the page is reserved anyways,
- * unless you have no PS/2 mouse plugged in.
- */
-static void __init reserve_ebda_region(void)
-{
-	unsigned int lowmem, ebda_addr;
-
-	/* To determine the position of the EBDA and the */
-	/* end of conventional memory, we need to look at */
-	/* the BIOS data area. In a paravirtual environment */
-	/* that area is absent. We'll just have to assume */
-	/* that the paravirt case can handle memory setup */
-	/* correctly, without our help. */
-	if (paravirt_enabled())
-		return;
-
-	/* end of low (conventional) memory */
-	lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
-	lowmem <<= 10;
-
-	/* start of EBDA area */
-	ebda_addr = get_bios_ebda();
-
-	/* Fixup: bios puts an EBDA in the top 64K segment */
-	/* of conventional memory, but does not adjust lowmem. */
-	if ((lowmem - ebda_addr) <= 0x10000)
-		lowmem = ebda_addr;
-
-	/* Fixup: bios does not report an EBDA at all. */
-	/* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
-	if ((ebda_addr == 0) && (lowmem >= 0x9f000))
-		lowmem = 0x9f000;
-
-	/* Paranoia: should never happen, but... */
-	if ((lowmem == 0) || (lowmem >= 0x100000))
-		lowmem = 0x9f000;
-
-	/* reserve all memory between lowmem and the 1MB mark */
-	reserve_early(lowmem, 0x100000, "BIOS reserved");
-}
-
 static void __init reserve_setup_data(void)
 {
 	struct setup_data *data;
diff --git a/include/asm-x86/bios_ebda.h b/include/asm-x86/bios_ebda.h
index b4a46b7be794..0033e50c13b2 100644
--- a/include/asm-x86/bios_ebda.h
+++ b/include/asm-x86/bios_ebda.h
@@ -14,4 +14,6 @@ static inline unsigned int get_bios_ebda(void)
 	return address;	/* 0 means none */
 }
 
+void reserve_ebda_region(void);
+
 #endif /* _MACH_BIOS_EBDA_H */
-- 
cgit v1.2.3


From c45a707dbe35cb9aa6490223e5b1129fa3583948 Mon Sep 17 00:00:00 2001
From: "Huang, Ying" <ying.huang@intel.com>
Date: Mon, 2 Jun 2008 14:26:25 +0800
Subject: x86: linked list of setup_data for i386

This patch adds linked list of struct setup_data supported for i386.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Cc: andi@firstfloor.org
Cc: mingo@redhat.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/head.c      | 18 ++++++++++++++++++
 arch/x86/kernel/head64.c    | 18 ------------------
 arch/x86/kernel/setup.c     | 22 ++++++++++++++++++++++
 arch/x86/kernel/setup_32.c  |  3 +++
 arch/x86/kernel/setup_64.c  | 22 ----------------------
 include/asm-x86/bootparam.h |  3 +++
 6 files changed, 46 insertions(+), 40 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index e0d0ce58979e..a727c0b9819c 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -53,3 +53,21 @@ void __init reserve_ebda_region(void)
 	/* reserve all memory between lowmem and the 1MB mark */
 	reserve_early(lowmem, 0x100000, "BIOS reserved");
 }
+
+void __init reserve_setup_data(void)
+{
+	struct setup_data *data;
+	u64 pa_data;
+	char buf[32];
+
+	if (boot_params.hdr.version < 0x0209)
+		return;
+	pa_data = boot_params.hdr.setup_data;
+	while (pa_data) {
+		data = early_ioremap(pa_data, sizeof(*data));
+		sprintf(buf, "setup data %x", data->type);
+		reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
+		pa_data = data->next;
+		early_iounmap(data, sizeof(*data));
+	}
+}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index f1773c8ee21e..5fbed459ff3b 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -51,24 +51,6 @@ static void __init copy_bootdata(char *real_mode_data)
 	}
 }
 
-static void __init reserve_setup_data(void)
-{
-	struct setup_data *data;
-	unsigned long pa_data;
-	char buf[32];
-
-	if (boot_params.hdr.version < 0x0209)
-		return;
-	pa_data = boot_params.hdr.setup_data;
-	while (pa_data) {
-		data = early_ioremap(pa_data, sizeof(*data));
-		sprintf(buf, "setup data %x", data->type);
-		reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
-		pa_data = data->next;
-		early_iounmap(data, sizeof(*data));
-	}
-}
-
 void __init x86_64_start_kernel(char * real_mode_data)
 {
 	int i;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 6f80b852a196..0a281f2c7157 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -137,3 +137,25 @@ void __init setup_per_cpu_areas(void)
 }
 
 #endif
+
+void __init parse_setup_data(void)
+{
+	struct setup_data *data;
+	u64 pa_data;
+
+	if (boot_params.hdr.version < 0x0209)
+		return;
+	pa_data = boot_params.hdr.setup_data;
+	while (pa_data) {
+		data = early_ioremap(pa_data, PAGE_SIZE);
+		switch (data->type) {
+		default:
+			break;
+		}
+#ifndef CONFIG_DEBUG_BOOT_PARAMS
+		free_early(pa_data, pa_data+sizeof(*data)+data->len);
+#endif
+		pa_data = data->next;
+		early_iounmap(data, PAGE_SIZE);
+	}
+}
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 2960cbecfa5a..ee1ccdbd7100 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -681,6 +681,7 @@ void __init setup_arch(char **cmdline_p)
 	pre_setup_arch_hook();
 	early_cpu_init();
 	early_ioremap_init();
+	reserve_setup_data();
 
 #ifdef CONFIG_EFI
 	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
@@ -729,6 +730,8 @@ void __init setup_arch(char **cmdline_p)
 	bss_resource.start = virt_to_phys(&__bss_start);
 	bss_resource.end = virt_to_phys(&__bss_stop)-1;
 
+	parse_setup_data();
+
 	parse_early_param();
 
 	finish_e820_parsing();
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 078c02f6f5f9..adf3b04dc582 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -272,28 +272,6 @@ void __attribute__((weak)) __init memory_setup(void)
        machine_specific_memory_setup();
 }
 
-static void __init parse_setup_data(void)
-{
-	struct setup_data *data;
-	unsigned long pa_data;
-
-	if (boot_params.hdr.version < 0x0209)
-		return;
-	pa_data = boot_params.hdr.setup_data;
-	while (pa_data) {
-		data = early_ioremap(pa_data, PAGE_SIZE);
-		switch (data->type) {
-		default:
-			break;
-		}
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
-		free_early(pa_data, pa_data+sizeof(*data)+data->len);
-#endif
-		pa_data = data->next;
-		early_iounmap(data, PAGE_SIZE);
-	}
-}
-
 #ifdef CONFIG_PCI_MMCONFIG
 extern void __cpuinit fam10h_check_enable_mmcfg(void);
 extern void __init check_enable_amd_mmconf_dmi(void);
diff --git a/include/asm-x86/bootparam.h b/include/asm-x86/bootparam.h
index f62f4733606b..0a073904168b 100644
--- a/include/asm-x86/bootparam.h
+++ b/include/asm-x86/bootparam.h
@@ -106,4 +106,7 @@ struct boot_params {
 	__u8  _pad9[276];				/* 0xeec */
 } __attribute__((packed));
 
+void reserve_setup_data(void);
+void parse_setup_data(void);
+
 #endif /* _ASM_BOOTPARAM_H */
-- 
cgit v1.2.3


From 74e411cb6443d8bcb55fbe89fcc7a9ee574df91b Mon Sep 17 00:00:00 2001
From: Krzysztof Oledzki <ole@ans.pl>
Date: Wed, 4 Jun 2008 03:40:17 +0200
Subject: x86: add another PCI ID for ICH6 force hpet.

Tested on Asus P5GDC-V

$ lspci -n -n |grep ISA
00:1f.0 ISA bridge [0601]: Intel Corporation 82801FB/FR (ICH6/ICH6R) LPC Interface Bridge [8086:2640] (rev 03)

Force enabled HPET at base address 0xfed00000
hpet clockevent registered
hpet0: at MMIO 0xfed00000, IRQs 2, 8, 0
hpet0: 3 64-bit timers, 14318180 Hz

Signed-off-by: Krzysztof Piotr Oledzki <ole@ans.pl>
Cc: mingo@elte.hu
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/quirks.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index ddbf34d16a01..79bdcd11c66e 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -159,6 +159,8 @@ static void ich_force_enable_hpet(struct pci_dev *dev)
 
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0,
 			 ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0,
+			 ich_force_enable_hpet);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1,
 			 ich_force_enable_hpet);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0,
-- 
cgit v1.2.3


From 3ed3f06295e69700fa808396f7b350bff2b69de0 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Wed, 4 Jun 2008 01:00:47 +0400
Subject: x86: nmi - consolidate nmi_watchdog_default for 32bit mode

64bit mode bootstrap code does set nmi_watchdog to NMI_NONE
by default and doing the same on 32bit mode is safe too.
Such an action saves us from several #ifdef.

Btw, my previous commit

commit 19ec673ced067316b9732bc6d1c4ff4052e5f795
Author: Cyrill Gorcunov <gorcunov@gmail.com>
Date:   Wed May 28 23:00:47 2008 +0400

    x86: nmi - fix incorrect NMI watchdog used by default

did not fix the problem completely, moreover it
introduced additional bug - nmi_watchdog would be
set to either NMI_LOCAL_APIC or NMI_IO_APIC
_regardless_ to boot option if being enabled thru
/proc/sys/kernel/nmi_watchdog. Sorry for that.
Fix it too.

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: mingo@redhat.com
Cc: hpa@zytor.com
Cc: macro@linux-mips.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/nmi.c | 14 ++++++++------
 include/asm-x86/nmi.h |  4 +---
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index cbd4fa3c475b..27ca8f69b466 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -487,14 +487,16 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
 		return -EIO;
 	}
 
-#ifdef CONFIG_X86_64
 	/* if nmi_watchdog is not set yet, then set it */
 	nmi_watchdog_default();
-#else
-	if (lapic_watchdog_ok())
-		nmi_watchdog = NMI_LOCAL_APIC;
-	else
-		nmi_watchdog = NMI_IO_APIC;
+
+#ifdef CONFIG_X86_32
+	if (nmi_watchdog == NMI_NONE) {
+		if (lapic_watchdog_ok())
+			nmi_watchdog = NMI_LOCAL_APIC;
+		else
+			nmi_watchdog = NMI_IO_APIC;
+	}
 #endif
 
 	if (nmi_watchdog == NMI_LOCAL_APIC) {
diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h
index 972a4f6f799a..470bb4aacb75 100644
--- a/include/asm-x86/nmi.h
+++ b/include/asm-x86/nmi.h
@@ -38,12 +38,10 @@ static inline void unset_nmi_pm_callback(struct pm_dev *dev)
 
 #ifdef CONFIG_X86_64
 extern void default_do_nmi(struct pt_regs *);
-extern void nmi_watchdog_default(void);
-#else
-#define nmi_watchdog_default() do { } while (0)
 #endif
 
 extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
+extern void nmi_watchdog_default(void);
 extern int check_nmi_watchdog(void);
 extern int nmi_watchdog_enabled;
 extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
-- 
cgit v1.2.3


From 1a1b1d1322ebd1ece405f3057cdd408bc77e391d Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Wed, 4 Jun 2008 01:00:58 +0400
Subject: x86: watchdog - check for CPU is being supported

This patch does check if CPU is being recongnized
before call the unreserve(). Since enable_lapic_nmi_watchdog()
does have such a check the same is make sense here too
in a sake of code consistency (but nothing more).

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: mingo@redhat.com
Cc: hpa@zytor.com
Cc: macro@linux-mips.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/perfctr-watchdog.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index f9ae93adffe5..ddda4b64f545 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -181,7 +181,9 @@ void disable_lapic_nmi_watchdog(void)
 		return;
 
 	on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
-	wd_ops->unreserve();
+
+	if (wd_ops)
+		wd_ops->unreserve();
 
 	BUG_ON(atomic_read(&nmi_active) != 0);
 }
-- 
cgit v1.2.3


From 75b9f5d2a0318da9d5e694a9a1be33f46b4c021e Mon Sep 17 00:00:00 2001
From: "mingo@elte.hu" <mingo@elte.hu>
Date: Thu, 5 Jun 2008 11:18:12 +0200
Subject: x86, nmi: fix build

fix:

arch/x86/kernel/built-in.o: In function `proc_nmi_enabled':
: undefined reference to `nmi_watchdog_default'
arch/x86/kernel/built-in.o: In function `native_smp_prepare_cpus':
: undefined reference to `nmi_watchdog_default'

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/nmi.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 27ca8f69b466..19f1b95265cf 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -88,7 +88,6 @@ static inline unsigned int get_timer_irqs(int cpu)
 #endif
 }
 
-#ifdef CONFIG_X86_64
 /* Run after command line and cpu_init init, but before all other checks */
 void nmi_watchdog_default(void)
 {
@@ -96,7 +95,6 @@ void nmi_watchdog_default(void)
 		return;
 	nmi_watchdog = NMI_NONE;
 }
-#endif
 
 #ifdef CONFIG_SMP
 /*
-- 
cgit v1.2.3


From ea57a5a6db8961de35cd1a4a80d8e01ee4307973 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 6 Jun 2008 16:28:23 +0200
Subject: x86, 32-bit: SRAT fix

we were adding reserved BIOS ranges as general memory as well => not good.

solves this crash:

[   20.068075] hostname used greatest stack depth: 6464 bytes left
[   20.121404] BUG: unable to handle kernel <1>BUG: unable to handle kernel NULL pointer dereference at 00000b8c
[   20.121404] IP: [<c01160ae>] kmap_atomic_prot+0x2d/0x1c3
[   20.121404] *pdpt = 00000000367eb001 *pde = 0000000000000000
[   20.121404] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[   20.121404]
[   20.121404] Pid: 2061, comm: rc.sysinit Not tainted (2.6.26-rc3 #2440)
[   20.121404] EIP: 0060:[<c01160ae>] EFLAGS: 00010206 CPU: 0
[   20.121404] EIP is at kmap_atomic_prot+0x2d/0x1c3

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/srat_32.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c
index 32d8b1142938..5eedd3fd4d49 100644
--- a/arch/x86/kernel/srat_32.c
+++ b/arch/x86/kernel/srat_32.c
@@ -240,6 +240,7 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
 	for (i = 0; i < MAX_APICID; i++)
 		apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
 
+	remove_all_active_ranges();
 	for (j = 0; j < num_memory_chunks; j++){
 		struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
 		printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
@@ -247,14 +248,8 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
 		node_read_chunk(chunk->nid, chunk);
 		e820_register_active_regions(chunk->nid, chunk->start_pfn,
 					     min(chunk->end_pfn, max_pfn));
-	}
- 
-	for_each_online_node(nid) {
-		unsigned long start = node_start_pfn[nid];
-		unsigned long end = node_end_pfn[nid];
-
-		memory_present(nid, start, end);
-		node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
+		memory_present(chunk->nid, chunk->start_pfn,
+			       min(chunk->end_pfn, max_pfn));
 	}
 	return 1;
 out_fail:
-- 
cgit v1.2.3


From 237749428552e2f81ec6c801482dfd8a777e470b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 9 Jun 2008 10:57:16 +0200
Subject: Revert "x86, 32-bit: SRAT fix"

This reverts commit ea57a5a6db8961de35cd1a4a80d8e01ee4307973, a better
fix will be merged.
---
 arch/x86/kernel/srat_32.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c
index 5eedd3fd4d49..32d8b1142938 100644
--- a/arch/x86/kernel/srat_32.c
+++ b/arch/x86/kernel/srat_32.c
@@ -240,7 +240,6 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
 	for (i = 0; i < MAX_APICID; i++)
 		apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
 
-	remove_all_active_ranges();
 	for (j = 0; j < num_memory_chunks; j++){
 		struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
 		printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
@@ -248,8 +247,14 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
 		node_read_chunk(chunk->nid, chunk);
 		e820_register_active_regions(chunk->nid, chunk->start_pfn,
 					     min(chunk->end_pfn, max_pfn));
-		memory_present(chunk->nid, chunk->start_pfn,
-			       min(chunk->end_pfn, max_pfn));
+	}
+ 
+	for_each_online_node(nid) {
+		unsigned long start = node_start_pfn[nid];
+		unsigned long end = node_end_pfn[nid];
+
+		memory_present(nid, start, end);
+		node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
 	}
 	return 1;
 out_fail:
-- 
cgit v1.2.3


From db3660c1905293b91653e07f7857576df71ebf28 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 3 Jun 2008 01:43:24 -0700
Subject: x86: remove all active memory ranges before registering them again
 after trimming - 64bit

this way we keep the early_node_map all right.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_64.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index adf3b04dc582..26d60cc0e370 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -368,6 +368,7 @@ void __init setup_arch(char **cmdline_p)
 	/* update e820 for memory not covered by WB MTRRs */
 	mtrr_bp_init();
 	if (mtrr_trim_uncached_memory(end_pfn)) {
+		remove_all_active_ranges();
 		e820_register_active_regions(0, 0, -1UL);
 		end_pfn = e820_end_of_ram();
 	}
-- 
cgit v1.2.3


From c3ff01672a23fabb40d4b80ff25a845582fd07c2 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Fri, 6 Jun 2008 18:54:26 -0700
Subject: x86: fix boot failure with 64GB+ system with numa 32-bit

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/srat_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c
index 32d8b1142938..e9d91720a40f 100644
--- a/arch/x86/kernel/srat_32.c
+++ b/arch/x86/kernel/srat_32.c
@@ -251,7 +251,7 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
  
 	for_each_online_node(nid) {
 		unsigned long start = node_start_pfn[nid];
-		unsigned long end = node_end_pfn[nid];
+		unsigned long end = min(node_end_pfn[nid], max_pfn);
 
 		memory_present(nid, start, end);
 		node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
-- 
cgit v1.2.3


From e0da33646826b66ef933d47ea2fb7a693fd849bf Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 8 Jun 2008 18:29:22 -0700
Subject: x86: introduce max_physical_apicid for bigsmp switching

a multi-socket test-system with 3 or 4 ioapics, when 4 dualcore cpus or
2 quadcore cpus installed, needs to switch to bigsmp or physflat.

CPU apic id is [4,11] instead of [0,7], and we need to check max apic
id instead of cpu numbers.

also add check for 32 bit when acpi is not compiled in or acpi=off.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic_32.c      |  5 ++++-
 arch/x86/kernel/apic_64.c      |  3 +++
 arch/x86/kernel/genapic_64.c   |  2 +-
 arch/x86/kernel/mpparse.c      |  5 +++++
 arch/x86/kernel/setup.c        |  1 +
 arch/x86/kernel/setup_32.c     | 11 +++++------
 arch/x86/mach-generic/bigsmp.c |  2 +-
 include/asm-x86/mpspec.h       |  1 +
 8 files changed, 21 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index c304759f0834..954d67931a50 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -1518,6 +1518,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
 		 */
 		cpu = 0;
 
+	if (apicid > max_physical_apicid)
+		max_physical_apicid = apicid;
+
 	/*
 	 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
 	 * but we need to work other dependencies like SMP_SUSPEND etc
@@ -1525,7 +1528,7 @@ void __cpuinit generic_processor_info(int apicid, int version)
 	 * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
 	 *       - Ashok Raj <ashok.raj@intel.com>
 	 */
-	if (num_processors > 8) {
+	if (max_physical_apicid >= 8) {
 		switch (boot_cpu_data.x86_vendor) {
 		case X86_VENDOR_INTEL:
 			if (!APIC_XAPIC(version)) {
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 54087f920f2f..1872555b98a3 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -1093,6 +1093,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
 		 */
 		cpu = 0;
 	}
+	if (apicid > max_physical_apicid)
+		max_physical_apicid = apicid;
+
 	/* are we being called early in kernel startup? */
 	if (x86_cpu_to_apicid_early_ptr) {
 		u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index cbaaf69bedb2..1fa8be5bd217 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -51,7 +51,7 @@ void __init setup_apic_routing(void)
 	else
 #endif
 
-	if (num_possible_cpus() <= 8)
+	if (max_physical_apicid < 8)
 		genapic = &apic_flat;
 	else
 		genapic = &apic_physflat;
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index b4a950da2f97..7591325e616d 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -473,6 +473,11 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
 		++mpc_record;
 #endif
 	}
+
+#ifdef CONFIG_X86_GENERICARCH
+       generic_bigsmp_probe();
+#endif
+
 	setup_apic_routing();
 	if (!num_processors)
 		printk(KERN_ERR "MPTABLE: no processors registered!\n");
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0a281f2c7157..45a5e247d450 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -17,6 +17,7 @@ unsigned int num_processors;
 unsigned disabled_cpus __cpuinitdata;
 /* Processor that is doing the boot up */
 unsigned int boot_cpu_physical_apicid = -1U;
+unsigned int max_physical_apicid;
 EXPORT_SYMBOL(boot_cpu_physical_apicid);
 
 DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index ee1ccdbd7100..be1a99003677 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -838,18 +838,17 @@ void __init setup_arch(char **cmdline_p)
 
 #ifdef CONFIG_ACPI
 	acpi_boot_init();
-
+#endif
+#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
+	if (smp_found_config)
+		get_smp_config();
+#endif
 #if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
 	if (def_to_bigsmp)
 		printk(KERN_WARNING "More than 8 CPUs detected and "
 			"CONFIG_X86_PC cannot handle it.\nUse "
 			"CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
 #endif
-#endif
-#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
-	if (smp_found_config)
-		get_smp_config();
-#endif
 
 	e820_setup_gap();
 	e820_mark_nosave_regions(max_low_pfn);
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c
index 95fc463056d0..2a243019acae 100644
--- a/arch/x86/mach-generic/bigsmp.c
+++ b/arch/x86/mach-generic/bigsmp.c
@@ -48,7 +48,7 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
 static int probe_bigsmp(void)
 {
 	if (def_to_bigsmp)
-	dmi_bigsmp = 1;
+		dmi_bigsmp = 1;
 	else
 		dmi_check_system(bigsmp_dmi_table);
 	return dmi_bigsmp;
diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h
index f38b68e8de5a..4451d720ee07 100644
--- a/include/asm-x86/mpspec.h
+++ b/include/asm-x86/mpspec.h
@@ -33,6 +33,7 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES];
 extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
 
 extern unsigned int boot_cpu_physical_apicid;
+extern unsigned int max_physical_apicid;
 extern int smp_found_config;
 extern int mpc_default_type;
 extern unsigned long mp_lapic_addr;
-- 
cgit v1.2.3


From 4e78c91abe1a40b905611100a593be62784ba355 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 9 Jun 2008 11:59:30 +0200
Subject: Revert "x86, numaq: add pci_acpi_scan_root() stub"

This reverts commit f3294690979634ee10398bb0beadfe1d4edb881d.

That bug will be fixed in a better way via:

  x86: make generic arch support NUMAQ
---
 arch/x86/kernel/numaq_32.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index 992f53cb79b6..e65281b1634b 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -87,14 +87,3 @@ static int __init numaq_tsc_disable(void)
 	return 0;
 }
 arch_initcall(numaq_tsc_disable);
-
-#ifdef CONFIG_ACPI
-/*
- * Dummy implementation:
- */
-struct pci_bus * __devinit
-pci_acpi_scan_root(struct acpi_device *device, int domain, int busnum)
-{
-	return NULL;
-}
-#endif
-- 
cgit v1.2.3


From d49c4288407b2ffa8cab270cb5bc6882abe969f6 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 8 Jun 2008 18:31:54 -0700
Subject: x86: make generic arch support NUMAQ

... so it could fall back to normal numa and we'd reduce the impact of the
NUMAQ subarch.

NUMAQ depends on GENERICARCH
also decouple genericarch numa from acpi.
also make it fall back to bigsmp if apicid > 8.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig                            | 74 ++++++++++++++---------------
 arch/x86/Makefile                           | 18 -------
 arch/x86/boot/compressed/misc.c             |  4 --
 arch/x86/kernel/acpi/boot.c                 |  4 +-
 arch/x86/kernel/io_apic_32.c                |  9 ++--
 arch/x86/kernel/mpparse.c                   | 73 ++++++++++++++++++++++++++--
 arch/x86/kernel/numaq_32.c                  |  2 -
 arch/x86/kernel/summit_32.c                 |  2 +
 arch/x86/mach-es7000/Makefile               |  1 -
 arch/x86/mach-es7000/es7000plat.c           | 47 ------------------
 arch/x86/mach-generic/Makefile              | 10 ++--
 arch/x86/mach-generic/bigsmp.c              |  2 -
 arch/x86/mach-generic/numaq.c               | 41 ++++++++++++++++
 arch/x86/mach-generic/probe.c               | 15 +++++-
 arch/x86/pci/Makefile_32                    |  5 +-
 arch/x86/pci/numa.c                         | 29 ++---------
 drivers/acpi/Kconfig                        |  1 -
 include/asm-x86/mach-generic/mach_mpparse.h |  7 ++-
 include/asm-x86/mach-numaq/mach_apic.h      | 39 ++++-----------
 include/asm-x86/mach-numaq/mach_mpparse.h   | 11 +----
 include/asm-x86/mmzone_32.h                 | 18 ++-----
 include/asm-x86/mpspec.h                    |  6 +++
 include/asm-x86/numaq.h                     |  5 +-
 include/asm-x86/srat.h                      | 12 +++--
 24 files changed, 219 insertions(+), 216 deletions(-)
 create mode 100644 arch/x86/mach-generic/numaq.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9e9761504422..8b89810fe3f2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -268,36 +268,6 @@ config X86_VOYAGER
 	  If you do not specifically know you have a Voyager based machine,
 	  say N here, otherwise the kernel you build will not be bootable.
 
-config X86_NUMAQ
-	bool "NUMAQ (IBM/Sequent)"
-	depends on SMP && X86_32
-	select NUMA
-	help
-	  This option is used for getting Linux to run on a (IBM/Sequent) NUMA
-	  multiquad box. This changes the way that processors are bootstrapped,
-	  and uses Clustered Logical APIC addressing mode instead of Flat Logical.
-	  You will need a new lynxer.elf file to flash your firmware with - send
-	  email to <Martin.Bligh@us.ibm.com>.
-
-config X86_SUMMIT
-	bool "Summit/EXA (IBM x440)"
-	depends on X86_32 && SMP
-	help
-	  This option is needed for IBM systems that use the Summit/EXA chipset.
-	  In particular, it is needed for the x440.
-
-	  If you don't have one of these computers, you should say N here.
-	  If you want to build a NUMA kernel, you must select ACPI.
-
-config X86_BIGSMP
-	bool "Support for other sub-arch SMP systems with more than 8 CPUs"
-	depends on X86_32 && SMP
-	help
-	  This option is needed for the systems that have more than 8 CPUs
-	  and if the system is not of any sub-arch type above.
-
-	  If you don't have such a system, you should say N here.
-
 config X86_VISWS
 	bool "SGI 320/540 (Visual Workstation)"
 	depends on X86_32
@@ -311,12 +281,33 @@ config X86_VISWS
 	  and vice versa. See <file:Documentation/sgi-visws.txt> for details.
 
 config X86_GENERICARCH
-       bool "Generic architecture (Summit, bigsmp, ES7000, default)"
+       bool "Generic architecture"
 	depends on X86_32
        help
-          This option compiles in the Summit, bigsmp, ES7000, default subarchitectures.
-	  It is intended for a generic binary kernel.
-	  If you want a NUMA kernel, select ACPI.   We need SRAT for NUMA.
+          This option compiles in the NUMAQ, Summit, bigsmp, ES7000, default
+	  subarchitectures.  It is intended for a generic binary kernel.
+	  if you select them all, kernel will probe it one by one. and will
+	  fallback to default.
+
+if X86_GENERICARCH
+
+config X86_NUMAQ
+	bool "NUMAQ (IBM/Sequent)"
+	depends on SMP && X86_32
+	select NUMA
+	help
+	  This option is used for getting Linux to run on a NUMAQ (IBM/Sequent)
+	  NUMA multiquad box. This changes the way that processors are
+	  bootstrapped, and uses Clustered Logical APIC addressing mode instead
+	  of Flat Logical.  You will need a new lynxer.elf file to flash your
+	  firmware with - send email to <Martin.Bligh@us.ibm.com>.
+
+config X86_SUMMIT
+	bool "Summit/EXA (IBM x440)"
+	depends on X86_32 && SMP
+	help
+	  This option is needed for IBM systems that use the Summit/EXA chipset.
+	  In particular, it is needed for the x440.
 
 config X86_ES7000
 	bool "Support for Unisys ES7000 IA32 series"
@@ -324,8 +315,15 @@ config X86_ES7000
 	help
 	  Support for Unisys ES7000 systems.  Say 'Y' here if this kernel is
 	  supposed to run on an IA32-based Unisys ES7000 system.
-	  Only choose this option if you have such a system, otherwise you
-	  should say N here.
+
+config X86_BIGSMP
+	bool "Support for big SMP systems with more than 8 CPUs"
+	depends on X86_32 && SMP
+	help
+	  This option is needed for the systems that have more than 8 CPUs
+	  and if the system is not of any sub-arch type above.
+
+endif
 
 config X86_RDC321X
 	bool "RDC R-321x SoC"
@@ -913,9 +911,9 @@ config X86_PAE
 config NUMA
 	bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)"
 	depends on SMP
-	depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || (X86_SUMMIT || X86_GENERICARCH) && ACPI) && EXPERIMENTAL)
+	depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || X86_SUMMIT && ACPI) && EXPERIMENTAL)
 	default n if X86_PC
-	default y if (X86_NUMAQ || X86_SUMMIT)
+	default y if (X86_NUMAQ || X86_SUMMIT || X86_GENERICARCH)
 	help
 	  Enable NUMA (Non Uniform Memory Access) support.
 	  The kernel will try to allocate memory used by a CPU on the
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 30b046bcccb6..d6650131659e 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -117,29 +117,11 @@ mcore-$(CONFIG_X86_VOYAGER)	:= arch/x86/mach-voyager/
 mflags-$(CONFIG_X86_VISWS)	:= -Iinclude/asm-x86/mach-visws
 mcore-$(CONFIG_X86_VISWS)	:= arch/x86/mach-visws/
 
-# NUMAQ subarch support
-mflags-$(CONFIG_X86_NUMAQ)	:= -Iinclude/asm-x86/mach-numaq
-mcore-$(CONFIG_X86_NUMAQ)	:= arch/x86/mach-default/
-
-# BIGSMP subarch support
-mflags-$(CONFIG_X86_BIGSMP)	:= -Iinclude/asm-x86/mach-bigsmp
-mcore-$(CONFIG_X86_BIGSMP)	:= arch/x86/mach-default/
-
-#Summit subarch support
-mflags-$(CONFIG_X86_SUMMIT)	:= -Iinclude/asm-x86/mach-summit
-mcore-$(CONFIG_X86_SUMMIT)	:= arch/x86/mach-default/
-
 # generic subarchitecture
 mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
 fcore-$(CONFIG_X86_GENERICARCH)	+= arch/x86/mach-generic/
 mcore-$(CONFIG_X86_GENERICARCH)	:= arch/x86/mach-default/
 
-
-# ES7000 subarch support
-mflags-$(CONFIG_X86_ES7000)	:= -Iinclude/asm-x86/mach-es7000
-fcore-$(CONFIG_X86_ES7000)	:= arch/x86/mach-es7000/
-mcore-$(CONFIG_X86_ES7000)	:= arch/x86/mach-default/
-
 # RDC R-321x subarch support
 mflags-$(CONFIG_X86_RDC321X)	:= -Iinclude/asm-x86/mach-rdc321x
 mcore-$(CONFIG_X86_RDC321X)	:= arch/x86/mach-default/
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 90456cee47c3..ba0be6a25ff7 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -221,10 +221,6 @@ static char *vidmem;
 static int vidport;
 static int lines, cols;
 
-#ifdef CONFIG_X86_NUMAQ
-void *xquad_portio;
-#endif
-
 #include "../../../../lib/inflate.c"
 
 static void *malloc(int size)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index f226bdc19f69..7dc2130046d9 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -848,7 +848,7 @@ static int __init acpi_parse_madt_lapic_entries(void)
 #ifdef	CONFIG_X86_IO_APIC
 #define MP_ISA_BUS		0
 
-#if defined(CONFIG_X86_ES7000) || defined(CONFIG_X86_GENERICARCH)
+#ifdef CONFIG_X86_ES7000
 extern int es7000_plat;
 #endif
 
@@ -997,7 +997,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 	set_bit(MP_ISA_BUS, mp_bus_not_pci);
 	Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
 
-#if defined(CONFIG_X86_ES7000) || defined(CONFIG_X86_GENERICARCH)
+#ifdef CONFIG_X86_ES7000
 	/*
 	 * Older generations of ES7000 have no legacy identity mappings
 	 */
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 2285b81ad1d5..97e62a01f1e2 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -1722,7 +1722,6 @@ void disable_IO_APIC(void)
  * by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999
  */
 
-#ifndef CONFIG_X86_NUMAQ
 static void __init setup_ioapic_ids_from_mpc(void)
 {
 	union IO_APIC_reg_00 reg_00;
@@ -1732,6 +1731,11 @@ static void __init setup_ioapic_ids_from_mpc(void)
 	unsigned char old_id;
 	unsigned long flags;
 
+#ifdef CONFIG_X86_NUMAQ
+	if (found_numaq)
+		return;
+#endif
+
 	/*
 	 * Don't check I/O APIC IDs for xAPIC systems.  They have
 	 * no meaning without the serial APIC bus.
@@ -1828,9 +1832,6 @@ static void __init setup_ioapic_ids_from_mpc(void)
 			apic_printk(APIC_VERBOSE, " ok.\n");
 	}
 }
-#else
-static void __init setup_ioapic_ids_from_mpc(void) { }
-#endif
 
 int no_timer_check __initdata;
 
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 7591325e616d..1cc7a4b8643f 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -49,15 +49,73 @@ static int __init mpf_checksum(unsigned char *mp, int len)
 }
 
 #ifdef CONFIG_X86_NUMAQ
+int found_numaq;
 /*
  * Have to match translation table entries to main table entries by counter
  * hence the mpc_record variable .... can't see a less disgusting way of
  * doing this ....
  */
+struct mpc_config_translation {
+	unsigned char mpc_type;
+	unsigned char trans_len;
+	unsigned char trans_type;
+	unsigned char trans_quad;
+	unsigned char trans_global;
+	unsigned char trans_local;
+	unsigned short trans_reserved;
+};
+
 
 static int mpc_record;
 static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
     __cpuinitdata;
+
+static inline int generate_logical_apicid(int quad, int phys_apicid)
+{
+	return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
+}
+
+
+static inline int mpc_apic_id(struct mpc_config_processor *m,
+			struct mpc_config_translation *translation_record)
+{
+	int quad = translation_record->trans_quad;
+	int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
+
+	printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
+	       m->mpc_apicid,
+	       (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
+	       (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
+	       m->mpc_apicver, quad, logical_apicid);
+	return logical_apicid;
+}
+
+int mp_bus_id_to_node[MAX_MP_BUSSES];
+
+int mp_bus_id_to_local[MAX_MP_BUSSES];
+
+static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
+	struct mpc_config_translation *translation)
+{
+	int quad = translation->trans_quad;
+	int local = translation->trans_local;
+
+	mp_bus_id_to_node[m->mpc_busid] = quad;
+	mp_bus_id_to_local[m->mpc_busid] = local;
+	printk(KERN_INFO "Bus #%d is %s (node %d)\n",
+	       m->mpc_busid, name, quad);
+}
+
+int quad_local_to_mp_bus_id [NR_CPUS/4][4];
+static void mpc_oem_pci_bus(struct mpc_config_bus *m,
+	struct mpc_config_translation *translation)
+{
+	int quad = translation->trans_quad;
+	int local = translation->trans_local;
+
+	quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
+}
+
 #endif
 
 static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
@@ -321,11 +379,11 @@ static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
 	}
 }
 
-static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
+void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
 				 char *productid)
 {
 	if (strncmp(oem, "IBM NUMA", 8))
-		printk("Warning!  May not be a NUMA-Q system!\n");
+		printk("Warning!  Not a NUMA-Q system!\n");
 	else
 		found_numaq = 1;
 
@@ -388,7 +446,16 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
 		return 0;
 
 #ifdef CONFIG_X86_32
-	mps_oem_check(mpc, oem, str);
+	/*
+	 * need to make sure summit and es7000's mps_oem_check is safe to be
+	 * called early via genericarch 's mps_oem_check
+	 */
+	if (early) {
+#ifdef CONFIG_X86_NUMAQ
+		numaq_mps_oem_check(mpc, oem, str);
+#endif
+	} else
+		mps_oem_check(mpc, oem, str);
 #endif
 
 	/* save the local APIC address, it might be non-default */
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index 27b908254fb0..f0f1de1c4a1d 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -36,8 +36,6 @@
 
 #define	MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
 
-int found_numaq;
-
 /*
  * Function: smp_dump_qct()
  *
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
index ae751094eba9..d67ce5f044ba 100644
--- a/arch/x86/kernel/summit_32.c
+++ b/arch/x86/kernel/summit_32.c
@@ -36,7 +36,9 @@ static struct rio_table_hdr *rio_table_hdr __initdata;
 static struct scal_detail   *scal_devs[MAX_NUMNODES] __initdata;
 static struct rio_detail    *rio_devs[MAX_NUMNODES*4] __initdata;
 
+#ifndef CONFIG_X86_NUMAQ
 static int mp_bus_id_to_node[MAX_MP_BUSSES] __initdata;
+#endif
 
 static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
 {
diff --git a/arch/x86/mach-es7000/Makefile b/arch/x86/mach-es7000/Makefile
index 69dd4da218dc..3ef8b43b62fc 100644
--- a/arch/x86/mach-es7000/Makefile
+++ b/arch/x86/mach-es7000/Makefile
@@ -3,4 +3,3 @@
 #
 
 obj-$(CONFIG_X86_ES7000)	:= es7000plat.o
-obj-$(CONFIG_X86_GENERICARCH)	:= es7000plat.o
diff --git a/arch/x86/mach-es7000/es7000plat.c b/arch/x86/mach-es7000/es7000plat.c
index a41c77a47227..4354ce804889 100644
--- a/arch/x86/mach-es7000/es7000plat.c
+++ b/arch/x86/mach-es7000/es7000plat.c
@@ -177,53 +177,6 @@ find_unisys_acpi_oem_table(unsigned long *oem_addr)
 }
 #endif
 
-/*
- * This file also gets compiled if CONFIG_X86_GENERICARCH is set. Generic
- * arch already has got following function definitions (asm-generic/es7000.c)
- * hence no need to define these for that case.
- */
-#ifndef CONFIG_X86_GENERICARCH
-void es7000_sw_apic(void);
-void __init enable_apic_mode(void)
-{
-	es7000_sw_apic();
-	return;
-}
-
-__init int mps_oem_check(struct mp_config_table *mpc, char *oem,
-		char *productid)
-{
-	if (mpc->mpc_oemptr) {
-		struct mp_config_oemtable *oem_table =
-			(struct mp_config_oemtable *)mpc->mpc_oemptr;
-		if (!strncmp(oem, "UNISYS", 6))
-			return parse_unisys_oem((char *)oem_table);
-	}
-	return 0;
-}
-#ifdef CONFIG_ACPI
-/* Hook from generic ACPI tables.c */
-int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
-	unsigned long oem_addr;
-	if (!find_unisys_acpi_oem_table(&oem_addr)) {
-		if (es7000_check_dsdt())
-			return parse_unisys_oem((char *)oem_addr);
-		else {
-			setup_unisys();
-			return 1;
-		}
-	}
-	return 0;
-}
-#else
-int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
-	return 0;
-}
-#endif
-#endif /* COFIG_X86_GENERICARCH */
-
 static void
 es7000_spin(int n)
 {
diff --git a/arch/x86/mach-generic/Makefile b/arch/x86/mach-generic/Makefile
index 19d6d407737b..0dbd7803a1d5 100644
--- a/arch/x86/mach-generic/Makefile
+++ b/arch/x86/mach-generic/Makefile
@@ -2,7 +2,11 @@
 # Makefile for the generic architecture
 #
 
-EXTRA_CFLAGS	:= -Iarch/x86/kernel
+EXTRA_CFLAGS			:= -Iarch/x86/kernel
 
-obj-y		:= probe.o summit.o bigsmp.o es7000.o default.o 
-obj-y		+= ../../x86/mach-es7000/
+obj-y				:= probe.o default.o
+obj-$(CONFIG_X86_NUMAQ)		+= numaq.o
+obj-$(CONFIG_X86_SUMMIT)	+= summit.o
+obj-$(CONFIG_X86_BIGSMP)	+= bigsmp.o
+obj-$(CONFIG_X86_ES7000)	+= es7000.o
+obj-$(CONFIG_X86_ES7000)	+= ../../x86/mach-es7000/
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c
index 2a243019acae..59d771714559 100644
--- a/arch/x86/mach-generic/bigsmp.c
+++ b/arch/x86/mach-generic/bigsmp.c
@@ -23,10 +23,8 @@ static int dmi_bigsmp; /* can be set by dmi scanners */
 
 static int hp_ht_bigsmp(const struct dmi_system_id *d)
 {
-#ifdef CONFIG_X86_GENERICARCH
 	printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident);
 	dmi_bigsmp = 1;
-#endif
 	return 0;
 }
 
diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c
new file mode 100644
index 000000000000..8091e68764c4
--- /dev/null
+++ b/arch/x86/mach-generic/numaq.c
@@ -0,0 +1,41 @@
+/*
+ * APIC driver for the IBM NUMAQ chipset.
+ */
+#define APIC_DEFINITION 1
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/smp.h>
+#include <asm/mpspec.h>
+#include <asm/genapic.h>
+#include <asm/fixmap.h>
+#include <asm/apicdef.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <asm/mach-numaq/mach_apic.h>
+#include <asm/mach-numaq/mach_apicdef.h>
+#include <asm/mach-numaq/mach_ipi.h>
+#include <asm/mach-numaq/mach_mpparse.h>
+#include <asm/mach-numaq/mach_wakecpu.h>
+#include <asm/numaq.h>
+
+static int mps_oem_check(struct mp_config_table *mpc, char *oem,
+		char *productid)
+{
+	numaq_mps_oem_check(mpc, oem, productid);
+	return found_numaq;
+}
+
+static int probe_numaq(void)
+{
+	/* already know from get_memcfg_numaq() */
+	return found_numaq;
+}
+
+/* Hook from generic ACPI tables.c */
+static int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+	return 0;
+}
+
+struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq);
diff --git a/arch/x86/mach-generic/probe.c b/arch/x86/mach-generic/probe.c
index c5ae751b994a..ba18dec48555 100644
--- a/arch/x86/mach-generic/probe.c
+++ b/arch/x86/mach-generic/probe.c
@@ -16,6 +16,7 @@
 #include <asm/apicdef.h>
 #include <asm/genapic.h>
 
+extern struct genapic apic_numaq;
 extern struct genapic apic_summit;
 extern struct genapic apic_bigsmp;
 extern struct genapic apic_es7000;
@@ -24,9 +25,18 @@ extern struct genapic apic_default;
 struct genapic *genapic = &apic_default;
 
 static struct genapic *apic_probe[] __initdata = {
+#ifdef CONFIG_X86_NUMAQ
+	&apic_numaq,
+#endif
+#ifdef CONFIG_X86_SUMMIT
 	&apic_summit,
+#endif
+#ifdef CONFIG_X86_BIGSMP
 	&apic_bigsmp,
+#endif
+#ifdef CONFIG_X86_ES7000
 	&apic_es7000,
+#endif
 	&apic_default,	/* must be last */
 	NULL,
 };
@@ -54,6 +64,7 @@ early_param("apic", parse_apic);
 
 void __init generic_bigsmp_probe(void)
 {
+#if CONFIG_X86_BIGSMP
 	/*
 	 * This routine is used to switch to bigsmp mode when
 	 * - There is no apic= option specified by the user
@@ -67,6 +78,7 @@ void __init generic_bigsmp_probe(void)
 			printk(KERN_INFO "Overriding APIC driver with %s\n",
 			       genapic->name);
 		}
+#endif
 }
 
 void __init generic_apic_probe(void)
@@ -88,7 +100,8 @@ void __init generic_apic_probe(void)
 
 /* These functions can switch the APIC even after the initial ->probe() */
 
-int __init mps_oem_check(struct mp_config_table *mpc, char *oem, char *productid)
+int __init mps_oem_check(struct mp_config_table *mpc, char *oem,
+				 char *productid)
 {
 	int i;
 	for (i = 0; apic_probe[i]; ++i) {
diff --git a/arch/x86/pci/Makefile_32 b/arch/x86/pci/Makefile_32
index 89ec35d00efd..962d96c0495a 100644
--- a/arch/x86/pci/Makefile_32
+++ b/arch/x86/pci/Makefile_32
@@ -13,10 +13,11 @@ pci-y				:= fixup.o
 pci-$(CONFIG_ACPI)		+= acpi.o
 pci-y				+= legacy.o irq.o
 
-# Careful: VISWS and NUMAQ overrule the pci-y above. The colons are
+# Careful: VISWS overrule the pci-y above. The colons are
 # therefor correct. This needs a proper fix by distangling the code.
 pci-$(CONFIG_X86_VISWS)		:= visws.o fixup.o
-pci-$(CONFIG_X86_NUMAQ)		:= numa.o irq.o
+
+pci-$(CONFIG_X86_NUMAQ)		+= numa.o
 
 # Necessary for NUMAQ as well
 pci-$(CONFIG_NUMA)		+= mp_bus_to_node.o
diff --git a/arch/x86/pci/numa.c b/arch/x86/pci/numa.c
index d9afbae5092b..99f1ecd485b5 100644
--- a/arch/x86/pci/numa.c
+++ b/arch/x86/pci/numa.c
@@ -6,45 +6,21 @@
 #include <linux/init.h>
 #include <linux/nodemask.h>
 #include <mach_apic.h>
+#include <asm/mpspec.h>
 #include "pci.h"
 
 #define XQUAD_PORTIO_BASE 0xfe400000
 #define XQUAD_PORTIO_QUAD 0x40000  /* 256k per quad. */
 
-int mp_bus_id_to_node[MAX_MP_BUSSES];
 #define BUS2QUAD(global) (mp_bus_id_to_node[global])
 
-int mp_bus_id_to_local[MAX_MP_BUSSES];
 #define BUS2LOCAL(global) (mp_bus_id_to_local[global])
 
-void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
-	struct mpc_config_translation *translation)
-{
-	int quad = translation->trans_quad;
-	int local = translation->trans_local;
-
-	mp_bus_id_to_node[m->mpc_busid] = quad;
-	mp_bus_id_to_local[m->mpc_busid] = local;
-	printk(KERN_INFO "Bus #%d is %s (node %d)\n",
-	       m->mpc_busid, name, quad);
-}
-
-int quad_local_to_mp_bus_id [NR_CPUS/4][4];
 #define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local])
-void mpc_oem_pci_bus(struct mpc_config_bus *m,
-	struct mpc_config_translation *translation)
-{
-	int quad = translation->trans_quad;
-	int local = translation->trans_local;
-
-	quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
-}
 
 /* Where the IO area was mapped on multiquad, always 0 otherwise */
 void *xquad_portio;
-#ifdef CONFIG_X86_NUMAQ
 EXPORT_SYMBOL(xquad_portio);
-#endif
 
 #define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
 
@@ -179,6 +155,9 @@ static int __init pci_numa_init(void)
 {
 	int quad;
 
+	if (!found_numaq)
+		return 0;
+
 	raw_pci_ops = &pci_direct_conf1_mq;
 
 	if (pcibios_scanned++)
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index c52fca833268..860f15f36ce9 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -4,7 +4,6 @@
 
 menuconfig ACPI
 	bool "ACPI (Advanced Configuration and Power Interface) Support"
-	depends on !X86_NUMAQ
 	depends on !X86_VISWS
 	depends on !IA64_HP_SIM
 	depends on IA64 || X86
diff --git a/include/asm-x86/mach-generic/mach_mpparse.h b/include/asm-x86/mach-generic/mach_mpparse.h
index 0d0b5ba2e9d1..586cadbf3787 100644
--- a/include/asm-x86/mach-generic/mach_mpparse.h
+++ b/include/asm-x86/mach-generic/mach_mpparse.h
@@ -1,7 +1,10 @@
 #ifndef _MACH_MPPARSE_H
 #define _MACH_MPPARSE_H 1
 
-int mps_oem_check(struct mp_config_table *mpc, char *oem, char *productid); 
-int acpi_madt_oem_check(char *oem_id, char *oem_table_id); 
+
+extern int mps_oem_check(struct mp_config_table *mpc, char *oem,
+			 char *productid);
+
+extern int acpi_madt_oem_check(char *oem_id, char *oem_table_id);
 
 #endif
diff --git a/include/asm-x86/mach-numaq/mach_apic.h b/include/asm-x86/mach-numaq/mach_apic.h
index 75a56e5afbe7..d802465e026a 100644
--- a/include/asm-x86/mach-numaq/mach_apic.h
+++ b/include/asm-x86/mach-numaq/mach_apic.h
@@ -20,8 +20,14 @@ static inline cpumask_t target_cpus(void)
 #define INT_DELIVERY_MODE dest_LowestPrio
 #define INT_DEST_MODE 0     /* physical delivery on LOCAL quad */
  
-#define check_apicid_used(bitmap, apicid) physid_isset(apicid, bitmap)
-#define check_apicid_present(bit) physid_isset(bit, phys_cpu_present_map)
+static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
+{
+	return physid_isset(apicid, bitmap);
+}
+static inline unsigned long check_apicid_present(int bit)
+{
+	return physid_isset(bit, phys_cpu_present_map);
+}
 #define apicid_cluster(apicid) (apicid & 0xF0)
 
 static inline int apic_id_registered(void)
@@ -77,11 +83,6 @@ static inline int cpu_present_to_apicid(int mps_cpu)
 		return BAD_APICID;
 }
 
-static inline int generate_logical_apicid(int quad, int phys_apicid)
-{
-	return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
-}
-
 static inline int apicid_to_node(int logical_apicid) 
 {
 	return logical_apicid >> 4;
@@ -95,30 +96,6 @@ static inline physid_mask_t apicid_to_cpu_present(int logical_apicid)
 	return physid_mask_of_physid(cpu + 4*node);
 }
 
-struct mpc_config_translation {
-	unsigned char mpc_type;
-	unsigned char trans_len;
-	unsigned char trans_type;
-	unsigned char trans_quad;
-	unsigned char trans_global;
-	unsigned char trans_local;
-	unsigned short trans_reserved;
-};
-
-static inline int mpc_apic_id(struct mpc_config_processor *m, 
-			struct mpc_config_translation *translation_record)
-{
-	int quad = translation_record->trans_quad;
-	int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
-
-	printk("Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
-	       m->mpc_apicid,
-	       (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
-	       (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
-	       m->mpc_apicver, quad, logical_apicid);
-	return logical_apicid;
-}
-
 extern void *xquad_portio;
 
 static inline void setup_portio_remap(void)
diff --git a/include/asm-x86/mach-numaq/mach_mpparse.h b/include/asm-x86/mach-numaq/mach_mpparse.h
index 459b12401187..626aef6b155f 100644
--- a/include/asm-x86/mach-numaq/mach_mpparse.h
+++ b/include/asm-x86/mach-numaq/mach_mpparse.h
@@ -1,14 +1,7 @@
 #ifndef __ASM_MACH_MPPARSE_H
 #define __ASM_MACH_MPPARSE_H
 
-extern void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
-			     struct mpc_config_translation *translation);
-extern void mpc_oem_pci_bus(struct mpc_config_bus *m,
-	struct mpc_config_translation *translation);
-
-/* Hook from generic ACPI tables.c */
-static inline void acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
-}
+extern void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
+				char *productid);
 
 #endif /* __ASM_MACH_MPPARSE_H */
diff --git a/include/asm-x86/mmzone_32.h b/include/asm-x86/mmzone_32.h
index ab0012888c44..b2298a227567 100644
--- a/include/asm-x86/mmzone_32.h
+++ b/include/asm-x86/mmzone_32.h
@@ -12,11 +12,9 @@
 extern struct pglist_data *node_data[];
 #define NODE_DATA(nid)	(node_data[nid])
 
-#ifdef CONFIG_X86_NUMAQ
-	#include <asm/numaq.h>
-#elif defined(CONFIG_ACPI_SRAT)/* summit or generic arch */
-	#include <asm/srat.h>
-#endif
+#include <asm/numaq.h>
+/* summit or generic arch */
+#include <asm/srat.h>
 
 extern int get_memcfg_numa_flat(void);
 /*
@@ -26,14 +24,11 @@ extern int get_memcfg_numa_flat(void);
  */
 static inline void get_memcfg_numa(void)
 {
-#ifdef CONFIG_X86_NUMAQ
+
 	if (get_memcfg_numaq())
 		return;
-#elif defined(CONFIG_ACPI_SRAT)
 	if (get_memcfg_from_srat())
 		return;
-#endif
-
 	get_memcfg_numa_flat();
 }
 
@@ -42,7 +37,6 @@ extern int early_pfn_to_nid(unsigned long pfn);
 #else /* !CONFIG_NUMA */
 
 #define get_memcfg_numa get_memcfg_numa_flat
-#define get_zholes_size(n) (0)
 
 #endif /* CONFIG_NUMA */
 
@@ -83,9 +77,6 @@ static inline int pfn_to_nid(unsigned long pfn)
 	__pgdat->node_start_pfn + __pgdat->node_spanned_pages;		\
 })
 
-#ifdef CONFIG_X86_NUMAQ            /* we have contiguous memory on NUMA-Q */
-#define pfn_valid(pfn)          ((pfn) < num_physpages)
-#else
 static inline int pfn_valid(int pfn)
 {
 	int nid = pfn_to_nid(pfn);
@@ -94,7 +85,6 @@ static inline int pfn_valid(int pfn)
 		return (pfn < node_end_pfn(nid));
 	return 0;
 }
-#endif /* CONFIG_X86_NUMAQ */
 
 #endif /* CONFIG_DISCONTIGMEM */
 
diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h
index 4451d720ee07..6e9c9588b1fc 100644
--- a/include/asm-x86/mpspec.h
+++ b/include/asm-x86/mpspec.h
@@ -13,6 +13,12 @@ extern int apic_version[MAX_APICS];
 extern u8 apicid_2_node[];
 extern int pic_mode;
 
+#ifdef CONFIG_X86_NUMAQ
+extern int mp_bus_id_to_node[MAX_MP_BUSSES];
+extern int mp_bus_id_to_local[MAX_MP_BUSSES];
+extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
+#endif
+
 #define MAX_APICID 256
 
 #else
diff --git a/include/asm-x86/numaq.h b/include/asm-x86/numaq.h
index 739d16484b8c..ef068d2465d6 100644
--- a/include/asm-x86/numaq.h
+++ b/include/asm-x86/numaq.h
@@ -157,9 +157,10 @@ struct sys_cfg_data {
 	struct		eachquadmem eq[MAX_NUMNODES];	/* indexed by quad id */
 };
 
-static inline unsigned long *get_zholes_size(int nid)
+#else
+static inline int get_memcfg_numaq(void)
 {
-	return NULL;
+	return 0;
 }
 #endif /* CONFIG_X86_NUMAQ */
 #endif /* NUMAQ_H */
diff --git a/include/asm-x86/srat.h b/include/asm-x86/srat.h
index f4bba131d068..456fe0b5a921 100644
--- a/include/asm-x86/srat.h
+++ b/include/asm-x86/srat.h
@@ -27,11 +27,13 @@
 #ifndef _ASM_SRAT_H_
 #define _ASM_SRAT_H_
 
-#ifndef CONFIG_ACPI_SRAT
-#error CONFIG_ACPI_SRAT not defined, and srat.h header has been included
-#endif
-
+#ifdef CONFIG_ACPI_SRAT
 extern int get_memcfg_from_srat(void);
-extern unsigned long *get_zholes_size(int);
+#else
+static inline int get_memcfg_from_srat(void)
+{
+	return 0;
+}
+#endif
 
 #endif /* _ASM_SRAT_H_ */
-- 
cgit v1.2.3


From 6780711e9817585f89b02b0556aac3564cbe1f45 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 8 Jun 2008 19:53:26 -0700
Subject: x86: update mptable, fix

need to call early_reserve_e820() to preallocate mptable for 32bit

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index be1a99003677..4fa9f6c4542b 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -749,6 +749,8 @@ void __init setup_arch(char **cmdline_p)
 	 */
 	max_pfn = e820_end_of_ram();
 
+	/* preallocate 4k for mptable mpc */
+	early_reserve_e820_mpc_new();
 	/* update e820 for memory not covered by WB MTRRs */
 	mtrr_bp_init();
 	if (mtrr_trim_uncached_memory(max_pfn)) {
-- 
cgit v1.2.3


From 1d74f2a0f64b4091e5e91b55ac1b17dff93f4b59 Mon Sep 17 00:00:00 2001
From: Abhishek Sagar <sagar.abhishek@gmail.com>
Date: Sun, 1 Jun 2008 21:47:42 +0530
Subject: ftrace: remove ftrace_ip_converted()

Remove the unneeded function ftrace_ip_converted().

Signed-off-by: Abhishek Sagar <sagar.abhishek@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/arm/kernel/ftrace.c     | 10 ----------
 arch/powerpc/kernel/ftrace.c | 10 ----------
 arch/sparc64/kernel/ftrace.c |  7 -------
 arch/x86/kernel/ftrace.c     | 10 ----------
 kernel/trace/ftrace.c        |  7 -------
 5 files changed, 44 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c
index f4cb4cc3fa0c..22f3d6e309f9 100644
--- a/arch/arm/kernel/ftrace.c
+++ b/arch/arm/kernel/ftrace.c
@@ -22,16 +22,6 @@
 static unsigned long bl_insn;
 static const unsigned long NOP = 0xe1a00000; /* mov r0, r0 */
 
-/* return true if mcount call site is already patched/no-op'ed */
-int ftrace_ip_converted(unsigned long pc)
-{
-	unsigned long save;
-
-	pc -= INSN_SIZE;
-	save = *(unsigned long *)pc;
-	return save == NOP;
-}
-
 unsigned char *ftrace_nop_replace(void)
 {
 	return (char *)&NOP;
diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c
index 69ed41223468..e12c593ab9ca 100644
--- a/arch/powerpc/kernel/ftrace.c
+++ b/arch/powerpc/kernel/ftrace.c
@@ -27,16 +27,6 @@ static unsigned int ftrace_nop = 0x60000000;
 # define GET_ADDR(addr) *(unsigned long *)addr
 #endif
 
-notrace int ftrace_ip_converted(unsigned long ip)
-{
-	unsigned int save;
-
-	ip -= CALL_BACK;
-	save = *(unsigned int *)ip;
-
-	return save == ftrace_nop;
-}
-
 static unsigned int notrace ftrace_calc_offset(long ip, long addr)
 {
 	return (int)((addr + CALL_BACK) - ip);
diff --git a/arch/sparc64/kernel/ftrace.c b/arch/sparc64/kernel/ftrace.c
index f449e6df6c4a..c17373195b1e 100644
--- a/arch/sparc64/kernel/ftrace.c
+++ b/arch/sparc64/kernel/ftrace.c
@@ -7,13 +7,6 @@
 
 static const u32 ftrace_nop = 0x01000000;
 
-notrace int ftrace_ip_converted(unsigned long ip)
-{
-	u32 insn = *(u32 *) ip;
-
-	return (insn == ftrace_nop);
-}
-
 notrace unsigned char *ftrace_nop_replace(void)
 {
 	return (char *)&ftrace_nop;
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 498608c015fb..bc5cf8d46742 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -31,16 +31,6 @@ union ftrace_code_union {
 	} __attribute__((packed));
 };
 
-notrace int ftrace_ip_converted(unsigned long ip)
-{
-	unsigned long save;
-
-	ip -= CALL_BACK;
-	save = *(long *)ip;
-
-	return save == *ftrace_nop;
-}
-
 static int notrace ftrace_calc_offset(long ip, long addr)
 {
 	return (int)(addr - ip);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ec54cb7d69d6..a8929e4c77c1 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -306,13 +306,6 @@ ftrace_record_ip(unsigned long ip)
 	if (ftrace_ip_in_hash(ip, key))
 		goto out_unlock;
 
-	/*
-	 * There's a slight race that the ftraced will update the
-	 * hash and reset here. If it is already converted, skip it.
-	 */
-	if (ftrace_ip_converted(ip))
-		goto out_unlock;
-
 	node = ftrace_alloc_dyn_node(ip);
 	if (!node)
 		goto out_unlock;
-- 
cgit v1.2.3


From 668231141f307ffd81db075b34bddaedae0ec863 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Thu, 5 Jun 2008 16:35:10 +0200
Subject: x86: fix compile warning in io_apic_{32,64}.c

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_32.c | 1 +
 arch/x86/kernel/io_apic_64.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index a40d54fc1fdd..4fbf656b08b4 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -1489,6 +1489,7 @@ void /*__init*/ print_local_APIC(void * dummy)
 
 	printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
 		smp_processor_id(), hard_smp_processor_id());
+	v = apic_read(APIC_ID);
 	printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v,
 			GET_APIC_ID(read_apic_id()));
 	v = apic_read(APIC_LVR);
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index ef1a8dfcc529..c7c6b005f665 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -1077,6 +1077,7 @@ void __apicdebuginit print_local_APIC(void * dummy)
 
 	printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
 		smp_processor_id(), hard_smp_processor_id());
+	v = apic_read(APIC_ID);
 	printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
 	v = apic_read(APIC_LVR);
 	printk(KERN_INFO "... APIC VERSION: %08x\n", v);
-- 
cgit v1.2.3


From 9e26d84273541a8c6c2efb705457ca8e6245fb73 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Fri, 6 Jun 2008 12:01:13 +0200
Subject: fix build bug in "x86: add PCI extended config space access for AMD
 Barcelona"

Also much less code now.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/amd.c    |  2 --
 arch/x86/kernel/cpu/amd_64.c |  1 +
 arch/x86/kernel/cpu/cpu.h    |  5 +++++
 arch/x86/kernel/setup.c      |  2 +-
 arch/x86/kernel/setup.h      | 26 --------------------------
 arch/x86/kernel/setup_64.c   |  2 --
 include/asm-x86/mmconfig.h   |  6 ------
 7 files changed, 7 insertions(+), 37 deletions(-)
 delete mode 100644 arch/x86/kernel/setup.h

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 656b40aed647..a38d54f4ff25 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -4,10 +4,8 @@
 #include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
-#include <asm/mmconfig.h>
 
 #include <mach_apic.h>
-#include "../setup.h"
 #include "cpu.h"
 
 /*
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
index 180097e99219..626fc21f027d 100644
--- a/arch/x86/kernel/cpu/amd_64.c
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -6,6 +6,7 @@
 #include <asm/cacheflush.h>
 
 #include <mach_apic.h>
+#include "cpu.h"
 
 extern int __cpuinit get_model_name(struct cpuinfo_x86 *c);
 extern void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 783691b2a738..f5d5bb1b5541 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -1,3 +1,4 @@
+#ifdef CONFIG_X86_32
 
 struct cpu_model_info {
 	int vendor;
@@ -36,3 +37,7 @@ extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[];
 
 extern int get_model_name(struct cpuinfo_x86 *c);
 extern void display_cacheinfo(struct cpuinfo_x86 *c);
+
+#endif /* CONFIG_X86_32 */
+
+extern void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d8f171234013..20e14dbef107 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -136,6 +136,7 @@ void __init setup_per_cpu_areas(void)
 	setup_cpumask_of_cpu();
 }
 
+#endif
 #define ENABLE_CF8_EXT_CFG      (1ULL << 46)
 
 void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c)
@@ -149,4 +150,3 @@ void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c)
 	set_cpu_cap(c, X86_FEATURE_PCI_EXT_CFG);
 }
 
-#endif
diff --git a/arch/x86/kernel/setup.h b/arch/x86/kernel/setup.h
deleted file mode 100644
index 66cc2c7f961d..000000000000
--- a/arch/x86/kernel/setup.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Internal declarations for shared x86 setup code.
- *
- * Copyright (c) 2008 Advanced Micro Devices, Inc.
- * Contributed by Robert Richter <robert.richter@amd.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
- * 02111-1307 USA
- */
-
-#ifndef _ARCH_X86_KERNEL_SETUP_H
-
-extern void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c);
-
-#endif	/* _ARCH_X86_KERNEL_SETUP_H */
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 215bd67e5b6c..25afdd80a3cf 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -73,8 +73,6 @@
 #include <asm/pat.h>
 #include <asm/mmconfig.h>
 
-#include "setup.h"
-
 #include <mach_apic.h>
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
diff --git a/include/asm-x86/mmconfig.h b/include/asm-x86/mmconfig.h
index 691798fbee10..95beda07c6fa 100644
--- a/include/asm-x86/mmconfig.h
+++ b/include/asm-x86/mmconfig.h
@@ -9,10 +9,4 @@ static inline void fam10h_check_enable_mmcfg(void) { }
 static inline void check_enable_amd_mmconf_dmi(void) { }
 #endif
 
-#if defined(CONFIG_SMP) && defined(CONFIG_X86_64)
-extern void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c);
-#else
-static inline void amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c) { }
-#endif
-
 #endif
-- 
cgit v1.2.3


From 5b80fe8bd732b5dd442118a43e02db22e8fd93e2 Mon Sep 17 00:00:00 2001
From: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com>
Date: Sat, 7 Jun 2008 14:34:42 +0200
Subject: x86: coding style fixes to arch/x86/kernel/sys_i386_32.c

Before:
total: 16 errors, 25 warnings, 246 lines checked

After:
total: 0 errors, 7 warnings, 246 lines checked

Compile tested.

paolo@paolo-desktop:/tmp$ size sys*
   text    data     bss     dec     hex filename
   1209       0       0    1209     4b9 sys_i386_32.o.after
   1209       0       0    1209     4b9 sys_i386_32.o.before

paolo@paolo-desktop:/tmp$ md5sum sys*
6144f6d6ce7342c3e192681a1ccaa1c1  sys_i386_32.o.after
6144f6d6ce7342c3e192681a1ccaa1c1  sys_i386_32.o.before

Signed-off-by: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/sys_i386_32.c | 64 +++++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 32 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index d2ab52cc1d6b..7066cb855a60 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -19,8 +19,8 @@
 #include <linux/utsname.h>
 #include <linux/ipc.h>
 
-#include <asm/uaccess.h>
-#include <asm/unistd.h>
+#include <linux/uaccess.h>
+#include <linux/unistd.h>
 
 asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
 			  unsigned long prot, unsigned long flags,
@@ -103,7 +103,7 @@ asmlinkage int old_select(struct sel_arg_struct __user *arg)
  *
  * This is really horribly ugly.
  */
-asmlinkage int sys_ipc (uint call, int first, int second,
+asmlinkage int sys_ipc(uint call, int first, int second,
 			int third, void __user *ptr, long fifth)
 {
 	int version, ret;
@@ -113,24 +113,24 @@ asmlinkage int sys_ipc (uint call, int first, int second,
 
 	switch (call) {
 	case SEMOP:
-		return sys_semtimedop (first, (struct sembuf __user *)ptr, second, NULL);
+		return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL);
 	case SEMTIMEDOP:
 		return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
 					(const struct timespec __user *)fifth);
 
 	case SEMGET:
-		return sys_semget (first, second, third);
+		return sys_semget(first, second, third);
 	case SEMCTL: {
 		union semun fourth;
 		if (!ptr)
 			return -EINVAL;
 		if (get_user(fourth.__pad, (void __user * __user *) ptr))
 			return -EFAULT;
-		return sys_semctl (first, second, third, fourth);
+		return sys_semctl(first, second, third, fourth);
 	}
 
 	case MSGSND:
-		return sys_msgsnd (first, (struct msgbuf __user *) ptr, 
+		return sys_msgsnd(first, (struct msgbuf __user *) ptr,
 				   second, third);
 	case MSGRCV:
 		switch (version) {
@@ -138,45 +138,45 @@ asmlinkage int sys_ipc (uint call, int first, int second,
 			struct ipc_kludge tmp;
 			if (!ptr)
 				return -EINVAL;
-			
+
 			if (copy_from_user(&tmp,
-					   (struct ipc_kludge __user *) ptr, 
-					   sizeof (tmp)))
+					   (struct ipc_kludge __user *) ptr,
+					   sizeof(tmp)))
 				return -EFAULT;
-			return sys_msgrcv (first, tmp.msgp, second,
+			return sys_msgrcv(first, tmp.msgp, second,
 					   tmp.msgtyp, third);
 		}
 		default:
-			return sys_msgrcv (first,
+			return sys_msgrcv(first,
 					   (struct msgbuf __user *) ptr,
 					   second, fifth, third);
 		}
 	case MSGGET:
-		return sys_msgget ((key_t) first, second);
+		return sys_msgget((key_t) first, second);
 	case MSGCTL:
-		return sys_msgctl (first, second, (struct msqid_ds __user *) ptr);
+		return sys_msgctl(first, second, (struct msqid_ds __user *) ptr);
 
 	case SHMAT:
 		switch (version) {
 		default: {
 			ulong raddr;
-			ret = do_shmat (first, (char __user *) ptr, second, &raddr);
+			ret = do_shmat(first, (char __user *) ptr, second, &raddr);
 			if (ret)
 				return ret;
-			return put_user (raddr, (ulong __user *) third);
+			return put_user(raddr, (ulong __user *) third);
 		}
 		case 1:	/* iBCS2 emulator entry point */
 			if (!segment_eq(get_fs(), get_ds()))
 				return -EINVAL;
 			/* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
-			return do_shmat (first, (char __user *) ptr, second, (ulong *) third);
+			return do_shmat(first, (char __user *) ptr, second, (ulong *) third);
 		}
-	case SHMDT: 
-		return sys_shmdt ((char __user *)ptr);
+	case SHMDT:
+		return sys_shmdt((char __user *)ptr);
 	case SHMGET:
-		return sys_shmget (first, second, third);
+		return sys_shmget(first, second, third);
 	case SHMCTL:
-		return sys_shmctl (first, second,
+		return sys_shmctl(first, second,
 				   (struct shmid_ds __user *) ptr);
 	default:
 		return -ENOSYS;
@@ -186,28 +186,28 @@ asmlinkage int sys_ipc (uint call, int first, int second,
 /*
  * Old cruft
  */
-asmlinkage int sys_uname(struct old_utsname __user * name)
+asmlinkage int sys_uname(struct old_utsname __user *name)
 {
 	int err;
 	if (!name)
 		return -EFAULT;
 	down_read(&uts_sem);
-	err = copy_to_user(name, utsname(), sizeof (*name));
+	err = copy_to_user(name, utsname(), sizeof(*name));
 	up_read(&uts_sem);
-	return err?-EFAULT:0;
+	return err? -EFAULT:0;
 }
 
-asmlinkage int sys_olduname(struct oldold_utsname __user * name)
+asmlinkage int sys_olduname(struct oldold_utsname __user *name)
 {
 	int error;
 
 	if (!name)
 		return -EFAULT;
-	if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname)))
+	if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
 		return -EFAULT;
-  
-  	down_read(&uts_sem);
-	
+
+	down_read(&uts_sem);
+
 	error = __copy_to_user(&name->sysname, &utsname()->sysname,
 			       __OLD_UTS_LEN);
 	error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
@@ -223,9 +223,9 @@ asmlinkage int sys_olduname(struct oldold_utsname __user * name)
 	error |= __copy_to_user(&name->machine, &utsname()->machine,
 				__OLD_UTS_LEN);
 	error |= __put_user(0, name->machine + __OLD_UTS_LEN);
-	
+
 	up_read(&uts_sem);
-	
+
 	error = error ? -EFAULT : 0;
 
 	return error;
@@ -241,6 +241,6 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[])
 	long __res;
 	asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
 	: "=a" (__res)
-	: "0" (__NR_execve),"ri" (filename),"c" (argv), "d" (envp) : "memory");
+	: "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory");
 	return __res;
 }
-- 
cgit v1.2.3


From 6ddd2a27948f0bd02a2ad001e8a6816898eba0dc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Jun 2008 16:59:53 +0200
Subject: x86: simplify idle selection

default_idle is selected in cpu_idle(), when no other idle routine is
selected. Select it in select_idle_routine() when mwait is not
selected.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/process.c    | 18 +++++++-----------
 arch/x86/kernel/process_32.c |  7 +------
 arch/x86/kernel/process_64.c |  7 ++-----
 3 files changed, 10 insertions(+), 22 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ba370dc8685b..b3078f4ce25b 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -139,27 +139,23 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
 
 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 {
-	static int selected;
-
-	if (selected)
-		return;
 #ifdef CONFIG_X86_SMP
 	if (pm_idle == poll_idle && smp_num_siblings > 1) {
 		printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
 			" performance may degrade.\n");
 	}
 #endif
+	if (pm_idle)
+		return;
+
 	if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
 		/*
-		 * Skip, if setup has overridden idle.
 		 * One CPU supports mwait => All CPUs supports mwait
 		 */
-		if (!pm_idle) {
-			printk(KERN_INFO "using mwait in idle threads.\n");
-			pm_idle = mwait_idle;
-		}
-	}
-	selected = 1;
+		printk(KERN_INFO "using mwait in idle threads.\n");
+		pm_idle = mwait_idle;
+	} else
+		pm_idle = default_idle;
 }
 
 static int __init idle_setup(char *str)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f8476dfbb60d..ee4ab461c50d 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -168,24 +168,19 @@ void cpu_idle(void)
 	while (1) {
 		tick_nohz_stop_sched_tick();
 		while (!need_resched()) {
-			void (*idle)(void);
 
 			check_pgt_cache();
 			rmb();
-			idle = pm_idle;
 
 			if (rcu_pending(cpu))
 				rcu_check_callbacks(cpu, 0);
 
-			if (!idle)
-				idle = default_idle;
-
 			if (cpu_is_offline(cpu))
 				play_dead();
 
 			local_irq_disable();
 			__get_cpu_var(irq_stat).idle_timestamp = jiffies;
-			idle();
+			pm_idle();
 		}
 		tick_nohz_restart_sched_tick();
 		preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index e2319f39988b..db3d89a04399 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -150,12 +150,9 @@ void cpu_idle(void)
 	while (1) {
 		tick_nohz_stop_sched_tick();
 		while (!need_resched()) {
-			void (*idle)(void);
 
 			rmb();
-			idle = pm_idle;
-			if (!idle)
-				idle = default_idle;
+
 			if (cpu_is_offline(smp_processor_id()))
 				play_dead();
 			/*
@@ -165,7 +162,7 @@ void cpu_idle(void)
 			 */
 			local_irq_disable();
 			enter_idle();
-			idle();
+			pm_idle();
 			/* In many cases the interrupt that ended idle
 			   has already called exit_idle. But some idle
 			   loops can be woken up without interrupt. */
-- 
cgit v1.2.3


From aa83f3f2cfc74d66d01b1d2eb1485ea1103a0f4e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Jun 2008 17:11:13 +0200
Subject: x86: cleanup C1E enabled detection

Rename the "MSR_K8_ENABLE_C1E" MSR to INT_PENDING_MSG, which is the
name in the data sheet as well. Move the C1E mask to the header file.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/amd.c    | 5 ++---
 arch/x86/kernel/cpu/amd_64.c | 5 ++---
 include/asm-x86/msr-index.h  | 4 +++-
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a38d54f4ff25..30b5055be355 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -25,7 +25,6 @@ extern void vide(void);
 __asm__(".align 4\nvide: ret");
 
 #ifdef CONFIG_X86_LOCAL_APIC
-#define ENABLE_C1E_MASK         0x18000000
 #define CPUID_PROCESSOR_SIGNATURE       1
 #define CPUID_XFAM              0x0ff00000
 #define CPUID_XFAM_K8           0x00000000
@@ -45,8 +44,8 @@ static __cpuinit int amd_apic_timer_broken(void)
 			break;
 	case CPUID_XFAM_10H:
 	case CPUID_XFAM_11H:
-		rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
-		if (lo & ENABLE_C1E_MASK) {
+		rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+		if (lo & K8_INTP_C1E_ACTIVE_MASK) {
 			if (smp_processor_id() != boot_cpu_physical_apicid)
 				printk(KERN_INFO "AMD C1E detected late. "
 				       "	Force timer broadcast.\n");
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
index 626fc21f027d..6eef3c79d151 100644
--- a/arch/x86/kernel/cpu/amd_64.c
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -110,7 +110,6 @@ static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
 #endif
 }
 
-#define ENABLE_C1E_MASK		0x18000000
 #define CPUID_PROCESSOR_SIGNATURE	1
 #define CPUID_XFAM		0x0ff00000
 #define CPUID_XFAM_K8		0x00000000
@@ -130,8 +129,8 @@ static __cpuinit int amd_apic_timer_broken(void)
 			break;
 	case CPUID_XFAM_10H:
 	case CPUID_XFAM_11H:
-		rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
-		if (lo & ENABLE_C1E_MASK)
+		rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+		if (lo & K8_INTP_C1E_ACTIVE_MASK)
 			return 1;
 		break;
 	default:
diff --git a/include/asm-x86/msr-index.h b/include/asm-x86/msr-index.h
index 09413ad39d3c..44bce773012e 100644
--- a/include/asm-x86/msr-index.h
+++ b/include/asm-x86/msr-index.h
@@ -111,7 +111,9 @@
 #define MSR_K8_TOP_MEM2			0xc001001d
 #define MSR_K8_SYSCFG			0xc0010010
 #define MSR_K8_HWCR			0xc0010015
-#define MSR_K8_ENABLE_C1E		0xc0010055
+#define MSR_K8_INT_PENDING_MSG		0xc0010055
+/* C1E active bits in int pending message */
+#define K8_INTP_C1E_ACTIVE_MASK		0x18000000
 #define MSR_K8_TSEG_ADDR		0xc0010112
 #define K8_MTRRFIXRANGE_DRAM_ENABLE	0x00040000 /* MtrrFixDramEn bit    */
 #define K8_MTRRFIXRANGE_DRAM_MODIFY	0x00080000 /* MtrrFixDramModEn bit */
-- 
cgit v1.2.3


From 732d7be17b98ebfd59e5864c3490f19856fa832c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Jun 2008 17:27:20 +0200
Subject: x86: use cpuinfo to check for interrupt pending message msr

Simplify code: no need to do a cpuid(1) again. The cpuinfo structure
has all necessary information already.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/amd.c    | 41 +++++++++++++++--------------------------
 arch/x86/kernel/cpu/amd_64.c | 38 +++++++++++++++-----------------------
 2 files changed, 30 insertions(+), 49 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 30b5055be355..e76b49e7a916 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -25,35 +25,24 @@ extern void vide(void);
 __asm__(".align 4\nvide: ret");
 
 #ifdef CONFIG_X86_LOCAL_APIC
-#define CPUID_PROCESSOR_SIGNATURE       1
-#define CPUID_XFAM              0x0ff00000
-#define CPUID_XFAM_K8           0x00000000
-#define CPUID_XFAM_10H          0x00100000
-#define CPUID_XFAM_11H          0x00200000
-#define CPUID_XMOD              0x000f0000
-#define CPUID_XMOD_REV_F        0x00040000
 
 /* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
-static __cpuinit int amd_apic_timer_broken(void)
+static __cpuinit int amd_apic_timer_broken(struct cpuinfo_x86 *c)
 {
 	u32 lo, hi;
-	u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
-	switch (eax & CPUID_XFAM) {
-	case CPUID_XFAM_K8:
-		if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
-			break;
-	case CPUID_XFAM_10H:
-	case CPUID_XFAM_11H:
-		rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
-		if (lo & K8_INTP_C1E_ACTIVE_MASK) {
-			if (smp_processor_id() != boot_cpu_physical_apicid)
-				printk(KERN_INFO "AMD C1E detected late. "
-				       "	Force timer broadcast.\n");
-			return 1;
-		}
-		break;
-	default:
-		/* err on the side of caution */
+
+	if (c->x86 < 0x0F)
+		return 0;
+
+	/* Family 0x0f models < rev F do not have this MSR */
+	if (c->x86 == 0x0f && c->x86_model < 0x40)
+		return 0;
+
+	rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+	if (lo & K8_INTP_C1E_ACTIVE_MASK) {
+		if (smp_processor_id() != boot_cpu_physical_apicid)
+			printk(KERN_INFO "AMD C1E detected late. "
+			       "Force timer broadcast.\n");
 		return 1;
 	}
 	return 0;
@@ -297,7 +286,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	}
 
 #ifdef CONFIG_X86_LOCAL_APIC
-	if (amd_apic_timer_broken())
+	if (amd_apic_timer_broken(c))
 		local_apic_timer_disabled = 1;
 #endif
 
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
index 6eef3c79d151..f5fc161d8f2a 100644
--- a/arch/x86/kernel/cpu/amd_64.c
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -110,31 +110,23 @@ static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
 #endif
 }
 
-#define CPUID_PROCESSOR_SIGNATURE	1
-#define CPUID_XFAM		0x0ff00000
-#define CPUID_XFAM_K8		0x00000000
-#define CPUID_XFAM_10H		0x00100000
-#define CPUID_XFAM_11H		0x00200000
-#define CPUID_XMOD		0x000f0000
-#define CPUID_XMOD_REV_F	0x00040000
-
 /* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
-static __cpuinit int amd_apic_timer_broken(void)
+static __cpuinit int amd_apic_timer_broken(struct cpuinfo_x86 *c)
 {
-	u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
+	u32 lo, hi;
 
-	switch (eax & CPUID_XFAM) {
-	case CPUID_XFAM_K8:
-		if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
-			break;
-	case CPUID_XFAM_10H:
-	case CPUID_XFAM_11H:
-		rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
-		if (lo & K8_INTP_C1E_ACTIVE_MASK)
-			return 1;
-		break;
-	default:
-		/* err on the side of caution */
+	if (c->x86 < 0x0F)
+		return 0;
+
+	/* Family 0x0f models < rev F do not have this MSR */
+	if (c->x86 == 0x0f && c->x86_model < 0x40)
+		return 0;
+
+	rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+	if (lo & K8_INTP_C1E_ACTIVE_MASK) {
+		if (smp_processor_id() != boot_cpu_physical_apicid)
+			printk(KERN_INFO "AMD C1E detected late. "
+			       "Force timer broadcast.\n");
 		return 1;
 	}
 	return 0;
@@ -220,7 +212,7 @@ void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	if (c->x86 == 0x10)
 		amd_enable_pci_ext_cfg(c);
 
-	if (amd_apic_timer_broken())
+	if (amd_apic_timer_broken(c))
 		disable_apic_timer = 1;
 
 	if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
-- 
cgit v1.2.3


From 09fd4b4ef5bc7e5222c13acec1bee8cd252fb63f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Jun 2008 18:04:27 +0200
Subject: x86: use cpuid to check MWAIT support for C1

cpuid(0x05) provides extended information about MWAIT in EDX when bit
0 of ECX is set. Bit 4-7 of EDX determine whether MWAIT is supported
for C1. C1E enabled CPUs have these bits set to 0.

Based on an earlier patch from Andi Kleen.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/process.c | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b3078f4ce25b..fe415ba606e0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -122,19 +122,31 @@ static void poll_idle(void)
  *
  * idle=mwait overrides this decision and forces the usage of mwait.
  */
+
+#define MWAIT_INFO			0x05
+#define MWAIT_ECX_EXTENDED_INFO		0x01
+#define MWAIT_EDX_C1			0xf0
+
 static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
 {
+	u32 eax, ebx, ecx, edx;
+
 	if (force_mwait)
 		return 1;
 
-	if (c->x86_vendor == X86_VENDOR_AMD) {
-		switch(c->x86) {
-		case 0x10:
-		case 0x11:
-			return 0;
-		}
-	}
-	return 1;
+	if (c->cpuid_level < MWAIT_INFO)
+		return 0;
+
+	cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
+	/* Check, whether EDX has extended info about MWAIT */
+	if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
+		return 1;
+
+	/*
+	 * edx enumeratios MONITOR/MWAIT extensions. Check, whether
+	 * C1  supports MWAIT
+	 */
+	return (edx & MWAIT_EDX_C1);
 }
 
 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
-- 
cgit v1.2.3


From 00dba56465228825ea806e3a7fc0aa6bba7bdc6c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Jun 2008 18:35:28 +0200
Subject: x86: move more common idle functions/variables to process.c

more unification. Should cause no change in functionality.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/process.c    | 70 ++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/process_32.c | 54 ----------------------------------
 arch/x86/kernel/process_64.c | 28 ------------------
 3 files changed, 70 insertions(+), 82 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index fe415ba606e0..9fea14607dfe 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -45,6 +45,76 @@ void arch_task_cache_init(void)
 				  SLAB_PANIC, NULL);
 }
 
+/*
+ * Idle related variables and functions
+ */
+unsigned long boot_option_idle_override = 0;
+EXPORT_SYMBOL(boot_option_idle_override);
+
+/*
+ * Powermanagement idle function, if any..
+ */
+void (*pm_idle)(void);
+EXPORT_SYMBOL(pm_idle);
+
+#ifdef CONFIG_X86_32
+/*
+ * This halt magic was a workaround for ancient floppy DMA
+ * wreckage. It should be safe to remove.
+ */
+static int hlt_counter;
+void disable_hlt(void)
+{
+	hlt_counter++;
+}
+EXPORT_SYMBOL(disable_hlt);
+
+void enable_hlt(void)
+{
+	hlt_counter--;
+}
+EXPORT_SYMBOL(enable_hlt);
+
+static inline int hlt_use_halt(void)
+{
+	return (!hlt_counter && boot_cpu_data.hlt_works_ok);
+}
+#else
+static inline int hlt_use_halt(void)
+{
+	return 1;
+}
+#endif
+
+/*
+ * We use this if we don't have any better
+ * idle routine..
+ */
+void default_idle(void)
+{
+	if (hlt_use_halt()) {
+		current_thread_info()->status &= ~TS_POLLING;
+		/*
+		 * TS_POLLING-cleared state must be visible before we
+		 * test NEED_RESCHED:
+		 */
+		smp_mb();
+
+		if (!need_resched())
+			safe_halt();	/* enables interrupts racelessly */
+		else
+			local_irq_enable();
+		current_thread_info()->status |= TS_POLLING;
+	} else {
+		local_irq_enable();
+		/* loop is done by the caller */
+		cpu_relax();
+	}
+}
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(default_idle);
+#endif
+
 static void do_nothing(void *unused)
 {
 }
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ee4ab461c50d..ae4020486a97 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -58,11 +58,6 @@
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
-static int hlt_counter;
-
-unsigned long boot_option_idle_override = 0;
-EXPORT_SYMBOL(boot_option_idle_override);
-
 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
 EXPORT_PER_CPU_SYMBOL(current_task);
 
@@ -77,55 +72,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
 	return ((unsigned long *)tsk->thread.sp)[3];
 }
 
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
-EXPORT_SYMBOL(pm_idle);
-
-void disable_hlt(void)
-{
-	hlt_counter++;
-}
-
-EXPORT_SYMBOL(disable_hlt);
-
-void enable_hlt(void)
-{
-	hlt_counter--;
-}
-
-EXPORT_SYMBOL(enable_hlt);
-
-/*
- * We use this if we don't have any better
- * idle routine..
- */
-void default_idle(void)
-{
-	if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
-		current_thread_info()->status &= ~TS_POLLING;
-		/*
-		 * TS_POLLING-cleared state must be visible before we
-		 * test NEED_RESCHED:
-		 */
-		smp_mb();
-
-		if (!need_resched())
-			safe_halt();	/* enables interrupts racelessly */
-		else
-			local_irq_enable();
-		current_thread_info()->status |= TS_POLLING;
-	} else {
-		local_irq_enable();
-		/* loop is done by the caller */
-		cpu_relax();
-	}
-}
-#ifdef CONFIG_APM_MODULE
-EXPORT_SYMBOL(default_idle);
-#endif
-
 #ifdef CONFIG_HOTPLUG_CPU
 #include <asm/nmi.h>
 /* We don't actually take CPU down, just spin without interrupts. */
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index db3d89a04399..9fb3a6fe863b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -56,15 +56,6 @@ asmlinkage extern void ret_from_fork(void);
 
 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
 
-unsigned long boot_option_idle_override = 0;
-EXPORT_SYMBOL(boot_option_idle_override);
-
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
-EXPORT_SYMBOL(pm_idle);
-
 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
 
 void idle_notifier_register(struct notifier_block *n)
@@ -94,25 +85,6 @@ void exit_idle(void)
 	__exit_idle();
 }
 
-/*
- * We use this if we don't have any better
- * idle routine..
- */
-void default_idle(void)
-{
-	current_thread_info()->status &= ~TS_POLLING;
-	/*
-	 * TS_POLLING-cleared state must be visible before we
-	 * test NEED_RESCHED:
-	 */
-	smp_mb();
-	if (!need_resched())
-		safe_halt();	/* enables interrupts racelessly */
-	else
-		local_irq_enable();
-	current_thread_info()->status |= TS_POLLING;
-}
-
 #ifdef CONFIG_HOTPLUG_CPU
 DECLARE_PER_CPU(int, cpu_state);
 
-- 
cgit v1.2.3


From ee863ba7ab3d3ed8a9585d378aae69d1e3e9f1b4 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Tue, 10 Jun 2008 16:04:30 +0200
Subject: x86: unconditionally enable PAT for AMD CPUs

If PAT support is advertised it should just work. No errata known.

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/addon_cpuid_features.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index d8b3e4a9d663..0fbd06241e07 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -54,14 +54,11 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
 {
 	switch (c->x86_vendor) {
-	case X86_VENDOR_AMD:
-		if (c->x86 >= 0xf && c->x86 <= 0x11)
-			return;
-		break;
 	case X86_VENDOR_INTEL:
 		if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15))
 			return;
 		break;
+	case X86_VENDOR_AMD:
 	case X86_VENDOR_CENTAUR:
 	case X86_VENDOR_TRANSMETA:
 		return;
-- 
cgit v1.2.3


From 97cfab6ac4ddfda0d722393bbf46cc40bc332107 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Tue, 10 Jun 2008 16:05:18 +0200
Subject: x86: PAT: fix ambiguous paranoia check in pat_init()

Starting with commit 8d4a4300854f3971502e81dacd930704cb88f606 (x86:
cleanup PAT cpu validation) the PAT CPU feature flag is not cleared
anymore. Now the error message

  "PAT enabled, but CPU feature cleared"

in pat_init() is misleading.

Furthermore the current code does not check for existence of the PAT
CPU feature flag if a CPU is whitelisted in validate_pat_support.

This patch clears pat_wc_enabled if boot CPU has no PAT feature flag
and adapts the paranoia check.

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/addon_cpuid_features.c |  7 ++++---
 arch/x86/mm/pat.c                          | 11 ++++++-----
 2 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 0fbd06241e07..2df461f06a52 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -53,6 +53,9 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 #ifdef CONFIG_X86_PAT
 void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
 {
+	if (!cpu_has_pat)
+		pat_disable("PAT not supported by CPU.");
+
 	switch (c->x86_vendor) {
 	case X86_VENDOR_INTEL:
 		if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15))
@@ -64,8 +67,6 @@ void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
 		return;
 	}
 
-	pat_disable(cpu_has_pat ?
-		    "PAT disabled. Not yet verified on this CPU type." :
-		    "PAT not supported by CPU.");
+	pat_disable("PAT disabled. Not yet verified on this CPU type.");
 }
 #endif
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 320d644a8107..65105b1195a3 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -76,14 +76,15 @@ void pat_init(void)
 		return;
 
 	/* Paranoia check. */
-	if (!cpu_has_pat) {
-		printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
+	if (!cpu_has_pat && boot_pat_state) {
 		/*
-		 * Panic if this happens on the secondary CPU, and we
+		 * If this happens we are on a secondary CPU, but
 		 * switched to PAT on the boot CPU. We have no way to
 		 * undo PAT.
-		*/
-		BUG_ON(boot_pat_state);
+		 */
+		printk(KERN_ERR "PAT enabled, "
+		       "but not supported by secondary CPU\n");
+		BUG();
 	}
 
 	/* Set PWT to Write-Combining. All other bits stay the same */
-- 
cgit v1.2.3


From cd7a4e936d345ab4cb49d68192d90bd4e4c58458 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Tue, 10 Jun 2008 16:05:39 +0200
Subject: x86: PAT: fixed checkpatch errors (and whitespaces)

x86: PAT: fixed checkpatch errors (and whitespaces)

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/addon_cpuid_features.c |  2 --
 arch/x86/mm/pat.c                          | 29 ++++++++++++++---------------
 include/asm-x86/pat.h                      |  6 ++----
 3 files changed, 16 insertions(+), 21 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 2df461f06a52..84a8220a6072 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -1,9 +1,7 @@
-
 /*
  *	Routines to indentify additional cpu features that are scattered in
  *	cpuid space.
  */
-
 #include <linux/cpu.h>
 
 #include <asm/pat.h>
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 65105b1195a3..a8b69bb26972 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -66,7 +66,7 @@ enum {
 	PAT_UC_MINUS = 7,	/* UC, but can be overriden by MTRR */
 };
 
-#define PAT(x,y)	((u64)PAT_ ## y << ((x)*8))
+#define PAT(x, y)	((u64)PAT_ ## y << ((x)*8))
 
 void pat_init(void)
 {
@@ -100,8 +100,8 @@ void pat_init(void)
 	 *      011 UC		_PAGE_CACHE_UC
 	 * PAT bit unused
 	 */
-	pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) |
-	      PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC);
+	pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
+	      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
 
 	/* Boot CPU check */
 	if (!boot_pat_state)
@@ -117,11 +117,11 @@ void pat_init(void)
 static char *cattr_name(unsigned long flags)
 {
 	switch (flags & _PAGE_CACHE_MASK) {
-		case _PAGE_CACHE_UC:		return "uncached";
-		case _PAGE_CACHE_UC_MINUS:	return "uncached-minus";
-		case _PAGE_CACHE_WB:		return "write-back";
-		case _PAGE_CACHE_WC:		return "write-combining";
-		default:			return "broken";
+	case _PAGE_CACHE_UC:		return "uncached";
+	case _PAGE_CACHE_UC_MINUS:	return "uncached-minus";
+	case _PAGE_CACHE_WB:		return "write-back";
+	case _PAGE_CACHE_WC:		return "write-combining";
+	default:			return "broken";
 	}
 }
 
@@ -536,11 +536,11 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 	 * we maintain the tradition of paranoia in this code.
 	 */
 	if (!pat_wc_enabled &&
-	    ! ( boot_cpu_has(X86_FEATURE_MTRR) ||
-		boot_cpu_has(X86_FEATURE_K6_MTRR) ||
-		boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
-		boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
-	   (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
+	    !(boot_cpu_has(X86_FEATURE_MTRR) ||
+	      boot_cpu_has(X86_FEATURE_K6_MTRR) ||
+	      boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
+	      boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
+	    (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
 		flags = _PAGE_CACHE_UC;
 	}
 #endif
@@ -562,7 +562,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 		return 0;
 
 	if (pfn <= max_pfn_mapped &&
-            ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
+	    ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
 		free_memtype(offset, offset + size);
 		printk(KERN_INFO
 		"%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
@@ -600,4 +600,3 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
 
 	free_memtype(addr, addr + size);
 }
-
diff --git a/include/asm-x86/pat.h b/include/asm-x86/pat.h
index 88f60cc6a227..6fa9710ae09a 100644
--- a/include/asm-x86/pat.h
+++ b/include/asm-x86/pat.h
@@ -1,6 +1,5 @@
-
 #ifndef _ASM_PAT_H
-#define _ASM_PAT_H 1
+#define _ASM_PAT_H
 
 #include <linux/types.h>
 
@@ -8,7 +7,7 @@
 extern int pat_wc_enabled;
 extern void validate_pat_support(struct cpuinfo_x86 *c);
 #else
-static const int pat_wc_enabled = 0;
+static const int pat_wc_enabled;
 static inline void validate_pat_support(struct cpuinfo_x86 *c) { }
 #endif
 
@@ -21,4 +20,3 @@ extern int free_memtype(u64 start, u64 end);
 extern void pat_disable(char *reason);
 
 #endif
-
-- 
cgit v1.2.3


From 6703f6d10dcd3316e03641a5ecaa6c8a04374d98 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 10 Jun 2008 00:10:48 +0200
Subject: x86, gart: add resume handling

If GART IOMMU is used on an AMD64 system, the northbridge registers
related to it should be restored during resume so that memory is not
corrupted.  Make gart_resume() handle that as appropriate.

Ref. http://lkml.org/lkml/2008/5/25/96 and the following thread.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/aperture_64.c |  2 ++
 arch/x86/kernel/pci-gart_64.c | 57 +++++++++++++++++++++++++++++++++++++++----
 include/asm-x86/gart.h        |  1 +
 3 files changed, 55 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index eb20f168c0fd..3409abb231ac 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -496,4 +496,6 @@ out:
 			write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
 		}
 	}
+
+	set_up_gart_resume(aper_order, aper_alloc);
 }
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 3710097f02eb..f505c3890358 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -549,14 +549,63 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
 	return aper_base;
 }
 
+static void enable_gart_translations(void)
+{
+	int i;
+
+	for (i = 0; i < num_k8_northbridges; i++) {
+		struct pci_dev *dev = k8_northbridges[i];
+
+		enable_gart_translation(dev, __pa(agp_gatt_table));
+	}
+}
+
+/*
+ * If fix_up_north_bridges is set, the north bridges have to be fixed up on
+ * resume in the same way as they are handled in gart_iommu_hole_init().
+ */
+static bool fix_up_north_bridges;
+static u32 aperture_order;
+static u32 aperture_alloc;
+
+void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
+{
+	fix_up_north_bridges = true;
+	aperture_order = aper_order;
+	aperture_alloc = aper_alloc;
+}
+
 static int gart_resume(struct sys_device *dev)
 {
+	printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n");
+
+	if (fix_up_north_bridges) {
+		int i;
+
+		printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n");
+
+		for (i = 0; i < num_k8_northbridges; i++) {
+			struct pci_dev *dev = k8_northbridges[i];
+
+			/*
+			 * Don't enable translations just yet.  That is the next
+			 * step.  Restore the pre-suspend aperture settings.
+			 */
+			pci_write_config_dword(dev, AMD64_GARTAPERTURECTL,
+						aperture_order << 1);
+			pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE,
+						aperture_alloc >> 25);
+		}
+	}
+
+	enable_gart_translations();
+
 	return 0;
 }
 
 static int gart_suspend(struct sys_device *dev, pm_message_t state)
 {
-	return -EINVAL;
+	return 0;
 }
 
 static struct sysdev_class gart_sysdev_class = {
@@ -614,16 +663,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 	memset(gatt, 0, gatt_size);
 	agp_gatt_table = gatt;
 
-	for (i = 0; i < num_k8_northbridges; i++) {
-		dev = k8_northbridges[i];
-		enable_gart_translation(dev, __pa(gatt));
-	}
+	enable_gart_translations();
 
 	error = sysdev_class_register(&gart_sysdev_class);
 	if (!error)
 		error = sysdev_register(&device_gart);
 	if (error)
 		panic("Could not register gart_sysdev -- would corrupt data on next suspend");
+
 	flush_gart();
 
 	printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
diff --git a/include/asm-x86/gart.h b/include/asm-x86/gart.h
index c818b96f936b..eeca2f51fd8f 100644
--- a/include/asm-x86/gart.h
+++ b/include/asm-x86/gart.h
@@ -14,6 +14,7 @@ extern void gart_iommu_shutdown(void);
 extern void __init gart_parse_options(char *);
 extern void early_gart_iommu_check(void);
 extern void gart_iommu_hole_init(void);
+extern void set_up_gart_resume(u32, u32);
 extern int fallback_aper_order;
 extern int fallback_aper_force;
 extern int gart_iommu_aperture;
-- 
cgit v1.2.3


From f781b03c4b1c713ac000877c8bbc31fc4164a29b Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Thu, 12 Jun 2008 17:08:16 +0400
Subject: x86: touch_nmi_watchdog(): reset alert counters for supported
 nmi_watchdog modes only

The checking 'if nmi_watchdog > 0' (ie NMI_NONE) is quite fast but it
has a side effect - it's taken even if nmi_watchdog = NMI_DISABLED.

Nowadays nmi_watchdog is set up to NMI_NONE by default so this condition
is properly taken most the time but we better show this explicitly.

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/nmi.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 19f1b95265cf..326a8f4f50f8 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -354,7 +354,8 @@ static DEFINE_PER_CPU(int, nmi_touch);
 
 void touch_nmi_watchdog(void)
 {
-	if (nmi_watchdog > 0) {
+	if (nmi_watchdog == NMI_LOCAL_APIC ||
+		nmi_watchdog == NMI_IO_APIC) {
 		unsigned cpu;
 
 		/*
-- 
cgit v1.2.3


From ee4311adf105f4d740f52e3948acc1d81598afcc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 17 Jun 2008 17:43:02 +0200
Subject: ftrace: build fix with gcc 4.3

fix:

arch/x86/kernel/ftrace.c: Assembler messages:
arch/x86/kernel/ftrace.c:82: Error: bad register name `%sil'
make[1]: *** [arch/x86/kernel/ftrace.o] Error 1

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index bc5cf8d46742..55828149e01e 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -88,7 +88,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 		".previous\n"
 		_ASM_EXTABLE(1b, 3b)
 		: "=r"(faulted), "=a"(replaced)
-		: "r"(ip), "r"(new), "r"(newch),
+		: "r"(ip), "r"(new), "c"(newch),
 		  "0"(faulted), "a"(old)
 		: "memory");
 	sync_core();
-- 
cgit v1.2.3


From b4b3bd96f26586e53ab5482f1869221dd1b5ac36 Mon Sep 17 00:00:00 2001
From: Daniel Rahn <Daniel.Rahn@novell.com>
Date: Fri, 6 Jun 2008 09:42:36 +0200
Subject: x86: correctly report NR_BANKS in mce_64.c

attached is a no-brainer that makes kernel correctly report
NR_BANKS for MCE. We are right now limited to NR_BANKS==6, but the
error message will use the available number of banks instead of the
defined maximum.

For a Nehalem based system it will print:

"MCE: warning: using only 9 banks"

while the correct message would be

"MCE: warning: using only 6 banks"

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index f1f3f5e163b7..8c8299ce7ad4 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -445,9 +445,9 @@ static void mce_init(void *dummy)
 	rdmsrl(MSR_IA32_MCG_CAP, cap);
 	banks = cap & 0xff;
 	if (banks > MCE_EXTENDED_BANK) {
+		banks = MCE_EXTENDED_BANK;
 		printk(KERN_INFO "MCE: warning: using only %d banks\n",
 		       MCE_EXTENDED_BANK);
-		banks = MCE_EXTENDED_BANK;
 	}
 	/* Use accurate RIP reporting if available. */
 	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
-- 
cgit v1.2.3


From e17ba73b0ee6c0f24393c48b455e0d8db761782c Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Mon, 12 May 2008 15:44:40 +0200
Subject: x86, generic: mark early_printk as asmlinkage

It's not explicitly marked as asmlinkage, but invoked from x86_32
startup code with parameters on stack.

No other architectures define early_printk and none of them are affected
by this change, since defines asmlinkage as empty token.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/early_printk.c | 2 +-
 include/linux/kernel.h         | 2 +-
 kernel/printk.c                | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 643fd861b724..ff9e7350da54 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -196,7 +196,7 @@ static struct console simnow_console = {
 static struct console *early_console = &early_vga_console;
 static int early_console_initialized;
 
-void early_printk(const char *fmt, ...)
+asmlinkage void early_printk(const char *fmt, ...)
 {
 	char buf[512];
 	int n;
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index f2a668c195bf..4cb8d3df414e 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -207,7 +207,7 @@ static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \
 		{ return false; }
 #endif
 
-extern void __attribute__((format(printf, 1, 2)))
+extern void asmlinkage __attribute__((format(printf, 1, 2)))
 	early_printk(const char *fmt, ...);
 
 unsigned long int_sqrt(unsigned long);
diff --git a/kernel/printk.c b/kernel/printk.c
index 70cfa5ac75ce..de1a4f4470c3 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -38,7 +38,7 @@
 /*
  * Architectures can override it:
  */
-void __attribute__((weak)) early_printk(const char *fmt, ...)
+void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
 {
 }
 
-- 
cgit v1.2.3


From fe94ae995d33a4df35b6b9cd0504e87d7e37c8de Mon Sep 17 00:00:00 2001
From: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com>
Date: Sat, 14 Jun 2008 14:06:19 +0200
Subject: x86: coding style fixes to arch/x86/kernel/cpu/mcheck/p4.c

Before:
total: 16 errors, 34 warnings, 257 lines checked

After:
total: 0 errors, 2 warnings, 257 lines checked

No changes in the compiled code:

paolo@paolo-desktop:~/linux.trees.git$ size /tmp/p4*
   text    data     bss     dec     hex filename
   2644       4       4    2652     a5c /tmp/p4.o.after
   2644       4       4    2652     a5c /tmp/p4.o.before

paolo@paolo-desktop:~/linux.trees.git$ md5sum /tmp/p4*
13f1b21c4246b31a28aaff38184586ca  /tmp/p4.o.after
13f1b21c4246b31a28aaff38184586ca  /tmp/p4.o.before

Signed-off-by: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mcheck/p4.c | 90 ++++++++++++++++++++---------------------
 1 file changed, 45 insertions(+), 45 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index cb03345554a5..eef001ad3bde 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -8,7 +8,7 @@
 #include <linux/interrupt.h>
 #include <linux/smp.h>
 
-#include <asm/processor.h> 
+#include <asm/processor.h>
 #include <asm/system.h>
 #include <asm/msr.h>
 #include <asm/apic.h>
@@ -32,12 +32,12 @@ struct intel_mce_extended_msrs {
 	/* u32 *reserved[]; */
 };
 
-static int mce_num_extended_msrs = 0;
+static int mce_num_extended_msrs;
 
 
 #ifdef CONFIG_X86_MCE_P4THERMAL
 static void unexpected_thermal_interrupt(struct pt_regs *regs)
-{	
+{
 	printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
 			smp_processor_id());
 	add_taint(TAINT_MACHINE_CHECK);
@@ -83,7 +83,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
 	 * be some SMM goo which handles it, so we can't even put a handler
 	 * since it might be delivered via SMI already -zwanem.
 	 */
-	rdmsr (MSR_IA32_MISC_ENABLE, l, h);
+	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
 	h = apic_read(APIC_LVTTHMR);
 	if ((l & (1<<3)) && (h & APIC_DM_SMI)) {
 		printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",
@@ -91,7 +91,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
 		return; /* -EBUSY */
 	}
 
-	/* check whether a vector already exists, temporarily masked? */	
+	/* check whether a vector already exists, temporarily masked? */
 	if (h & APIC_VECTOR_MASK) {
 		printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already "
 				"installed\n",
@@ -104,18 +104,18 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
 	h |= (APIC_DM_FIXED | APIC_LVT_MASKED);	/* we'll mask till we're ready */
 	apic_write_around(APIC_LVTTHMR, h);
 
-	rdmsr (MSR_IA32_THERM_INTERRUPT, l, h);
-	wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
+	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
+	wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
 
 	/* ok we're good to go... */
 	vendor_thermal_interrupt = intel_thermal_interrupt;
-	
-	rdmsr (MSR_IA32_MISC_ENABLE, l, h);
-	wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);
 
-	l = apic_read (APIC_LVTTHMR);
-	apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
-	printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
+	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+	wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h);
+
+	l = apic_read(APIC_LVTTHMR);
+	apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
+	printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
 
 	/* enable thermal throttle processing */
 	atomic_set(&therm_throt_en, 1);
@@ -129,28 +129,28 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
 {
 	u32 h;
 
-	rdmsr (MSR_IA32_MCG_EAX, r->eax, h);
-	rdmsr (MSR_IA32_MCG_EBX, r->ebx, h);
-	rdmsr (MSR_IA32_MCG_ECX, r->ecx, h);
-	rdmsr (MSR_IA32_MCG_EDX, r->edx, h);
-	rdmsr (MSR_IA32_MCG_ESI, r->esi, h);
-	rdmsr (MSR_IA32_MCG_EDI, r->edi, h);
-	rdmsr (MSR_IA32_MCG_EBP, r->ebp, h);
-	rdmsr (MSR_IA32_MCG_ESP, r->esp, h);
-	rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h);
-	rdmsr (MSR_IA32_MCG_EIP, r->eip, h);
+	rdmsr(MSR_IA32_MCG_EAX, r->eax, h);
+	rdmsr(MSR_IA32_MCG_EBX, r->ebx, h);
+	rdmsr(MSR_IA32_MCG_ECX, r->ecx, h);
+	rdmsr(MSR_IA32_MCG_EDX, r->edx, h);
+	rdmsr(MSR_IA32_MCG_ESI, r->esi, h);
+	rdmsr(MSR_IA32_MCG_EDI, r->edi, h);
+	rdmsr(MSR_IA32_MCG_EBP, r->ebp, h);
+	rdmsr(MSR_IA32_MCG_ESP, r->esp, h);
+	rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h);
+	rdmsr(MSR_IA32_MCG_EIP, r->eip, h);
 }
 
-static void intel_machine_check(struct pt_regs * regs, long error_code)
+static void intel_machine_check(struct pt_regs *regs, long error_code)
 {
-	int recover=1;
+	int recover = 1;
 	u32 alow, ahigh, high, low;
 	u32 mcgstl, mcgsth;
 	int i;
 
-	rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
+	rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
 	if (mcgstl & (1<<0))	/* Recoverable ? */
-		recover=0;
+		recover = 0;
 
 	printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
 		smp_processor_id(), mcgsth, mcgstl);
@@ -191,20 +191,20 @@ static void intel_machine_check(struct pt_regs * regs, long error_code)
 	}
 
 	if (recover & 2)
-		panic ("CPU context corrupt");
+		panic("CPU context corrupt");
 	if (recover & 1)
-		panic ("Unable to continue");
+		panic("Unable to continue");
 
 	printk(KERN_EMERG "Attempting to continue.\n");
-	/* 
-	 * Do not clear the MSR_IA32_MCi_STATUS if the error is not 
+	/*
+	 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
 	 * recoverable/continuable.This will allow BIOS to look at the MSRs
 	 * for errors if the OS could not log the error.
 	 */
-	for (i=0; i<nr_mce_banks; i++) {
+	for (i = 0; i < nr_mce_banks; i++) {
 		u32 msr;
 		msr = MSR_IA32_MC0_STATUS+i*4;
-		rdmsr (msr, low, high);
+		rdmsr(msr, low, high);
 		if (high&(1<<31)) {
 			/* Clear it */
 			wrmsr(msr, 0UL, 0UL);
@@ -214,7 +214,7 @@ static void intel_machine_check(struct pt_regs * regs, long error_code)
 		}
 	}
 	mcgstl &= ~(1<<2);
-	wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
+	wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
 }
 
 
@@ -222,30 +222,30 @@ void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
 	int i;
-	
+
 	machine_check_vector = intel_machine_check;
 	wmb();
 
-	printk (KERN_INFO "Intel machine check architecture supported.\n");
-	rdmsr (MSR_IA32_MCG_CAP, l, h);
+	printk(KERN_INFO "Intel machine check architecture supported.\n");
+	rdmsr(MSR_IA32_MCG_CAP, l, h);
 	if (l & (1<<8))	/* Control register present ? */
-		wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 	nr_mce_banks = l & 0xff;
 
-	for (i=0; i<nr_mce_banks; i++) {
-		wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
-		wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
+	for (i = 0; i < nr_mce_banks; i++) {
+		wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
+		wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
 	}
 
-	set_in_cr4 (X86_CR4_MCE);
-	printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
+	set_in_cr4(X86_CR4_MCE);
+	printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
 		smp_processor_id());
 
 	/* Check for P4/Xeon extended MCE MSRs */
-	rdmsr (MSR_IA32_MCG_CAP, l, h);
+	rdmsr(MSR_IA32_MCG_CAP, l, h);
 	if (l & (1<<9))	{/* MCG_EXT_P */
 		mce_num_extended_msrs = (l >> 16) & 0xff;
-		printk (KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)"
+		printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)"
 				" available\n",
 			smp_processor_id(), mce_num_extended_msrs);
 
-- 
cgit v1.2.3


From 5175676a2d012ca5e5ad5eaedbfc1da5d5660d2a Mon Sep 17 00:00:00 2001
From: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com>
Date: Sat, 14 Jun 2008 14:37:14 +0200
Subject: x86: coding style fixes to arch/x86/kernel/cpu/mcheck/k7.c

Before:
total: 6 errors, 13 warnings, 105 lines checked

After:
total: 0 errors, 0 warnings, 105 lines checked

paolo@paolo-desktop:~/linux.trees.git$ size /tmp/k7*
   text    data     bss     dec     hex filename
   1135       0       0    1135     46f /tmp/k7.o.after
   1135       0       0    1135     46f /tmp/k7.o.before

paolo@paolo-desktop:~/linux.trees.git$ md5sum /tmp/k7*
87b14954045aa37dbaee6fb7e022ed9a  /tmp/k7.o.after
87b14954045aa37dbaee6fb7e022ed9a  /tmp/k7.o.before

Signed-off-by: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mcheck/k7.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index e633c9c2b764..f390c9f66351 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -9,23 +9,23 @@
 #include <linux/interrupt.h>
 #include <linux/smp.h>
 
-#include <asm/processor.h> 
+#include <asm/processor.h>
 #include <asm/system.h>
 #include <asm/msr.h>
 
 #include "mce.h"
 
 /* Machine Check Handler For AMD Athlon/Duron */
-static void k7_machine_check(struct pt_regs * regs, long error_code)
+static void k7_machine_check(struct pt_regs *regs, long error_code)
 {
-	int recover=1;
+	int recover = 1;
 	u32 alow, ahigh, high, low;
 	u32 mcgstl, mcgsth;
 	int i;
 
-	rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
+	rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
 	if (mcgstl & (1<<0))	/* Recoverable ? */
-		recover=0;
+		recover = 0;
 
 	printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
 		smp_processor_id(), mcgsth, mcgstl);
@@ -60,12 +60,12 @@ static void k7_machine_check(struct pt_regs * regs, long error_code)
 	}
 
 	if (recover&2)
-		panic ("CPU context corrupt");
+		panic("CPU context corrupt");
 	if (recover&1)
-		panic ("Unable to continue");
-	printk (KERN_EMERG "Attempting to continue.\n");
+		panic("Unable to continue");
+	printk(KERN_EMERG "Attempting to continue.\n");
 	mcgstl &= ~(1<<2);
-	wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
+	wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
 }
 
 
@@ -81,25 +81,25 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
 	machine_check_vector = k7_machine_check;
 	wmb();
 
-	printk (KERN_INFO "Intel machine check architecture supported.\n");
-	rdmsr (MSR_IA32_MCG_CAP, l, h);
+	printk(KERN_INFO "Intel machine check architecture supported.\n");
+	rdmsr(MSR_IA32_MCG_CAP, l, h);
 	if (l & (1<<8))	/* Control register present ? */
-		wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 	nr_mce_banks = l & 0xff;
 
 	/* Clear status for MC index 0 separately, we don't touch CTL,
 	 * as some K7 Athlons cause spurious MCEs when its enabled. */
 	if (boot_cpu_data.x86 == 6) {
-		wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0);
+		wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
 		i = 1;
 	} else
 		i = 0;
-	for (; i<nr_mce_banks; i++) {
-		wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
-		wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
+	for (; i < nr_mce_banks; i++) {
+		wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
+		wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
 	}
 
-	set_in_cr4 (X86_CR4_MCE);
-	printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
+	set_in_cr4(X86_CR4_MCE);
+	printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
 		smp_processor_id());
 }
-- 
cgit v1.2.3


From 219835f10eaef19d2b976076f2bc4ad806856c55 Mon Sep 17 00:00:00 2001
From: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com>
Date: Sat, 14 Jun 2008 21:11:39 +0200
Subject: x86: coding style fixes to x86/kernel/cpu/cpufreq/cpufreq-nforce2.c

Before:
total: 22 errors, 8 warnings, 440 lines checked

After:
total: 0 errors, 8 warnings, 442 lines checked

paolo@paolo-desktop:~/linux.trees.git$ md5sum /tmp/cpufreq-nforce2.o.*
3d4330a5d188fe904446e5948a618b48  /tmp/cpufreq-nforce2.o.after
1477e6b0dcd6f59b1fb6b4490042eca6  /tmp/cpufreq-nforce2.o.before
^^^ I guess this is because I fixed a few "do not initialise statics to 0 or NULL"

paolo@paolo-desktop:~/linux.trees.git$ size /tmp/cpufreq-nforce2.o.*
   text    data     bss     dec     hex filename
   1923      72      16    2011     7db /tmp/cpufreq-nforce2.o.after
   1923      72      16    2011     7db /tmp/cpufreq-nforce2.o.before

Signed-off-by: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c | 44 ++++++++++++++-------------
 1 file changed, 23 insertions(+), 21 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
index f03e9153618e..965ea52767ac 100644
--- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
+++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
@@ -26,9 +26,10 @@
 #define NFORCE2_SAFE_DISTANCE 50
 
 /* Delay in ms between FSB changes */
-//#define NFORCE2_DELAY 10
+/* #define NFORCE2_DELAY 10 */
 
-/* nforce2_chipset:
+/*
+ * nforce2_chipset:
  * FSB is changed using the chipset
  */
 static struct pci_dev *nforce2_chipset_dev;
@@ -36,13 +37,13 @@ static struct pci_dev *nforce2_chipset_dev;
 /* fid:
  * multiplier * 10
  */
-static int fid = 0;
+static int fid;
 
 /* min_fsb, max_fsb:
  * minimum and maximum FSB (= FSB at boot time)
  */
-static int min_fsb = 0;
-static int max_fsb = 0;
+static int min_fsb;
+static int max_fsb;
 
 MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>");
 MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver");
@@ -53,7 +54,7 @@ module_param(min_fsb, int, 0444);
 
 MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)");
 MODULE_PARM_DESC(min_fsb,
-                 "Minimum FSB to use, if not defined: current FSB - 50");
+		"Minimum FSB to use, if not defined: current FSB - 50");
 
 #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "cpufreq-nforce2", msg)
 
@@ -139,7 +140,7 @@ static unsigned int nforce2_fsb_read(int bootfsb)
 
 	/* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */
 	nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
-						0x01EF,PCI_ANY_ID,PCI_ANY_ID,NULL);
+						0x01EF, PCI_ANY_ID, PCI_ANY_ID, NULL);
 	if (!nforce2_sub5)
 		return 0;
 
@@ -147,13 +148,13 @@ static unsigned int nforce2_fsb_read(int bootfsb)
 	fsb /= 1000000;
 
 	/* Check if PLL register is already set */
-	pci_read_config_byte(nforce2_chipset_dev,NFORCE2_PLLENABLE, (u8 *)&temp);
+	pci_read_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
 
-	if(bootfsb || !temp)
+	if (bootfsb || !temp)
 		return fsb;
-		
+
 	/* Use PLL register FSB value */
-	pci_read_config_dword(nforce2_chipset_dev,NFORCE2_PLLREG, &temp);
+	pci_read_config_dword(nforce2_chipset_dev, NFORCE2_PLLREG, &temp);
 	fsb = nforce2_calc_fsb(temp);
 
 	return fsb;
@@ -184,7 +185,7 @@ static int nforce2_set_fsb(unsigned int fsb)
 	}
 
 	/* First write? Then set actual value */
-	pci_read_config_byte(nforce2_chipset_dev,NFORCE2_PLLENABLE, (u8 *)&temp);
+	pci_read_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
 	if (!temp) {
 		pll = nforce2_calc_pll(tfsb);
 
@@ -210,7 +211,8 @@ static int nforce2_set_fsb(unsigned int fsb)
 			tfsb--;
 
 		/* Calculate the PLL reg. value */
-		if ((pll = nforce2_calc_pll(tfsb)) == -1)
+		pll = nforce2_calc_pll(tfsb);
+		if (pll == -1)
 			return -EINVAL;
 
 		nforce2_write_pll(pll);
@@ -249,7 +251,7 @@ static unsigned int nforce2_get(unsigned int cpu)
 static int nforce2_target(struct cpufreq_policy *policy,
 			  unsigned int target_freq, unsigned int relation)
 {
-//        unsigned long         flags;
+/*        unsigned long         flags; */
 	struct cpufreq_freqs freqs;
 	unsigned int target_fsb;
 
@@ -271,17 +273,17 @@ static int nforce2_target(struct cpufreq_policy *policy,
 	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
 
 	/* Disable IRQs */
-	//local_irq_save(flags);
+	/* local_irq_save(flags); */
 
 	if (nforce2_set_fsb(target_fsb) < 0)
 		printk(KERN_ERR "cpufreq: Changing FSB to %d failed\n",
-                       target_fsb);
+			target_fsb);
 	else
 		dprintk("Changed FSB successfully to %d\n",
-                       target_fsb);
+			target_fsb);
 
 	/* Enable IRQs */
-	//local_irq_restore(flags);
+	/* local_irq_restore(flags); */
 
 	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
 
@@ -302,8 +304,8 @@ static int nforce2_verify(struct cpufreq_policy *policy)
 		policy->max = (fsb_pol_max + 1) * fid * 100;
 
 	cpufreq_verify_within_limits(policy,
-                                     policy->cpuinfo.min_freq,
-                                     policy->cpuinfo.max_freq);
+				     policy->cpuinfo.min_freq,
+				     policy->cpuinfo.max_freq);
 	return 0;
 }
 
@@ -347,7 +349,7 @@ static int nforce2_cpu_init(struct cpufreq_policy *policy)
 	/* Set maximum FSB to FSB at boot time */
 	max_fsb = nforce2_fsb_read(1);
 
-	if(!max_fsb)
+	if (!max_fsb)
 		return -EIO;
 
 	if (!min_fsb)
-- 
cgit v1.2.3


From b8e0418b2a25f975c3b8764030c24b7253d33a68 Mon Sep 17 00:00:00 2001
From: Glauber Costa <gcosta@redhat.com>
Date: Mon, 16 Jun 2008 19:59:08 -0300
Subject: x86: fix typo CONFIX -> CONFIG

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/nmi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 326a8f4f50f8..fd680c73ba77 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -68,7 +68,7 @@ static inline unsigned int get_nmi_count(int cpu)
 
 static inline int mce_in_progress(void)
 {
-#if defined(CONFIX_X86_64) && defined(CONFIG_X86_MCE)
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
 	return atomic_read(&mce_entry) > 0;
 #endif
 	return 0;
-- 
cgit v1.2.3


From 5f0120b5786f5dbe097a946a2eb5d745ebc2b7ed Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Wed, 18 Jun 2008 12:42:11 +0100
Subject: x86-64: remove unnecessary ptregs call stubs

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: "Andi Kleen" <andi@firstfloor.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/ia32/ia32entry.S   | 6 ++----
 arch/x86/kernel/entry_64.S  | 1 -
 include/asm-x86/unistd_64.h | 2 +-
 3 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index b5e329da166c..3aefbce2de48 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -370,13 +370,11 @@ quiet_ni_syscall:
 	PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
 	PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
 	PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
-	PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
 	PTREGSCALL stub32_execve, sys32_execve, %rcx
 	PTREGSCALL stub32_fork, sys_fork, %rdi
 	PTREGSCALL stub32_clone, sys32_clone, %rdx
 	PTREGSCALL stub32_vfork, sys_vfork, %rdi
 	PTREGSCALL stub32_iopl, sys_iopl, %rsi
-	PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
 
 ENTRY(ia32_ptregs_common)
 	popq %r11
@@ -476,7 +474,7 @@ ia32_sys_call_table:
 	.quad sys_ssetmask
 	.quad sys_setreuid16	/* 70 */
 	.quad sys_setregid16
-	.quad stub32_sigsuspend
+	.quad sys32_sigsuspend
 	.quad compat_sys_sigpending
 	.quad sys_sethostname
 	.quad compat_sys_setrlimit	/* 75 */
@@ -583,7 +581,7 @@ ia32_sys_call_table:
 	.quad sys32_rt_sigpending
 	.quad compat_sys_rt_sigtimedwait
 	.quad sys32_rt_sigqueueinfo
-	.quad stub32_rt_sigsuspend
+	.quad sys_rt_sigsuspend
 	.quad sys32_pread		/* 180 */
 	.quad sys32_pwrite
 	.quad sys_chown16
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1f1751dfb2f3..5cf0aa993f4f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -420,7 +420,6 @@ END(\label)
 	PTREGSCALL stub_clone, sys_clone, %r8
 	PTREGSCALL stub_fork, sys_fork, %rdi
 	PTREGSCALL stub_vfork, sys_vfork, %rdi
-	PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
 	PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
 	PTREGSCALL stub_iopl, sys_iopl, %rsi
 
diff --git a/include/asm-x86/unistd_64.h b/include/asm-x86/unistd_64.h
index fe26e36d0f51..9c1a4a3470d9 100644
--- a/include/asm-x86/unistd_64.h
+++ b/include/asm-x86/unistd_64.h
@@ -290,7 +290,7 @@ __SYSCALL(__NR_rt_sigtimedwait, sys_rt_sigtimedwait)
 #define __NR_rt_sigqueueinfo			129
 __SYSCALL(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo)
 #define __NR_rt_sigsuspend			130
-__SYSCALL(__NR_rt_sigsuspend, stub_rt_sigsuspend)
+__SYSCALL(__NR_rt_sigsuspend, sys_rt_sigsuspend)
 #define __NR_sigaltstack			131
 __SYSCALL(__NR_sigaltstack, stub_sigaltstack)
 #define __NR_utime				132
-- 
cgit v1.2.3


From aeaaa59c7e15dcfaaf57ce069ef81683067d575d Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Tue, 17 Jun 2008 11:42:01 -0700
Subject: x86/paravirt/xen: add set_fixmap pv_mmu_ops

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/paravirt.c |  2 ++
 arch/x86/mm/pgtable.c      |  9 +++++++--
 arch/x86/xen/enlighten.c   | 29 +++++++++++++++++++++++++++++
 include/asm-x86/fixmap.h   | 14 ++++++++++++--
 include/asm-x86/paravirt.h | 13 +++++++++++++
 5 files changed, 63 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 74f0c5ea2a03..cf06670349dc 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -416,6 +416,8 @@ struct pv_mmu_ops pv_mmu_ops = {
 		.enter = paravirt_nop,
 		.leave = paravirt_nop,
 	},
+
+	.set_fixmap = native_set_fixmap,
 };
 
 EXPORT_SYMBOL_GPL(pv_time_ops);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 7498124e30fc..e9fb66361fc8 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -277,7 +277,7 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
 
 int fixmaps_set;
 
-void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
+void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
 {
 	unsigned long address = __fix_to_virt(idx);
 
@@ -285,6 +285,11 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
 		BUG();
 		return;
 	}
-	set_pte_vaddr(address, pfn_pte(phys >> PAGE_SHIFT, flags));
+	set_pte_vaddr(address, pte);
 	fixmaps_set++;
 }
+
+void native_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
+{
+	__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
+}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c8a56e457d61..0ad8a64a2e05 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -960,6 +960,33 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
 	return ret;
 }
 
+static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
+{
+	pte_t pte;
+
+	phys >>= PAGE_SHIFT;
+
+	switch (idx) {
+	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
+#ifdef CONFIG_X86_F00F_BUG
+	case FIX_F00F_IDT:
+#endif
+	case FIX_WP_TEST:
+	case FIX_VDSO:
+#ifdef CONFIG_X86_LOCAL_APIC
+	case FIX_APIC_BASE:	/* maps dummy local APIC */
+#endif
+		pte = pfn_pte(phys, prot);
+		break;
+
+	default:
+		pte = mfn_pte(phys, prot);
+		break;
+	}
+
+	__native_set_fixmap(idx, pte);
+}
+
 static const struct pv_info xen_info __initdata = {
 	.paravirt_enabled = 1,
 	.shared_kernel_pmd = 0,
@@ -1112,6 +1139,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
 		.enter = paravirt_enter_lazy_mmu,
 		.leave = xen_leave_lazy,
 	},
+
+	.set_fixmap = xen_set_fixmap,
 };
 
 #ifdef CONFIG_SMP
diff --git a/include/asm-x86/fixmap.h b/include/asm-x86/fixmap.h
index 934d6b49b530..44d4f8217349 100644
--- a/include/asm-x86/fixmap.h
+++ b/include/asm-x86/fixmap.h
@@ -9,8 +9,18 @@
 
 extern int fixmaps_set;
 
-extern void __set_fixmap(enum fixed_addresses idx,
-			 unsigned long phys, pgprot_t flags);
+void __native_set_fixmap(enum fixed_addresses idx, pte_t pte);
+void native_set_fixmap(enum fixed_addresses idx,
+		       unsigned long phys, pgprot_t flags);
+
+#ifndef CONFIG_PARAVIRT
+static inline void __set_fixmap(enum fixed_addresses idx,
+				unsigned long phys, pgprot_t flags)
+{
+	native_set_fixmap(idx, phys, flags);
+}
+#endif
+
 #define set_fixmap(idx, phys)				\
 	__set_fixmap(idx, phys, PAGE_KERNEL)
 
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
index 0f13b945e240..af85caf9a799 100644
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -273,6 +273,13 @@ struct pv_mmu_ops {
 #endif
 
 	struct pv_lazy_ops lazy_mode;
+
+	/* dom0 ops */
+
+	/* Sometimes the physical address is a pfn, and sometimes its
+	   an mfn.  We can tell which is which from the index. */
+	void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx,
+			   unsigned long phys, pgprot_t flags);
 };
 
 /* This contains all the paravirt structures: we get a convenient
@@ -1252,6 +1259,12 @@ static inline void arch_flush_lazy_mmu_mode(void)
 	}
 }
 
+static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
+				unsigned long phys, pgprot_t flags)
+{
+	pv_mmu_ops.set_fixmap(idx, phys, flags);
+}
+
 void _paravirt_nop(void);
 #define paravirt_nop	((void *)_paravirt_nop)
 
-- 
cgit v1.2.3


From 864fe51671c9e44fb9d02765623daac9acc26a8b Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 20 May 2008 19:15:34 +0200
Subject: apm_32: BKL pushdown

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/x86/kernel/apm_32.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index bf9290e29013..82222366477f 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -204,6 +204,7 @@
 #include <linux/module.h>
 
 #include <linux/poll.h>
+#include <linux/smp_lock.h>
 #include <linux/types.h>
 #include <linux/stddef.h>
 #include <linux/timer.h>
@@ -1544,10 +1545,12 @@ static int do_open(struct inode *inode, struct file *filp)
 {
 	struct apm_user *as;
 
+	lock_kernel();
 	as = kmalloc(sizeof(*as), GFP_KERNEL);
 	if (as == NULL) {
 		printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
 		       sizeof(*as));
+		       unlock_kernel();
 		return -ENOMEM;
 	}
 	as->magic = APM_BIOS_MAGIC;
@@ -1569,6 +1572,7 @@ static int do_open(struct inode *inode, struct file *filp)
 	user_list = as;
 	spin_unlock(&user_list_lock);
 	filp->private_data = as;
+	unlock_kernel();
 	return 0;
 }
 
-- 
cgit v1.2.3


From 77149367dade50af8370420265bd4f818cde8afd Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 20 May 2008 19:16:16 +0200
Subject: microcode: BKL pushdown

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/x86/kernel/microcode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
index 69729e38b78a..9f0a994eef9b 100644
--- a/arch/x86/kernel/microcode.c
+++ b/arch/x86/kernel/microcode.c
@@ -75,6 +75,7 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 #include <linux/cpumask.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -422,6 +423,7 @@ out:
 
 static int microcode_open (struct inode *unused1, struct file *unused2)
 {
+	cycle_kernel_lock();
 	return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
 }
 
-- 
cgit v1.2.3


From 0754557d72c1fbfc5fcfd5235e7c23ae6f77248c Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sat, 21 Jun 2008 03:50:47 -0700
Subject: x86: change early_gart_iommu_check() back to any_mapped

Kevin Winchester reported a GART related direct rendering failure against
linux-next-20080611, which shows up via these log entries:

 PCI: Using ACPI for IRQ routing
 PCI: Cannot allocate resource region 0 of device 0000:00:00.0
 agpgart: Detected AGP bridge 0
 agpgart: Aperture conflicts with PCI mapping.
 agpgart: Aperture from AGP @ e0000000 size 128 MB
 agpgart: Aperture conflicts with PCI mapping.
 agpgart: No usable aperture found.
 agpgart: Consider rebooting with iommu=memaper=2 to get a good aperture.

instead of the expected:

 PCI: Using ACPI for IRQ routing
 agpgart: Detected AGP bridge 0
 agpgart: Aperture from AGP @ e0000000 size 128 MB

Kevin bisected it down to this change in tip/x86/gart:
"x86: checking aperture size order".

agp check is using request_mem_region(), and could fail if e820 is reserved...

change it back to e820_any_mapped().

Reported-and-bisected-by: "Kevin Winchester" <kjwinchester@gmail.com>
Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Tested-by: Kevin Winchester <kjwinchester@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/aperture_64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 3409abb231ac..e819362c7068 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -324,8 +324,8 @@ void __init early_gart_iommu_check(void)
 		fix = 1;
 
 	if (gart_fix_e820 && !fix && aper_enabled) {
-		if (!e820_all_mapped(aper_base, aper_base + aper_size,
-				    E820_RESERVED)) {
+		if (e820_any_mapped(aper_base, aper_base + aper_size,
+				    E820_RAM)) {
 			/* reserve it, so we can reuse it in second kernel */
 			printk(KERN_INFO "update e820 for GART\n");
 			add_memory_region(aper_base, aper_size, E820_RESERVED);
-- 
cgit v1.2.3


From 395a59d0f8e86bb39cd700c3d185d30c670bb958 Mon Sep 17 00:00:00 2001
From: Abhishek Sagar <sagar.abhishek@gmail.com>
Date: Sat, 21 Jun 2008 23:47:27 +0530
Subject: ftrace: store mcount address in rec->ip

Record the address of the mcount call-site. Currently all archs except sparc64
record the address of the instruction following the mcount call-site. Some
general cleanups are entailed. Storing mcount addresses in rec->ip enables
looking them up in the kprobe hash table later on to check if they're kprobe'd.

Signed-off-by: Abhishek Sagar <sagar.abhishek@gmail.com>
Cc: davem@davemloft.net
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/arm/kernel/armksyms.c          | 10 +++++-----
 arch/arm/kernel/entry-common.S      |  4 ++++
 arch/arm/kernel/ftrace.c            | 16 +++++++---------
 arch/powerpc/kernel/entry_32.S      |  4 ++++
 arch/powerpc/kernel/entry_64.S      |  5 ++++-
 arch/powerpc/kernel/ftrace.c        | 21 +++++++--------------
 arch/sparc64/kernel/ftrace.c        | 10 ++++++----
 arch/sparc64/kernel/sparc64_ksyms.c |  2 +-
 arch/x86/kernel/entry_32.S          |  4 ++++
 arch/x86/kernel/entry_64.S          |  4 ++++
 arch/x86/kernel/ftrace.c            | 26 +++++++++-----------------
 arch/x86/kernel/i386_ksyms_32.c     |  2 +-
 arch/x86/kernel/x8664_ksyms_64.c    |  2 +-
 include/asm-arm/ftrace.h            | 14 ++++++++++++++
 include/asm-powerpc/ftrace.h        |  8 ++++++++
 include/asm-sparc64/ftrace.h        | 14 ++++++++++++++
 include/asm-x86/ftrace.h            | 14 ++++++++++++++
 include/linux/ftrace.h              |  3 +--
 kernel/trace/ftrace.c               |  3 ++-
 19 files changed, 110 insertions(+), 56 deletions(-)
 create mode 100644 include/asm-arm/ftrace.h
 create mode 100644 include/asm-sparc64/ftrace.h
 create mode 100644 include/asm-x86/ftrace.h

(limited to 'arch/x86/kernel')

diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
index 3b132215cbf8..cc7b246e9652 100644
--- a/arch/arm/kernel/armksyms.c
+++ b/arch/arm/kernel/armksyms.c
@@ -18,6 +18,7 @@
 #include <asm/io.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
+#include <asm/ftrace.h>
 
 /*
  * libgcc functions - functions that are used internally by the
@@ -48,11 +49,6 @@ extern void __aeabi_ulcmp(void);
 extern void fpundefinstr(void);
 extern void fp_enter(void);
 
-#ifdef CONFIG_FTRACE
-extern void mcount(void);
-EXPORT_SYMBOL(mcount);
-#endif
-
 /*
  * This has a special calling convention; it doesn't
  * modify any of the usual registers, except for LR.
@@ -186,3 +182,7 @@ EXPORT_SYMBOL(_find_next_bit_be);
 #endif
 
 EXPORT_SYMBOL(copy_page);
+
+#ifdef CONFIG_FTRACE
+EXPORT_SYMBOL(mcount);
+#endif
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index 8f79a4789ed4..84694e88b428 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -9,6 +9,7 @@
  */
 
 #include <asm/unistd.h>
+#include <asm/ftrace.h>
 #include <asm/arch/entry-macro.S>
 
 #include "entry-header.S"
@@ -104,6 +105,7 @@ ENTRY(ret_from_fork)
 ENTRY(mcount)
 	stmdb sp!, {r0-r3, lr}
 	mov r0, lr
+	sub r0, r0, #MCOUNT_INSN_SIZE
 
 	.globl mcount_call
 mcount_call:
@@ -114,6 +116,7 @@ ENTRY(ftrace_caller)
 	stmdb sp!, {r0-r3, lr}
 	ldr r1, [fp, #-4]
 	mov r0, lr
+	sub r0, r0, #MCOUNT_INSN_SIZE
 
 	.globl ftrace_call
 ftrace_call:
@@ -134,6 +137,7 @@ ENTRY(mcount)
 trace:
 	ldr r1, [fp, #-4]
 	mov r0, lr
+	sub r0, r0, #MCOUNT_INSN_SIZE
 	mov lr, pc
 	mov pc, r2
 	ldmia sp!, {r0-r3, pc}
diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c
index 22f3d6e309f9..76d50e6091bc 100644
--- a/arch/arm/kernel/ftrace.c
+++ b/arch/arm/kernel/ftrace.c
@@ -12,9 +12,10 @@
  */
 
 #include <linux/ftrace.h>
+
 #include <asm/cacheflush.h>
+#include <asm/ftrace.h>
 
-#define INSN_SIZE      4
 #define PC_OFFSET      8
 #define BL_OPCODE      0xeb000000
 #define BL_OFFSET_MASK 0x00ffffff
@@ -32,10 +33,10 @@ unsigned char *ftrace_call_replace(unsigned long pc, unsigned long addr)
 {
 	long offset;
 
-	offset = (long)addr - (long)(pc - INSN_SIZE + PC_OFFSET);
+	offset = (long)addr - (long)(pc + PC_OFFSET);
 	if (unlikely(offset < -33554432 || offset > 33554428)) {
 		/* Can't generate branches that far (from ARM ARM). Ftrace
-		 * doesn't generate branches outside of core kernel text.
+		 * doesn't generate branches outside of kernel text.
 		 */
 		WARN_ON_ONCE(1);
 		return NULL;
@@ -52,7 +53,6 @@ int ftrace_modify_code(unsigned long pc, unsigned char *old_code,
 
 	old = *(unsigned long *)old_code;
 	new = *(unsigned long *)new_code;
-	pc -= INSN_SIZE;
 
 	__asm__ __volatile__ (
 		"1:  ldr    %1, [%2]  \n"
@@ -77,7 +77,7 @@ int ftrace_modify_code(unsigned long pc, unsigned char *old_code,
 		: "memory");
 
 	if (!err && (replaced == old))
-		flush_icache_range(pc, pc + INSN_SIZE);
+		flush_icache_range(pc, pc + MCOUNT_INSN_SIZE);
 
 	return err;
 }
@@ -89,8 +89,7 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
 	unsigned char *new;
 
 	pc = (unsigned long)&ftrace_call;
-	pc += INSN_SIZE;
-	memcpy(&old, &ftrace_call, INSN_SIZE);
+	memcpy(&old, &ftrace_call, MCOUNT_INSN_SIZE);
 	new = ftrace_call_replace(pc, (unsigned long)func);
 	ret = ftrace_modify_code(pc, (unsigned char *)&old, new);
 	return ret;
@@ -103,8 +102,7 @@ int ftrace_mcount_set(unsigned long *data)
 	unsigned char *new;
 
 	pc = (unsigned long)&mcount_call;
-	pc += INSN_SIZE;
-	memcpy(&old, &mcount_call, INSN_SIZE);
+	memcpy(&old, &mcount_call, MCOUNT_INSN_SIZE);
 	new = ftrace_call_replace(pc, *addr);
 	*addr = ftrace_modify_code(pc, (unsigned char *)&old, new);
 	return 0;
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 3b1dd29d9f91..7231a708af0d 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -30,6 +30,7 @@
 #include <asm/ppc_asm.h>
 #include <asm/asm-offsets.h>
 #include <asm/unistd.h>
+#include <asm/ftrace.h>
 
 #undef SHOW_SYSCALLS
 #undef SHOW_SYSCALLS_TASK
@@ -1053,6 +1054,7 @@ _GLOBAL(_mcount)
 	stw	r10,40(r1)
 	stw	r3, 44(r1)
 	stw	r5, 8(r1)
+	subi	r3, r3, MCOUNT_INSN_SIZE
 	.globl mcount_call
 mcount_call:
 	bl	ftrace_stub
@@ -1090,6 +1092,7 @@ _GLOBAL(ftrace_caller)
 	stw	r10,40(r1)
 	stw	r3, 44(r1)
 	stw	r5, 8(r1)
+	subi	r3, r3, MCOUNT_INSN_SIZE
 .globl ftrace_call
 ftrace_call:
 	bl	ftrace_stub
@@ -1128,6 +1131,7 @@ _GLOBAL(_mcount)
 	stw	r3, 44(r1)
 	stw	r5, 8(r1)
 
+	subi	r3, r3, MCOUNT_INSN_SIZE
 	LOAD_REG_ADDR(r5, ftrace_trace_function)
 	lwz	r5,0(r5)
 
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 2c4d9e056ead..2f511a969d2c 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -31,6 +31,7 @@
 #include <asm/bug.h>
 #include <asm/ptrace.h>
 #include <asm/irqflags.h>
+#include <asm/ftrace.h>
 
 /*
  * System calls.
@@ -879,6 +880,7 @@ _GLOBAL(_mcount)
 	mflr	r3
 	stdu	r1, -112(r1)
 	std	r3, 128(r1)
+	subi	r3, r3, MCOUNT_INSN_SIZE
 	.globl mcount_call
 mcount_call:
 	bl	ftrace_stub
@@ -895,6 +897,7 @@ _GLOBAL(ftrace_caller)
 	stdu	r1, -112(r1)
 	std	r3, 128(r1)
 	ld	r4, 16(r11)
+	subi	r3, r3, MCOUNT_INSN_SIZE
 .globl ftrace_call
 ftrace_call:
 	bl	ftrace_stub
@@ -916,7 +919,7 @@ _GLOBAL(_mcount)
 	std	r3, 128(r1)
 	ld	r4, 16(r11)
 
-
+	subi	r3, r3, MCOUNT_INSN_SIZE
 	LOAD_REG_ADDR(r5,ftrace_trace_function)
 	ld	r5,0(r5)
 	ld	r5,0(r5)
diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c
index e12c593ab9ca..3855ceb937b0 100644
--- a/arch/powerpc/kernel/ftrace.c
+++ b/arch/powerpc/kernel/ftrace.c
@@ -15,8 +15,8 @@
 #include <linux/list.h>
 
 #include <asm/cacheflush.h>
+#include <asm/ftrace.h>
 
-#define CALL_BACK		4
 
 static unsigned int ftrace_nop = 0x60000000;
 
@@ -27,9 +27,10 @@ static unsigned int ftrace_nop = 0x60000000;
 # define GET_ADDR(addr) *(unsigned long *)addr
 #endif
 
+
 static unsigned int notrace ftrace_calc_offset(long ip, long addr)
 {
-	return (int)((addr + CALL_BACK) - ip);
+	return (int)(addr - ip);
 }
 
 notrace unsigned char *ftrace_nop_replace(void)
@@ -76,9 +77,6 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 	unsigned new = *(unsigned *)new_code;
 	int faulted = 0;
 
-	/* move the IP back to the start of the call */
-	ip -= CALL_BACK;
-
 	/*
 	 * Note: Due to modules and __init, code can
 	 *  disappear and change, we need to protect against faulting
@@ -118,12 +116,10 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 notrace int ftrace_update_ftrace_func(ftrace_func_t func)
 {
 	unsigned long ip = (unsigned long)(&ftrace_call);
-	unsigned char old[4], *new;
+	unsigned char old[MCOUNT_INSN_SIZE], *new;
 	int ret;
 
-	ip += CALL_BACK;
-
-	memcpy(old, &ftrace_call, 4);
+	memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
 	new = ftrace_call_replace(ip, (unsigned long)func);
 	ret = ftrace_modify_code(ip, old, new);
 
@@ -134,16 +130,13 @@ notrace int ftrace_mcount_set(unsigned long *data)
 {
 	unsigned long ip = (long)(&mcount_call);
 	unsigned long *addr = data;
-	unsigned char old[4], *new;
-
-	/* ip is at the location, but modify code will subtact this */
-	ip += CALL_BACK;
+	unsigned char old[MCOUNT_INSN_SIZE], *new;
 
 	/*
 	 * Replace the mcount stub with a pointer to the
 	 * ip recorder function.
 	 */
-	memcpy(old, &mcount_call, 4);
+	memcpy(old, &mcount_call, MCOUNT_INSN_SIZE);
 	new = ftrace_call_replace(ip, *addr);
 	*addr = ftrace_modify_code(ip, old, new);
 
diff --git a/arch/sparc64/kernel/ftrace.c b/arch/sparc64/kernel/ftrace.c
index c17373195b1e..4298d0aee713 100644
--- a/arch/sparc64/kernel/ftrace.c
+++ b/arch/sparc64/kernel/ftrace.c
@@ -5,6 +5,8 @@
 #include <linux/init.h>
 #include <linux/list.h>
 
+#include <asm/ftrace.h>
+
 static const u32 ftrace_nop = 0x01000000;
 
 notrace unsigned char *ftrace_nop_replace(void)
@@ -60,9 +62,9 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 notrace int ftrace_update_ftrace_func(ftrace_func_t func)
 {
 	unsigned long ip = (unsigned long)(&ftrace_call);
-	unsigned char old[4], *new;
+	unsigned char old[MCOUNT_INSN_SIZE], *new;
 
-	memcpy(old, &ftrace_call, 4);
+	memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
 	new = ftrace_call_replace(ip, (unsigned long)func);
 	return ftrace_modify_code(ip, old, new);
 }
@@ -71,13 +73,13 @@ notrace int ftrace_mcount_set(unsigned long *data)
 {
 	unsigned long ip = (long)(&mcount_call);
 	unsigned long *addr = data;
-	unsigned char old[4], *new;
+	unsigned char old[MCOUNT_INSN_SIZE], *new;
 
 	/*
 	 * Replace the mcount stub with a pointer to the
 	 * ip recorder function.
 	 */
-	memcpy(old, &mcount_call, 4);
+	memcpy(old, &mcount_call, MCOUNT_INSN_SIZE);
 	new = ftrace_call_replace(ip, *addr);
 	*addr = ftrace_modify_code(ip, old, new);
 
diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c
index 8ac0b99f2c55..b80d982a29c6 100644
--- a/arch/sparc64/kernel/sparc64_ksyms.c
+++ b/arch/sparc64/kernel/sparc64_ksyms.c
@@ -53,6 +53,7 @@
 #include <asm/ns87303.h>
 #include <asm/timer.h>
 #include <asm/cpudata.h>
+#include <asm/ftrace.h>
 
 struct poll {
 	int fd;
@@ -112,7 +113,6 @@ EXPORT_SYMBOL(smp_call_function);
 #endif /* CONFIG_SMP */
 
 #if defined(CONFIG_MCOUNT)
-extern void _mcount(void);
 EXPORT_SYMBOL(_mcount);
 #endif
 
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 04ea83ccb979..95e6bbe3665e 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -51,6 +51,7 @@
 #include <asm/percpu.h>
 #include <asm/dwarf2.h>
 #include <asm/processor-flags.h>
+#include <asm/ftrace.h>
 #include "irq_vectors.h"
 
 /*
@@ -1118,6 +1119,7 @@ ENTRY(mcount)
 	pushl %ecx
 	pushl %edx
 	movl 0xc(%esp), %eax
+	subl $MCOUNT_INSN_SIZE, %eax
 
 .globl mcount_call
 mcount_call:
@@ -1136,6 +1138,7 @@ ENTRY(ftrace_caller)
 	pushl %edx
 	movl 0xc(%esp), %eax
 	movl 0x4(%ebp), %edx
+	subl $MCOUNT_INSN_SIZE, %eax
 
 .globl ftrace_call
 ftrace_call:
@@ -1166,6 +1169,7 @@ trace:
 	pushl %edx
 	movl 0xc(%esp), %eax
 	movl 0x4(%ebp), %edx
+	subl $MCOUNT_INSN_SIZE, %eax
 
 	call *ftrace_trace_function
 
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index fe25e5febca3..b0f7308f78a6 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -51,6 +51,7 @@
 #include <asm/page.h>
 #include <asm/irqflags.h>
 #include <asm/paravirt.h>
+#include <asm/ftrace.h>
 
 	.code64
 
@@ -68,6 +69,7 @@ ENTRY(mcount)
 	movq %r9, 48(%rsp)
 
 	movq 0x38(%rsp), %rdi
+	subq $MCOUNT_INSN_SIZE, %rdi
 
 .globl mcount_call
 mcount_call:
@@ -99,6 +101,7 @@ ENTRY(ftrace_caller)
 
 	movq 0x38(%rsp), %rdi
 	movq 8(%rbp), %rsi
+	subq $MCOUNT_INSN_SIZE, %rdi
 
 .globl ftrace_call
 ftrace_call:
@@ -139,6 +142,7 @@ trace:
 
 	movq 0x38(%rsp), %rdi
 	movq 8(%rbp), %rsi
+	subq $MCOUNT_INSN_SIZE, %rdi
 
 	call   *ftrace_trace_function
 
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 55828149e01e..ab115cd15fdf 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -17,20 +17,21 @@
 #include <linux/list.h>
 
 #include <asm/alternative.h>
+#include <asm/ftrace.h>
 
-#define CALL_BACK		5
 
 /* Long is fine, even if it is only 4 bytes ;-) */
 static long *ftrace_nop;
 
 union ftrace_code_union {
-	char code[5];
+	char code[MCOUNT_INSN_SIZE];
 	struct {
 		char e8;
 		int offset;
 	} __attribute__((packed));
 };
 
+
 static int notrace ftrace_calc_offset(long ip, long addr)
 {
 	return (int)(addr - ip);
@@ -46,7 +47,7 @@ notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
 	static union ftrace_code_union calc;
 
 	calc.e8		= 0xe8;
-	calc.offset	= ftrace_calc_offset(ip, addr);
+	calc.offset	= ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
 
 	/*
 	 * No locking needed, this must be called via kstop_machine
@@ -65,9 +66,6 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 	unsigned char newch = new_code[4];
 	int faulted = 0;
 
-	/* move the IP back to the start of the call */
-	ip -= CALL_BACK;
-
 	/*
 	 * Note: Due to modules and __init, code can
 	 *  disappear and change, we need to protect against faulting
@@ -102,12 +100,10 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 notrace int ftrace_update_ftrace_func(ftrace_func_t func)
 {
 	unsigned long ip = (unsigned long)(&ftrace_call);
-	unsigned char old[5], *new;
+	unsigned char old[MCOUNT_INSN_SIZE], *new;
 	int ret;
 
-	ip += CALL_BACK;
-
-	memcpy(old, &ftrace_call, 5);
+	memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
 	new = ftrace_call_replace(ip, (unsigned long)func);
 	ret = ftrace_modify_code(ip, old, new);
 
@@ -118,16 +114,13 @@ notrace int ftrace_mcount_set(unsigned long *data)
 {
 	unsigned long ip = (long)(&mcount_call);
 	unsigned long *addr = data;
-	unsigned char old[5], *new;
-
-	/* ip is at the location, but modify code will subtact this */
-	ip += CALL_BACK;
+	unsigned char old[MCOUNT_INSN_SIZE], *new;
 
 	/*
 	 * Replace the mcount stub with a pointer to the
 	 * ip recorder function.
 	 */
-	memcpy(old, &mcount_call, 5);
+	memcpy(old, &mcount_call, MCOUNT_INSN_SIZE);
 	new = ftrace_call_replace(ip, *addr);
 	*addr = ftrace_modify_code(ip, old, new);
 
@@ -142,8 +135,7 @@ int __init ftrace_dyn_arch_init(void *data)
 
 	ftrace_mcount_set(data);
 
-	ftrace_nop = (unsigned long *)noptable[CALL_BACK];
+	ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE];
 
 	return 0;
 }
-
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 29999dbb754c..dd7ebee446af 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -1,9 +1,9 @@
-#include <linux/ftrace.h>
 #include <linux/module.h>
 
 #include <asm/checksum.h>
 #include <asm/pgtable.h>
 #include <asm/desc.h>
+#include <asm/ftrace.h>
 
 #ifdef CONFIG_FTRACE
 /* mcount is defined in assembly */
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 122885bc5f3b..16ff4bf418d9 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -1,7 +1,6 @@
 /* Exports for assembly files.
    All C exports should go in the respective C files. */
 
-#include <linux/ftrace.h>
 #include <linux/module.h>
 #include <linux/smp.h>
 
@@ -11,6 +10,7 @@
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
 #include <asm/desc.h>
+#include <asm/ftrace.h>
 
 #ifdef CONFIG_FTRACE
 /* mcount is defined in assembly */
diff --git a/include/asm-arm/ftrace.h b/include/asm-arm/ftrace.h
new file mode 100644
index 000000000000..584ef9a8e5a5
--- /dev/null
+++ b/include/asm-arm/ftrace.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_ARM_FTRACE
+#define _ASM_ARM_FTRACE
+
+#ifdef CONFIG_FTRACE
+#define MCOUNT_ADDR		((long)(mcount))
+#define MCOUNT_INSN_SIZE	4 /* sizeof mcount call */
+
+#ifndef __ASSEMBLY__
+extern void mcount(void);
+#endif
+
+#endif
+
+#endif /* _ASM_ARM_FTRACE */
diff --git a/include/asm-powerpc/ftrace.h b/include/asm-powerpc/ftrace.h
index b1bfa704b6e5..de921326cca8 100644
--- a/include/asm-powerpc/ftrace.h
+++ b/include/asm-powerpc/ftrace.h
@@ -1,6 +1,14 @@
 #ifndef _ASM_POWERPC_FTRACE
 #define _ASM_POWERPC_FTRACE
 
+#ifdef CONFIG_FTRACE
+#define MCOUNT_ADDR		((long)(_mcount))
+#define MCOUNT_INSN_SIZE	4 /* sizeof mcount call */
+
+#ifndef __ASSEMBLY__
 extern void _mcount(void);
+#endif
 
 #endif
+
+#endif /* _ASM_POWERPC_FTRACE */
diff --git a/include/asm-sparc64/ftrace.h b/include/asm-sparc64/ftrace.h
new file mode 100644
index 000000000000..f76a40a338bb
--- /dev/null
+++ b/include/asm-sparc64/ftrace.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_SPARC64_FTRACE
+#define _ASM_SPARC64_FTRACE
+
+#ifdef CONFIG_FTRACE
+#define MCOUNT_ADDR		((long)(_mcount))
+#define MCOUNT_INSN_SIZE	4 /* sizeof mcount call */
+
+#ifndef __ASSEMBLY__
+extern void _mcount(void);
+#endif
+
+#endif
+
+#endif /* _ASM_SPARC64_FTRACE */
diff --git a/include/asm-x86/ftrace.h b/include/asm-x86/ftrace.h
new file mode 100644
index 000000000000..c184441133f2
--- /dev/null
+++ b/include/asm-x86/ftrace.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_X86_FTRACE
+#define _ASM_SPARC64_FTRACE
+
+#ifdef CONFIG_FTRACE
+#define MCOUNT_ADDR		((long)(mcount))
+#define MCOUNT_INSN_SIZE	5 /* sizeof mcount call */
+
+#ifndef __ASSEMBLY__
+extern void mcount(void);
+#endif
+
+#endif /* CONFIG_FTRACE */
+
+#endif /* _ASM_X86_FTRACE */
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 20e14d0093c7..366098d591de 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -31,7 +31,6 @@ int unregister_ftrace_function(struct ftrace_ops *ops);
 void clear_ftrace_function(void);
 
 extern void ftrace_stub(unsigned long a0, unsigned long a1);
-extern void mcount(void);
 
 #else /* !CONFIG_FTRACE */
 # define register_ftrace_function(ops) do { } while (0)
@@ -54,7 +53,7 @@ enum {
 
 struct dyn_ftrace {
 	struct hlist_node node;
-	unsigned long	  ip;
+	unsigned long	  ip; /* address of mcount call-site */
 	unsigned long	  flags;
 };
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0d5bcf69952d..f1e9e5c74e64 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -27,6 +27,8 @@
 #include <linux/hash.h>
 #include <linux/list.h>
 
+#include <asm/ftrace.h>
+
 #include "trace.h"
 
 /* ftrace_enabled is a method to turn ftrace on or off */
@@ -329,7 +331,6 @@ ftrace_record_ip(unsigned long ip)
 }
 
 #define FTRACE_ADDR ((long)(ftrace_caller))
-#define MCOUNT_ADDR ((long)(mcount))
 
 static int
 __ftrace_replace_code(struct dyn_ftrace *rec,
-- 
cgit v1.2.3


From 3da757daf86e498872855f0b5e101f763ba79499 Mon Sep 17 00:00:00 2001
From: Alok Kataria <akataria@vmware.com>
Date: Fri, 20 Jun 2008 15:06:33 -0700
Subject: x86: use cpu_khz for loops_per_jiffy calculation

On the x86 platform we can use the value of tsc_khz computed during tsc
calibration to calculate the loops_per_jiffy value. Its very important
to keep the error in lpj values to minimum as any error in that may
result in kernel panic in check_timer. In virtualization environment, On
a highly overloaded host the guest delay calibration may sometimes
result in errors beyond the ~50% that timer_irq_works can handle,
resulting in the guest panicking.

Does some formating changes to lpj_setup code to now have a single
printk to print the bogomips value.

We do this only for the boot processor because the AP's can have
different base frequencies or the BIOS might boot a AP at a different
frequency.

Signed-off-by: Alok N Kataria <akataria@vmware.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Daniel Hecht <dhecht@vmware.com>
Cc: Tim Mann <mann@vmware.com>
Cc: Zach Amsden <zach@vmware.com>
Cc: Sahil Rihan <srihan@vmware.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/time_64.c |  2 ++
 arch/x86/kernel/tsc_32.c  |  5 +++++
 include/linux/delay.h     |  1 +
 init/calibrate.c          | 36 +++++++++++++++++++-----------------
 4 files changed, 27 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index c737849e2ef7..12b4a71bd074 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -123,6 +123,8 @@ void __init time_init(void)
 		(boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
 		cpu_khz = calculate_cpu_khz();
 
+	lpj_tsc = ((unsigned long)tsc_khz * 1000)/HZ;
+
 	if (unsynchronized_tsc())
 		mark_tsc_unstable("TSCs unsynchronized");
 
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
index 068759db63dd..be729035b30b 100644
--- a/arch/x86/kernel/tsc_32.c
+++ b/arch/x86/kernel/tsc_32.c
@@ -401,6 +401,7 @@ static inline void check_geode_tsc_reliable(void) { }
 void __init tsc_init(void)
 {
 	int cpu;
+	u64 lpj;
 
 	if (!cpu_has_tsc || tsc_disabled) {
 		/* Disable the TSC in case of !cpu_has_tsc */
@@ -421,6 +422,10 @@ void __init tsc_init(void)
 		return;
 	}
 
+	lpj = ((u64)tsc_khz * 1000);
+	do_div(lpj, HZ);
+	lpj_tsc = lpj;
+
 	printk("Detected %lu.%03lu MHz processor.\n",
 				(unsigned long)cpu_khz / 1000,
 				(unsigned long)cpu_khz % 1000);
diff --git a/include/linux/delay.h b/include/linux/delay.h
index 54552d21296e..01aec60590ab 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -41,6 +41,7 @@ static inline void ndelay(unsigned long x)
 #define ndelay(x) ndelay(x)
 #endif
 
+extern unsigned long lpj_tsc;
 void calibrate_delay(void);
 void msleep(unsigned int msecs);
 unsigned long msleep_interruptible(unsigned int msecs);
diff --git a/init/calibrate.c b/init/calibrate.c
index ecb3822d4f70..86286974dada 100644
--- a/init/calibrate.c
+++ b/init/calibrate.c
@@ -8,7 +8,9 @@
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/timex.h>
+#include <linux/smp.h>
 
+unsigned long lpj_tsc;
 unsigned long preset_lpj;
 static int __init lpj_setup(char *str)
 {
@@ -108,6 +110,10 @@ static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;}
  * This is the number of bits of precision for the loops_per_jiffy.  Each
  * bit takes on average 1.5/HZ seconds.  This (like the original) is a little
  * better than 1%
+ * For the boot cpu we can skip the delay calibration and assign it a value
+ * calculated based on the tsc frequency.
+ * For the rest of the CPUs we cannot assume that the tsc frequency is same as
+ * the cpu frequency, hence do the calibration for those.
  */
 #define LPS_PREC 8
 
@@ -118,20 +124,20 @@ void __cpuinit calibrate_delay(void)
 
 	if (preset_lpj) {
 		loops_per_jiffy = preset_lpj;
-		printk("Calibrating delay loop (skipped)... "
-			"%lu.%02lu BogoMIPS preset\n",
-			loops_per_jiffy/(500000/HZ),
-			(loops_per_jiffy/(5000/HZ)) % 100);
+		printk(KERN_INFO
+			"Calibrating delay loop (skipped) preset value.. ");
+	} else if ((smp_processor_id() == 0) && lpj_tsc) {
+		loops_per_jiffy = lpj_tsc;
+		printk(KERN_INFO
+			"Calibrating delay loop (skipped), "
+			"using tsc calculated value.. ");
 	} else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) {
-		printk("Calibrating delay using timer specific routine.. ");
-		printk("%lu.%02lu BogoMIPS (lpj=%lu)\n",
-			loops_per_jiffy/(500000/HZ),
-			(loops_per_jiffy/(5000/HZ)) % 100,
-			loops_per_jiffy);
+		printk(KERN_INFO
+			"Calibrating delay using timer specific routine.. ");
 	} else {
 		loops_per_jiffy = (1<<12);
 
-		printk(KERN_DEBUG "Calibrating delay loop... ");
+		printk(KERN_INFO "Calibrating delay loop... ");
 		while ((loops_per_jiffy <<= 1) != 0) {
 			/* wait for "start of" clock tick */
 			ticks = jiffies;
@@ -161,12 +167,8 @@ void __cpuinit calibrate_delay(void)
 			if (jiffies != ticks)	/* longer than 1 tick */
 				loops_per_jiffy &= ~loopbit;
 		}
-
-		/* Round the value and print it */
-		printk("%lu.%02lu BogoMIPS (lpj=%lu)\n",
-			loops_per_jiffy/(500000/HZ),
-			(loops_per_jiffy/(5000/HZ)) % 100,
-			loops_per_jiffy);
 	}
-
+	printk(KERN_INFO "%lu.%02lu BogoMIPS (lpj=%lu)\n",
+			loops_per_jiffy/(500000/HZ),
+			(loops_per_jiffy/(5000/HZ)) % 100, loops_per_jiffy);
 }
-- 
cgit v1.2.3


From 6ff10de374cc68ff2024247793176dc8a1b317ea Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 24 Jun 2008 01:19:49 +0200
Subject: x86: fix "x86: use cpu_khz for loops_per_jiffy calculation"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fix:

arch/x86/kernel/tsc_32.c: In function ‘tsc_init':
arch/x86/kernel/tsc_32.c:421: error: ‘lpj_tsc' undeclared (first use in this function)
arch/x86/kernel/tsc_32.c:421: error: (Each undeclared identifier is reported only once
arch/x86/kernel/tsc_32.c:421: error: for each function it appears in.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tsc_32.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
index be729035b30b..0af49fb533e1 100644
--- a/arch/x86/kernel/tsc_32.c
+++ b/arch/x86/kernel/tsc_32.c
@@ -1,6 +1,7 @@
 #include <linux/sched.h>
 #include <linux/clocksource.h>
 #include <linux/workqueue.h>
+#include <linux/delay.h>
 #include <linux/cpufreq.h>
 #include <linux/jiffies.h>
 #include <linux/init.h>
-- 
cgit v1.2.3


From f3f3149f35b9195ef4b761b1353fc0766b5f53be Mon Sep 17 00:00:00 2001
From: Alok Kataria <akataria@vmware.com>
Date: Mon, 23 Jun 2008 18:21:56 -0700
Subject: x86: use cpu_khz for loops_per_jiffy calculation, cleanup

As suggested by Ingo, remove all references to tsc from init/calibrate.c

TSC is x86 specific, and using tsc in variable names in a generic file should
be avoided. lpj_tsc is now called lpj_fine, since it is related to fine tuning
of lpj value. Also tsc_rate_*  is called timer_rate_*

Signed-off-by: Alok N Kataria <akataria@vmware.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Daniel Hecht <dhecht@vmware.com>
Cc: Tim Mann <mann@vmware.com>
Cc: Zach Amsden <zach@vmware.com>
Cc: Sahil Rihan <srihan@vmware.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/time_64.c |  2 +-
 arch/x86/kernel/tsc_32.c  |  2 +-
 include/linux/delay.h     |  2 +-
 init/calibrate.c          | 36 +++++++++++++++++++-----------------
 4 files changed, 22 insertions(+), 20 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index 12b4a71bd074..39ae8511a137 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -123,7 +123,7 @@ void __init time_init(void)
 		(boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
 		cpu_khz = calculate_cpu_khz();
 
-	lpj_tsc = ((unsigned long)tsc_khz * 1000)/HZ;
+	lpj_fine = ((unsigned long)tsc_khz * 1000)/HZ;
 
 	if (unsynchronized_tsc())
 		mark_tsc_unstable("TSCs unsynchronized");
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
index 0af49fb533e1..048baab77268 100644
--- a/arch/x86/kernel/tsc_32.c
+++ b/arch/x86/kernel/tsc_32.c
@@ -425,7 +425,7 @@ void __init tsc_init(void)
 
 	lpj = ((u64)tsc_khz * 1000);
 	do_div(lpj, HZ);
-	lpj_tsc = lpj;
+	lpj_fine = lpj;
 
 	printk("Detected %lu.%03lu MHz processor.\n",
 				(unsigned long)cpu_khz / 1000,
diff --git a/include/linux/delay.h b/include/linux/delay.h
index 01aec60590ab..fd832c6d419e 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -41,7 +41,7 @@ static inline void ndelay(unsigned long x)
 #define ndelay(x) ndelay(x)
 #endif
 
-extern unsigned long lpj_tsc;
+extern unsigned long lpj_fine;
 void calibrate_delay(void);
 void msleep(unsigned int msecs);
 unsigned long msleep_interruptible(unsigned int msecs);
diff --git a/init/calibrate.c b/init/calibrate.c
index 86286974dada..7963e3fc51d9 100644
--- a/init/calibrate.c
+++ b/init/calibrate.c
@@ -10,7 +10,7 @@
 #include <linux/timex.h>
 #include <linux/smp.h>
 
-unsigned long lpj_tsc;
+unsigned long lpj_fine;
 unsigned long preset_lpj;
 static int __init lpj_setup(char *str)
 {
@@ -35,9 +35,9 @@ static unsigned long __cpuinit calibrate_delay_direct(void)
 	unsigned long pre_start, start, post_start;
 	unsigned long pre_end, end, post_end;
 	unsigned long start_jiffies;
-	unsigned long tsc_rate_min, tsc_rate_max;
-	unsigned long good_tsc_sum = 0;
-	unsigned long good_tsc_count = 0;
+	unsigned long timer_rate_min, timer_rate_max;
+	unsigned long good_timer_sum = 0;
+	unsigned long good_timer_count = 0;
 	int i;
 
 	if (read_current_timer(&pre_start) < 0 )
@@ -81,22 +81,24 @@ static unsigned long __cpuinit calibrate_delay_direct(void)
 		}
 		read_current_timer(&post_end);
 
-		tsc_rate_max = (post_end - pre_start) / DELAY_CALIBRATION_TICKS;
-		tsc_rate_min = (pre_end - post_start) / DELAY_CALIBRATION_TICKS;
+		timer_rate_max = (post_end - pre_start) /
+					DELAY_CALIBRATION_TICKS;
+		timer_rate_min = (pre_end - post_start) /
+					DELAY_CALIBRATION_TICKS;
 
 		/*
-	 	 * If the upper limit and lower limit of the tsc_rate is
+		 * If the upper limit and lower limit of the timer_rate is
 		 * >= 12.5% apart, redo calibration.
 		 */
 		if (pre_start != 0 && pre_end != 0 &&
-		    (tsc_rate_max - tsc_rate_min) < (tsc_rate_max >> 3)) {
-			good_tsc_count++;
-			good_tsc_sum += tsc_rate_max;
+		    (timer_rate_max - timer_rate_min) < (timer_rate_max >> 3)) {
+			good_timer_count++;
+			good_timer_sum += timer_rate_max;
 		}
 	}
 
-	if (good_tsc_count)
-		return (good_tsc_sum/good_tsc_count);
+	if (good_timer_count)
+		return (good_timer_sum/good_timer_count);
 
 	printk(KERN_WARNING "calibrate_delay_direct() failed to get a good "
 	       "estimate for loops_per_jiffy.\nProbably due to long platform interrupts. Consider using \"lpj=\" boot option.\n");
@@ -111,8 +113,8 @@ static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;}
  * bit takes on average 1.5/HZ seconds.  This (like the original) is a little
  * better than 1%
  * For the boot cpu we can skip the delay calibration and assign it a value
- * calculated based on the tsc frequency.
- * For the rest of the CPUs we cannot assume that the tsc frequency is same as
+ * calculated based on the timer frequency.
+ * For the rest of the CPUs we cannot assume that the timer frequency is same as
  * the cpu frequency, hence do the calibration for those.
  */
 #define LPS_PREC 8
@@ -126,11 +128,11 @@ void __cpuinit calibrate_delay(void)
 		loops_per_jiffy = preset_lpj;
 		printk(KERN_INFO
 			"Calibrating delay loop (skipped) preset value.. ");
-	} else if ((smp_processor_id() == 0) && lpj_tsc) {
-		loops_per_jiffy = lpj_tsc;
+	} else if ((smp_processor_id() == 0) && lpj_fine) {
+		loops_per_jiffy = lpj_fine;
 		printk(KERN_INFO
 			"Calibrating delay loop (skipped), "
-			"using tsc calculated value.. ");
+			"value calculated using timer frequency.. ");
 	} else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) {
 		printk(KERN_INFO
 			"Calibrating delay using timer specific routine.. ");
-- 
cgit v1.2.3


From 378fc6eedc0091cc5ba65b298b8967bd2d43df5d Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Tue, 24 Jun 2008 16:21:18 +0100
Subject: x86: arch/x86/kernel/machine_kexec_32.c: remove extra semicolons

I can't see any reason to keep these semicolons.

Signed-off-by: WANG Cong <wangcong@zeuux.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/machine_kexec_32.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index d0b234c9fc31..f4960171bc66 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -39,7 +39,7 @@ static void set_idt(void *newidt, __u16 limit)
 	curidt.address = (unsigned long)newidt;
 
 	load_idt(&curidt);
-};
+}
 
 
 static void set_gdt(void *newgdt, __u16 limit)
@@ -51,7 +51,7 @@ static void set_gdt(void *newgdt, __u16 limit)
 	curgdt.address = (unsigned long)newgdt;
 
 	load_gdt(&curgdt);
-};
+}
 
 static void load_segments(void)
 {
-- 
cgit v1.2.3


From 08b882c627aeeeb3cfd3c4354f0d360d7949549d Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Mon, 16 Jun 2008 04:30:01 -0700
Subject: paravirt: add hooks for ptep_modify_prot_start/commit

This patch adds paravirt-ops hooks in pv_mmu_ops for ptep_modify_prot_start and
ptep_modify_prot_commit.  This allows the hypervisor-specific backends to
implement these in some more efficient way.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/paravirt.c |  3 +++
 arch/x86/xen/enlighten.c   |  3 +++
 include/asm-x86/paravirt.h | 28 ++++++++++++++++++++++++++++
 3 files changed, 34 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c98d54688180..f1ab0f727007 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -380,6 +380,9 @@ struct pv_mmu_ops pv_mmu_ops = {
 	.pte_update = paravirt_nop,
 	.pte_update_defer = paravirt_nop,
 
+	.ptep_modify_prot_start = __ptep_modify_prot_start,
+	.ptep_modify_prot_commit = __ptep_modify_prot_commit,
+
 #ifdef CONFIG_HIGHPTE
 	.kmap_atomic_pte = kmap_atomic,
 #endif
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 73fb0c4c150a..0b7553cbc529 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1138,6 +1138,9 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
 	.set_pte_at = xen_set_pte_at,
 	.set_pmd = xen_set_pmd_hyper,
 
+	.ptep_modify_prot_start = __ptep_modify_prot_start,
+	.ptep_modify_prot_commit = __ptep_modify_prot_commit,
+
 	.pte_val = xen_pte_val,
 	.pte_flags = native_pte_val,
 	.pgd_val = xen_pgd_val,
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
index 5ea37a48eecb..e9ada314dfc1 100644
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -238,6 +238,11 @@ struct pv_mmu_ops {
 	void (*pte_update_defer)(struct mm_struct *mm,
 				 unsigned long addr, pte_t *ptep);
 
+	pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
+					pte_t *ptep);
+	void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr,
+					pte_t *ptep, pte_t pte);
+
 	pteval_t (*pte_val)(pte_t);
 	pteval_t (*pte_flags)(pte_t);
 	pte_t (*make_pte)(pteval_t pte);
@@ -1039,6 +1044,29 @@ static inline pgdval_t pgd_val(pgd_t pgd)
 	return ret;
 }
 
+#define  __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
+static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
+					   pte_t *ptep)
+{
+	pteval_t ret;
+
+	ret = PVOP_CALL3(pteval_t, pv_mmu_ops.ptep_modify_prot_start,
+			 mm, addr, ptep);
+
+	return (pte_t) { .pte = ret };
+}
+
+static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+					   pte_t *ptep, pte_t pte)
+{
+	if (sizeof(pteval_t) > sizeof(long))
+		/* 5 arg words */
+		pv_mmu_ops.ptep_modify_prot_commit(mm, addr, ptep, pte);
+	else
+		PVOP_VCALL4(pv_mmu_ops.ptep_modify_prot_commit,
+			    mm, addr, ptep, pte.pte);
+}
+
 static inline void set_pte(pte_t *ptep, pte_t pte)
 {
 	if (sizeof(pteval_t) > sizeof(long))
-- 
cgit v1.2.3


From fa3d319ac6f648dc51cb672a31f482c80a8c2457 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@suse.cz>
Date: Thu, 26 Jun 2008 00:25:43 +0200
Subject: pci-gart_64.c: could we get better explanation?

Add better explanation to pci-gart.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Cc: Dave Jones <davej@codemonkey.org.uk>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Trivial patch monkey <trivial@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/pci-gart_64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index f505c3890358..f20c20a7853d 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -824,10 +824,10 @@ void __init gart_iommu_init(void)
 	wbinvd();
 
 	/*
-	 * Try to workaround a bug (thanks to BenH)
+	 * Try to workaround a bug (thanks to BenH):
 	 * Set unmapped entries to a scratch page instead of 0.
 	 * Any prefetches that hit unmapped entries won't get an bus abort
-	 * then.
+	 * then. (P2P bridge may be prefetching on DMA reads).
 	 */
 	scratch = get_zeroed_page(GFP_KERNEL);
 	if (!scratch)
-- 
cgit v1.2.3


From f6e2e6b6fc465bc3cc4eae8d53fcf573ca1cfa14 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:27:39 +0200
Subject: x86, AMD IOMMU: add defines and structures for ACPI scanning code

This patch adds the required data structures and constants required to parse
the ACPI table for the AMD IOMMU.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu_init.c | 101 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 arch/x86/kernel/amd_iommu_init.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
new file mode 100644
index 000000000000..6fce5ab683d8
--- /dev/null
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ *         Leo Duran <leo.duran@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/pci.h>
+#include <linux/acpi.h>
+#include <linux/gfp.h>
+#include <linux/list.h>
+#include <asm/pci-direct.h>
+#include <asm/amd_iommu_types.h>
+#include <asm/gart.h>
+
+/*
+ * definitions for the ACPI scanning code
+ */
+#define UPDATE_LAST_BDF(x) do {\
+	if ((x) > amd_iommu_last_bdf) \
+		amd_iommu_last_bdf = (x); \
+	} while (0);
+
+#define DEVID(bus, devfn) (((bus) << 8) | (devfn))
+#define PCI_BUS(x) (((x) >> 8) & 0xff)
+#define IVRS_HEADER_LENGTH 48
+#define TBL_SIZE(x) (1 << (PAGE_SHIFT + get_order(amd_iommu_last_bdf * (x))))
+
+#define ACPI_IVHD_TYPE                  0x10
+#define ACPI_IVMD_TYPE_ALL              0x20
+#define ACPI_IVMD_TYPE                  0x21
+#define ACPI_IVMD_TYPE_RANGE            0x22
+
+#define IVHD_DEV_ALL                    0x01
+#define IVHD_DEV_SELECT                 0x02
+#define IVHD_DEV_SELECT_RANGE_START     0x03
+#define IVHD_DEV_RANGE_END              0x04
+#define IVHD_DEV_ALIAS                  0x42
+#define IVHD_DEV_ALIAS_RANGE            0x43
+#define IVHD_DEV_EXT_SELECT             0x46
+#define IVHD_DEV_EXT_SELECT_RANGE       0x47
+
+#define IVHD_FLAG_HT_TUN_EN             0x00
+#define IVHD_FLAG_PASSPW_EN             0x01
+#define IVHD_FLAG_RESPASSPW_EN          0x02
+#define IVHD_FLAG_ISOC_EN               0x03
+
+#define IVMD_FLAG_EXCL_RANGE            0x08
+#define IVMD_FLAG_UNITY_MAP             0x01
+
+#define ACPI_DEVFLAG_INITPASS           0x01
+#define ACPI_DEVFLAG_EXTINT             0x02
+#define ACPI_DEVFLAG_NMI                0x04
+#define ACPI_DEVFLAG_SYSMGT1            0x10
+#define ACPI_DEVFLAG_SYSMGT2            0x20
+#define ACPI_DEVFLAG_LINT0              0x40
+#define ACPI_DEVFLAG_LINT1              0x80
+#define ACPI_DEVFLAG_ATSDIS             0x10000000
+
+struct ivhd_header {
+	u8 type;
+	u8 flags;
+	u16 length;
+	u16 devid;
+	u16 cap_ptr;
+	u64 mmio_phys;
+	u16 pci_seg;
+	u16 info;
+	u32 reserved;
+} __attribute__((packed));
+
+struct ivhd_entry {
+	u8 type;
+	u16 devid;
+	u8 flags;
+	u32 ext;
+} __attribute__((packed));
+
+struct ivmd_header {
+	u8 type;
+	u8 flags;
+	u16 length;
+	u16 devid;
+	u16 aux;
+	u64 resv;
+	u64 range_start;
+	u64 range_length;
+} __attribute__((packed));
+
-- 
cgit v1.2.3


From 928abd2545fe367ea3ff3cb8a5076e1d6d2a9574 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:27:40 +0200
Subject: x86, AMD IOMMU: add data structures to manage the IOMMUs in the
 system

This patch adds the data structures which will contain the information read
from the ACPI table.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu_init.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 6fce5ab683d8..0ad8cf9e7ba2 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -99,3 +99,20 @@ struct ivmd_header {
 	u64 range_length;
 } __attribute__((packed));
 
+static int __initdata amd_iommu_disable;
+
+u16 amd_iommu_last_bdf;
+struct list_head amd_iommu_unity_map;
+unsigned amd_iommu_aperture_order = 26;
+int amd_iommu_isolate;
+
+struct list_head amd_iommu_list;
+struct dev_table_entry *amd_iommu_dev_table;
+u16 *amd_iommu_alias_table;
+struct amd_iommu **amd_iommu_rlookup_table;
+struct protection_domain **amd_iommu_pd_table;
+unsigned long *amd_iommu_pd_alloc_bitmap;
+
+static u32 dev_table_size;
+static u32 alias_table_size;
+static u32 rlookup_table_size;
-- 
cgit v1.2.3


From 3e8064ba59128bcb1405079a0789b27b356832b9 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:27:41 +0200
Subject: x86, AMD IOMMU: add functions to find last possible PCI device for
 IOMMU

This patch adds functions to find the last PCI bus/device/function the IOMMU
driver has to handle. This information is used later to allocate the memory for
the data structures.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu_init.c | 79 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 0ad8cf9e7ba2..ee0b2da027bc 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -116,3 +116,82 @@ unsigned long *amd_iommu_pd_alloc_bitmap;
 static u32 dev_table_size;
 static u32 alias_table_size;
 static u32 rlookup_table_size;
+
+static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
+{
+	u32 cap;
+
+	cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
+	UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
+
+	return 0;
+}
+
+static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
+{
+	u8 *p = (void *)h, *end = (void *)h;
+	struct ivhd_entry *dev;
+
+	p += sizeof(*h);
+	end += h->length;
+
+	find_last_devid_on_pci(PCI_BUS(h->devid),
+			PCI_SLOT(h->devid),
+			PCI_FUNC(h->devid),
+			h->cap_ptr);
+
+	while (p < end) {
+		dev = (struct ivhd_entry *)p;
+		switch (dev->type) {
+		case IVHD_DEV_SELECT:
+		case IVHD_DEV_RANGE_END:
+		case IVHD_DEV_ALIAS:
+		case IVHD_DEV_EXT_SELECT:
+			UPDATE_LAST_BDF(dev->devid);
+			break;
+		default:
+			break;
+		}
+		p += 0x04 << (*p >> 6);
+	}
+
+	WARN_ON(p != end);
+
+	return 0;
+}
+
+static int __init find_last_devid_acpi(struct acpi_table_header *table)
+{
+	int i;
+	u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table;
+	struct ivhd_header *h;
+
+	/*
+	 * Validate checksum here so we don't need to do it when
+	 * we actually parse the table
+	 */
+	for (i = 0; i < table->length; ++i)
+		checksum += p[i];
+	if (checksum != 0)
+		/* ACPI table corrupt */
+		return -ENODEV;
+
+	p += IVRS_HEADER_LENGTH;
+
+	end += table->length;
+	while (p < end) {
+		h = (struct ivhd_header *)p;
+		switch (h->type) {
+		case ACPI_IVHD_TYPE:
+			find_last_devid_from_ivhd(h);
+			break;
+		default:
+			break;
+		}
+		p += h->length;
+	}
+	WARN_ON(p != end);
+
+	return 0;
+}
+
-- 
cgit v1.2.3


From ca7ed057ae25e5e60814f950995f22f051d2e449 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:27:42 +0200
Subject: x86, AMD IOMMU: add amd_iommu_init.c to Makefile

This patch adds the source file amd_iommu_init.c to the kernel Makefile for the
x86 architecture.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/Makefile | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 77807d4769c9..1e4e00aca18c 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -100,6 +100,7 @@ ifeq ($(CONFIG_X86_64),y)
 
         obj-$(CONFIG_GART_IOMMU)	+= pci-gart_64.o aperture_64.o
         obj-$(CONFIG_CALGARY_IOMMU)	+= pci-calgary_64.o tce_64.o
+        obj-$(CONFIG_AMD_IOMMU)		+= amd_iommu_init.o
         obj-$(CONFIG_SWIOTLB)		+= pci-swiotlb_64.o
 
         obj-$(CONFIG_PCI_MMCONFIG)	+= mmconf-fam10h_64.o
-- 
cgit v1.2.3


From 6c56747b46717b4c6a890b35e8518f4be961dc7e Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:27:43 +0200
Subject: x86, AMD IOMMU: add functions for mapping/unmapping the MMIO space

This patch contains two functions to map and unmap the MMIO region of an IOMMU.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu_init.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index ee0b2da027bc..3147e6991005 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -117,6 +117,29 @@ static u32 dev_table_size;
 static u32 alias_table_size;
 static u32 rlookup_table_size;
 
+static u8 * __init iommu_map_mmio_space(u64 address)
+{
+	u8 *ret;
+
+	if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu"))
+		return NULL;
+
+	ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
+	if (ret != NULL)
+		return ret;
+
+	release_mem_region(address, MMIO_REGION_LENGTH);
+
+	return NULL;
+}
+
+static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
+{
+	if (iommu->mmio_base)
+		iounmap(iommu->mmio_base);
+	release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
+}
+
 static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
 {
 	u32 cap;
-- 
cgit v1.2.3


From b2026aa2dce4454950ccd9c410790f310d65696a Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:27:44 +0200
Subject: x86, AMD IOMMU: add functions for programming IOMMU MMIO space

This patch adds the functions required to programm the IOMMU with the MMIO
space.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu_init.c | 60 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 3147e6991005..ffb8ac82e32f 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -117,6 +117,66 @@ static u32 dev_table_size;
 static u32 alias_table_size;
 static u32 rlookup_table_size;
 
+static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
+{
+	u64 start = iommu->exclusion_start & PAGE_MASK;
+	u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
+	u64 entry;
+
+	if (!iommu->exclusion_start)
+		return;
+
+	entry = start | MMIO_EXCL_ENABLE_MASK;
+	memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET,
+			&entry, sizeof(entry));
+
+	entry = limit;
+	memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET,
+			&entry, sizeof(entry));
+}
+
+static void __init iommu_set_device_table(struct amd_iommu *iommu)
+{
+	u32 entry;
+
+	BUG_ON(iommu->mmio_base == NULL);
+
+	entry = virt_to_phys(amd_iommu_dev_table);
+	entry |= (dev_table_size >> 12) - 1;
+	memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
+			&entry, sizeof(entry));
+}
+
+static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
+{
+	u32 ctrl;
+
+	ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+	ctrl |= (1 << bit);
+	writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
+}
+
+static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
+{
+	u32 ctrl;
+
+	ctrl = (u64)readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+	ctrl &= ~(1 << bit);
+	writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
+}
+
+void __init iommu_enable(struct amd_iommu *iommu)
+{
+	u32 ctrl;
+
+	printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at ");
+	print_devid(iommu->devid, 0);
+	printk(" cap 0x%hx\n", iommu->cap_ptr);
+
+	iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
+	ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+}
+
 static u8 * __init iommu_map_mmio_space(u64 address)
 {
 	u8 *ret;
-- 
cgit v1.2.3


From b36ca91e1d2d7e846844820784d57d20ad73dbd8 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:27:45 +0200
Subject: x86, AMD IOMMU: add command buffer (de-)allocation

This patch adds the functions to allocate and deallocate the command buffer for
one IOMMU in the system.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu_init.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index ffb8ac82e32f..c2be3adee877 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -278,3 +278,33 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table)
 	return 0;
 }
 
+static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
+{
+	u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL,
+			get_order(CMD_BUFFER_SIZE));
+	u64 entry = 0;
+
+	if (cmd_buf == NULL)
+		return NULL;
+
+	iommu->cmd_buf_size = CMD_BUFFER_SIZE;
+
+	memset(cmd_buf, 0, CMD_BUFFER_SIZE);
+
+	entry = (u64)virt_to_phys(cmd_buf);
+	entry |= MMIO_CMD_SIZE_512;
+	memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
+			&entry, sizeof(entry));
+
+	iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
+
+	return cmd_buf;
+}
+
+static void __init free_command_buffer(struct amd_iommu *iommu)
+{
+	if (iommu->cmd_buf)
+		free_pages((unsigned long)iommu->cmd_buf,
+				get_order(CMD_BUFFER_SIZE));
+}
+
-- 
cgit v1.2.3


From 3566b7786afd7c14c62726f359df3c827054670b Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:27:46 +0200
Subject: x86, AMD IOMMU: add device table initialization functions

This patch adds functions necessary to initialize the device table from the
ACPI definitions.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu_init.c | 45 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index c2be3adee877..4c37abb3435b 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -308,3 +308,48 @@ static void __init free_command_buffer(struct amd_iommu *iommu)
 				get_order(CMD_BUFFER_SIZE));
 }
 
+static void set_dev_entry_bit(u16 devid, u8 bit)
+{
+	int i = (bit >> 5) & 0x07;
+	int _bit = bit & 0x1f;
+
+	amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
+}
+
+static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags)
+{
+	if (flags & ACPI_DEVFLAG_INITPASS)
+		set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
+	if (flags & ACPI_DEVFLAG_EXTINT)
+		set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS);
+	if (flags & ACPI_DEVFLAG_NMI)
+		set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS);
+	if (flags & ACPI_DEVFLAG_SYSMGT1)
+		set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1);
+	if (flags & ACPI_DEVFLAG_SYSMGT2)
+		set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2);
+	if (flags & ACPI_DEVFLAG_LINT0)
+		set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
+	if (flags & ACPI_DEVFLAG_LINT1)
+		set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
+}
+
+static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
+{
+	amd_iommu_rlookup_table[devid] = iommu;
+}
+
+static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
+{
+	struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+	if (!(m->flags & IVMD_FLAG_EXCL_RANGE))
+		return;
+
+	if (iommu) {
+		set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
+		iommu->exclusion_start = m->range_start;
+		iommu->exclusion_length = m->range_length;
+	}
+}
+
-- 
cgit v1.2.3


From 5d0c8e49f88b908a8fcc913c4b9843108ae8897b Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:27:47 +0200
Subject: x86, AMD IOMMU: add functions for IOMMU hardware initialization from
 ACPI

This patch adds functions to initialize the IOMMU hardware with information
from ACPI and PCI.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu_init.c | 125 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 4c37abb3435b..8ec48f1f39d3 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -353,3 +353,128 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
 	}
 }
 
+static void __init init_iommu_from_pci(struct amd_iommu *iommu)
+{
+	int bus = PCI_BUS(iommu->devid);
+	int dev = PCI_SLOT(iommu->devid);
+	int fn  = PCI_FUNC(iommu->devid);
+	int cap_ptr = iommu->cap_ptr;
+	u32 range;
+
+	iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET);
+
+	range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
+	iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range));
+	iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range));
+}
+
+static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
+					struct ivhd_header *h)
+{
+	u8 *p = (u8 *)h;
+	u8 *end = p, flags = 0;
+	u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
+	u32 ext_flags = 0;
+	bool alias = 0;
+	struct ivhd_entry *e;
+
+	/*
+	 * First set the recommended feature enable bits from ACPI
+	 * into the IOMMU control registers
+	 */
+	h->flags & IVHD_FLAG_HT_TUN_EN ?
+		iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
+		iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
+
+	h->flags & IVHD_FLAG_PASSPW_EN ?
+		iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
+		iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
+
+	h->flags & IVHD_FLAG_RESPASSPW_EN ?
+		iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
+		iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
+
+	h->flags & IVHD_FLAG_ISOC_EN ?
+		iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
+		iommu_feature_disable(iommu, CONTROL_ISOC_EN);
+
+	/*
+	 * make IOMMU memory accesses cache coherent
+	 */
+	iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
+
+	/*
+	 * Done. Now parse the device entries
+	 */
+	p += sizeof(struct ivhd_header);
+	end += h->length;
+
+	while (p < end) {
+		e = (struct ivhd_entry *)p;
+		switch (e->type) {
+		case IVHD_DEV_ALL:
+			for (dev_i = iommu->first_device;
+					dev_i <= iommu->last_device; ++dev_i)
+				set_dev_entry_from_acpi(dev_i, e->flags, 0);
+			break;
+		case IVHD_DEV_SELECT:
+			devid = e->devid;
+			set_dev_entry_from_acpi(devid, e->flags, 0);
+			break;
+		case IVHD_DEV_SELECT_RANGE_START:
+			devid_start = e->devid;
+			flags = e->flags;
+			ext_flags = 0;
+			alias = 0;
+			break;
+		case IVHD_DEV_ALIAS:
+			devid = e->devid;
+			devid_to = e->ext >> 8;
+			set_dev_entry_from_acpi(devid, e->flags, 0);
+			amd_iommu_alias_table[devid] = devid_to;
+			break;
+		case IVHD_DEV_ALIAS_RANGE:
+			devid_start = e->devid;
+			flags = e->flags;
+			devid_to = e->ext >> 8;
+			ext_flags = 0;
+			alias = 1;
+			break;
+		case IVHD_DEV_EXT_SELECT:
+			devid = e->devid;
+			set_dev_entry_from_acpi(devid, e->flags, e->ext);
+			break;
+		case IVHD_DEV_EXT_SELECT_RANGE:
+			devid_start = e->devid;
+			flags = e->flags;
+			ext_flags = e->ext;
+			alias = 0;
+			break;
+		case IVHD_DEV_RANGE_END:
+			devid = e->devid;
+			for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
+				if (alias)
+					amd_iommu_alias_table[dev_i] = devid_to;
+				set_dev_entry_from_acpi(
+						amd_iommu_alias_table[dev_i],
+						flags, ext_flags);
+			}
+			break;
+		default:
+			break;
+		}
+
+		p += 0x04 << (e->type >> 6);
+	}
+}
+
+static int __init init_iommu_devices(struct amd_iommu *iommu)
+{
+	u16 i;
+
+	for (i = iommu->first_device; i <= iommu->last_device; ++i)
+		set_iommu_for_device(iommu, i);
+
+	return 0;
+}
+
-- 
cgit v1.2.3


From e47d402d2df89bb1aa77b7573597a9dd2241aafe Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:27:48 +0200
Subject: x86, AMD IOMMU: add detect code for AMD IOMMU hardware

This patch adds the detection of AMD IOMMU hardware provided on information
from ACPI provided by the BIOS.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu_init.c | 78 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 8ec48f1f39d3..3f4f7b890441 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -478,3 +478,81 @@ static int __init init_iommu_devices(struct amd_iommu *iommu)
 	return 0;
 }
 
+static void __init free_iommu_one(struct amd_iommu *iommu)
+{
+	free_command_buffer(iommu);
+	iommu_unmap_mmio_space(iommu);
+}
+
+static void __init free_iommu_all(void)
+{
+	struct amd_iommu *iommu, *next;
+
+	list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) {
+		list_del(&iommu->list);
+		free_iommu_one(iommu);
+		kfree(iommu);
+	}
+}
+
+static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
+{
+	spin_lock_init(&iommu->lock);
+	list_add_tail(&iommu->list, &amd_iommu_list);
+
+	/*
+	 * Copy data from ACPI table entry to the iommu struct
+	 */
+	iommu->devid = h->devid;
+	iommu->cap_ptr = h->cap_ptr;
+	iommu->mmio_phys = h->mmio_phys;
+	iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
+	if (!iommu->mmio_base)
+		return -ENOMEM;
+
+	iommu_set_device_table(iommu);
+	iommu->cmd_buf = alloc_command_buffer(iommu);
+	if (!iommu->cmd_buf)
+		return -ENOMEM;
+
+	init_iommu_from_pci(iommu);
+	init_iommu_from_acpi(iommu, h);
+	init_iommu_devices(iommu);
+
+	return 0;
+}
+
+static int __init init_iommu_all(struct acpi_table_header *table)
+{
+	u8 *p = (u8 *)table, *end = (u8 *)table;
+	struct ivhd_header *h;
+	struct amd_iommu *iommu;
+	int ret;
+
+	INIT_LIST_HEAD(&amd_iommu_list);
+
+	end += table->length;
+	p += IVRS_HEADER_LENGTH;
+
+	while (p < end) {
+		h = (struct ivhd_header *)p;
+		switch (*p) {
+		case ACPI_IVHD_TYPE:
+			iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
+			if (iommu == NULL)
+				return -ENOMEM;
+			ret = init_iommu_one(iommu, h);
+			if (ret)
+				return ret;
+			break;
+		default:
+			break;
+		}
+		p += h->length;
+
+	}
+	WARN_ON(p != end);
+
+	return 0;
+}
+
-- 
cgit v1.2.3


From be2a022c0dd0f630b06f83de284df53cb60a308f Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:27:49 +0200
Subject: x86, AMD IOMMU: add functions to parse IOMMU memory mapping
 requirements for devices

This patch adds the functions to parse the information about IOMMU exclusion
ranges and required unity mappings for the devices handled by the IOMMU.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu_init.c | 87 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 3f4f7b890441..555fcc9830c8 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -556,3 +556,90 @@ static int __init init_iommu_all(struct acpi_table_header *table)
 	return 0;
 }
 
+static void __init free_unity_maps(void)
+{
+	struct unity_map_entry *entry, *next;
+
+	list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) {
+		list_del(&entry->list);
+		kfree(entry);
+	}
+}
+
+static int __init init_exclusion_range(struct ivmd_header *m)
+{
+	int i;
+
+	switch (m->type) {
+	case ACPI_IVMD_TYPE:
+		set_device_exclusion_range(m->devid, m);
+		break;
+	case ACPI_IVMD_TYPE_ALL:
+		for (i = 0; i < amd_iommu_last_bdf; ++i)
+			set_device_exclusion_range(i, m);
+		break;
+	case ACPI_IVMD_TYPE_RANGE:
+		for (i = m->devid; i <= m->aux; ++i)
+			set_device_exclusion_range(i, m);
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static int __init init_unity_map_range(struct ivmd_header *m)
+{
+	struct unity_map_entry *e = 0;
+
+	e = kzalloc(sizeof(*e), GFP_KERNEL);
+	if (e == NULL)
+		return -ENOMEM;
+
+	switch (m->type) {
+	default:
+	case ACPI_IVMD_TYPE:
+		e->devid_start = e->devid_end = m->devid;
+		break;
+	case ACPI_IVMD_TYPE_ALL:
+		e->devid_start = 0;
+		e->devid_end = amd_iommu_last_bdf;
+		break;
+	case ACPI_IVMD_TYPE_RANGE:
+		e->devid_start = m->devid;
+		e->devid_end = m->aux;
+		break;
+	}
+	e->address_start = PAGE_ALIGN(m->range_start);
+	e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
+	e->prot = m->flags >> 1;
+
+	list_add_tail(&e->list, &amd_iommu_unity_map);
+
+	return 0;
+}
+
+static int __init init_memory_definitions(struct acpi_table_header *table)
+{
+	u8 *p = (u8 *)table, *end = (u8 *)table;
+	struct ivmd_header *m;
+
+	INIT_LIST_HEAD(&amd_iommu_unity_map);
+
+	end += table->length;
+	p += IVRS_HEADER_LENGTH;
+
+	while (p < end) {
+		m = (struct ivmd_header *)p;
+		if (m->flags & IVMD_FLAG_EXCL_RANGE)
+			init_exclusion_range(m);
+		else if (m->flags & IVMD_FLAG_UNITY_MAP)
+			init_unity_map_range(m);
+
+		p += m->length;
+	}
+
+	return 0;
+}
+
-- 
cgit v1.2.3


From fe74c9cf3985e307e9734296d08a270d510e3fb7 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:27:50 +0200
Subject: x86, AMD IOMMU: clue initialization code together

This patch puts the AMD IOMMU ACPI table parsing and hardware initialization
functions together to the main intialization routine.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu_init.c | 126 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 555fcc9830c8..c792ddc4fec6 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -643,3 +643,129 @@ static int __init init_memory_definitions(struct acpi_table_header *table)
 	return 0;
 }
 
+int __init amd_iommu_init(void)
+{
+	int i, ret = 0;
+
+
+	if (amd_iommu_disable) {
+		printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n");
+		return 0;
+	}
+
+	/*
+	 * First parse ACPI tables to find the largest Bus/Dev/Func
+	 * we need to handle. Upon this information the shared data
+	 * structures for the IOMMUs in the system will be allocated
+	 */
+	if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
+		return -ENODEV;
+
+	dev_table_size     = TBL_SIZE(DEV_TABLE_ENTRY_SIZE);
+	alias_table_size   = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE);
+	rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE);
+
+	ret = -ENOMEM;
+
+	/* Device table - directly used by all IOMMUs */
+	amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL,
+				      get_order(dev_table_size));
+	if (amd_iommu_dev_table == NULL)
+		goto out;
+
+	/*
+	 * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the
+	 * IOMMU see for that device
+	 */
+	amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL,
+			get_order(alias_table_size));
+	if (amd_iommu_alias_table == NULL)
+		goto free;
+
+	/* IOMMU rlookup table - find the IOMMU for a specific device */
+	amd_iommu_rlookup_table = (void *)__get_free_pages(GFP_KERNEL,
+			get_order(rlookup_table_size));
+	if (amd_iommu_rlookup_table == NULL)
+		goto free;
+
+	/*
+	 * Protection Domain table - maps devices to protection domains
+	 * This table has the same size as the rlookup_table
+	 */
+	amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL,
+				     get_order(rlookup_table_size));
+	if (amd_iommu_pd_table == NULL)
+		goto free;
+
+	amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL,
+					    get_order(MAX_DOMAIN_ID/8));
+	if (amd_iommu_pd_alloc_bitmap == NULL)
+		goto free;
+
+	/*
+	 * memory is allocated now; initialize the device table with all zeroes
+	 * and let all alias entries point to itself
+	 */
+	memset(amd_iommu_dev_table, 0, dev_table_size);
+	for (i = 0; i < amd_iommu_last_bdf; ++i)
+		amd_iommu_alias_table[i] = i;
+
+	memset(amd_iommu_pd_table, 0, rlookup_table_size);
+	memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8);
+
+	/*
+	 * never allocate domain 0 because its used as the non-allocated and
+	 * error value placeholder
+	 */
+	amd_iommu_pd_alloc_bitmap[0] = 1;
+
+	/*
+	 * now the data structures are allocated and basically initialized
+	 * start the real acpi table scan
+	 */
+	ret = -ENODEV;
+	if (acpi_table_parse("IVRS", init_iommu_all) != 0)
+		goto free;
+
+	if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
+		goto free;
+
+	printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",
+			(1 << (amd_iommu_aperture_order-20)));
+
+	printk(KERN_INFO "AMD IOMMU: device isolation ");
+	if (amd_iommu_isolate)
+		printk("enabled\n");
+	else
+		printk("disabled\n");
+
+out:
+	return ret;
+
+free:
+	if (amd_iommu_pd_alloc_bitmap)
+		free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
+
+	if (amd_iommu_pd_table)
+		free_pages((unsigned long)amd_iommu_pd_table,
+				get_order(rlookup_table_size));
+
+	if (amd_iommu_rlookup_table)
+		free_pages((unsigned long)amd_iommu_rlookup_table,
+				get_order(rlookup_table_size));
+
+	if (amd_iommu_alias_table)
+		free_pages((unsigned long)amd_iommu_alias_table,
+				get_order(alias_table_size));
+
+	if (amd_iommu_dev_table)
+		free_pages((unsigned long)amd_iommu_dev_table,
+				get_order(dev_table_size));
+
+	free_iommu_all();
+
+	free_unity_maps();
+
+	goto out;
+}
+
-- 
cgit v1.2.3


From ae7877de9cd8da075a46d5f615cb9fc9858e4b2f Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:27:51 +0200
Subject: x86, AMD IOMMU: add early detection code

This patch adds code to detect an AMD IOMMU early during boot before the early
detection code of other IOMMUs run.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu_init.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index c792ddc4fec6..d7a75bf0f7e6 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -769,3 +769,23 @@ free:
 	goto out;
 }
 
+static int __init early_amd_iommu_detect(struct acpi_table_header *table)
+{
+	return 0;
+}
+
+void __init amd_iommu_detect(void)
+{
+	if (swiotlb || no_iommu || iommu_detected)
+		return;
+
+	if (amd_iommu_disable)
+		return;
+
+	if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
+		iommu_detected = 1;
+		gart_iommu_aperture_disabled = 1;
+		gart_iommu_aperture = 0;
+	}
+}
+
-- 
cgit v1.2.3


From 918ad6c54d107ccfc4ccaa3a54b97437467fb58a Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:27:52 +0200
Subject: x86, AMD IOMMU: add kernel command line parameters for AMD IOMMU

This patch adds two parameters to the kernel command line to control behavior
of the AMD IOMMU.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu_init.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index d7a75bf0f7e6..ec6f13b69d53 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -789,3 +789,37 @@ void __init amd_iommu_detect(void)
 	}
 }
 
+static int __init parse_amd_iommu_options(char *str)
+{
+	for (; *str; ++str) {
+		if (strcmp(str, "off") == 0)
+			amd_iommu_disable = 1;
+		if (strcmp(str, "isolate") == 0)
+			amd_iommu_isolate = 1;
+	}
+
+	return 1;
+}
+
+static int __init parse_amd_iommu_size_options(char *str)
+{
+	for (; *str; ++str) {
+		if (strcmp(str, "32M") == 0)
+			amd_iommu_aperture_order = 25;
+		if (strcmp(str, "64M") == 0)
+			amd_iommu_aperture_order = 26;
+		if (strcmp(str, "128M") == 0)
+			amd_iommu_aperture_order = 27;
+		if (strcmp(str, "256M") == 0)
+			amd_iommu_aperture_order = 28;
+		if (strcmp(str, "512M") == 0)
+			amd_iommu_aperture_order = 29;
+		if (strcmp(str, "1G") == 0)
+			amd_iommu_aperture_order = 30;
+	}
+
+	return 1;
+}
+
+__setup("amd_iommu=", parse_amd_iommu_options);
+__setup("amd_iommu_size=", parse_amd_iommu_size_options);
-- 
cgit v1.2.3


From b6c02715cff8c96174022166bfa3e1a25436fb34 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:27:53 +0200
Subject: x86, AMD IOMMU: add generic defines and structures for mapping code

This patch adds generic stuff used by the mapping and unmapping code for the
AMD IOMMU.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 arch/x86/kernel/amd_iommu.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
new file mode 100644
index 000000000000..90392c7b253b
--- /dev/null
+++ b/arch/x86/kernel/amd_iommu.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ *         Leo Duran <leo.duran@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/pci.h>
+#include <linux/gfp.h>
+#include <linux/bitops.h>
+#include <linux/scatterlist.h>
+#include <linux/iommu-helper.h>
+#include <asm/proto.h>
+#include <asm/gart.h>
+#include <asm/amd_iommu_types.h>
+
+#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
+
+#define to_pages(addr, size) \
+	 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
+
+static DEFINE_RWLOCK(amd_iommu_devtable_lock);
+
+struct command {
+	u32 data[4];
+};
+
+
-- 
cgit v1.2.3


From 000fca2dfcfbd2859fed1208c38c899ab4873049 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:27:54 +0200
Subject: x86, AMD IOMMU: add amd_iommu.c to Makefile

This patch adds the new amd_iommu.c file to the architecture kernel Makefile
for x86.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 1e4e00aca18c..b34e0bb18b60 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -100,7 +100,7 @@ ifeq ($(CONFIG_X86_64),y)
 
         obj-$(CONFIG_GART_IOMMU)	+= pci-gart_64.o aperture_64.o
         obj-$(CONFIG_CALGARY_IOMMU)	+= pci-calgary_64.o tce_64.o
-        obj-$(CONFIG_AMD_IOMMU)		+= amd_iommu_init.o
+        obj-$(CONFIG_AMD_IOMMU)		+= amd_iommu_init.o amd_iommu.o
         obj-$(CONFIG_SWIOTLB)		+= pci-swiotlb_64.o
 
         obj-$(CONFIG_PCI_MMCONFIG)	+= mmconf-fam10h_64.o
-- 
cgit v1.2.3


From a19ae1eccfb2d97f4704b1a2b3d1d9905845dcac Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:27:55 +0200
Subject: x86, AMD IOMMU: add functions to send IOMMU commands

This patch adds generic handling function as well as all functions to send
specific commands to the IOMMU hardware as required by this driver.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu.c | 106 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 90392c7b253b..a24ee4a5203a 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -37,4 +37,110 @@ struct command {
 	u32 data[4];
 };
 
+static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
+{
+	u32 tail, head;
+	u8 *target;
+
+	tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+	target = (iommu->cmd_buf + tail);
+	memcpy_toio(target, cmd, sizeof(*cmd));
+	tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
+	head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
+	if (tail == head)
+		return -ENOMEM;
+	writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+
+	return 0;
+}
+
+static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&iommu->lock, flags);
+	ret = __iommu_queue_command(iommu, cmd);
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+	return ret;
+}
+
+static int iommu_completion_wait(struct amd_iommu *iommu)
+{
+	int ret;
+	struct command cmd;
+	volatile u64 ready = 0;
+	unsigned long ready_phys = virt_to_phys(&ready);
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK;
+	cmd.data[1] = HIGH_U32(ready_phys);
+	cmd.data[2] = 1; /* value written to 'ready' */
+	CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
+
+	iommu->need_sync = 0;
+
+	ret = iommu_queue_command(iommu, &cmd);
+
+	if (ret)
+		return ret;
+
+	while (!ready)
+		cpu_relax();
+
+	return 0;
+}
+
+static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
+{
+	struct command cmd;
+
+	BUG_ON(iommu == NULL);
+
+	memset(&cmd, 0, sizeof(cmd));
+	CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
+	cmd.data[0] = devid;
+
+	iommu->need_sync = 1;
+
+	return iommu_queue_command(iommu, &cmd);
+}
+
+static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
+		u64 address, u16 domid, int pde, int s)
+{
+	struct command cmd;
+
+	memset(&cmd, 0, sizeof(cmd));
+	address &= PAGE_MASK;
+	CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
+	cmd.data[1] |= domid;
+	cmd.data[2] = LOW_U32(address);
+	cmd.data[3] = HIGH_U32(address);
+	if (s)
+		cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+	if (pde)
+		cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+
+	iommu->need_sync = 1;
+
+	return iommu_queue_command(iommu, &cmd);
+}
+
+static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
+		u64 address, size_t size)
+{
+	int i;
+	unsigned pages = to_pages(address, size);
+
+	address &= PAGE_MASK;
+
+	for (i = 0; i < pages; ++i) {
+		iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 0);
+		address += PAGE_SIZE;
+	}
+
+	return 0;
+}
 
-- 
cgit v1.2.3


From bd0e521158af407ec816aea070831d4ca7ae65e9 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:27:56 +0200
Subject: x86, AMD IOMMU: add functions to initialize unity mappings

This patch adds the functions which will initialize the unity mappings in the
device page tables.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu.c | 122 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a24ee4a5203a..1d70f5e6f438 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -37,6 +37,9 @@ struct command {
 	u32 data[4];
 };
 
+static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
+			     struct unity_map_entry *e);
+
 static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
 {
 	u32 tail, head;
@@ -144,3 +147,122 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
 	return 0;
 }
 
+static int iommu_map(struct protection_domain *dom,
+		     unsigned long bus_addr,
+		     unsigned long phys_addr,
+		     int prot)
+{
+	u64 __pte, *pte, *page;
+
+	bus_addr  = PAGE_ALIGN(bus_addr);
+	phys_addr = PAGE_ALIGN(bus_addr);
+
+	/* only support 512GB address spaces for now */
+	if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
+		return -EINVAL;
+
+	pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
+
+	if (!IOMMU_PTE_PRESENT(*pte)) {
+		page = (u64 *)get_zeroed_page(GFP_KERNEL);
+		if (!page)
+			return -ENOMEM;
+		*pte = IOMMU_L2_PDE(virt_to_phys(page));
+	}
+
+	pte = IOMMU_PTE_PAGE(*pte);
+	pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
+
+	if (!IOMMU_PTE_PRESENT(*pte)) {
+		page = (u64 *)get_zeroed_page(GFP_KERNEL);
+		if (!page)
+			return -ENOMEM;
+		*pte = IOMMU_L1_PDE(virt_to_phys(page));
+	}
+
+	pte = IOMMU_PTE_PAGE(*pte);
+	pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
+
+	if (IOMMU_PTE_PRESENT(*pte))
+		return -EBUSY;
+
+	__pte = phys_addr | IOMMU_PTE_P;
+	if (prot & IOMMU_PROT_IR)
+		__pte |= IOMMU_PTE_IR;
+	if (prot & IOMMU_PROT_IW)
+		__pte |= IOMMU_PTE_IW;
+
+	*pte = __pte;
+
+	return 0;
+}
+
+static int iommu_for_unity_map(struct amd_iommu *iommu,
+			       struct unity_map_entry *entry)
+{
+	u16 bdf, i;
+
+	for (i = entry->devid_start; i <= entry->devid_end; ++i) {
+		bdf = amd_iommu_alias_table[i];
+		if (amd_iommu_rlookup_table[bdf] == iommu)
+			return 1;
+	}
+
+	return 0;
+}
+
+static int iommu_init_unity_mappings(struct amd_iommu *iommu)
+{
+	struct unity_map_entry *entry;
+	int ret;
+
+	list_for_each_entry(entry, &amd_iommu_unity_map, list) {
+		if (!iommu_for_unity_map(iommu, entry))
+			continue;
+		ret = dma_ops_unity_map(iommu->default_dom, entry);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
+			     struct unity_map_entry *e)
+{
+	u64 addr;
+	int ret;
+
+	for (addr = e->address_start; addr < e->address_end;
+	     addr += PAGE_SIZE) {
+		ret = iommu_map(&dma_dom->domain, addr, addr, e->prot);
+		if (ret)
+			return ret;
+		/*
+		 * if unity mapping is in aperture range mark the page
+		 * as allocated in the aperture
+		 */
+		if (addr < dma_dom->aperture_size)
+			__set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap);
+	}
+
+	return 0;
+}
+
+static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
+					  u16 devid)
+{
+	struct unity_map_entry *e;
+	int ret;
+
+	list_for_each_entry(e, &amd_iommu_unity_map, list) {
+		if (!(devid >= e->devid_start && devid <= e->devid_end))
+			continue;
+		ret = dma_ops_unity_map(dma_dom, e);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
-- 
cgit v1.2.3


From d3086444b2988659a50af09462261c78d3012cb4 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:27:57 +0200
Subject: x86, AMD IOMMU: add address allocation and deallocation functions

This patch adds the address allocator to the AMD IOMMU code.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu.c | 48 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 1d70f5e6f438..69d8d024387e 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -266,3 +266,51 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
 	return 0;
 }
 
+static unsigned long dma_mask_to_pages(unsigned long mask)
+{
+	return (mask >> PAGE_SHIFT) +
+		(PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
+}
+
+static unsigned long dma_ops_alloc_addresses(struct device *dev,
+					     struct dma_ops_domain *dom,
+					     unsigned int pages)
+{
+	unsigned long limit = dma_mask_to_pages(*dev->dma_mask);
+	unsigned long address;
+	unsigned long size = dom->aperture_size >> PAGE_SHIFT;
+	unsigned long boundary_size;
+
+	boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
+			PAGE_SIZE) >> PAGE_SHIFT;
+	limit = limit < size ? limit : size;
+
+	if (dom->next_bit >= limit)
+		dom->next_bit = 0;
+
+	address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
+			0 , boundary_size, 0);
+	if (address == -1)
+		address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
+				0, boundary_size, 0);
+
+	if (likely(address != -1)) {
+		set_bit_string(dom->bitmap, address, pages);
+		dom->next_bit = address + pages;
+		address <<= PAGE_SHIFT;
+	} else
+		address = bad_dma_address;
+
+	WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
+
+	return address;
+}
+
+static void dma_ops_free_addresses(struct dma_ops_domain *dom,
+				   unsigned long address,
+				   unsigned int pages)
+{
+	address >>= PAGE_SHIFT;
+	iommu_area_free(dom->bitmap, address, pages);
+}
+
-- 
cgit v1.2.3


From ec487d1a110abfffebdf116560db5c1d71b94cab Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:27:58 +0200
Subject: x86, AMD IOMMU: add domain allocation and deallocation functions

This patch adds the functions to allocate and free protection domains for the
IOMMU.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu.c | 147 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 147 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 69d8d024387e..c43d15dbc8fa 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -314,3 +314,150 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
 	iommu_area_free(dom->bitmap, address, pages);
 }
 
+static u16 domain_id_alloc(void)
+{
+	unsigned long flags;
+	int id;
+
+	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+	id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
+	BUG_ON(id == 0);
+	if (id > 0 && id < MAX_DOMAIN_ID)
+		__set_bit(id, amd_iommu_pd_alloc_bitmap);
+	else
+		id = 0;
+	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+
+	return id;
+}
+
+static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
+				      unsigned long start_page,
+				      unsigned int pages)
+{
+	unsigned int last_page = dom->aperture_size >> PAGE_SHIFT;
+
+	if (start_page + pages > last_page)
+		pages = last_page - start_page;
+
+	set_bit_string(dom->bitmap, start_page, pages);
+}
+
+static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
+{
+	int i, j;
+	u64 *p1, *p2, *p3;
+
+	p1 = dma_dom->domain.pt_root;
+
+	if (!p1)
+		return;
+
+	for (i = 0; i < 512; ++i) {
+		if (!IOMMU_PTE_PRESENT(p1[i]))
+			continue;
+
+		p2 = IOMMU_PTE_PAGE(p1[i]);
+		for (j = 0; j < 512; ++i) {
+			if (!IOMMU_PTE_PRESENT(p2[j]))
+				continue;
+			p3 = IOMMU_PTE_PAGE(p2[j]);
+			free_page((unsigned long)p3);
+		}
+
+		free_page((unsigned long)p2);
+	}
+
+	free_page((unsigned long)p1);
+}
+
+static void dma_ops_domain_free(struct dma_ops_domain *dom)
+{
+	if (!dom)
+		return;
+
+	dma_ops_free_pagetable(dom);
+
+	kfree(dom->pte_pages);
+
+	kfree(dom->bitmap);
+
+	kfree(dom);
+}
+
+static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
+						   unsigned order)
+{
+	struct dma_ops_domain *dma_dom;
+	unsigned i, num_pte_pages;
+	u64 *l2_pde;
+	u64 address;
+
+	/*
+	 * Currently the DMA aperture must be between 32 MB and 1GB in size
+	 */
+	if ((order < 25) || (order > 30))
+		return NULL;
+
+	dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
+	if (!dma_dom)
+		return NULL;
+
+	spin_lock_init(&dma_dom->domain.lock);
+
+	dma_dom->domain.id = domain_id_alloc();
+	if (dma_dom->domain.id == 0)
+		goto free_dma_dom;
+	dma_dom->domain.mode = PAGE_MODE_3_LEVEL;
+	dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
+	dma_dom->domain.priv = dma_dom;
+	if (!dma_dom->domain.pt_root)
+		goto free_dma_dom;
+	dma_dom->aperture_size = (1ULL << order);
+	dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8),
+				  GFP_KERNEL);
+	if (!dma_dom->bitmap)
+		goto free_dma_dom;
+	/*
+	 * mark the first page as allocated so we never return 0 as
+	 * a valid dma-address. So we can use 0 as error value
+	 */
+	dma_dom->bitmap[0] = 1;
+	dma_dom->next_bit = 0;
+
+	if (iommu->exclusion_start &&
+	    iommu->exclusion_start < dma_dom->aperture_size) {
+		unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
+		int pages = to_pages(iommu->exclusion_start,
+				iommu->exclusion_length);
+		dma_ops_reserve_addresses(dma_dom, startpage, pages);
+	}
+
+	num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
+	dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
+			GFP_KERNEL);
+	if (!dma_dom->pte_pages)
+		goto free_dma_dom;
+
+	l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL);
+	if (l2_pde == NULL)
+		goto free_dma_dom;
+
+	dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
+
+	for (i = 0; i < num_pte_pages; ++i) {
+		dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL);
+		if (!dma_dom->pte_pages[i])
+			goto free_dma_dom;
+		address = virt_to_phys(dma_dom->pte_pages[i]);
+		l2_pde[i] = IOMMU_L1_PDE(address);
+	}
+
+	return dma_dom;
+
+free_dma_dom:
+	dma_ops_domain_free(dma_dom);
+
+	return NULL;
+}
+
-- 
cgit v1.2.3


From b20ac0d4d6cff969d7074bbf02a1b86058df0804 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:27:59 +0200
Subject: x86, AMD IOMMU: add functions to find IOMMU device resources

This patch adds functions necessary to find the IOMMU resources for a specific
device.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu.c | 75 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index c43d15dbc8fa..47e80b5814bf 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -461,3 +461,78 @@ free_dma_dom:
 	return NULL;
 }
 
+static struct protection_domain *domain_for_device(u16 devid)
+{
+	struct protection_domain *dom;
+	unsigned long flags;
+
+	read_lock_irqsave(&amd_iommu_devtable_lock, flags);
+	dom = amd_iommu_pd_table[devid];
+	read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+
+	return dom;
+}
+
+static void set_device_domain(struct amd_iommu *iommu,
+			      struct protection_domain *domain,
+			      u16 devid)
+{
+	unsigned long flags;
+
+	u64 pte_root = virt_to_phys(domain->pt_root);
+
+	pte_root |= (domain->mode & 0x07) << 9;
+	pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2;
+
+	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+	amd_iommu_dev_table[devid].data[0] = pte_root;
+	amd_iommu_dev_table[devid].data[1] = pte_root >> 32;
+	amd_iommu_dev_table[devid].data[2] = domain->id;
+
+	amd_iommu_pd_table[devid] = domain;
+	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+
+	iommu_queue_inv_dev_entry(iommu, devid);
+
+	iommu->need_sync = 1;
+}
+
+static int get_device_resources(struct device *dev,
+				struct amd_iommu **iommu,
+				struct protection_domain **domain,
+				u16 *bdf)
+{
+	struct dma_ops_domain *dma_dom;
+	struct pci_dev *pcidev;
+	u16 _bdf;
+
+	BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask);
+
+	pcidev = to_pci_dev(dev);
+	_bdf = (pcidev->bus->number << 8) | pcidev->devfn;
+
+	if (_bdf >= amd_iommu_last_bdf) {
+		*iommu = NULL;
+		*domain = NULL;
+		*bdf = 0xffff;
+		return 0;
+	}
+
+	*bdf = amd_iommu_alias_table[_bdf];
+
+	*iommu = amd_iommu_rlookup_table[*bdf];
+	if (*iommu == NULL)
+		return 0;
+	dma_dom = (*iommu)->default_dom;
+	*domain = domain_for_device(*bdf);
+	if (*domain == NULL) {
+		*domain = &dma_dom->domain;
+		set_device_domain(*iommu, *domain, *bdf);
+		printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
+				"device ", (*domain)->id);
+		print_devid(_bdf, 1);
+	}
+
+	return 1;
+}
+
-- 
cgit v1.2.3


From cb76c3229725c6dcae31da65e9ca57f434628c05 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:28:00 +0200
Subject: x86, AMD IOMMU: add generic dma_ops mapping functions

This patch adds the generic functions to map and unmap pages to a protection
domain for dma_ops usage.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu.c | 105 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 47e80b5814bf..e00a3e7ba356 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -536,3 +536,108 @@ static int get_device_resources(struct device *dev,
 	return 1;
 }
 
+static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
+				     struct dma_ops_domain *dom,
+				     unsigned long address,
+				     phys_addr_t paddr,
+				     int direction)
+{
+	u64 *pte, __pte;
+
+	WARN_ON(address > dom->aperture_size);
+
+	paddr &= PAGE_MASK;
+
+	pte  = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
+	pte += IOMMU_PTE_L0_INDEX(address);
+
+	__pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
+
+	if (direction == DMA_TO_DEVICE)
+		__pte |= IOMMU_PTE_IR;
+	else if (direction == DMA_FROM_DEVICE)
+		__pte |= IOMMU_PTE_IW;
+	else if (direction == DMA_BIDIRECTIONAL)
+		__pte |= IOMMU_PTE_IR | IOMMU_PTE_IW;
+
+	WARN_ON(*pte);
+
+	*pte = __pte;
+
+	return (dma_addr_t)address;
+}
+
+static void dma_ops_domain_unmap(struct amd_iommu *iommu,
+				 struct dma_ops_domain *dom,
+				 unsigned long address)
+{
+	u64 *pte;
+
+	if (address >= dom->aperture_size)
+		return;
+
+	WARN_ON(address & 0xfffULL || address > dom->aperture_size);
+
+	pte  = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
+	pte += IOMMU_PTE_L0_INDEX(address);
+
+	WARN_ON(!*pte);
+
+	*pte = 0ULL;
+}
+
+static dma_addr_t __map_single(struct device *dev,
+			       struct amd_iommu *iommu,
+			       struct dma_ops_domain *dma_dom,
+			       phys_addr_t paddr,
+			       size_t size,
+			       int dir)
+{
+	dma_addr_t offset = paddr & ~PAGE_MASK;
+	dma_addr_t address, start;
+	unsigned int pages;
+	int i;
+
+	pages = to_pages(paddr, size);
+	paddr &= PAGE_MASK;
+
+	address = dma_ops_alloc_addresses(dev, dma_dom, pages);
+	if (unlikely(address == bad_dma_address))
+		goto out;
+
+	start = address;
+	for (i = 0; i < pages; ++i) {
+		dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
+		paddr += PAGE_SIZE;
+		start += PAGE_SIZE;
+	}
+	address += offset;
+
+out:
+	return address;
+}
+
+static void __unmap_single(struct amd_iommu *iommu,
+			   struct dma_ops_domain *dma_dom,
+			   dma_addr_t dma_addr,
+			   size_t size,
+			   int dir)
+{
+	dma_addr_t i, start;
+	unsigned int pages;
+
+	if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size))
+		return;
+
+	pages = to_pages(dma_addr, size);
+	dma_addr &= PAGE_MASK;
+	start = dma_addr;
+
+	for (i = 0; i < pages; ++i) {
+		dma_ops_domain_unmap(iommu, dma_dom, start);
+		start += PAGE_SIZE;
+	}
+
+	dma_ops_free_addresses(dma_dom, dma_addr, pages);
+}
+
-- 
cgit v1.2.3


From 4da70b9e4f8576ec906dba9240c5b6bc6584f91d Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:28:01 +0200
Subject: x86, AMD IOMMU: add dma_ops mapping functions for single mappings

This patch adds the dma_ops specific mapping functions for single mappings.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu.c | 59 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index e00a3e7ba356..b4079f6bbd74 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -40,6 +40,11 @@ struct command {
 static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 			     struct unity_map_entry *e);
 
+static int iommu_has_npcache(struct amd_iommu *iommu)
+{
+	return iommu->cap & IOMMU_CAP_NPCACHE;
+}
+
 static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
 {
 	u32 tail, head;
@@ -641,3 +646,57 @@ static void __unmap_single(struct amd_iommu *iommu,
 	dma_ops_free_addresses(dma_dom, dma_addr, pages);
 }
 
+static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
+			     size_t size, int dir)
+{
+	unsigned long flags;
+	struct amd_iommu *iommu;
+	struct protection_domain *domain;
+	u16 devid;
+	dma_addr_t addr;
+
+	get_device_resources(dev, &iommu, &domain, &devid);
+
+	if (iommu == NULL || domain == NULL)
+		return (dma_addr_t)paddr;
+
+	spin_lock_irqsave(&domain->lock, flags);
+	addr = __map_single(dev, iommu, domain->priv, paddr, size, dir);
+	if (addr == bad_dma_address)
+		goto out;
+
+	if (iommu_has_npcache(iommu))
+		iommu_flush_pages(iommu, domain->id, addr, size);
+
+	if (iommu->need_sync)
+		iommu_completion_wait(iommu);
+
+out:
+	spin_unlock_irqrestore(&domain->lock, flags);
+
+	return addr;
+}
+
+static void unmap_single(struct device *dev, dma_addr_t dma_addr,
+			 size_t size, int dir)
+{
+	unsigned long flags;
+	struct amd_iommu *iommu;
+	struct protection_domain *domain;
+	u16 devid;
+
+	if (!get_device_resources(dev, &iommu, &domain, &devid))
+		return;
+
+	spin_lock_irqsave(&domain->lock, flags);
+
+	__unmap_single(iommu, domain->priv, dma_addr, size, dir);
+
+	iommu_flush_pages(iommu, domain->id, dma_addr, size);
+
+	if (iommu->need_sync)
+		iommu_completion_wait(iommu);
+
+	spin_unlock_irqrestore(&domain->lock, flags);
+}
+
-- 
cgit v1.2.3


From 65b050adbfd9481ec20514cfc06fa596a92cb3b5 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:28:02 +0200
Subject: x86, AMD IOMMU: add mapping functions for scatter gather lists

This patch adds the dma_ops functions for mapping and unmapping scatter gather
lists.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index b4079f6bbd74..f4747fe70aaa 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -700,3 +700,101 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
 	spin_unlock_irqrestore(&domain->lock, flags);
 }
 
+static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
+			   int nelems, int dir)
+{
+	struct scatterlist *s;
+	int i;
+
+	for_each_sg(sglist, s, nelems, i) {
+		s->dma_address = (dma_addr_t)sg_phys(s);
+		s->dma_length  = s->length;
+	}
+
+	return nelems;
+}
+
+static int map_sg(struct device *dev, struct scatterlist *sglist,
+		  int nelems, int dir)
+{
+	unsigned long flags;
+	struct amd_iommu *iommu;
+	struct protection_domain *domain;
+	u16 devid;
+	int i;
+	struct scatterlist *s;
+	phys_addr_t paddr;
+	int mapped_elems = 0;
+
+	get_device_resources(dev, &iommu, &domain, &devid);
+
+	if (!iommu || !domain)
+		return map_sg_no_iommu(dev, sglist, nelems, dir);
+
+	spin_lock_irqsave(&domain->lock, flags);
+
+	for_each_sg(sglist, s, nelems, i) {
+		paddr = sg_phys(s);
+
+		s->dma_address = __map_single(dev, iommu, domain->priv,
+					      paddr, s->length, dir);
+
+		if (s->dma_address) {
+			s->dma_length = s->length;
+			mapped_elems++;
+		} else
+			goto unmap;
+		if (iommu_has_npcache(iommu))
+			iommu_flush_pages(iommu, domain->id, s->dma_address,
+					  s->dma_length);
+	}
+
+	if (iommu->need_sync)
+		iommu_completion_wait(iommu);
+
+out:
+	spin_unlock_irqrestore(&domain->lock, flags);
+
+	return mapped_elems;
+unmap:
+	for_each_sg(sglist, s, mapped_elems, i) {
+		if (s->dma_address)
+			__unmap_single(iommu, domain->priv, s->dma_address,
+				       s->dma_length, dir);
+		s->dma_address = s->dma_length = 0;
+	}
+
+	mapped_elems = 0;
+
+	goto out;
+}
+
+static void unmap_sg(struct device *dev, struct scatterlist *sglist,
+		     int nelems, int dir)
+{
+	unsigned long flags;
+	struct amd_iommu *iommu;
+	struct protection_domain *domain;
+	struct scatterlist *s;
+	u16 devid;
+	int i;
+
+	if (!get_device_resources(dev, &iommu, &domain, &devid))
+		return;
+
+	spin_lock_irqsave(&domain->lock, flags);
+
+	for_each_sg(sglist, s, nelems, i) {
+		__unmap_single(iommu, domain->priv, s->dma_address,
+			       s->dma_length, dir);
+		iommu_flush_pages(iommu, domain->id, s->dma_address,
+				  s->dma_length);
+		s->dma_address = s->dma_length = 0;
+	}
+
+	if (iommu->need_sync)
+		iommu_completion_wait(iommu);
+
+	spin_unlock_irqrestore(&domain->lock, flags);
+}
+
-- 
cgit v1.2.3


From 5d8b53cf3f8762b2230fb3d5b4e2ff78c5e701d8 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:28:03 +0200
Subject: x86, AMD IOMMU: add mapping functions for coherent mappings

This patch adds the dma_ops functions for coherent mappings.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu.c | 74 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index f4747fe70aaa..aab9125ac0b2 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -798,3 +798,77 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
 	spin_unlock_irqrestore(&domain->lock, flags);
 }
 
+static void *alloc_coherent(struct device *dev, size_t size,
+			    dma_addr_t *dma_addr, gfp_t flag)
+{
+	unsigned long flags;
+	void *virt_addr;
+	struct amd_iommu *iommu;
+	struct protection_domain *domain;
+	u16 devid;
+	phys_addr_t paddr;
+
+	virt_addr = (void *)__get_free_pages(flag, get_order(size));
+	if (!virt_addr)
+		return 0;
+
+	memset(virt_addr, 0, size);
+	paddr = virt_to_phys(virt_addr);
+
+	get_device_resources(dev, &iommu, &domain, &devid);
+
+	if (!iommu || !domain) {
+		*dma_addr = (dma_addr_t)paddr;
+		return virt_addr;
+	}
+
+	spin_lock_irqsave(&domain->lock, flags);
+
+	*dma_addr = __map_single(dev, iommu, domain->priv, paddr,
+				 size, DMA_BIDIRECTIONAL);
+
+	if (*dma_addr == bad_dma_address) {
+		free_pages((unsigned long)virt_addr, get_order(size));
+		virt_addr = NULL;
+		goto out;
+	}
+
+	if (iommu_has_npcache(iommu))
+		iommu_flush_pages(iommu, domain->id, *dma_addr, size);
+
+	if (iommu->need_sync)
+		iommu_completion_wait(iommu);
+
+out:
+	spin_unlock_irqrestore(&domain->lock, flags);
+
+	return virt_addr;
+}
+
+static void free_coherent(struct device *dev, size_t size,
+			  void *virt_addr, dma_addr_t dma_addr)
+{
+	unsigned long flags;
+	struct amd_iommu *iommu;
+	struct protection_domain *domain;
+	u16 devid;
+
+	get_device_resources(dev, &iommu, &domain, &devid);
+
+	if (!iommu || !domain)
+		goto free_mem;
+
+	spin_lock_irqsave(&domain->lock, flags);
+
+	__unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
+	iommu_flush_pages(iommu, domain->id, dma_addr, size);
+
+	if (iommu->need_sync)
+		iommu_completion_wait(iommu);
+
+	spin_unlock_irqrestore(&domain->lock, flags);
+
+free_mem:
+	free_pages((unsigned long)virt_addr, get_order(size));
+}
+
-- 
cgit v1.2.3


From c432f3df8ea740e2ecb27da88bd6b5a254714959 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:28:04 +0200
Subject: x86, AMD IOMMU: add pre-allocation of protection domains

This patch adds a function to pre-allocate protection domains. So we don't have
to allocate it on the first request for a device (which can happen in atomic
mode).

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index aab9125ac0b2..bed5f820898d 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -872,3 +872,37 @@ free_mem:
 	free_pages((unsigned long)virt_addr, get_order(size));
 }
 
+/*
+ * If the driver core informs the DMA layer if a driver grabs a device
+ * we don't need to preallocate the protection domains anymore.
+ * For now we have to.
+ */
+void prealloc_protection_domains(void)
+{
+	struct pci_dev *dev = NULL;
+	struct dma_ops_domain *dma_dom;
+	struct amd_iommu *iommu;
+	int order = amd_iommu_aperture_order;
+	u16 devid;
+
+	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+		devid = (dev->bus->number << 8) | dev->devfn;
+		if (devid >= amd_iommu_last_bdf)
+			continue;
+		devid = amd_iommu_alias_table[devid];
+		if (domain_for_device(devid))
+			continue;
+		iommu = amd_iommu_rlookup_table[devid];
+		if (!iommu)
+			continue;
+		dma_dom = dma_ops_domain_alloc(iommu, order);
+		if (!dma_dom)
+			continue;
+		init_unity_mappings_for_device(dma_dom, devid);
+		set_device_domain(iommu, &dma_dom->domain, devid);
+		printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ",
+		       dma_dom->domain.id);
+		print_devid(devid, 1);
+	}
+}
+
-- 
cgit v1.2.3


From 6631ee9d00996e7663a086b5abceba65c89ff8f6 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:28:05 +0200
Subject: x86, AMD IOMMU: add dma_ops initialization function

This patch adds the function to initialize the dma_ops for the AMD IOMMU.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu.c | 46 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index bed5f820898d..c282bdec1650 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -906,3 +906,49 @@ void prealloc_protection_domains(void)
 	}
 }
 
+static struct dma_mapping_ops amd_iommu_dma_ops = {
+	.alloc_coherent = alloc_coherent,
+	.free_coherent = free_coherent,
+	.map_single = map_single,
+	.unmap_single = unmap_single,
+	.map_sg = map_sg,
+	.unmap_sg = unmap_sg,
+};
+
+int __init amd_iommu_init_dma_ops(void)
+{
+	struct amd_iommu *iommu;
+	int order = amd_iommu_aperture_order;
+	int ret;
+
+	list_for_each_entry(iommu, &amd_iommu_list, list) {
+		iommu->default_dom = dma_ops_domain_alloc(iommu, order);
+		if (iommu->default_dom == NULL)
+			return -ENOMEM;
+		ret = iommu_init_unity_mappings(iommu);
+		if (ret)
+			goto free_domains;
+	}
+
+	if (amd_iommu_isolate)
+		prealloc_protection_domains();
+
+	iommu_detected = 1;
+	force_iommu = 1;
+	bad_dma_address = 0;
+	gart_iommu_aperture_disabled = 1;
+	gart_iommu_aperture = 0;
+
+	dma_ops = &amd_iommu_dma_ops;
+
+	return 0;
+
+free_domains:
+
+	list_for_each_entry(iommu, &amd_iommu_list, list) {
+		if (iommu->default_dom)
+			dma_ops_domain_free(iommu->default_dom);
+	}
+
+	return ret;
+}
-- 
cgit v1.2.3


From c6da992e16a9d261eb9dfeff14e9777c3e0468c5 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:28:06 +0200
Subject: x86, AMD IOMMU: add amd_iommu.h to export functions to the generic
 x86 dma code

This patch adds the amd_iommu.h file which will be included in the generic
code.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu.c      |  1 +
 arch/x86/kernel/amd_iommu_init.c |  1 +
 include/asm-x86/amd_iommu.h      | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 34 insertions(+)
 create mode 100644 include/asm-x86/amd_iommu.h

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index c282bdec1650..134dea103247 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -25,6 +25,7 @@
 #include <asm/proto.h>
 #include <asm/gart.h>
 #include <asm/amd_iommu_types.h>
+#include <asm/amd_iommu.h>
 
 #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
 
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index ec6f13b69d53..bae4a76a3b54 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -23,6 +23,7 @@
 #include <linux/list.h>
 #include <asm/pci-direct.h>
 #include <asm/amd_iommu_types.h>
+#include <asm/amd_iommu.h>
 #include <asm/gart.h>
 
 /*
diff --git a/include/asm-x86/amd_iommu.h b/include/asm-x86/amd_iommu.h
new file mode 100644
index 000000000000..30a12049353b
--- /dev/null
+++ b/include/asm-x86/amd_iommu.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ *         Leo Duran <leo.duran@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#ifndef _ASM_X86_AMD_IOMMU_H
+#define _ASM_X86_AMD_IOMMU_H
+
+#ifdef CONFIG_AMD_IOMMU
+extern int amd_iommu_init(void);
+extern int amd_iommu_init_dma_ops(void);
+extern void amd_iommu_detect(void);
+#else
+static inline int amd_iommu_init(void) { return -ENODEV; }
+static inline void amd_iommu_detect(void) { }
+#endif
+
+#endif
-- 
cgit v1.2.3


From 8736197ba8e40d030eec0ab7a9e9f2be41810c4e Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:28:07 +0200
Subject: x86, AMD IOMMU: initialize dma_ops from IOMMU initialization and
 enable IOMMUs

This patch adds a call to the driver specific dma_ops initialization routine
from the AMD IOMMU hardware initialization. Further it adds the necessary code
to finally enable AMD IOMMU hardware.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu_init.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index bae4a76a3b54..d1aa2344cd90 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -644,6 +644,16 @@ static int __init init_memory_definitions(struct acpi_table_header *table)
 	return 0;
 }
 
+static void __init enable_iommus(void)
+{
+	struct amd_iommu *iommu;
+
+	list_for_each_entry(iommu, &amd_iommu_list, list) {
+		iommu_set_exclusion_range(iommu);
+		iommu_enable(iommu);
+	}
+}
+
 int __init amd_iommu_init(void)
 {
 	int i, ret = 0;
@@ -731,6 +741,12 @@ int __init amd_iommu_init(void)
 	if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
 		goto free;
 
+	ret = amd_iommu_init_dma_ops();
+	if (ret)
+		goto free;
+
+	enable_iommus();
+
 	printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",
 			(1 << (amd_iommu_aperture_order-20)));
 
-- 
cgit v1.2.3


From a69ca3401821b7312cb7ec939a8814240fd7b9b3 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 26 Jun 2008 21:28:08 +0200
Subject: AMD_IOMMU: call detect and initialization functions from dma code

This patch adds the function calls to initialize the AMD IOMMU hardware and
dma_ops to the generic DMA code for the x86 architecture.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: Sebastian.Biemueller@amd.com
Cc: robert.richter@amd.com
Cc: joro@8bytes.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/pci-dma.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index dc00a1331ace..dea01672f1f4 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -7,6 +7,7 @@
 #include <asm/dma.h>
 #include <asm/gart.h>
 #include <asm/calgary.h>
+#include <asm/amd_iommu.h>
 
 int forbid_dac __read_mostly;
 EXPORT_SYMBOL(forbid_dac);
@@ -122,6 +123,8 @@ void __init pci_iommu_alloc(void)
 
 	detect_intel_iommu();
 
+	amd_iommu_detect();
+
 #ifdef CONFIG_SWIOTLB
 	pci_swiotlb_init();
 #endif
@@ -502,6 +505,8 @@ static int __init pci_iommu_init(void)
 
 	intel_iommu_init();
 
+	amd_iommu_init();
+
 #ifdef CONFIG_GART_IOMMU
 	gart_iommu_init();
 #endif
-- 
cgit v1.2.3


From 92af4e29020ff096178a00605b3662b3b39d4aa9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 27 Jun 2008 10:48:16 +0200
Subject: x86, AMD IOMMU, build fix #2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fix:

 arch/x86/kernel/amd_iommu.c: In function ‘amd_iommu_init_dma_ops':
 arch/x86/kernel/amd_iommu.c:940: error: lvalue required as left operand of assignment
 arch/x86/kernel/amd_iommu.c:941: error: lvalue required as left operand of assignment

due to !CONFIG_GART_IOMMU.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu.c      | 2 ++
 arch/x86/kernel/amd_iommu_init.c | 2 ++
 include/asm-x86/gart.h           | 5 +++--
 3 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 134dea103247..a1b38561d347 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -937,8 +937,10 @@ int __init amd_iommu_init_dma_ops(void)
 	iommu_detected = 1;
 	force_iommu = 1;
 	bad_dma_address = 0;
+#ifdef CONFIG_GART_IOMMU
 	gart_iommu_aperture_disabled = 1;
 	gart_iommu_aperture = 0;
+#endif
 
 	dma_ops = &amd_iommu_dma_ops;
 
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index d1aa2344cd90..6ab8128db1cc 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -801,8 +801,10 @@ void __init amd_iommu_detect(void)
 
 	if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
 		iommu_detected = 1;
+#ifdef CONFIG_GART_IOMMU
 		gart_iommu_aperture_disabled = 1;
 		gart_iommu_aperture = 0;
+#endif
 	}
 }
 
diff --git a/include/asm-x86/gart.h b/include/asm-x86/gart.h
index 90958ed993fa..ec5785c6e2fd 100644
--- a/include/asm-x86/gart.h
+++ b/include/asm-x86/gart.h
@@ -18,8 +18,9 @@ extern int gart_iommu_aperture_allowed;
 extern int gart_iommu_aperture_disabled;
 extern int fix_aperture;
 #else
-#define gart_iommu_aperture 0
-#define gart_iommu_aperture_allowed 0
+#define gart_iommu_aperture		0
+#define gart_iommu_aperture_allowed	0
+#define gart_iommu_aperture_disabled	1
 
 static inline void early_gart_iommu_check(void)
 {
-- 
cgit v1.2.3


From 9d8ad5d6c7fce31fd2c0fd4fe9977bda3e92e340 Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@gmail.com>
Date: Fri, 27 Jun 2008 17:22:17 +0200
Subject: x86: don't destroy %rbp on kernel-mode faults

From the code:

    "B stepping K8s sometimes report an truncated RIP for IRET exceptions
    returning to compat mode. Check for these here too."

The code then proceeds to truncate the upper 32 bits of %rbp. This means
that when do_page_fault() is finally called, its prologue,

    do_page_fault:
        push %rbp
        movl %rsp, %rbp

will put the truncated base pointer on the stack. This means that the
stack tracer will not be able to follow the base-pointer changes and
will see all subsequent stack frames as unreliable.

This patch changes the code to use a different register (%rcx) for the
checking and leaves %rbp untouched.

Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_64.S | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 556a8df522a7..fa1c9eb7f604 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -926,11 +926,11 @@ error_kernelspace:
 	   iret run with kernel gs again, so don't set the user space flag.
 	   B stepping K8s sometimes report an truncated RIP for IRET 
 	   exceptions returning to compat mode. Check for these here too. */
-	leaq irq_return(%rip),%rbp
-	cmpq %rbp,RIP(%rsp) 
+	leaq irq_return(%rip),%rcx
+	cmpq %rcx,RIP(%rsp)
 	je   error_swapgs
-	movl %ebp,%ebp	/* zero extend */
-	cmpq %rbp,RIP(%rsp) 
+	movl %ecx,%ecx	/* zero extend */
+	cmpq %rcx,RIP(%rsp)
 	je   error_swapgs
 	cmpq $gs_change,RIP(%rsp)
         je   error_swapgs
-- 
cgit v1.2.3


From 44974c8fc1d7047abe414562e0782320f4c1f511 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Mon, 30 Jun 2008 08:11:22 +0900
Subject: x86 gart: remove unnecessary set_bit_string

iommu_area_alloc internally calls set_bit_string and set bits
properly. This set_bit_string is unnecessary.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: joerg.roedel@amd.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/pci-gart_64.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index f20c20a7853d..021f3c684a62 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -104,7 +104,6 @@ static unsigned long alloc_iommu(struct device *dev, int size)
 					  size, base_index, boundary_size, 0);
 	}
 	if (offset != -1) {
-		set_bit_string(iommu_gart_bitmap, offset, size);
 		next_bit = offset+size;
 		if (next_bit >= iommu_pages) {
 			next_bit = 0;
-- 
cgit v1.2.3


From 7441e9cb18a1a1e5b87f516fa97b6d4abb0838e3 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 30 Jun 2008 20:18:02 +0200
Subject: x86, AMD IOMMU: disable suspend/resume with IOMMU enabled (for now)

This patch disables suspend/resume on machines with AMD IOMMU enabled. Real
suspend/resume support for AMD IOMMU is currently being worked on. Until this
is ready it will be disabled to avoid data corruption when the IOMMU is not
properly re-enabled at resume. The patch is based on a similar patch for the
GART driver written by Pavel Machek.

The overall driver merged into tip/master is tested with parallel disk and
network loads and showed no problems in a test running for 3 days.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu_init.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 6ab8128db1cc..5d9e45c7cea2 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -21,6 +21,7 @@
 #include <linux/acpi.h>
 #include <linux/gfp.h>
 #include <linux/list.h>
+#include <linux/sysdev.h>
 #include <asm/pci-direct.h>
 #include <asm/amd_iommu_types.h>
 #include <asm/amd_iommu.h>
@@ -654,6 +655,32 @@ static void __init enable_iommus(void)
 	}
 }
 
+/*
+ * Suspend/Resume support
+ * disable suspend until real resume implemented
+ */
+
+static int amd_iommu_resume(struct sys_device *dev)
+{
+	return 0;
+}
+
+static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
+{
+	return -EINVAL;
+}
+
+static struct sysdev_class amd_iommu_sysdev_class = {
+	.name = "amd_iommu",
+	.suspend = amd_iommu_suspend,
+	.resume = amd_iommu_resume,
+};
+
+static struct sys_device device_amd_iommu = {
+	.id = 0,
+	.cls = &amd_iommu_sysdev_class,
+};
+
 int __init amd_iommu_init(void)
 {
 	int i, ret = 0;
@@ -745,6 +772,14 @@ int __init amd_iommu_init(void)
 	if (ret)
 		goto free;
 
+	ret = sysdev_class_register(&amd_iommu_sysdev_class);
+	if (ret)
+		goto free;
+
+	ret = sysdev_register(&device_amd_iommu);
+	if (ret)
+		goto free;
+
 	enable_iommus();
 
 	printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",
-- 
cgit v1.2.3


From 45fdc3a7624a4a48185a04ae0abab5f9793d8952 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Mon, 30 Jun 2008 14:02:41 -0700
Subject: x86 ptrace: fix PTRACE_GETFPXREGS error

ptrace has always returned only -EIO for all failures to access
registers.  The user_regset calls are allowed to return a more
meaningful variety of errors.  The REGSET_XFP calls use -ENODEV
for !cpu_has_fxsr hardware.  Make ptrace return the traditional
-EIO instead of the error code from the user_regset call.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: stable@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/i387.c   | 4 ++--
 arch/x86/kernel/ptrace.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 95e80e5033c3..eb9ddd8efb82 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -162,7 +162,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
 	int ret;
 
 	if (!cpu_has_fxsr)
-		return -EIO;
+		return -ENODEV;
 
 	ret = init_fpu(target);
 	if (ret)
@@ -179,7 +179,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
 	int ret;
 
 	if (!cpu_has_fxsr)
-		return -EIO;
+		return -ENODEV;
 
 	ret = init_fpu(target);
 	if (ret)
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index a7835f282936..77040b6070e1 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -943,13 +943,13 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		return copy_regset_to_user(child, &user_x86_32_view,
 					   REGSET_XFP,
 					   0, sizeof(struct user_fxsr_struct),
-					   datap);
+					   datap) ? -EIO : 0;
 
 	case PTRACE_SETFPXREGS:	/* Set the child extended FPU state. */
 		return copy_regset_from_user(child, &user_x86_32_view,
 					     REGSET_XFP,
 					     0, sizeof(struct user_fxsr_struct),
-					     datap);
+					     datap) ? -EIO : 0;
 #endif
 
 #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
-- 
cgit v1.2.3


From 38c4c97c62a30aef276663c1128a2051a25ead7d Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 20 May 2008 19:17:02 +0200
Subject: x86-mce: BKL pushdown

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index e07e8c068ae0..4ef151633e8b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -9,6 +9,7 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 #include <linux/string.h>
 #include <linux/rcupdate.h>
 #include <linux/kallsyms.h>
@@ -527,10 +528,12 @@ static int open_exclu;	/* already open exclusive? */
 
 static int mce_open(struct inode *inode, struct file *file)
 {
+	lock_kernel();
 	spin_lock(&mce_state_lock);
 
 	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
 		spin_unlock(&mce_state_lock);
+		unlock_kernel();
 		return -EBUSY;
 	}
 
@@ -539,6 +542,7 @@ static int mce_open(struct inode *inode, struct file *file)
 	open_count++;
 
 	spin_unlock(&mce_state_lock);
+	unlock_kernel();
 
 	return nonseekable_open(inode, file);
 }
-- 
cgit v1.2.3


From bc4e0f9ae2b1b133d36c1392d0145d2997cff272 Mon Sep 17 00:00:00 2001
From: Ben Castricum <lk0806@bencastricum.nl>
Date: Tue, 10 Jun 2008 13:15:12 +0200
Subject: x86: microcode: cosmetic changes

First announce ourself, then start working. Currently this module reports
itself when all is completed which is not most modules do. Plus some
cosmetic/whitespace cleanups.

Signed-off-by: Ben Castricum <lk0806@bencastricum.nl>
Cc: trivial@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/microcode.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
index 69729e38b78a..9758fea87c5b 100644
--- a/arch/x86/kernel/microcode.c
+++ b/arch/x86/kernel/microcode.c
@@ -5,13 +5,14 @@
  *		      2006	Shaohua Li <shaohua.li@intel.com>
  *
  *	This driver allows to upgrade microcode on Intel processors
- *	belonging to IA-32 family - PentiumPro, Pentium II, 
+ *	belonging to IA-32 family - PentiumPro, Pentium II,
  *	Pentium III, Xeon, Pentium 4, etc.
  *
- *	Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual, 
- *	Order Number 245472 or free download from:
- *		
- *	http://developer.intel.com/design/pentium4/manuals/245472.htm
+ *	Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
+ *	Software Developer's Manual
+ *	Order Number 253668 or free download from:
+ *
+ *	http://developer.intel.com/design/pentium4/manuals/253668.htm
  *
  *	For more information, go to http://www.urbanmyth.org/microcode
  *
@@ -58,12 +59,12 @@
  *		nature of implementation.
  *	1.11	22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
  *		Fix the panic when writing zero-length microcode chunk.
- *	1.12	29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, 
+ *	1.12	29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
  *		Jun Nakajima <jun.nakajima@intel.com>
  *		Support for the microcode updates in the new format.
  *	1.13	10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
  *		Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- *		because we no longer hold a copy of applied microcode 
+ *		because we no longer hold a copy of applied microcode
  *		in kernel memory.
  *	1.14	25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
  *		Fix sigmatch() macro to handle old CPUs with pf == 0.
@@ -320,11 +321,11 @@ static void apply_microcode(int cpu)
 		return;
 
 	/* serialize access to the physical write to MSR 0x79 */
-	spin_lock_irqsave(&microcode_update_lock, flags);          
+	spin_lock_irqsave(&microcode_update_lock, flags);
 
 	/* write microcode via MSR 0x79 */
 	wrmsr(MSR_IA32_UCODE_WRITE,
-		(unsigned long) uci->mc->bits, 
+		(unsigned long) uci->mc->bits,
 		(unsigned long) uci->mc->bits >> 16 >> 16);
 	wrmsr(MSR_IA32_UCODE_REV, 0, 0);
 
@@ -341,7 +342,7 @@ static void apply_microcode(int cpu)
 		return;
 	}
 	printk(KERN_INFO "microcode: CPU%d updated from revision "
-	       "0x%x to 0x%x, date = %08x \n", 
+	       "0x%x to 0x%x, date = %08x \n",
 	       cpu_num, uci->rev, val[1], uci->mc->hdr.date);
 	uci->rev = val[1];
 }
@@ -534,7 +535,7 @@ static int cpu_request_microcode(int cpu)
 		c->x86, c->x86_model, c->x86_mask);
 	error = request_firmware(&firmware, name, &microcode_pdev->dev);
 	if (error) {
-		pr_debug("microcode: ucode data file %s load failed\n", name);
+		pr_debug("microcode: data file %s load failed\n", name);
 		return error;
 	}
 	buf = firmware->data;
@@ -805,6 +806,9 @@ static int __init microcode_init (void)
 {
 	int error;
 
+	printk(KERN_INFO
+		"IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
+
 	error = microcode_dev_init();
 	if (error)
 		return error;
@@ -825,9 +829,6 @@ static int __init microcode_init (void)
 	}
 
 	register_hotcpu_notifier(&mc_cpu_notifier);
-
-	printk(KERN_INFO 
-		"IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
 	return 0;
 }
 
-- 
cgit v1.2.3


From 2d144e63098be47c21ad59d68a4fd17bd73a3aaf Mon Sep 17 00:00:00 2001
From: Venki Pallipadi <venkatesh.pallipadi@intel.com>
Date: Tue, 24 Jun 2008 17:12:56 -0700
Subject: x86, mce_64.c: mce_cpu_quirks being ignored

Quirks getting ignored was a bug. Below patch fixes the bug, until
we have the dynamic banks support.

Sysfs choice configuration should not have any issues with the earlier patch
as we look for NR_SYSFS_BANKS in do_machine_check().

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Max Asbock <masbock@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 8c8299ce7ad4..501ca1cea27d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -463,7 +463,11 @@ static void mce_init(void *dummy)
 		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 
 	for (i = 0; i < banks; i++) {
-		wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
+		if (i < NR_SYSFS_BANKS)
+			wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
+		else
+			wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
+
 		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 	}
 }
-- 
cgit v1.2.3


From 5f6a59d8ad55781d4d2ff0d327f84aaeed2c4127 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 3 Jul 2008 19:35:07 +0200
Subject: x86, AMD IOMMU: remove unnecessary set_bit_string

The set_bit_string call in the address allocator is not necessary because its
already called in iommu_area_alloc().

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: robert.richter@amd.com
Cc: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a1b38561d347..329b2c3f2fed 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -301,7 +301,6 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
 				0, boundary_size, 0);
 
 	if (likely(address != -1)) {
-		set_bit_string(dom->bitmap, address, pages);
 		dom->next_bit = address + pages;
 		address <<= PAGE_SHIFT;
 	} else
-- 
cgit v1.2.3


From 999ba417cc1a43881126d08876d5d7e653113ae3 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 3 Jul 2008 19:35:08 +0200
Subject: x86, AMD IOMMU: flush domain TLB when there is more than one page to
 flush

This patch changes the domain TLB flushing behavior of the driver. When there
is more than one page to flush it flushes the whole domain TLB instead of every
single page. So we send only a single command to the IOMMU in every case which
is faster to execute.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: robert.richter@amd.com
Cc: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu.c       | 14 ++++++++++----
 include/asm-x86/amd_iommu_types.h |  2 ++
 2 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 329b2c3f2fed..f2766d84c7a0 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -140,16 +140,22 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
 static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
 		u64 address, size_t size)
 {
-	int i;
+	int s = 0;
 	unsigned pages = to_pages(address, size);
 
 	address &= PAGE_MASK;
 
-	for (i = 0; i < pages; ++i) {
-		iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 0);
-		address += PAGE_SIZE;
+	if (pages > 1) {
+		/*
+		 * If we have to flush more than one page, flush all
+		 * TLB entries for this domain
+		 */
+		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+		s = 1;
 	}
 
+	iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s);
+
 	return 0;
 }
 
diff --git a/include/asm-x86/amd_iommu_types.h b/include/asm-x86/amd_iommu_types.h
index 0f395501ab8e..7bfcb47cc452 100644
--- a/include/asm-x86/amd_iommu_types.h
+++ b/include/asm-x86/amd_iommu_types.h
@@ -93,6 +93,8 @@
 #define CMD_INV_IOMMU_PAGES_SIZE_MASK	0x01
 #define CMD_INV_IOMMU_PAGES_PDE_MASK	0x02
 
+#define CMD_INV_IOMMU_ALL_PAGES_ADDRESS	0x7fffffffffffffffULL
+
 /* macros and definitions for device table entries */
 #define DEV_ENTRY_VALID         0x00
 #define DEV_ENTRY_TRANSLATION   0x01
-- 
cgit v1.2.3


From 8b14518fadd9d5915827d86d5c10e602fedf042e Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 3 Jul 2008 19:35:09 +0200
Subject: x86, AMD IOMMU: honor iommu=off instead of amd_iommu=off

This patch removes the amd_iommu=off kernel parameter and honors the generic
iommu=off parameter for the same purpose.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: robert.richter@amd.com
Cc: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu_init.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 5d9e45c7cea2..c54c82305af4 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -101,8 +101,6 @@ struct ivmd_header {
 	u64 range_length;
 } __attribute__((packed));
 
-static int __initdata amd_iommu_disable;
-
 u16 amd_iommu_last_bdf;
 struct list_head amd_iommu_unity_map;
 unsigned amd_iommu_aperture_order = 26;
@@ -686,7 +684,7 @@ int __init amd_iommu_init(void)
 	int i, ret = 0;
 
 
-	if (amd_iommu_disable) {
+	if (no_iommu) {
 		printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n");
 		return 0;
 	}
@@ -831,9 +829,6 @@ void __init amd_iommu_detect(void)
 	if (swiotlb || no_iommu || iommu_detected)
 		return;
 
-	if (amd_iommu_disable)
-		return;
-
 	if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
 		iommu_detected = 1;
 #ifdef CONFIG_GART_IOMMU
@@ -846,8 +841,6 @@ void __init amd_iommu_detect(void)
 static int __init parse_amd_iommu_options(char *str)
 {
 	for (; *str; ++str) {
-		if (strcmp(str, "off") == 0)
-			amd_iommu_disable = 1;
 		if (strcmp(str, "isolate") == 0)
 			amd_iommu_isolate = 1;
 	}
-- 
cgit v1.2.3


From c1cbebeec4a6be1c1fb54fbf2395ade2534b03c4 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 3 Jul 2008 19:35:10 +0200
Subject: x86, AMD IOMMU: don't try to init IOMMU if early detect code did not
 detect one

This patch adds a check if the early detect code has found AMD IOMMU hardware
descriptions and does not try to initialize hardware if the check failed.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: robert.richter@amd.com
Cc: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu_init.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index c54c82305af4..c970fac6fdc6 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -101,6 +101,8 @@ struct ivmd_header {
 	u64 range_length;
 } __attribute__((packed));
 
+static int __initdata amd_iommu_detected;
+
 u16 amd_iommu_last_bdf;
 struct list_head amd_iommu_unity_map;
 unsigned amd_iommu_aperture_order = 26;
@@ -689,6 +691,9 @@ int __init amd_iommu_init(void)
 		return 0;
 	}
 
+	if (!amd_iommu_detected)
+		return -ENODEV;
+
 	/*
 	 * First parse ACPI tables to find the largest Bus/Dev/Func
 	 * we need to handle. Upon this information the shared data
@@ -831,6 +836,7 @@ void __init amd_iommu_detect(void)
 
 	if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
 		iommu_detected = 1;
+		amd_iommu_detected = 1;
 #ifdef CONFIG_GART_IOMMU
 		gart_iommu_aperture_disabled = 1;
 		gart_iommu_aperture = 0;
-- 
cgit v1.2.3


From fb339690a0506d2aca447b5e888fd6ac39ea4fa2 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 3 Jul 2008 19:35:11 +0200
Subject: x86, AMD IOMMU: remove unnecessary code from the iommu_enable
 function

This code removes a leftover from the iommu_enable function. The ctrl variable
is assigned but never used.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: iommu@lists.linux-foundation.org
Cc: bhavna.sarathy@amd.com
Cc: robert.richter@amd.com
Cc: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu_init.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index c970fac6fdc6..2a13e430437d 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -169,14 +169,11 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
 
 void __init iommu_enable(struct amd_iommu *iommu)
 {
-	u32 ctrl;
-
 	printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at ");
 	print_devid(iommu->devid, 0);
 	printk(" cap 0x%hx\n", iommu->cap_ptr);
 
 	iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
-	ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
 }
 
 static u8 * __init iommu_map_mmio_space(u64 address)
-- 
cgit v1.2.3


From aa276e1cafb3ce9d01d1e837bcd67e92616013ac Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Jun 2008 19:15:00 +0200
Subject: x86, clockevents: add C1E aware idle function

C1E on AMD machines is like C3 but without control from the OS. Up to
now we disabled the local apic timer for those machines as it stops
when the CPU goes into C1E. This excludes those machines from high
resolution timers / dynamic ticks, which hurts especially X2 based
laptops.

The current boot time C1E detection has another, more serious flaw
as well: some BIOSes do not enable C1E until the ACPI processor module
is loaded. This causes systems to stop working after that point.

To work nicely with C1E enabled machines we use a separate idle
function, which checks on idle entry whether C1E was enabled in the
Interrupt Pending Message MSR. This allows us to do timer broadcasting
for C1E and covers the late enablement of C1E as well.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic_32.c    |  5 ++--
 arch/x86/kernel/apic_64.c    | 26 +----------------
 arch/x86/kernel/cpu/amd.c    | 30 --------------------
 arch/x86/kernel/cpu/amd_64.c | 25 -----------------
 arch/x86/kernel/process.c    | 66 ++++++++++++++++++++++++++++++++++++++++++++
 include/asm-x86/apic.h       |  3 --
 kernel/time/tick-broadcast.c |  6 ++--
 7 files changed, 73 insertions(+), 88 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 4b99b1bdeb6c..c44206e731d4 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -64,9 +64,8 @@ static int enable_local_apic __initdata;
 
 /* Local APIC timer verification ok */
 static int local_apic_timer_verify_ok;
-/* Disable local APIC timer from the kernel commandline or via dmi quirk
-   or using CPU MSR check */
-int local_apic_timer_disabled;
+/* Disable local APIC timer from the kernel commandline or via dmi quirk */
+static int local_apic_timer_disabled;
 /* Local APIC timer works in C2 */
 int local_apic_timer_c2_ok;
 EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 0633cfd0dc29..a5cc8447cf4d 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -43,7 +43,7 @@
 #include <mach_ipi.h>
 #include <mach_apic.h>
 
-int disable_apic_timer __cpuinitdata;
+static int disable_apic_timer __cpuinitdata;
 static int apic_calibrate_pmtmr __initdata;
 int disable_apic;
 
@@ -422,32 +422,8 @@ void __init setup_boot_APIC_clock(void)
 	setup_APIC_timer();
 }
 
-/*
- * AMD C1E enabled CPUs have a real nasty problem: Some BIOSes set the
- * C1E flag only in the secondary CPU, so when we detect the wreckage
- * we already have enabled the boot CPU local apic timer. Check, if
- * disable_apic_timer is set and the DUMMY flag is cleared. If yes,
- * set the DUMMY flag again and force the broadcast mode in the
- * clockevents layer.
- */
-static void __cpuinit check_boot_apic_timer_broadcast(void)
-{
-	if (!disable_apic_timer ||
-	    (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY))
-		return;
-
-	printk(KERN_INFO "AMD C1E detected late. Force timer broadcast.\n");
-	lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY;
-
-	local_irq_enable();
-	clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
-			   &boot_cpu_physical_apicid);
-	local_irq_disable();
-}
-
 void __cpuinit setup_secondary_APIC_clock(void)
 {
-	check_boot_apic_timer_broadcast();
 	setup_APIC_timer();
 }
 
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e76b49e7a916..acc891ae5901 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -24,31 +24,6 @@
 extern void vide(void);
 __asm__(".align 4\nvide: ret");
 
-#ifdef CONFIG_X86_LOCAL_APIC
-
-/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
-static __cpuinit int amd_apic_timer_broken(struct cpuinfo_x86 *c)
-{
-	u32 lo, hi;
-
-	if (c->x86 < 0x0F)
-		return 0;
-
-	/* Family 0x0f models < rev F do not have this MSR */
-	if (c->x86 == 0x0f && c->x86_model < 0x40)
-		return 0;
-
-	rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
-	if (lo & K8_INTP_C1E_ACTIVE_MASK) {
-		if (smp_processor_id() != boot_cpu_physical_apicid)
-			printk(KERN_INFO "AMD C1E detected late. "
-			       "Force timer broadcast.\n");
-		return 1;
-	}
-	return 0;
-}
-#endif
-
 int force_mwait __cpuinitdata;
 
 static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
@@ -285,11 +260,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 			num_cache_leaves = 3;
 	}
 
-#ifdef CONFIG_X86_LOCAL_APIC
-	if (amd_apic_timer_broken(c))
-		local_apic_timer_disabled = 1;
-#endif
-
 	/* K6s reports MCEs but don't actually have all the MSRs */
 	if (c->x86 < 6)
 		clear_cpu_cap(c, X86_FEATURE_MCE);
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
index f5fc161d8f2a..f8d20588bde9 100644
--- a/arch/x86/kernel/cpu/amd_64.c
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -110,28 +110,6 @@ static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
 #endif
 }
 
-/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
-static __cpuinit int amd_apic_timer_broken(struct cpuinfo_x86 *c)
-{
-	u32 lo, hi;
-
-	if (c->x86 < 0x0F)
-		return 0;
-
-	/* Family 0x0f models < rev F do not have this MSR */
-	if (c->x86 == 0x0f && c->x86_model < 0x40)
-		return 0;
-
-	rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
-	if (lo & K8_INTP_C1E_ACTIVE_MASK) {
-		if (smp_processor_id() != boot_cpu_physical_apicid)
-			printk(KERN_INFO "AMD C1E detected late. "
-			       "Force timer broadcast.\n");
-		return 1;
-	}
-	return 0;
-}
-
 void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
 {
 	early_init_amd_mc(c);
@@ -212,9 +190,6 @@ void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	if (c->x86 == 0x10)
 		amd_enable_pci_ext_cfg(c);
 
-	if (amd_apic_timer_broken(c))
-		disable_apic_timer = 1;
-
 	if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
 		unsigned long long tseg;
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9fea14607dfe..68ad3539b143 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -6,6 +6,7 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/pm.h>
+#include <linux/clockchips.h>
 
 struct kmem_cache *task_xstate_cachep;
 
@@ -219,6 +220,68 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
 	return (edx & MWAIT_EDX_C1);
 }
 
+/*
+ * Check for AMD CPUs, which have potentially C1E support
+ */
+static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
+{
+	if (c->x86_vendor != X86_VENDOR_AMD)
+		return 0;
+
+	if (c->x86 < 0x0F)
+		return 0;
+
+	/* Family 0x0f models < rev F do not have C1E */
+	if (c->x86 == 0x0f && c->x86_model < 0x40)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * C1E aware idle routine. We check for C1E active in the interrupt
+ * pending message MSR. If we detect C1E, then we handle it the same
+ * way as C3 power states (local apic timer and TSC stop)
+ */
+static void c1e_idle(void)
+{
+	static cpumask_t c1e_mask = CPU_MASK_NONE;
+	static int c1e_detected;
+
+	if (need_resched())
+		return;
+
+	if (!c1e_detected) {
+		u32 lo, hi;
+
+		rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+		if (lo & K8_INTP_C1E_ACTIVE_MASK) {
+			c1e_detected = 1;
+			mark_tsc_unstable("TSC halt in C1E");
+			printk(KERN_INFO "System has C1E enabled\n");
+		}
+	}
+
+	if (c1e_detected) {
+		int cpu = smp_processor_id();
+
+		if (!cpu_isset(cpu, c1e_mask)) {
+			cpu_set(cpu, c1e_mask);
+			/* Force broadcast so ACPI can not interfere */
+			clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
+					   &cpu);
+			printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
+			       cpu);
+		}
+		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
+		default_idle();
+		local_irq_disable();
+		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+		local_irq_enable();
+	} else
+		default_idle();
+}
+
 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_X86_SMP
@@ -236,6 +299,9 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 		 */
 		printk(KERN_INFO "using mwait in idle threads.\n");
 		pm_idle = mwait_idle;
+	} else if (check_c1e_idle(c)) {
+		printk(KERN_INFO "using C1E aware idle routine\n");
+		pm_idle = c1e_idle;
 	} else
 		pm_idle = default_idle;
 }
diff --git a/include/asm-x86/apic.h b/include/asm-x86/apic.h
index be9639a9a186..3c387cda95fa 100644
--- a/include/asm-x86/apic.h
+++ b/include/asm-x86/apic.h
@@ -38,12 +38,9 @@ extern void generic_apic_probe(void);
 extern int apic_verbosity;
 extern int timer_over_8254;
 extern int local_apic_timer_c2_ok;
-extern int local_apic_timer_disabled;
 
-extern int apic_runs_main_timer;
 extern int ioapic_force;
 extern int disable_apic;
-extern int disable_apic_timer;
 
 /*
  * Basic functions accessing APICs.
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 57a1f02e5ec0..67f80c261709 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -30,6 +30,7 @@
 struct tick_device tick_broadcast_device;
 static cpumask_t tick_broadcast_mask;
 static DEFINE_SPINLOCK(tick_broadcast_lock);
+static int tick_broadcast_force;
 
 #ifdef CONFIG_TICK_ONESHOT
 static void tick_broadcast_clear_oneshot(int cpu);
@@ -232,10 +233,11 @@ static void tick_do_broadcast_on_off(void *why)
 						     CLOCK_EVT_MODE_SHUTDOWN);
 		}
 		if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
-			dev->features |= CLOCK_EVT_FEAT_DUMMY;
+			tick_broadcast_force = 1;
 		break;
 	case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
-		if (cpu_isset(cpu, tick_broadcast_mask)) {
+		if (!tick_broadcast_force &&
+		    cpu_isset(cpu, tick_broadcast_mask)) {
 			cpu_clear(cpu, tick_broadcast_mask);
 			if (td->mode == TICKDEV_MODE_PERIODIC)
 				tick_setup_periodic(dev, 0);
-- 
cgit v1.2.3


From 0beefa208bb3a9e581a60125703409ebe6f7fa53 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 17 Jun 2008 09:12:03 +0200
Subject: x86: add C1E aware idle function, fix

On Tue, 17 Jun 2008, Rafael J. Wysocki wrote:
>
> BTW, with the C1E patches reverted I don't get the
> WARNING: at /home/rafael/src/linux-next/kernel/smp.c:215 smp_call_function_single+0x3d/0xa2
> in the log.  Thomas?

The BROADCAST_FORCE notification uses smp_function_call and therefor
must be run with interrupts enabled.

While at it, add a comment for the BROADCAST_EXIT notifier as well.

Reported-and-bisected-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/process.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 68ad3539b143..4061d63aabe7 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -267,17 +267,29 @@ static void c1e_idle(void)
 
 		if (!cpu_isset(cpu, c1e_mask)) {
 			cpu_set(cpu, c1e_mask);
-			/* Force broadcast so ACPI can not interfere */
+			/*
+			 * Force broadcast so ACPI can not interfere. Needs
+			 * to run with interrupts enabled as it uses
+			 * smp_function_call.
+			 */
+			local_irq_enable();
 			clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
 					   &cpu);
 			printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
 			       cpu);
+			local_irq_disable();
 		}
 		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
+
 		default_idle();
-		local_irq_disable();
-		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
-		local_irq_enable();
+
+		/*
+		 * The switch back from broadcast mode needs to be
+		 * called with interrupts disabled.
+		 */
+		 local_irq_disable();
+		 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+		 local_irq_enable();
 	} else
 		default_idle();
 }
-- 
cgit v1.2.3


From 3a27dd1ce5de08e21e0266ddf00e6f1f843bfe8b Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Thu, 12 Jun 2008 20:19:23 +0200
Subject: x86: Move PCI IO ECS code to x86/pci

"Form follows function". Code is now where it belongs to.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/amd.c    |  3 ---
 arch/x86/kernel/cpu/amd_64.c |  4 ----
 arch/x86/kernel/cpu/cpu.h    |  2 --
 arch/x86/kernel/setup.c      | 13 -------------
 arch/x86/pci/Makefile_32     |  1 +
 arch/x86/pci/amd_bus.c       | 32 ++++++++++++++++++++++++++++++++
 arch/x86/pci/direct.c        | 16 +++++++++-------
 arch/x86/pci/pci.h           |  1 +
 include/asm-x86/cpufeature.h |  2 --
 9 files changed, 43 insertions(+), 31 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index acc891ae5901..81a07ca65d44 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -266,9 +266,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 
 	if (cpu_has_xmm2)
 		set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
-
-	if (c->x86 == 0x10)
-		amd_enable_pci_ext_cfg(c);
 }
 
 static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
index f8d20588bde9..250bfe6064af 100644
--- a/arch/x86/kernel/cpu/amd_64.c
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -6,7 +6,6 @@
 #include <asm/cacheflush.h>
 
 #include <mach_apic.h>
-#include "cpu.h"
 
 extern int __cpuinit get_model_name(struct cpuinfo_x86 *c);
 extern void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c);
@@ -187,9 +186,6 @@ void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	if (c->x86 == 0x10)
 		fam10h_check_enable_mmcfg();
 
-	if (c->x86 == 0x10)
-		amd_enable_pci_ext_cfg(c);
-
 	if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
 		unsigned long long tseg;
 
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index f5d5bb1b5541..40ad1893fe87 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -39,5 +39,3 @@ extern int get_model_name(struct cpuinfo_x86 *c);
 extern void display_cacheinfo(struct cpuinfo_x86 *c);
 
 #endif /* CONFIG_X86_32 */
-
-extern void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 20e14dbef107..6f80b852a196 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -137,16 +137,3 @@ void __init setup_per_cpu_areas(void)
 }
 
 #endif
-#define ENABLE_CF8_EXT_CFG      (1ULL << 46)
-
-void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c)
-{
-	u64 reg;
-	rdmsrl(MSR_AMD64_NB_CFG, reg);
-	if (!(reg & ENABLE_CF8_EXT_CFG)) {
-		reg |= ENABLE_CF8_EXT_CFG;
-		wrmsrl(MSR_AMD64_NB_CFG, reg);
-	}
-	set_cpu_cap(c, X86_FEATURE_PCI_EXT_CFG);
-}
-
diff --git a/arch/x86/pci/Makefile_32 b/arch/x86/pci/Makefile_32
index 89ec35d00efd..f647e7e56da4 100644
--- a/arch/x86/pci/Makefile_32
+++ b/arch/x86/pci/Makefile_32
@@ -22,3 +22,4 @@ pci-$(CONFIG_X86_NUMAQ)		:= numa.o irq.o
 pci-$(CONFIG_NUMA)		+= mp_bus_to_node.o
 
 obj-y				+= $(pci-y) common.o early.o
+obj-y				+= amd_bus.o
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 5c2799c20e47..15f505d3a78e 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -1,5 +1,9 @@
 #include <linux/init.h>
 #include <linux/pci.h>
+#include "pci.h"
+
+#ifdef CONFIG_X86_64
+
 #include <asm/pci-direct.h>
 #include <asm/mpspec.h>
 #include <linux/cpumask.h>
@@ -526,3 +530,31 @@ static int __init early_fill_mp_bus_info(void)
 }
 
 postcore_initcall(early_fill_mp_bus_info);
+
+#endif
+
+/* common 32/64 bit code */
+
+#define ENABLE_CF8_EXT_CFG      (1ULL << 46)
+
+static void enable_pci_io_ecs_per_cpu(void *unused)
+{
+	u64 reg;
+	rdmsrl(MSR_AMD64_NB_CFG, reg);
+	if (!(reg & ENABLE_CF8_EXT_CFG)) {
+		reg |= ENABLE_CF8_EXT_CFG;
+		wrmsrl(MSR_AMD64_NB_CFG, reg);
+	}
+}
+
+static int __init enable_pci_io_ecs(void)
+{
+	/* assume all cpus from fam10h have IO ECS */
+        if (boot_cpu_data.x86 < 0x10)
+		return 0;
+	on_each_cpu(enable_pci_io_ecs_per_cpu, NULL, 1, 1);
+	pci_probe |= PCI_HAS_IO_ECS;
+	return 0;
+}
+
+postcore_initcall(enable_pci_io_ecs);
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index 27d61b63def6..9915293500fb 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -265,14 +265,16 @@ void __init pci_direct_init(int type)
 		 type);
 	if (type == 1) {
 		raw_pci_ops = &pci_direct_conf1;
-		if (!raw_pci_ext_ops && cpu_has_pci_ext_cfg) {
-			printk(KERN_INFO "PCI: Using configuration type 1 "
-			       "for extended access\n");
-			raw_pci_ext_ops = &pci_direct_conf1;
-		}
-	} else {
-		raw_pci_ops = &pci_direct_conf2;
+		if (raw_pci_ext_ops)
+			return;
+		if (!(pci_probe & PCI_HAS_IO_ECS))
+			return;
+		printk(KERN_INFO "PCI: Using configuration type 1 "
+		       "for extended access\n");
+		raw_pci_ext_ops = &pci_direct_conf1;
+		return;
 	}
+	raw_pci_ops = &pci_direct_conf2;
 }
 
 int __init pci_direct_probe(void)
diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h
index f3972b12c60a..fd53db50eeeb 100644
--- a/arch/x86/pci/pci.h
+++ b/arch/x86/pci/pci.h
@@ -27,6 +27,7 @@
 #define PCI_CAN_SKIP_ISA_ALIGN	0x8000
 #define PCI_USE__CRS		0x10000
 #define PCI_CHECK_ENABLE_AMD_MMCONF	0x20000
+#define PCI_HAS_IO_ECS		0x40000
 
 extern unsigned int pci_probe;
 extern unsigned long pirq_table_addr;
diff --git a/include/asm-x86/cpufeature.h b/include/asm-x86/cpufeature.h
index 40fcbba00f15..0d609c837a41 100644
--- a/include/asm-x86/cpufeature.h
+++ b/include/asm-x86/cpufeature.h
@@ -79,7 +79,6 @@
 #define X86_FEATURE_REP_GOOD	(3*32+16) /* rep microcode works well on this CPU */
 #define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* Mfence synchronizes RDTSC */
 #define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* Lfence synchronizes RDTSC */
-#define X86_FEATURE_PCI_EXT_CFG	(3*32+19) /* PCI extended cfg access */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* Streaming SIMD Extensions-3 */
@@ -188,7 +187,6 @@ extern const char * const x86_power_flags[32];
 #define cpu_has_gbpages		boot_cpu_has(X86_FEATURE_GBPAGES)
 #define cpu_has_arch_perfmon	boot_cpu_has(X86_FEATURE_ARCH_PERFMON)
 #define cpu_has_pat		boot_cpu_has(X86_FEATURE_PAT)
-#define cpu_has_pci_ext_cfg	boot_cpu_has(X86_FEATURE_PCI_EXT_CFG)
 
 #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
 # define cpu_has_invlpg		1
-- 
cgit v1.2.3


From dcd32b6a1ffe6c040f8346f7fbaf4318bb8ae41c Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Fri, 20 Jun 2008 08:18:09 +0200
Subject: x86: make 64-bit identify_cpu use cpu_dev

we may need to move some functions to common.c later

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/amd_64.c     | 17 +++++--
 arch/x86/kernel/cpu/centaur_64.c | 16 ++++++-
 arch/x86/kernel/cpu/cpu.h        |  6 ++-
 arch/x86/kernel/cpu/intel_64.c   | 15 ++++++-
 arch/x86/kernel/setup_64.c       | 96 +++++++++++++++++++++-------------------
 5 files changed, 95 insertions(+), 55 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
index 250bfe6064af..30b7557c9641 100644
--- a/arch/x86/kernel/cpu/amd_64.c
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -7,8 +7,7 @@
 
 #include <mach_apic.h>
 
-extern int __cpuinit get_model_name(struct cpuinfo_x86 *c);
-extern void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c);
+#include "cpu.h"
 
 int force_mwait __cpuinitdata;
 
@@ -109,7 +108,7 @@ static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
 #endif
 }
 
-void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
+static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
 {
 	early_init_amd_mc(c);
 
@@ -118,7 +117,7 @@ void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 }
 
-void __cpuinit init_amd(struct cpuinfo_x86 *c)
+static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 {
 	unsigned level;
 
@@ -200,3 +199,13 @@ void __cpuinit init_amd(struct cpuinfo_x86 *c)
 			set_memory_4k((unsigned long)__va(tseg), 1);
 	}
 }
+
+static struct cpu_dev amd_cpu_dev __cpuinitdata = {
+	.c_vendor	= "AMD",
+	.c_ident	= { "AuthenticAMD" },
+	.c_early_init   = early_init_amd,
+	.c_init		= init_amd,
+};
+
+cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev);
+
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
index bac96d187d05..13526fd5cce1 100644
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ b/arch/x86/kernel/cpu/centaur_64.c
@@ -4,13 +4,15 @@
 #include <asm/cpufeature.h>
 #include <asm/processor.h>
 
-void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
+#include "cpu.h"
+
+static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
 {
 	if (c->x86 == 0x6 && c->x86_model >= 0xf)
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 }
 
-void __cpuinit init_centaur(struct cpuinfo_x86 *c)
+static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
 {
 	/* Cache sizes */
 	unsigned n;
@@ -29,3 +31,13 @@ void __cpuinit init_centaur(struct cpuinfo_x86 *c)
 	}
 	set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
 }
+
+static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
+	.c_vendor	= "Centaur",
+	.c_ident	= { "CentaurHauls" },
+	.c_early_init	= early_init_centaur,
+	.c_init		= init_centaur,
+};
+
+cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev);
+
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 40ad1893fe87..4d894e8565fe 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -1,4 +1,6 @@
-#ifdef CONFIG_X86_32
+#ifndef ARCH_X86_CPU_H
+
+#define ARCH_X86_CPU_H
 
 struct cpu_model_info {
 	int vendor;
@@ -38,4 +40,4 @@ extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[];
 extern int get_model_name(struct cpuinfo_x86 *c);
 extern void display_cacheinfo(struct cpuinfo_x86 *c);
 
-#endif /* CONFIG_X86_32 */
+#endif
diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c
index b33912199484..fcb1cc9d75ca 100644
--- a/arch/x86/kernel/cpu/intel_64.c
+++ b/arch/x86/kernel/cpu/intel_64.c
@@ -5,7 +5,9 @@
 #include <asm/topology.h>
 #include <asm/numa_64.h>
 
-void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
+#include "cpu.h"
+
+static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 {
 	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
 	    (c->x86 == 0x6 && c->x86_model >= 0x0e))
@@ -48,7 +50,7 @@ static void __cpuinit srat_detect_node(void)
 #endif
 }
 
-void __cpuinit init_intel(struct cpuinfo_x86 *c)
+static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 {
 	/* Cache sizes */
 	unsigned n;
@@ -90,3 +92,12 @@ void __cpuinit init_intel(struct cpuinfo_x86 *c)
 
 	srat_detect_node();
 }
+
+static struct cpu_dev intel_cpu_dev __cpuinitdata = {
+	.c_vendor	= "Intel",
+	.c_ident	= { "GenuineIntel" },
+	.c_early_init   = early_init_intel,
+	.c_init		= init_intel,
+};
+cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev);
+
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 25afdd80a3cf..0cebf953c25a 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -80,6 +80,8 @@
 #define ARCH_SETUP
 #endif
 
+#include "cpu/cpu.h"
+
 /*
  * Machine setup..
  */
@@ -163,6 +165,7 @@ static struct resource bss_resource = {
 	.flags = IORESOURCE_RAM,
 };
 
+static void __init early_cpu_init(void);
 static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
 
 #ifdef CONFIG_PROC_VMCORE
@@ -339,6 +342,7 @@ void __init setup_arch(char **cmdline_p)
 	bss_resource.start = virt_to_phys(&__bss_start);
 	bss_resource.end = virt_to_phys(&__bss_stop)-1;
 
+	early_cpu_init();
 	early_identify_cpu(&boot_cpu_data);
 
 	strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
@@ -524,6 +528,19 @@ void __init setup_arch(char **cmdline_p)
 	check_enable_amd_mmconf_dmi();
 }
 
+struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+
+static void __cpuinit default_init(struct cpuinfo_x86 *c)
+{
+	display_cacheinfo(c);
+}
+
+static struct cpu_dev __cpuinitdata default_cpu = {
+	.c_init	= default_init,
+	.c_vendor = "Unknown",
+};
+static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
+
 int __cpuinit get_model_name(struct cpuinfo_x86 *c)
 {
 	unsigned int *v;
@@ -625,24 +642,37 @@ out:
 static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
 {
 	char *v = c->x86_vendor_id;
-
-	if (!strcmp(v, "AuthenticAMD"))
-		c->x86_vendor = X86_VENDOR_AMD;
-	else if (!strcmp(v, "GenuineIntel"))
-		c->x86_vendor = X86_VENDOR_INTEL;
-	else if (!strcmp(v, "CentaurHauls"))
-		c->x86_vendor = X86_VENDOR_CENTAUR;
-	else
-		c->x86_vendor = X86_VENDOR_UNKNOWN;
+	int i;
+	static int printed;
+
+	for (i = 0; i < X86_VENDOR_NUM; i++) {
+		if (cpu_devs[i]) {
+			if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
+			    (cpu_devs[i]->c_ident[1] &&
+			    !strcmp(v, cpu_devs[i]->c_ident[1]))) {
+				c->x86_vendor = i;
+				this_cpu = cpu_devs[i];
+				return;
+			}
+		}
+	}
+	if (!printed) {
+		printed++;
+		printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
+		printk(KERN_ERR "CPU: Your system may be unstable.\n");
+	}
+	c->x86_vendor = X86_VENDOR_UNKNOWN;
 }
 
-// FIXME: Needs to use cpu_vendor_dev_register
-extern void __cpuinit early_init_amd(struct cpuinfo_x86 *c);
-extern void __cpuinit init_amd(struct cpuinfo_x86 *c);
-extern void __cpuinit early_init_intel(struct cpuinfo_x86 *c);
-extern void __cpuinit init_intel(struct cpuinfo_x86 *c);
-extern void __cpuinit early_init_centaur(struct cpuinfo_x86 *c);
-extern void __cpuinit init_centaur(struct cpuinfo_x86 *c);
+static void __init early_cpu_init(void)
+{
+        struct cpu_vendor_dev *cvdev;
+
+        for (cvdev = __x86cpuvendor_start ;
+             cvdev < __x86cpuvendor_end   ;
+             cvdev++)
+                cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
+}
 
 /* Do some early cpuid on the boot CPU to get some parameter that are
    needed before check_bugs. Everything advanced is in identify_cpu
@@ -722,17 +752,9 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
 	if (c->extended_cpuid_level >= 0x80000007)
 		c->x86_power = cpuid_edx(0x80000007);
 
-	switch (c->x86_vendor) {
-	case X86_VENDOR_AMD:
-		early_init_amd(c);
-		break;
-	case X86_VENDOR_INTEL:
-		early_init_intel(c);
-		break;
-	case X86_VENDOR_CENTAUR:
-		early_init_centaur(c);
-		break;
-	}
+	if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
+	    cpu_devs[c->x86_vendor]->c_early_init)
+		cpu_devs[c->x86_vendor]->c_early_init(c);
 
 	validate_pat_support(c);
 }
@@ -760,24 +782,8 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 	 * At the end of this section, c->x86_capability better
 	 * indicate the features this CPU genuinely supports!
 	 */
-	switch (c->x86_vendor) {
-	case X86_VENDOR_AMD:
-		init_amd(c);
-		break;
-
-	case X86_VENDOR_INTEL:
-		init_intel(c);
-		break;
-
-	case X86_VENDOR_CENTAUR:
-		init_centaur(c);
-		break;
-
-	case X86_VENDOR_UNKNOWN:
-	default:
-		display_cacheinfo(c);
-		break;
-	}
+	if (this_cpu->c_init)
+		this_cpu->c_init(c);
 
 	detect_ht(c);
 
-- 
cgit v1.2.3


From c49c412a47b5102516d3313d4eba38cb1e968721 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Thu, 19 Jun 2008 15:30:31 -0700
Subject: x86: make 64bit identify_cpu use cpu_dev v2

v2: fix early_panic on this config:

  http://redhat.com/~mingo/misc/config-Thu_Jun_19_14_22_37_CEST_2008.bad

reason : struct cpu_vendor_dev size is 16, need to make table to be 16
         byte alignment

also print out the cpu supported...

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Dave Jones <davej@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_64.c       | 20 ++++++++++++++++++++
 arch/x86/kernel/vmlinux_64.lds.S |  1 +
 2 files changed, 21 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 0cebf953c25a..b789ec599923 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -664,6 +664,25 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
 	c->x86_vendor = X86_VENDOR_UNKNOWN;
 }
 
+static void __init early_cpu_support_print(void)
+{
+	int i,j;
+	struct cpu_dev *cpu_devx;
+
+	printk("KERNEL supported cpus:\n");
+	for (i = 0; i < X86_VENDOR_NUM; i++) {
+		cpu_devx = cpu_devs[i];
+		if (!cpu_devx)
+			continue;
+		for (j = 0; j < 2; j++) {
+			if (!cpu_devx->c_ident[j])
+				continue;
+			printk("  %s %s\n", cpu_devx->c_vendor,
+				cpu_devx->c_ident[j]);
+		}
+	}
+}
+
 static void __init early_cpu_init(void)
 {
         struct cpu_vendor_dev *cvdev;
@@ -672,6 +691,7 @@ static void __init early_cpu_init(void)
              cvdev < __x86cpuvendor_end   ;
              cvdev++)
                 cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
+	early_cpu_support_print();
 }
 
 /* Do some early cpuid on the boot CPU to get some parameter that are
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index fad3674b06a5..b29f63bdff5e 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -177,6 +177,7 @@ SECTIONS
 	*(.con_initcall.init)
   }
   __con_initcall_end = .;
+  . = ALIGN(16);
   __x86cpuvendor_start = .;
   .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) {
 	*(.x86cpuvendor.init)
-- 
cgit v1.2.3


From a0176e2485ce6468f9b74264a2fd6c19811f027a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 16 Jun 2008 12:44:17 +0200
Subject: Revert "Revert "x86: fix ioapic bug again""

This reverts commit 0b6a39f7ebcb1c82587ce35b401c513eed41ac5c.

The changes in tip/x86/apic solve this better.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_32.c | 12 ++++++++++--
 arch/x86/kernel/nmi_32.c     |  9 +++++++--
 2 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 4dc8600d9d20..a40d54fc1fdd 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -2130,10 +2130,14 @@ static inline void __init check_timer(void)
 {
 	int apic1, pin1, apic2, pin2;
 	int vector;
+	unsigned int ver;
 	unsigned long flags;
 
 	local_irq_save(flags);
 
+	ver = apic_read(APIC_LVR);
+	ver = GET_APIC_VERSION(ver);
+
 	/*
 	 * get/set the timer IRQ vector:
 	 */
@@ -2146,11 +2150,15 @@ static inline void __init check_timer(void)
 	 * mode for the 8259A whenever interrupts are routed
 	 * through I/O APICs.  Also IRQ0 has to be enabled in
 	 * the 8259A which implies the virtual wire has to be
-	 * disabled in the local APIC.
+	 * disabled in the local APIC.  Finally timer interrupts
+	 * need to be acknowledged manually in the 8259A for
+	 * timer_interrupt() and for the i82489DX when using
+	 * the NMI watchdog.
 	 */
 	apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
 	init_8259A(1);
-	timer_ack = 1;
+	timer_ack = !cpu_has_tsc;
+	timer_ack |= (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
 	if (timer_over_8254 > 0)
 		enable_8259A_irq(0);
 
diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
index 84160f74eeb0..11b14bbaa61e 100644
--- a/arch/x86/kernel/nmi_32.c
+++ b/arch/x86/kernel/nmi_32.c
@@ -26,6 +26,7 @@
 
 #include <asm/smp.h>
 #include <asm/nmi.h>
+#include <asm/timer.h>
 
 #include "mach_traps.h"
 
@@ -81,7 +82,7 @@ int __init check_nmi_watchdog(void)
 
 	prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
 	if (!prev_nmi_count)
-		return -1;
+		goto error;
 
 	printk(KERN_INFO "Testing NMI watchdog ... ");
 
@@ -118,7 +119,7 @@ int __init check_nmi_watchdog(void)
 	if (!atomic_read(&nmi_active)) {
 		kfree(prev_nmi_count);
 		atomic_set(&nmi_active, -1);
-		return -1;
+		goto error;
 	}
 	printk("OK.\n");
 
@@ -129,6 +130,10 @@ int __init check_nmi_watchdog(void)
 
 	kfree(prev_nmi_count);
 	return 0;
+error:
+	timer_ack = !cpu_has_tsc;
+
+	return -1;
 }
 
 static int __init setup_nmi_watchdog(char *str)
-- 
cgit v1.2.3


From d11d5794e0c21a1054e6cd57381050a999ad7232 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Wed, 21 May 2008 22:09:11 +0100
Subject: x86: I/O APIC: AEOI timer acknowledgement clean-ups

The code that used to be in do_slow_gettimeoffset() that relied on the
IRR bit of the master 8259A PIC for IRQ0 to check the state of the output
timer 0 of the PIT is no longer there.  As a result, there is no need to
use the POLL command to acknowledge the timer interrupt in the "8259A
Virtual Wire", except for the NMI watchdog when the i82489DX APIC is used
(this is because this particular APIC treats NMIs as level-triggered and
keeping the input asserted would keep motherboard NMI sources held off for
too long).  Remove the unneeded bits and adjust comments accordingly.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_32.c | 20 +++++++++-----------
 arch/x86/kernel/io_apic_64.c |  7 ++-----
 arch/x86/kernel/nmi_32.c     |  2 +-
 arch/x86/kernel/time_32.c    |  3 +--
 4 files changed, 13 insertions(+), 19 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index a40d54fc1fdd..c64b3f5cc121 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -2146,19 +2146,17 @@ static inline void __init check_timer(void)
 	set_intr_gate(vector, interrupt[0]);
 
 	/*
-	 * Subtle, code in do_timer_interrupt() expects an AEOI
-	 * mode for the 8259A whenever interrupts are routed
-	 * through I/O APICs.  Also IRQ0 has to be enabled in
-	 * the 8259A which implies the virtual wire has to be
-	 * disabled in the local APIC.  Finally timer interrupts
-	 * need to be acknowledged manually in the 8259A for
-	 * timer_interrupt() and for the i82489DX when using
-	 * the NMI watchdog.
+	 * As IRQ0 is to be enabled in the 8259A, the virtual
+	 * wire has to be disabled in the local APIC.  Also
+	 * timer interrupts need to be acknowledged manually in
+	 * the 8259A for the i82489DX when using the NMI
+	 * watchdog as that APIC treats NMIs as level-triggered.
+	 * The AEOI mode will finish them in the 8259A
+	 * automatically.
 	 */
 	apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
 	init_8259A(1);
-	timer_ack = !cpu_has_tsc;
-	timer_ack |= (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
+	timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
 	if (timer_over_8254 > 0)
 		enable_8259A_irq(0);
 
@@ -2219,6 +2217,7 @@ static inline void __init check_timer(void)
 		printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
 		nmi_watchdog = 0;
 	}
+	timer_ack = 0;
 
 	printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
 
@@ -2237,7 +2236,6 @@ static inline void __init check_timer(void)
 
 	printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
 
-	timer_ack = 0;
 	init_8259A(0);
 	make_8259A_irq(0);
 	apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index ef1a8dfcc529..7c34e38e4219 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -1669,11 +1669,8 @@ static inline void __init check_timer(void)
 	assign_irq_vector(0, TARGET_CPUS);
 
 	/*
-	 * Subtle, code in do_timer_interrupt() expects an AEOI
-	 * mode for the 8259A whenever interrupts are routed
-	 * through I/O APICs.  Also IRQ0 has to be enabled in
-	 * the 8259A which implies the virtual wire has to be
-	 * disabled in the local APIC.
+	 * As IRQ0 is to be enabled in the 8259A, the virtual
+	 * wire has to be disabled in the local APIC.
 	 */
 	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
 	init_8259A(1);
diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
index 11b14bbaa61e..d322901c38e6 100644
--- a/arch/x86/kernel/nmi_32.c
+++ b/arch/x86/kernel/nmi_32.c
@@ -131,7 +131,7 @@ int __init check_nmi_watchdog(void)
 	kfree(prev_nmi_count);
 	return 0;
 error:
-	timer_ack = !cpu_has_tsc;
+	timer_ack = 0;
 
 	return -1;
 }
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 2ff21f398934..5f29f12da50c 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -84,8 +84,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
 	if (timer_ack) {
 		/*
 		 * Subtle, when I/O APICs are used we have to ack timer IRQ
-		 * manually to reset the IRR bit for do_slow_gettimeoffset().
-		 * This will also deassert NMI lines for the watchdog if run
+		 * manually to deassert NMI lines for the watchdog if run
 		 * on an 82489DX-based system.
 		 */
 		spin_lock(&i8259A_lock);
-- 
cgit v1.2.3


From ecd29476ae0143b1c3641edfa76c0fc3e9ad3021 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Wed, 21 May 2008 22:09:19 +0100
Subject: x86: I/O APIC: remove parameters to fiddle with the 8259A

Remove the "disable_8254_timer" and "enable_8254_timer" kernel
parameters.  Now that AEOI acknowledgements are no longer needed for
correct timer operation, the 8259A can be kept disabled unconditionally
unless interrupts, either timer or watchdog ones, are actually passed
through it.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/early-quirks.c | 13 -------------
 arch/x86/kernel/io_apic_32.c   | 21 ++-------------------
 arch/x86/kernel/io_apic_64.c   | 21 ++-------------------
 include/asm-x86/apic.h         |  1 -
 4 files changed, 4 insertions(+), 52 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 9f51e1ea9e82..84fd9f2a28ff 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -98,17 +98,6 @@ static void __init nvidia_bugs(int num, int slot, int func)
 
 }
 
-static void __init ati_bugs(int num, int slot, int func)
-{
-#ifdef CONFIG_X86_IO_APIC
-	if (timer_over_8254 == 1) {
-		timer_over_8254 = 0;
-		printk(KERN_INFO
-		"ATI board detected. Disabling timer routing over 8254.\n");
-	}
-#endif
-}
-
 #define QFLAG_APPLY_ONCE 	0x1
 #define QFLAG_APPLIED		0x2
 #define QFLAG_DONE		(QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -126,8 +115,6 @@ static struct chipset early_qrk[] __initdata = {
 	  PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs },
 	{ PCI_VENDOR_ID_VIA, PCI_ANY_ID,
 	  PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
-	{ PCI_VENDOR_ID_ATI, PCI_ANY_ID,
-	  PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, ati_bugs },
 	{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
 	  PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
 	{}
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index c64b3f5cc121..61cf366d040f 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -58,7 +58,6 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 static DEFINE_SPINLOCK(ioapic_lock);
 static DEFINE_SPINLOCK(vector_lock);
 
-int timer_over_8254 __initdata = 1;
 
 /*
  *	Is the SiS APIC rmw bug present ?
@@ -2157,8 +2156,6 @@ static inline void __init check_timer(void)
 	apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
 	init_8259A(1);
 	timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
-	if (timer_over_8254 > 0)
-		enable_8259A_irq(0);
 
 	pin1  = find_isa_irq_pin(0, mp_INT);
 	apic1 = find_isa_irq_apic(0, mp_INT);
@@ -2175,7 +2172,6 @@ static inline void __init check_timer(void)
 		unmask_IO_APIC_irq(0);
 		if (timer_irq_works()) {
 			if (nmi_watchdog == NMI_IO_APIC) {
-				disable_8259A_irq(0);
 				setup_nmi();
 				enable_8259A_irq(0);
 			}
@@ -2195,6 +2191,7 @@ static inline void __init check_timer(void)
 		 * legacy devices should be connected to IO APIC #0
 		 */
 		setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
+		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 			printk("works.\n");
 			if (pin1 != -1)
@@ -2209,6 +2206,7 @@ static inline void __init check_timer(void)
 		/*
 		 * Cleanup, just in case ...
 		 */
+		disable_8259A_irq(0);
 		clear_IO_APIC_pin(apic2, pin2);
 	}
 	printk(" failed.\n");
@@ -2221,7 +2219,6 @@ static inline void __init check_timer(void)
 
 	printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
 
-	disable_8259A_irq(0);
 	set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
 				      "fasteoi");
 	apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector);	/* Fixed mode */
@@ -2292,20 +2289,6 @@ void __init setup_IO_APIC(void)
 		print_IO_APIC();
 }
 
-static int __init setup_disable_8254_timer(char *s)
-{
-	timer_over_8254 = -1;
-	return 1;
-}
-static int __init setup_enable_8254_timer(char *s)
-{
-	timer_over_8254 = 2;
-	return 1;
-}
-
-__setup("disable_8254_timer", setup_disable_8254_timer);
-__setup("enable_8254_timer", setup_enable_8254_timer);
-
 /*
  *	Called after all the initialization is done. If we didnt find any
  *	APIC bugs then we can allow the modify fast path
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 7c34e38e4219..9f16ca4b5a24 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -90,7 +90,6 @@ static int no_timer_check;
 
 static int disable_timer_pin_1 __initdata;
 
-int timer_over_8254 __initdata = 1;
 
 /* Where if anywhere is the i8259 connect in external int mode */
 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
@@ -430,20 +429,6 @@ static int __init disable_timer_pin_setup(char *arg)
 }
 __setup("disable_timer_pin_1", disable_timer_pin_setup);
 
-static int __init setup_disable_8254_timer(char *s)
-{
-	timer_over_8254 = -1;
-	return 1;
-}
-static int __init setup_enable_8254_timer(char *s)
-{
-	timer_over_8254 = 2;
-	return 1;
-}
-
-__setup("disable_8254_timer", setup_disable_8254_timer);
-__setup("enable_8254_timer", setup_enable_8254_timer);
-
 
 /*
  * Find the IRQ entry number of a certain pin.
@@ -1674,8 +1659,6 @@ static inline void __init check_timer(void)
 	 */
 	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
 	init_8259A(1);
-	if (timer_over_8254 > 0)
-		enable_8259A_irq(0);
 
 	pin1  = find_isa_irq_pin(0, mp_INT);
 	apic1 = find_isa_irq_apic(0, mp_INT);
@@ -1693,7 +1676,6 @@ static inline void __init check_timer(void)
 		if (!no_timer_check && timer_irq_works()) {
 			nmi_watchdog_default();
 			if (nmi_watchdog == NMI_IO_APIC) {
-				disable_8259A_irq(0);
 				setup_nmi();
 				enable_8259A_irq(0);
 			}
@@ -1715,6 +1697,7 @@ static inline void __init check_timer(void)
 		 * legacy devices should be connected to IO APIC #0
 		 */
 		setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
+		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 			apic_printk(APIC_VERBOSE," works.\n");
 			nmi_watchdog_default();
@@ -1726,6 +1709,7 @@ static inline void __init check_timer(void)
 		/*
 		 * Cleanup, just in case ...
 		 */
+		disable_8259A_irq(0);
 		clear_IO_APIC_pin(apic2, pin2);
 	}
 	apic_printk(APIC_VERBOSE," failed.\n");
@@ -1737,7 +1721,6 @@ static inline void __init check_timer(void)
 
 	apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
 
-	disable_8259A_irq(0);
 	irq_desc[0].chip = &lapic_irq_type;
 	apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);	/* Fixed mode */
 	enable_8259A_irq(0);
diff --git a/include/asm-x86/apic.h b/include/asm-x86/apic.h
index be9639a9a186..9c2dfd926e59 100644
--- a/include/asm-x86/apic.h
+++ b/include/asm-x86/apic.h
@@ -36,7 +36,6 @@ extern void generic_apic_probe(void);
 #ifdef CONFIG_X86_LOCAL_APIC
 
 extern int apic_verbosity;
-extern int timer_over_8254;
 extern int local_apic_timer_c2_ok;
 extern int local_apic_timer_disabled;
 
-- 
cgit v1.2.3


From e67465f1298671266a8e824c1751afdb7c08c860 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Wed, 21 May 2008 22:09:26 +0100
Subject: x86: I/O APIC: clean up after a fasteoi failure

Disable the 8259A when routing of the timer interrupt through the chip to
the local APIC of the primary processor has failed.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_32.c | 1 +
 arch/x86/kernel/io_apic_64.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 61cf366d040f..e7b7655c4e94 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -2228,6 +2228,7 @@ static inline void __init check_timer(void)
 		printk(" works.\n");
 		goto out;
 	}
+	disable_8259A_irq(0);
 	apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
 	printk(" failed.\n");
 
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 9f16ca4b5a24..6433fc99f1f9 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -1729,6 +1729,7 @@ static inline void __init check_timer(void)
 		apic_printk(APIC_VERBOSE," works.\n");
 		goto out;
 	}
+	disable_8259A_irq(0);
 	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
 	apic_printk(APIC_VERBOSE," failed.\n");
 
-- 
cgit v1.2.3


From 60134ebe795b728dbb960485a8e873c3250ada36 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Wed, 21 May 2008 22:09:34 +0100
Subject: x86: I/O APIC: keep IRQ off when changing LVT registers

Disable the 8259A acting in the "virtual wire" mode to keep the interrupt
line inactive while fiddling with local APIC interrupt vector registers
associated with its destination inputs.  To be on the safe side,
especially concerning flipping the trigger mode.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_32.c | 2 ++
 arch/x86/kernel/io_apic_64.c | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index e7b7655c4e94..41218ac75d10 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -2199,7 +2199,9 @@ static inline void __init check_timer(void)
 			else
 				add_pin_to_irq(0, apic2, pin2);
 			if (nmi_watchdog == NMI_IO_APIC) {
+				disable_8259A_irq(0);
 				setup_nmi();
+				enable_8259A_irq(0);
 			}
 			goto out;
 		}
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 6433fc99f1f9..aa45a85c4d11 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -1702,7 +1702,9 @@ static inline void __init check_timer(void)
 			apic_printk(APIC_VERBOSE," works.\n");
 			nmi_watchdog_default();
 			if (nmi_watchdog == NMI_IO_APIC) {
+				disable_8259A_irq(0);
 				setup_nmi();
+				enable_8259A_irq(0);
 			}
 			goto out;
 		}
-- 
cgit v1.2.3


From 73d08e636026bbcb413d4864ca5e917502f8a0f9 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Wed, 21 May 2008 22:09:43 +0100
Subject: x86: APIC/SMP: correct the message for "nosmp"

The local APIC is no longer forced off when "nosmp" has been specified.
Correct the message printed.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smpboot.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 3e1cecedde42..8c53d86e3e86 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1157,8 +1157,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
 	 * If SMP should be disabled, then really disable it!
 	 */
 	if (!max_cpus) {
-		printk(KERN_INFO "SMP mode deactivated,"
-				 "forcing use of dummy APIC emulation.\n");
+		printk(KERN_INFO "SMP mode deactivated.\n");
 		smpboot_clear_io_apic();
 #ifdef CONFIG_X86_32
 		connect_bsp_APIC();
-- 
cgit v1.2.3


From a1133d8e4ffc2db751eb987a2f3cf8ead67927c3 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Wed, 21 May 2008 22:10:16 +0100
Subject: x86: APIC/SMP: downgrade the NMI watchdog for "nosmp"

If configured to use the I/O APIC, the NMI watchdog is deemed to fail if
the chip has been deactivated as a result of "nosmp".  Downgrade to the
local APIC watchdog similarly to what is done for the UP case.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smpboot.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8c53d86e3e86..df60bc7c9cb7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1159,6 +1159,10 @@ static int __init smp_sanity_check(unsigned max_cpus)
 	if (!max_cpus) {
 		printk(KERN_INFO "SMP mode deactivated.\n");
 		smpboot_clear_io_apic();
+
+		if (nmi_watchdog != NMI_NONE && nmi_watchdog != NMI_DISABLED)
+			nmi_watchdog = NMI_LOCAL_APIC;
+
 #ifdef CONFIG_X86_32
 		connect_bsp_APIC();
 #endif
-- 
cgit v1.2.3


From 35542c5ebced864776d90d83d1e255016fd4c084 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Wed, 21 May 2008 22:10:22 +0100
Subject: x86: I/O APIC: clean up the 8259A on a NMI watchdog failure

There is no point in keeping the 8259A enabled if the I/O APIC NMI
watchdog has failed and the 8259A is not used to pass through regular
timer interrupts.  This fixes problems with some systems where some logic
gets confused.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_32.c |  2 ++
 arch/x86/kernel/io_apic_64.c |  2 ++
 arch/x86/kernel/nmi_32.c     |  4 ++++
 arch/x86/kernel/nmi_64.c     | 11 +++++++++--
 include/asm-x86/io_apic.h    |  4 ++++
 5 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 41218ac75d10..7c7d88ebb966 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -58,6 +58,7 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 static DEFINE_SPINLOCK(ioapic_lock);
 static DEFINE_SPINLOCK(vector_lock);
 
+int timer_through_8259 __initdata;
 
 /*
  *	Is the SiS APIC rmw bug present ?
@@ -2194,6 +2195,7 @@ static inline void __init check_timer(void)
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 			printk("works.\n");
+			timer_through_8259 = 1;
 			if (pin1 != -1)
 				replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
 			else
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index aa45a85c4d11..2a60bfb7446f 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -90,6 +90,7 @@ static int no_timer_check;
 
 static int disable_timer_pin_1 __initdata;
 
+int timer_through_8259 __initdata;
 
 /* Where if anywhere is the i8259 connect in external int mode */
 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
@@ -1700,6 +1701,7 @@ static inline void __init check_timer(void)
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 			apic_printk(APIC_VERBOSE," works.\n");
+			timer_through_8259 = 1;
 			nmi_watchdog_default();
 			if (nmi_watchdog == NMI_IO_APIC) {
 				disable_8259A_irq(0);
diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
index d322901c38e6..6580dae46277 100644
--- a/arch/x86/kernel/nmi_32.c
+++ b/arch/x86/kernel/nmi_32.c
@@ -24,6 +24,8 @@
 #include <linux/kdebug.h>
 #include <linux/slab.h>
 
+#include <asm/i8259.h>
+#include <asm/io_apic.h>
 #include <asm/smp.h>
 #include <asm/nmi.h>
 #include <asm/timer.h>
@@ -131,6 +133,8 @@ int __init check_nmi_watchdog(void)
 	kfree(prev_nmi_count);
 	return 0;
 error:
+	if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259)
+		disable_8259A_irq(0);
 	timer_ack = 0;
 
 	return -1;
diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c
index 5a29ded994fa..0060e44e8989 100644
--- a/arch/x86/kernel/nmi_64.c
+++ b/arch/x86/kernel/nmi_64.c
@@ -21,6 +21,8 @@
 #include <linux/cpumask.h>
 #include <linux/kdebug.h>
 
+#include <asm/i8259.h>
+#include <asm/io_apic.h>
 #include <asm/smp.h>
 #include <asm/nmi.h>
 #include <asm/proto.h>
@@ -90,7 +92,7 @@ int __init check_nmi_watchdog(void)
 
 	prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
 	if (!prev_nmi_count)
-		return -1;
+		goto error;
 
 	printk(KERN_INFO "Testing NMI watchdog ... ");
 
@@ -121,7 +123,7 @@ int __init check_nmi_watchdog(void)
 	if (!atomic_read(&nmi_active)) {
 		kfree(prev_nmi_count);
 		atomic_set(&nmi_active, -1);
-		return -1;
+		goto error;
 	}
 	printk("OK.\n");
 
@@ -132,6 +134,11 @@ int __init check_nmi_watchdog(void)
 
 	kfree(prev_nmi_count);
 	return 0;
+error:
+	if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259)
+		disable_8259A_irq(0);
+
+	return -1;
 }
 
 static int __init setup_nmi_watchdog(char *str)
diff --git a/include/asm-x86/io_apic.h b/include/asm-x86/io_apic.h
index d593e14f0341..a6732566ad05 100644
--- a/include/asm-x86/io_apic.h
+++ b/include/asm-x86/io_apic.h
@@ -137,6 +137,9 @@ extern int sis_apic_bug;
 /* 1 if "noapic" boot option passed */
 extern int skip_ioapic_setup;
 
+/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */
+extern int timer_through_8259;
+
 static inline void disable_ioapic_setup(void)
 {
 	skip_ioapic_setup = 1;
@@ -162,6 +165,7 @@ extern void ioapic_init_mappings(void);
 
 #else  /* !CONFIG_X86_IO_APIC */
 #define io_apic_assign_pci_irqs 0
+static const int timer_through_8259 = 0;
 #endif
 
 #endif
-- 
cgit v1.2.3


From 9a1c61929121cbd597a5575f82711c0db8ee1778 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Tue, 27 May 2008 21:19:09 +0100
Subject: x86: I/O APIC: fix the name of the L-APIC IRQ handler

The local APIC interrupt handler gets registered with
set_irq_chip_and_handler_name(), which results in
"local-APIC-edge-fasteoi" reported as the name of the handler.  Fix by
removing the type of the handler left over from before the generic
handlers were introduced.

The 64-bit variation should get fixed with the upcoming merge.

NB It should really use the "edge" handler and not the "fasteoi" one,
but that's a separate issue.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 7c7d88ebb966..7064500f00f2 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -2037,7 +2037,7 @@ static void unmask_lapic_irq (unsigned int irq)
 }
 
 static struct irq_chip lapic_chip __read_mostly = {
-	.name		= "local-APIC-edge",
+	.name		= "local-APIC",
 	.mask		= mask_lapic_irq,
 	.unmask		= unmask_lapic_irq,
 	.eoi		= ack_apic,
-- 
cgit v1.2.3


From f08252623c7981f5cea70e4fab4983a94fc52212 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Tue, 27 May 2008 21:19:16 +0100
Subject: x86: I/O APIC: fix the name of the through-8259A handler

When the through-8259A mode is used for the timer, the call to
set_irq_handler() will register a NULL handler name, resulting in
"IO-APIC-<NULL>" reported.  Fix by calling ioapic_register_intr() as done
for all the other I/O APIC interrupts.

The 64-bit variation calls set_irq_chip_and_handler_name() here
needlessly and should get fixed with the upcoming merge.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_32.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 7064500f00f2..c93c0d3c1a6b 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -1331,8 +1331,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
 	 * The timer IRQ doesn't have to know that behind the
 	 * scene we have a 8259A-master in AEOI mode ...
 	 */
-	irq_desc[0].chip = &ioapic_chip;
-	set_irq_handler(0, handle_edge_irq);
+	ioapic_register_intr(0, vector, IOAPIC_EDGE);
 
 	/*
 	 * Add it to the IO-APIC irq-routing table:
-- 
cgit v1.2.3


From 80d16bace63e057d30337e48d70aef0881656457 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Tue, 27 May 2008 21:19:22 +0100
Subject: x86: I/O APIC: remove redundant 8259A {,un}masking

For a better control the masking and unmasking of the timer interrupt
line in the 8259A operating in the 'Virtual Wire' mode has been moved out
of setup_ExtINT_IRQ0_pin() now, so remove the redundant calls from the
function.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_32.c | 4 ----
 arch/x86/kernel/io_apic_64.c | 4 ----
 2 files changed, 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index c93c0d3c1a6b..6e743ecf863b 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -1310,8 +1310,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
 
 	memset(&entry,0,sizeof(entry));
 
-	disable_8259A_irq(0);
-
 	/* mask LVT0 */
 	apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
 
@@ -1337,8 +1335,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
 	 * Add it to the IO-APIC irq-routing table:
 	 */
 	ioapic_write_entry(apic, pin, entry);
-
-	enable_8259A_irq(0);
 }
 
 void __init print_IO_APIC(void)
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 2a60bfb7446f..83d4e117746b 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -906,8 +906,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
 
 	memset(&entry, 0, sizeof(entry));
 
-	disable_8259A_irq(0);
-
 	/* mask LVT0 */
 	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
 
@@ -933,8 +931,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
 	 * Add it to the IO-APIC irq-routing table:
 	 */
 	ioapic_write_entry(apic, pin, entry);
-
-	enable_8259A_irq(0);
 }
 
 void __apicdebuginit print_IO_APIC(void)
-- 
cgit v1.2.3


From 6b4722a7779ebadcf016fd96ce3156b6acda8a31 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Tue, 27 May 2008 21:19:28 +0100
Subject: x86: I/O APIC: remove redundant LVT0 masking

The LINT0 line of the local APIC is masked in the LVT0 entry in
check_timer() before this function is ever called.  Removed the
redundant unmasking for better control.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_32.c | 3 ---
 arch/x86/kernel/io_apic_64.c | 3 ---
 2 files changed, 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 6e743ecf863b..51e5519ee1a7 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -1310,9 +1310,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
 
 	memset(&entry,0,sizeof(entry));
 
-	/* mask LVT0 */
-	apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
-
 	/*
 	 * We use logical delivery to get the timer IRQ
 	 * to the first CPU.
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 83d4e117746b..565f19b01bf4 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -906,9 +906,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
 
 	memset(&entry, 0, sizeof(entry));
 
-	/* mask LVT0 */
-	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
-
 	/*
 	 * We use logical delivery to get the timer IRQ
 	 * to the first CPU.
-- 
cgit v1.2.3


From f7633ce55b2ea2926a39d7ca9d0bb06c43edd2c2 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Tue, 27 May 2008 21:19:34 +0100
Subject: x86: I/O APIC: rename setup_ExtINT_IRQ0_pin()

Rename setup_ExtINT_IRQ0_pin() to setup_timer_IRQ0_pin() to better
reflect the upcoming role of a function setting up a (semi-)arbitrary I/O
APIC pin appropriately for the 8254 timer.  By "appropriate" the following
settings are meant: edge-triggered, active-high, all the other settings
per-architecture.  Adjust comments to reflect code appropriately.  No
functional changes.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_32.c |  9 +++++----
 arch/x86/kernel/io_apic_64.c | 10 +++++-----
 2 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 51e5519ee1a7..ce682e873aa7 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -1302,9 +1302,10 @@ static void __init setup_IO_APIC_irqs(void)
 }
 
 /*
- * Set up the 8259A-master output pin:
+ * Set up the timer pin, possibly with the 8259A-master behind.
  */
-static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
+static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
+					int vector)
 {
 	struct IO_APIC_route_entry entry;
 
@@ -1324,7 +1325,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
 
 	/*
 	 * The timer IRQ doesn't have to know that behind the
-	 * scene we have a 8259A-master in AEOI mode ...
+	 * scene we may have a 8259A-master in AEOI mode ...
 	 */
 	ioapic_register_intr(0, vector, IOAPIC_EDGE);
 
@@ -2183,7 +2184,7 @@ static inline void __init check_timer(void)
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
+		setup_timer_IRQ0_pin(apic2, pin2, vector);
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 			printk("works.\n");
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 565f19b01bf4..6c4635fa97a6 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -897,10 +897,10 @@ static void __init setup_IO_APIC_irqs(void)
 }
 
 /*
- * Set up the 8259A-master output pin as broadcast to all
- * CPUs.
+ * Set up the timer pin, possibly with the 8259A-master behind.
  */
-static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
+static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
+					int vector)
 {
 	struct IO_APIC_route_entry entry;
 
@@ -920,7 +920,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
 
 	/*
 	 * The timer IRQ doesn't have to know that behind the
-	 * scene we have a 8259A-master in AEOI mode ...
+	 * scene we may have a 8259A-master in AEOI mode ...
 	 */
 	set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
 
@@ -1690,7 +1690,7 @@ static inline void __init check_timer(void)
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
+		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 			apic_printk(APIC_VERBOSE," works.\n");
-- 
cgit v1.2.3


From 24742ece8eb01b5855059020ba1c09173fd9b732 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Tue, 27 May 2008 21:19:40 +0100
Subject: x86: I/O APIC: unmask the second-chance timer interrupt

Unmask the timer interrupt line set up in the through-8259A mode
explicitly after setup_timer_IRQ0_pin() has set up the I/O APIC interrupt
redirection entry to let the two operations be unbound from each other.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_32.c | 1 +
 arch/x86/kernel/io_apic_64.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index ce682e873aa7..b381f93b79ae 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -2185,6 +2185,7 @@ static inline void __init check_timer(void)
 		 * legacy devices should be connected to IO APIC #0
 		 */
 		setup_timer_IRQ0_pin(apic2, pin2, vector);
+		unmask_IO_APIC_irq(0);
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 			printk("works.\n");
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 6c4635fa97a6..0e20b7d7c1c7 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -1691,6 +1691,7 @@ static inline void __init check_timer(void)
 		 * legacy devices should be connected to IO APIC #0
 		 */
 		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
+		unmask_IO_APIC_irq(0);
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 			apic_printk(APIC_VERBOSE," works.\n");
-- 
cgit v1.2.3


From 03be750559b2fe20d85dd968e08d5fe1c3accf83 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Tue, 27 May 2008 21:19:45 +0100
Subject: x86: I/O APIC: keep the timer IRQ masked during set-up

Keep the timer interrupt line masked when reconfiguring its interrupt
redirection entry in the I/O APIC.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_32.c | 2 +-
 arch/x86/kernel/io_apic_64.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index b381f93b79ae..63a2f64c765b 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -1316,7 +1316,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
 	 * to the first CPU.
 	 */
 	entry.dest_mode = INT_DEST_MODE;
-	entry.mask = 0;					/* unmask IRQ now */
+	entry.mask = 1;					/* mask IRQ now */
 	entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
 	entry.delivery_mode = INT_DELIVERY_MODE;
 	entry.polarity = 0;
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 0e20b7d7c1c7..ae0ac990574f 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -911,7 +911,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
 	 * to the first CPU.
 	 */
 	entry.dest_mode = INT_DEST_MODE;
-	entry.mask = 0;					/* unmask IRQ now */
+	entry.mask = 1;					/* mask IRQ now */
 	entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
 	entry.delivery_mode = INT_DELIVERY_MODE;
 	entry.polarity = 0;
-- 
cgit v1.2.3


From 691874fa96d6349a8b60f8ea9c2bae52ece79941 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Tue, 27 May 2008 21:19:51 +0100
Subject: x86: I/O APIC: timer through 8259A second-chance

Some systems incorrectly report the ExtINTA pin of the I/O APIC as the
genuine target of the timer interrupt.  Here is a change that copies timer
pin information found to the other pin if one has been found only.  This
way both a direct and a through-8259A route is tested with the pin letting
these problematic systems work well enough.  If no timer pin information
has been found for the I/O APIC, then local APIC variations are tried
only, similarly to what is done without the change (except without the
misleading messages).

Obviously if we try the first-chance path without being told by the BIOS
to do so, we should not complain either, so do not print the message in
this case.

The 64-bit variation should be updated with a call to
replace_pin_at_irq() which can be done with the upcoming merge.  Since
add_pin_to_irq() is now always called in the first-chance path, the
condition to require it in the second-chance path no longer happens.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_32.c | 38 ++++++++++++++++++++++++++++----------
 arch/x86/kernel/io_apic_64.c | 36 +++++++++++++++++++++++++++++-------
 2 files changed, 57 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 63a2f64c765b..a69a59d19e18 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -2122,6 +2122,7 @@ static inline void __init unlock_ExtINT_logic(void)
 static inline void __init check_timer(void)
 {
 	int apic1, pin1, apic2, pin2;
+	int no_pin1 = 0;
 	int vector;
 	unsigned int ver;
 	unsigned long flags;
@@ -2159,10 +2160,30 @@ static inline void __init check_timer(void)
 	printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
 		vector, apic1, pin1, apic2, pin2);
 
+	/*
+	 * Some BIOS writers are clueless and report the ExtINTA
+	 * I/O APIC input from the cascaded 8259A as the timer
+	 * interrupt input.  So just in case, if only one pin
+	 * was found above, try it both directly and through the
+	 * 8259A.
+	 */
+	if (pin1 == -1) {
+		pin1 = pin2;
+		apic1 = apic2;
+		no_pin1 = 1;
+	} else if (pin2 == -1) {
+		pin2 = pin1;
+		apic2 = apic1;
+	}
+
 	if (pin1 != -1) {
 		/*
 		 * Ok, does IRQ0 through the IOAPIC work?
 		 */
+		if (no_pin1) {
+			add_pin_to_irq(0, apic1, pin1);
+			setup_timer_IRQ0_pin(apic1, pin1, vector);
+		}
 		unmask_IO_APIC_irq(0);
 		if (timer_irq_works()) {
 			if (nmi_watchdog == NMI_IO_APIC) {
@@ -2174,26 +2195,23 @@ static inline void __init check_timer(void)
 			goto out;
 		}
 		clear_IO_APIC_pin(apic1, pin1);
-		printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
-				"IO-APIC\n");
-	}
+		if (!no_pin1)
+			printk(KERN_ERR "..MP-BIOS bug: "
+			       "8254 timer not connected to IO-APIC\n");
 
-	printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
-	if (pin2 != -1) {
+		printk(KERN_INFO "...trying to set up timer (IRQ0) "
+		       "through the 8259A ... ");
 		printk("\n..... (found pin %d) ...", pin2);
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
+		replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
 		setup_timer_IRQ0_pin(apic2, pin2, vector);
 		unmask_IO_APIC_irq(0);
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 			printk("works.\n");
 			timer_through_8259 = 1;
-			if (pin1 != -1)
-				replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
-			else
-				add_pin_to_irq(0, apic2, pin2);
 			if (nmi_watchdog == NMI_IO_APIC) {
 				disable_8259A_irq(0);
 				setup_nmi();
@@ -2206,8 +2224,8 @@ static inline void __init check_timer(void)
 		 */
 		disable_8259A_irq(0);
 		clear_IO_APIC_pin(apic2, pin2);
+		printk(" failed.\n");
 	}
-	printk(" failed.\n");
 
 	if (nmi_watchdog == NMI_IO_APIC) {
 		printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index ae0ac990574f..9a54b779abd4 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -1638,6 +1638,7 @@ static inline void __init check_timer(void)
 	struct irq_cfg *cfg = irq_cfg + 0;
 	int apic1, pin1, apic2, pin2;
 	unsigned long flags;
+	int no_pin1 = 0;
 
 	local_irq_save(flags);
 
@@ -1662,10 +1663,30 @@ static inline void __init check_timer(void)
 	apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
 		cfg->vector, apic1, pin1, apic2, pin2);
 
+	/*
+	 * Some BIOS writers are clueless and report the ExtINTA
+	 * I/O APIC input from the cascaded 8259A as the timer
+	 * interrupt input.  So just in case, if only one pin
+	 * was found above, try it both directly and through the
+	 * 8259A.
+	 */
+	if (pin1 == -1) {
+		pin1 = pin2;
+		apic1 = apic2;
+		no_pin1 = 1;
+	} else if (pin2 == -1) {
+		pin2 = pin1;
+		apic2 = apic1;
+	}
+
 	if (pin1 != -1) {
 		/*
 		 * Ok, does IRQ0 through the IOAPIC work?
 		 */
+		if (no_pin1) {
+			add_pin_to_irq(0, apic1, pin1);
+			setup_timer_IRQ0_pin(apic1, pin1, vector);
+		}
 		unmask_IO_APIC_irq(0);
 		if (!no_timer_check && timer_irq_works()) {
 			nmi_watchdog_default();
@@ -1678,18 +1699,19 @@ static inline void __init check_timer(void)
 			goto out;
 		}
 		clear_IO_APIC_pin(apic1, pin1);
-		apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
-				"connected to IO-APIC\n");
-	}
+		if (!no_pin1)
+			apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: "
+				    "8254 timer not connected to IO-APIC\n");
 
-	apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
-				"through the 8259A ... ");
-	if (pin2 != -1) {
+		apic_printk(APIC_VERBOSE,KERN_INFO
+			"...trying to set up timer (IRQ0) "
+			"through the 8259A ... ");
 		apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
 			apic2, pin2);
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
+		/* replace_pin_at_irq(0, apic1, pin1, apic2, pin2); */
 		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
 		unmask_IO_APIC_irq(0);
 		enable_8259A_irq(0);
@@ -1709,8 +1731,8 @@ static inline void __init check_timer(void)
 		 */
 		disable_8259A_irq(0);
 		clear_IO_APIC_pin(apic2, pin2);
+		apic_printk(APIC_VERBOSE," failed.\n");
 	}
-	apic_printk(APIC_VERBOSE," failed.\n");
 
 	if (nmi_watchdog == NMI_IO_APIC) {
 		printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
-- 
cgit v1.2.3


From 7223daf5e1a6298e459db84442adc04d68a6a42d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 12 May 2008 15:43:36 +0200
Subject: x86: make irq_cfg static

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 9a54b779abd4..bb3033f037c3 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -61,7 +61,7 @@ struct irq_cfg {
 };
 
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
+static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
 	[0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
 	[1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
 	[2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
-- 
cgit v1.2.3


From 431ee79db0a9552314d787446044973a8176b57d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 12 May 2008 15:43:35 +0200
Subject: x86: apic_64.c fix sparse warnings about shadowed variables

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic_64.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 0633cfd0dc29..07fda23a9f76 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -875,7 +875,7 @@ static int __init detect_init_APIC(void)
 
 void __init early_init_lapic_mapping(void)
 {
-	unsigned long apic_phys;
+	unsigned long phys_addr;
 
 	/*
 	 * If no local APIC can be found then go out
@@ -884,11 +884,11 @@ void __init early_init_lapic_mapping(void)
 	if (!smp_found_config)
 		return;
 
-	apic_phys = mp_lapic_addr;
+	phys_addr = mp_lapic_addr;
 
-	set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
+	set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
 	apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
-				 APIC_BASE, apic_phys);
+		    APIC_BASE, phys_addr);
 
 	/*
 	 * Fetch the APIC ID of the BSP in case we have a
-- 
cgit v1.2.3


From b1b57ee135ac7614184faa7d7345b5e7098cb81d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 31 May 2008 12:20:10 +0200
Subject: x86 build fix:

  arch/x86/kernel/io_apic_64.c: In function 'check_timer':
  arch/x86/kernel/io_apic_64.c:1688: error: 'vector' undeclared (first use in this function)
  arch/x86/kernel/io_apic_64.c:1688: error: (Each undeclared identifier is reported only once
  arch/x86/kernel/io_apic_64.c:1688: error: for each function it appears in.)
---
 arch/x86/kernel/io_apic_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index bb3033f037c3..8991567c76b7 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -1685,7 +1685,7 @@ static inline void __init check_timer(void)
 		 */
 		if (no_pin1) {
 			add_pin_to_irq(0, apic1, pin1);
-			setup_timer_IRQ0_pin(apic1, pin1, vector);
+			setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
 		}
 		unmask_IO_APIC_irq(0);
 		if (!no_timer_check && timer_irq_works()) {
-- 
cgit v1.2.3


From 067fa0ff0c89d25c2136ed095c72213089d4bb4e Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Thu, 29 May 2008 22:32:30 +0400
Subject: x86: IO-APIC - use NMI_NONE instead of numeric constant

Not sure but maybe it is better to use NMI_DISABLED,
will take a look. But for now this patch is not change
anything in logic so it will not hurt/broke the kernel.
For most cases nmi_watchdog assignment is by one of NMI_*
macro so I think there it make sense too.

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_32.c | 2 +-
 arch/x86/kernel/io_apic_64.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index a69a59d19e18..5c0f8d6496a6 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -2229,7 +2229,7 @@ static inline void __init check_timer(void)
 
 	if (nmi_watchdog == NMI_IO_APIC) {
 		printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
-		nmi_watchdog = 0;
+		nmi_watchdog = NMI_NONE;
 	}
 	timer_ack = 0;
 
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 8991567c76b7..40a184d4dff8 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -1736,7 +1736,7 @@ static inline void __init check_timer(void)
 
 	if (nmi_watchdog == NMI_IO_APIC) {
 		printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
-		nmi_watchdog = 0;
+		nmi_watchdog = NMI_NONE;
 	}
 
 	apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
-- 
cgit v1.2.3


From ff11571b25152edfb1eb0e6feb7e0009670fe4a5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 5 Jun 2008 11:17:16 +0200
Subject: x86, io-apic: fix nmi_watchdog=1 bootup hang

nmi_watchdog=1 hangs on 64-bit:

[    0.250000] Detected 12.564 MHz APIC timer.
[    0.254178] APIC timer registered as dummy, due to nmi_watchdog=1!
[    0.260366] Testing NMI watchdog ... <4>WARNING: CPU#0: NMI appears to be stuck (0->0)!
[    ...     ]
[    0.470003] calling  genl_init+0x0/0xd0
[  hard hang ]

bisected it down to:

 git-bisect start
 git-bisect good 1beee8dc8cf58e3f605bd7b34d7a39939be7d8d2
 git-bisect bad 11582ece0aaa2d0f94f345c08a4ab9997078a083
 git-bisect bad 5479c623bb44089844022c03d4c0eb16d5b7a15f
 git-bisect bad cfb4c7fabeb499e1c29f9d1878968e37a938e28a
 git-bisect good 246dd412d31e4f5de1d43aa6422a325b785f36e4
 git-bisect bad 3f8237eaff7dc1e35fa791dae095574fd974e671
 git-bisect good 90e23b13ab849e2a11f00c655eb3a2011b4623be
 git-bisect bad 833526a34eeefc117df3191a594c3c3a4f15a9ac
 git-bisect good 791b93d3dfaf16c23e978bec0cc0a3dd9d855d63
 git-bisect bad 65767c64068f2c93e56a1accfed5c78230ac12d7
 git-bisect bad 2abc5c05dd82c188e3bdf6641a274f013348d14b
 git-bisect bad 317e1f2597ffb4d4db940577bbe56dc6e881ef07

| 317e1f2597ffb4d4db940577bbe56dc6e881ef07 is first bad commit
| commit 317e1f2597ffb4d4db940577bbe56dc6e881ef07
| Author: Maciej W. Rozycki <macro@linux-mips.org>
| Date:   Wed May 21 22:10:22 2008 +0100
|     x86: I/O APIC: clean up the 8259A on a NMI watchdog failure

the problem is that in the dummy-lapic branch we rely on the i8259A
but if the NMI watchdog fails we turn off IRQ 0 - which doesnt work
too well ;-)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic_32.c | 6 ++++--
 arch/x86/kernel/apic_64.c | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 4b99b1bdeb6c..3cebf91d500f 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -541,11 +541,13 @@ void __init setup_boot_APIC_clock(void)
 		 * PIT/HPET going.  Otherwise register lapic as a dummy
 		 * device.
 		 */
-		if (nmi_watchdog != NMI_IO_APIC)
+		if (nmi_watchdog != NMI_IO_APIC) {
 			lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
-		else
+		} else {
 			printk(KERN_WARNING "APIC timer registered as dummy,"
 			       " due to nmi_watchdog=1!\n");
+			timer_through_8259 = 1;
+		}
 	}
 
 	/* Setup the lapic or request the broadcast */
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 07fda23a9f76..6cbc668a4c63 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -413,11 +413,13 @@ void __init setup_boot_APIC_clock(void)
 	 * PIT/HPET going.  Otherwise register lapic as a dummy
 	 * device.
 	 */
-	if (nmi_watchdog != NMI_IO_APIC)
+	if (nmi_watchdog != NMI_IO_APIC) {
 		lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
-	else
+	} else {
 		printk(KERN_WARNING "APIC timer registered as dummy,"
 		       " due to nmi_watchdog=1!\n");
+		timer_through_8259 = 1;
+	}
 
 	setup_APIC_timer();
 }
-- 
cgit v1.2.3


From ab5a5be099cb20a1c006bf0e211c48502d7ebc44 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 8 Jun 2008 10:13:33 +0200
Subject: Revert "x86, io-apic: fix nmi_watchdog=1 bootup hang"

This reverts commit 2229ff84f01746d02fb6b79e156fb5cce48c908f.

A better fix from Maciej will be merged.
---
 arch/x86/kernel/apic_32.c | 6 ++----
 arch/x86/kernel/apic_64.c | 6 ++----
 2 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 3cebf91d500f..4b99b1bdeb6c 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -541,13 +541,11 @@ void __init setup_boot_APIC_clock(void)
 		 * PIT/HPET going.  Otherwise register lapic as a dummy
 		 * device.
 		 */
-		if (nmi_watchdog != NMI_IO_APIC) {
+		if (nmi_watchdog != NMI_IO_APIC)
 			lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
-		} else {
+		else
 			printk(KERN_WARNING "APIC timer registered as dummy,"
 			       " due to nmi_watchdog=1!\n");
-			timer_through_8259 = 1;
-		}
 	}
 
 	/* Setup the lapic or request the broadcast */
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 6cbc668a4c63..07fda23a9f76 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -413,13 +413,11 @@ void __init setup_boot_APIC_clock(void)
 	 * PIT/HPET going.  Otherwise register lapic as a dummy
 	 * device.
 	 */
-	if (nmi_watchdog != NMI_IO_APIC) {
+	if (nmi_watchdog != NMI_IO_APIC)
 		lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
-	} else {
+	else
 		printk(KERN_WARNING "APIC timer registered as dummy,"
 		       " due to nmi_watchdog=1!\n");
-		timer_through_8259 = 1;
-	}
 
 	setup_APIC_timer();
 }
-- 
cgit v1.2.3


From 6fe9fe875691e15eda61b992e03257e68aa5ba4f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 8 Jun 2008 10:14:40 +0200
Subject: Revert "x86: APIC/SMP: downgrade the NMI watchdog for "nosmp""

This reverts commit 791b93d3dfaf16c23e978bec0cc0a3dd9d855d63.

A better fix from Maciej will be merged.
---
 arch/x86/kernel/smpboot.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index df60bc7c9cb7..8c53d86e3e86 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1159,10 +1159,6 @@ static int __init smp_sanity_check(unsigned max_cpus)
 	if (!max_cpus) {
 		printk(KERN_INFO "SMP mode deactivated.\n");
 		smpboot_clear_io_apic();
-
-		if (nmi_watchdog != NMI_NONE && nmi_watchdog != NMI_DISABLED)
-			nmi_watchdog = NMI_LOCAL_APIC;
-
 #ifdef CONFIG_X86_32
 		connect_bsp_APIC();
 #endif
-- 
cgit v1.2.3


From acae7d906f2f81d814e9c3730eeb19dfd3bf3bb4 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Fri, 6 Jun 2008 03:27:49 +0100
Subject: x86: APIC/UP: Downgrade the NMI watchdog for no I/O APIC

 If configured to use the I/O APIC, the NMI watchdog is deemed to fail if
the chip will not be used in the UP configuration, because "noapic" has
been specified or the chip is simply not there.  Downgrade to the local
APIC watchdog to rectify.

The new #ifdef is ugly, I know.  A proper solution is to provide suitable
definitions of smp_found_config, etc. for !CONFIG_X86_IO_APIC in a header.
Likewise the whole if () condition should be moved to a static inline
function.  Such clean-ups are beyond the scope of this change and can be
done once the whole issue of the timer has been sorted out.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic_32.c | 4 ++++
 arch/x86/kernel/apic_64.c | 2 ++
 2 files changed, 6 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 4b99b1bdeb6c..26e7a62e9ab6 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -1269,6 +1269,10 @@ int __init APIC_init_uniprocessor(void)
 
 	setup_local_APIC();
 
+#ifdef CONFIG_X86_IO_APIC
+	if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
+#endif
+		localise_nmi_watchdog();
 	end_local_APIC_setup();
 #ifdef CONFIG_X86_IO_APIC
 	if (smp_found_config)
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 07fda23a9f76..13eac4f984c3 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -954,6 +954,8 @@ int __init APIC_init_uniprocessor(void)
 	if (!skip_ioapic_setup && nr_ioapics)
 		enable_IO_APIC();
 
+	if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
+		localise_nmi_watchdog();
 	end_local_APIC_setup();
 
 	if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
-- 
cgit v1.2.3


From 19662027488946e34dd54d3bb408fa3307681d7d Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Fri, 6 Jun 2008 03:27:56 +0100
Subject: x86: APIC/UP: Remove redundant NMI watchdog downgrade

For the UP case the NMI watchdog downgrade is done consistently in
APIC_init_uniprocessor() now.  Remove redundant code used only when
BIOS-disabled local APIC is activated.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic_32.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 26e7a62e9ab6..126e87153b39 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -1154,9 +1154,6 @@ static int __init detect_init_APIC(void)
 	if (l & MSR_IA32_APICBASE_ENABLE)
 		mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
 
-	if (nmi_watchdog != NMI_NONE && nmi_watchdog != NMI_DISABLED)
-		nmi_watchdog = NMI_LOCAL_APIC;
-
 	printk(KERN_INFO "Found and enabled local APIC!\n");
 
 	apic_pm_activate();
-- 
cgit v1.2.3


From d54db1ac9ecde9bcb8a561595b02c1d970d3a4d6 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Fri, 6 Jun 2008 03:28:02 +0100
Subject: x86: APIC/SMP: Downgrade the NMI watchdog for "nosmp"

 If configured to use the I/O APIC, the NMI watchdog is deemed to fail if
the chip has been deactivated as a result of "nosmp".  Downgrade to the
local APIC watchdog similarly to what is done for the UP case.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smpboot.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8c53d86e3e86..f3760349c84f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1159,6 +1159,9 @@ static int __init smp_sanity_check(unsigned max_cpus)
 	if (!max_cpus) {
 		printk(KERN_INFO "SMP mode deactivated.\n");
 		smpboot_clear_io_apic();
+
+		localise_nmi_watchdog();
+
 #ifdef CONFIG_X86_32
 		connect_bsp_APIC();
 #endif
-- 
cgit v1.2.3


From 46b3b4ef1ea2a0892b9b38b6a0c65a3f33b504aa Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Sat, 7 Jun 2008 19:53:57 +0400
Subject: x86, io-apic: use predefined names instead of numeric constants

This patch replaces some hard-coded numbers with predefined names.

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_32.c | 10 ++++++----
 arch/x86/kernel/io_apic_64.c | 13 +++++++------
 2 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 5c0f8d6496a6..40fbb229fe7d 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -261,25 +261,27 @@ static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsign
 /* mask = 1 */
 static void __mask_IO_APIC_irq (unsigned int irq)
 {
-	__modify_IO_APIC_irq(irq, 0x00010000, 0);
+	__modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
 }
 
 /* mask = 0 */
 static void __unmask_IO_APIC_irq (unsigned int irq)
 {
-	__modify_IO_APIC_irq(irq, 0, 0x00010000);
+	__modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
 }
 
 /* mask = 1, trigger = 0 */
 static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
 {
-	__modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
+	__modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
+				IO_APIC_REDIR_LEVEL_TRIGGER);
 }
 
 /* mask = 0, trigger = 1 */
 static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
 {
-	__modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
+	__modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
+				IO_APIC_REDIR_MASKED);
 }
 
 static void mask_IO_APIC_irq (unsigned int irq)
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 40a184d4dff8..60a022d6de5e 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -183,7 +183,7 @@ static bool io_apic_level_ack_pending(unsigned int irq)
 			break;
 		reg = io_apic_read(entry->apic, 0x10 + pin*2);
 		/* Is the remote IRR bit set? */
-		if ((reg >> 14) & 1) {
+		if (reg & IO_APIC_REDIR_REMOTE_IRR) {
 			spin_unlock_irqrestore(&ioapic_lock, flags);
 			return true;
 		}
@@ -298,7 +298,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
 			break;
 		io_apic_write(apic, 0x11 + pin*2, dest);
 		reg = io_apic_read(apic, 0x10 + pin*2);
-		reg &= ~0x000000ff;
+		reg &= ~IO_APIC_REDIR_VECTOR_MASK;
 		reg |= vector;
 		io_apic_modify(apic, reg);
 		if (!entry->next)
@@ -366,10 +366,11 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
 	static void name##_IO_APIC_irq (unsigned int irq)		\
 	__DO_ACTION(R, ACTION, FINAL)
 
-DO_ACTION( __mask,             0, |= 0x00010000, io_apic_sync(entry->apic) )
-						/* mask = 1 */
-DO_ACTION( __unmask,           0, &= 0xfffeffff, )
-						/* mask = 0 */
+/* mask = 1 */
+DO_ACTION(__mask,	0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic))
+
+/* mask = 0 */
+DO_ACTION(__unmask,	0, &= ~IO_APIC_REDIR_MASKED, )
 
 static void mask_IO_APIC_irq (unsigned int irq)
 {
-- 
cgit v1.2.3


From 360624484c81d55f88b1e5f48ce24c9243ce38e5 Mon Sep 17 00:00:00 2001
From: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com>
Date: Sun, 8 Jun 2008 13:07:18 +0200
Subject: x86: coding style fixes to arch/x86/kernel/io_apic_32.c

Before:
total: 91 errors, 73 warnings, 2850 lines checked

After:
total: 1 errors, 47 warnings, 2848 lines checked

Compile tested:

paolo@paolo-desktop:/tmp$ size io*
   text    data     bss     dec     hex filename
  13836    1756   11104   26696    6848 io_apic_32.o.after
  13836    1756   11104   26696    6848 io_apic_32.o.before

Signed-off-by: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_32.c | 348 +++++++++++++++++++++----------------------
 1 file changed, 173 insertions(+), 175 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 40fbb229fe7d..9a924ebcd74c 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -239,7 +239,7 @@ static void __init replace_pin_at_irq(unsigned int irq,
 	}
 }
 
-static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
+static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
 {
 	struct irq_pin_list *entry = irq_2_pin + irq;
 	unsigned int pin, reg;
@@ -259,32 +259,32 @@ static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsign
 }
 
 /* mask = 1 */
-static void __mask_IO_APIC_irq (unsigned int irq)
+static void __mask_IO_APIC_irq(unsigned int irq)
 {
 	__modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
 }
 
 /* mask = 0 */
-static void __unmask_IO_APIC_irq (unsigned int irq)
+static void __unmask_IO_APIC_irq(unsigned int irq)
 {
 	__modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
 }
 
 /* mask = 1, trigger = 0 */
-static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
+static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
 {
 	__modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
 				IO_APIC_REDIR_LEVEL_TRIGGER);
 }
 
 /* mask = 0, trigger = 1 */
-static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
+static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
 {
 	__modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
 				IO_APIC_REDIR_MASKED);
 }
 
-static void mask_IO_APIC_irq (unsigned int irq)
+static void mask_IO_APIC_irq(unsigned int irq)
 {
 	unsigned long flags;
 
@@ -293,7 +293,7 @@ static void mask_IO_APIC_irq (unsigned int irq)
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-static void unmask_IO_APIC_irq (unsigned int irq)
+static void unmask_IO_APIC_irq(unsigned int irq)
 {
 	unsigned long flags;
 
@@ -305,7 +305,7 @@ static void unmask_IO_APIC_irq (unsigned int irq)
 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 {
 	struct IO_APIC_route_entry entry;
-	
+
 	/* Check delivery_mode to be sure we're not clearing an SMI pin */
 	entry = ioapic_read_entry(apic, pin);
 	if (entry.delivery_mode == dest_SMI)
@@ -317,7 +317,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 	ioapic_mask_entry(apic, pin);
 }
 
-static void clear_IO_APIC (void)
+static void clear_IO_APIC(void)
 {
 	int apic, pin;
 
@@ -334,7 +334,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
 	struct irq_pin_list *entry = irq_2_pin + irq;
 	unsigned int apicid_value;
 	cpumask_t tmp;
-	
+
 	cpus_and(tmp, cpumask, cpu_online_map);
 	if (cpus_empty(tmp))
 		tmp = TARGET_CPUS;
@@ -363,7 +363,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
 # include <linux/kernel_stat.h>	/* kstat */
 # include <linux/slab.h>		/* kmalloc() */
 # include <linux/timer.h>
- 
+
 #define IRQBALANCE_CHECK_ARCH -999
 #define MAX_BALANCED_IRQ_INTERVAL	(5*HZ)
 #define MIN_BALANCED_IRQ_INTERVAL	(HZ/2)
@@ -375,14 +375,14 @@ static int physical_balance __read_mostly;
 static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
 
 static struct irq_cpu_info {
-	unsigned long * last_irq;
-	unsigned long * irq_delta;
+	unsigned long *last_irq;
+	unsigned long *irq_delta;
 	unsigned long irq;
 } irq_cpu_data[NR_CPUS];
 
 #define CPU_IRQ(cpu)		(irq_cpu_data[cpu].irq)
-#define LAST_CPU_IRQ(cpu,irq)   (irq_cpu_data[cpu].last_irq[irq])
-#define IRQ_DELTA(cpu,irq) 	(irq_cpu_data[cpu].irq_delta[irq])
+#define LAST_CPU_IRQ(cpu, irq)   (irq_cpu_data[cpu].last_irq[irq])
+#define IRQ_DELTA(cpu, irq) 	(irq_cpu_data[cpu].irq_delta[irq])
 
 #define IDLE_ENOUGH(cpu,now) \
 	(idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
@@ -421,8 +421,8 @@ inside:
 			if (cpu == -1)
 				cpu = NR_CPUS-1;
 		}
-	} while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
-			(search_idle && !IDLE_ENOUGH(cpu,now)));
+	} while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
+			(search_idle && !IDLE_ENOUGH(cpu, now)));
 
 	return cpu;
 }
@@ -432,15 +432,14 @@ static inline void balance_irq(int cpu, int irq)
 	unsigned long now = jiffies;
 	cpumask_t allowed_mask;
 	unsigned int new_cpu;
-		
+
 	if (irqbalance_disabled)
-		return; 
+		return;
 
 	cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
 	new_cpu = move(cpu, allowed_mask, now, 1);
-	if (cpu != new_cpu) {
+	if (cpu != new_cpu)
 		set_pending_irq(irq, cpumask_of_cpu(new_cpu));
-	}
 }
 
 static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
@@ -452,14 +451,14 @@ static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
 			if (!irq_desc[j].action)
 				continue;
 			/* Is it a significant load ?  */
-			if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
+			if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
 						useful_load_threshold)
 				continue;
 			balance_irq(i, j);
 		}
 	}
 	balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
-		balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);	
+		balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
 	return;
 }
 
@@ -488,22 +487,22 @@ static void do_irq_balance(void)
 			/* Is this an active IRQ or balancing disabled ? */
 			if (!irq_desc[j].action || irq_balancing_disabled(j))
 				continue;
-			if ( package_index == i )
-				IRQ_DELTA(package_index,j) = 0;
+			if (package_index == i)
+				IRQ_DELTA(package_index, j) = 0;
 			/* Determine the total count per processor per IRQ */
 			value_now = (unsigned long) kstat_cpu(i).irqs[j];
 
 			/* Determine the activity per processor per IRQ */
-			delta = value_now - LAST_CPU_IRQ(i,j);
+			delta = value_now - LAST_CPU_IRQ(i, j);
 
 			/* Update last_cpu_irq[][] for the next time */
-			LAST_CPU_IRQ(i,j) = value_now;
+			LAST_CPU_IRQ(i, j) = value_now;
 
 			/* Ignore IRQs whose rate is less than the clock */
 			if (delta < useful_load_threshold)
 				continue;
 			/* update the load for the processor or package total */
-			IRQ_DELTA(package_index,j) += delta;
+			IRQ_DELTA(package_index, j) += delta;
 
 			/* Keep track of the higher numbered sibling as well */
 			if (i != package_index)
@@ -529,7 +528,8 @@ static void do_irq_balance(void)
 	max_cpu_irq = ULONG_MAX;
 
 tryanothercpu:
-	/* Look for heaviest loaded processor.
+	/*
+	 * Look for heaviest loaded processor.
 	 * We may come back to get the next heaviest loaded processor.
 	 * Skip processors with trivial loads.
 	 */
@@ -538,7 +538,7 @@ tryanothercpu:
 	for_each_online_cpu(i) {
 		if (i != CPU_TO_PACKAGEINDEX(i))
 			continue;
-		if (max_cpu_irq <= CPU_IRQ(i)) 
+		if (max_cpu_irq <= CPU_IRQ(i))
 			continue;
 		if (tmp_cpu_irq < CPU_IRQ(i)) {
 			tmp_cpu_irq = CPU_IRQ(i);
@@ -547,8 +547,9 @@ tryanothercpu:
 	}
 
 	if (tmp_loaded == -1) {
- 	 /* In the case of small number of heavy interrupt sources, 
-	  * loading some of the cpus too much. We use Ingo's original 
+	 /*
+	  * In the case of small number of heavy interrupt sources,
+	  * loading some of the cpus too much. We use Ingo's original
 	  * approach to rotate them around.
 	  */
 		if (!first_attempt && imbalance >= useful_load_threshold) {
@@ -557,13 +558,14 @@ tryanothercpu:
 		}
 		goto not_worth_the_effort;
 	}
-	
+
 	first_attempt = 0;		/* heaviest search */
 	max_cpu_irq = tmp_cpu_irq;	/* load */
 	max_loaded = tmp_loaded;	/* processor */
 	imbalance = (max_cpu_irq - min_cpu_irq) / 2;
-	
-	/* if imbalance is less than approx 10% of max load, then
+
+	/*
+	 * if imbalance is less than approx 10% of max load, then
 	 * observe diminishing returns action. - quit
 	 */
 	if (imbalance < (max_cpu_irq >> 3))
@@ -579,26 +581,25 @@ tryanotherirq:
 		/* Is this an active IRQ? */
 		if (!irq_desc[j].action)
 			continue;
-		if (imbalance <= IRQ_DELTA(max_loaded,j))
+		if (imbalance <= IRQ_DELTA(max_loaded, j))
 			continue;
 		/* Try to find the IRQ that is closest to the imbalance
 		 * without going over.
 		 */
-		if (move_this_load < IRQ_DELTA(max_loaded,j)) {
-			move_this_load = IRQ_DELTA(max_loaded,j);
+		if (move_this_load < IRQ_DELTA(max_loaded, j)) {
+			move_this_load = IRQ_DELTA(max_loaded, j);
 			selected_irq = j;
 		}
 	}
-	if (selected_irq == -1) {
+	if (selected_irq == -1)
 		goto tryanothercpu;
-	}
 
 	imbalance = move_this_load;
-	
+
 	/* For physical_balance case, we accumulated both load
 	 * values in the one of the siblings cpu_irq[],
 	 * to use the same code for physical and logical processors
-	 * as much as possible. 
+	 * as much as possible.
 	 *
 	 * NOTE: the cpu_irq[] array holds the sum of the load for
 	 * sibling A and sibling B in the slot for the lowest numbered
@@ -627,11 +628,11 @@ tryanotherirq:
 		/* mark for change destination */
 		set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
 
-		/* Since we made a change, come back sooner to 
+		/* Since we made a change, come back sooner to
 		 * check for more variation.
 		 */
 		balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
-			balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);	
+			balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
 		return;
 	}
 	goto tryanotherirq;
@@ -642,7 +643,7 @@ not_worth_the_effort:
 	 * upward
 	 */
 	balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
-		balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);	
+		balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
 	return;
 }
 
@@ -681,13 +682,13 @@ static int __init balanced_irq_init(void)
 	cpumask_t tmp;
 
 	cpus_shift_right(tmp, cpu_online_map, 2);
-        c = &boot_cpu_data;
+	c = &boot_cpu_data;
 	/* When not overwritten by the command line ask subarchitecture. */
 	if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
 		irqbalance_disabled = NO_BALANCE_IRQ;
 	if (irqbalance_disabled)
 		return 0;
-	
+
 	 /* disable irqbalance completely if there is only one processor online */
 	if (num_online_cpus() < 2) {
 		irqbalance_disabled = 1;
@@ -707,10 +708,10 @@ static int __init balanced_irq_init(void)
 			printk(KERN_ERR "balanced_irq_init: out of memory");
 			goto failed;
 		}
-		memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
-		memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
+		memset(irq_cpu_data[i].irq_delta, 0, sizeof(unsigned long) * NR_IRQS);
+		memset(irq_cpu_data[i].last_irq, 0, sizeof(unsigned long) * NR_IRQS);
 	}
-	
+
 	printk(KERN_INFO "Starting balanced_irq\n");
 	if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
 		return 0;
@@ -845,7 +846,7 @@ static int __init find_isa_irq_apic(int irq, int type)
 	}
 	if (i < mp_irq_entries) {
 		int apic;
-		for(apic = 0; apic < nr_ioapics; apic++) {
+		for (apic = 0; apic < nr_ioapics; apic++) {
 			if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
 				return apic;
 		}
@@ -882,7 +883,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
 		    !mp_irqs[i].mpc_irqtype &&
 		    (bus == lbus) &&
 		    (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
-			int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+			int irq = pin_2_irq(i, apic, mp_irqs[i].mpc_dstirq);
 
 			if (!(apic || IO_APIC_IRQ(irq)))
 				continue;
@@ -902,7 +903,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
 EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
 
 /*
- * This function currently is only a helper for the i386 smp boot process where 
+ * This function currently is only a helper for the i386 smp boot process where
  * we need to reprogram the ioredtbls to cater for the cpus which have come online
  * so mask in all cases should simply be TARGET_CPUS
  */
@@ -977,37 +978,36 @@ static int MPBIOS_polarity(int idx)
 	/*
 	 * Determine IRQ line polarity (high active or low active):
 	 */
-	switch (mp_irqs[idx].mpc_irqflag & 3)
+	switch (mp_irqs[idx].mpc_irqflag & 3) {
+	case 0: /* conforms, ie. bus-type dependent polarity */
 	{
-		case 0: /* conforms, ie. bus-type dependent polarity */
-		{
-			polarity = test_bit(bus, mp_bus_not_pci)?
-				default_ISA_polarity(idx):
-				default_PCI_polarity(idx);
-			break;
-		}
-		case 1: /* high active */
-		{
-			polarity = 0;
-			break;
-		}
-		case 2: /* reserved */
-		{
-			printk(KERN_WARNING "broken BIOS!!\n");
-			polarity = 1;
-			break;
-		}
-		case 3: /* low active */
-		{
-			polarity = 1;
-			break;
-		}
-		default: /* invalid */
-		{
-			printk(KERN_WARNING "broken BIOS!!\n");
-			polarity = 1;
-			break;
-		}
+		polarity = test_bit(bus, mp_bus_not_pci)?
+			default_ISA_polarity(idx):
+			default_PCI_polarity(idx);
+		break;
+	}
+	case 1: /* high active */
+	{
+		polarity = 0;
+		break;
+	}
+	case 2: /* reserved */
+	{
+		printk(KERN_WARNING "broken BIOS!!\n");
+		polarity = 1;
+		break;
+	}
+	case 3: /* low active */
+	{
+		polarity = 1;
+		break;
+	}
+	default: /* invalid */
+	{
+		printk(KERN_WARNING "broken BIOS!!\n");
+		polarity = 1;
+		break;
+	}
 	}
 	return polarity;
 }
@@ -1020,69 +1020,67 @@ static int MPBIOS_trigger(int idx)
 	/*
 	 * Determine IRQ trigger mode (edge or level sensitive):
 	 */
-	switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+	switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) {
+	case 0: /* conforms, ie. bus-type dependent */
 	{
-		case 0: /* conforms, ie. bus-type dependent */
-		{
-			trigger = test_bit(bus, mp_bus_not_pci)?
-					default_ISA_trigger(idx):
-					default_PCI_trigger(idx);
+		trigger = test_bit(bus, mp_bus_not_pci)?
+				default_ISA_trigger(idx):
+				default_PCI_trigger(idx);
 #if defined(CONFIG_EISA) || defined(CONFIG_MCA)
-			switch (mp_bus_id_to_type[bus])
-			{
-				case MP_BUS_ISA: /* ISA pin */
-				{
-					/* set before the switch */
-					break;
-				}
-				case MP_BUS_EISA: /* EISA pin */
-				{
-					trigger = default_EISA_trigger(idx);
-					break;
-				}
-				case MP_BUS_PCI: /* PCI pin */
-				{
-					/* set before the switch */
-					break;
-				}
-				case MP_BUS_MCA: /* MCA pin */
-				{
-					trigger = default_MCA_trigger(idx);
-					break;
-				}
-				default:
-				{
-					printk(KERN_WARNING "broken BIOS!!\n");
-					trigger = 1;
-					break;
-				}
-			}
-#endif
+		switch (mp_bus_id_to_type[bus]) {
+		case MP_BUS_ISA: /* ISA pin */
+		{
+			/* set before the switch */
 			break;
 		}
-		case 1: /* edge */
+		case MP_BUS_EISA: /* EISA pin */
 		{
-			trigger = 0;
+			trigger = default_EISA_trigger(idx);
 			break;
 		}
-		case 2: /* reserved */
+		case MP_BUS_PCI: /* PCI pin */
 		{
-			printk(KERN_WARNING "broken BIOS!!\n");
-			trigger = 1;
+			/* set before the switch */
 			break;
 		}
-		case 3: /* level */
+		case MP_BUS_MCA: /* MCA pin */
 		{
-			trigger = 1;
+			trigger = default_MCA_trigger(idx);
 			break;
 		}
-		default: /* invalid */
+		default:
 		{
 			printk(KERN_WARNING "broken BIOS!!\n");
-			trigger = 0;
+			trigger = 1;
 			break;
 		}
 	}
+#endif
+		break;
+	}
+	case 1: /* edge */
+	{
+		trigger = 0;
+		break;
+	}
+	case 2: /* reserved */
+	{
+		printk(KERN_WARNING "broken BIOS!!\n");
+		trigger = 1;
+		break;
+	}
+	case 3: /* level */
+	{
+		trigger = 1;
+		break;
+	}
+	default: /* invalid */
+	{
+		printk(KERN_WARNING "broken BIOS!!\n");
+		trigger = 0;
+		break;
+	}
+	}
 	return trigger;
 }
 
@@ -1150,8 +1148,8 @@ static inline int IO_APIC_irq_trigger(int irq)
 
 	for (apic = 0; apic < nr_ioapics; apic++) {
 		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
-			idx = find_irq_entry(apic,pin,mp_INT);
-			if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
+			idx = find_irq_entry(apic, pin, mp_INT);
+			if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
 				return irq_trigger(idx);
 		}
 	}
@@ -1166,7 +1164,7 @@ static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }
 
 static int __assign_irq_vector(int irq)
 {
-	static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
+	static int current_vector = FIRST_DEVICE_VECTOR, current_offset;
 	int vector, offset;
 
 	BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
@@ -1239,15 +1237,15 @@ static void __init setup_IO_APIC_irqs(void)
 		/*
 		 * add it to the IO-APIC irq-routing table:
 		 */
-		memset(&entry,0,sizeof(entry));
+		memset(&entry, 0, sizeof(entry));
 
 		entry.delivery_mode = INT_DELIVERY_MODE;
 		entry.dest_mode = INT_DEST_MODE;
 		entry.mask = 0;				/* enable IRQ */
-		entry.dest.logical.logical_dest = 
+		entry.dest.logical.logical_dest =
 					cpu_mask_to_apicid(TARGET_CPUS);
 
-		idx = find_irq_entry(apic,pin,mp_INT);
+		idx = find_irq_entry(apic, pin, mp_INT);
 		if (idx == -1) {
 			if (first_notcon) {
 				apic_printk(APIC_VERBOSE, KERN_DEBUG
@@ -1291,7 +1289,7 @@ static void __init setup_IO_APIC_irqs(void)
 			vector = assign_irq_vector(irq);
 			entry.vector = vector;
 			ioapic_register_intr(irq, vector, IOAPIC_AUTO);
-		
+
 			if (!apic && (irq < 16))
 				disable_8259A_irq(irq);
 		}
@@ -1311,7 +1309,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
 {
 	struct IO_APIC_route_entry entry;
 
-	memset(&entry,0,sizeof(entry));
+	memset(&entry, 0, sizeof(entry));
 
 	/*
 	 * We use logical delivery to get the timer IRQ
@@ -1349,7 +1347,7 @@ void __init print_IO_APIC(void)
 	if (apic_verbosity == APIC_QUIET)
 		return;
 
- 	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
+	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
 	for (i = 0; i < nr_ioapics; i++)
 		printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
 		       mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
@@ -1454,7 +1452,7 @@ void __init print_IO_APIC(void)
 
 #if 0
 
-static void print_APIC_bitfield (int base)
+static void print_APIC_bitfield(int base)
 {
 	unsigned int v;
 	int i, j;
@@ -1475,7 +1473,7 @@ static void print_APIC_bitfield (int base)
 	}
 }
 
-void /*__init*/ print_local_APIC(void * dummy)
+void /*__init*/ print_local_APIC(void *dummy)
 {
 	unsigned int v, ver, maxlvt;
 
@@ -1558,7 +1556,7 @@ void /*__init*/ print_local_APIC(void * dummy)
 	printk("\n");
 }
 
-void print_all_local_APICs (void)
+void print_all_local_APICs(void)
 {
 	on_each_cpu(print_local_APIC, NULL, 1, 1);
 }
@@ -1581,11 +1579,11 @@ void /*__init*/ print_PIC(void)
 	v = inb(0xa0) << 8 | inb(0x20);
 	printk(KERN_DEBUG "... PIC  IRR: %04x\n", v);
 
-	outb(0x0b,0xa0);
-	outb(0x0b,0x20);
+	outb(0x0b, 0xa0);
+	outb(0x0b, 0x20);
 	v = inb(0xa0) << 8 | inb(0x20);
-	outb(0x0a,0xa0);
-	outb(0x0a,0x20);
+	outb(0x0a, 0xa0);
+	outb(0x0a, 0x20);
 
 	spin_unlock_irqrestore(&i8259A_lock, flags);
 
@@ -1621,7 +1619,7 @@ static void __init enable_IO_APIC(void)
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 		nr_ioapic_registers[apic] = reg_01.bits.entries+1;
 	}
-	for(apic = 0; apic < nr_ioapics; apic++) {
+	for (apic = 0; apic < nr_ioapics; apic++) {
 		int pin;
 		/* See if any of the pins is in ExtINT mode */
 		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
@@ -1743,7 +1741,7 @@ static void __init setup_ioapic_ids_from_mpc(void)
 		spin_lock_irqsave(&ioapic_lock, flags);
 		reg_00.raw = io_apic_read(apic, 0);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
-		
+
 		old_id = mp_ioapics[apic].mpc_apicid;
 
 		if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
@@ -1795,7 +1793,7 @@ static void __init setup_ioapic_ids_from_mpc(void)
 		/*
 		 * Read the right value from the MPC table and
 		 * write it into the ID register.
-	 	 */
+		 */
 		apic_printk(APIC_VERBOSE, KERN_INFO
 			"...changing IO-APIC physical APIC ID to %d ...",
 			mp_ioapics[apic].mpc_apicid);
@@ -2015,7 +2013,7 @@ static void ack_apic(unsigned int irq)
 	ack_APIC_irq();
 }
 
-static void mask_lapic_irq (unsigned int irq)
+static void mask_lapic_irq(unsigned int irq)
 {
 	unsigned long v;
 
@@ -2023,7 +2021,7 @@ static void mask_lapic_irq (unsigned int irq)
 	apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
 }
 
-static void unmask_lapic_irq (unsigned int irq)
+static void unmask_lapic_irq(unsigned int irq)
 {
 	unsigned long v;
 
@@ -2041,14 +2039,14 @@ static struct irq_chip lapic_chip __read_mostly = {
 static void __init setup_nmi(void)
 {
 	/*
- 	 * Dirty trick to enable the NMI watchdog ...
+	 * Dirty trick to enable the NMI watchdog ...
 	 * We put the 8259A master into AEOI mode and
 	 * unmask on all local APICs LVT0 as NMI.
 	 *
 	 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
 	 * is from Maciej W. Rozycki - so we do not have to EOI from
 	 * the NMI handler or the timer interrupt.
-	 */ 
+	 */
 	apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
 
 	enable_NMI_through_LVT0();
@@ -2312,10 +2310,10 @@ void __init setup_IO_APIC(void)
  *	Called after all the initialization is done. If we didnt find any
  *	APIC bugs then we can allow the modify fast path
  */
- 
+
 static int __init io_apic_bug_finalize(void)
 {
-	if(sis_apic_bug == -1)
+	if (sis_apic_bug == -1)
 		sis_apic_bug = 0;
 	return 0;
 }
@@ -2326,17 +2324,17 @@ struct sysfs_ioapic_data {
 	struct sys_device dev;
 	struct IO_APIC_route_entry entry[0];
 };
-static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
+static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
 
 static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
 {
 	struct IO_APIC_route_entry *entry;
 	struct sysfs_ioapic_data *data;
 	int i;
-	
+
 	data = container_of(dev, struct sysfs_ioapic_data, dev);
 	entry = data->entry;
-	for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+	for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
 		entry[i] = ioapic_read_entry(dev->id, i);
 
 	return 0;
@@ -2349,7 +2347,7 @@ static int ioapic_resume(struct sys_device *dev)
 	unsigned long flags;
 	union IO_APIC_reg_00 reg_00;
 	int i;
-	
+
 	data = container_of(dev, struct sysfs_ioapic_data, dev);
 	entry = data->entry;
 
@@ -2360,7 +2358,7 @@ static int ioapic_resume(struct sys_device *dev)
 		io_apic_write(dev->id, 0, reg_00.raw);
 	}
 	spin_unlock_irqrestore(&ioapic_lock, flags);
-	for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+	for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
 		ioapic_write_entry(dev->id, i, entry[i]);
 
 	return 0;
@@ -2374,15 +2372,15 @@ static struct sysdev_class ioapic_sysdev_class = {
 
 static int __init ioapic_init_sysfs(void)
 {
-	struct sys_device * dev;
+	struct sys_device *dev;
 	int i, size, error = 0;
 
 	error = sysdev_class_register(&ioapic_sysdev_class);
 	if (error)
 		return error;
 
-	for (i = 0; i < nr_ioapics; i++ ) {
-		size = sizeof(struct sys_device) + nr_ioapic_registers[i] 
+	for (i = 0; i < nr_ioapics; i++) {
+		size = sizeof(struct sys_device) + nr_ioapic_registers[i]
 			* sizeof(struct IO_APIC_route_entry);
 		mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
 		if (!mp_ioapic_data[i]) {
@@ -2391,7 +2389,7 @@ static int __init ioapic_init_sysfs(void)
 		}
 		memset(mp_ioapic_data[i], 0, size);
 		dev = &mp_ioapic_data[i]->dev;
-		dev->id = i; 
+		dev->id = i;
 		dev->cls = &ioapic_sysdev_class;
 		error = sysdev_register(dev);
 		if (error) {
@@ -2466,7 +2464,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 		msg->address_lo =
 			MSI_ADDR_BASE_LO |
 			((INT_DEST_MODE == 0) ?
-				MSI_ADDR_DEST_MODE_PHYSICAL:
+MSI_ADDR_DEST_MODE_PHYSICAL:
 				MSI_ADDR_DEST_MODE_LOGICAL) |
 			((INT_DELIVERY_MODE != dest_LowestPrio) ?
 				MSI_ADDR_REDIRECTION_CPU:
@@ -2477,7 +2475,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 			MSI_DATA_TRIGGER_EDGE |
 			MSI_DATA_LEVEL_ASSERT |
 			((INT_DELIVERY_MODE != dest_LowestPrio) ?
-				MSI_DATA_DELIVERY_FIXED:
+MSI_DATA_DELIVERY_FIXED:
 				MSI_DATA_DELIVERY_LOWPRI) |
 			MSI_DATA_VECTOR(vector);
 	}
@@ -2648,12 +2646,12 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 #endif /* CONFIG_HT_IRQ */
 
 /* --------------------------------------------------------------------------
-                          ACPI-based IOAPIC Configuration
+			ACPI-based IOAPIC Configuration
    -------------------------------------------------------------------------- */
 
 #ifdef CONFIG_ACPI
 
-int __init io_apic_get_unique_id (int ioapic, int apic_id)
+int __init io_apic_get_unique_id(int ioapic, int apic_id)
 {
 	union IO_APIC_reg_00 reg_00;
 	static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
@@ -2662,10 +2660,10 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
 	int i = 0;
 
 	/*
-	 * The P4 platform supports up to 256 APIC IDs on two separate APIC 
-	 * buses (one for LAPICs, one for IOAPICs), where predecessors only 
+	 * The P4 platform supports up to 256 APIC IDs on two separate APIC
+	 * buses (one for LAPICs, one for IOAPICs), where predecessors only
 	 * supports up to 16 on one shared APIC bus.
-	 * 
+	 *
 	 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
 	 *      advantage of new APIC bus architecture.
 	 */
@@ -2684,7 +2682,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
 	}
 
 	/*
-	 * Every APIC in a system must have a unique ID or we get lots of nice 
+	 * Every APIC in a system must have a unique ID or we get lots of nice
 	 * 'stuck on smp_invalidate_needed IPI wait' messages.
 	 */
 	if (check_apicid_used(apic_id_map, apic_id)) {
@@ -2701,7 +2699,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
 			"trying %d\n", ioapic, apic_id, i);
 
 		apic_id = i;
-	} 
+	}
 
 	tmp = apicid_to_cpu_present(apic_id);
 	physids_or(apic_id_map, apic_id_map, tmp);
@@ -2728,7 +2726,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
 }
 
 
-int __init io_apic_get_version (int ioapic)
+int __init io_apic_get_version(int ioapic)
 {
 	union IO_APIC_reg_01	reg_01;
 	unsigned long flags;
@@ -2741,7 +2739,7 @@ int __init io_apic_get_version (int ioapic)
 }
 
 
-int __init io_apic_get_redir_entries (int ioapic)
+int __init io_apic_get_redir_entries(int ioapic)
 {
 	union IO_APIC_reg_01	reg_01;
 	unsigned long flags;
@@ -2754,7 +2752,7 @@ int __init io_apic_get_redir_entries (int ioapic)
 }
 
 
-int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
+int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
 {
 	struct IO_APIC_route_entry entry;
 
@@ -2770,7 +2768,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
 	 * corresponding device driver registers for this IRQ.
 	 */
 
-	memset(&entry,0,sizeof(entry));
+	memset(&entry, 0, sizeof(entry));
 
 	entry.delivery_mode = INT_DELIVERY_MODE;
 	entry.dest_mode = INT_DEST_MODE;
-- 
cgit v1.2.3


From cd08d0754ecb0cb9293c8476cb33ded1d23d0d8f Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Thu, 19 Jun 2008 00:39:33 +0100
Subject: x86: fix IO APIC breakage on HP nx6325

On Thu, 19 Jun 2008, Rafael J. Wysocki wrote:

> >  With such a configuration the "x86: I/O APIC: timer through 8259A
> > second-chance" patch should not matter, because the only change it
> > introduces is an attempt to try the same I/O APIC pin again, but with the
> > IRQ0 line of the master 8259A enabled.  That's not a terribly unusual
> > configuration and nothing should get confused in the system.
>
> But it _does_ get confused, really.

 Something certainly gets confused, but so far I am not sure which bit
exactly it is, are you?

> >  Barring the unlikely possibility of the 8259A actually being wired to
> > INTIN2 of the I/O APIC I can see two possible explanations:
> >
> > 1. The 8259A interrupt actually escapes to the CPU somehow and is handled
> >    as an ExtINTA interrupt.  This would make the code in check_timer()
> >    decide it has found a working configuration, while actually it has been
> >    fooled.
[...]
> Here you go:
>
> [    0.108006] ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
> [    0.108006] ..MP-BIOS bug: 8254 timer not connected to IO-APIC
> [    0.108006] ...trying to set up timer (IRQ0) through the 8259A ... <3>
> [    0.108006] ..... (found apic 0 pin 2) ...<3> works.
>
> The full dmesg is at: http://www.sisk.pl/kernel/debug/20080618/dmesg-1.log

Thanks.  In this case I suspect the case #1 quoted above happens, that is
the 8259A manages to deliver its interrupt somehow.  Note at this stage it
is meant to be in the AEOI mode, so it can happily resubmit the interrupt
indefinitely with no additional handling as long as it receives INTA
cycles.

Can you please try the patch below on top of "x86: I/O APIC: timer
through 8259A second-chance" to see whether my hypothesis is true?  It
modifies the through-8259A setup path so that the APIC input gets masked,
but the 8259A has the timer interrupt still enabled.  Let me know how the
timer interrupt is routed in this case.

Bisected-by: "Rafael J. Wysocki" <rjw@sisk.pl>
Tested-by: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_64.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 60a022d6de5e..f06f5b4fb35f 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -1715,6 +1715,7 @@ static inline void __init check_timer(void)
 		/* replace_pin_at_irq(0, apic1, pin1, apic2, pin2); */
 		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
 		unmask_IO_APIC_irq(0);
+		clear_IO_APIC_pin(apic2, pin2);
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 			apic_printk(APIC_VERBOSE," works.\n");
-- 
cgit v1.2.3


From 7f0dbbc08d80df7ea15d8da4f0d78255891c8812 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Fri, 20 Jun 2008 01:35:07 +0100
Subject: x86: fix IO APIC breakage on HP nx6325, v2

> That helped a lot, the system seems to work normally now.
>
> Here's the relevant snippet from dmesg:
>
> [    0.108006] ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
> [    0.108006] ..MP-BIOS bug: 8254 timer not connected to IO-APIC
> [    0.108006] ...trying to set up timer (IRQ0) through the 8259A ... <3>
> [    0.108006] ..... (found apic 0 pin 2) ...<3> failed.
> [    0.108006] ...trying to set up timer as Virtual Wire IRQ...<3> works.
>
> and the whole thing is at: http://www.sisk.pl/kernel/debug/20080618/dmesg-2.log

 Hmm, that only proved the 8259A is indeed wired to the pin #2 of the I/O
APIC.

> I, personally, don't have any and AMD only has SB600 documentation on its
> web page (it's still marked as "AMD confidential" ;-)).

 Well, the IC block is most likely the same as that's not rocket science
and once done there is no need to fiddle with that.  That written, I am
afraid there is nothing useful about the IC in the document, except that
it's there and consists of an I/O APIC providing 24 inputs and the usual
pair of 8259A cores.  Thanks for the reference anyway.

> There is an interrupt controller in there, but I'm not sure if there's any
> 8259A.  The northbridge is on the CPU, actually.

 I will praise the day someone ships an x86 machine without an 8259A core!

 As expressed in another mail I suspect there may actually be a direct
route from the 8254 to INTIN0 in the southbridge -- this is what other
bootstrap logs seen in the Internet suggest.  This would mean this
particular BIOS is buggy (is it the latest version?) and provides an
incorrect IRQ override in its ACPI tables, for example because the
responsible block has been blindly copied from a machine using a commoner
wiring.  This could be moderately easily fixed up with a quirk based on
the PCI ID (after checking it again, we actually used to have a quirk for
ATI in this area, but the way it was done suggests the issue was not
understood well enough).

 Could you please remove the hack sent yesterday and test the patch
provided below?  I do hope it builds, but I have no immediate means to
check it.  Please report the output.  The intent is to test INTIN0
directly before testing INTIN2 through the 8259A.  Thanks.

 Aside of that, what I have gathered from your reports (please correct me
if I have got it wrong) is that when the through-8259A mode is used, then
after a while 8254 timer interrupts stop arriving.  What's interesting,
the "Virtual Wire IRQ" seems to work for you correctly (that's quite an
odd setup where a local APIC input is used in the native mode -- please
post /proc/interrupts for confirmation), which in turn implies the master
8259A drives its INT output as we expect.  Why would the I/O APIC input
have problems then?  Hmm...

[ mingo@elte.hu: revert the "x86: fix IO APIC breakage on HP nx6325"
  version. ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_64.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index f06f5b4fb35f..40aa041b2987 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -360,6 +360,26 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
 	entry->pin = pin;
 }
 
+/*
+ * Reroute an IRQ to a different pin.
+ */
+static void __init replace_pin_at_irq(unsigned int irq,
+				      int oldapic, int oldpin,
+				      int newapic, int newpin)
+{
+	struct irq_pin_list *entry = irq_2_pin + irq;
+
+	while (1) {
+		if (entry->apic == oldapic && entry->pin == oldpin) {
+			entry->apic = newapic;
+			entry->pin = newpin;
+		}
+		if (!entry->next)
+			break;
+		entry = irq_2_pin + entry->next;
+	}
+}
+
 
 #define DO_ACTION(name,R,ACTION, FINAL)					\
 									\
@@ -1680,6 +1700,11 @@ static inline void __init check_timer(void)
 		apic2 = apic1;
 	}
 
+	replace_pin_at_irq(0, 0, 0, apic1, pin1);
+	apic1 = 0;
+	pin1 = 0;
+	setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+
 	if (pin1 != -1) {
 		/*
 		 * Ok, does IRQ0 through the IOAPIC work?
@@ -1712,10 +1737,9 @@ static inline void __init check_timer(void)
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		/* replace_pin_at_irq(0, apic1, pin1, apic2, pin2); */
+		replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
 		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
 		unmask_IO_APIC_irq(0);
-		clear_IO_APIC_pin(apic2, pin2);
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 			apic_printk(APIC_VERBOSE," works.\n");
-- 
cgit v1.2.3


From 25556c1699ad84dd6077adf67c92eba362aa7dc2 Mon Sep 17 00:00:00 2001
From: Christophe Jaillet <jaillet.christophe@wanadoo.fr>
Date: Sun, 22 Jun 2008 22:13:48 +0200
Subject: x86, arch/x86/kernel/io_apic_32.c: use kzalloc instead of
 kmalloc/memset

1) replace kmalloc/memset with equivalent kzalloc.

Signed-off-by: Christophe Jaillet <jaillet.christophe@wanadoo.fr>
Cc: cj <jaillet.christophe@wanadoo.fr>
Cc: petero2@telia.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_32.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 9a924ebcd74c..e22cbfbce049 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -702,14 +702,12 @@ static int __init balanced_irq_init(void)
 		physical_balance = 1;
 
 	for_each_online_cpu(i) {
-		irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
-		irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+		irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+		irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
 		if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
 			printk(KERN_ERR "balanced_irq_init: out of memory");
 			goto failed;
 		}
-		memset(irq_cpu_data[i].irq_delta, 0, sizeof(unsigned long) * NR_IRQS);
-		memset(irq_cpu_data[i].last_irq, 0, sizeof(unsigned long) * NR_IRQS);
 	}
 
 	printk(KERN_INFO "Starting balanced_irq\n");
@@ -2382,12 +2380,11 @@ static int __init ioapic_init_sysfs(void)
 	for (i = 0; i < nr_ioapics; i++) {
 		size = sizeof(struct sys_device) + nr_ioapic_registers[i]
 			* sizeof(struct IO_APIC_route_entry);
-		mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
+		mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
 		if (!mp_ioapic_data[i]) {
 			printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
 			continue;
 		}
-		memset(mp_ioapic_data[i], 0, size);
 		dev = &mp_ioapic_data[i]->dev;
 		dev->id = i;
 		dev->cls = &ioapic_sysdev_class;
-- 
cgit v1.2.3


From 8b2ef1d7285740953a2c4ef7faf15fdfc5e2f358 Mon Sep 17 00:00:00 2001
From: Bernhard Walle <bwalle@suse.de>
Date: Sun, 8 Jun 2008 15:46:30 +0200
Subject: x86: add flags parameter to reserve_bootmem_generic()

This patch adds a 'flags' parameter to reserve_bootmem_generic() like it
already has been added in reserve_bootmem() with commit
72a7fe3967dbf86cb34e24fbf1d957fe24d2f246.

It also changes all users to use BOOTMEM_DEFAULT, which doesn't effectively
change the behaviour. Since the change is x86-specific, I don't think it's
necessary to add a new API for migration. There are only 4 users of that
function.

The change is necessary for the next patch, using reserve_bootmem_generic()
for crashkernel reservation.

Signed-off-by: Bernhard Walle <bwalle@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/mpparse.c |  5 +++--
 arch/x86/mm/init_64.c     | 17 ++++++++++++-----
 include/asm-x86/proto.h   |  2 +-
 3 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 1cc7a4b8643f..6ae60909b601 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -880,10 +880,11 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 			if (!reserve)
 				return 1;
 
-			reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
+			reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
+				BOOTMEM_DEFAULT);
 			if (mpf->mpf_physptr)
 				reserve_bootmem_generic(mpf->mpf_physptr,
-							PAGE_SIZE);
+					PAGE_SIZE, BOOTMEM_DEFAULT);
 #endif
 		return 1;
 		}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 819dad973b13..bf7bf1de6c25 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -799,12 +799,13 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 }
 #endif
 
-void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
+int __init reserve_bootmem_generic(unsigned long phys, unsigned len, int flags)
 {
 #ifdef CONFIG_NUMA
 	int nid, next_nid;
 #endif
 	unsigned long pfn = phys >> PAGE_SHIFT;
+	int ret;
 
 	if (pfn >= end_pfn) {
 		/*
@@ -812,11 +813,11 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
 		 * firmware tables:
 		 */
 		if (pfn < max_pfn_mapped)
-			return;
+			return -EFAULT;
 
 		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
 				phys, len);
-		return;
+		return -EFAULT;
 	}
 
 	/* Should check here against the e820 map to avoid double free */
@@ -824,9 +825,13 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
 	nid = phys_to_nid(phys);
 	next_nid = phys_to_nid(phys + len - 1);
 	if (nid == next_nid)
-		reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
+		ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
 	else
-		reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
+		ret = reserve_bootmem(phys, len, flags);
+
+	if (ret != 0)
+		return ret;
+
 #else
 	reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
 #endif
@@ -835,6 +840,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
 		dma_reserve += len / PAGE_SIZE;
 		set_dma_reserve(dma_reserve);
 	}
+
+	return 0;
 }
 
 int kern_addr_valid(unsigned long addr)
diff --git a/include/asm-x86/proto.h b/include/asm-x86/proto.h
index 6c8b41b03f6d..a9f51472521e 100644
--- a/include/asm-x86/proto.h
+++ b/include/asm-x86/proto.h
@@ -14,7 +14,7 @@ extern void ia32_syscall(void);
 extern void ia32_cstar_target(void);
 extern void ia32_sysenter_target(void);
 
-extern void reserve_bootmem_generic(unsigned long phys, unsigned len);
+extern int reserve_bootmem_generic(unsigned long phys, unsigned len, int flags);
 
 extern void syscall32_cpu_init(void);
 
-- 
cgit v1.2.3


From 12448e3ef3ae7fca7a7ad205084f45a67dcc4356 Mon Sep 17 00:00:00 2001
From: Bernhard Walle <bwalle@suse.de>
Date: Sun, 8 Jun 2008 15:46:31 +0200
Subject: x86: use reserve_bootmem_generic() to reserve crashkernel memory on
 x86_64

This patch uses reserve_bootmem_generic() instead of reserve_bootmem()
to reserve the crashkernel memory on x86_64. That's necessary for NUMA
machines, see 00212fef814612245ed0261cbac8426d0c9a31a5:

  [PATCH] Fix kdump Crash Kernel boot memory reservation for NUMA machines

  This patch will fix a boot memory reservation bug that trashes memory on
  the ES7000 when loading the kdump crash kernel.

  The code in arch/x86_64/kernel/setup.c to reserve boot memory for the crash
  kernel uses the non-numa aware "reserve_bootmem" function instead of the
  NUMA aware "reserve_bootmem_generic".  I checked to make sure that no other
  function was using "reserve_bootmem" and found none, except the ones that
  had NUMA ifdef'ed out.

  I have tested this patch only on an ES7000 with NUMA on and off (numa=off)
  in a single (non-NUMA) and multi-cell (NUMA) configurations.

  Signed-off-by: Amul Shah <amul.shah@unisys.com>
  Looks-good-to: Vivek Goyal <vgoyal@in.ibm.com>
  Cc: Andi Kleen <ak@muc.de>
  Signed-off-by: Andrew Morton <akpm@osdl.org>
  Signed-off-by: Linus Torvalds <torvalds@osdl.org>

The switch-back to reserve_bootmem() was accidentally introduced in
5c3391f9f749023a49c64d607da4fb49263690eb when adding the BOOTMEM_EXCLUSIVE
parameter.

Signed-off-by: Bernhard Walle <bwalle@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 26d60cc0e370..ec65696ca2cd 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -244,7 +244,7 @@ static void __init reserve_crashkernel(void)
 			return;
 		}
 
-		if (reserve_bootmem(crash_base, crash_size,
+		if (reserve_bootmem_generic(crash_base, crash_size,
 					BOOTMEM_EXCLUSIVE) < 0) {
 			printk(KERN_INFO "crashkernel reservation failed - "
 					"memory is in use\n");
-- 
cgit v1.2.3


From df5f6c212cc049d1989b5ce71bb863a367c261e9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 10 Jun 2008 16:38:41 +0200
Subject: x86: unify the reserve_bootmem() behavior of early_res_to_bootmem()

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index a706e9057ba5..68aba413d403 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -635,12 +635,8 @@ void __init early_res_to_bootmem(u64 start, u64 end)
 			continue;
 		printk(KERN_INFO "  early res: %d [%llx-%llx] %s\n", i,
 			final_start, final_end - 1, r->name);
-#ifdef CONFIG_X86_64
-		reserve_bootmem_generic(final_start, final_end - final_start);
-#else
 		reserve_bootmem(final_start, final_end - final_start,
 				BOOTMEM_DEFAULT);
-#endif
 	}
 }
 
-- 
cgit v1.2.3


From ab4a465e96adf2f3a8aaa95384bacfa9ab661e35 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 10 Jun 2008 12:55:54 -0700
Subject: x86: e820 merge parsing of the mem=/memmap= boot parameters

since we now have 32-bit support for e820_register_active_regions(),
we can merge the parsing of the mem=/memmap= boot parameters.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c    |  86 +++++++++++++++++++++++++++++++++
 arch/x86/kernel/e820_32.c | 120 ----------------------------------------------
 arch/x86/kernel/e820_64.c |  69 --------------------------
 include/asm-x86/e820.h    |   2 +
 include/asm-x86/e820_32.h |   2 -
 include/asm-x86/e820_64.h |   2 -
 6 files changed, 88 insertions(+), 193 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 68aba413d403..4f2cd5d179e2 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -890,3 +890,89 @@ u64 __init e820_hole_size(u64 start, u64 end)
 	}
 	return end - start - ((u64)ram << PAGE_SHIFT);
 }
+
+static void early_panic(char *msg)
+{
+	early_printk(msg);
+	panic(msg);
+}
+
+/* "mem=nopentium" disables the 4MB page tables. */
+static int __init parse_memopt(char *p)
+{
+	u64 mem_size;
+
+	if (!p)
+		return -EINVAL;
+
+#ifdef CONFIG_X86_32
+	if (!strcmp(p, "nopentium")) {
+		setup_clear_cpu_cap(X86_FEATURE_PSE);
+		return 0;
+	}
+#endif
+
+	mem_size = memparse(p, &p);
+	end_user_pfn = mem_size>>PAGE_SHIFT;
+	return 0;
+}
+early_param("mem", parse_memopt);
+
+static int userdef __initdata;
+
+static int __init parse_memmap_opt(char *p)
+{
+	char *oldp;
+	u64 start_at, mem_size;
+
+	if (!strcmp(p, "exactmap")) {
+#ifdef CONFIG_CRASH_DUMP
+		/*
+		 * If we are doing a crash dump, we still need to know
+		 * the real mem size before original memory map is
+		 * reset.
+		 */
+		e820_register_active_regions(0, 0, -1UL);
+		saved_max_pfn = e820_end_of_ram();
+		remove_all_active_ranges();
+#endif
+		e820.nr_map = 0;
+		userdef = 1;
+		return 0;
+	}
+
+	oldp = p;
+	mem_size = memparse(p, &p);
+	if (p == oldp)
+		return -EINVAL;
+
+	userdef = 1;
+	if (*p == '@') {
+		start_at = memparse(p+1, &p);
+		add_memory_region(start_at, mem_size, E820_RAM);
+	} else if (*p == '#') {
+		start_at = memparse(p+1, &p);
+		add_memory_region(start_at, mem_size, E820_ACPI);
+	} else if (*p == '$') {
+		start_at = memparse(p+1, &p);
+		add_memory_region(start_at, mem_size, E820_RESERVED);
+	} else {
+		end_user_pfn = (mem_size >> PAGE_SHIFT);
+	}
+	return *p == '\0' ? 0 : -EINVAL;
+}
+early_param("memmap", parse_memmap_opt);
+
+void __init finish_e820_parsing(void)
+{
+	if (userdef) {
+		int nr = e820.nr_map;
+
+		if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
+			early_panic("Invalid user supplied memory map");
+		e820.nr_map = nr;
+
+		printk(KERN_INFO "user-defined physical RAM map:\n");
+		e820_print_map("user");
+	}
+}
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
index e8a3b968c9fa..8de3df9548db 100644
--- a/arch/x86/kernel/e820_32.c
+++ b/arch/x86/kernel/e820_32.c
@@ -207,36 +207,6 @@ void __init init_iomem_resources(struct resource *code_resource,
 	}
 }
 
-void __init limit_regions(unsigned long long size)
-{
-	unsigned long long current_addr;
-	int i;
-
-	e820_print_map("limit_regions start");
-	for (i = 0; i < e820.nr_map; i++) {
-		current_addr = e820.map[i].addr + e820.map[i].size;
-		if (current_addr < size)
-			continue;
-
-		if (e820.map[i].type != E820_RAM)
-			continue;
-
-		if (e820.map[i].addr >= size) {
-			/*
-			 * This region starts past the end of the
-			 * requested size, skip it completely.
-			 */
-			e820.nr_map = i;
-		} else {
-			e820.nr_map = i + 1;
-			e820.map[i].size -= current_addr - size;
-		}
-		e820_print_map("limit_regions endfor");
-		return;
-	}
-	e820_print_map("limit_regions endfunc");
-}
-
 /* Overridden in paravirt.c if CONFIG_PARAVIRT */
 char * __init __attribute__((weak)) memory_setup(void)
 {
@@ -249,93 +219,3 @@ void __init setup_memory_map(void)
 	e820_print_map(memory_setup());
 }
 
-static int __initdata user_defined_memmap;
-
-/*
- * "mem=nopentium" disables the 4MB page tables.
- * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
- * to <mem>, overriding the bios size.
- * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
- * <start> to <start>+<mem>, overriding the bios size.
- *
- * HPA tells me bootloaders need to parse mem=, so no new
- * option should be mem=  [also see Documentation/i386/boot.txt]
- */
-static int __init parse_mem(char *arg)
-{
-	if (!arg)
-		return -EINVAL;
-
-	if (strcmp(arg, "nopentium") == 0) {
-		setup_clear_cpu_cap(X86_FEATURE_PSE);
-	} else {
-		/* If the user specifies memory size, we
-		 * limit the BIOS-provided memory map to
-		 * that size. exactmap can be used to specify
-		 * the exact map. mem=number can be used to
-		 * trim the existing memory map.
-		 */
-		unsigned long long mem_size;
-
-		mem_size = memparse(arg, &arg);
-		limit_regions(mem_size);
-		user_defined_memmap = 1;
-	}
-	return 0;
-}
-early_param("mem", parse_mem);
-
-static int __init parse_memmap(char *arg)
-{
-	if (!arg)
-		return -EINVAL;
-
-	if (strcmp(arg, "exactmap") == 0) {
-#ifdef CONFIG_CRASH_DUMP
-		/* If we are doing a crash dump, we
-		 * still need to know the real mem
-		 * size before original memory map is
-		 * reset.
-		 */
-		e820_register_active_regions(0, 0, -1UL);
-		saved_max_pfn = e820_end_of_ram();
-		remove_all_active_ranges();
-#endif
-		e820.nr_map = 0;
-		user_defined_memmap = 1;
-	} else {
-		/* If the user specifies memory size, we
-		 * limit the BIOS-provided memory map to
-		 * that size. exactmap can be used to specify
-		 * the exact map. mem=number can be used to
-		 * trim the existing memory map.
-		 */
-		unsigned long long start_at, mem_size;
-
-		mem_size = memparse(arg, &arg);
-		if (*arg == '@') {
-			start_at = memparse(arg+1, &arg);
-			add_memory_region(start_at, mem_size, E820_RAM);
-		} else if (*arg == '#') {
-			start_at = memparse(arg+1, &arg);
-			add_memory_region(start_at, mem_size, E820_ACPI);
-		} else if (*arg == '$') {
-			start_at = memparse(arg+1, &arg);
-			add_memory_region(start_at, mem_size, E820_RESERVED);
-		} else {
-			limit_regions(mem_size);
-			user_defined_memmap = 1;
-		}
-	}
-	return 0;
-}
-early_param("memmap", parse_memmap);
-
-void __init finish_e820_parsing(void)
-{
-	if (user_defined_memmap) {
-		printk(KERN_INFO "user-defined physical RAM map:\n");
-		e820_print_map("user");
-	}
-}
-
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index 0afee2ca0bf8..47952b1690b6 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -96,75 +96,6 @@ char *__init machine_specific_memory_setup(void)
 	return who;
 }
 
-static int __init parse_memopt(char *p)
-{
-	if (!p)
-		return -EINVAL;
-	end_user_pfn = memparse(p, &p);
-	end_user_pfn >>= PAGE_SHIFT;
-	return 0;
-}
-early_param("mem", parse_memopt);
-
-static int userdef __initdata;
-
-static int __init parse_memmap_opt(char *p)
-{
-	char *oldp;
-	unsigned long long start_at, mem_size;
-
-	if (!strcmp(p, "exactmap")) {
-#ifdef CONFIG_CRASH_DUMP
-		/*
-		 * If we are doing a crash dump, we still need to know
-		 * the real mem size before original memory map is
-		 * reset.
-		 */
-		e820_register_active_regions(0, 0, -1UL);
-		saved_max_pfn = e820_end_of_ram();
-		remove_all_active_ranges();
-#endif
-		e820.nr_map = 0;
-		userdef = 1;
-		return 0;
-	}
-
-	oldp = p;
-	mem_size = memparse(p, &p);
-	if (p == oldp)
-		return -EINVAL;
-
-	userdef = 1;
-	if (*p == '@') {
-		start_at = memparse(p+1, &p);
-		add_memory_region(start_at, mem_size, E820_RAM);
-	} else if (*p == '#') {
-		start_at = memparse(p+1, &p);
-		add_memory_region(start_at, mem_size, E820_ACPI);
-	} else if (*p == '$') {
-		start_at = memparse(p+1, &p);
-		add_memory_region(start_at, mem_size, E820_RESERVED);
-	} else {
-		end_user_pfn = (mem_size >> PAGE_SHIFT);
-	}
-	return *p == '\0' ? 0 : -EINVAL;
-}
-early_param("memmap", parse_memmap_opt);
-
-void __init finish_e820_parsing(void)
-{
-	if (userdef) {
-		int nr = e820.nr_map;
-
-		if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
-			early_panic("Invalid user supplied memory map");
-		e820.nr_map = nr;
-
-		printk(KERN_INFO "user-defined physical RAM map:\n");
-		e820_print_map("user");
-	}
-}
-
 int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
 {
 	int i;
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index 8aa32323a182..a5959e3a5626 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -98,6 +98,8 @@ extern int e820_find_active_region(const struct e820entry *ei,
 extern void e820_register_active_regions(int nid, unsigned long start_pfn,
 					 unsigned long end_pfn);
 extern u64 e820_hole_size(u64 start, u64 end);
+extern void finish_e820_parsing(void);
+
 #endif /* __ASSEMBLY__ */
 
 #define ISA_START_ADDRESS	0xa0000
diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h
index 212b74c10efc..9135ce6e617e 100644
--- a/include/asm-x86/e820_32.h
+++ b/include/asm-x86/e820_32.h
@@ -19,9 +19,7 @@
 #ifndef __ASSEMBLY__
 
 extern void setup_memory_map(void);
-extern void finish_e820_parsing(void);
 
-extern void limit_regions(unsigned long long size);
 extern void init_iomem_resources(struct resource *code_resource,
 				 struct resource *data_resource,
 				 struct resource *bss_resource);
diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h
index 368585daaa42..478547c63222 100644
--- a/include/asm-x86/e820_64.h
+++ b/include/asm-x86/e820_64.h
@@ -22,8 +22,6 @@ extern int is_memory_any_valid(unsigned long start, unsigned long end);
 extern int e820_all_non_reserved(unsigned long start, unsigned long end);
 extern int is_memory_all_valid(unsigned long start, unsigned long end);
 
-extern void finish_e820_parsing(void);
-
 #endif/*!__ASSEMBLY__*/
 
 #endif/*__E820_HEADER*/
-- 
cgit v1.2.3


From b1f006b65c12b85df81f12c1073ad18fd26f4a16 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Mon, 9 Jun 2008 18:11:36 -0700
Subject: x86: make generic arch support NUMAQ, fix #2

we are checking mptable early for numaq, so don't need to reserve_bootmem
for it. bootmem is not there yet.

do the same thing as 64-bit.

found it on 64g above system from 64-bit kernel kexec to 32 bit kernel with
numaq support.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/mpparse.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 6ae60909b601..7ac1b689b70a 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -853,9 +853,13 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 			smp_found_config = 1;
 #endif
 			mpf_found = mpf;
-#ifdef CONFIG_X86_32
+
 			printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
 			       mpf, virt_to_phys(mpf));
+
+			if (!reserve)
+				return 1;
+#ifdef CONFIG_X86_32
 			reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
 					BOOTMEM_DEFAULT);
 			if (mpf->mpf_physptr) {
@@ -877,9 +881,6 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 			}
 
 #else
-			if (!reserve)
-				return 1;
-
 			reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
 				BOOTMEM_DEFAULT);
 			if (mpf->mpf_physptr)
-- 
cgit v1.2.3


From d2dbf343329dc777d77488743465f7be4245971d Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Fri, 13 Jun 2008 02:00:56 -0700
Subject: x86: clean up reserve_bootmem_generic() and port it to 32-bit

1. add reserve_bootmem_generic for 32bit
2. change len to unsigned long
3. make early_res_to_bootmem to use it

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c    |  2 +-
 arch/x86/kernel/mpparse.c | 18 ++++++------------
 arch/x86/mm/init_32.c     |  6 ++++++
 arch/x86/mm/init_64.c     |  3 ++-
 include/asm-x86/proto.h   |  2 --
 include/linux/bootmem.h   |  2 ++
 6 files changed, 17 insertions(+), 16 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 4f2cd5d179e2..774063f11be0 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -635,7 +635,7 @@ void __init early_res_to_bootmem(u64 start, u64 end)
 			continue;
 		printk(KERN_INFO "  early res: %d [%llx-%llx] %s\n", i,
 			final_start, final_end - 1, r->name);
-		reserve_bootmem(final_start, final_end - final_start,
+		reserve_bootmem_generic(final_start, final_end - final_start,
 				BOOTMEM_DEFAULT);
 	}
 }
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 7ac1b689b70a..b62ac6ba1410 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -859,10 +859,11 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 
 			if (!reserve)
 				return 1;
-#ifdef CONFIG_X86_32
-			reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
+			reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
 					BOOTMEM_DEFAULT);
 			if (mpf->mpf_physptr) {
+				unsigned long size = PAGE_SIZE;
+#ifdef CONFIG_X86_32
 				/*
 				 * We cannot access to MPC table to compute
 				 * table size yet, as only few megabytes from
@@ -872,22 +873,15 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 				 * PAGE_SIZE from mpg->mpf_physptr yields BUG()
 				 * in reserve_bootmem.
 				 */
-				unsigned long size = PAGE_SIZE;
 				unsigned long end = max_low_pfn * PAGE_SIZE;
 				if (mpf->mpf_physptr + size > end)
 					size = end - mpf->mpf_physptr;
-				reserve_bootmem(mpf->mpf_physptr, size,
+#endif
+				reserve_bootmem_generic(mpf->mpf_physptr, size,
 						BOOTMEM_DEFAULT);
 			}
 
-#else
-			reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
-				BOOTMEM_DEFAULT);
-			if (mpf->mpf_physptr)
-				reserve_bootmem_generic(mpf->mpf_physptr,
-					PAGE_SIZE, BOOTMEM_DEFAULT);
-#endif
-		return 1;
+			return 1;
 		}
 		bp += 4;
 		length -= 16;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 0e7bb5e81670..abadb1da70df 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -785,3 +785,9 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 	free_init_pages("initrd memory", start, end);
 }
 #endif
+
+int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
+				   int flags)
+{
+	return reserve_bootmem(phys, len, flags);
+}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index bf7bf1de6c25..b8c2c1ef7ad5 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -799,7 +799,8 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 }
 #endif
 
-int __init reserve_bootmem_generic(unsigned long phys, unsigned len, int flags)
+int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
+				   int flags)
 {
 #ifdef CONFIG_NUMA
 	int nid, next_nid;
diff --git a/include/asm-x86/proto.h b/include/asm-x86/proto.h
index a9f51472521e..3dd458c385c0 100644
--- a/include/asm-x86/proto.h
+++ b/include/asm-x86/proto.h
@@ -14,8 +14,6 @@ extern void ia32_syscall(void);
 extern void ia32_cstar_target(void);
 extern void ia32_sysenter_target(void);
 
-extern int reserve_bootmem_generic(unsigned long phys, unsigned len, int flags);
-
 extern void syscall32_cpu_init(void);
 
 extern void check_efer(void);
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 686895bacd9d..a1d9b79078ea 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -84,6 +84,8 @@ extern int reserve_bootmem(unsigned long addr, unsigned long size, int flags);
 	__alloc_bootmem_low(x, PAGE_SIZE, 0)
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
+extern int reserve_bootmem_generic(unsigned long addr, unsigned long size,
+				   int flags);
 extern unsigned long free_all_bootmem(void);
 extern unsigned long free_all_bootmem_node(pg_data_t *pgdat);
 extern void *__alloc_bootmem_node(pg_data_t *pgdat,
-- 
cgit v1.2.3


From 9a27f5c51629c3d3b7718dd4be3d2722b472fafe Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Fri, 13 Jun 2008 20:07:03 -0700
Subject: x86: clean up relocate_initrd

1. move that before zone_sizes_init ...
2. add free_early for one old one, otherwise it will be be reserved again
   when we init highmem.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 1d4be07e15e5..72b11d4557b7 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -584,6 +584,9 @@ static void __init relocate_initrd(void)
 	printk(KERN_INFO "Copied RAMDISK from %016llx - %016llx to %08llx - %08llx\n",
 		ramdisk_image, ramdisk_image + ramdisk_size - 1,
 		ramdisk_here, ramdisk_here + ramdisk_size - 1);
+
+	/* need to free that, otherwise init highmem will reserve it again */
+	free_early(ramdisk_image, ramdisk_image+ramdisk_size);
 }
 
 #endif /* CONFIG_BLK_DEV_INITRD */
@@ -801,10 +804,6 @@ void __init setup_arch(char **cmdline_p)
 		init_ohci1394_dma_on_all_controllers();
 #endif
 
-	remapped_pgdat_init();
-	sparse_init();
-	zone_sizes_init();
-
 	/*
 	 * NOTE: at this point the bootmem allocator is fully available.
 	 */
@@ -813,6 +812,10 @@ void __init setup_arch(char **cmdline_p)
 	relocate_initrd();
 #endif
 
+	remapped_pgdat_init();
+	sparse_init();
+	zone_sizes_init();
+
 	paravirt_post_allocator_init();
 
 	dmi_scan_machine();
-- 
cgit v1.2.3


From d867e5310bd3c560093d39669ef52ff7f1b5711a Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sat, 14 Jun 2008 01:26:41 -0700
Subject: x86: keep MP_intsrc_info untouched if we do not update mptable

Daniel Exner reported IO-APIC enumeration breakage in linux-next.

Alexey Starikovskiy found out that it might be related to
commit 2944e16b25 "x86: update mptable".

use enable_update_mptable to decide if need check before add mp_irqs array.

Reported-by: Daniel Exner <webmaster@dragonslave.de>
Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/boot.c |  3 +++
 arch/x86/kernel/mpparse.c   | 13 +++++++------
 include/asm-x86/mpspec.h    |  1 +
 3 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index caf4ed7ca069..4d370b1c5ae8 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1172,6 +1172,9 @@ int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
 	struct mpc_config_intsrc intsrc;
 	int ioapic;
 
+	if (!enable_update_mptable)
+		return 0;
+
 	/* print the entry should happen on mptable identically */
 	intsrc.mpc_type = MP_INTSRC;
 	intsrc.mpc_irqtype = mp_INT;
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index b62ac6ba1410..014ac5d90f80 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -34,6 +34,8 @@
 #include <mach_mpparse.h>
 #endif
 
+int enable_update_mptable;
+
 /*
  * Checksum an MP configuration block.
  */
@@ -295,10 +297,11 @@ void MP_intsrc_info(struct mpc_config_intsrc *m)
 
 	print_MP_intsrc_info(m);
 
-	for (i = 0; i < mp_irq_entries; i++) {
-		if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
-			return;
-	}
+	if (enable_update_mptable)
+		for (i = 0; i < mp_irq_entries; i++) {
+			if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
+				return;
+		}
 
 	assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
 	if (++mp_irq_entries == MAX_IRQ_SOURCES)
@@ -1110,8 +1113,6 @@ out:
 	return 0;
 }
 
-int __initdata enable_update_mptable;
-
 static int __init update_mptable_setup(char *str)
 {
 	enable_update_mptable = 1;
diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h
index b8ba37496e2d..f48dbca740e4 100644
--- a/include/asm-x86/mpspec.h
+++ b/include/asm-x86/mpspec.h
@@ -61,6 +61,7 @@ extern void mp_config_acpi_legacy_irqs(void);
 extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low);
 extern void MP_intsrc_info(struct mpc_config_intsrc *m);
 #ifdef CONFIG_X86_IO_APIC
+extern int enable_update_mptable;
 extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
 				u32 gsi, int triggering, int polarity);
 #else
-- 
cgit v1.2.3


From 6df8809bbd1b48cc6d428df7372ed749c8711641 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 15 Jun 2008 18:19:46 -0700
Subject: x86: use dstapic in mp_config_acpi_legacy_irqs

so we don't get the same value multiple times.

also make mp_config_acpi_legacy_irqs more readable by moving assignments
together.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/boot.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 4d370b1c5ae8..29b365a66cf3 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -962,8 +962,8 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 
 void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 {
-	int ioapic = -1;
-	int pin = -1;
+	int ioapic;
+	int pin;
 
 	/*
 	 * Convert 'gsi' to 'ioapic.pin'.
@@ -997,8 +997,9 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 
 void __init mp_config_acpi_legacy_irqs(void)
 {
-	int i = 0;
-	int ioapic = -1;
+	int i;
+	int ioapic;
+	unsigned int dstapic;
 
 #if defined (CONFIG_MCA) || defined (CONFIG_EISA)
 	/*
@@ -1023,6 +1024,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 	ioapic = mp_find_ioapic(0);
 	if (ioapic < 0)
 		return;
+	dstapic = mp_ioapics[ioapic].mp_apicid;
 
 	/*
 	 * Use the default configuration for the IRQs 0-15.  Unless
@@ -1031,11 +1033,6 @@ void __init mp_config_acpi_legacy_irqs(void)
 	for (i = 0; i < 16; i++) {
 		int idx;
 
-		mp_irqs[mp_irq_entries].mp_type = MP_INTSRC;
-		mp_irqs[mp_irq_entries].mp_irqflag = 0;	/* Conforming */
-		mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS;
-		mp_irqs[mp_irq_entries].mp_dstapic = mp_ioapics[ioapic].mp_apicid;
-
 		for (idx = 0; idx < mp_irq_entries; idx++) {
 			struct mp_config_intsrc *irq = mp_irqs + idx;
 
@@ -1045,9 +1042,8 @@ void __init mp_config_acpi_legacy_irqs(void)
 				break;
 
 			/* Do we already have a mapping for this IOAPIC pin */
-			if ((irq->mp_dstapic ==
-				mp_irqs[mp_irq_entries].mp_dstapic) &&
-			    (irq->mp_dstirq == i))
+			if (irq->mp_dstapic == dstapic &&
+			    irq->mp_dstirq == i)
 				break;
 		}
 
@@ -1056,8 +1052,12 @@ void __init mp_config_acpi_legacy_irqs(void)
 			continue;	/* IRQ already used */
 		}
 
+		mp_irqs[mp_irq_entries].mp_type = MP_INTSRC;
+		mp_irqs[mp_irq_entries].mp_irqflag = 0;	/* Conforming */
+		mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS;
+		mp_irqs[mp_irq_entries].mp_dstapic = dstapic;
 		mp_irqs[mp_irq_entries].mp_irqtype = mp_INT;
-		mp_irqs[mp_irq_entries].mp_srcbusirq = i;	/* Identity mapped */
+		mp_irqs[mp_irq_entries].mp_srcbusirq = i; /* Identity mapped */
 		mp_irqs[mp_irq_entries].mp_dstirq = i;
 
 		if (++mp_irq_entries == MAX_IRQ_SOURCES)
-- 
cgit v1.2.3


From d0be6bdea103b8d04c8a3495538b7c0011ae4129 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 15 Jun 2008 18:58:51 -0700
Subject: x86: rename two e820 related functions

rename update_memory_range to e820_update_range
rename add_memory_region to e820_add_region

to make it more clear that they are about e820 map operations.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/aperture_64.c   |  2 +-
 arch/x86/kernel/cpu/mtrr/main.c |  2 +-
 arch/x86/kernel/e820.c          | 16 ++++++++--------
 arch/x86/kernel/efi.c           |  2 +-
 arch/x86/lguest/boot.c          |  2 +-
 arch/x86/mach-default/setup.c   |  4 ++--
 arch/x86/mach-visws/setup.c     |  6 +++---
 arch/x86/mach-voyager/setup.c   | 12 ++++++------
 arch/x86/xen/setup.c            |  4 ++--
 include/asm-x86/e820.h          |  4 ++--
 10 files changed, 27 insertions(+), 27 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 479926d9e004..66b140932b23 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -292,7 +292,7 @@ void __init early_gart_iommu_check(void)
 				    E820_RAM)) {
 			/* reserved it, so we can resuse it in second kernel */
 			printk(KERN_INFO "update e820 for GART\n");
-			add_memory_region(aper_base, aper_size, E820_RESERVED);
+			e820_add_region(aper_base, aper_size, E820_RESERVED);
 			update_e820();
 		}
 		return;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 0642201784e0..105afe12beb0 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -1440,7 +1440,7 @@ static u64 __init real_trim_memory(unsigned long start_pfn,
 	trim_size <<= PAGE_SHIFT;
 	trim_size -= trim_start;
 
-	return update_memory_range(trim_start, trim_size, E820_RAM,
+	return e820_update_range(trim_start, trim_size, E820_RAM,
 				E820_RESERVED);
 }
 /**
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 774063f11be0..5051ce744b4e 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -94,7 +94,7 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type)
 /*
  * Add a memory region to the kernel e820 map.
  */
-void __init add_memory_region(u64 start, u64 size, int type)
+void __init e820_add_region(u64 start, u64 size, int type)
 {
 	int x = e820.nr_map;
 
@@ -384,12 +384,12 @@ int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
 		if (start > end)
 			return -1;
 
-		add_memory_region(start, size, type);
+		e820_add_region(start, size, type);
 	} while (biosmap++, --nr_map);
 	return 0;
 }
 
-u64 __init update_memory_range(u64 start, u64 size, unsigned old_type,
+u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
 				unsigned new_type)
 {
 	int i;
@@ -414,7 +414,7 @@ u64 __init update_memory_range(u64 start, u64 size, unsigned old_type,
 		final_end = min(start + size, ei->addr + ei->size);
 		if (final_start >= final_end)
 			continue;
-		add_memory_region(final_start, final_end - final_start,
+		e820_add_region(final_start, final_end - final_start,
 					 new_type);
 		real_updated_size += final_end - final_start;
 	}
@@ -774,7 +774,7 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
 		return 0;
 
 	addr = round_down(start + size - sizet, align);
-	update_memory_range(addr, sizet, E820_RAM, E820_RESERVED);
+	e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
 	printk(KERN_INFO "update e820 for early_reserve_e820\n");
 	update_e820();
 
@@ -949,13 +949,13 @@ static int __init parse_memmap_opt(char *p)
 	userdef = 1;
 	if (*p == '@') {
 		start_at = memparse(p+1, &p);
-		add_memory_region(start_at, mem_size, E820_RAM);
+		e820_add_region(start_at, mem_size, E820_RAM);
 	} else if (*p == '#') {
 		start_at = memparse(p+1, &p);
-		add_memory_region(start_at, mem_size, E820_ACPI);
+		e820_add_region(start_at, mem_size, E820_ACPI);
 	} else if (*p == '$') {
 		start_at = memparse(p+1, &p);
-		add_memory_region(start_at, mem_size, E820_RESERVED);
+		e820_add_region(start_at, mem_size, E820_RESERVED);
 	} else {
 		end_user_pfn = (mem_size >> PAGE_SHIFT);
 	}
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index d5c7fcdd1861..473c89fe5073 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -233,7 +233,7 @@ static void __init add_efi_memmap(void)
 			e820_type = E820_RAM;
 		else
 			e820_type = E820_RESERVED;
-		add_memory_region(start, size, e820_type);
+		e820_add_region(start, size, e820_type);
 	}
 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
 }
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 5e4772907c6e..e72cf0793fbe 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -835,7 +835,7 @@ static __init char *lguest_memory_setup(void)
 
 	/* The Linux bootloader header contains an "e820" memory map: the
 	 * Launcher populated the first entry with our memory limit. */
-	add_memory_region(boot_params.e820_map[0].addr,
+	e820_add_region(boot_params.e820_map[0].addr,
 			  boot_params.e820_map[0].size,
 			  boot_params.e820_map[0].type);
 
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
index 56b4c39cb7fa..7ae692a76769 100644
--- a/arch/x86/mach-default/setup.c
+++ b/arch/x86/mach-default/setup.c
@@ -184,8 +184,8 @@ char * __init machine_specific_memory_setup(void)
 		}
 
 		e820.nr_map = 0;
-		add_memory_region(0, LOWMEMSIZE(), E820_RAM);
-		add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
+		e820_add_region(0, LOWMEMSIZE(), E820_RAM);
+		e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
   	}
 	return who;
 }
diff --git a/arch/x86/mach-visws/setup.c b/arch/x86/mach-visws/setup.c
index de4c9dbd086f..d67868ec9b7f 100644
--- a/arch/x86/mach-visws/setup.c
+++ b/arch/x86/mach-visws/setup.c
@@ -175,9 +175,9 @@ char * __init machine_specific_memory_setup(void)
 	sgivwfb_mem_size &= ~((1 << 20) - 1);
 	sgivwfb_mem_phys = mem_size - gfx_mem_size;
 
-	add_memory_region(0, LOWMEMSIZE(), E820_RAM);
-	add_memory_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM);
-	add_memory_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED);
+	e820_add_region(0, LOWMEMSIZE(), E820_RAM);
+	e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM);
+	e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED);
 
 	return "PROM";
 }
diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c
index f4aca9fa9546..f27b583154e5 100644
--- a/arch/x86/mach-voyager/setup.c
+++ b/arch/x86/mach-voyager/setup.c
@@ -74,7 +74,7 @@ char *__init machine_specific_memory_setup(void)
 
 		e820.nr_map = 0;
 		for (i = 0; voyager_memory_detect(i, &addr, &length); i++) {
-			add_memory_region(addr, length, E820_RAM);
+			e820_add_region(addr, length, E820_RAM);
 		}
 		return who;
 	} else if (voyager_level == 4) {
@@ -92,14 +92,14 @@ char *__init machine_specific_memory_setup(void)
 			tom = (boot_params.screen_info.ext_mem_k) << 10;
 		}
 		who = "Voyager-TOM";
-		add_memory_region(0, 0x9f000, E820_RAM);
+		e820_add_region(0, 0x9f000, E820_RAM);
 		/* map from 1M to top of memory */
-		add_memory_region(1 * 1024 * 1024, tom - 1 * 1024 * 1024,
+		e820_add_region(1 * 1024 * 1024, tom - 1 * 1024 * 1024,
 				  E820_RAM);
 		/* FIXME: Should check the ASICs to see if I need to
 		 * take out the 8M window.  Just do it at the moment
 		 * */
-		add_memory_region(8 * 1024 * 1024, 8 * 1024 * 1024,
+		e820_add_region(8 * 1024 * 1024, 8 * 1024 * 1024,
 				  E820_RESERVED);
 		return who;
 	}
@@ -131,8 +131,8 @@ char *__init machine_specific_memory_setup(void)
 		}
 
 		e820.nr_map = 0;
-		add_memory_region(0, LOWMEMSIZE(), E820_RAM);
-		add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
+		e820_add_region(0, LOWMEMSIZE(), E820_RAM);
+		e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
 	}
 	return who;
 }
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 82517e4a752a..9001c9df04d8 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -39,8 +39,8 @@ char * __init xen_memory_setup(void)
 	unsigned long max_pfn = xen_start_info->nr_pages;
 
 	e820.nr_map = 0;
-	add_memory_region(0, LOWMEMSIZE(), E820_RAM);
-	add_memory_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM);
+	e820_add_region(0, LOWMEMSIZE(), E820_RAM);
+	e820_add_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM);
 
 	return "Xen";
 }
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index a5959e3a5626..6b0ce745a60c 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -60,12 +60,12 @@ extern struct e820map e820;
 
 extern int e820_any_mapped(u64 start, u64 end, unsigned type);
 extern int e820_all_mapped(u64 start, u64 end, unsigned type);
-extern void add_memory_region(u64 start, u64 size, int type);
+extern void e820_add_region(u64 start, u64 size, int type);
 extern void e820_print_map(char *who);
 extern int
 sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, int *pnr_map);
 extern int copy_e820_map(struct e820entry *biosmap, int nr_map);
-extern u64 update_memory_range(u64 start, u64 size, unsigned old_type,
+extern u64 e820_update_range(u64 start, u64 size, unsigned old_type,
 			       unsigned new_type);
 extern void update_e820(void);
 extern void e820_setup_gap(void);
-- 
cgit v1.2.3


From b5bc6c0e55000dab86b73f838f5ad02908b23755 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sat, 14 Jun 2008 18:32:52 -0700
Subject: x86, mm: use add_highpages_with_active_regions() for high pages init
 v2

use early_node_map to init high pages, so we can remove page_is_ram() and
page_is_reserved_early() in the big loop with add_one_highpage

also remove page_is_reserved_early(), it is not needed anymore.

v2: fix the build of other platforms

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c     | 11 --------
 arch/x86/mm/discontig_32.c | 19 ++++++--------
 arch/x86/mm/init_32.c      | 62 ++++++++++++++++++++++++++++++++++++++--------
 include/asm-x86/e820.h     |  1 -
 include/asm-x86/highmem.h  |  3 +++
 include/linux/mm.h         |  2 ++
 mm/page_alloc.c            |  8 ++++++
 7 files changed, 71 insertions(+), 35 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 5051ce744b4e..ed46b7a6bc13 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -612,17 +612,6 @@ void __init free_early(u64 start, u64 end)
 	early_res[j - 1].end = 0;
 }
 
-int __init page_is_reserved_early(unsigned long pagenr)
-{
-	u64 start = (u64)pagenr << PAGE_SHIFT;
-	int i;
-	struct early_res *r;
-
-	i = find_overlapped_early(start, start + PAGE_SIZE);
-	r = &early_res[i];
-	return (i < MAX_EARLY_RES && r->end);
-}
-
 void __init early_res_to_bootmem(u64 start, u64 end)
 {
 	int i;
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index c3f119e99e0d..7c4d0255f8d8 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -100,7 +100,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
 #endif
 
 extern unsigned long find_max_low_pfn(void);
-extern void add_one_highpage_init(struct page *, int, int);
 extern unsigned long highend_pfn, highstart_pfn;
 
 #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
@@ -432,10 +431,10 @@ void __init set_highmem_pages_init(int bad_ppro)
 {
 #ifdef CONFIG_HIGHMEM
 	struct zone *zone;
-	struct page *page;
+	int nid;
 
 	for_each_zone(zone) {
-		unsigned long node_pfn, zone_start_pfn, zone_end_pfn;
+		unsigned long zone_start_pfn, zone_end_pfn;
 
 		if (!is_highmem(zone))
 			continue;
@@ -443,16 +442,12 @@ void __init set_highmem_pages_init(int bad_ppro)
 		zone_start_pfn = zone->zone_start_pfn;
 		zone_end_pfn = zone_start_pfn + zone->spanned_pages;
 
+		nid = zone_to_nid(zone);
 		printk("Initializing %s for node %d (%08lx:%08lx)\n",
-				zone->name, zone_to_nid(zone),
-				zone_start_pfn, zone_end_pfn);
-
-		for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) {
-			if (!pfn_valid(node_pfn))
-				continue;
-			page = pfn_to_page(node_pfn);
-			add_one_highpage_init(page, node_pfn, bad_ppro);
-		}
+				zone->name, nid, zone_start_pfn, zone_end_pfn);
+
+		add_highpages_with_active_regions(nid, zone_start_pfn,
+				 zone_end_pfn, bad_ppro);
 	}
 	totalram_pages += totalhigh_pages;
 #endif
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index abadb1da70df..ba07a489230e 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -287,10 +287,10 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
 	pkmap_page_table = pte;
 }
 
-void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
+static void __init
+add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
 {
-	if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn)) &&
-	    !page_is_reserved_early(pfn)) {
+	if (!(bad_ppro && page_kills_ppro(pfn))) {
 		ClearPageReserved(page);
 		init_page_count(page);
 		__free_page(page);
@@ -299,18 +299,58 @@ void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
 		SetPageReserved(page);
 }
 
+struct add_highpages_data {
+	unsigned long start_pfn;
+	unsigned long end_pfn;
+	int bad_ppro;
+};
+
+static void __init add_highpages_work_fn(unsigned long start_pfn,
+					 unsigned long end_pfn, void *datax)
+{
+	int node_pfn;
+	struct page *page;
+	unsigned long final_start_pfn, final_end_pfn;
+	struct add_highpages_data *data;
+	int bad_ppro;
+
+	data = (struct add_highpages_data *)datax;
+	bad_ppro = data->bad_ppro;
+
+	final_start_pfn = max(start_pfn, data->start_pfn);
+	final_end_pfn = min(end_pfn, data->end_pfn);
+	if (final_start_pfn >= final_end_pfn)
+		return;
+
+	for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
+	     node_pfn++) {
+		if (!pfn_valid(node_pfn))
+			continue;
+		page = pfn_to_page(node_pfn);
+		add_one_highpage_init(page, node_pfn, bad_ppro);
+	}
+
+}
+
+void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
+					      unsigned long end_pfn,
+					      int bad_ppro)
+{
+	struct add_highpages_data data;
+
+	data.start_pfn = start_pfn;
+	data.end_pfn = end_pfn;
+	data.bad_ppro = bad_ppro;
+
+	work_with_active_regions(nid, add_highpages_work_fn, &data);
+}
+
 #ifndef CONFIG_NUMA
 static void __init set_highmem_pages_init(int bad_ppro)
 {
-	int pfn;
+	add_highpages_with_active_regions(0, highstart_pfn, highend_pfn,
+						bad_ppro);
 
-	for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
-		/*
-		 * Holes under sparsemem might not have no mem_map[]:
-		 */
-		if (pfn_valid(pfn))
-			add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
-	}
 	totalram_pages += totalhigh_pages;
 }
 #endif /* !CONFIG_NUMA */
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index 6b0ce745a60c..55d310596907 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -86,7 +86,6 @@ extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
 extern void reserve_early(u64 start, u64 end, char *name);
 extern void free_early(u64 start, u64 end);
 extern void early_res_to_bootmem(u64 start, u64 end);
-extern int page_is_reserved_early(unsigned long pagenr);
 extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
 
 extern unsigned long e820_end_of_ram(void);
diff --git a/include/asm-x86/highmem.h b/include/asm-x86/highmem.h
index e153f3b44774..85c4fea41ff6 100644
--- a/include/asm-x86/highmem.h
+++ b/include/asm-x86/highmem.h
@@ -74,6 +74,9 @@ struct page *kmap_atomic_to_page(void *ptr);
 
 #define flush_cache_kmaps()	do { } while (0)
 
+extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
+					unsigned long end_pfn, int bad_ppro);
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_HIGHMEM_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 034a3156d2f0..e4de460907c1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1011,6 +1011,8 @@ extern unsigned long find_min_pfn_with_active_regions(void);
 extern unsigned long find_max_pfn_with_active_regions(void);
 extern void free_bootmem_with_active_regions(int nid,
 						unsigned long max_low_pfn);
+typedef void (*work_fn_t)(unsigned long, unsigned long, void *);
+extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data);
 extern void sparse_memory_present_with_active_regions(int nid);
 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
 extern int early_pfn_to_nid(unsigned long pfn);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d80e1868e570..41c6e3aa059f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2929,6 +2929,14 @@ void __init free_bootmem_with_active_regions(int nid,
 	}
 }
 
+void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
+{
+	int i;
+
+	for_each_active_range_index_in_nid(i, nid)
+		work_fn(early_node_map[i].start_pfn, early_node_map[i].end_pfn,
+			data);
+}
 /**
  * sparse_memory_present_with_active_regions - Call memory_present for each active range
  * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
-- 
cgit v1.2.3


From 8c5beb50d3ec915d15c4d38aa37282309a65f14e Mon Sep 17 00:00:00 2001
From: "Huang, Ying" <ying.huang@intel.com>
Date: Wed, 11 Jun 2008 11:33:39 +0800
Subject: x86 boot: pass E820 memory map entries more than 128 via linked list
 of setup data

Because of the size limits of struct boot_params (zero page), the
maximum number of E820 memory map entries can be passed to kernel is
128. As pointed by Paul Jackson, there is some machine produced by SGI
with so many nodes that the number of E820 memory map entries is more
than 128. To enabling Linux kernel on these system, a new setup data
type named SETUP_E820_EXT is defined to pass additional memory map
entries to Linux kernel.

This patch is based on x86/auto-latest branch of git-x86 tree and has
been tested on x86_64 and i386 platform.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c      | 59 +++++++++++++++++++++++++++++++++++----------
 arch/x86/kernel/setup.c     |  3 +++
 include/asm-x86/bootparam.h |  1 +
 include/asm-x86/e820.h      |  2 ++
 4 files changed, 52 insertions(+), 13 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index ed46b7a6bc13..544dd12c70f4 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -359,6 +359,26 @@ static struct e820entry new_bios[E820_X_MAX] __initdata;
 	return 0;
 }
 
+static int __init __copy_e820_map(struct e820entry *biosmap, int nr_map)
+{
+	while (nr_map) {
+		u64 start = biosmap->addr;
+		u64 size = biosmap->size;
+		u64 end = start + size;
+		u32 type = biosmap->type;
+
+		/* Overflow in 64 bits? Ignore the memory map. */
+		if (start > end)
+			return -1;
+
+		e820_add_region(start, size, type);
+
+		biosmap++;
+		nr_map--;
+	}
+	return 0;
+}
+
 /*
  * Copy the BIOS e820 map into a safe place.
  *
@@ -374,19 +394,7 @@ int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
 	if (nr_map < 2)
 		return -1;
 
-	do {
-		u64 start = biosmap->addr;
-		u64 size = biosmap->size;
-		u64 end = start + size;
-		u32 type = biosmap->type;
-
-		/* Overflow in 64 bits? Ignore the memory map. */
-		if (start > end)
-			return -1;
-
-		e820_add_region(start, size, type);
-	} while (biosmap++, --nr_map);
-	return 0;
+	return __copy_e820_map(biosmap, nr_map);
 }
 
 u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
@@ -496,6 +504,31 @@ __init void e820_setup_gap(void)
 	       pci_mem_start, gapstart, gapsize);
 }
 
+/**
+ * Because of the size limitation of struct boot_params, only first
+ * 128 E820 memory entries are passed to kernel via
+ * boot_params.e820_map, others are passed via SETUP_E820_EXT node of
+ * linked list of struct setup_data, which is parsed here.
+ */
+void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data)
+{
+	u32 map_len;
+	int entries;
+	struct e820entry *extmap;
+
+	entries = sdata->len / sizeof(struct e820entry);
+	map_len = sdata->len + sizeof(struct setup_data);
+	if (map_len > PAGE_SIZE)
+		sdata = early_ioremap(pa_data, map_len);
+	extmap = (struct e820entry *)(sdata->data);
+	__copy_e820_map(extmap, entries);
+	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+	if (map_len > PAGE_SIZE)
+		early_iounmap(sdata, map_len);
+	printk(KERN_INFO "extended physical RAM map:\n");
+	e820_print_map("extended");
+}
+
 #if defined(CONFIG_X86_64) || \
 	(defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
 /**
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 45a5e247d450..5b0de38cde48 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -150,6 +150,9 @@ void __init parse_setup_data(void)
 	while (pa_data) {
 		data = early_ioremap(pa_data, PAGE_SIZE);
 		switch (data->type) {
+		case SETUP_E820_EXT:
+			parse_e820_ext(data, pa_data);
+			break;
 		default:
 			break;
 		}
diff --git a/include/asm-x86/bootparam.h b/include/asm-x86/bootparam.h
index 0a073904168b..876f21136660 100644
--- a/include/asm-x86/bootparam.h
+++ b/include/asm-x86/bootparam.h
@@ -11,6 +11,7 @@
 
 /* setup data types */
 #define SETUP_NONE			0
+#define SETUP_E820_EXT			1
 
 /* extensible setup data list node */
 struct setup_data {
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index 55d310596907..77fc24d89163 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -69,6 +69,8 @@ extern u64 e820_update_range(u64 start, u64 size, unsigned old_type,
 			       unsigned new_type);
 extern void update_e820(void);
 extern void e820_setup_gap(void);
+struct setup_data;
+extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data);
 
 #if defined(CONFIG_X86_64) || \
 	(defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
-- 
cgit v1.2.3


From 41c094fd3ca54f1a71233049cf136ff94c91f4ae Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Mon, 16 Jun 2008 13:03:31 -0700
Subject: x86: move e820_resource_resources to e820.c

and make 32-bit resource registration more like 64 bit.

also move probe_roms back to setup_32.c

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c     |  32 +++++++
 arch/x86/kernel/e820_32.c  | 192 -----------------------------------------
 arch/x86/kernel/e820_64.c  |  24 ------
 arch/x86/kernel/setup_32.c | 208 ++++++++++++++++++++++++++++++++++++++-------
 include/asm-x86/e820.h     |   1 +
 include/asm-x86/e820_32.h  |   4 -
 include/asm-x86/e820_64.h  |   1 -
 7 files changed, 208 insertions(+), 254 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 544dd12c70f4..432c49178577 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -998,3 +998,35 @@ void __init finish_e820_parsing(void)
 		e820_print_map("user");
 	}
 }
+
+/*
+ * Mark e820 reserved areas as busy for the resource manager.
+ */
+void __init e820_reserve_resources(void)
+{
+	int i;
+	struct resource *res;
+
+	res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
+	for (i = 0; i < e820.nr_map; i++) {
+		switch (e820.map[i].type) {
+		case E820_RAM:	res->name = "System RAM"; break;
+		case E820_ACPI:	res->name = "ACPI Tables"; break;
+		case E820_NVS:	res->name = "ACPI Non-volatile Storage"; break;
+		default:	res->name = "reserved";
+		}
+		res->start = e820.map[i].addr;
+		res->end = res->start + e820.map[i].size - 1;
+#ifndef CONFIG_RESOURCES_64BIT
+		if (res->end > 0x100000000ULL) {
+			res++;
+			continue;
+		}
+#endif
+		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+		insert_resource(&iomem_resource, res);
+		res++;
+	}
+}
+
+
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
index 8de3df9548db..1205ccf086d7 100644
--- a/arch/x86/kernel/e820_32.c
+++ b/arch/x86/kernel/e820_32.c
@@ -15,198 +15,6 @@
 #include <asm/e820.h>
 #include <asm/setup.h>
 
-static struct resource system_rom_resource = {
-	.name	= "System ROM",
-	.start	= 0xf0000,
-	.end	= 0xfffff,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource extension_rom_resource = {
-	.name	= "Extension ROM",
-	.start	= 0xe0000,
-	.end	= 0xeffff,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource adapter_rom_resources[] = { {
-	.name 	= "Adapter ROM",
-	.start	= 0xc8000,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-} };
-
-static struct resource video_rom_resource = {
-	.name 	= "Video ROM",
-	.start	= 0xc0000,
-	.end	= 0xc7fff,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-#define ROMSIGNATURE 0xaa55
-
-static int __init romsignature(const unsigned char *rom)
-{
-	const unsigned short * const ptr = (const unsigned short *)rom;
-	unsigned short sig;
-
-	return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
-}
-
-static int __init romchecksum(const unsigned char *rom, unsigned long length)
-{
-	unsigned char sum, c;
-
-	for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
-		sum += c;
-	return !length && !sum;
-}
-
-static void __init probe_roms(void)
-{
-	const unsigned char *rom;
-	unsigned long start, length, upper;
-	unsigned char c;
-	int i;
-
-	/* video rom */
-	upper = adapter_rom_resources[0].start;
-	for (start = video_rom_resource.start; start < upper; start += 2048) {
-		rom = isa_bus_to_virt(start);
-		if (!romsignature(rom))
-			continue;
-
-		video_rom_resource.start = start;
-
-		if (probe_kernel_address(rom + 2, c) != 0)
-			continue;
-
-		/* 0 < length <= 0x7f * 512, historically */
-		length = c * 512;
-
-		/* if checksum okay, trust length byte */
-		if (length && romchecksum(rom, length))
-			video_rom_resource.end = start + length - 1;
-
-		request_resource(&iomem_resource, &video_rom_resource);
-		break;
-	}
-
-	start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
-	if (start < upper)
-		start = upper;
-
-	/* system rom */
-	request_resource(&iomem_resource, &system_rom_resource);
-	upper = system_rom_resource.start;
-
-	/* check for extension rom (ignore length byte!) */
-	rom = isa_bus_to_virt(extension_rom_resource.start);
-	if (romsignature(rom)) {
-		length = extension_rom_resource.end - extension_rom_resource.start + 1;
-		if (romchecksum(rom, length)) {
-			request_resource(&iomem_resource, &extension_rom_resource);
-			upper = extension_rom_resource.start;
-		}
-	}
-
-	/* check for adapter roms on 2k boundaries */
-	for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
-		rom = isa_bus_to_virt(start);
-		if (!romsignature(rom))
-			continue;
-
-		if (probe_kernel_address(rom + 2, c) != 0)
-			continue;
-
-		/* 0 < length <= 0x7f * 512, historically */
-		length = c * 512;
-
-		/* but accept any length that fits if checksum okay */
-		if (!length || start + length > upper || !romchecksum(rom, length))
-			continue;
-
-		adapter_rom_resources[i].start = start;
-		adapter_rom_resources[i].end = start + length - 1;
-		request_resource(&iomem_resource, &adapter_rom_resources[i]);
-
-		start = adapter_rom_resources[i++].end & ~2047UL;
-	}
-}
-
-/*
- * Request address space for all standard RAM and ROM resources
- * and also for regions reported as reserved by the e820.
- */
-void __init init_iomem_resources(struct resource *code_resource,
-		struct resource *data_resource,
-		struct resource *bss_resource)
-{
-	int i;
-
-	probe_roms();
-	for (i = 0; i < e820.nr_map; i++) {
-		struct resource *res;
-#ifndef CONFIG_RESOURCES_64BIT
-		if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
-			continue;
-#endif
-		res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
-		switch (e820.map[i].type) {
-		case E820_RAM:	res->name = "System RAM"; break;
-		case E820_ACPI:	res->name = "ACPI Tables"; break;
-		case E820_NVS:	res->name = "ACPI Non-volatile Storage"; break;
-		default:	res->name = "reserved";
-		}
-		res->start = e820.map[i].addr;
-		res->end = res->start + e820.map[i].size - 1;
-		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-		if (request_resource(&iomem_resource, res)) {
-			kfree(res);
-			continue;
-		}
-		if (e820.map[i].type == E820_RAM) {
-			/*
-			 *  We don't know which RAM region contains kernel data,
-			 *  so we try it repeatedly and let the resource manager
-			 *  test it.
-			 */
-			request_resource(res, code_resource);
-			request_resource(res, data_resource);
-			request_resource(res, bss_resource);
-#ifdef CONFIG_KEXEC
-			if (crashk_res.start != crashk_res.end)
-				request_resource(res, &crashk_res);
-#endif
-		}
-	}
-}
-
 /* Overridden in paravirt.c if CONFIG_PARAVIRT */
 char * __init __attribute__((weak)) memory_setup(void)
 {
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index 47952b1690b6..cb03bff9fa2f 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -41,30 +41,6 @@ unsigned long end_pfn;
  */
 unsigned long max_pfn_mapped;
 
-/*
- * Mark e820 reserved areas as busy for the resource manager.
- */
-void __init e820_reserve_resources(void)
-{
-	int i;
-	struct resource *res;
-
-	res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
-	for (i = 0; i < e820.nr_map; i++) {
-		switch (e820.map[i].type) {
-		case E820_RAM:	res->name = "System RAM"; break;
-		case E820_ACPI:	res->name = "ACPI Tables"; break;
-		case E820_NVS:	res->name = "ACPI Non-volatile Storage"; break;
-		default:	res->name = "reserved";
-		}
-		res->start = e820.map[i].addr;
-		res->end = res->start + e820.map[i].size - 1;
-		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-		insert_resource(&iomem_resource, res);
-		res++;
-	}
-}
-
 static void early_panic(char *msg)
 {
 	early_printk(msg);
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 72b11d4557b7..f3ddba5ed9a7 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -445,25 +445,28 @@ static void __init reserve_crashkernel(void)
 	ret = parse_crashkernel(boot_command_line, total_mem,
 			&crash_size, &crash_base);
 	if (ret == 0 && crash_size > 0) {
-		if (crash_base > 0) {
-			printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
-					"for crashkernel (System RAM: %ldMB)\n",
-					(unsigned long)(crash_size >> 20),
-					(unsigned long)(crash_base >> 20),
-					(unsigned long)(total_mem >> 20));
-
-			if (reserve_bootmem(crash_base, crash_size,
-					BOOTMEM_EXCLUSIVE) < 0) {
-				printk(KERN_INFO "crashkernel reservation "
-					"failed - memory is in use\n");
-				return;
-			}
-
-			crashk_res.start = crash_base;
-			crashk_res.end   = crash_base + crash_size - 1;
-		} else
+		if (crash_base <= 0) {
 			printk(KERN_INFO "crashkernel reservation failed - "
 					"you have to specify a base address\n");
+			return;
+		}
+
+		if (reserve_bootmem_generic(crash_base, crash_size,
+					BOOTMEM_EXCLUSIVE) < 0) {
+			printk(KERN_INFO "crashkernel reservation failed - "
+					"memory is in use\n");
+			return;
+		}
+
+		printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
+				"for crashkernel (System RAM: %ldMB)\n",
+				(unsigned long)(crash_size >> 20),
+				(unsigned long)(crash_base >> 20),
+				(unsigned long)(total_mem >> 20));
+
+		crashk_res.start = crash_base;
+		crashk_res.end   = crash_base + crash_size - 1;
+		insert_resource(&iomem_resource, &crashk_res);
 	}
 }
 #else
@@ -675,6 +678,8 @@ int x86_cpu_to_node_map_init[NR_CPUS] = {
 DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
 #endif
 
+static void probe_roms(void);
+
 /*
  * Determine if we were loaded by an EFI loader.  If so, then we have also been
  * passed the efi memmap, systab, etc., so we should use these data structures
@@ -684,6 +689,7 @@ DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
  */
 void __init setup_arch(char **cmdline_p)
 {
+	int i;
 	unsigned long max_low_pfn;
 
 	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
@@ -745,6 +751,13 @@ void __init setup_arch(char **cmdline_p)
 
 	finish_e820_parsing();
 
+	probe_roms();
+
+	/* after parse_early_param, so could debug it */
+	insert_resource(&iomem_resource, &code_resource);
+	insert_resource(&iomem_resource, &data_resource);
+	insert_resource(&iomem_resource, &bss_resource);
+
 	strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
 	*cmdline_p = command_line;
 
@@ -861,9 +874,16 @@ void __init setup_arch(char **cmdline_p)
 			"CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
 #endif
 
-	e820_setup_gap();
+	e820_reserve_resources();
 	e820_mark_nosave_regions(max_low_pfn);
 
+	request_resource(&iomem_resource, &video_ram_resource);
+	/* request I/O space for devices used on all i[345]86 PCs */
+	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+		request_resource(&ioport_resource, &standard_io_resources[i]);
+
+	e820_setup_gap();
+
 #ifdef CONFIG_VT
 #if defined(CONFIG_VGA_CONSOLE)
 	if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
@@ -874,25 +894,147 @@ void __init setup_arch(char **cmdline_p)
 #endif
 }
 
-/*
- * Request address space for all standard resources
- *
- * This is called just before pcibios_init(), which is also a
- * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
- */
-static int __init request_standard_resources(void)
+static struct resource system_rom_resource = {
+	.name	= "System ROM",
+	.start	= 0xf0000,
+	.end	= 0xfffff,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource extension_rom_resource = {
+	.name	= "Extension ROM",
+	.start	= 0xe0000,
+	.end	= 0xeffff,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource adapter_rom_resources[] = { {
+	.name 	= "Adapter ROM",
+	.start	= 0xc8000,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+} };
+
+static struct resource video_rom_resource = {
+	.name 	= "Video ROM",
+	.start	= 0xc0000,
+	.end	= 0xc7fff,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+#define ROMSIGNATURE 0xaa55
+
+static int __init romsignature(const unsigned char *rom)
+{
+	const unsigned short * const ptr = (const unsigned short *)rom;
+	unsigned short sig;
+
+	return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
+}
+
+static int __init romchecksum(const unsigned char *rom, unsigned long length)
+{
+	unsigned char sum, c;
+
+	for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
+		sum += c;
+	return !length && !sum;
+}
+
+static void __init probe_roms(void)
 {
+	const unsigned char *rom;
+	unsigned long start, length, upper;
+	unsigned char c;
 	int i;
 
-	printk(KERN_INFO "Setting up standard PCI resources\n");
-	init_iomem_resources(&code_resource, &data_resource, &bss_resource);
+	/* video rom */
+	upper = adapter_rom_resources[0].start;
+	for (start = video_rom_resource.start; start < upper; start += 2048) {
+		rom = isa_bus_to_virt(start);
+		if (!romsignature(rom))
+			continue;
 
-	request_resource(&iomem_resource, &video_ram_resource);
+		video_rom_resource.start = start;
 
-	/* request I/O space for devices used on all i[345]86 PCs */
-	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
-		request_resource(&ioport_resource, &standard_io_resources[i]);
-	return 0;
+		if (probe_kernel_address(rom + 2, c) != 0)
+			continue;
+
+		/* 0 < length <= 0x7f * 512, historically */
+		length = c * 512;
+
+		/* if checksum okay, trust length byte */
+		if (length && romchecksum(rom, length))
+			video_rom_resource.end = start + length - 1;
+
+		request_resource(&iomem_resource, &video_rom_resource);
+		break;
+	}
+
+	start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
+	if (start < upper)
+		start = upper;
+
+	/* system rom */
+	request_resource(&iomem_resource, &system_rom_resource);
+	upper = system_rom_resource.start;
+
+	/* check for extension rom (ignore length byte!) */
+	rom = isa_bus_to_virt(extension_rom_resource.start);
+	if (romsignature(rom)) {
+		length = extension_rom_resource.end - extension_rom_resource.start + 1;
+		if (romchecksum(rom, length)) {
+			request_resource(&iomem_resource, &extension_rom_resource);
+			upper = extension_rom_resource.start;
+		}
+	}
+
+	/* check for adapter roms on 2k boundaries */
+	for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
+		rom = isa_bus_to_virt(start);
+		if (!romsignature(rom))
+			continue;
+
+		if (probe_kernel_address(rom + 2, c) != 0)
+			continue;
+
+		/* 0 < length <= 0x7f * 512, historically */
+		length = c * 512;
+
+		/* but accept any length that fits if checksum okay */
+		if (!length || start + length > upper || !romchecksum(rom, length))
+			continue;
+
+		adapter_rom_resources[i].start = start;
+		adapter_rom_resources[i].end = start + length - 1;
+		request_resource(&iomem_resource, &adapter_rom_resources[i]);
+
+		start = adapter_rom_resources[i++].end & ~2047UL;
+	}
 }
 
-subsys_initcall(request_standard_resources);
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index 77fc24d89163..e860fe758e79 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -100,6 +100,7 @@ extern void e820_register_active_regions(int nid, unsigned long start_pfn,
 					 unsigned long end_pfn);
 extern u64 e820_hole_size(u64 start, u64 end);
 extern void finish_e820_parsing(void);
+extern void e820_reserve_resources(void);
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h
index 9135ce6e617e..557b890549ff 100644
--- a/include/asm-x86/e820_32.h
+++ b/include/asm-x86/e820_32.h
@@ -20,9 +20,5 @@
 
 extern void setup_memory_map(void);
 
-extern void init_iomem_resources(struct resource *code_resource,
-				 struct resource *data_resource,
-				 struct resource *bss_resource);
-
 #endif/*!__ASSEMBLY__*/
 #endif/*__E820_HEADER*/
diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h
index 478547c63222..8d992109969b 100644
--- a/include/asm-x86/e820_64.h
+++ b/include/asm-x86/e820_64.h
@@ -16,7 +16,6 @@
 #ifndef __ASSEMBLY__
 extern void setup_memory_region(void);
 extern void contig_e820_setup(void);
-extern void e820_reserve_resources(void);
 extern int e820_any_non_reserved(unsigned long start, unsigned long end);
 extern int is_memory_any_valid(unsigned long start, unsigned long end);
 extern int e820_all_non_reserved(unsigned long start, unsigned long end);
-- 
cgit v1.2.3


From cc9f7a0ccf000d4db5fbdc7b0ae48eefea102f69 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Mon, 16 Jun 2008 16:11:08 -0700
Subject: x86: kill bad_ppro

so don't punish all other cpus without that problem when init highmem

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c |  9 +++++++++
 arch/x86/mm/discontig_32.c |  4 ++--
 arch/x86/mm/init_32.c      | 43 ++++++++++++-------------------------------
 include/asm-x86/highmem.h  |  2 +-
 include/asm-x86/numa_32.h  |  2 +-
 5 files changed, 25 insertions(+), 35 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index f3ddba5ed9a7..9692aeb8ecae 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -68,6 +68,7 @@
 #include <asm/cacheflush.h>
 #include <asm/processor.h>
 #include <asm/efi.h>
+#include <asm/bugs.h>
 
 /* This value is set up by the early boot code to point to the value
    immediately after the boot time page tables.  It contains a *physical*
@@ -764,6 +765,14 @@ void __init setup_arch(char **cmdline_p)
 	if (efi_enabled)
 		efi_init();
 
+	if (ppro_with_ram_bug()) {
+		e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
+				  E820_RESERVED);
+		sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+		printk(KERN_INFO "fixed physical RAM map:\n");
+		e820_print_map("bad_ppro");
+	}
+
 	e820_register_active_regions(0, 0, -1UL);
 	/*
 	 * partially used pages are not usable - thus
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 7c4d0255f8d8..6216e43b6e95 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -427,7 +427,7 @@ void __init zone_sizes_init(void)
 	return;
 }
 
-void __init set_highmem_pages_init(int bad_ppro) 
+void __init set_highmem_pages_init(void)
 {
 #ifdef CONFIG_HIGHMEM
 	struct zone *zone;
@@ -447,7 +447,7 @@ void __init set_highmem_pages_init(int bad_ppro)
 				zone->name, nid, zone_start_pfn, zone_end_pfn);
 
 		add_highpages_with_active_regions(nid, zone_start_pfn,
-				 zone_end_pfn, bad_ppro);
+				 zone_end_pfn);
 	}
 	totalram_pages += totalhigh_pages;
 #endif
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index ba07a489230e..fb5694d788bf 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -220,13 +220,6 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 	}
 }
 
-static inline int page_kills_ppro(unsigned long pagenr)
-{
-	if (pagenr >= 0x70000 && pagenr <= 0x7003F)
-		return 1;
-	return 0;
-}
-
 /*
  * devmem_is_allowed() checks to see if /dev/mem access to a certain address
  * is valid. The argument is a physical page number.
@@ -287,22 +280,17 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
 	pkmap_page_table = pte;
 }
 
-static void __init
-add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
+static void __init add_one_highpage_init(struct page *page, int pfn)
 {
-	if (!(bad_ppro && page_kills_ppro(pfn))) {
-		ClearPageReserved(page);
-		init_page_count(page);
-		__free_page(page);
-		totalhigh_pages++;
-	} else
-		SetPageReserved(page);
+	ClearPageReserved(page);
+	init_page_count(page);
+	__free_page(page);
+	totalhigh_pages++;
 }
 
 struct add_highpages_data {
 	unsigned long start_pfn;
 	unsigned long end_pfn;
-	int bad_ppro;
 };
 
 static void __init add_highpages_work_fn(unsigned long start_pfn,
@@ -312,10 +300,8 @@ static void __init add_highpages_work_fn(unsigned long start_pfn,
 	struct page *page;
 	unsigned long final_start_pfn, final_end_pfn;
 	struct add_highpages_data *data;
-	int bad_ppro;
 
 	data = (struct add_highpages_data *)datax;
-	bad_ppro = data->bad_ppro;
 
 	final_start_pfn = max(start_pfn, data->start_pfn);
 	final_end_pfn = min(end_pfn, data->end_pfn);
@@ -327,29 +313,26 @@ static void __init add_highpages_work_fn(unsigned long start_pfn,
 		if (!pfn_valid(node_pfn))
 			continue;
 		page = pfn_to_page(node_pfn);
-		add_one_highpage_init(page, node_pfn, bad_ppro);
+		add_one_highpage_init(page, node_pfn);
 	}
 
 }
 
 void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
-					      unsigned long end_pfn,
-					      int bad_ppro)
+					      unsigned long end_pfn)
 {
 	struct add_highpages_data data;
 
 	data.start_pfn = start_pfn;
 	data.end_pfn = end_pfn;
-	data.bad_ppro = bad_ppro;
 
 	work_with_active_regions(nid, add_highpages_work_fn, &data);
 }
 
 #ifndef CONFIG_NUMA
-static void __init set_highmem_pages_init(int bad_ppro)
+static void __init set_highmem_pages_init(void)
 {
-	add_highpages_with_active_regions(0, highstart_pfn, highend_pfn,
-						bad_ppro);
+	add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
 
 	totalram_pages += totalhigh_pages;
 }
@@ -358,7 +341,7 @@ static void __init set_highmem_pages_init(int bad_ppro)
 #else
 # define kmap_init()				do { } while (0)
 # define permanent_kmaps_init(pgd_base)		do { } while (0)
-# define set_highmem_pages_init(bad_ppro)	do { } while (0)
+# define set_highmem_pages_init()	do { } while (0)
 #endif /* CONFIG_HIGHMEM */
 
 pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
@@ -605,13 +588,11 @@ static struct kcore_list kcore_mem, kcore_vmalloc;
 void __init mem_init(void)
 {
 	int codesize, reservedpages, datasize, initsize;
-	int tmp, bad_ppro;
+	int tmp;
 
 #ifdef CONFIG_FLATMEM
 	BUG_ON(!mem_map);
 #endif
-	bad_ppro = ppro_with_ram_bug();
-
 #ifdef CONFIG_HIGHMEM
 	/* check that fixmap and pkmap do not overlap */
 	if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
@@ -634,7 +615,7 @@ void __init mem_init(void)
 		if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
 			reservedpages++;
 
-	set_highmem_pages_init(bad_ppro);
+	set_highmem_pages_init();
 
 	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
 	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
diff --git a/include/asm-x86/highmem.h b/include/asm-x86/highmem.h
index 85c4fea41ff6..4514b16cc723 100644
--- a/include/asm-x86/highmem.h
+++ b/include/asm-x86/highmem.h
@@ -75,7 +75,7 @@ struct page *kmap_atomic_to_page(void *ptr);
 #define flush_cache_kmaps()	do { } while (0)
 
 extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
-					unsigned long end_pfn, int bad_ppro);
+					unsigned long end_pfn);
 
 #endif /* __KERNEL__ */
 
diff --git a/include/asm-x86/numa_32.h b/include/asm-x86/numa_32.h
index 03d0f7a9bf02..a02674f64869 100644
--- a/include/asm-x86/numa_32.h
+++ b/include/asm-x86/numa_32.h
@@ -5,7 +5,7 @@ extern int pxm_to_nid(int pxm);
 
 #ifdef CONFIG_NUMA
 extern void __init remap_numa_kva(void);
-extern void set_highmem_pages_init(int);
+extern void set_highmem_pages_init(void);
 #else
 static inline void remap_numa_kva(void)
 {
-- 
cgit v1.2.3


From 064d25f12014ae1d97c2882f9ab874995321f2b2 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Mon, 16 Jun 2008 19:58:28 -0700
Subject: x86: merge setup_memory_map with e820

... and kill e820_32/64.c and e820_32/64.h

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/Makefile      |  2 +-
 arch/x86/kernel/e820.c        | 72 +++++++++++++++++++++++++++++++++
 arch/x86/kernel/e820_32.c     | 29 --------------
 arch/x86/kernel/e820_64.c     | 92 -------------------------------------------
 arch/x86/kernel/setup_64.c    |  8 +---
 arch/x86/mach-default/setup.c | 47 ----------------------
 arch/x86/mm/init_64.c         | 12 ++++++
 include/asm-x86/e820.h        |  9 +++--
 include/asm-x86/e820_32.h     | 24 -----------
 include/asm-x86/e820_64.h     | 26 ------------
 include/asm-x86/setup.h       |  5 ---
 11 files changed, 92 insertions(+), 234 deletions(-)
 delete mode 100644 arch/x86/kernel/e820_32.c
 delete mode 100644 arch/x86/kernel/e820_64.c
 delete mode 100644 include/asm-x86/e820_32.h
 delete mode 100644 include/asm-x86/e820_64.h

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index dc3c636d113e..bcc2b123dabf 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -22,7 +22,7 @@ obj-y			+= setup_$(BITS).o i8259_$(BITS).o setup.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o setup64.o
-obj-y			+= bootflag.o e820_$(BITS).o e820.o
+obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
 obj-y			+= alternative.o i8253.o pci-nommu.o
 obj-$(CONFIG_X86_64)	+= bugs_64.o
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 432c49178577..49477484a2fa 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1029,4 +1029,76 @@ void __init e820_reserve_resources(void)
 	}
 }
 
+char *__init __attribute__((weak)) machine_specific_memory_setup(void)
+{
+	char *who = "BIOS-e820";
+	int new_nr;
+	/*
+	 * Try to copy the BIOS-supplied E820-map.
+	 *
+	 * Otherwise fake a memory map; one section from 0k->640k,
+	 * the next section from 1mb->appropriate_mem_k
+	 */
+	new_nr = boot_params.e820_entries;
+	sanitize_e820_map(boot_params.e820_map,
+			ARRAY_SIZE(boot_params.e820_map),
+			&new_nr);
+	boot_params.e820_entries = new_nr;
+	if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) {
+#ifdef CONFIG_X86_64
+		early_panic("Cannot find a valid memory map");
+#else
+		unsigned long mem_size;
+
+		/* compare results from other methods and take the greater */
+		if (boot_params.alt_mem_k
+		    < boot_params.screen_info.ext_mem_k) {
+			mem_size = boot_params.screen_info.ext_mem_k;
+			who = "BIOS-88";
+		} else {
+			mem_size = boot_params.alt_mem_k;
+			who = "BIOS-e801";
+		}
+
+		e820.nr_map = 0;
+		e820_add_region(0, LOWMEMSIZE(), E820_RAM);
+		e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
+#endif
+	}
+
+	/* In case someone cares... */
+	return who;
+}
+
+/* Overridden in paravirt.c if CONFIG_PARAVIRT */
+char * __init __attribute__((weak)) memory_setup(void)
+{
+	return machine_specific_memory_setup();
+}
+
+void __init setup_memory_map(void)
+{
+	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+	e820_print_map(memory_setup());
+}
+
+#ifdef CONFIG_X86_64
+int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
+{
+	int i;
 
+	if (slot < 0 || slot >= e820.nr_map)
+		return -1;
+	for (i = slot; i < e820.nr_map; i++) {
+		if (e820.map[i].type != E820_RAM)
+			continue;
+		break;
+	}
+	if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
+		return -1;
+	*addr = e820.map[i].addr;
+	*size = min_t(u64, e820.map[i].size + e820.map[i].addr,
+		max_pfn << PAGE_SHIFT) - *addr;
+	return i + 1;
+}
+#endif
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
deleted file mode 100644
index 1205ccf086d7..000000000000
--- a/arch/x86/kernel/e820_32.c
+++ /dev/null
@@ -1,29 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/ioport.h>
-#include <linux/string.h>
-#include <linux/kexec.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/pfn.h>
-#include <linux/uaccess.h>
-
-#include <asm/pgtable.h>
-#include <asm/page.h>
-#include <asm/e820.h>
-#include <asm/setup.h>
-
-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
-char * __init __attribute__((weak)) memory_setup(void)
-{
-	return machine_specific_memory_setup();
-}
-
-void __init setup_memory_map(void)
-{
-	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
-	e820_print_map(memory_setup());
-}
-
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
deleted file mode 100644
index cb03bff9fa2f..000000000000
--- a/arch/x86/kernel/e820_64.c
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Handle the memory map.
- * The functions here do the job until bootmem takes over.
- *
- *  Getting sanitize_e820_map() in sync with i386 version by applying change:
- *  -  Provisions for empty E820 memory regions (reported by certain BIOSes).
- *     Alex Achenbach <xela@slit.de>, December 2002.
- *  Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
- *
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/ioport.h>
-#include <linux/string.h>
-#include <linux/kexec.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/pfn.h>
-#include <linux/pci.h>
-
-#include <asm/pgtable.h>
-#include <asm/page.h>
-#include <asm/e820.h>
-#include <asm/proto.h>
-#include <asm/setup.h>
-#include <asm/sections.h>
-#include <asm/kdebug.h>
-#include <asm/trampoline.h>
-
-/*
- * PFN of last memory page.
- */
-unsigned long end_pfn;
-
-/*
- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- * The direct mapping extends to max_pfn_mapped, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
- */
-unsigned long max_pfn_mapped;
-
-static void early_panic(char *msg)
-{
-	early_printk(msg);
-	panic(msg);
-}
-
-/* We're not void only for x86 32-bit compat */
-char *__init machine_specific_memory_setup(void)
-{
-	char *who = "BIOS-e820";
-	int new_nr;
-	/*
-	 * Try to copy the BIOS-supplied E820-map.
-	 *
-	 * Otherwise fake a memory map; one section from 0k->640k,
-	 * the next section from 1mb->appropriate_mem_k
-	 */
-	new_nr = boot_params.e820_entries;
-	sanitize_e820_map(boot_params.e820_map,
-			ARRAY_SIZE(boot_params.e820_map),
-			&new_nr);
-	boot_params.e820_entries = new_nr;
-	if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
-		early_panic("Cannot find a valid memory map");
-	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
-	e820_print_map(who);
-
-	/* In case someone cares... */
-	return who;
-}
-
-int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
-{
-	int i;
-
-	if (slot < 0 || slot >= e820.nr_map)
-		return -1;
-	for (i = slot; i < e820.nr_map; i++) {
-		if (e820.map[i].type != E820_RAM)
-			continue;
-		break;
-	}
-	if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
-		return -1;
-	*addr = e820.map[i].addr;
-	*size = min_t(u64, e820.map[i].size + e820.map[i].addr,
-		max_pfn << PAGE_SHIFT) - *addr;
-	return i + 1;
-}
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index ec65696ca2cd..3220c7b56eb3 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -266,12 +266,6 @@ static inline void __init reserve_crashkernel(void)
 {}
 #endif
 
-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
-void __attribute__((weak)) __init memory_setup(void)
-{
-       machine_specific_memory_setup();
-}
-
 #ifdef CONFIG_PCI_MMCONFIG
 extern void __cpuinit fam10h_check_enable_mmcfg(void);
 extern void __init check_enable_amd_mmconf_dmi(void);
@@ -316,7 +310,7 @@ void __init setup_arch(char **cmdline_p)
 
 	ARCH_SETUP
 
-	memory_setup();
+	setup_memory_map();
 	copy_edd();
 
 	if (!boot_params.hdr.root_flags)
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
index 7ae692a76769..2f5e277686b8 100644
--- a/arch/x86/mach-default/setup.c
+++ b/arch/x86/mach-default/setup.c
@@ -142,50 +142,3 @@ static int __init print_ipi_mode(void)
 
 late_initcall(print_ipi_mode);
 
-/**
- * machine_specific_memory_setup - Hook for machine specific memory setup.
- *
- * Description:
- *	This is included late in kernel/setup.c so that it can make
- *	use of all of the static functions.
- **/
-
-char * __init machine_specific_memory_setup(void)
-{
-	char *who;
-	int new_nr;
-
-
-	who = "BIOS-e820";
-
-	/*
-	 * Try to copy the BIOS-supplied E820-map.
-	 *
-	 * Otherwise fake a memory map; one section from 0k->640k,
-	 * the next section from 1mb->appropriate_mem_k
-	 */
-	new_nr = boot_params.e820_entries;
-	sanitize_e820_map(boot_params.e820_map,
-			ARRAY_SIZE(boot_params.e820_map),
-			&new_nr);
-	boot_params.e820_entries = new_nr;
-	if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries)
-	    < 0) {
-		unsigned long mem_size;
-
-		/* compare results from other methods and take the greater */
-		if (boot_params.alt_mem_k
-		    < boot_params.screen_info.ext_mem_k) {
-			mem_size = boot_params.screen_info.ext_mem_k;
-			who = "BIOS-88";
-		} else {
-			mem_size = boot_params.alt_mem_k;
-			who = "BIOS-e801";
-		}
-
-		e820.nr_map = 0;
-		e820_add_region(0, LOWMEMSIZE(), E820_RAM);
-		e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
-  	}
-	return who;
-}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index b8c2c1ef7ad5..5266f3141d6c 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -47,6 +47,18 @@
 #include <asm/numa.h>
 #include <asm/cacheflush.h>
 
+/*
+ * PFN of last memory page.
+ */
+unsigned long end_pfn;
+
+/*
+ * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
+ * The direct mapping extends to max_pfn_mapped, so that we can directly access
+ * apertures, ACPI and other tables without having to play with fixmaps.
+ */
+unsigned long max_pfn_mapped;
+
 static unsigned long dma_reserve __initdata;
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index e860fe758e79..83144cb6c68d 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -101,6 +101,9 @@ extern void e820_register_active_regions(int nid, unsigned long start_pfn,
 extern u64 e820_hole_size(u64 start, u64 end);
 extern void finish_e820_parsing(void);
 extern void e820_reserve_resources(void);
+extern void setup_memory_map(void);
+extern char *machine_specific_memory_setup(void);
+extern char *memory_setup(void);
 
 #endif /* __ASSEMBLY__ */
 
@@ -111,10 +114,10 @@ extern void e820_reserve_resources(void);
 #define BIOS_END		0x00100000
 
 #ifdef __KERNEL__
+#include <linux/ioport.h>
+
 #ifdef CONFIG_X86_32
-# include "e820_32.h"
-#else
-# include "e820_64.h"
+#define HIGH_MEMORY	(1024*1024)
 #endif
 #endif /* __KERNEL__ */
 
diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h
deleted file mode 100644
index 557b890549ff..000000000000
--- a/include/asm-x86/e820_32.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * structures and definitions for the int 15, ax=e820 memory map
- * scheme.
- *
- * In a nutshell, arch/i386/boot/setup.S populates a scratch table
- * in the empty_zero_block that contains a list of usable address/size
- * duples.   In arch/i386/kernel/setup.c, this information is
- * transferred into the e820map, and in arch/i386/mm/init.c, that
- * new information is used to mark pages reserved or not.
- *
- */
-#ifndef __E820_HEADER
-#define __E820_HEADER
-
-#include <linux/ioport.h>
-
-#define HIGH_MEMORY	(1024*1024)
-
-#ifndef __ASSEMBLY__
-
-extern void setup_memory_map(void);
-
-#endif/*!__ASSEMBLY__*/
-#endif/*__E820_HEADER*/
diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h
deleted file mode 100644
index 8d992109969b..000000000000
--- a/include/asm-x86/e820_64.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * structures and definitions for the int 15, ax=e820 memory map
- * scheme.
- *
- * In a nutshell, setup.S populates a scratch table in the
- * empty_zero_block that contains a list of usable address/size
- * duples.  setup.c, this information is transferred into the e820map,
- * and in init.c/numa.c, that new information is used to mark pages
- * reserved or not.
- */
-#ifndef __E820_HEADER
-#define __E820_HEADER
-
-#include <linux/ioport.h>
-
-#ifndef __ASSEMBLY__
-extern void setup_memory_region(void);
-extern void contig_e820_setup(void);
-extern int e820_any_non_reserved(unsigned long start, unsigned long end);
-extern int is_memory_any_valid(unsigned long start, unsigned long end);
-extern int e820_all_non_reserved(unsigned long start, unsigned long end);
-extern int is_memory_all_valid(unsigned long start, unsigned long end);
-
-#endif/*!__ASSEMBLY__*/
-
-#endif/*__E820_HEADER*/
diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h
index 9e163fc3e984..b676b0be7986 100644
--- a/include/asm-x86/setup.h
+++ b/include/asm-x86/setup.h
@@ -8,7 +8,6 @@
 /* Interrupt control for vSMPowered x86_64 systems */
 void vsmp_init(void);
 
-char *machine_specific_memory_setup(void);
 #ifndef CONFIG_PARAVIRT
 #define paravirt_post_allocator_init()	do {} while (0)
 #endif
@@ -50,10 +49,6 @@ extern struct boot_params boot_params;
  */
 #define LOWMEMSIZE()	(0x9f000)
 
-char * __init machine_specific_memory_setup(void);
-char *memory_setup(void);
-
-
 void __init i386_start_kernel(void);
 
 extern unsigned long init_pg_tables_start;
-- 
cgit v1.2.3


From 593a0cc39046515a438ca9fcc168115961f9f503 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 17 Jun 2008 10:02:45 -0700
Subject: x86: move some function out of setup_bootmem_alloc

... to make it more like 64-bit.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 9692aeb8ecae..90a2b857b4a7 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -623,21 +623,6 @@ void __init setup_bootmem_allocator(void)
 		free_bootmem_with_active_regions(i, max_low_pfn);
 	early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
 
-#ifdef CONFIG_ACPI_SLEEP
-	/*
-	 * Reserve low memory region for sleep support.
-	 */
-	acpi_reserve_bootmem();
-#endif
-#ifdef CONFIG_X86_FIND_SMP_CONFIG
-	/*
-	 * Find and reserve possible boot-time SMP configuration:
-	 */
-	find_smp_config();
-#endif
-	reserve_crashkernel();
-
-	reserve_ibft_region();
 }
 
 /*
@@ -792,6 +777,22 @@ void __init setup_arch(char **cmdline_p)
 
 	max_low_pfn = setup_memory();
 
+#ifdef CONFIG_ACPI_SLEEP
+	/*
+	 * Reserve low memory region for sleep support.
+	 */
+	acpi_reserve_bootmem();
+#endif
+#ifdef CONFIG_X86_FIND_SMP_CONFIG
+	/*
+	 * Find and reserve possible boot-time SMP configuration:
+	 */
+	find_smp_config();
+#endif
+	reserve_crashkernel();
+
+	reserve_ibft_region();
+
 #ifdef CONFIG_KVM_CLOCK
 	kvmclock_init();
 #endif
-- 
cgit v1.2.3


From 1c6e55032e24ff79668581a0f296c278ef7edd4e Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 17 Jun 2008 15:41:45 -0700
Subject: x86: use acpi_numa_init to parse on 32-bit numa

seperate SRAT finding and parsing from get_memcfg_from_srat,
and let getmemcfg_from_srat only handle array from previous step.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c |  36 +++++---
 arch/x86/kernel/srat_32.c  | 200 +++++++++++----------------------------------
 include/asm-x86/acpi.h     |   4 +-
 include/asm-x86/dmi.h      |   8 --
 4 files changed, 75 insertions(+), 173 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 90a2b857b4a7..7e06ecd83174 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -59,6 +59,7 @@
 #include <asm/setup.h>
 #include <asm/arch_hooks.h>
 #include <asm/sections.h>
+#include <asm/dmi.h>
 #include <asm/io_apic.h>
 #include <asm/ist.h>
 #include <asm/io.h>
@@ -184,6 +185,12 @@ int bootloader_type;
 /* user-defined highmem size */
 static unsigned int highmem_pages = -1;
 
+/*
+ * Early DMI memory
+ */
+int dmi_alloc_index;
+char dmi_alloc_data[DMI_MAX_DATA];
+
 /*
  * Setup options
  */
@@ -775,6 +782,24 @@ void __init setup_arch(char **cmdline_p)
 		max_pfn = e820_end_of_ram();
 	}
 
+	dmi_scan_machine();
+
+	io_delay_init();
+
+#ifdef CONFIG_ACPI
+	/*
+	 * Parse the ACPI tables for possible boot-time SMP configuration.
+	 */
+	acpi_boot_table_init();
+#endif
+
+#ifdef CONFIG_ACPI_NUMA
+        /*
+         * Parse SRAT to discover nodes.
+         */
+        acpi_numa_init();
+#endif
+
 	max_low_pfn = setup_memory();
 
 #ifdef CONFIG_ACPI_SLEEP
@@ -841,10 +866,6 @@ void __init setup_arch(char **cmdline_p)
 
 	paravirt_post_allocator_init();
 
-	dmi_scan_machine();
-
-	io_delay_init();
-
 #ifdef CONFIG_X86_SMP
 	/*
 	 * setup to use the early static init tables during kernel startup
@@ -861,13 +882,6 @@ void __init setup_arch(char **cmdline_p)
 	generic_apic_probe();
 #endif
 
-#ifdef CONFIG_ACPI
-	/*
-	 * Parse the ACPI tables for possible boot-time SMP configuration.
-	 */
-	acpi_boot_table_init();
-#endif
-
 	early_quirks();
 
 #ifdef CONFIG_ACPI
diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c
index e9d91720a40f..5978023b799b 100644
--- a/arch/x86/kernel/srat_32.c
+++ b/arch/x86/kernel/srat_32.c
@@ -42,7 +42,7 @@
 #define BMAP_TEST(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
 /* bitmap length; _PXM is at most 255 */
 #define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) 
-static u8 pxm_bitmap[PXM_BITMAP_LEN];	/* bitmap of proximity domains */
+static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN];	/* bitmap of proximity domains */
 
 #define MAX_CHUNKS_PER_NODE	3
 #define MAXCHUNKS		(MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
@@ -53,16 +53,37 @@ struct node_memory_chunk_s {
 	u8	nid;		// which cnode contains this chunk?
 	u8	bank;		// which mem bank on this node
 };
-static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
+static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
 
-static int num_memory_chunks;		/* total number of memory chunks */
+static int __initdata num_memory_chunks; /* total number of memory chunks */
 static u8 __initdata apicid_to_pxm[MAX_APICID];
 
+int numa_off __initdata;
+int acpi_numa __initdata;
+
+static __init void bad_srat(void)
+{
+        printk(KERN_ERR "SRAT: SRAT not used.\n");
+        acpi_numa = -1;
+	num_memory_chunks = 0;
+}
+
+static __init inline int srat_disabled(void)
+{
+	return numa_off || acpi_numa < 0;
+}
+
 /* Identify CPU proximity domains */
-static void __init parse_cpu_affinity_structure(char *p)
+void __init
+acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
 {
-	struct acpi_srat_cpu_affinity *cpu_affinity =
-				(struct acpi_srat_cpu_affinity *) p;
+	if (srat_disabled())
+		return;
+	if (cpu_affinity->header.length !=
+	     sizeof(struct acpi_srat_cpu_affinity)) {
+		bad_srat();
+		return;
+	}
 
 	if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
 		return;		/* empty entry */
@@ -80,14 +101,21 @@ static void __init parse_cpu_affinity_structure(char *p)
  * Identify memory proximity domains and hot-remove capabilities.
  * Fill node memory chunk list structure.
  */
-static void __init parse_memory_affinity_structure (char *sratp)
+void __init
+acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
 {
 	unsigned long long paddr, size;
 	unsigned long start_pfn, end_pfn;
 	u8 pxm;
 	struct node_memory_chunk_s *p, *q, *pend;
-	struct acpi_srat_mem_affinity *memory_affinity =
-			(struct acpi_srat_mem_affinity *) sratp;
+
+	if (srat_disabled())
+		return;
+	if (memory_affinity->header.length !=
+	     sizeof(struct acpi_srat_mem_affinity)) {
+		bad_srat();
+		return;
+	}
 
 	if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
 		return;		/* empty entry */
@@ -135,6 +163,14 @@ static void __init parse_memory_affinity_structure (char *sratp)
 		 "enabled and removable" : "enabled" ) );
 }
 
+/* Callback for SLIT parsing */
+void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
+{
+}
+
+void acpi_numa_arch_fixup(void)
+{
+}
 /*
  * The SRAT table always lists ascending addresses, so can always
  * assume that the first "start" address that you see is the real
@@ -167,39 +203,13 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c
 		node_end_pfn[nid] = memory_chunk->end_pfn;
 }
 
-/* Parse the ACPI Static Resource Affinity Table */
-static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
+int __init get_memcfg_from_srat(void)
 {
-	u8 *start, *end, *p;
 	int i, j, nid;
 
-	start = (u8 *)(&(sratp->reserved) + 1);	/* skip header */
-	p = start;
-	end = (u8 *)sratp + sratp->header.length;
 
-	memset(pxm_bitmap, 0, sizeof(pxm_bitmap));	/* init proximity domain bitmap */
-	memset(node_memory_chunk, 0, sizeof(node_memory_chunk));
-
-	num_memory_chunks = 0;
-	while (p < end) {
-		switch (*p) {
-		case ACPI_SRAT_TYPE_CPU_AFFINITY:
-			parse_cpu_affinity_structure(p);
-			break;
-		case ACPI_SRAT_TYPE_MEMORY_AFFINITY:
-			parse_memory_affinity_structure(p);
-			break;
-		default:
-			printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]);
-			break;
-		}
-		p += p[1];
-		if (p[1] == 0) {
-			printk("acpi20_parse_srat: Entry length value is zero;"
-				" can't parse any further!\n");
-			break;
-		}
-	}
+	if (srat_disabled())
+		goto out_fail;
 
 	if (num_memory_chunks == 0) {
 		printk("could not finy any ACPI SRAT memory areas.\n");
@@ -248,7 +258,7 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
 		e820_register_active_regions(chunk->nid, chunk->start_pfn,
 					     min(chunk->end_pfn, max_pfn));
 	}
- 
+
 	for_each_online_node(nid) {
 		unsigned long start = node_start_pfn[nid];
 		unsigned long end = min(node_end_pfn[nid], max_pfn);
@@ -258,118 +268,6 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
 	}
 	return 1;
 out_fail:
-	return 0;
-}
-
-struct acpi_static_rsdt {
-	struct acpi_table_rsdt table;
-	u32 padding[32]; /* Allow for 32 more table entries */
-};
-
-int __init get_memcfg_from_srat(void)
-{
-	struct acpi_table_header *header = NULL;
-	struct acpi_table_rsdp *rsdp = NULL;
-	struct acpi_table_rsdt *rsdt = NULL;
-	acpi_native_uint rsdp_address = 0;
-	struct acpi_static_rsdt saved_rsdt;
-	int tables = 0;
-	int i = 0;
-
-	rsdp_address = acpi_os_get_root_pointer();
-	if (!rsdp_address) {
-		printk("%s: System description tables not found\n",
-		       __func__);
-		goto out_err;
-	}
-
-	printk("%s: assigning address to rsdp\n", __func__);
-	rsdp = (struct acpi_table_rsdp *)(u32)rsdp_address;
-	if (!rsdp) {
-		printk("%s: Didn't find ACPI root!\n", __func__);
-		goto out_err;
-	}
-
-	printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision,
-		rsdp->oem_id);
-
-	if (strncmp(rsdp->signature, ACPI_SIG_RSDP,strlen(ACPI_SIG_RSDP))) {
-		printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __func__);
-		goto out_err;
-	}
-
-	rsdt = (struct acpi_table_rsdt *)
-	    early_ioremap(rsdp->rsdt_physical_address, sizeof(saved_rsdt));
-
-	if (!rsdt) {
-		printk(KERN_WARNING
-		       "%s: ACPI: Invalid root system description tables (RSDT)\n",
-		       __func__);
-		goto out_err;
-	}
-
-	header = &rsdt->header;
-
-	if (strncmp(header->signature, ACPI_SIG_RSDT, strlen(ACPI_SIG_RSDT))) {
-		printk(KERN_WARNING "ACPI: RSDT signature incorrect\n");
-		early_iounmap(rsdt, sizeof(saved_rsdt));
-		goto out_err;
-	}
-
-	/* 
-	 * The number of tables is computed by taking the 
-	 * size of all entries (header size minus total 
-	 * size of RSDT) divided by the size of each entry
-	 * (4-byte table pointers).
-	 */
-	tables = (header->length - sizeof(struct acpi_table_header)) / sizeof(u32);
-
-	if (!tables)
-		goto out_err;
-
-	memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt));
-	early_iounmap(rsdt, sizeof(saved_rsdt));
-	if (saved_rsdt.table.header.length > sizeof(saved_rsdt)) {
-		printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n",
-		       saved_rsdt.table.header.length);
-		goto out_err;
-	}
-
-	printk("Begin SRAT table scan....%d\n", tables);
-
-	for (i = 0; i < tables; i++){
-		int result;
-		u32 length;
-		/* Map in header, then map in full table length. */
-		header = (struct acpi_table_header *)
-			early_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header));
-		if (!header)
-			break;
-
-                printk(KERN_INFO "ACPI: %4.4s %08lX, %04X\n",
-                           header->signature,
-		   (unsigned long)saved_rsdt.table.table_offset_entry[i],
-                           header->length);
-
-		if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4)) {
-			early_iounmap(header, sizeof(struct acpi_table_header));
-			continue;
-		}
-
-		length = header->length;
-		early_iounmap(header, sizeof(struct acpi_table_header));
-		header = (struct acpi_table_header *)
-			early_ioremap(saved_rsdt.table.table_offset_entry[i], length);
-		if (!header)
-			break;
-
-		/* we've found the srat table. don't need to look at any more tables */
-		result = acpi20_parse_srat((struct acpi_table_srat *)header);
-		early_iounmap(header, length);
-		return result;
-	}
-out_err:
-	remove_all_active_ranges();
 	printk("failed to get NUMA memory information from SRAT table\n");
 	return 0;
 }
diff --git a/include/asm-x86/acpi.h b/include/asm-x86/acpi.h
index 73ce5b32443f..635d764dc13e 100644
--- a/include/asm-x86/acpi.h
+++ b/include/asm-x86/acpi.h
@@ -161,9 +161,7 @@ struct bootnode;
 #ifdef CONFIG_ACPI_NUMA
 extern int acpi_numa;
 extern int acpi_scan_nodes(unsigned long start, unsigned long end);
-#ifdef CONFIG_X86_64
-# define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
-#endif
+#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
 extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
 				   int num_nodes);
 #else
diff --git a/include/asm-x86/dmi.h b/include/asm-x86/dmi.h
index 4edf7514a750..58a86571fe0f 100644
--- a/include/asm-x86/dmi.h
+++ b/include/asm-x86/dmi.h
@@ -3,12 +3,6 @@
 
 #include <asm/io.h>
 
-#ifdef CONFIG_X86_32
-
-#define dmi_alloc alloc_bootmem
-
-#else /* CONFIG_X86_32 */
-
 #define DMI_MAX_DATA 2048
 
 extern int dmi_alloc_index;
@@ -25,8 +19,6 @@ static inline void *dmi_alloc(unsigned len)
 	return dmi_alloc_data + idx;
 }
 
-#endif
-
 /* Use early IO mappings for DMI because it's initialized early */
 #define dmi_ioremap early_ioremap
 #define dmi_iounmap early_iounmap
-- 
cgit v1.2.3


From 95a71a45c250177854f7c530810c88a8a19a443b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 18 Jun 2008 17:27:08 -0700
Subject: x86: cleanup machine_specific_memory_setup, v2

1. let 64bit support 88 and e801 too
2. introduce default_machine_specific_memory_setup, and reuse it
   for voyager

v2: fix 64 bit compiling

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c        | 13 +++++++------
 arch/x86/mach-voyager/setup.c | 32 +-------------------------------
 include/asm-x86/e820.h        |  3 +--
 include/asm-x86/setup.h       |  3 ++-
 4 files changed, 11 insertions(+), 40 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 49477484a2fa..7b613d2efb04 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1029,7 +1029,7 @@ void __init e820_reserve_resources(void)
 	}
 }
 
-char *__init __attribute__((weak)) machine_specific_memory_setup(void)
+char *__init default_machine_specific_memory_setup(void)
 {
 	char *who = "BIOS-e820";
 	int new_nr;
@@ -1045,10 +1045,7 @@ char *__init __attribute__((weak)) machine_specific_memory_setup(void)
 			&new_nr);
 	boot_params.e820_entries = new_nr;
 	if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) {
-#ifdef CONFIG_X86_64
-		early_panic("Cannot find a valid memory map");
-#else
-		unsigned long mem_size;
+		u64 mem_size;
 
 		/* compare results from other methods and take the greater */
 		if (boot_params.alt_mem_k
@@ -1063,13 +1060,17 @@ char *__init __attribute__((weak)) machine_specific_memory_setup(void)
 		e820.nr_map = 0;
 		e820_add_region(0, LOWMEMSIZE(), E820_RAM);
 		e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
-#endif
 	}
 
 	/* In case someone cares... */
 	return who;
 }
 
+char *__init __attribute__((weak)) machine_specific_memory_setup(void)
+{
+	return default_machine_specific_memory_setup();
+}
+
 /* Overridden in paravirt.c if CONFIG_PARAVIRT */
 char * __init __attribute__((weak)) memory_setup(void)
 {
diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c
index f27b583154e5..6bbdd633864c 100644
--- a/arch/x86/mach-voyager/setup.c
+++ b/arch/x86/mach-voyager/setup.c
@@ -104,35 +104,5 @@ char *__init machine_specific_memory_setup(void)
 		return who;
 	}
 
-	who = "BIOS-e820";
-
-	/*
-	 * Try to copy the BIOS-supplied E820-map.
-	 *
-	 * Otherwise fake a memory map; one section from 0k->640k,
-	 * the next section from 1mb->appropriate_mem_k
-	 */
-	new_nr = boot_params.e820_entries;
-	sanitize_e820_map(boot_params.e820_map,
-			ARRAY_SIZE(boot_params.e820_map),
-			&new_nr);
-	boot_params.e820_entries = new_nr;
-	if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries)
-	    < 0) {
-		unsigned long mem_size;
-
-		/* compare results from other methods and take the greater */
-		if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) {
-			mem_size = boot_params.screen_info.ext_mem_k;
-			who = "BIOS-88";
-		} else {
-			mem_size = boot_params.alt_mem_k;
-			who = "BIOS-e801";
-		}
-
-		e820.nr_map = 0;
-		e820_add_region(0, LOWMEMSIZE(), E820_RAM);
-		e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
-	}
-	return who;
+	return default_machine_specific_memory_setup();
 }
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index 83144cb6c68d..668a0d743d25 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -102,6 +102,7 @@ extern u64 e820_hole_size(u64 start, u64 end);
 extern void finish_e820_parsing(void);
 extern void e820_reserve_resources(void);
 extern void setup_memory_map(void);
+extern char *default_machine_specific_memory_setup(void);
 extern char *machine_specific_memory_setup(void);
 extern char *memory_setup(void);
 
@@ -116,9 +117,7 @@ extern char *memory_setup(void);
 #ifdef __KERNEL__
 #include <linux/ioport.h>
 
-#ifdef CONFIG_X86_32
 #define HIGH_MEMORY	(1024*1024)
-#endif
 #endif /* __KERNEL__ */
 
 #endif  /* __ASM_E820_H */
diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h
index b676b0be7986..e14b6e73d266 100644
--- a/include/asm-x86/setup.h
+++ b/include/asm-x86/setup.h
@@ -42,13 +42,14 @@ void vsmp_init(void);
  */
 extern struct boot_params boot_params;
 
-#ifdef __i386__
 /*
  * Do NOT EVER look at the BIOS memory size location.
  * It does not work on many machines.
  */
 #define LOWMEMSIZE()	(0x9f000)
 
+#ifdef __i386__
+
 void __init i386_start_kernel(void);
 
 extern unsigned long init_pg_tables_start;
-- 
cgit v1.2.3


From fcfa146e412023dd55f8855f240b2c2082dc1baa Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 18 Jun 2008 17:29:31 -0700
Subject: x86: update mptable fix with no ioapic v2

if the system doesn't have ioapic, we don't need to store entries for mptable
update

also let mp_config_acpi_gsi not call func in mpparse
so later could decouple mpparse with acpi more easily

Reported-by: Daniel Exner <dex@dragonslave.de>
Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Daniel Exner <dex@dragonslave.de>
Cc: Len Brown <lenb@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/boot.c | 87 +++++++++++++++++++++++++++++----------------
 arch/x86/kernel/mpparse.c   | 19 +++++-----
 include/asm-x86/mpspec.h    |  2 --
 3 files changed, 65 insertions(+), 43 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 29b365a66cf3..91bb9a99338d 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -960,10 +960,37 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 	nr_ioapics++;
 }
 
+static void assign_to_mp_irq(struct mp_config_intsrc *m,
+				    struct mp_config_intsrc *mp_irq)
+{
+	memcpy(mp_irq, m, sizeof(struct mp_config_intsrc));
+}
+
+static int mp_irq_cmp(struct mp_config_intsrc *mp_irq,
+				struct mp_config_intsrc *m)
+{
+	return memcmp(mp_irq, m, sizeof(struct mp_config_intsrc));
+}
+
+static void save_mp_irq(struct mp_config_intsrc *m)
+{
+	int i;
+
+	for (i = 0; i < mp_irq_entries; i++) {
+		if (!mp_irq_cmp(&mp_irqs[i], m))
+			return;
+	}
+
+	assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
+	if (++mp_irq_entries == MAX_IRQ_SOURCES)
+		panic("Max # of irq sources exceeded!!\n");
+}
+
 void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 {
 	int ioapic;
 	int pin;
+	struct mp_config_intsrc mp_irq;
 
 	/*
 	 * Convert 'gsi' to 'ioapic.pin'.
@@ -981,18 +1008,15 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 	if ((bus_irq == 0) && (trigger == 3))
 		trigger = 1;
 
-	mp_irqs[mp_irq_entries].mp_type = MP_INTSRC;
-	mp_irqs[mp_irq_entries].mp_irqtype = mp_INT;
-	mp_irqs[mp_irq_entries].mp_irqflag = (trigger << 2) | polarity;
-	mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS;
-	mp_irqs[mp_irq_entries].mp_srcbusirq = bus_irq;	/* IRQ */
-	mp_irqs[mp_irq_entries].mp_dstapic =
-			mp_ioapics[ioapic].mp_apicid;	/* APIC ID */
-	mp_irqs[mp_irq_entries].mp_dstirq = pin;	/* INTIN# */
-
-	if (++mp_irq_entries == MAX_IRQ_SOURCES)
-		panic("Max # of irq sources exceeded!!\n");
+	mp_irq.mp_type = MP_INTSRC;
+	mp_irq.mp_irqtype = mp_INT;
+	mp_irq.mp_irqflag = (trigger << 2) | polarity;
+	mp_irq.mp_srcbus = MP_ISA_BUS;
+	mp_irq.mp_srcbusirq = bus_irq;	/* IRQ */
+	mp_irq.mp_dstapic = mp_ioapics[ioapic].mp_apicid; /* APIC ID */
+	mp_irq.mp_dstirq = pin;	/* INTIN# */
 
+	save_mp_irq(&mp_irq);
 }
 
 void __init mp_config_acpi_legacy_irqs(void)
@@ -1000,6 +1024,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 	int i;
 	int ioapic;
 	unsigned int dstapic;
+	struct mp_config_intsrc mp_irq;
 
 #if defined (CONFIG_MCA) || defined (CONFIG_EISA)
 	/*
@@ -1052,16 +1077,15 @@ void __init mp_config_acpi_legacy_irqs(void)
 			continue;	/* IRQ already used */
 		}
 
-		mp_irqs[mp_irq_entries].mp_type = MP_INTSRC;
-		mp_irqs[mp_irq_entries].mp_irqflag = 0;	/* Conforming */
-		mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS;
-		mp_irqs[mp_irq_entries].mp_dstapic = dstapic;
-		mp_irqs[mp_irq_entries].mp_irqtype = mp_INT;
-		mp_irqs[mp_irq_entries].mp_srcbusirq = i; /* Identity mapped */
-		mp_irqs[mp_irq_entries].mp_dstirq = i;
+		mp_irq.mp_type = MP_INTSRC;
+		mp_irq.mp_irqflag = 0;	/* Conforming */
+		mp_irq.mp_srcbus = MP_ISA_BUS;
+		mp_irq.mp_dstapic = dstapic;
+		mp_irq.mp_irqtype = mp_INT;
+		mp_irq.mp_srcbusirq = i; /* Identity mapped */
+		mp_irq.mp_dstirq = i;
 
-		if (++mp_irq_entries == MAX_IRQ_SOURCES)
-			panic("Max # of irq sources exceeded!!\n");
+		save_mp_irq(&mp_irq);
 	}
 }
 
@@ -1169,25 +1193,26 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
 int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
 			u32 gsi, int triggering, int polarity)
 {
-	struct mpc_config_intsrc intsrc;
+#ifdef CONFIG_X86_MPPARSE
+	struct mp_config_intsrc mp_irq;
 	int ioapic;
 
-	if (!enable_update_mptable)
+	if (!acpi_ioapic)
 		return 0;
 
 	/* print the entry should happen on mptable identically */
-	intsrc.mpc_type = MP_INTSRC;
-	intsrc.mpc_irqtype = mp_INT;
-	intsrc.mpc_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+	mp_irq.mp_type = MP_INTSRC;
+	mp_irq.mp_irqtype = mp_INT;
+	mp_irq.mp_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
 				(polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
-	intsrc.mpc_srcbus = number;
-	intsrc.mpc_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+	mp_irq.mp_srcbus = number;
+	mp_irq.mp_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
 	ioapic = mp_find_ioapic(gsi);
-	intsrc.mpc_dstapic = mp_ioapic_routing[ioapic].apic_id;
-	intsrc.mpc_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base;
-
-	MP_intsrc_info(&intsrc);
+	mp_irq.mp_dstapic = mp_ioapic_routing[ioapic].apic_id;
+	mp_irq.mp_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base;
 
+	save_mp_irq(&mp_irq);
+#endif
 	return 0;
 }
 
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 014ac5d90f80..8b6b1e05c306 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -34,8 +34,6 @@
 #include <mach_mpparse.h>
 #endif
 
-int enable_update_mptable;
-
 /*
  * Checksum an MP configuration block.
  */
@@ -246,7 +244,7 @@ static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
 		mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
 }
 
-static void assign_to_mp_irq(struct mpc_config_intsrc *m,
+static void __init assign_to_mp_irq(struct mpc_config_intsrc *m,
 				    struct mp_config_intsrc *mp_irq)
 {
 	mp_irq->mp_dstapic = m->mpc_dstapic;
@@ -270,7 +268,7 @@ static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
 	m->mpc_dstirq = mp_irq->mp_dstirq;
 }
 
-static int mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
+static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
 					struct mpc_config_intsrc *m)
 {
 	if (mp_irq->mp_dstapic != m->mpc_dstapic)
@@ -291,17 +289,16 @@ static int mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
 	return 0;
 }
 
-void MP_intsrc_info(struct mpc_config_intsrc *m)
+static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
 {
 	int i;
 
 	print_MP_intsrc_info(m);
 
-	if (enable_update_mptable)
-		for (i = 0; i < mp_irq_entries; i++) {
-			if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
-				return;
-		}
+	for (i = 0; i < mp_irq_entries; i++) {
+		if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
+			return;
+	}
 
 	assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
 	if (++mp_irq_entries == MAX_IRQ_SOURCES)
@@ -1113,6 +1110,8 @@ out:
 	return 0;
 }
 
+static int __initdata enable_update_mptable;
+
 static int __init update_mptable_setup(char *str)
 {
 	enable_update_mptable = 1;
diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h
index f48dbca740e4..6ec1a5453b3e 100644
--- a/include/asm-x86/mpspec.h
+++ b/include/asm-x86/mpspec.h
@@ -59,9 +59,7 @@ extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
 				   u32 gsi);
 extern void mp_config_acpi_legacy_irqs(void);
 extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low);
-extern void MP_intsrc_info(struct mpc_config_intsrc *m);
 #ifdef CONFIG_X86_IO_APIC
-extern int enable_update_mptable;
 extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
 				u32 gsi, int triggering, int polarity);
 #else
-- 
cgit v1.2.3


From 471694ea6c6ccdf7131354f1d698d4d83a605293 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Tue, 1 Jul 2008 01:11:35 +0100
Subject: x86, ioapic, acpi: add a knob to disable IRQ 0 through I/O APIC

As discovered recently some systems exhibit problems when the 8254 timer
IRQ is routed through the I/O APIC.  These problems do not affect the
timer IRQ itself and therefore cannot be detected when the correctness of
operation of the interrupt is verified in check_timer().  Therefore the
I/O APIC path of the timer IRQ has to be disabled entirely.

This is a change that lets platforms ask for the timer IRQ not to be
registered in the I/O APIC interrupt tables.  The local APIC and ExtINTA
paths are unaffected.  This request is only taken into account for ACPI
platforms as MP table systems seem unaffected so far.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Matthew Garrett <mjg59@srcf.ucam.org>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/boot.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 91bb9a99338d..92a542653956 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -83,6 +83,8 @@ int acpi_lapic;
 int acpi_ioapic;
 int acpi_strict;
 
+static int disable_irq0_through_ioapic __initdata;
+
 u8 acpi_sci_flags __initdata;
 int acpi_sci_override_gsi __initdata;
 int acpi_skip_timer_override __initdata;
@@ -992,6 +994,10 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 	int pin;
 	struct mp_config_intsrc mp_irq;
 
+	/* Skip the 8254 timer interrupt (IRQ 0) if requested.  */
+	if (bus_irq == 0 && disable_irq0_through_ioapic)
+		return;
+
 	/*
 	 * Convert 'gsi' to 'ioapic.pin'.
 	 */
@@ -1058,6 +1064,10 @@ void __init mp_config_acpi_legacy_irqs(void)
 	for (i = 0; i < 16; i++) {
 		int idx;
 
+		/* Skip the 8254 timer interrupt (IRQ 0) if requested.  */
+		if (i == 0 && disable_irq0_through_ioapic)
+			continue;
+
 		for (idx = 0; idx < mp_irq_entries; idx++) {
 			struct mp_config_intsrc *irq = mp_irqs + idx;
 
-- 
cgit v1.2.3


From 9340e1ccdf7b9b22a2be7f51cd74e8b5e11961bf Mon Sep 17 00:00:00 2001
From: Matthew Garrett <mjg59@srcf.ucam.org>
Date: Tue, 1 Jul 2008 01:12:06 +0100
Subject: x86, ioapic, acpi quirk: disable IRQ 0 through I/O APIC for some HP
 systems

Some HP laptops have a problem with their DSDT reporting as
HP/SB400/10000, which includes some code which overrides all temperature
trip points to 16C if the INTIN2 input of the I/O APIC is enabled.  This
input is incorrectly designated the ISA IRQ 0 via an interrupt source
override even though it is wired to the output of the master 8259A and
INTIN0 is not connected at all.  So far two models have been identified,
namely nx6125 and nx6325.

Use a knob provided by the I/O APIC interrupt registration code to
abandon any attempts to route IRQ 0 through the I/O APIC for these
systems.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Matthew Garrett <mjg59@srcf.ucam.org>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/boot.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 92a542653956..9908ef45a6b2 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1426,6 +1426,17 @@ static int __init force_acpi_ht(const struct dmi_system_id *d)
 	return 0;
 }
 
+/*
+ * Don't register any I/O APIC entries for the 8254 timer IRQ.
+ */
+static int __init
+dmi_disable_irq0_through_ioapic(const struct dmi_system_id *d)
+{
+	pr_notice("%s detected: disabling IRQ 0 through I/O APIC\n", d->ident);
+	disable_irq0_through_ioapic = 1;
+	return 0;
+}
+
 /*
  * If your system is blacklisted here, but you find that acpi=force
  * works for you, please contact acpi-devel@sourceforge.net
@@ -1593,6 +1604,32 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
 		     DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
 		     },
 	 },
+	/*
+	 * HP laptops which use a DSDT reporting as HP/SB400/10000,
+	 * which includes some code which overrides all temperature
+	 * trip points to 16C if the INTIN2 input of the I/O APIC
+	 * is enabled.  This input is incorrectly designated the
+	 * ISA IRQ 0 via an interrupt source override even though
+	 * it is wired to the output of the master 8259A and INTIN0
+	 * is not connected at all.  Abandon any attempts to route
+	 * IRQ 0 through the I/O APIC therefore.
+	 */
+	{
+	 .callback = dmi_disable_irq0_through_ioapic,
+	 .ident = "HP NX6125 laptop",
+	 .matches = {
+		     DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6125"),
+		     },
+	 },
+	{
+	 .callback = dmi_disable_irq0_through_ioapic,
+	 .ident = "HP NX6325 laptop",
+	 .matches = {
+		     DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
+		     },
+	 },
 	{}
 };
 
-- 
cgit v1.2.3


From 7496b60654e759d0b9008b80908e80727904b3c4 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Mon, 12 May 2008 21:21:12 +0200
Subject: x86: fix remove cpu_pda table patch

Mike Travis wrote:
> Ingo Molnar wrote:
>> * Mike Travis <travis@sgi.com> wrote:
>>
>>> [Ingo - please replace "PATCH 07/11" with this one.]
>>>
>>>     *	Remove 544k bytes from the kernel by removing the boot_cpu_pda
>>> 	array from the data section and allocating it during startup.
>>>
>>> 	Fixed panic in setup_per_cpu_areas when HOTPLUG_CPU not set.
>>>
>>> For inclusion into sched-devel/latest tree.
>> sched-devel.git randconfig testing found another crash with your queue:
>>
>> [    0.111060] Brought up 1 CPUs
>> [    0.111986] Total of 1 processors activated (4022.73 BogoMIPS).
>> [    0.112987] Testing NMI watchdog ... <1>BUG: unable to handle kernel NULL pointer dereference at 0000000000000040
>> [    0.114982] IP: [<ffffffff8180d4a0>] check_nmi_watchdog+0xb0/0x210
>> [    0.114982] PGD 0
>> [    0.114982] Oops: 0000 [1] SMP
>> [    0.114982] CPU 0
>> [............]
>>
>>  http://redhat.com/~mingo/misc/config-Mon_Apr_28_23_25_25_CEST_2008.bad
>>  http://redhat.com/~mingo/misc/log-Mon_Apr_28_23_25_25_CEST_2008.bad
>>
>> 	Ingo
>
> Hi Ingo,
>
> I need a bit more information on your hardware configuration.  Building a
> kernel with the above config file started up fine on both the Intel and AMD
> boxes.
>
> Based on the above output it looks like it might be a UP machine?
...

Ok, I think I found it.  In check_nmi_watchdog():

        for (cpu = 0; cpu < NR_CPUS; cpu++)
                prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count;

As I mentioned it works fine on both of my systems so could you try it out?

Thanks!
Mike
--

  * Change function check_nmi_watchdog() to use nr_cpu_ids instead of NR_CPUS.

Based on:
	git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
    +   sched-devel/latest  .../mingo/linux-2.6-sched-devel.git

Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/nmi_64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c
index 5a29ded994fa..2861b9408ac9 100644
--- a/arch/x86/kernel/nmi_64.c
+++ b/arch/x86/kernel/nmi_64.c
@@ -88,7 +88,7 @@ int __init check_nmi_watchdog(void)
 	if (!atomic_read(&nmi_active))
 		return 0;
 
-	prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
+	prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
 	if (!prev_nmi_count)
 		return -1;
 
@@ -99,7 +99,7 @@ int __init check_nmi_watchdog(void)
 		smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
 #endif
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
+	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
 		prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count;
 	local_irq_enable();
 	mdelay((20*1000)/nmi_hz); // wait 20 ticks
-- 
cgit v1.2.3


From 23ca4bba3e20c6c3cb11c1bb0ab4770b724d39ac Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Mon, 12 May 2008 21:21:12 +0200
Subject: x86: cleanup early per cpu variables/accesses v4

  * Introduce a new PER_CPU macro called "EARLY_PER_CPU".  This is
    used by some per_cpu variables that are initialized and accessed
    before there are per_cpu areas allocated.

    ["Early" in respect to per_cpu variables is "earlier than the per_cpu
    areas have been setup".]

    This patchset adds these new macros:

	DEFINE_EARLY_PER_CPU(_type, _name, _initvalue)
	EXPORT_EARLY_PER_CPU_SYMBOL(_name)
	DECLARE_EARLY_PER_CPU(_type, _name)

	early_per_cpu_ptr(_name)
	early_per_cpu_map(_name, _idx)
	early_per_cpu(_name, _cpu)

    The DEFINE macro defines the per_cpu variable as well as the early
    map and pointer.  It also initializes the per_cpu variable and map
    elements to "_initvalue".  The early_* macros provide access to
    the initial map (usually setup during system init) and the early
    pointer.  This pointer is initialized to point to the early map
    but is then NULL'ed when the actual per_cpu areas are setup.  After
    that the per_cpu variable is the correct access to the variable.

    The early_per_cpu() macro is not very efficient but does show how to
    access the variable if you have a function that can be called both
    "early" and "late".  It tests the early ptr to be NULL, and if not
    then it's still valid.  Otherwise, the per_cpu variable is used
    instead:

	#define early_per_cpu(_name, _cpu) 			\
		(early_per_cpu_ptr(_name) ?			\
			early_per_cpu_ptr(_name)[_cpu] :	\
			per_cpu(_name, _cpu))

    A better method is to actually check the pointer manually.  In the
    case below, numa_set_node can be called both "early" and "late":

	void __cpuinit numa_set_node(int cpu, int node)
	{
	    int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

	    if (cpu_to_node_map)
		    cpu_to_node_map[cpu] = node;
	    else
		    per_cpu(x86_cpu_to_node_map, cpu) = node;
	}

  * Add a flag "arch_provides_topology_pointers" that indicates pointers
    to topology cpumask_t maps are available.  Otherwise, use the function
    returning the cpumask_t value.  This is useful if cpumask_t set size
    is very large to avoid copying data on to/off of the stack.

  * The coverage of CONFIG_DEBUG_PER_CPU_MAPS has been increased while
    the non-debug case has been optimized a bit.

  * Remove an unreferenced compiler warning in drivers/base/topology.c

  * Clean up #ifdef in setup.c

For inclusion into sched-devel/latest tree.

Based on:
	git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
    +   sched-devel/latest  .../mingo/linux-2.6-sched-devel.git

Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/Kconfig           |   2 +-
 arch/x86/Kconfig.debug     |   2 +-
 arch/x86/kernel/apic_32.c  |   9 +--
 arch/x86/kernel/apic_64.c  |  11 ++--
 arch/x86/kernel/setup.c    |  96 ++++++++++++++++++++++++++----
 arch/x86/kernel/setup_32.c |  24 --------
 arch/x86/kernel/setup_64.c |   9 ---
 arch/x86/kernel/smpboot.c  |  20 +------
 arch/x86/mm/numa_64.c      |  43 ++++----------
 arch/x86/mm/srat_64.c      |   2 +-
 drivers/base/topology.c    |  25 +++++++-
 include/asm-x86/numa_64.h  |  19 +++---
 include/asm-x86/percpu.h   |  46 +++++++++++++++
 include/asm-x86/smp.h      |  15 +----
 include/asm-x86/topology.h | 143 ++++++++++++++++++++++++++-------------------
 15 files changed, 270 insertions(+), 196 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2e325521e5e9..4469a0db1ae1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -121,7 +121,7 @@ config ARCH_HAS_CACHE_LINE_SIZE
 	def_bool y
 
 config HAVE_SETUP_PER_CPU_AREA
-	def_bool X86_64 || (X86_SMP && !X86_VOYAGER)
+	def_bool X86_64_SMP || (X86_SMP && !X86_VOYAGER)
 
 config HAVE_CPUMASK_OF_CPU_MAP
 	def_bool X86_64_SMP
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 18363374d51a..24ca95a0ba54 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -60,7 +60,7 @@ config DEBUG_PAGEALLOC
 config DEBUG_PER_CPU_MAPS
 	bool "Debug access to per_cpu maps"
 	depends on DEBUG_KERNEL
-	depends on X86_64_SMP
+	depends on X86_SMP
 	default n
 	help
 	  Say Y to verify that the per_cpu map being accessed has
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 4b99b1bdeb6c..f17c1c1bc384 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -52,9 +52,6 @@
 
 unsigned long mp_lapic_addr;
 
-DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
-EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
-
 /*
  * Knob to control our willingness to enable the local APIC.
  *
@@ -1534,9 +1531,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
 	}
 #ifdef CONFIG_SMP
 	/* are we being called early in kernel startup? */
-	if (x86_cpu_to_apicid_early_ptr) {
-		u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
-		u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
+	if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
+		u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
+		u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
 
 		cpu_to_apicid[cpu] = apicid;
 		bios_cpu_apicid[cpu] = apicid;
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 0633cfd0dc29..4fd21f7d698c 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -87,9 +87,6 @@ static unsigned long apic_phys;
 
 unsigned long mp_lapic_addr;
 
-DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
-EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
-
 unsigned int __cpuinitdata maxcpus = NR_CPUS;
 /*
  * Get the LAPIC version
@@ -1091,9 +1088,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
 		cpu = 0;
 	}
 	/* are we being called early in kernel startup? */
-	if (x86_cpu_to_apicid_early_ptr) {
-		u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
-		u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
+	if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
+		u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
+		u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
 
 		cpu_to_apicid[cpu] = apicid;
 		bios_cpu_apicid[cpu] = apicid;
@@ -1269,7 +1266,7 @@ __cpuinit int apic_is_clustered_box(void)
 	if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
 		return 0;
 
-	bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
+	bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
 	bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
 
 	for (i = 0; i < NR_CPUS; i++) {
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 6f80b852a196..03caa8e4351f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -19,13 +19,23 @@ unsigned disabled_cpus __cpuinitdata;
 unsigned int boot_cpu_physical_apicid = -1U;
 EXPORT_SYMBOL(boot_cpu_physical_apicid);
 
-DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
-EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
-
 /* Bitmask of physically existing CPUs */
 physid_mask_t phys_cpu_present_map;
 #endif
 
+/* map cpu index to physical APIC ID */
+DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
+
+#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+#define	X86_64_NUMA	1
+
+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+#endif
+
 #if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
 /*
  * Copy data used in early init routines from the initial arrays to the
@@ -37,20 +47,21 @@ static void __init setup_per_cpu_maps(void)
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
-		per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu];
+		per_cpu(x86_cpu_to_apicid, cpu) =
+				early_per_cpu_map(x86_cpu_to_apicid, cpu);
 		per_cpu(x86_bios_cpu_apicid, cpu) =
-						x86_bios_cpu_apicid_init[cpu];
-#ifdef CONFIG_NUMA
+				early_per_cpu_map(x86_bios_cpu_apicid, cpu);
+#ifdef X86_64_NUMA
 		per_cpu(x86_cpu_to_node_map, cpu) =
-						x86_cpu_to_node_map_init[cpu];
+				early_per_cpu_map(x86_cpu_to_node_map, cpu);
 #endif
 	}
 
 	/* indicate the early static arrays will soon be gone */
-	x86_cpu_to_apicid_early_ptr = NULL;
-	x86_bios_cpu_apicid_early_ptr = NULL;
-#ifdef CONFIG_NUMA
-	x86_cpu_to_node_map_early_ptr = NULL;
+	early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
+	early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
+#ifdef X86_64_NUMA
+	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
 #endif
 }
 
@@ -109,7 +120,8 @@ void __init setup_per_cpu_areas(void)
 		if (!node_online(node) || !NODE_DATA(node)) {
 			ptr = alloc_bootmem_pages(size);
 			printk(KERN_INFO
-			       "cpu %d has no node or node-local memory\n", i);
+			       "cpu %d has no node %d or node-local memory\n",
+				i, node);
 		}
 		else
 			ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
@@ -137,3 +149,63 @@ void __init setup_per_cpu_areas(void)
 }
 
 #endif
+
+#ifdef X86_64_NUMA
+void __cpuinit numa_set_node(int cpu, int node)
+{
+	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
+
+	if (cpu_to_node_map)
+		cpu_to_node_map[cpu] = node;
+
+	else if (per_cpu_offset(cpu))
+		per_cpu(x86_cpu_to_node_map, cpu) = node;
+
+	else
+		Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
+}
+
+void __cpuinit numa_clear_node(int cpu)
+{
+	numa_set_node(cpu, NUMA_NO_NODE);
+}
+
+void __cpuinit numa_add_cpu(int cpu)
+{
+	cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
+}
+#endif /* CONFIG_NUMA */
+
+#if defined(CONFIG_DEBUG_PER_CPU_MAPS) && defined(CONFIG_X86_64)
+
+int cpu_to_node(int cpu)
+{
+	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
+		printk(KERN_WARNING
+			"cpu_to_node(%d): usage too early!\n", cpu);
+		dump_stack();
+		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+	}
+	return per_cpu(x86_cpu_to_node_map, cpu);
+}
+EXPORT_SYMBOL(cpu_to_node);
+
+int early_cpu_to_node(int cpu)
+{
+	if (early_per_cpu_ptr(x86_cpu_to_node_map))
+		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+
+	if (!per_cpu_offset(cpu)) {
+		printk(KERN_WARNING
+			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
+			dump_stack();
+		return NUMA_NO_NODE;
+	}
+	return per_cpu(x86_cpu_to_node_map, cpu);
+}
+#endif
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 5a2f8e063887..ccd5f5cdbbe6 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -737,18 +737,6 @@ char * __init __attribute__((weak)) memory_setup(void)
 	return machine_specific_memory_setup();
 }
 
-#ifdef CONFIG_NUMA
-/*
- * In the golden day, when everything among i386 and x86_64 will be
- * integrated, this will not live here
- */
-void *x86_cpu_to_node_map_early_ptr;
-int x86_cpu_to_node_map_init[NR_CPUS] = {
-	[0 ... NR_CPUS-1] = NUMA_NO_NODE
-};
-DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
-#endif
-
 /*
  * Determine if we were loaded by an EFI loader.  If so, then we have also been
  * passed the efi memmap, systab, etc., so we should use these data structures
@@ -887,18 +875,6 @@ void __init setup_arch(char **cmdline_p)
 
 	io_delay_init();
 
-#ifdef CONFIG_X86_SMP
-	/*
-	 * setup to use the early static init tables during kernel startup
-	 * X86_SMP will exclude sub-arches that don't deal well with it.
-	 */
-	x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
-	x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
-#ifdef CONFIG_NUMA
-	x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
-#endif
-#endif
-
 #ifdef CONFIG_X86_GENERICARCH
 	generic_apic_probe();
 #endif
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 6dff1286ad8a..e8df64fad540 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -406,15 +406,6 @@ void __init setup_arch(char **cmdline_p)
 	kvmclock_init();
 #endif
 
-#ifdef CONFIG_SMP
-	/* setup to use the early static init tables during kernel startup */
-	x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
-	x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
-#ifdef CONFIG_NUMA
-	x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
-#endif
-#endif
-
 #ifdef CONFIG_ACPI
 	/*
 	 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 3e1cecedde42..036604d3daed 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -68,22 +68,6 @@
 #include <mach_wakecpu.h>
 #include <smpboot_hooks.h>
 
-/*
- * FIXME: For x86_64, those are defined in other files. But moving them here,
- * would make the setup areas dependent on smp, which is a loss. When we
- * integrate apic between arches, we can probably do a better job, but
- * right now, they'll stay here -- glommer
- */
-
-/* which logical CPU number maps to which CPU (physical APIC ID) */
-u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata =
-			{ [0 ... NR_CPUS-1] = BAD_APICID };
-void *x86_cpu_to_apicid_early_ptr;
-
-u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
-				= { [0 ... NR_CPUS-1] = BAD_APICID };
-void *x86_bios_cpu_apicid_early_ptr;
-
 #ifdef CONFIG_X86_32
 u8 apicid_2_node[MAX_APICID];
 static int low_mappings;
@@ -992,7 +976,7 @@ do_rest:
 		/* Try to put things back the way they were before ... */
 		unmap_cpu_to_logical_apicid(cpu);
 #ifdef CONFIG_X86_64
-		clear_node_cpumask(cpu); /* was set by numa_add_cpu */
+		numa_remove_cpu(cpu); /* was set by numa_add_cpu */
 #endif
 		cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */
 		cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
@@ -1373,7 +1357,7 @@ static void __ref remove_cpu_from_maps(int cpu)
 	cpu_clear(cpu, cpu_callin_map);
 	/* was set by cpu_init() */
 	clear_bit(cpu, (unsigned long *)&cpu_initialized);
-	clear_node_cpumask(cpu);
+	numa_remove_cpu(cpu);
 #endif
 }
 
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index c5066d519e5d..970f86775c41 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -31,16 +31,6 @@ bootmem_data_t plat_node_bdata[MAX_NUMNODES];
 
 struct memnode memnode;
 
-#ifdef CONFIG_SMP
-int x86_cpu_to_node_map_init[NR_CPUS] = {
-	[0 ... NR_CPUS-1] = NUMA_NO_NODE
-};
-void *x86_cpu_to_node_map_early_ptr;
-EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);
-#endif
-DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
-EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map);
-
 s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
 	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
 };
@@ -577,24 +567,6 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
 }
 
-__cpuinit void numa_add_cpu(int cpu)
-{
-	set_bit(cpu,
-		(unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]);
-}
-
-void __cpuinit numa_set_node(int cpu, int node)
-{
-	int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
-
-	if(cpu_to_node_map)
-		cpu_to_node_map[cpu] = node;
-	else if(per_cpu_offset(cpu))
-		per_cpu(x86_cpu_to_node_map, cpu) = node;
-	else
-		Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
-}
-
 unsigned long __init numa_free_all_bootmem(void)
 {
 	unsigned long pages = 0;
@@ -641,6 +613,7 @@ static __init int numa_setup(char *opt)
 }
 early_param("numa", numa_setup);
 
+#ifdef CONFIG_NUMA
 /*
  * Setup early cpu_to_node.
  *
@@ -652,14 +625,19 @@ early_param("numa", numa_setup);
  * is already initialized in a round robin manner at numa_init_array,
  * prior to this call, and this initialization is good enough
  * for the fake NUMA cases.
+ *
+ * Called before the per_cpu areas are setup.
  */
 void __init init_cpu_to_node(void)
 {
-	int i;
+	int cpu;
+	u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
 
-	for (i = 0; i < NR_CPUS; i++) {
+	BUG_ON(cpu_to_apicid == NULL);
+
+	for_each_possible_cpu(cpu) {
 		int node;
-		u16 apicid = x86_cpu_to_apicid_init[i];
+		u16 apicid = cpu_to_apicid[cpu];
 
 		if (apicid == BAD_APICID)
 			continue;
@@ -668,8 +646,9 @@ void __init init_cpu_to_node(void)
 			continue;
 		if (!node_online(node))
 			continue;
-		numa_set_node(i, node);
+		numa_set_node(cpu, node);
 	}
 }
+#endif
 
 
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 99649dccad28..012220e31c99 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -376,7 +376,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 		if (node == NUMA_NO_NODE)
 			continue;
 		if (!node_isset(node, node_possible_map))
-			numa_set_node(i, NUMA_NO_NODE);
+			numa_clear_node(i);
 	}
 	numa_init_array();
 	return 0;
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index fdf4044d2e74..1efe162e16d7 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -40,6 +40,7 @@ static ssize_t show_##name(struct sys_device *dev, char *buf)	\
 	return sprintf(buf, "%d\n", topology_##name(cpu));	\
 }
 
+#if defined(topology_thread_siblings) || defined(topology_core_siblings)
 static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf)
 {
 	ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
@@ -54,21 +55,41 @@ static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf)
 	}
 	return n;
 }
+#endif
 
+#ifdef arch_provides_topology_pointers
 #define define_siblings_show_map(name)					\
-static inline ssize_t show_##name(struct sys_device *dev, char *buf)	\
+static ssize_t show_##name(struct sys_device *dev, char *buf)	\
 {									\
 	unsigned int cpu = dev->id;					\
 	return show_cpumap(0, &(topology_##name(cpu)), buf);		\
 }
 
 #define define_siblings_show_list(name)					\
-static inline ssize_t show_##name##_list(struct sys_device *dev, char *buf) \
+static ssize_t show_##name##_list(struct sys_device *dev, char *buf) \
 {									\
 	unsigned int cpu = dev->id;					\
 	return show_cpumap(1, &(topology_##name(cpu)), buf);		\
 }
 
+#else
+#define define_siblings_show_map(name)					\
+static ssize_t show_##name(struct sys_device *dev, char *buf)	\
+{									\
+	unsigned int cpu = dev->id;					\
+	cpumask_t mask = topology_##name(cpu);				\
+	return show_cpumap(0, &mask, buf);				\
+}
+
+#define define_siblings_show_list(name)					\
+static ssize_t show_##name##_list(struct sys_device *dev, char *buf) \
+{									\
+	unsigned int cpu = dev->id;					\
+	cpumask_t mask = topology_##name(cpu);				\
+	return show_cpumap(1, &mask, buf);				\
+}
+#endif
+
 #define define_siblings_show_func(name)		\
 	define_siblings_show_map(name); define_siblings_show_list(name)
 
diff --git a/include/asm-x86/numa_64.h b/include/asm-x86/numa_64.h
index 22e87c9f6a80..b510daf4f4d8 100644
--- a/include/asm-x86/numa_64.h
+++ b/include/asm-x86/numa_64.h
@@ -14,11 +14,9 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks,
 
 #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
 
-extern void numa_add_cpu(int cpu);
 extern void numa_init_array(void);
 extern int numa_off;
 
-extern void numa_set_node(int cpu, int node);
 extern void srat_reserve_add_area(int nodeid);
 extern int hotadd_percent;
 
@@ -31,15 +29,16 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
 
 #ifdef CONFIG_NUMA
 extern void __init init_cpu_to_node(void);
-
-static inline void clear_node_cpumask(int cpu)
-{
-	clear_bit(cpu, (unsigned long *)&node_to_cpumask_map[cpu_to_node(cpu)]);
-}
-
+extern void __cpuinit numa_set_node(int cpu, int node);
+extern void __cpuinit numa_clear_node(int cpu);
+extern void __cpuinit numa_add_cpu(int cpu);
+extern void __cpuinit numa_remove_cpu(int cpu);
 #else
-#define init_cpu_to_node() do {} while (0)
-#define clear_node_cpumask(cpu) do {} while (0)
+static inline void init_cpu_to_node(void)		{ }
+static inline void numa_set_node(int cpu, int node)	{ }
+static inline void numa_clear_node(int cpu)		{ }
+static inline void numa_add_cpu(int cpu, int node)	{ }
+static inline void numa_remove_cpu(int cpu)		{ }
 #endif
 
 #endif
diff --git a/include/asm-x86/percpu.h b/include/asm-x86/percpu.h
index 736fc3bb8e1e..912a3a17b9db 100644
--- a/include/asm-x86/percpu.h
+++ b/include/asm-x86/percpu.h
@@ -143,4 +143,50 @@ do {							\
 #define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val)
 #endif /* !__ASSEMBLY__ */
 #endif /* !CONFIG_X86_64 */
+
+#ifdef CONFIG_SMP
+
+/*
+ * Define the "EARLY_PER_CPU" macros.  These are used for some per_cpu
+ * variables that are initialized and accessed before there are per_cpu
+ * areas allocated.
+ */
+
+#define	DEFINE_EARLY_PER_CPU(_type, _name, _initvalue)			\
+	DEFINE_PER_CPU(_type, _name) = _initvalue;			\
+	__typeof__(_type) _name##_early_map[NR_CPUS] __initdata =	\
+				{ [0 ... NR_CPUS-1] = _initvalue };	\
+	__typeof__(_type) *_name##_early_ptr = _name##_early_map
+
+#define EXPORT_EARLY_PER_CPU_SYMBOL(_name)			\
+	EXPORT_PER_CPU_SYMBOL(_name)
+
+#define DECLARE_EARLY_PER_CPU(_type, _name)			\
+	DECLARE_PER_CPU(_type, _name);				\
+	extern __typeof__(_type) *_name##_early_ptr;		\
+	extern __typeof__(_type)  _name##_early_map[]
+
+#define	early_per_cpu_ptr(_name) (_name##_early_ptr)
+#define	early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])
+#define	early_per_cpu(_name, _cpu) 				\
+	(early_per_cpu_ptr(_name) ?				\
+		early_per_cpu_ptr(_name)[_cpu] :		\
+		per_cpu(_name, _cpu))
+
+#else	/* !CONFIG_SMP */
+#define	DEFINE_EARLY_PER_CPU(_type, _name, _initvalue)		\
+	DEFINE_PER_CPU(_type, _name) = _initvalue
+
+#define EXPORT_EARLY_PER_CPU_SYMBOL(_name)			\
+	EXPORT_PER_CPU_SYMBOL(_name)
+
+#define DECLARE_EARLY_PER_CPU(_type, _name)			\
+	DECLARE_PER_CPU(_type, _name)
+
+#define	early_per_cpu(_name, _cpu) per_cpu(_name, _cpu)
+#define	early_per_cpu_ptr(_name) NULL
+/* no early_per_cpu_map() */
+
+#endif	/* !CONFIG_SMP */
+
 #endif /* _ASM_X86_PERCPU_H_ */
diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h
index 1ebaa5cd3112..ec841639fb44 100644
--- a/include/asm-x86/smp.h
+++ b/include/asm-x86/smp.h
@@ -29,21 +29,12 @@ extern int smp_num_siblings;
 extern unsigned int num_processors;
 extern cpumask_t cpu_initialized;
 
-#ifdef CONFIG_SMP
-extern u16 x86_cpu_to_apicid_init[];
-extern u16 x86_bios_cpu_apicid_init[];
-extern void *x86_cpu_to_apicid_early_ptr;
-extern void *x86_bios_cpu_apicid_early_ptr;
-#else
-#define x86_cpu_to_apicid_early_ptr NULL
-#define x86_bios_cpu_apicid_early_ptr NULL
-#endif
-
 DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
 DECLARE_PER_CPU(cpumask_t, cpu_core_map);
 DECLARE_PER_CPU(u16, cpu_llc_id);
-DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
-DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
+
+DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
+DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
 
 /* Static state in head.S used to set up a CPU */
 extern struct {
diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h
index dcf3f8131d6b..dac09cb66dca 100644
--- a/include/asm-x86/topology.h
+++ b/include/asm-x86/topology.h
@@ -35,87 +35,67 @@
 # endif
 #endif
 
+/* Node not present */
+#define NUMA_NO_NODE	(-1)
+
 #ifdef CONFIG_NUMA
 #include <linux/cpumask.h>
 #include <asm/mpspec.h>
 
-/* Mappings between logical cpu number and node number */
 #ifdef CONFIG_X86_32
-extern int cpu_to_node_map[];
-#else
-/* Returns the number of the current Node. */
-#define numa_node_id()		(early_cpu_to_node(raw_smp_processor_id()))
-#endif
-
-DECLARE_PER_CPU(int, x86_cpu_to_node_map);
-
-#ifdef CONFIG_SMP
-extern int x86_cpu_to_node_map_init[];
-extern void *x86_cpu_to_node_map_early_ptr;
-#else
-#define x86_cpu_to_node_map_early_ptr NULL
-#endif
 
+/* Mappings between node number and cpus on that node. */
 extern cpumask_t node_to_cpumask_map[];
 
-#define NUMA_NO_NODE	(-1)
+/* Mappings between logical cpu number and node number */
+extern int cpu_to_node_map[];
 
 /* Returns the number of the node containing CPU 'cpu' */
-#ifdef CONFIG_X86_32
-#define early_cpu_to_node(cpu)	cpu_to_node(cpu)
 static inline int cpu_to_node(int cpu)
 {
 	return cpu_to_node_map[cpu];
 }
+#define early_cpu_to_node(cpu)	cpu_to_node(cpu)
 
 #else /* CONFIG_X86_64 */
 
-#ifdef CONFIG_SMP
-static inline int early_cpu_to_node(int cpu)
-{
-	int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
-
-	if (cpu_to_node_map)
-		return cpu_to_node_map[cpu];
-	else if (per_cpu_offset(cpu))
-		return per_cpu(x86_cpu_to_node_map, cpu);
-	else
-		return NUMA_NO_NODE;
-}
-#else
-#define	early_cpu_to_node(cpu)	cpu_to_node(cpu)
-#endif
+/* Mappings between node number and cpus on that node. */
+extern cpumask_t node_to_cpumask_map[];
+
+/* Mappings between logical cpu number and node number */
+DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
+
+/* Returns the number of the current Node. */
+#define numa_node_id()	(per_cpu(x86_cpu_to_node_map, raw_smp_processor_id()))
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+extern int cpu_to_node(int cpu);
+extern int early_cpu_to_node(int cpu);
+extern cpumask_t *_node_to_cpumask_ptr(int node);
+extern cpumask_t node_to_cpumask(int node);
 
+#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
+
+/* Returns the number of the node containing CPU 'cpu' */
 static inline int cpu_to_node(int cpu)
 {
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
-	if (x86_cpu_to_node_map_early_ptr) {
-		printk("KERN_NOTICE cpu_to_node(%d): usage too early!\n",
-		       (int)cpu);
-		dump_stack();
-		return ((int *)x86_cpu_to_node_map_early_ptr)[cpu];
-	}
-#endif
 	return per_cpu(x86_cpu_to_node_map, cpu);
 }
 
-#ifdef	CONFIG_NUMA
-
-/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
-#define node_to_cpumask_ptr(v, node)		\
-		cpumask_t *v = &(node_to_cpumask_map[node])
-
-#define node_to_cpumask_ptr_next(v, node)	\
-			   v = &(node_to_cpumask_map[node])
-#endif
+/* Same function but used if called before per_cpu areas are setup */
+static inline int early_cpu_to_node(int cpu)
+{
+	if (early_per_cpu_ptr(x86_cpu_to_node_map))
+		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
 
-#endif /* CONFIG_X86_64 */
+	return per_cpu(x86_cpu_to_node_map, cpu);
+}
 
-/*
- * Returns the number of the node containing Node 'node'. This
- * architecture is flat, so it is a pretty simple function!
- */
-#define parent_node(node) (node)
+/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
+static inline cpumask_t *_node_to_cpumask_ptr(int node)
+{
+	return &node_to_cpumask_map[node];
+}
 
 /* Returns a bitmask of CPUs on Node 'node'. */
 static inline cpumask_t node_to_cpumask(int node)
@@ -123,14 +103,29 @@ static inline cpumask_t node_to_cpumask(int node)
 	return node_to_cpumask_map[node];
 }
 
+#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
+#endif /* CONFIG_X86_64 */
+
+/* Replace default node_to_cpumask_ptr with optimized version */
+#define node_to_cpumask_ptr(v, node)		\
+		cpumask_t *v = _node_to_cpumask_ptr(node)
+
+#define node_to_cpumask_ptr_next(v, node)	\
+			   v = _node_to_cpumask_ptr(node)
+
 /* Returns the number of the first CPU on Node 'node'. */
 static inline int node_to_first_cpu(int node)
 {
-	cpumask_t mask = node_to_cpumask(node);
-
-	return first_cpu(mask);
+	node_to_cpumask_ptr(mask, node);
+	return first_cpu(*mask);
 }
 
+/*
+ * Returns the number of the node containing Node 'node'. This
+ * architecture is flat, so it is a pretty simple function!
+ */
+#define parent_node(node) (node)
+
 #define pcibus_to_node(bus) __pcibus_to_node(bus)
 #define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus)
 
@@ -180,8 +175,31 @@ extern int __node_distance(int, int);
 #define node_distance(a, b) __node_distance(a, b)
 #endif
 
-#else /* CONFIG_NUMA */
+#else /* !CONFIG_NUMA */
+
+#define numa_node_id()		0
+#define	cpu_to_node(cpu)	0
+#define	early_cpu_to_node(cpu)	0
+
+static inline cpumask_t *_node_to_cpumask_ptr(int node)
+{
+	return &cpu_online_map;
+}
+static inline cpumask_t node_to_cpumask(int node)
+{
+	return cpu_online_map;
+}
+static inline int node_to_first_cpu(int node)
+{
+	return first_cpu(cpu_online_map);
+}
+
+/* Replace default node_to_cpumask_ptr with optimized version */
+#define node_to_cpumask_ptr(v, node)		\
+		cpumask_t *v = _node_to_cpumask_ptr(node)
 
+#define node_to_cpumask_ptr_next(v, node)	\
+			   v = _node_to_cpumask_ptr(node)
 #endif
 
 #include <asm-generic/topology.h>
@@ -193,6 +211,9 @@ extern cpumask_t cpu_coregroup_map(int cpu);
 #define topology_core_id(cpu)			(cpu_data(cpu).cpu_core_id)
 #define topology_core_siblings(cpu)		(per_cpu(cpu_core_map, cpu))
 #define topology_thread_siblings(cpu)		(per_cpu(cpu_sibling_map, cpu))
+
+/* indicates that pointers to the topology cpumask_t maps are valid */
+#define arch_provides_topology_pointers		yes
 #endif
 
 static inline void arch_fix_phys_package_id(int num, u32 slot)
@@ -220,4 +241,4 @@ static inline void set_mp_bus_to_node(int busnum, int node)
 }
 #endif
 
-#endif
+#endif /* _ASM_X86_TOPOLOGY_H */
-- 
cgit v1.2.3


From 7891a24e1ee50c96896c0cf7da216a8e7b573ca5 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Mon, 12 May 2008 21:21:12 +0200
Subject: x86: restore pda nodenumber field

  * Restore the nodenumber field in the x86_64 pda.  This field is slightly
    different than the x86_cpu_to_node_map mainly because it's a static
    indication of which node the cpu is on while the cpu to node map is a
    dyanamic mapping that may get reset if the cpu goes offline.  This also
    simplifies the numa_node_id() macro.

For inclusion into sched-devel/latest tree.

Based on:
	git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
    +   sched-devel/latest  .../mingo/linux-2.6-sched-devel.git

Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/setup.c    | 4 ++++
 include/asm-x86/pda.h      | 1 +
 include/asm-x86/topology.h | 2 +-
 3 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 03caa8e4351f..0dff17ee3d73 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -32,6 +32,7 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
 #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
 #define	X86_64_NUMA	1
 
+/* map cpu index to node index */
 DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
 EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
 #endif
@@ -155,6 +156,9 @@ void __cpuinit numa_set_node(int cpu, int node)
 {
 	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
 
+	if (node != NUMA_NO_NODE)
+		cpu_pda(cpu)->nodenumber = node;
+
 	if (cpu_to_node_map)
 		cpu_to_node_map[cpu] = node;
 
diff --git a/include/asm-x86/pda.h b/include/asm-x86/pda.h
index 101fb9e11954..de2ad9ac35a9 100644
--- a/include/asm-x86/pda.h
+++ b/include/asm-x86/pda.h
@@ -22,6 +22,7 @@ struct x8664_pda {
 					   offset 40!!! */
 #endif
 	char *irqstackptr;
+	int nodenumber;			/* number of current node */
 	unsigned int __softirq_pending;
 	unsigned int __nmi_count;	/* number of NMI on this CPUs */
 	short mmu_state;
diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h
index dac09cb66dca..c0e6ff7671ea 100644
--- a/include/asm-x86/topology.h
+++ b/include/asm-x86/topology.h
@@ -66,7 +66,7 @@ extern cpumask_t node_to_cpumask_map[];
 DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
 
 /* Returns the number of the current Node. */
-#define numa_node_id()	(per_cpu(x86_cpu_to_node_map, raw_smp_processor_id()))
+#define numa_node_id()		read_pda(nodenumber)
 
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
 extern int cpu_to_node(int cpu);
-- 
cgit v1.2.3


From 9f248bde9d47cc177011198c9a15fb339b9f3215 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Mon, 12 May 2008 21:21:12 +0200
Subject: x86: remove the static 256k node_to_cpumask_map

  * Consolidate node_to_cpumask operations and remove the 256k
    byte node_to_cpumask_map.  This is done by allocating the
    node_to_cpumask_map array after the number of possible nodes
    (nr_node_ids) is known.

  * Debug printouts when CONFIG_DEBUG_PER_CPU_MAPS is active have
    been increased.  It now shows faults when calling node_to_cpumask()
    and node_to_cpumask_ptr().

For inclusion into sched-devel/latest tree.

Based on:
	git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
    +   sched-devel/latest  .../mingo/linux-2.6-sched-devel.git

Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/setup.c    | 132 +++++++++++++++++++++++++++++++++++++++++++--
 arch/x86/mm/numa_64.c      |   6 ---
 include/asm-x86/topology.h |  25 ++++++---
 3 files changed, 144 insertions(+), 19 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0dff17ee3d73..913af838c3c5 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -35,6 +35,16 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
 /* map cpu index to node index */
 DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
 EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+
+/* which logical CPUs are on which nodes */
+cpumask_t *node_to_cpumask_map;
+EXPORT_SYMBOL(node_to_cpumask_map);
+
+/* setup node_to_cpumask_map */
+static void __init setup_node_to_cpumask_map(void);
+
+#else
+static inline void setup_node_to_cpumask_map(void) { }
 #endif
 
 #if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
@@ -140,11 +150,15 @@ void __init setup_per_cpu_areas(void)
 	}
 
 	nr_cpu_ids = highest_cpu + 1;
-	printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids);
+	printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
+		NR_CPUS, nr_cpu_ids, nr_node_ids);
 
 	/* Setup percpu data maps */
 	setup_per_cpu_maps();
 
+	/* Setup node to cpumask map */
+	setup_node_to_cpumask_map();
+
 	/* Setup cpumask_of_cpu map */
 	setup_cpumask_of_cpu();
 }
@@ -152,6 +166,35 @@ void __init setup_per_cpu_areas(void)
 #endif
 
 #ifdef X86_64_NUMA
+
+/*
+ * Allocate node_to_cpumask_map based on number of available nodes
+ * Requires node_possible_map to be valid.
+ *
+ * Note: node_to_cpumask() is not valid until after this is done.
+ */
+static void __init setup_node_to_cpumask_map(void)
+{
+	unsigned int node, num = 0;
+	cpumask_t *map;
+
+	/* setup nr_node_ids if not done yet */
+	if (nr_node_ids == MAX_NUMNODES) {
+		for_each_node_mask(node, node_possible_map)
+			num = node;
+		nr_node_ids = num + 1;
+	}
+
+	/* allocate the map */
+	map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
+
+	Dprintk(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n",
+		map, nr_node_ids);
+
+	/* node_to_cpumask() will now work */
+	node_to_cpumask_map = map;
+}
+
 void __cpuinit numa_set_node(int cpu, int node)
 {
 	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
@@ -174,6 +217,8 @@ void __cpuinit numa_clear_node(int cpu)
 	numa_set_node(cpu, NUMA_NO_NODE);
 }
 
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+
 void __cpuinit numa_add_cpu(int cpu)
 {
 	cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
@@ -183,9 +228,44 @@ void __cpuinit numa_remove_cpu(int cpu)
 {
 	cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
 }
-#endif /* CONFIG_NUMA */
 
-#if defined(CONFIG_DEBUG_PER_CPU_MAPS) && defined(CONFIG_X86_64)
+#else /* CONFIG_DEBUG_PER_CPU_MAPS */
+
+/*
+ * --------- debug versions of the numa functions ---------
+ */
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+	int node = cpu_to_node(cpu);
+	cpumask_t *mask;
+	char buf[64];
+
+	if (node_to_cpumask_map == NULL) {
+		printk(KERN_ERR "node_to_cpumask_map NULL\n");
+		dump_stack();
+		return;
+	}
+
+	mask = &node_to_cpumask_map[node];
+	if (enable)
+		cpu_set(cpu, *mask);
+	else
+		cpu_clear(cpu, *mask);
+
+	cpulist_scnprintf(buf, sizeof(buf), *mask);
+	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+		enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
+ }
+
+void __cpuinit numa_add_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, 1);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, 0);
+}
 
 int cpu_to_node(int cpu)
 {
@@ -199,6 +279,10 @@ int cpu_to_node(int cpu)
 }
 EXPORT_SYMBOL(cpu_to_node);
 
+/*
+ * Same function as cpu_to_node() but used if called before the
+ * per_cpu areas are setup.
+ */
 int early_cpu_to_node(int cpu)
 {
 	if (early_per_cpu_ptr(x86_cpu_to_node_map))
@@ -207,9 +291,47 @@ int early_cpu_to_node(int cpu)
 	if (!per_cpu_offset(cpu)) {
 		printk(KERN_WARNING
 			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
-			dump_stack();
+		dump_stack();
 		return NUMA_NO_NODE;
 	}
 	return per_cpu(x86_cpu_to_node_map, cpu);
 }
-#endif
+
+/*
+ * Returns a pointer to the bitmask of CPUs on Node 'node'.
+ */
+cpumask_t *_node_to_cpumask_ptr(int node)
+{
+	if (node_to_cpumask_map == NULL) {
+		printk(KERN_WARNING
+			"_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n",
+			node);
+		dump_stack();
+		return &cpu_online_map;
+	}
+	return &node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(_node_to_cpumask_ptr);
+
+/*
+ * Returns a bitmask of CPUs on Node 'node'.
+ */
+cpumask_t node_to_cpumask(int node)
+{
+	if (node_to_cpumask_map == NULL) {
+		printk(KERN_WARNING
+			"node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
+		dump_stack();
+		return cpu_online_map;
+	}
+	return node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(node_to_cpumask);
+
+/*
+ * --------- end of debug versions of the numa functions ---------
+ */
+
+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+
+#endif /* X86_64_NUMA */
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 970f86775c41..14c7ab417ec7 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -35,9 +35,6 @@ s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
 	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
 };
 
-cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly;
-EXPORT_SYMBOL(node_to_cpumask_map);
-
 int numa_off __initdata;
 unsigned long __initdata nodemap_addr;
 unsigned long __initdata nodemap_size;
@@ -560,9 +557,6 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 	node_set(0, node_possible_map);
 	for (i = 0; i < NR_CPUS; i++)
 		numa_set_node(i, 0);
-	/* cpumask_of_cpu() may not be available during early startup */
-	memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0]));
-	cpu_set(0, node_to_cpumask_map[0]);
 	e820_register_active_regions(0, start_pfn, end_pfn);
 	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
 }
diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h
index c0e6ff7671ea..1f97758de4ab 100644
--- a/include/asm-x86/topology.h
+++ b/include/asm-x86/topology.h
@@ -57,10 +57,16 @@ static inline int cpu_to_node(int cpu)
 }
 #define early_cpu_to_node(cpu)	cpu_to_node(cpu)
 
+/* Returns a bitmask of CPUs on Node 'node'. */
+static inline cpumask_t node_to_cpumask(int node)
+{
+	return node_to_cpumask_map[node];
+}
+
 #else /* CONFIG_X86_64 */
 
 /* Mappings between node number and cpus on that node. */
-extern cpumask_t node_to_cpumask_map[];
+extern cpumask_t *node_to_cpumask_map;
 
 /* Mappings between logical cpu number and node number */
 DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
@@ -104,7 +110,6 @@ static inline cpumask_t node_to_cpumask(int node)
 }
 
 #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
-#endif /* CONFIG_X86_64 */
 
 /* Replace default node_to_cpumask_ptr with optimized version */
 #define node_to_cpumask_ptr(v, node)		\
@@ -113,12 +118,7 @@ static inline cpumask_t node_to_cpumask(int node)
 #define node_to_cpumask_ptr_next(v, node)	\
 			   v = _node_to_cpumask_ptr(node)
 
-/* Returns the number of the first CPU on Node 'node'. */
-static inline int node_to_first_cpu(int node)
-{
-	node_to_cpumask_ptr(mask, node);
-	return first_cpu(*mask);
-}
+#endif /* CONFIG_X86_64 */
 
 /*
  * Returns the number of the node containing Node 'node'. This
@@ -204,6 +204,15 @@ static inline int node_to_first_cpu(int node)
 
 #include <asm-generic/topology.h>
 
+#ifdef CONFIG_NUMA
+/* Returns the number of the first CPU on Node 'node'. */
+static inline int node_to_first_cpu(int node)
+{
+	node_to_cpumask_ptr(mask, node);
+	return first_cpu(*mask);
+}
+#endif
+
 extern cpumask_t cpu_coregroup_map(int cpu);
 
 #ifdef ENABLE_TOPO_DEFINES
-- 
cgit v1.2.3


From 3461b0af025251bbc6b3d56c821c6ac2de6f7209 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Mon, 12 May 2008 21:21:13 +0200
Subject: x86: remove static boot_cpu_pda array v2

  * Remove the boot_cpu_pda array and pointer table from the data section.
    Allocate the pointer table and array during init.  do_boot_cpu()
    will reallocate the pda in node local memory and if the cpu is being
    brought up before the bootmem array is released (after_bootmem = 0),
    then it will free the initial pda.  This will happen for all cpus
    present at system startup.

    This removes 512k + 32k bytes from the data section.

For inclusion into sched-devel/latest tree.

Based on:
	git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
    +   sched-devel/latest  .../mingo/linux-2.6-sched-devel.git

Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/head64.c  | 26 +++++++++++++++--
 arch/x86/kernel/setup.c   | 73 ++++++++++++++++++++++++++++++++++++-----------
 arch/x86/kernel/setup64.c |  8 ++++--
 arch/x86/kernel/smpboot.c | 59 +++++++++++++++++++++++++++++---------
 include/asm-x86/pda.h     |  6 ++--
 include/linux/mm.h        |  1 +
 6 files changed, 135 insertions(+), 38 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index e25c57b8aa84..0ab59edd7067 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -25,6 +25,24 @@
 #include <asm/e820.h>
 #include <asm/bios_ebda.h>
 
+/* boot cpu pda */
+static struct x8664_pda _boot_cpu_pda __read_mostly;
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+/*
+ * We install an empty cpu_pda pointer table to trap references before
+ * the actual cpu_pda pointer table is created in setup_cpu_pda_map().
+ */
+static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
+#else
+static struct x8664_pda *__cpu_pda[1] __read_mostly;
+#endif
+
+#else /* !CONFIG_SMP (NR_CPUS will be 1) */
+static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
+#endif
+
 static void __init zap_identity_mappings(void)
 {
 	pgd_t *pgd = pgd_offset_k(0UL);
@@ -156,10 +174,12 @@ void __init x86_64_start_kernel(char * real_mode_data)
 
 	early_printk("Kernel alive\n");
 
- 	for (i = 0; i < NR_CPUS; i++)
- 		cpu_pda(i) = &boot_cpu_pda[i];
-
+	_cpu_pda = __cpu_pda;
+	cpu_pda(0) = &_boot_cpu_pda;
 	pda_init(0);
+
+	early_printk("Kernel really alive\n");
+
 	copy_bootdata(__va(real_mode_data));
 
 	reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 913af838c3c5..dd12c1c84a8f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -101,6 +101,50 @@ static inline void setup_cpumask_of_cpu(void) { }
  */
 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(__per_cpu_offset);
+static inline void setup_cpu_pda_map(void) { }
+
+#elif !defined(CONFIG_SMP)
+static inline void setup_cpu_pda_map(void) { }
+
+#else /* CONFIG_SMP && CONFIG_X86_64 */
+
+/*
+ * Allocate cpu_pda pointer table and array via alloc_bootmem.
+ */
+static void __init setup_cpu_pda_map(void)
+{
+	char *pda;
+	struct x8664_pda **new_cpu_pda;
+	unsigned long size;
+	int cpu;
+
+	size = roundup(sizeof(struct x8664_pda), cache_line_size());
+
+	/* allocate cpu_pda array and pointer table */
+	{
+		unsigned long tsize = nr_cpu_ids * sizeof(void *);
+		unsigned long asize = size * (nr_cpu_ids - 1);
+
+		tsize = roundup(tsize, cache_line_size());
+		new_cpu_pda = alloc_bootmem(tsize + asize);
+		pda = (char *)new_cpu_pda + tsize;
+	}
+
+	/* initialize pointer table to static pda's */
+	for_each_possible_cpu(cpu) {
+		if (cpu == 0) {
+			/* leave boot cpu pda in place */
+			new_cpu_pda[0] = cpu_pda(0);
+			continue;
+		}
+		new_cpu_pda[cpu] = (struct x8664_pda *)pda;
+		new_cpu_pda[cpu]->in_bootmem = 1;
+		pda += size;
+	}
+
+	/* point to new pointer table */
+	_cpu_pda = new_cpu_pda;
+}
 #endif
 
 /*
@@ -110,46 +154,43 @@ EXPORT_SYMBOL(__per_cpu_offset);
  */
 void __init setup_per_cpu_areas(void)
 {
-	int i, highest_cpu = 0;
-	unsigned long size;
+	ssize_t size = PERCPU_ENOUGH_ROOM;
+	char *ptr;
+	int cpu;
 
 #ifdef CONFIG_HOTPLUG_CPU
 	prefill_possible_map();
+#else
+	nr_cpu_ids = num_processors;
 #endif
 
+	/* Setup cpu_pda map */
+	setup_cpu_pda_map();
+
 	/* Copy section for each CPU (we discard the original) */
 	size = PERCPU_ENOUGH_ROOM;
 	printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
 			  size);
 
-	for_each_possible_cpu(i) {
-		char *ptr;
+	for_each_possible_cpu(cpu) {
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 		ptr = alloc_bootmem_pages(size);
 #else
-		int node = early_cpu_to_node(i);
+		int node = early_cpu_to_node(cpu);
 		if (!node_online(node) || !NODE_DATA(node)) {
 			ptr = alloc_bootmem_pages(size);
 			printk(KERN_INFO
 			       "cpu %d has no node %d or node-local memory\n",
-				i, node);
+				cpu, node);
 		}
 		else
 			ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
 #endif
-		if (!ptr)
-			panic("Cannot allocate cpu data for CPU %d\n", i);
-#ifdef CONFIG_X86_64
-		cpu_pda(i)->data_offset = ptr - __per_cpu_start;
-#else
-		__per_cpu_offset[i] = ptr - __per_cpu_start;
-#endif
+		per_cpu_offset(cpu) = ptr - __per_cpu_start;
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
 
-		highest_cpu = i;
 	}
 
-	nr_cpu_ids = highest_cpu + 1;
 	printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
 		NR_CPUS, nr_cpu_ids, nr_node_ids);
 
@@ -199,7 +240,7 @@ void __cpuinit numa_set_node(int cpu, int node)
 {
 	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
 
-	if (node != NUMA_NO_NODE)
+	if (cpu_pda(cpu) && node != NUMA_NO_NODE)
 		cpu_pda(cpu)->nodenumber = node;
 
 	if (cpu_to_node_map)
diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c
index aee0e8200777..631ea6cc01d8 100644
--- a/arch/x86/kernel/setup64.c
+++ b/arch/x86/kernel/setup64.c
@@ -12,6 +12,7 @@
 #include <linux/bitops.h>
 #include <linux/module.h>
 #include <linux/kgdb.h>
+#include <linux/topology.h>
 #include <asm/pda.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
@@ -34,9 +35,8 @@ struct boot_params boot_params;
 
 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
 
-struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
+struct x8664_pda **_cpu_pda __read_mostly;
 EXPORT_SYMBOL(_cpu_pda);
-struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
 
 struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
 
@@ -114,8 +114,10 @@ void pda_init(int cpu)
 			__get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
 		if (!pda->irqstackptr)
 			panic("cannot allocate irqstack for cpu %d", cpu); 
-	}
 
+		if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
+			pda->nodenumber = cpu_to_node(cpu);
+	}
 
 	pda->irqstackptr += IRQSTACKSIZE-64;
 } 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 036604d3daed..bf0833487455 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -816,6 +816,43 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
 	complete(&c_idle->done);
 }
 
+/*
+ * Allocate node local memory for the AP pda.
+ *
+ * Must be called after the _cpu_pda pointer table is initialized.
+ */
+static int __cpuinit get_local_pda(int cpu)
+{
+	struct x8664_pda *oldpda, *newpda;
+	unsigned long size = sizeof(struct x8664_pda);
+	int node = cpu_to_node(cpu);
+
+	if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
+		return 0;
+
+	oldpda = cpu_pda(cpu);
+	newpda = kmalloc_node(size, GFP_ATOMIC, node);
+	if (!newpda) {
+		printk(KERN_ERR "Could not allocate node local PDA "
+			"for CPU %d on node %d\n", cpu, node);
+
+		if (oldpda)
+			return 0;	/* have a usable pda */
+		else
+			return -1;
+	}
+
+	if (oldpda) {
+		memcpy(newpda, oldpda, size);
+		if (!after_bootmem)
+			free_bootmem((unsigned long)oldpda, size);
+	}
+
+	newpda->in_bootmem = 0;
+	cpu_pda(cpu) = newpda;
+	return 0;
+}
+
 static int __cpuinit do_boot_cpu(int apicid, int cpu)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -841,19 +878,11 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
 	}
 
 	/* Allocate node local memory for AP pdas */
-	if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) {
-		struct x8664_pda *newpda, *pda;
-		int node = cpu_to_node(cpu);
-		pda = cpu_pda(cpu);
-		newpda = kmalloc_node(sizeof(struct x8664_pda), GFP_ATOMIC,
-				      node);
-		if (newpda) {
-			memcpy(newpda, pda, sizeof(struct x8664_pda));
-			cpu_pda(cpu) = newpda;
-		} else
-			printk(KERN_ERR
-		"Could not allocate node local PDA for CPU %d on node %d\n",
-				cpu, node);
+	if (cpu > 0) {
+		boot_error = get_local_pda(cpu);
+		if (boot_error)
+			goto restore_state;
+			/* if can't get pda memory, can't start cpu */
 	}
 #endif
 
@@ -972,6 +1001,8 @@ do_rest:
 		}
 	}
 
+restore_state:
+
 	if (boot_error) {
 		/* Try to put things back the way they were before ... */
 		unmap_cpu_to_logical_apicid(cpu);
@@ -1347,6 +1378,8 @@ __init void prefill_possible_map(void)
 
 	for (i = 0; i < possible; i++)
 		cpu_set(i, cpu_possible_map);
+
+	nr_cpu_ids = possible;
 }
 
 static void __ref remove_cpu_from_maps(int cpu)
diff --git a/include/asm-x86/pda.h b/include/asm-x86/pda.h
index de2ad9ac35a9..b34e9a7cc80b 100644
--- a/include/asm-x86/pda.h
+++ b/include/asm-x86/pda.h
@@ -22,7 +22,8 @@ struct x8664_pda {
 					   offset 40!!! */
 #endif
 	char *irqstackptr;
-	int nodenumber;			/* number of current node */
+	short nodenumber;		/* number of current node (32k max) */
+	short in_bootmem;		/* pda lives in bootmem */
 	unsigned int __softirq_pending;
 	unsigned int __nmi_count;	/* number of NMI on this CPUs */
 	short mmu_state;
@@ -38,8 +39,7 @@ struct x8664_pda {
 	unsigned irq_spurious_count;
 } ____cacheline_aligned_in_smp;
 
-extern struct x8664_pda *_cpu_pda[];
-extern struct x8664_pda boot_cpu_pda[];
+extern struct x8664_pda **_cpu_pda;
 extern void pda_init(int);
 
 #define cpu_pda(i) (_cpu_pda[i])
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 586a943cab01..0ea48a5af823 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1024,6 +1024,7 @@ extern void mem_init(void);
 extern void show_mem(void);
 extern void si_meminfo(struct sysinfo * val);
 extern void si_meminfo_node(struct sysinfo *val, int nid);
+extern int after_bootmem;
 
 #ifdef CONFIG_NUMA
 extern void setup_per_cpu_pageset(void);
-- 
cgit v1.2.3


From 5deb0b2a25b7b568ab81f7c38052572ecf4ccc96 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Mon, 12 May 2008 21:21:13 +0200
Subject: x86: leave initial __cpu_pda array in place until cpus are booted

Ingo Molnar wrote:
...
> they crashed after about 3 randconfig iterations with:
>
>   early res: 4 [8000-afff] PGTABLE
>   early res: 5 [b000-b87f] MEMNODEMAP
> PANIC: early exception 0e rip 10:ffffffff8077a150 error 2 cr2 37
> Pid: 0, comm: swapper Not tainted 2.6.25-sched-devel.git-x86-latest.git #14
>
> Call Trace:
>  [<ffffffff81466196>] early_idt_handler+0x56/0x6a
>  [<ffffffff8077a150>] ? numa_set_node+0x30/0x60
>  [<ffffffff8077a129>] ? numa_set_node+0x9/0x60
>  [<ffffffff8147a543>] numa_init_array+0x93/0xf0
>  [<ffffffff8147b039>] acpi_scan_nodes+0x3b9/0x3f0
>  [<ffffffff8147a496>] numa_initmem_init+0x136/0x150
>  [<ffffffff8146da5f>] setup_arch+0x48f/0x700
>  [<ffffffff802566ea>] ? clockevents_register_notifier+0x3a/0x50
>  [<ffffffff81466a87>] start_kernel+0xd7/0x440
>  [<ffffffff81466422>] x86_64_start_kernel+0x222/0x280
...
Here's the fixup...  This one should follow the previous patches.

Thanks,
Mike
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/head64.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 0ab59edd7067..4bcb61cd9fcd 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -29,17 +29,13 @@
 static struct x8664_pda _boot_cpu_pda __read_mostly;
 
 #ifdef CONFIG_SMP
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
 /*
- * We install an empty cpu_pda pointer table to trap references before
- * the actual cpu_pda pointer table is created in setup_cpu_pda_map().
+ * We install an empty cpu_pda pointer table to indicate to early users
+ * (numa_set_node) that the cpu_pda pointer table for cpus other than
+ * the boot cpu is not yet setup.
  */
 static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
 #else
-static struct x8664_pda *__cpu_pda[1] __read_mostly;
-#endif
-
-#else /* !CONFIG_SMP (NR_CPUS will be 1) */
 static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
 #endif
 
-- 
cgit v1.2.3


From f307d25e638d3408659a2ec98fb3fd1737f7cb31 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 21 May 2008 11:21:13 +0100
Subject: x86: compile error fix for smpboot.c

Without this patch, my link fails with:

arch/x86/kernel/built-in.o(.cpuinit.text+0x3c6e): In function `get_local_pda':
: undefined reference to `_cpu_pda'
arch/x86/kernel/built-in.o(.cpuinit.text+0x3cd1): In function `get_local_pda':
: undefined reference to `after_bootmem'
arch/x86/kernel/built-in.o(.cpuinit.text+0x3cec): In function `get_local_pda':
: undefined reference to `_cpu_pda'
make[2]: *** [.tmp_vmlinux1] Error 1

Caused by commit 766da892634694f795b18b9538407816896fc470
    x86: remove static boot_cpu_pda array v2

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/smpboot.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index bf0833487455..bc1e1257e515 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -816,6 +816,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
 	complete(&c_idle->done);
 }
 
+#ifdef CONFIG_X86_64
 /*
  * Allocate node local memory for the AP pda.
  *
@@ -852,6 +853,7 @@ static int __cpuinit get_local_pda(int cpu)
 	cpu_pda(cpu) = newpda;
 	return 0;
 }
+#endif /* CONFIG_X86_64 */
 
 static int __cpuinit do_boot_cpu(int apicid, int cpu)
 /*
-- 
cgit v1.2.3


From 03db1f74a7d823e3de3767f36b1e08829f6fb3a1 Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@gmail.com>
Date: Fri, 6 Jun 2008 16:33:25 +0200
Subject: x86: don't return invalid pointers from node_to_cpumask()

Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index dd12c1c84a8f..df49ce87a300 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -350,6 +350,7 @@ cpumask_t *_node_to_cpumask_ptr(int node)
 		dump_stack();
 		return &cpu_online_map;
 	}
+	BUG_ON(node >= nr_node_ids);
 	return &node_to_cpumask_map[node];
 }
 EXPORT_SYMBOL(_node_to_cpumask_ptr);
@@ -365,6 +366,7 @@ cpumask_t node_to_cpumask(int node)
 		dump_stack();
 		return cpu_online_map;
 	}
+	BUG_ON(node >= nr_node_ids);
 	return node_to_cpumask_map[node];
 }
 EXPORT_SYMBOL(node_to_cpumask);
-- 
cgit v1.2.3


From 053713f5745b8b08fb598adb65230bc168cb9d8d Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Thu, 5 Jun 2008 11:10:59 -0700
Subject: x86: fix setup.c printk format warning

Fix setup.c printk format warning:

linux-next-20080605/arch/x86/kernel/setup.c: In function 'setup_per_cpu_areas':
linux-next-20080605/arch/x86/kernel/setup.c:173: warning: format '%lu' expects type 'long unsigned int', but argument 2 has type 'ssize_t'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index df49ce87a300..d4eaa4eb481d 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -169,7 +169,7 @@ void __init setup_per_cpu_areas(void)
 
 	/* Copy section for each CPU (we discard the original) */
 	size = PERCPU_ENOUGH_ROOM;
-	printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
+	printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
 			  size);
 
 	for_each_possible_cpu(cpu) {
-- 
cgit v1.2.3


From 3fd052b1b46ac23a2316283a996fe6c32dbcf132 Mon Sep 17 00:00:00 2001
From: Bernhard Walle <bwalle@suse.de>
Date: Sun, 8 Jun 2008 15:46:30 +0200
Subject: x86: add flags parameter to reserve_bootmem_generic()

This patch adds a 'flags' parameter to reserve_bootmem_generic() like it
already has been added in reserve_bootmem() with commit
72a7fe3967dbf86cb34e24fbf1d957fe24d2f246.

It also changes all users to use BOOTMEM_DEFAULT, which doesn't effectively
change the behaviour. Since the change is x86-specific, I don't think it's
necessary to add a new API for migration. There are only 4 users of that
function.

The change is necessary for the next patch, using reserve_bootmem_generic()
for crashkernel reservation.

Signed-off-by: Bernhard Walle <bwalle@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820_64.c |  3 ++-
 arch/x86/kernel/efi_64.c  |  3 ++-
 arch/x86/kernel/mpparse.c |  5 +++--
 arch/x86/mm/init_64.c     | 17 ++++++++++++-----
 include/asm-x86/proto.h   |  2 +-
 5 files changed, 20 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index 124480c0008d..af1eb0789740 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -118,7 +118,8 @@ void __init early_res_to_bootmem(unsigned long start, unsigned long end)
 			continue;
 		printk(KERN_INFO "  early res: %d [%lx-%lx] %s\n", i,
 			final_start, final_end - 1, r->name);
-		reserve_bootmem_generic(final_start, final_end - final_start);
+		reserve_bootmem_generic(final_start, final_end - final_start,
+				BOOTMEM_DEFAULT);
 	}
 }
 
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index d0060fdcccac..d561dd5f1e62 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -100,7 +100,8 @@ void __init efi_call_phys_epilog(void)
 void __init efi_reserve_bootmem(void)
 {
 	reserve_bootmem_generic((unsigned long)memmap.phys_map,
-				memmap.nr_map * memmap.desc_size);
+				memmap.nr_map * memmap.desc_size,
+				BOOTMEM_DEFAULT);
 }
 
 void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size)
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 404683b94e79..4901ae3f742c 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -729,10 +729,11 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 			if (!reserve)
 				return 1;
 
-			reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
+			reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
+				BOOTMEM_DEFAULT);
 			if (mpf->mpf_physptr)
 				reserve_bootmem_generic(mpf->mpf_physptr,
-							PAGE_SIZE);
+					PAGE_SIZE, BOOTMEM_DEFAULT);
 #endif
 		return 1;
 		}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 819dad973b13..bf7bf1de6c25 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -799,12 +799,13 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 }
 #endif
 
-void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
+int __init reserve_bootmem_generic(unsigned long phys, unsigned len, int flags)
 {
 #ifdef CONFIG_NUMA
 	int nid, next_nid;
 #endif
 	unsigned long pfn = phys >> PAGE_SHIFT;
+	int ret;
 
 	if (pfn >= end_pfn) {
 		/*
@@ -812,11 +813,11 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
 		 * firmware tables:
 		 */
 		if (pfn < max_pfn_mapped)
-			return;
+			return -EFAULT;
 
 		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
 				phys, len);
-		return;
+		return -EFAULT;
 	}
 
 	/* Should check here against the e820 map to avoid double free */
@@ -824,9 +825,13 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
 	nid = phys_to_nid(phys);
 	next_nid = phys_to_nid(phys + len - 1);
 	if (nid == next_nid)
-		reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
+		ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
 	else
-		reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
+		ret = reserve_bootmem(phys, len, flags);
+
+	if (ret != 0)
+		return ret;
+
 #else
 	reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
 #endif
@@ -835,6 +840,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
 		dma_reserve += len / PAGE_SIZE;
 		set_dma_reserve(dma_reserve);
 	}
+
+	return 0;
 }
 
 int kern_addr_valid(unsigned long addr)
diff --git a/include/asm-x86/proto.h b/include/asm-x86/proto.h
index 6c8b41b03f6d..a9f51472521e 100644
--- a/include/asm-x86/proto.h
+++ b/include/asm-x86/proto.h
@@ -14,7 +14,7 @@ extern void ia32_syscall(void);
 extern void ia32_cstar_target(void);
 extern void ia32_sysenter_target(void);
 
-extern void reserve_bootmem_generic(unsigned long phys, unsigned len);
+extern int reserve_bootmem_generic(unsigned long phys, unsigned len, int flags);
 
 extern void syscall32_cpu_init(void);
 
-- 
cgit v1.2.3


From 46f68e1c6b04a04772e828ff3bcd07ed708805c2 Mon Sep 17 00:00:00 2001
From: Bernhard Walle <bwalle@suse.de>
Date: Sun, 8 Jun 2008 15:46:31 +0200
Subject: x86: use reserve_bootmem_generic() to reserve crashkernel memory on
 x86_64

This patch uses reserve_bootmem_generic() instead of reserve_bootmem()
to reserve the crashkernel memory on x86_64. That's necessary for NUMA
machines, see 00212fef814612245ed0261cbac8426d0c9a31a5:

  [PATCH] Fix kdump Crash Kernel boot memory reservation for NUMA machines

  This patch will fix a boot memory reservation bug that trashes memory on
  the ES7000 when loading the kdump crash kernel.

  The code in arch/x86_64/kernel/setup.c to reserve boot memory for the crash
  kernel uses the non-numa aware "reserve_bootmem" function instead of the
  NUMA aware "reserve_bootmem_generic".  I checked to make sure that no other
  function was using "reserve_bootmem" and found none, except the ones that
  had NUMA ifdef'ed out.

  I have tested this patch only on an ES7000 with NUMA on and off (numa=off)
  in a single (non-NUMA) and multi-cell (NUMA) configurations.

  Signed-off-by: Amul Shah <amul.shah@unisys.com>
  Looks-good-to: Vivek Goyal <vgoyal@in.ibm.com>
  Cc: Andi Kleen <ak@muc.de>
  Signed-off-by: Andrew Morton <akpm@osdl.org>
  Signed-off-by: Linus Torvalds <torvalds@osdl.org>

The switch-back to reserve_bootmem() was accidentally introduced in
5c3391f9f749023a49c64d607da4fb49263690eb when adding the BOOTMEM_EXCLUSIVE
parameter.

Signed-off-by: Bernhard Walle <bwalle@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index e8df64fad540..4a666cdccb68 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -243,7 +243,7 @@ static void __init reserve_crashkernel(void)
 			return;
 		}
 
-		if (reserve_bootmem(crash_base, crash_size,
+		if (reserve_bootmem_generic(crash_base, crash_size,
 					BOOTMEM_EXCLUSIVE) < 0) {
 			printk(KERN_INFO "crashkernel reservation failed - "
 					"memory is in use\n");
-- 
cgit v1.2.3


From 1812924bb1823950c1dc95c478b71b037057356e Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Mon, 2 Jun 2008 08:56:14 -0500
Subject: x86, SGI UV: TLB shootdown using broadcast assist unit

TLB shootdown for SGI UV.

Depends on patch (in tip/x86/irq):
   x86-update-macros-used-by-uv-platform.patch   Jack Steiner May 29

This patch provides the ability to flush TLB's in cpu's that are not on
the local node.  The hardware mechanism for distributing the flush
messages is the UV's "broadcast assist unit".

The hook to intercept TLB shootdown requests is a 2-line change to
native_flush_tlb_others() (arch/x86/kernel/tlb_64.c).

This code has been tested on a hardware simulator. The real hardware
is not yet available.

The shootdown statistics are provided through /proc/sgi_uv/ptc_statistics.
The use of /sys was considered, but would have required the use of
many /sys files.  The debugfs was also considered, but these statistics
should be available on an ongoing basis, not just for debugging.

Issues to be fixed later:
- The IRQ for the messaging interrupt is currently hardcoded as 200
  (see UV_BAU_MESSAGE).  It should be dynamically assigned in the future.
- The use of appropriate udelay()'s is untested, as they are a problem
  in the simulator.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/Makefile    |   2 +-
 arch/x86/kernel/entry_64.S  |   4 +
 arch/x86/kernel/tlb_64.c    |   5 +
 arch/x86/kernel/tlb_uv.c    | 736 ++++++++++++++++++++++++++++++++++++++++++++
 include/asm-x86/uv/uv_bau.h | 331 ++++++++++++++++++++
 5 files changed, 1077 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/kernel/tlb_uv.c
 create mode 100644 include/asm-x86/uv/uv_bau.h

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 605995e11ea7..4078e98f0125 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -94,7 +94,7 @@ obj-$(CONFIG_OLPC)		+= olpc.o
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
-        obj-y				+= genapic_64.o genapic_flat_64.o genx2apic_uv_x.o
+        obj-y				+= genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
         obj-$(CONFIG_X86_PM_TIMER)	+= pmtimer_64.o
         obj-$(CONFIG_AUDIT)		+= audit_64.o
 
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 556a8df522a7..6fd1987466eb 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -720,6 +720,10 @@ ENTRY(apic_timer_interrupt)
 	apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
 END(apic_timer_interrupt)
 
+ENTRY(uv_bau_message_intr1)
+	apicinterrupt 220,uv_bau_message_interrupt
+END(uv_bau_message_intr1)
+
 ENTRY(error_interrupt)
 	apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
 END(error_interrupt)
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
index a1f07d793202..fc132113bdab 100644
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/kernel/tlb_64.c
@@ -15,6 +15,8 @@
 #include <asm/proto.h>
 #include <asm/apicdef.h>
 #include <asm/idle.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/uv_bau.h>
 
 #include <mach_ipi.h>
 /*
@@ -162,6 +164,9 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
 	union smp_flush_state *f;
 	cpumask_t cpumask = *cpumaskp;
 
+	if (is_uv_system() && uv_flush_tlb_others(&cpumask, mm, va))
+			return;
+
 	/* Caller has disabled preemption */
 	sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
 	f = &per_cpu(flush_state, sender);
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
new file mode 100644
index 000000000000..28e7c68d9d78
--- /dev/null
+++ b/arch/x86/kernel/tlb_uv.c
@@ -0,0 +1,736 @@
+/*
+ *	SGI UltraViolet TLB flush routines.
+ *
+ *	(c) 2008 Cliff Wickman <cpw@sgi.com>, SGI.
+ *
+ *	This code is released under the GNU General Public License version 2 or
+ *	later.
+ */
+#include <linux/mc146818rtc.h>
+#include <linux/proc_fs.h>
+#include <linux/kernel.h>
+
+#include <asm/mach-bigsmp/mach_apic.h>
+#include <asm/mmu_context.h>
+#include <asm/idle.h>
+#include <asm/genapic.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/uv_mmrs.h>
+#include <asm/uv/uv_bau.h>
+
+struct bau_control **uv_bau_table_bases;
+static int uv_bau_retry_limit;
+static int uv_nshift;		/* position of pnode (which is nasid>>1) */
+static unsigned long uv_mmask;
+
+char *status_table[] = {
+	"IDLE",
+	"ACTIVE",
+	"DESTINATION TIMEOUT",
+	"SOURCE TIMEOUT"
+};
+
+DEFINE_PER_CPU(struct ptc_stats, ptcstats);
+DEFINE_PER_CPU(struct bau_control, bau_control);
+
+/*
+ * Free a software acknowledge hardware resource by clearing its Pending
+ * bit. This will return a reply to the sender.
+ * If the message has timed out, a reply has already been sent by the
+ * hardware but the resource has not been released. In that case our
+ * clear of the Timeout bit (as well) will free the resource. No reply will
+ * be sent (the hardware will only do one reply per message).
+ */
+static void
+uv_reply_to_message(int resource,
+		    struct bau_payload_queue_entry *msg,
+		    struct bau_msg_status *msp)
+{
+	int fw;
+
+	fw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource);
+	msg->replied_to = 1;
+	msg->sw_ack_vector = 0;
+	if (msp)
+		msp->seen_by.bits = 0;
+	uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, fw);
+	return;
+}
+
+/*
+ * Do all the things a cpu should do for a TLB shootdown message.
+ * Other cpu's may come here at the same time for this message.
+ */
+static void
+uv_bau_process_message(struct bau_payload_queue_entry *msg,
+		       int msg_slot, int sw_ack_slot)
+{
+	int cpu;
+	unsigned long this_cpu_mask;
+	struct bau_msg_status *msp;
+
+	msp = __get_cpu_var(bau_control).msg_statuses + msg_slot;
+	cpu = uv_blade_processor_id();
+	msg->number_of_cpus =
+	    uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id()));
+	this_cpu_mask = (unsigned long)1 << cpu;
+	if (msp->seen_by.bits & this_cpu_mask)
+		return;
+	atomic_or_long(&msp->seen_by.bits, this_cpu_mask);
+
+	if (msg->replied_to == 1)
+		return;
+
+	if (msg->address == TLB_FLUSH_ALL) {
+		local_flush_tlb();
+		__get_cpu_var(ptcstats).alltlb++;
+	} else {
+		__flush_tlb_one(msg->address);
+		__get_cpu_var(ptcstats).onetlb++;
+	}
+
+	__get_cpu_var(ptcstats).requestee++;
+
+	atomic_inc_short(&msg->acknowledge_count);
+	if (msg->number_of_cpus == msg->acknowledge_count)
+		uv_reply_to_message(sw_ack_slot, msg, msp);
+	return;
+}
+
+/*
+ * Examine the payload queue on all the distribution nodes to see
+ * which messages have not been seen, and which cpu(s) have not seen them.
+ *
+ * Returns the number of cpu's that have not responded.
+ */
+static int
+uv_examine_destinations(struct bau_target_nodemask *distribution)
+{
+	int sender;
+	int i;
+	int j;
+	int k;
+	int count = 0;
+	struct bau_control *bau_tablesp;
+	struct bau_payload_queue_entry *msg;
+	struct bau_msg_status *msp;
+
+	sender = smp_processor_id();
+	for (i = 0; i < (sizeof(struct bau_target_nodemask) * BITSPERBYTE);
+	     i++) {
+		if (bau_node_isset(i, distribution)) {
+			bau_tablesp = uv_bau_table_bases[i];
+			for (msg = bau_tablesp->va_queue_first, j = 0;
+			     j < DESTINATION_PAYLOAD_QUEUE_SIZE; msg++, j++) {
+				if ((msg->sending_cpu == sender) &&
+				    (!msg->replied_to)) {
+					msp = bau_tablesp->msg_statuses + j;
+					printk(KERN_DEBUG
+				"blade %d: address:%#lx %d of %d, not cpu(s): ",
+					       i, msg->address,
+					       msg->acknowledge_count,
+					       msg->number_of_cpus);
+					for (k = 0; k < msg->number_of_cpus;
+					     k++) {
+						if (!((long)1 << k & msp->
+						      seen_by.bits)) {
+							count++;
+							printk("%d ", k);
+						}
+					}
+					printk("\n");
+				}
+			}
+		}
+	}
+	return count;
+}
+
+/**
+ * uv_flush_tlb_others - globally purge translation cache of a virtual
+ * address or all TLB's
+ * @cpumaskp: mask of all cpu's in which the address is to be removed
+ * @mm: mm_struct containing virtual address range
+ * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
+ *
+ * This is the entry point for initiating any UV global TLB shootdown.
+ *
+ * Purges the translation caches of all specified processors of the given
+ * virtual address, or purges all TLB's on specified processors.
+ *
+ * The caller has derived the cpumaskp from the mm_struct and has subtracted
+ * the local cpu from the mask.  This function is called only if there
+ * are bits set in the mask. (e.g. flush_tlb_page())
+ *
+ * The cpumaskp is converted into a nodemask of the nodes containing
+ * the cpus.
+ */
+int
+uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm, unsigned long va)
+{
+	int i;
+	int blade;
+	int cpu;
+	int bit;
+	int right_shift;
+	int this_blade;
+	int exams = 0;
+	int tries = 0;
+	long source_timeouts = 0;
+	long destination_timeouts = 0;
+	unsigned long index;
+	unsigned long mmr_offset;
+	unsigned long descriptor_status;
+	struct bau_activation_descriptor *bau_desc;
+	ktime_t time1, time2;
+
+	cpu = uv_blade_processor_id();
+	this_blade = uv_numa_blade_id();
+	bau_desc = __get_cpu_var(bau_control).descriptor_base;
+	bau_desc += (UV_ITEMS_PER_DESCRIPTOR * cpu);
+
+	bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
+
+	i = 0;
+	for_each_cpu_mask(bit, *cpumaskp) {
+		blade = uv_cpu_to_blade_id(bit);
+		if (blade > (UV_DISTRIBUTION_SIZE - 1))
+			BUG();
+		if (blade == this_blade)
+			continue;
+		bau_node_set(blade, &bau_desc->distribution);
+		/* leave the bits for the remote cpu's in the mask until
+		   success; on failure we fall back to the IPI method */
+		i++;
+	}
+	if (i == 0)
+		goto none_to_flush;
+	__get_cpu_var(ptcstats).requestor++;
+	__get_cpu_var(ptcstats).ntargeted += i;
+
+	bau_desc->payload.address = va;
+	bau_desc->payload.sending_cpu = smp_processor_id();
+
+	if (cpu < UV_CPUS_PER_ACT_STATUS) {
+		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
+		right_shift = cpu * UV_ACT_STATUS_SIZE;
+	} else {
+		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
+		right_shift =
+		    ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE);
+	}
+	time1 = ktime_get();
+
+retry:
+	tries++;
+	index = ((unsigned long)
+		 1 << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | cpu;
+	uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
+
+	while ((descriptor_status = (((unsigned long)
+				      uv_read_local_mmr(mmr_offset) >>
+				      right_shift) & UV_ACT_STATUS_MASK)) !=
+	       DESC_STATUS_IDLE) {
+		if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
+			source_timeouts++;
+			if (source_timeouts > SOURCE_TIMEOUT_LIMIT)
+				source_timeouts = 0;
+			__get_cpu_var(ptcstats).s_retry++;
+			goto retry;
+		}
+		/* spin here looking for progress at the destinations */
+		if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) {
+			destination_timeouts++;
+			if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) {
+				/* returns # of cpus not responding */
+				if (uv_examine_destinations
+				    (&bau_desc->distribution) == 0) {
+					__get_cpu_var(ptcstats).d_retry++;
+					goto retry;
+				}
+				exams++;
+				if (exams >= uv_bau_retry_limit) {
+					printk(KERN_DEBUG
+					       "uv_flush_tlb_others");
+					printk("giving up on cpu %d\n",
+					       smp_processor_id());
+					goto unsuccessful;
+				}
+				/* delays can hang up the simulator
+				   udelay(1000);
+				 */
+				destination_timeouts = 0;
+			}
+		}
+	}
+	if (tries > 1)
+		__get_cpu_var(ptcstats).retriesok++;
+	/* on success, clear the remote cpu's from the mask so we don't
+	   use the IPI method of shootdown on them */
+	for_each_cpu_mask(bit, *cpumaskp) {
+		blade = uv_cpu_to_blade_id(bit);
+		if (blade == this_blade)
+			continue;
+		cpu_clear(bit, *cpumaskp);
+	}
+
+unsuccessful:
+	time2 = ktime_get();
+	__get_cpu_var(ptcstats).sflush_ns += (time2.tv64 - time1.tv64);
+
+none_to_flush:
+	if (cpus_empty(*cpumaskp))
+		return 1;
+
+	/* Cause the caller to do an IPI-style TLB shootdown on
+	   the cpu's still in the mask */
+	__get_cpu_var(ptcstats).ptc_i++;
+	return 0;
+}
+
+/*
+ * The BAU message interrupt comes here. (registered by set_intr_gate)
+ * See entry_64.S
+ *
+ * We received a broadcast assist message.
+ *
+ * Interrupts may have been disabled; this interrupt could represent
+ * the receipt of several messages.
+ *
+ * All cores/threads on this node get this interrupt.
+ * The last one to see it does the s/w ack.
+ * (the resource will not be freed until noninterruptable cpus see this
+ *  interrupt; hardware will timeout the s/w ack and reply ERROR)
+ */
+void
+uv_bau_message_interrupt(struct pt_regs *regs)
+{
+	struct bau_payload_queue_entry *pqp;
+	struct bau_payload_queue_entry *msg;
+	struct pt_regs *old_regs = set_irq_regs(regs);
+	ktime_t time1, time2;
+	int msg_slot;
+	int sw_ack_slot;
+	int fw;
+	int count = 0;
+	unsigned long local_pnode;
+
+	ack_APIC_irq();
+	exit_idle();
+	irq_enter();
+
+	time1 = ktime_get();
+
+	local_pnode = uv_blade_to_pnode(uv_numa_blade_id());
+
+	pqp = __get_cpu_var(bau_control).va_queue_first;
+	msg = __get_cpu_var(bau_control).bau_msg_head;
+	while (msg->sw_ack_vector) {
+		count++;
+		fw = msg->sw_ack_vector;
+		msg_slot = msg - pqp;
+		sw_ack_slot = ffs(fw) - 1;
+
+		uv_bau_process_message(msg, msg_slot, sw_ack_slot);
+
+		msg++;
+		if (msg > __get_cpu_var(bau_control).va_queue_last)
+			msg = __get_cpu_var(bau_control).va_queue_first;
+		__get_cpu_var(bau_control).bau_msg_head = msg;
+	}
+	if (!count)
+		__get_cpu_var(ptcstats).nomsg++;
+	else if (count > 1)
+		__get_cpu_var(ptcstats).multmsg++;
+
+	time2 = ktime_get();
+	__get_cpu_var(ptcstats).dflush_ns += (time2.tv64 - time1.tv64);
+
+	irq_exit();
+	set_irq_regs(old_regs);
+	return;
+}
+
+static void
+uv_enable_timeouts(void)
+{
+	int i;
+	int blade;
+	int last_blade;
+	int pnode;
+	int cur_cpu = 0;
+	unsigned long apicid;
+
+	/* better if we had each_online_blade */
+	last_blade = -1;
+	for_each_online_node(i) {
+		blade = uv_node_to_blade_id(i);
+		if (blade == last_blade)
+			continue;
+		last_blade = blade;
+		apicid = per_cpu(x86_cpu_to_apicid, cur_cpu);
+		pnode = uv_blade_to_pnode(blade);
+		cur_cpu += uv_blade_nr_possible_cpus(i);
+	}
+	return;
+}
+
+static void *
+uv_ptc_seq_start(struct seq_file *file, loff_t *offset)
+{
+	if (*offset < num_possible_cpus())
+		return offset;
+	return NULL;
+}
+
+static void *
+uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
+{
+	(*offset)++;
+	if (*offset < num_possible_cpus())
+		return offset;
+	return NULL;
+}
+
+static void
+uv_ptc_seq_stop(struct seq_file *file, void *data)
+{
+}
+
+/*
+ * Display the statistics thru /proc
+ * data points to the cpu number
+ */
+static int
+uv_ptc_seq_show(struct seq_file *file, void *data)
+{
+	struct ptc_stats *stat;
+	int cpu;
+
+	cpu = *(loff_t *)data;
+
+	if (!cpu) {
+		seq_printf(file,
+		"# cpu requestor requestee one all sretry dretry ptc_i ");
+		seq_printf(file,
+		"sw_ack sflush_us dflush_us sok dnomsg dmult starget\n");
+	}
+	if (cpu < num_possible_cpus() && cpu_online(cpu)) {
+		stat = &per_cpu(ptcstats, cpu);
+		seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ",
+			   cpu, stat->requestor,
+			   stat->requestee, stat->onetlb, stat->alltlb,
+			   stat->s_retry, stat->d_retry, stat->ptc_i);
+		seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n",
+			   uv_read_global_mmr64(uv_blade_to_pnode
+					(uv_cpu_to_blade_id(cpu)),
+					UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
+			   stat->sflush_ns / 1000, stat->dflush_ns / 1000,
+			   stat->retriesok, stat->nomsg,
+			   stat->multmsg, stat->ntargeted);
+	}
+
+	return 0;
+}
+
+/*
+ *  0: display meaning of the statistics
+ * >0: retry limit
+ */
+static ssize_t
+uv_ptc_proc_write(struct file *file, const char __user *user,
+		  size_t count, loff_t *data)
+{
+	long newmode;
+	char optstr[64];
+
+	if (copy_from_user(optstr, user, count))
+		return -EFAULT;
+	optstr[count - 1] = '\0';
+	if (strict_strtoul(optstr, 10, &newmode) < 0) {
+		printk(KERN_DEBUG "%s is invalid\n", optstr);
+		return -EINVAL;
+	}
+
+	if (newmode == 0) {
+		printk(KERN_DEBUG "# cpu:      cpu number\n");
+		printk(KERN_DEBUG
+		"requestor:  times this cpu was the flush requestor\n");
+		printk(KERN_DEBUG
+		"requestee:  times this cpu was requested to flush its TLBs\n");
+		printk(KERN_DEBUG
+		"one:        times requested to flush a single address\n");
+		printk(KERN_DEBUG
+		"all:        times requested to flush all TLB's\n");
+		printk(KERN_DEBUG
+		"sretry:     number of retries of source-side timeouts\n");
+		printk(KERN_DEBUG
+		"dretry:     number of retries of destination-side timeouts\n");
+		printk(KERN_DEBUG
+		"ptc_i:      times UV fell through to IPI-style flushes\n");
+		printk(KERN_DEBUG
+		"sw_ack:     image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
+		printk(KERN_DEBUG
+		"sflush_us:  microseconds spent in uv_flush_tlb_others()\n");
+		printk(KERN_DEBUG
+		"dflush_us:  microseconds spent in handling flush requests\n");
+		printk(KERN_DEBUG "sok:        successes on retry\n");
+		printk(KERN_DEBUG "dnomsg:     interrupts with no message\n");
+		printk(KERN_DEBUG
+		"dmult:      interrupts with multiple messages\n");
+		printk(KERN_DEBUG "starget:    nodes targeted\n");
+	} else {
+		uv_bau_retry_limit = newmode;
+		printk(KERN_DEBUG "timeout retry limit:%d\n",
+		       uv_bau_retry_limit);
+	}
+
+	return count;
+}
+
+static const struct seq_operations uv_ptc_seq_ops = {
+	.start = uv_ptc_seq_start,
+	.next = uv_ptc_seq_next,
+	.stop = uv_ptc_seq_stop,
+	.show = uv_ptc_seq_show
+};
+
+static int
+uv_ptc_proc_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &uv_ptc_seq_ops);
+}
+
+static const struct file_operations proc_uv_ptc_operations = {
+	.open = uv_ptc_proc_open,
+	.read = seq_read,
+	.write = uv_ptc_proc_write,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+static struct proc_dir_entry *proc_uv_ptc;
+
+static int __init
+uv_ptc_init(void)
+{
+	static struct proc_dir_entry *sgi_proc_dir;
+
+	sgi_proc_dir = NULL;
+
+	if (!is_uv_system())
+		return 0;
+
+	sgi_proc_dir = proc_mkdir("sgi_uv", NULL);
+	if (!sgi_proc_dir)
+		return -EINVAL;
+
+	proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL);
+	if (!proc_uv_ptc) {
+		printk(KERN_ERR "unable to create %s proc entry\n",
+		       UV_PTC_BASENAME);
+		return -EINVAL;
+	}
+	proc_uv_ptc->proc_fops = &proc_uv_ptc_operations;
+	return 0;
+}
+
+static void __exit
+uv_ptc_exit(void)
+{
+	remove_proc_entry(UV_PTC_BASENAME, NULL);
+}
+
+module_init(uv_ptc_init);
+module_exit(uv_ptc_exit);
+
+/*
+ * Initialization of BAU-related structures
+ */
+int __init
+uv_bau_init(void)
+{
+	int i;
+	int j;
+	int blade;
+	int nblades;
+	int *ip;
+	int pnode;
+	int last_blade;
+	int cur_cpu = 0;
+	unsigned long pa;
+	unsigned long n;
+	unsigned long m;
+	unsigned long mmr_image;
+	unsigned long apicid;
+	char *cp;
+	struct bau_control *bau_tablesp;
+	struct bau_activation_descriptor *adp, *ad2;
+	struct bau_payload_queue_entry *pqp;
+	struct bau_msg_status *msp;
+	struct bau_control *bcp;
+
+	if (!is_uv_system())
+		return 0;
+
+	uv_bau_retry_limit = 1;
+
+	if ((sizeof(struct bau_local_cpumask) * BITSPERBYTE) <
+	    MAX_CPUS_PER_NODE) {
+		printk(KERN_ERR
+			"uv_bau_init: bau_local_cpumask.bits too small\n");
+		BUG();
+	}
+
+	uv_nshift = uv_hub_info->n_val;
+	uv_mmask = ((unsigned long)1 << uv_hub_info->n_val) - 1;
+	nblades = 0;
+	last_blade = -1;
+	for_each_online_node(i) {
+		blade = uv_node_to_blade_id(i);
+		if (blade == last_blade)
+			continue;
+		last_blade = blade;
+		nblades++;
+	}
+
+	uv_bau_table_bases = (struct bau_control **)
+	    kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
+	if (!uv_bau_table_bases)
+		BUG();
+
+	/* better if we had each_online_blade */
+	last_blade = -1;
+	for_each_online_node(i) {
+		blade = uv_node_to_blade_id(i);
+		if (blade == last_blade)
+			continue;
+		last_blade = blade;
+
+		bau_tablesp =
+		    kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, i);
+		if (!bau_tablesp)
+			BUG();
+
+		bau_tablesp->msg_statuses =
+		    kmalloc_node(sizeof(struct bau_msg_status) *
+				 DESTINATION_PAYLOAD_QUEUE_SIZE, GFP_KERNEL, i);
+		if (!bau_tablesp->msg_statuses)
+			BUG();
+		for (j = 0, msp = bau_tablesp->msg_statuses;
+		     j < DESTINATION_PAYLOAD_QUEUE_SIZE; j++, msp++) {
+			bau_cpubits_clear(&msp->seen_by, (int)
+					  uv_blade_nr_possible_cpus(blade));
+		}
+
+		bau_tablesp->watching =
+		    kmalloc_node(sizeof(int) * DESTINATION_NUM_RESOURCES,
+				 GFP_KERNEL, i);
+		if (!bau_tablesp->watching)
+			BUG();
+		for (j = 0, ip = bau_tablesp->watching;
+		     j < DESTINATION_PAYLOAD_QUEUE_SIZE; j++, ip++) {
+			*ip = 0;
+		}
+
+		uv_bau_table_bases[i] = bau_tablesp;
+
+		pnode = uv_blade_to_pnode(blade);
+
+		if (sizeof(struct bau_activation_descriptor) != 64)
+			BUG();
+
+		adp = (struct bau_activation_descriptor *)
+		    kmalloc_node(16384, GFP_KERNEL, i);
+		if (!adp)
+			BUG();
+		if ((unsigned long)adp & 0xfff)
+			BUG();
+		pa = __pa((unsigned long)adp);
+		n = pa >> uv_nshift;
+		m = pa & uv_mmask;
+
+		mmr_image = uv_read_global_mmr64(pnode,
+						 UVH_LB_BAU_SB_DESCRIPTOR_BASE);
+		if (mmr_image)
+			uv_write_global_mmr64(pnode, (unsigned long)
+					      UVH_LB_BAU_SB_DESCRIPTOR_BASE,
+					      (n << UV_DESC_BASE_PNODE_SHIFT |
+					       m));
+		for (j = 0, ad2 = adp; j < UV_ACTIVATION_DESCRIPTOR_SIZE;
+		     j++, ad2++) {
+			memset(ad2, 0,
+			       sizeof(struct bau_activation_descriptor));
+			ad2->header.sw_ack_flag = 1;
+			ad2->header.base_dest_nodeid =
+			    uv_blade_to_pnode(uv_cpu_to_blade_id(0));
+			ad2->header.command = UV_NET_ENDPOINT_INTD;
+			ad2->header.int_both = 1;
+			/* all others need to be set to zero:
+			   fairness chaining multilevel count replied_to */
+		}
+
+		pqp = (struct bau_payload_queue_entry *)
+		    kmalloc_node((DESTINATION_PAYLOAD_QUEUE_SIZE + 1) *
+				 sizeof(struct bau_payload_queue_entry),
+				 GFP_KERNEL, i);
+		if (!pqp)
+			BUG();
+		if (sizeof(struct bau_payload_queue_entry) != 32)
+			BUG();
+		if ((unsigned long)(&((struct bau_payload_queue_entry *)0)->
+				    sw_ack_vector) != 15)
+			BUG();
+
+		cp = (char *)pqp + 31;
+		pqp = (struct bau_payload_queue_entry *)
+		    (((unsigned long)cp >> 5) << 5);
+		bau_tablesp->va_queue_first = pqp;
+		uv_write_global_mmr64(pnode,
+				      UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
+				      ((unsigned long)pnode <<
+				       UV_PAYLOADQ_PNODE_SHIFT) |
+				      uv_physnodeaddr(pqp));
+		uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
+				      uv_physnodeaddr(pqp));
+		bau_tablesp->va_queue_last =
+		    pqp + (DESTINATION_PAYLOAD_QUEUE_SIZE - 1);
+		uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
+				      (unsigned long)
+				      uv_physnodeaddr(bau_tablesp->
+						      va_queue_last));
+		memset(pqp, 0, sizeof(struct bau_payload_queue_entry) *
+		       DESTINATION_PAYLOAD_QUEUE_SIZE);
+
+		/* this initialization can't be in firmware because the
+		   messaging IRQ will be determined by the OS */
+		apicid = per_cpu(x86_cpu_to_apicid, cur_cpu);
+		pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
+		if ((pa & 0xff) != UV_BAU_MESSAGE) {
+			uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
+					      ((apicid << 32) |
+					       UV_BAU_MESSAGE));
+		}
+
+		for (j = cur_cpu; j < (cur_cpu + uv_blade_nr_possible_cpus(i));
+		     j++) {
+			bcp = (struct bau_control *)&per_cpu(bau_control, j);
+			bcp->bau_msg_head = bau_tablesp->va_queue_first;
+			bcp->va_queue_first = bau_tablesp->va_queue_first;
+
+			bcp->va_queue_last = bau_tablesp->va_queue_last;
+			bcp->watching = bau_tablesp->watching;
+			bcp->msg_statuses = bau_tablesp->msg_statuses;
+			bcp->descriptor_base = adp;
+		}
+		cur_cpu += uv_blade_nr_possible_cpus(i);
+	}
+
+	set_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
+
+	uv_enable_timeouts();
+
+	return 0;
+}
+
+__initcall(uv_bau_init);
diff --git a/include/asm-x86/uv/uv_bau.h b/include/asm-x86/uv/uv_bau.h
new file mode 100644
index 000000000000..f125f86c89ac
--- /dev/null
+++ b/include/asm-x86/uv/uv_bau.h
@@ -0,0 +1,331 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * SGI UV Broadcast Assist Unit definitions
+ *
+ * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
+ */
+
+#ifndef __ASM_X86_UV_BAU__
+#define __ASM_X86_UV_BAU__
+
+#include <linux/bitmap.h>
+#define BITSPERBYTE 8
+
+/* Broadcast Assist Unit messaging structures */
+
+/*
+ * Selective Broadcast activations are induced by software action
+ * specifying a particular 8-descriptor "set" via a 6-bit index written
+ * to an MMR.
+ * Thus there are 64 unique 512-byte sets of SB descriptors - one set for
+ * each 6-bit index value. These descriptor sets are mapped in sequence
+ * starting with set 0 located at the address specified in the
+ * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512,
+ * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on.
+ *
+ * We will use 31 sets, one for sending BAU messages from each of the 32
+ * cpu's on the node.
+ *
+ * TLB shootdown will use the first of the 8 descriptors of each set.
+ * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set).
+ */
+
+#define UV_ITEMS_PER_DESCRIPTOR 8
+#define UV_CPUS_PER_ACT_STATUS 32
+#define UV_ACT_STATUS_MASK 0x3
+#define UV_ACT_STATUS_SIZE 2
+#define UV_ACTIVATION_DESCRIPTOR_SIZE 32
+#define UV_DISTRIBUTION_SIZE 256
+#define UV_SW_ACK_NPENDING 8
+#define UV_BAU_MESSAGE 200	/* Messaging irq; see irq_64.h */
+				/* and include/asm-x86/hw_irq_64.h */
+				/* To be dynamically allocated in the future */
+#define UV_NET_ENDPOINT_INTD 0x38
+#define UV_DESC_BASE_PNODE_SHIFT 49 /* position of pnode (nasid>>1) in MMR */
+#define UV_PAYLOADQ_PNODE_SHIFT 49
+
+#define UV_PTC_BASENAME "sgi_uv/ptc_statistics"
+#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask))
+
+/* bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1 */
+#define DESC_STATUS_IDLE                0
+#define DESC_STATUS_ACTIVE              1
+#define DESC_STATUS_DESTINATION_TIMEOUT 2
+#define DESC_STATUS_SOURCE_TIMEOUT      3
+
+/* source side threshholds at which message retries print a warning */
+#define SOURCE_TIMEOUT_LIMIT            20
+#define DESTINATION_TIMEOUT_LIMIT       20
+
+/* number of entries in the destination side payload queue */
+#define DESTINATION_PAYLOAD_QUEUE_SIZE  17
+/* number of destination side software ack resources */
+#define DESTINATION_NUM_RESOURCES       8
+#define MAX_CPUS_PER_NODE		32
+
+/* Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor) */
+/* If the 'multilevel' flag in the header portion of the descriptor
+ * has been set to 0, then endpoint multi-unicast mode is selected.
+ * The distribution specification (32 bytes) is interpreted as a 256-bit
+ * distribution vector. Adjacent bits correspond to consecutive even numbered
+ * nodeIDs. The result of adding the index of a given bit to the 15-bit
+ * 'base_dest_nodeid' field of the header corresponds to the
+ * destination nodeID associated with that specified bit.  */
+struct bau_target_nodemask {
+	unsigned long bits[BITS_TO_LONGS(256)];
+};
+
+/* mask of cpu's on a node */
+/* (during initialization we need to check that unsigned long has
+    enough bits for max. cpu's per node) */
+struct bau_local_cpumask {
+	unsigned long bits;
+};
+
+/*
+ * Payload: 16 bytes (128 bits) (bytes 0x20-0x2f of descriptor)
+ * only 12 bytes (96 bits) of the payload area are usable.
+ * An additional 3 bytes (bits 27:4) of the header address are carried
+ * to the next bytes of the destination payload queue.
+ * And an additional 2 bytes of the header Suppl_A field are also
+ * carried to the destination payload queue.
+ * But the first byte of the Suppl_A becomes bits 127:120 (the 16th byte)
+ * of the destination payload queue, which is written by the hardware
+ * with the s/w ack resource bit vector.
+ * [ effective message contents (16 bytes (128 bits) maximum), not counting
+ *   the s/w ack bit vector  ]
+ */
+
+/* The payload is software-defined for INTD transactions */
+struct bau_msg_payload {
+	unsigned long address;		/* signifies a page or all TLB's
+						of the cpu */
+	/* 64 bits */
+	unsigned short sending_cpu;	/* filled in by sender */
+	/* 16 bits */
+	unsigned short acknowledge_count;/* filled in by destination */
+	/* 16 bits */
+	unsigned int reserved1:32;	/* not usable */
+};
+
+
+/* Message header:  16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) */
+/* see table 4.2.3.0.1 in broacast_assist spec. */
+struct bau_msg_header {
+	int dest_subnodeid:6;	/* must be zero */
+	/* bits 5:0 */
+	int base_dest_nodeid:15; /* nasid>>1 (pnode) of first bit in node_map */
+	/* bits 20:6 */
+	int command:8;		/* message type */
+	/* bits 28:21 */
+				/* 0x38: SN3net EndPoint Message */
+	int rsvd_1:3;		/* must be zero */
+	/* bits 31:29 */
+				/* int will align on 32 bits */
+	int rsvd_2:9;		/* must be zero */
+	/* bits 40:32 */
+				/* Suppl_A is 56-41 */
+	int payload_2a:8;	/* becomes byte 16 of msg */
+	/* bits 48:41 */	/* not currently using */
+	int payload_2b:8;	/* becomes byte 17 of msg */
+	/* bits 56:49 */	/* not currently using */
+				/* Address field (96:57) is never used as an
+				   address (these are address bits 42:3) */
+	int rsvd_3:1;		/* must be zero */
+	/* bit 57 */
+				/* address bits 27:4 are payload */
+				/* these 24 bits become bytes 12-14 of msg */
+	int replied_to:1;	/* sent as 0 by the source to byte 12 */
+	/* bit 58 */
+
+	int payload_1a:5;	/* not currently used */
+	/* bits 63:59 */
+	int payload_1b:8;	/* not currently used */
+	/* bits 71:64 */
+	int payload_1c:8;	/* not currently used */
+	/* bits 79:72 */
+	int payload_1d:2;	/* not currently used */
+	/* bits 81:80 */
+
+	int rsvd_4:7;		/* must be zero */
+	/* bits 88:82 */
+	int sw_ack_flag:1;	/* software acknowledge flag */
+	/* bit 89 */
+				/* INTD trasactions at destination are to
+				   wait for software acknowledge */
+	int rsvd_5:6;		/* must be zero */
+	/* bits 95:90 */
+	int rsvd_6:5;		/* must be zero */
+	/* bits 100:96 */
+	int int_both:1;		/* if 1, interrupt both sockets on the blade */
+	/* bit 101*/
+	int fairness:3;		/* usually zero */
+	/* bits 104:102 */
+	int multilevel:1;	/* multi-level multicast format */
+	/* bit 105 */
+				/* 0 for TLB: endpoint multi-unicast messages */
+	int chaining:1;		/* next descriptor is part of this activation*/
+	/* bit 106 */
+	int rsvd_7:21;		/* must be zero */
+	/* bits 127:107 */
+};
+
+/* The format of the message to send, plus all accompanying control */
+/* Should be 64 bytes */
+struct bau_activation_descriptor {
+	struct bau_target_nodemask distribution;
+	/* message template, consisting of header and payload: */
+	struct bau_msg_header header;
+	struct bau_msg_payload payload;
+};
+/*
+ *   -payload--    ---------header------
+ *   bytes 0-11    bits 41-56  bits 58-81
+ *       A           B  (2)      C (3)
+ *
+ *            A/B/C are moved to:
+ *       A            C          B
+ *   bytes 0-11  bytes 12-14  bytes 16-17  (byte 15 filled in by hw as vector)
+ *   ------------payload queue-----------
+ */
+
+/*
+ * The payload queue on the destination side is an array of these.
+ * With BAU_MISC_CONTROL set for software acknowledge mode, the messages
+ * are 32 bytes (2 micropackets) (256 bits) in length, but contain only 17
+ * bytes of usable data, including the sw ack vector in byte 15 (bits 127:120)
+ * (12 bytes come from bau_msg_payload, 3 from payload_1, 2 from
+ *  sw_ack_vector and payload_2)
+ * "Enabling Software Acknowledgment mode (see Section 4.3.3 Software
+ *  Acknowledge Processing) also selects 32 byte (17 bytes usable) payload
+ *  operation."
+ */
+struct bau_payload_queue_entry {
+	unsigned long address;		/* signifies a page or all TLB's
+						of the cpu */
+	/* 64 bits, bytes 0-7 */
+
+	unsigned short sending_cpu;	/* cpu that sent the message */
+	/* 16 bits, bytes 8-9 */
+
+	unsigned short acknowledge_count; /* filled in by destination */
+	/* 16 bits, bytes 10-11 */
+
+	unsigned short replied_to:1;	/* sent as 0 by the source */
+	/* 1 bit */
+	unsigned short unused1:7;       /* not currently using */
+	/* 7 bits: byte 12) */
+
+	unsigned char unused2[2];	/* not currently using */
+	/* bytes 13-14 */
+
+	unsigned char sw_ack_vector;	/* filled in by the hardware */
+	/* byte 15 (bits 127:120) */
+
+	unsigned char unused4[3];	/* not currently using bytes 17-19 */
+	/* bytes 17-19 */
+
+	int number_of_cpus;		/* filled in at destination */
+	/* 32 bits, bytes 20-23 (aligned) */
+
+	unsigned char unused5[8];       /* not using */
+	/* bytes 24-31 */
+};
+
+/* one for every slot in the destination payload queue */
+struct bau_msg_status {
+	struct bau_local_cpumask seen_by;	/* map of cpu's */
+};
+
+/* one for every slot in the destination software ack resources */
+struct bau_sw_ack_status {
+	struct bau_payload_queue_entry *msg;	/* associated message */
+	int watcher;				/* cpu monitoring, or -1 */
+};
+
+/* one on every node and per-cpu; to locate the software tables */
+struct bau_control {
+	struct bau_activation_descriptor *descriptor_base;
+	struct bau_payload_queue_entry *bau_msg_head;
+	struct bau_payload_queue_entry *va_queue_first;
+	struct bau_payload_queue_entry *va_queue_last;
+	struct bau_msg_status *msg_statuses;
+	int *watching; /* pointer to array */
+};
+
+/*
+ * This structure is allocated per_cpu for UV TLB shootdown statistics.
+ */
+struct ptc_stats {
+	unsigned long ptc_i;	/* number of IPI-style flushes */
+	unsigned long requestor;	/* number of nodes this cpu sent to */
+	unsigned long requestee;	/* times cpu was remotely requested */
+	unsigned long alltlb;	/* times all tlb's on this cpu were flushed */
+	unsigned long onetlb;	/* times just one tlb on this cpu was flushed */
+	unsigned long s_retry;	/* retries on source side timeouts */
+	unsigned long d_retry;	/* retries on destination side timeouts */
+	unsigned long sflush_ns;/* nanoseconds spent in uv_flush_tlb_others */
+	unsigned long dflush_ns;/* nanoseconds spent destination side */
+	unsigned long retriesok; /* successes on retries */
+	unsigned long nomsg;	/* interrupts with no message */
+	unsigned long multmsg;	/* interrupts with multiple messages */
+	unsigned long ntargeted;/* nodes targeted */
+};
+
+static inline int bau_node_isset(int node, struct bau_target_nodemask *dstp)
+{
+	return constant_test_bit(node, &dstp->bits[0]);
+}
+static inline void bau_node_set(int node, struct bau_target_nodemask *dstp)
+{
+	__set_bit(node, &dstp->bits[0]);
+}
+static inline void bau_nodes_clear(struct bau_target_nodemask *dstp, int nbits)
+{
+	bitmap_zero(&dstp->bits[0], nbits);
+}
+
+static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
+{
+	bitmap_zero(&dstp->bits, nbits);
+}
+
+/*
+ * atomic increment of a short integer
+ * (rather than using the __sync_add_and_fetch() intrinsic)
+ *
+ * returns the new value of the variable
+ */
+static inline short int atomic_inc_short(short int *v)
+{
+	asm volatile("movw $1, %%cx\n"
+			"lock ; xaddw %%cx, %0\n"
+			: "+m" (*v)		/* outputs */
+			: : "%cx", "memory");	/* inputs : clobbereds */
+	return *v;
+}
+
+/*
+ * atomic OR of two long integers
+ * (rather than using the __sync_or_and_fetch() intrinsic)
+ */
+static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
+{
+	asm volatile("movq %0, %%rax; lea %1, %%rdx\n"
+			"lock ; orq %%rax, %%rdx\n"
+			: "+m" (*v1)		/* outputs */
+			: "m" (v1), "m" (v2)	/* inputs */
+			: "memory");		/* clobbereds */
+}
+
+#define cpubit_isset(cpu, bau_local_cpumask) \
+	test_bit((cpu), (bau_local_cpumask).bits)
+
+int uv_flush_tlb_others(cpumask_t *, struct mm_struct *, unsigned long);
+void uv_bau_message_intr1(void);
+void uv_bau_timeout_intr1(void);
+
+#endif /* __ASM_X86_UV_BAU__ */
-- 
cgit v1.2.3


From b194b120507276b4f09e2e14f941884e777fc7c8 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Thu, 12 Jun 2008 08:23:48 -0500
Subject: SGI UV: TLB shootdown using broadcast assist unit, cleanups

TLB shootdown for SGI UV.

v1: 6/2 original
v2: 6/3 corrections/improvements per Ingo's review
v3: 6/4 split atomic operations off to a separate patch (Jeremy's review)
v4: 6/12 include <mach_apic.h> rather than <asm/mach-bigsmp/mach_apic.h>
         (fixes a !SMP build problem that Ingo found)
         fix the index on uv_table_bases[blade]

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tlb_64.c    |   2 +-
 arch/x86/kernel/tlb_uv.c    | 704 ++++++++++++++++++++++++--------------------
 include/asm-x86/uv/uv_bau.h | 147 ++++-----
 3 files changed, 454 insertions(+), 399 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
index fc132113bdab..5039d0f097a2 100644
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/kernel/tlb_64.c
@@ -165,7 +165,7 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
 	cpumask_t cpumask = *cpumaskp;
 
 	if (is_uv_system() && uv_flush_tlb_others(&cpumask, mm, va))
-			return;
+		return;
 
 	/* Caller has disabled preemption */
 	sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 28e7c68d9d78..f7bc6a6fbe49 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -10,18 +10,20 @@
 #include <linux/proc_fs.h>
 #include <linux/kernel.h>
 
-#include <asm/mach-bigsmp/mach_apic.h>
 #include <asm/mmu_context.h>
 #include <asm/idle.h>
 #include <asm/genapic.h>
 #include <asm/uv/uv_hub.h>
 #include <asm/uv/uv_mmrs.h>
 #include <asm/uv/uv_bau.h>
+#include <asm/tsc.h>
 
-struct bau_control **uv_bau_table_bases;
-static int uv_bau_retry_limit;
-static int uv_nshift;		/* position of pnode (which is nasid>>1) */
-static unsigned long uv_mmask;
+#include <mach_apic.h>
+
+static struct bau_control **uv_bau_table_bases __read_mostly;
+static int uv_bau_retry_limit __read_mostly;
+static int uv_nshift __read_mostly; /* position of pnode (which is nasid>>1) */
+static unsigned long uv_mmask __read_mostly;
 
 char *status_table[] = {
 	"IDLE",
@@ -41,19 +43,18 @@ DEFINE_PER_CPU(struct bau_control, bau_control);
  * clear of the Timeout bit (as well) will free the resource. No reply will
  * be sent (the hardware will only do one reply per message).
  */
-static void
-uv_reply_to_message(int resource,
+static void uv_reply_to_message(int resource,
 		    struct bau_payload_queue_entry *msg,
 		    struct bau_msg_status *msp)
 {
-	int fw;
+	unsigned long dw;
 
-	fw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource);
+	dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource);
 	msg->replied_to = 1;
 	msg->sw_ack_vector = 0;
 	if (msp)
 		msp->seen_by.bits = 0;
-	uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, fw);
+	uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
 	return;
 }
 
@@ -61,8 +62,7 @@ uv_reply_to_message(int resource,
  * Do all the things a cpu should do for a TLB shootdown message.
  * Other cpu's may come here at the same time for this message.
  */
-static void
-uv_bau_process_message(struct bau_payload_queue_entry *msg,
+static void uv_bau_process_message(struct bau_payload_queue_entry *msg,
 		       int msg_slot, int sw_ack_slot)
 {
 	int cpu;
@@ -103,8 +103,7 @@ uv_bau_process_message(struct bau_payload_queue_entry *msg,
  *
  * Returns the number of cpu's that have not responded.
  */
-static int
-uv_examine_destinations(struct bau_target_nodemask *distribution)
+static int uv_examine_destinations(struct bau_target_nodemask *distribution)
 {
 	int sender;
 	int i;
@@ -118,34 +117,161 @@ uv_examine_destinations(struct bau_target_nodemask *distribution)
 	sender = smp_processor_id();
 	for (i = 0; i < (sizeof(struct bau_target_nodemask) * BITSPERBYTE);
 	     i++) {
-		if (bau_node_isset(i, distribution)) {
-			bau_tablesp = uv_bau_table_bases[i];
-			for (msg = bau_tablesp->va_queue_first, j = 0;
-			     j < DESTINATION_PAYLOAD_QUEUE_SIZE; msg++, j++) {
-				if ((msg->sending_cpu == sender) &&
-				    (!msg->replied_to)) {
-					msp = bau_tablesp->msg_statuses + j;
-					printk(KERN_DEBUG
+		if (!bau_node_isset(i, distribution))
+			continue;
+		bau_tablesp = uv_bau_table_bases[i];
+		for (msg = bau_tablesp->va_queue_first, j = 0;
+		     j < DESTINATION_PAYLOAD_QUEUE_SIZE; msg++, j++) {
+			if ((msg->sending_cpu == sender) &&
+			    (!msg->replied_to)) {
+				msp = bau_tablesp->msg_statuses + j;
+				printk(KERN_DEBUG
 				"blade %d: address:%#lx %d of %d, not cpu(s): ",
-					       i, msg->address,
-					       msg->acknowledge_count,
-					       msg->number_of_cpus);
-					for (k = 0; k < msg->number_of_cpus;
-					     k++) {
-						if (!((long)1 << k & msp->
-						      seen_by.bits)) {
-							count++;
-							printk("%d ", k);
-						}
+				       i, msg->address,
+				       msg->acknowledge_count,
+				       msg->number_of_cpus);
+				for (k = 0; k < msg->number_of_cpus;
+				     k++) {
+					if (!((long)1 << k & msp->
+					      seen_by.bits)) {
+						count++;
+						printk("%d ", k);
 					}
-					printk("\n");
 				}
+				printk("\n");
 			}
 		}
 	}
 	return count;
 }
 
+/*
+ * wait for completion of a broadcast message
+ *
+ * return COMPLETE, RETRY or GIVEUP
+ */
+static int uv_wait_completion(struct bau_activation_descriptor *bau_desc,
+			      unsigned long mmr_offset, int right_shift)
+{
+	int exams = 0;
+	long destination_timeouts = 0;
+	long source_timeouts = 0;
+	unsigned long descriptor_status;
+
+	while ((descriptor_status = (((unsigned long)
+		uv_read_local_mmr(mmr_offset) >>
+			right_shift) & UV_ACT_STATUS_MASK)) !=
+			DESC_STATUS_IDLE) {
+		if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
+			source_timeouts++;
+			if (source_timeouts > SOURCE_TIMEOUT_LIMIT)
+				source_timeouts = 0;
+			__get_cpu_var(ptcstats).s_retry++;
+			return FLUSH_RETRY;
+		}
+		/*
+		 * spin here looking for progress at the destinations
+		 */
+		if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) {
+			destination_timeouts++;
+			if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) {
+				/*
+				 * returns number of cpus not responding
+				 */
+				if (uv_examine_destinations
+				    (&bau_desc->distribution) == 0) {
+					__get_cpu_var(ptcstats).d_retry++;
+					return FLUSH_RETRY;
+				}
+				exams++;
+				if (exams >= uv_bau_retry_limit) {
+					printk(KERN_DEBUG
+					       "uv_flush_tlb_others");
+					printk("giving up on cpu %d\n",
+					       smp_processor_id());
+					return FLUSH_GIVEUP;
+				}
+				/*
+				 * delays can hang the simulator
+				   udelay(1000);
+				 */
+				destination_timeouts = 0;
+			}
+		}
+	}
+	return FLUSH_COMPLETE;
+}
+
+/**
+ * uv_flush_send_and_wait
+ *
+ * Send a broadcast and wait for a broadcast message to complete.
+ *
+ * The cpumaskp mask contains the cpus the broadcast was sent to.
+ *
+ * Returns 1 if all remote flushing was done. The mask is zeroed.
+ * Returns 0 if some remote flushing remains to be done. The mask is left
+ * unchanged.
+ */
+int uv_flush_send_and_wait(int cpu, int this_blade,
+	struct bau_activation_descriptor *bau_desc, cpumask_t *cpumaskp)
+{
+	int completion_status = 0;
+	int right_shift;
+	int bit;
+	int blade;
+	int tries = 0;
+	unsigned long index;
+	unsigned long mmr_offset;
+	cycles_t time1;
+	cycles_t time2;
+
+	if (cpu < UV_CPUS_PER_ACT_STATUS) {
+		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
+		right_shift = cpu * UV_ACT_STATUS_SIZE;
+	} else {
+		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
+		right_shift =
+		    ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE);
+	}
+	time1 = get_cycles();
+	do {
+		tries++;
+		index = ((unsigned long)
+			1 << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | cpu;
+		uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
+		completion_status = uv_wait_completion(bau_desc, mmr_offset,
+					right_shift);
+	} while (completion_status == FLUSH_RETRY);
+	time2 = get_cycles();
+	__get_cpu_var(ptcstats).sflush += (time2 - time1);
+	if (tries > 1)
+		__get_cpu_var(ptcstats).retriesok++;
+
+	if (completion_status == FLUSH_GIVEUP) {
+		/*
+		 * Cause the caller to do an IPI-style TLB shootdown on
+		 * the cpu's, all of which are still in the mask.
+		 */
+		__get_cpu_var(ptcstats).ptc_i++;
+		return 0;
+	}
+
+	/*
+	 * Success, so clear the remote cpu's from the mask so we don't
+	 * use the IPI method of shootdown on them.
+	 */
+	for_each_cpu_mask(bit, *cpumaskp) {
+		blade = uv_cpu_to_blade_id(bit);
+		if (blade == this_blade)
+			continue;
+		cpu_clear(bit, *cpumaskp);
+	}
+	if (!cpus_empty(*cpumaskp))
+		return 0;
+	return 1;
+}
+
 /**
  * uv_flush_tlb_others - globally purge translation cache of a virtual
  * address or all TLB's
@@ -164,30 +290,25 @@ uv_examine_destinations(struct bau_target_nodemask *distribution)
  *
  * The cpumaskp is converted into a nodemask of the nodes containing
  * the cpus.
+ *
+ * Returns 1 if all remote flushing was done.
+ * Returns 0 if some remote flushing remains to be done.
  */
-int
-uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm, unsigned long va)
+int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
+	unsigned long va)
 {
 	int i;
+	int bit;
 	int blade;
 	int cpu;
-	int bit;
-	int right_shift;
 	int this_blade;
-	int exams = 0;
-	int tries = 0;
-	long source_timeouts = 0;
-	long destination_timeouts = 0;
-	unsigned long index;
-	unsigned long mmr_offset;
-	unsigned long descriptor_status;
+	int locals = 0;
 	struct bau_activation_descriptor *bau_desc;
-	ktime_t time1, time2;
 
 	cpu = uv_blade_processor_id();
 	this_blade = uv_numa_blade_id();
 	bau_desc = __get_cpu_var(bau_control).descriptor_base;
-	bau_desc += (UV_ITEMS_PER_DESCRIPTOR * cpu);
+	bau_desc += UV_ITEMS_PER_DESCRIPTOR * cpu;
 
 	bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
 
@@ -196,96 +317,29 @@ uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm, unsigned long va)
 		blade = uv_cpu_to_blade_id(bit);
 		if (blade > (UV_DISTRIBUTION_SIZE - 1))
 			BUG();
-		if (blade == this_blade)
+		if (blade == this_blade) {
+			locals++;
 			continue;
+		}
 		bau_node_set(blade, &bau_desc->distribution);
-		/* leave the bits for the remote cpu's in the mask until
-		   success; on failure we fall back to the IPI method */
 		i++;
 	}
-	if (i == 0)
-		goto none_to_flush;
+	if (i == 0) {
+		/*
+		 * no off_node flushing; return status for local node
+		 */
+		if (locals)
+			return 0;
+		else
+			return 1;
+	}
 	__get_cpu_var(ptcstats).requestor++;
 	__get_cpu_var(ptcstats).ntargeted += i;
 
 	bau_desc->payload.address = va;
 	bau_desc->payload.sending_cpu = smp_processor_id();
 
-	if (cpu < UV_CPUS_PER_ACT_STATUS) {
-		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
-		right_shift = cpu * UV_ACT_STATUS_SIZE;
-	} else {
-		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
-		right_shift =
-		    ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE);
-	}
-	time1 = ktime_get();
-
-retry:
-	tries++;
-	index = ((unsigned long)
-		 1 << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | cpu;
-	uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
-
-	while ((descriptor_status = (((unsigned long)
-				      uv_read_local_mmr(mmr_offset) >>
-				      right_shift) & UV_ACT_STATUS_MASK)) !=
-	       DESC_STATUS_IDLE) {
-		if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
-			source_timeouts++;
-			if (source_timeouts > SOURCE_TIMEOUT_LIMIT)
-				source_timeouts = 0;
-			__get_cpu_var(ptcstats).s_retry++;
-			goto retry;
-		}
-		/* spin here looking for progress at the destinations */
-		if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) {
-			destination_timeouts++;
-			if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) {
-				/* returns # of cpus not responding */
-				if (uv_examine_destinations
-				    (&bau_desc->distribution) == 0) {
-					__get_cpu_var(ptcstats).d_retry++;
-					goto retry;
-				}
-				exams++;
-				if (exams >= uv_bau_retry_limit) {
-					printk(KERN_DEBUG
-					       "uv_flush_tlb_others");
-					printk("giving up on cpu %d\n",
-					       smp_processor_id());
-					goto unsuccessful;
-				}
-				/* delays can hang up the simulator
-				   udelay(1000);
-				 */
-				destination_timeouts = 0;
-			}
-		}
-	}
-	if (tries > 1)
-		__get_cpu_var(ptcstats).retriesok++;
-	/* on success, clear the remote cpu's from the mask so we don't
-	   use the IPI method of shootdown on them */
-	for_each_cpu_mask(bit, *cpumaskp) {
-		blade = uv_cpu_to_blade_id(bit);
-		if (blade == this_blade)
-			continue;
-		cpu_clear(bit, *cpumaskp);
-	}
-
-unsuccessful:
-	time2 = ktime_get();
-	__get_cpu_var(ptcstats).sflush_ns += (time2.tv64 - time1.tv64);
-
-none_to_flush:
-	if (cpus_empty(*cpumaskp))
-		return 1;
-
-	/* Cause the caller to do an IPI-style TLB shootdown on
-	   the cpu's still in the mask */
-	__get_cpu_var(ptcstats).ptc_i++;
-	return 0;
+	return uv_flush_send_and_wait(cpu, this_blade, bau_desc, cpumaskp);
 }
 
 /*
@@ -302,13 +356,12 @@ none_to_flush:
  * (the resource will not be freed until noninterruptable cpus see this
  *  interrupt; hardware will timeout the s/w ack and reply ERROR)
  */
-void
-uv_bau_message_interrupt(struct pt_regs *regs)
+void uv_bau_message_interrupt(struct pt_regs *regs)
 {
 	struct bau_payload_queue_entry *pqp;
 	struct bau_payload_queue_entry *msg;
 	struct pt_regs *old_regs = set_irq_regs(regs);
-	ktime_t time1, time2;
+	cycles_t time1, time2;
 	int msg_slot;
 	int sw_ack_slot;
 	int fw;
@@ -319,7 +372,7 @@ uv_bau_message_interrupt(struct pt_regs *regs)
 	exit_idle();
 	irq_enter();
 
-	time1 = ktime_get();
+	time1 = get_cycles();
 
 	local_pnode = uv_blade_to_pnode(uv_numa_blade_id());
 
@@ -343,16 +396,15 @@ uv_bau_message_interrupt(struct pt_regs *regs)
 	else if (count > 1)
 		__get_cpu_var(ptcstats).multmsg++;
 
-	time2 = ktime_get();
-	__get_cpu_var(ptcstats).dflush_ns += (time2.tv64 - time1.tv64);
+	time2 = get_cycles();
+	__get_cpu_var(ptcstats).dflush += (time2 - time1);
 
 	irq_exit();
 	set_irq_regs(old_regs);
 	return;
 }
 
-static void
-uv_enable_timeouts(void)
+static void uv_enable_timeouts(void)
 {
 	int i;
 	int blade;
@@ -361,7 +413,6 @@ uv_enable_timeouts(void)
 	int cur_cpu = 0;
 	unsigned long apicid;
 
-	/* better if we had each_online_blade */
 	last_blade = -1;
 	for_each_online_node(i) {
 		blade = uv_node_to_blade_id(i);
@@ -375,16 +426,14 @@ uv_enable_timeouts(void)
 	return;
 }
 
-static void *
-uv_ptc_seq_start(struct seq_file *file, loff_t *offset)
+static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset)
 {
 	if (*offset < num_possible_cpus())
 		return offset;
 	return NULL;
 }
 
-static void *
-uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
+static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
 {
 	(*offset)++;
 	if (*offset < num_possible_cpus())
@@ -392,8 +441,7 @@ uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
 	return NULL;
 }
 
-static void
-uv_ptc_seq_stop(struct seq_file *file, void *data)
+static void uv_ptc_seq_stop(struct seq_file *file, void *data)
 {
 }
 
@@ -401,8 +449,7 @@ uv_ptc_seq_stop(struct seq_file *file, void *data)
  * Display the statistics thru /proc
  * data points to the cpu number
  */
-static int
-uv_ptc_seq_show(struct seq_file *file, void *data)
+static int uv_ptc_seq_show(struct seq_file *file, void *data)
 {
 	struct ptc_stats *stat;
 	int cpu;
@@ -413,7 +460,7 @@ uv_ptc_seq_show(struct seq_file *file, void *data)
 		seq_printf(file,
 		"# cpu requestor requestee one all sretry dretry ptc_i ");
 		seq_printf(file,
-		"sw_ack sflush_us dflush_us sok dnomsg dmult starget\n");
+		"sw_ack sflush dflush sok dnomsg dmult starget\n");
 	}
 	if (cpu < num_possible_cpus() && cpu_online(cpu)) {
 		stat = &per_cpu(ptcstats, cpu);
@@ -425,7 +472,7 @@ uv_ptc_seq_show(struct seq_file *file, void *data)
 			   uv_read_global_mmr64(uv_blade_to_pnode
 					(uv_cpu_to_blade_id(cpu)),
 					UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
-			   stat->sflush_ns / 1000, stat->dflush_ns / 1000,
+			   stat->sflush, stat->dflush,
 			   stat->retriesok, stat->nomsg,
 			   stat->multmsg, stat->ntargeted);
 	}
@@ -437,8 +484,7 @@ uv_ptc_seq_show(struct seq_file *file, void *data)
  *  0: display meaning of the statistics
  * >0: retry limit
  */
-static ssize_t
-uv_ptc_proc_write(struct file *file, const char __user *user,
+static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
 		  size_t count, loff_t *data)
 {
 	long newmode;
@@ -471,9 +517,9 @@ uv_ptc_proc_write(struct file *file, const char __user *user,
 		printk(KERN_DEBUG
 		"sw_ack:     image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
 		printk(KERN_DEBUG
-		"sflush_us:  microseconds spent in uv_flush_tlb_others()\n");
+		"sflush_us:  cycles spent in uv_flush_tlb_others()\n");
 		printk(KERN_DEBUG
-		"dflush_us:  microseconds spent in handling flush requests\n");
+		"dflush_us:  cycles spent in handling flush requests\n");
 		printk(KERN_DEBUG "sok:        successes on retry\n");
 		printk(KERN_DEBUG "dnomsg:     interrupts with no message\n");
 		printk(KERN_DEBUG
@@ -489,40 +535,33 @@ uv_ptc_proc_write(struct file *file, const char __user *user,
 }
 
 static const struct seq_operations uv_ptc_seq_ops = {
-	.start = uv_ptc_seq_start,
-	.next = uv_ptc_seq_next,
-	.stop = uv_ptc_seq_stop,
-	.show = uv_ptc_seq_show
+	.start	= uv_ptc_seq_start,
+	.next	= uv_ptc_seq_next,
+	.stop	= uv_ptc_seq_stop,
+	.show	= uv_ptc_seq_show
 };
 
-static int
-uv_ptc_proc_open(struct inode *inode, struct file *file)
+static int uv_ptc_proc_open(struct inode *inode, struct file *file)
 {
 	return seq_open(file, &uv_ptc_seq_ops);
 }
 
 static const struct file_operations proc_uv_ptc_operations = {
-	.open = uv_ptc_proc_open,
-	.read = seq_read,
-	.write = uv_ptc_proc_write,
-	.llseek = seq_lseek,
-	.release = seq_release,
+	.open		= uv_ptc_proc_open,
+	.read		= seq_read,
+	.write		= uv_ptc_proc_write,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
 };
 
-static struct proc_dir_entry *proc_uv_ptc;
-
-static int __init
-uv_ptc_init(void)
+static int __init uv_ptc_init(void)
 {
-	static struct proc_dir_entry *sgi_proc_dir;
-
-	sgi_proc_dir = NULL;
+	struct proc_dir_entry *proc_uv_ptc;
 
 	if (!is_uv_system())
 		return 0;
 
-	sgi_proc_dir = proc_mkdir("sgi_uv", NULL);
-	if (!sgi_proc_dir)
+	if (!proc_mkdir("sgi_uv", NULL))
 		return -EINVAL;
 
 	proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL);
@@ -535,202 +574,213 @@ uv_ptc_init(void)
 	return 0;
 }
 
-static void __exit
-uv_ptc_exit(void)
+/*
+ * begin the initialization of the per-blade control structures
+ */
+static struct bau_control * __init uv_table_bases_init(int blade, int node)
 {
-	remove_proc_entry(UV_PTC_BASENAME, NULL);
+	int i;
+	int *ip;
+	struct bau_msg_status *msp;
+	struct bau_control *bau_tablesp;
+
+	bau_tablesp =
+	    kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node);
+	if (!bau_tablesp)
+		BUG();
+	bau_tablesp->msg_statuses =
+	    kmalloc_node(sizeof(struct bau_msg_status) *
+			 DESTINATION_PAYLOAD_QUEUE_SIZE, GFP_KERNEL, node);
+	if (!bau_tablesp->msg_statuses)
+		BUG();
+	for (i = 0, msp = bau_tablesp->msg_statuses;
+	     i < DESTINATION_PAYLOAD_QUEUE_SIZE; i++, msp++) {
+		bau_cpubits_clear(&msp->seen_by, (int)
+				  uv_blade_nr_possible_cpus(blade));
+	}
+	bau_tablesp->watching =
+	    kmalloc_node(sizeof(int) * DESTINATION_NUM_RESOURCES,
+			 GFP_KERNEL, node);
+	if (!bau_tablesp->watching)
+		BUG();
+	for (i = 0, ip = bau_tablesp->watching;
+	     i < DESTINATION_PAYLOAD_QUEUE_SIZE; i++, ip++) {
+		*ip = 0;
+	}
+	uv_bau_table_bases[blade] = bau_tablesp;
+	return bau_tablesp;
 }
 
-module_init(uv_ptc_init);
-module_exit(uv_ptc_exit);
+/*
+ * finish the initialization of the per-blade control structures
+ */
+static void __init uv_table_bases_finish(int blade, int node, int cur_cpu,
+				  struct bau_control *bau_tablesp,
+				  struct bau_activation_descriptor *adp)
+{
+	int i;
+	struct bau_control *bcp;
+
+	for (i = cur_cpu; i < (cur_cpu + uv_blade_nr_possible_cpus(blade));
+	     i++) {
+		bcp = (struct bau_control *)&per_cpu(bau_control, i);
+		bcp->bau_msg_head = bau_tablesp->va_queue_first;
+		bcp->va_queue_first = bau_tablesp->va_queue_first;
+		bcp->va_queue_last = bau_tablesp->va_queue_last;
+		bcp->watching = bau_tablesp->watching;
+		bcp->msg_statuses = bau_tablesp->msg_statuses;
+		bcp->descriptor_base = adp;
+	}
+}
 
 /*
- * Initialization of BAU-related structures
+ * initialize the sending side's sending buffers
  */
-int __init
-uv_bau_init(void)
+static struct bau_activation_descriptor * __init
+uv_activation_descriptor_init(int node, int pnode)
 {
 	int i;
-	int j;
-	int blade;
-	int nblades;
-	int *ip;
-	int pnode;
-	int last_blade;
-	int cur_cpu = 0;
 	unsigned long pa;
-	unsigned long n;
 	unsigned long m;
+	unsigned long n;
 	unsigned long mmr_image;
-	unsigned long apicid;
+	struct bau_activation_descriptor *adp;
+	struct bau_activation_descriptor *ad2;
+
+	adp = (struct bau_activation_descriptor *)
+	    kmalloc_node(16384, GFP_KERNEL, node);
+	if (!adp)
+		BUG();
+	pa = __pa((unsigned long)adp);
+	n = pa >> uv_nshift;
+	m = pa & uv_mmask;
+	mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE);
+	if (mmr_image)
+		uv_write_global_mmr64(pnode, (unsigned long)
+				      UVH_LB_BAU_SB_DESCRIPTOR_BASE,
+				      (n << UV_DESC_BASE_PNODE_SHIFT | m));
+	for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) {
+		memset(ad2, 0, sizeof(struct bau_activation_descriptor));
+		ad2->header.sw_ack_flag = 1;
+		ad2->header.base_dest_nodeid =
+		    uv_blade_to_pnode(uv_cpu_to_blade_id(0));
+		ad2->header.command = UV_NET_ENDPOINT_INTD;
+		ad2->header.int_both = 1;
+		/*
+		 * all others need to be set to zero:
+		 *   fairness chaining multilevel count replied_to
+		 */
+	}
+	return adp;
+}
+
+/*
+ * initialize the destination side's receiving buffers
+ */
+static struct bau_payload_queue_entry * __init uv_payload_queue_init(int node,
+				int pnode, struct bau_control *bau_tablesp)
+{
 	char *cp;
-	struct bau_control *bau_tablesp;
-	struct bau_activation_descriptor *adp, *ad2;
 	struct bau_payload_queue_entry *pqp;
-	struct bau_msg_status *msp;
-	struct bau_control *bcp;
 
-	if (!is_uv_system())
-		return 0;
+	pqp = (struct bau_payload_queue_entry *)
+	    kmalloc_node((DESTINATION_PAYLOAD_QUEUE_SIZE + 1) *
+			 sizeof(struct bau_payload_queue_entry),
+			 GFP_KERNEL, node);
+	if (!pqp)
+		BUG();
+	cp = (char *)pqp + 31;
+	pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
+	bau_tablesp->va_queue_first = pqp;
+	uv_write_global_mmr64(pnode,
+			      UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
+			      ((unsigned long)pnode <<
+			       UV_PAYLOADQ_PNODE_SHIFT) |
+			      uv_physnodeaddr(pqp));
+	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
+			      uv_physnodeaddr(pqp));
+	bau_tablesp->va_queue_last =
+	    pqp + (DESTINATION_PAYLOAD_QUEUE_SIZE - 1);
+	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
+			      (unsigned long)
+			      uv_physnodeaddr(bau_tablesp->va_queue_last));
+	memset(pqp, 0, sizeof(struct bau_payload_queue_entry) *
+	       DESTINATION_PAYLOAD_QUEUE_SIZE);
+	return pqp;
+}
 
-	uv_bau_retry_limit = 1;
+/*
+ * Initialization of each UV blade's structures
+ */
+static int __init uv_init_blade(int blade, int node, int cur_cpu)
+{
+	int pnode;
+	unsigned long pa;
+	unsigned long apicid;
+	struct bau_activation_descriptor *adp;
+	struct bau_payload_queue_entry *pqp;
+	struct bau_control *bau_tablesp;
 
-	if ((sizeof(struct bau_local_cpumask) * BITSPERBYTE) <
-	    MAX_CPUS_PER_NODE) {
-		printk(KERN_ERR
-			"uv_bau_init: bau_local_cpumask.bits too small\n");
-		BUG();
+	bau_tablesp = uv_table_bases_init(blade, node);
+	pnode = uv_blade_to_pnode(blade);
+	adp = uv_activation_descriptor_init(node, pnode);
+	pqp = uv_payload_queue_init(node, pnode, bau_tablesp);
+	uv_table_bases_finish(blade, node, cur_cpu, bau_tablesp, adp);
+	/*
+	 * the below initialization can't be in firmware because the
+	 * messaging IRQ will be determined by the OS
+	 */
+	apicid = per_cpu(x86_cpu_to_apicid, cur_cpu);
+	pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
+	if ((pa & 0xff) != UV_BAU_MESSAGE) {
+		uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
+				      ((apicid << 32) | UV_BAU_MESSAGE));
 	}
+	return 0;
+}
+
+/*
+ * Initialization of BAU-related structures
+ */
+static int __init uv_bau_init(void)
+{
+	int blade;
+	int node;
+	int nblades;
+	int last_blade;
+	int cur_cpu = 0;
+
+	if (!is_uv_system())
+		return 0;
 
+	uv_bau_retry_limit = 1;
 	uv_nshift = uv_hub_info->n_val;
 	uv_mmask = ((unsigned long)1 << uv_hub_info->n_val) - 1;
 	nblades = 0;
 	last_blade = -1;
-	for_each_online_node(i) {
-		blade = uv_node_to_blade_id(i);
+	for_each_online_node(node) {
+		blade = uv_node_to_blade_id(node);
 		if (blade == last_blade)
 			continue;
 		last_blade = blade;
 		nblades++;
 	}
-
 	uv_bau_table_bases = (struct bau_control **)
 	    kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
 	if (!uv_bau_table_bases)
 		BUG();
-
-	/* better if we had each_online_blade */
 	last_blade = -1;
-	for_each_online_node(i) {
-		blade = uv_node_to_blade_id(i);
+	for_each_online_node(node) {
+		blade = uv_node_to_blade_id(node);
 		if (blade == last_blade)
 			continue;
 		last_blade = blade;
-
-		bau_tablesp =
-		    kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, i);
-		if (!bau_tablesp)
-			BUG();
-
-		bau_tablesp->msg_statuses =
-		    kmalloc_node(sizeof(struct bau_msg_status) *
-				 DESTINATION_PAYLOAD_QUEUE_SIZE, GFP_KERNEL, i);
-		if (!bau_tablesp->msg_statuses)
-			BUG();
-		for (j = 0, msp = bau_tablesp->msg_statuses;
-		     j < DESTINATION_PAYLOAD_QUEUE_SIZE; j++, msp++) {
-			bau_cpubits_clear(&msp->seen_by, (int)
-					  uv_blade_nr_possible_cpus(blade));
-		}
-
-		bau_tablesp->watching =
-		    kmalloc_node(sizeof(int) * DESTINATION_NUM_RESOURCES,
-				 GFP_KERNEL, i);
-		if (!bau_tablesp->watching)
-			BUG();
-		for (j = 0, ip = bau_tablesp->watching;
-		     j < DESTINATION_PAYLOAD_QUEUE_SIZE; j++, ip++) {
-			*ip = 0;
-		}
-
-		uv_bau_table_bases[i] = bau_tablesp;
-
-		pnode = uv_blade_to_pnode(blade);
-
-		if (sizeof(struct bau_activation_descriptor) != 64)
-			BUG();
-
-		adp = (struct bau_activation_descriptor *)
-		    kmalloc_node(16384, GFP_KERNEL, i);
-		if (!adp)
-			BUG();
-		if ((unsigned long)adp & 0xfff)
-			BUG();
-		pa = __pa((unsigned long)adp);
-		n = pa >> uv_nshift;
-		m = pa & uv_mmask;
-
-		mmr_image = uv_read_global_mmr64(pnode,
-						 UVH_LB_BAU_SB_DESCRIPTOR_BASE);
-		if (mmr_image)
-			uv_write_global_mmr64(pnode, (unsigned long)
-					      UVH_LB_BAU_SB_DESCRIPTOR_BASE,
-					      (n << UV_DESC_BASE_PNODE_SHIFT |
-					       m));
-		for (j = 0, ad2 = adp; j < UV_ACTIVATION_DESCRIPTOR_SIZE;
-		     j++, ad2++) {
-			memset(ad2, 0,
-			       sizeof(struct bau_activation_descriptor));
-			ad2->header.sw_ack_flag = 1;
-			ad2->header.base_dest_nodeid =
-			    uv_blade_to_pnode(uv_cpu_to_blade_id(0));
-			ad2->header.command = UV_NET_ENDPOINT_INTD;
-			ad2->header.int_both = 1;
-			/* all others need to be set to zero:
-			   fairness chaining multilevel count replied_to */
-		}
-
-		pqp = (struct bau_payload_queue_entry *)
-		    kmalloc_node((DESTINATION_PAYLOAD_QUEUE_SIZE + 1) *
-				 sizeof(struct bau_payload_queue_entry),
-				 GFP_KERNEL, i);
-		if (!pqp)
-			BUG();
-		if (sizeof(struct bau_payload_queue_entry) != 32)
-			BUG();
-		if ((unsigned long)(&((struct bau_payload_queue_entry *)0)->
-				    sw_ack_vector) != 15)
-			BUG();
-
-		cp = (char *)pqp + 31;
-		pqp = (struct bau_payload_queue_entry *)
-		    (((unsigned long)cp >> 5) << 5);
-		bau_tablesp->va_queue_first = pqp;
-		uv_write_global_mmr64(pnode,
-				      UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
-				      ((unsigned long)pnode <<
-				       UV_PAYLOADQ_PNODE_SHIFT) |
-				      uv_physnodeaddr(pqp));
-		uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
-				      uv_physnodeaddr(pqp));
-		bau_tablesp->va_queue_last =
-		    pqp + (DESTINATION_PAYLOAD_QUEUE_SIZE - 1);
-		uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
-				      (unsigned long)
-				      uv_physnodeaddr(bau_tablesp->
-						      va_queue_last));
-		memset(pqp, 0, sizeof(struct bau_payload_queue_entry) *
-		       DESTINATION_PAYLOAD_QUEUE_SIZE);
-
-		/* this initialization can't be in firmware because the
-		   messaging IRQ will be determined by the OS */
-		apicid = per_cpu(x86_cpu_to_apicid, cur_cpu);
-		pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
-		if ((pa & 0xff) != UV_BAU_MESSAGE) {
-			uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
-					      ((apicid << 32) |
-					       UV_BAU_MESSAGE));
-		}
-
-		for (j = cur_cpu; j < (cur_cpu + uv_blade_nr_possible_cpus(i));
-		     j++) {
-			bcp = (struct bau_control *)&per_cpu(bau_control, j);
-			bcp->bau_msg_head = bau_tablesp->va_queue_first;
-			bcp->va_queue_first = bau_tablesp->va_queue_first;
-
-			bcp->va_queue_last = bau_tablesp->va_queue_last;
-			bcp->watching = bau_tablesp->watching;
-			bcp->msg_statuses = bau_tablesp->msg_statuses;
-			bcp->descriptor_base = adp;
-		}
-		cur_cpu += uv_blade_nr_possible_cpus(i);
+		uv_init_blade(blade, node, cur_cpu);
+		cur_cpu += uv_blade_nr_possible_cpus(blade);
 	}
-
 	set_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
-
 	uv_enable_timeouts();
-
 	return 0;
 }
-
 __initcall(uv_bau_init);
+__initcall(uv_ptc_init);
diff --git a/include/asm-x86/uv/uv_bau.h b/include/asm-x86/uv/uv_bau.h
index f125f86c89ac..e52fec822667 100644
--- a/include/asm-x86/uv/uv_bau.h
+++ b/include/asm-x86/uv/uv_bau.h
@@ -14,9 +14,9 @@
 #include <linux/bitmap.h>
 #define BITSPERBYTE 8
 
-/* Broadcast Assist Unit messaging structures */
-
 /*
+ * Broadcast Assist Unit messaging structures
+ *
  * Selective Broadcast activations are induced by software action
  * specifying a particular 8-descriptor "set" via a 6-bit index written
  * to an MMR.
@@ -33,54 +33,73 @@
  * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set).
  */
 
-#define UV_ITEMS_PER_DESCRIPTOR 8
-#define UV_CPUS_PER_ACT_STATUS 32
-#define UV_ACT_STATUS_MASK 0x3
-#define UV_ACT_STATUS_SIZE 2
-#define UV_ACTIVATION_DESCRIPTOR_SIZE 32
-#define UV_DISTRIBUTION_SIZE 256
-#define UV_SW_ACK_NPENDING 8
-#define UV_BAU_MESSAGE 200	/* Messaging irq; see irq_64.h */
-				/* and include/asm-x86/hw_irq_64.h */
-				/* To be dynamically allocated in the future */
-#define UV_NET_ENDPOINT_INTD 0x38
-#define UV_DESC_BASE_PNODE_SHIFT 49 /* position of pnode (nasid>>1) in MMR */
-#define UV_PAYLOADQ_PNODE_SHIFT 49
-
-#define UV_PTC_BASENAME "sgi_uv/ptc_statistics"
-#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask))
-
-/* bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1 */
+#define UV_ITEMS_PER_DESCRIPTOR		8
+#define UV_CPUS_PER_ACT_STATUS		32
+#define UV_ACT_STATUS_MASK		0x3
+#define UV_ACT_STATUS_SIZE		2
+#define UV_ACTIVATION_DESCRIPTOR_SIZE	32
+#define UV_DISTRIBUTION_SIZE		256
+#define UV_SW_ACK_NPENDING		8
+#define UV_BAU_MESSAGE			200
+/*
+ * Messaging irq; see irq_64.h and include/asm-x86/hw_irq_64.h
+ * To be dynamically allocated in the future
+ */
+#define UV_NET_ENDPOINT_INTD		0x38
+#define UV_DESC_BASE_PNODE_SHIFT	49
+#define UV_PAYLOADQ_PNODE_SHIFT		49
+#define UV_PTC_BASENAME			"sgi_uv/ptc_statistics"
+#define uv_physnodeaddr(x)		((__pa((unsigned long)(x)) & uv_mmask))
+
+/*
+ * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1
+ */
 #define DESC_STATUS_IDLE                0
 #define DESC_STATUS_ACTIVE              1
 #define DESC_STATUS_DESTINATION_TIMEOUT 2
 #define DESC_STATUS_SOURCE_TIMEOUT      3
 
-/* source side threshholds at which message retries print a warning */
+/*
+ * source side threshholds at which message retries print a warning
+ */
 #define SOURCE_TIMEOUT_LIMIT            20
 #define DESTINATION_TIMEOUT_LIMIT       20
 
-/* number of entries in the destination side payload queue */
+/*
+ * number of entries in the destination side payload queue
+ */
 #define DESTINATION_PAYLOAD_QUEUE_SIZE  17
-/* number of destination side software ack resources */
+/*
+ * number of destination side software ack resources
+ */
 #define DESTINATION_NUM_RESOURCES       8
 #define MAX_CPUS_PER_NODE		32
+/*
+ * completion statuses for sending a TLB flush message
+ */
+#define	FLUSH_RETRY			1
+#define	FLUSH_GIVEUP			2
+#define	FLUSH_COMPLETE			3
 
-/* Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor) */
-/* If the 'multilevel' flag in the header portion of the descriptor
+/*
+ * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
+ * If the 'multilevel' flag in the header portion of the descriptor
  * has been set to 0, then endpoint multi-unicast mode is selected.
  * The distribution specification (32 bytes) is interpreted as a 256-bit
  * distribution vector. Adjacent bits correspond to consecutive even numbered
  * nodeIDs. The result of adding the index of a given bit to the 15-bit
  * 'base_dest_nodeid' field of the header corresponds to the
- * destination nodeID associated with that specified bit.  */
+ * destination nodeID associated with that specified bit.
+ */
 struct bau_target_nodemask {
 	unsigned long bits[BITS_TO_LONGS(256)];
 };
 
-/* mask of cpu's on a node */
-/* (during initialization we need to check that unsigned long has
-    enough bits for max. cpu's per node) */
+/*
+ * mask of cpu's on a node
+ * (during initialization we need to check that unsigned long has
+ *  enough bits for max. cpu's per node)
+ */
 struct bau_local_cpumask {
 	unsigned long bits;
 };
@@ -99,7 +118,9 @@ struct bau_local_cpumask {
  *   the s/w ack bit vector  ]
  */
 
-/* The payload is software-defined for INTD transactions */
+/*
+ * The payload is software-defined for INTD transactions
+ */
 struct bau_msg_payload {
 	unsigned long address;		/* signifies a page or all TLB's
 						of the cpu */
@@ -112,8 +133,10 @@ struct bau_msg_payload {
 };
 
 
-/* Message header:  16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) */
-/* see table 4.2.3.0.1 in broacast_assist spec. */
+/*
+ * Message header:  16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
+ * see table 4.2.3.0.1 in broacast_assist spec.
+ */
 struct bau_msg_header {
 	int dest_subnodeid:6;	/* must be zero */
 	/* bits 5:0 */
@@ -173,11 +196,15 @@ struct bau_msg_header {
 	/* bits 127:107 */
 };
 
-/* The format of the message to send, plus all accompanying control */
-/* Should be 64 bytes */
+/*
+ * The format of the message to send, plus all accompanying control
+ * Should be 64 bytes
+ */
 struct bau_activation_descriptor {
 	struct bau_target_nodemask distribution;
-	/* message template, consisting of header and payload: */
+	/*
+	 * message template, consisting of header and payload:
+	 */
 	struct bau_msg_header header;
 	struct bau_msg_payload payload;
 };
@@ -235,18 +262,24 @@ struct bau_payload_queue_entry {
 	/* bytes 24-31 */
 };
 
-/* one for every slot in the destination payload queue */
+/*
+ * one for every slot in the destination payload queue
+ */
 struct bau_msg_status {
 	struct bau_local_cpumask seen_by;	/* map of cpu's */
 };
 
-/* one for every slot in the destination software ack resources */
+/*
+ * one for every slot in the destination software ack resources
+ */
 struct bau_sw_ack_status {
 	struct bau_payload_queue_entry *msg;	/* associated message */
 	int watcher;				/* cpu monitoring, or -1 */
 };
 
-/* one on every node and per-cpu; to locate the software tables */
+/*
+ * one on every node and per-cpu; to locate the software tables
+ */
 struct bau_control {
 	struct bau_activation_descriptor *descriptor_base;
 	struct bau_payload_queue_entry *bau_msg_head;
@@ -267,8 +300,8 @@ struct ptc_stats {
 	unsigned long onetlb;	/* times just one tlb on this cpu was flushed */
 	unsigned long s_retry;	/* retries on source side timeouts */
 	unsigned long d_retry;	/* retries on destination side timeouts */
-	unsigned long sflush_ns;/* nanoseconds spent in uv_flush_tlb_others */
-	unsigned long dflush_ns;/* nanoseconds spent destination side */
+	unsigned long sflush;	/* cycles spent in uv_flush_tlb_others */
+	unsigned long dflush;	/* cycles spent on destination side */
 	unsigned long retriesok; /* successes on retries */
 	unsigned long nomsg;	/* interrupts with no message */
 	unsigned long multmsg;	/* interrupts with multiple messages */
@@ -293,39 +326,11 @@ static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
 	bitmap_zero(&dstp->bits, nbits);
 }
 
-/*
- * atomic increment of a short integer
- * (rather than using the __sync_add_and_fetch() intrinsic)
- *
- * returns the new value of the variable
- */
-static inline short int atomic_inc_short(short int *v)
-{
-	asm volatile("movw $1, %%cx\n"
-			"lock ; xaddw %%cx, %0\n"
-			: "+m" (*v)		/* outputs */
-			: : "%cx", "memory");	/* inputs : clobbereds */
-	return *v;
-}
-
-/*
- * atomic OR of two long integers
- * (rather than using the __sync_or_and_fetch() intrinsic)
- */
-static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
-{
-	asm volatile("movq %0, %%rax; lea %1, %%rdx\n"
-			"lock ; orq %%rax, %%rdx\n"
-			: "+m" (*v1)		/* outputs */
-			: "m" (v1), "m" (v2)	/* inputs */
-			: "memory");		/* clobbereds */
-}
-
 #define cpubit_isset(cpu, bau_local_cpumask) \
 	test_bit((cpu), (bau_local_cpumask).bits)
 
-int uv_flush_tlb_others(cpumask_t *, struct mm_struct *, unsigned long);
-void uv_bau_message_intr1(void);
-void uv_bau_timeout_intr1(void);
+extern int uv_flush_tlb_others(cpumask_t *, struct mm_struct *, unsigned long);
+extern void uv_bau_message_intr1(void);
+extern void uv_bau_timeout_intr1(void);
 
 #endif /* __ASM_X86_UV_BAU__ */
-- 
cgit v1.2.3


From dc163a41ffba22a6ef70b51e7ddf68aa13b4b414 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 18 Jun 2008 14:15:43 +0200
Subject: SGI UV: TLB shootdown using broadcast assist unit

TLB shootdown for SGI UV.

v5: 6/12 corrections/improvements per Ingo's second review

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tlb_uv.c    | 183 ++++++++++++++++++++------------------------
 include/asm-x86/uv/uv_bau.h |  21 ++---
 2 files changed, 96 insertions(+), 108 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index f7bc6a6fbe49..d8705e97e8d0 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -25,15 +25,8 @@ static int uv_bau_retry_limit __read_mostly;
 static int uv_nshift __read_mostly; /* position of pnode (which is nasid>>1) */
 static unsigned long uv_mmask __read_mostly;
 
-char *status_table[] = {
-	"IDLE",
-	"ACTIVE",
-	"DESTINATION TIMEOUT",
-	"SOURCE TIMEOUT"
-};
-
-DEFINE_PER_CPU(struct ptc_stats, ptcstats);
-DEFINE_PER_CPU(struct bau_control, bau_control);
+static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
+static DEFINE_PER_CPU(struct bau_control, bau_control);
 
 /*
  * Free a software acknowledge hardware resource by clearing its Pending
@@ -55,7 +48,6 @@ static void uv_reply_to_message(int resource,
 	if (msp)
 		msp->seen_by.bits = 0;
 	uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
-	return;
 }
 
 /*
@@ -73,7 +65,7 @@ static void uv_bau_process_message(struct bau_payload_queue_entry *msg,
 	cpu = uv_blade_processor_id();
 	msg->number_of_cpus =
 	    uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id()));
-	this_cpu_mask = (unsigned long)1 << cpu;
+	this_cpu_mask = 1UL << cpu;
 	if (msp->seen_by.bits & this_cpu_mask)
 		return;
 	atomic_or_long(&msp->seen_by.bits, this_cpu_mask);
@@ -94,53 +86,60 @@ static void uv_bau_process_message(struct bau_payload_queue_entry *msg,
 	atomic_inc_short(&msg->acknowledge_count);
 	if (msg->number_of_cpus == msg->acknowledge_count)
 		uv_reply_to_message(sw_ack_slot, msg, msp);
-	return;
 }
 
 /*
- * Examine the payload queue on all the distribution nodes to see
+ * Examine the payload queue on one distribution node to see
  * which messages have not been seen, and which cpu(s) have not seen them.
  *
  * Returns the number of cpu's that have not responded.
  */
-static int uv_examine_destinations(struct bau_target_nodemask *distribution)
+static int uv_examine_destination(struct bau_control *bau_tablesp, int sender)
 {
-	int sender;
 	int i;
 	int j;
-	int k;
 	int count = 0;
-	struct bau_control *bau_tablesp;
 	struct bau_payload_queue_entry *msg;
 	struct bau_msg_status *msp;
 
+	for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE;
+	     msg++, i++) {
+		if ((msg->sending_cpu == sender) && (!msg->replied_to)) {
+			msp = bau_tablesp->msg_statuses + i;
+			printk(KERN_DEBUG
+			       "blade %d: address:%#lx %d of %d, not cpu(s): ",
+			       i, msg->address, msg->acknowledge_count,
+			       msg->number_of_cpus);
+			for (j = 0; j < msg->number_of_cpus; j++) {
+				if (!((long)1 << j & msp-> seen_by.bits)) {
+					count++;
+					printk("%d ", j);
+				}
+			}
+			printk("\n");
+		}
+	}
+	return count;
+}
+
+/*
+ * Examine the payload queue on all the distribution nodes to see
+ * which messages have not been seen, and which cpu(s) have not seen them.
+ *
+ * Returns the number of cpu's that have not responded.
+ */
+static int uv_examine_destinations(struct bau_target_nodemask *distribution)
+{
+	int sender;
+	int i;
+	int count = 0;
+
 	sender = smp_processor_id();
 	for (i = 0; i < (sizeof(struct bau_target_nodemask) * BITSPERBYTE);
 	     i++) {
 		if (!bau_node_isset(i, distribution))
 			continue;
-		bau_tablesp = uv_bau_table_bases[i];
-		for (msg = bau_tablesp->va_queue_first, j = 0;
-		     j < DESTINATION_PAYLOAD_QUEUE_SIZE; msg++, j++) {
-			if ((msg->sending_cpu == sender) &&
-			    (!msg->replied_to)) {
-				msp = bau_tablesp->msg_statuses + j;
-				printk(KERN_DEBUG
-				"blade %d: address:%#lx %d of %d, not cpu(s): ",
-				       i, msg->address,
-				       msg->acknowledge_count,
-				       msg->number_of_cpus);
-				for (k = 0; k < msg->number_of_cpus;
-				     k++) {
-					if (!((long)1 << k & msp->
-					      seen_by.bits)) {
-						count++;
-						printk("%d ", k);
-					}
-				}
-				printk("\n");
-			}
-		}
+		count += uv_examine_destination(uv_bau_table_bases[i], sender);
 	}
 	return count;
 }
@@ -150,7 +149,7 @@ static int uv_examine_destinations(struct bau_target_nodemask *distribution)
  *
  * return COMPLETE, RETRY or GIVEUP
  */
-static int uv_wait_completion(struct bau_activation_descriptor *bau_desc,
+static int uv_wait_completion(struct bau_desc *bau_desc,
 			      unsigned long mmr_offset, int right_shift)
 {
 	int exams = 0;
@@ -213,8 +212,8 @@ static int uv_wait_completion(struct bau_activation_descriptor *bau_desc,
  * Returns 0 if some remote flushing remains to be done. The mask is left
  * unchanged.
  */
-int uv_flush_send_and_wait(int cpu, int this_blade,
-	struct bau_activation_descriptor *bau_desc, cpumask_t *cpumaskp)
+int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc,
+			   cpumask_t *cpumaskp)
 {
 	int completion_status = 0;
 	int right_shift;
@@ -237,8 +236,8 @@ int uv_flush_send_and_wait(int cpu, int this_blade,
 	time1 = get_cycles();
 	do {
 		tries++;
-		index = ((unsigned long)
-			1 << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | cpu;
+		index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
+			cpu;
 		uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
 		completion_status = uv_wait_completion(bau_desc, mmr_offset,
 					right_shift);
@@ -303,7 +302,7 @@ int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
 	int cpu;
 	int this_blade;
 	int locals = 0;
-	struct bau_activation_descriptor *bau_desc;
+	struct bau_desc *bau_desc;
 
 	cpu = uv_blade_processor_id();
 	this_blade = uv_numa_blade_id();
@@ -315,8 +314,7 @@ int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
 	i = 0;
 	for_each_cpu_mask(bit, *cpumaskp) {
 		blade = uv_cpu_to_blade_id(bit);
-		if (blade > (UV_DISTRIBUTION_SIZE - 1))
-			BUG();
+		BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1));
 		if (blade == this_blade) {
 			locals++;
 			continue;
@@ -360,6 +358,8 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
 {
 	struct bau_payload_queue_entry *pqp;
 	struct bau_payload_queue_entry *msg;
+	struct bau_payload_queue_entry *va_queue_first;
+	struct bau_payload_queue_entry *va_queue_last;
 	struct pt_regs *old_regs = set_irq_regs(regs);
 	cycles_t time1, time2;
 	int msg_slot;
@@ -376,7 +376,8 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
 
 	local_pnode = uv_blade_to_pnode(uv_numa_blade_id());
 
-	pqp = __get_cpu_var(bau_control).va_queue_first;
+	pqp = va_queue_first = __get_cpu_var(bau_control).va_queue_first;
+	va_queue_last = __get_cpu_var(bau_control).va_queue_last;
 	msg = __get_cpu_var(bau_control).bau_msg_head;
 	while (msg->sw_ack_vector) {
 		count++;
@@ -387,8 +388,8 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
 		uv_bau_process_message(msg, msg_slot, sw_ack_slot);
 
 		msg++;
-		if (msg > __get_cpu_var(bau_control).va_queue_last)
-			msg = __get_cpu_var(bau_control).va_queue_first;
+		if (msg > va_queue_last)
+			msg = va_queue_first;
 		__get_cpu_var(bau_control).bau_msg_head = msg;
 	}
 	if (!count)
@@ -401,7 +402,6 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
 
 	irq_exit();
 	set_irq_regs(old_regs);
-	return;
 }
 
 static void uv_enable_timeouts(void)
@@ -423,7 +423,6 @@ static void uv_enable_timeouts(void)
 		pnode = uv_blade_to_pnode(blade);
 		cur_cpu += uv_blade_nr_possible_cpus(i);
 	}
-	return;
 }
 
 static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset)
@@ -535,10 +534,10 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
 }
 
 static const struct seq_operations uv_ptc_seq_ops = {
-	.start	= uv_ptc_seq_start,
-	.next	= uv_ptc_seq_next,
-	.stop	= uv_ptc_seq_stop,
-	.show	= uv_ptc_seq_show
+	.start		= uv_ptc_seq_start,
+	.next		= uv_ptc_seq_next,
+	.stop		= uv_ptc_seq_stop,
+	.show		= uv_ptc_seq_show
 };
 
 static int uv_ptc_proc_open(struct inode *inode, struct file *file)
@@ -568,6 +567,7 @@ static int __init uv_ptc_init(void)
 	if (!proc_uv_ptc) {
 		printk(KERN_ERR "unable to create %s proc entry\n",
 		       UV_PTC_BASENAME);
+		remove_proc_entry("sgi_uv", NULL);
 		return -EINVAL;
 	}
 	proc_uv_ptc->proc_fops = &proc_uv_ptc_operations;
@@ -582,33 +582,26 @@ static struct bau_control * __init uv_table_bases_init(int blade, int node)
 	int i;
 	int *ip;
 	struct bau_msg_status *msp;
-	struct bau_control *bau_tablesp;
+	struct bau_control *bau_tabp;
 
-	bau_tablesp =
+	bau_tabp =
 	    kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node);
-	if (!bau_tablesp)
-		BUG();
-	bau_tablesp->msg_statuses =
+	BUG_ON(!bau_tabp);
+	bau_tabp->msg_statuses =
 	    kmalloc_node(sizeof(struct bau_msg_status) *
-			 DESTINATION_PAYLOAD_QUEUE_SIZE, GFP_KERNEL, node);
-	if (!bau_tablesp->msg_statuses)
-		BUG();
-	for (i = 0, msp = bau_tablesp->msg_statuses;
-	     i < DESTINATION_PAYLOAD_QUEUE_SIZE; i++, msp++) {
+			 DEST_Q_SIZE, GFP_KERNEL, node);
+	BUG_ON(!bau_tabp->msg_statuses);
+	for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++)
 		bau_cpubits_clear(&msp->seen_by, (int)
 				  uv_blade_nr_possible_cpus(blade));
-	}
-	bau_tablesp->watching =
-	    kmalloc_node(sizeof(int) * DESTINATION_NUM_RESOURCES,
-			 GFP_KERNEL, node);
-	if (!bau_tablesp->watching)
-		BUG();
-	for (i = 0, ip = bau_tablesp->watching;
-	     i < DESTINATION_PAYLOAD_QUEUE_SIZE; i++, ip++) {
+	bau_tabp->watching =
+	    kmalloc_node(sizeof(int) * DEST_NUM_RESOURCES, GFP_KERNEL, node);
+	BUG_ON(!bau_tabp->watching);
+	for (i = 0, ip = bau_tabp->watching; i < DEST_Q_SIZE; i++, ip++) {
 		*ip = 0;
 	}
-	uv_bau_table_bases[blade] = bau_tablesp;
-	return bau_tablesp;
+	uv_bau_table_bases[blade] = bau_tabp;
+	return bau_tabsp;
 }
 
 /*
@@ -616,7 +609,7 @@ static struct bau_control * __init uv_table_bases_init(int blade, int node)
  */
 static void __init uv_table_bases_finish(int blade, int node, int cur_cpu,
 				  struct bau_control *bau_tablesp,
-				  struct bau_activation_descriptor *adp)
+				  struct bau_desc *adp)
 {
 	int i;
 	struct bau_control *bcp;
@@ -636,7 +629,7 @@ static void __init uv_table_bases_finish(int blade, int node, int cur_cpu,
 /*
  * initialize the sending side's sending buffers
  */
-static struct bau_activation_descriptor * __init
+static struct bau_desc * __init
 uv_activation_descriptor_init(int node, int pnode)
 {
 	int i;
@@ -644,13 +637,12 @@ uv_activation_descriptor_init(int node, int pnode)
 	unsigned long m;
 	unsigned long n;
 	unsigned long mmr_image;
-	struct bau_activation_descriptor *adp;
-	struct bau_activation_descriptor *ad2;
+	struct bau_desc *adp;
+	struct bau_desc *ad2;
 
-	adp = (struct bau_activation_descriptor *)
+	adp = (struct bau_desc *)
 	    kmalloc_node(16384, GFP_KERNEL, node);
-	if (!adp)
-		BUG();
+	BUG_ON(!adp);
 	pa = __pa((unsigned long)adp);
 	n = pa >> uv_nshift;
 	m = pa & uv_mmask;
@@ -660,7 +652,7 @@ uv_activation_descriptor_init(int node, int pnode)
 				      UVH_LB_BAU_SB_DESCRIPTOR_BASE,
 				      (n << UV_DESC_BASE_PNODE_SHIFT | m));
 	for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) {
-		memset(ad2, 0, sizeof(struct bau_activation_descriptor));
+		memset(ad2, 0, sizeof(struct bau_desc));
 		ad2->header.sw_ack_flag = 1;
 		ad2->header.base_dest_nodeid =
 		    uv_blade_to_pnode(uv_cpu_to_blade_id(0));
@@ -683,12 +675,10 @@ static struct bau_payload_queue_entry * __init uv_payload_queue_init(int node,
 	char *cp;
 	struct bau_payload_queue_entry *pqp;
 
-	pqp = (struct bau_payload_queue_entry *)
-	    kmalloc_node((DESTINATION_PAYLOAD_QUEUE_SIZE + 1) *
-			 sizeof(struct bau_payload_queue_entry),
-			 GFP_KERNEL, node);
-	if (!pqp)
-		BUG();
+	pqp = (struct bau_payload_queue_entry *) kmalloc_node(
+		(DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry),
+		GFP_KERNEL, node);
+	BUG_ON(!pqp);
 	cp = (char *)pqp + 31;
 	pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
 	bau_tablesp->va_queue_first = pqp;
@@ -699,13 +689,11 @@ static struct bau_payload_queue_entry * __init uv_payload_queue_init(int node,
 			      uv_physnodeaddr(pqp));
 	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
 			      uv_physnodeaddr(pqp));
-	bau_tablesp->va_queue_last =
-	    pqp + (DESTINATION_PAYLOAD_QUEUE_SIZE - 1);
+	bau_tablesp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
 	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
 			      (unsigned long)
 			      uv_physnodeaddr(bau_tablesp->va_queue_last));
-	memset(pqp, 0, sizeof(struct bau_payload_queue_entry) *
-	       DESTINATION_PAYLOAD_QUEUE_SIZE);
+	memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
 	return pqp;
 }
 
@@ -717,7 +705,7 @@ static int __init uv_init_blade(int blade, int node, int cur_cpu)
 	int pnode;
 	unsigned long pa;
 	unsigned long apicid;
-	struct bau_activation_descriptor *adp;
+	struct bau_desc *adp;
 	struct bau_payload_queue_entry *pqp;
 	struct bau_control *bau_tablesp;
 
@@ -755,7 +743,7 @@ static int __init uv_bau_init(void)
 
 	uv_bau_retry_limit = 1;
 	uv_nshift = uv_hub_info->n_val;
-	uv_mmask = ((unsigned long)1 << uv_hub_info->n_val) - 1;
+	uv_mmask = (1UL << uv_hub_info->n_val) - 1;
 	nblades = 0;
 	last_blade = -1;
 	for_each_online_node(node) {
@@ -767,8 +755,7 @@ static int __init uv_bau_init(void)
 	}
 	uv_bau_table_bases = (struct bau_control **)
 	    kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
-	if (!uv_bau_table_bases)
-		BUG();
+	BUG_ON(!uv_bau_table_bases);
 	last_blade = -1;
 	for_each_online_node(node) {
 		blade = uv_node_to_blade_id(node);
diff --git a/include/asm-x86/uv/uv_bau.h b/include/asm-x86/uv/uv_bau.h
index e52fec822667..91ac0dfb7588 100644
--- a/include/asm-x86/uv/uv_bau.h
+++ b/include/asm-x86/uv/uv_bau.h
@@ -54,25 +54,25 @@
 /*
  * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1
  */
-#define DESC_STATUS_IDLE                0
-#define DESC_STATUS_ACTIVE              1
-#define DESC_STATUS_DESTINATION_TIMEOUT 2
-#define DESC_STATUS_SOURCE_TIMEOUT      3
+#define DESC_STATUS_IDLE		0
+#define DESC_STATUS_ACTIVE		1
+#define DESC_STATUS_DESTINATION_TIMEOUT	2
+#define DESC_STATUS_SOURCE_TIMEOUT	3
 
 /*
  * source side threshholds at which message retries print a warning
  */
-#define SOURCE_TIMEOUT_LIMIT            20
-#define DESTINATION_TIMEOUT_LIMIT       20
+#define SOURCE_TIMEOUT_LIMIT		20
+#define DESTINATION_TIMEOUT_LIMIT	20
 
 /*
  * number of entries in the destination side payload queue
  */
-#define DESTINATION_PAYLOAD_QUEUE_SIZE  17
+#define DEST_Q_SIZE			17
 /*
  * number of destination side software ack resources
  */
-#define DESTINATION_NUM_RESOURCES       8
+#define DEST_NUM_RESOURCES		8
 #define MAX_CPUS_PER_NODE		32
 /*
  * completion statuses for sending a TLB flush message
@@ -197,10 +197,11 @@ struct bau_msg_header {
 };
 
 /*
+ * The activation descriptor:
  * The format of the message to send, plus all accompanying control
  * Should be 64 bytes
  */
-struct bau_activation_descriptor {
+struct bau_desc {
 	struct bau_target_nodemask distribution;
 	/*
 	 * message template, consisting of header and payload:
@@ -281,7 +282,7 @@ struct bau_sw_ack_status {
  * one on every node and per-cpu; to locate the software tables
  */
 struct bau_control {
-	struct bau_activation_descriptor *descriptor_base;
+	struct bau_desc *descriptor_base;
 	struct bau_payload_queue_entry *bau_msg_head;
 	struct bau_payload_queue_entry *va_queue_first;
 	struct bau_payload_queue_entry *va_queue_last;
-- 
cgit v1.2.3


From b4c286e6af24a116228c8c9f58b9a9eb5b7c000a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 18 Jun 2008 14:28:19 +0200
Subject: SGI UV: clean up arch/x86/kernel/tlb_uv.c

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tlb_uv.c | 107 +++++++++++++++++++++++++++--------------------
 1 file changed, 62 insertions(+), 45 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index d8705e97e8d0..7bdbf67a2d79 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -11,19 +11,22 @@
 #include <linux/kernel.h>
 
 #include <asm/mmu_context.h>
-#include <asm/idle.h>
-#include <asm/genapic.h>
-#include <asm/uv/uv_hub.h>
 #include <asm/uv/uv_mmrs.h>
+#include <asm/uv/uv_hub.h>
 #include <asm/uv/uv_bau.h>
+#include <asm/genapic.h>
+#include <asm/idle.h>
 #include <asm/tsc.h>
 
 #include <mach_apic.h>
 
-static struct bau_control **uv_bau_table_bases __read_mostly;
-static int uv_bau_retry_limit __read_mostly;
-static int uv_nshift __read_mostly; /* position of pnode (which is nasid>>1) */
-static unsigned long uv_mmask __read_mostly;
+static struct bau_control	**uv_bau_table_bases __read_mostly;
+static int			uv_bau_retry_limit __read_mostly;
+
+/* position of pnode (which is nasid>>1): */
+static int			uv_nshift __read_mostly;
+
+static unsigned long		uv_mmask __read_mostly;
 
 static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
 static DEFINE_PER_CPU(struct bau_control, bau_control);
@@ -37,8 +40,8 @@ static DEFINE_PER_CPU(struct bau_control, bau_control);
  * be sent (the hardware will only do one reply per message).
  */
 static void uv_reply_to_message(int resource,
-		    struct bau_payload_queue_entry *msg,
-		    struct bau_msg_status *msp)
+				struct bau_payload_queue_entry *msg,
+				struct bau_msg_status *msp)
 {
 	unsigned long dw;
 
@@ -55,11 +58,11 @@ static void uv_reply_to_message(int resource,
  * Other cpu's may come here at the same time for this message.
  */
 static void uv_bau_process_message(struct bau_payload_queue_entry *msg,
-		       int msg_slot, int sw_ack_slot)
+				   int msg_slot, int sw_ack_slot)
 {
-	int cpu;
 	unsigned long this_cpu_mask;
 	struct bau_msg_status *msp;
+	int cpu;
 
 	msp = __get_cpu_var(bau_control).msg_statuses + msg_slot;
 	cpu = uv_blade_processor_id();
@@ -96,11 +99,11 @@ static void uv_bau_process_message(struct bau_payload_queue_entry *msg,
  */
 static int uv_examine_destination(struct bau_control *bau_tablesp, int sender)
 {
-	int i;
-	int j;
-	int count = 0;
 	struct bau_payload_queue_entry *msg;
 	struct bau_msg_status *msp;
+	int count = 0;
+	int i;
+	int j;
 
 	for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE;
 	     msg++, i++) {
@@ -111,7 +114,7 @@ static int uv_examine_destination(struct bau_control *bau_tablesp, int sender)
 			       i, msg->address, msg->acknowledge_count,
 			       msg->number_of_cpus);
 			for (j = 0; j < msg->number_of_cpus; j++) {
-				if (!((long)1 << j & msp-> seen_by.bits)) {
+				if (!((1L << j) & msp->seen_by.bits)) {
 					count++;
 					printk("%d ", j);
 				}
@@ -135,8 +138,7 @@ static int uv_examine_destinations(struct bau_target_nodemask *distribution)
 	int count = 0;
 
 	sender = smp_processor_id();
-	for (i = 0; i < (sizeof(struct bau_target_nodemask) * BITSPERBYTE);
-	     i++) {
+	for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) {
 		if (!bau_node_isset(i, distribution))
 			continue;
 		count += uv_examine_destination(uv_bau_table_bases[i], sender);
@@ -217,11 +219,11 @@ int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc,
 {
 	int completion_status = 0;
 	int right_shift;
-	int bit;
-	int blade;
 	int tries = 0;
-	unsigned long index;
+	int blade;
+	int bit;
 	unsigned long mmr_offset;
+	unsigned long index;
 	cycles_t time1;
 	cycles_t time2;
 
@@ -294,7 +296,7 @@ int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc,
  * Returns 0 if some remote flushing remains to be done.
  */
 int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
-	unsigned long va)
+			unsigned long va)
 {
 	int i;
 	int bit;
@@ -356,12 +358,12 @@ int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
  */
 void uv_bau_message_interrupt(struct pt_regs *regs)
 {
-	struct bau_payload_queue_entry *pqp;
-	struct bau_payload_queue_entry *msg;
 	struct bau_payload_queue_entry *va_queue_first;
 	struct bau_payload_queue_entry *va_queue_last;
+	struct bau_payload_queue_entry *msg;
 	struct pt_regs *old_regs = set_irq_regs(regs);
-	cycles_t time1, time2;
+	cycles_t time1;
+	cycles_t time2;
 	int msg_slot;
 	int sw_ack_slot;
 	int fw;
@@ -376,13 +378,14 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
 
 	local_pnode = uv_blade_to_pnode(uv_numa_blade_id());
 
-	pqp = va_queue_first = __get_cpu_var(bau_control).va_queue_first;
+	va_queue_first = __get_cpu_var(bau_control).va_queue_first;
 	va_queue_last = __get_cpu_var(bau_control).va_queue_last;
+
 	msg = __get_cpu_var(bau_control).bau_msg_head;
 	while (msg->sw_ack_vector) {
 		count++;
 		fw = msg->sw_ack_vector;
-		msg_slot = msg - pqp;
+		msg_slot = msg - va_queue_first;
 		sw_ack_slot = ffs(fw) - 1;
 
 		uv_bau_process_message(msg, msg_slot, sw_ack_slot);
@@ -484,7 +487,7 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
  * >0: retry limit
  */
 static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
-		  size_t count, loff_t *data)
+				 size_t count, loff_t *data)
 {
 	long newmode;
 	char optstr[64];
@@ -587,42 +590,48 @@ static struct bau_control * __init uv_table_bases_init(int blade, int node)
 	bau_tabp =
 	    kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node);
 	BUG_ON(!bau_tabp);
+
 	bau_tabp->msg_statuses =
 	    kmalloc_node(sizeof(struct bau_msg_status) *
 			 DEST_Q_SIZE, GFP_KERNEL, node);
 	BUG_ON(!bau_tabp->msg_statuses);
+
 	for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++)
 		bau_cpubits_clear(&msp->seen_by, (int)
 				  uv_blade_nr_possible_cpus(blade));
+
 	bau_tabp->watching =
 	    kmalloc_node(sizeof(int) * DEST_NUM_RESOURCES, GFP_KERNEL, node);
 	BUG_ON(!bau_tabp->watching);
-	for (i = 0, ip = bau_tabp->watching; i < DEST_Q_SIZE; i++, ip++) {
+
+	for (i = 0, ip = bau_tabp->watching; i < DEST_Q_SIZE; i++, ip++)
 		*ip = 0;
-	}
+
 	uv_bau_table_bases[blade] = bau_tabp;
+
 	return bau_tabsp;
 }
 
 /*
  * finish the initialization of the per-blade control structures
  */
-static void __init uv_table_bases_finish(int blade, int node, int cur_cpu,
-				  struct bau_control *bau_tablesp,
-				  struct bau_desc *adp)
+static void __init
+uv_table_bases_finish(int blade, int node, int cur_cpu,
+		      struct bau_control *bau_tablesp,
+		      struct bau_desc *adp)
 {
-	int i;
 	struct bau_control *bcp;
+	int i;
 
-	for (i = cur_cpu; i < (cur_cpu + uv_blade_nr_possible_cpus(blade));
-	     i++) {
+	for (i = cur_cpu; i < cur_cpu + uv_blade_nr_possible_cpus(blade); i++) {
 		bcp = (struct bau_control *)&per_cpu(bau_control, i);
-		bcp->bau_msg_head = bau_tablesp->va_queue_first;
-		bcp->va_queue_first = bau_tablesp->va_queue_first;
-		bcp->va_queue_last = bau_tablesp->va_queue_last;
-		bcp->watching = bau_tablesp->watching;
-		bcp->msg_statuses = bau_tablesp->msg_statuses;
-		bcp->descriptor_base = adp;
+
+		bcp->bau_msg_head	= bau_tablesp->va_queue_first;
+		bcp->va_queue_first	= bau_tablesp->va_queue_first;
+		bcp->va_queue_last	= bau_tablesp->va_queue_last;
+		bcp->watching		= bau_tablesp->watching;
+		bcp->msg_statuses	= bau_tablesp->msg_statuses;
+		bcp->descriptor_base	= adp;
 	}
 }
 
@@ -643,14 +652,18 @@ uv_activation_descriptor_init(int node, int pnode)
 	adp = (struct bau_desc *)
 	    kmalloc_node(16384, GFP_KERNEL, node);
 	BUG_ON(!adp);
+
 	pa = __pa((unsigned long)adp);
 	n = pa >> uv_nshift;
 	m = pa & uv_mmask;
+
 	mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE);
-	if (mmr_image)
+	if (mmr_image) {
 		uv_write_global_mmr64(pnode, (unsigned long)
 				      UVH_LB_BAU_SB_DESCRIPTOR_BASE,
 				      (n << UV_DESC_BASE_PNODE_SHIFT | m));
+	}
+
 	for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) {
 		memset(ad2, 0, sizeof(struct bau_desc));
 		ad2->header.sw_ack_flag = 1;
@@ -669,16 +682,17 @@ uv_activation_descriptor_init(int node, int pnode)
 /*
  * initialize the destination side's receiving buffers
  */
-static struct bau_payload_queue_entry * __init uv_payload_queue_init(int node,
-				int pnode, struct bau_control *bau_tablesp)
+static struct bau_payload_queue_entry * __init
+uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
 {
-	char *cp;
 	struct bau_payload_queue_entry *pqp;
+	char *cp;
 
 	pqp = (struct bau_payload_queue_entry *) kmalloc_node(
 		(DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry),
 		GFP_KERNEL, node);
 	BUG_ON(!pqp);
+
 	cp = (char *)pqp + 31;
 	pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
 	bau_tablesp->va_queue_first = pqp;
@@ -694,6 +708,7 @@ static struct bau_payload_queue_entry * __init uv_payload_queue_init(int node,
 			      (unsigned long)
 			      uv_physnodeaddr(bau_tablesp->va_queue_last));
 	memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
+
 	return pqp;
 }
 
@@ -756,6 +771,7 @@ static int __init uv_bau_init(void)
 	uv_bau_table_bases = (struct bau_control **)
 	    kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
 	BUG_ON(!uv_bau_table_bases);
+
 	last_blade = -1;
 	for_each_online_node(node) {
 		blade = uv_node_to_blade_id(node);
@@ -767,6 +783,7 @@ static int __init uv_bau_init(void)
 	}
 	set_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
 	uv_enable_timeouts();
+
 	return 0;
 }
 __initcall(uv_bau_init);
-- 
cgit v1.2.3


From d400524affeb84bdfc2b00cd561fbfb8c09dadd7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 18 Jun 2008 14:51:57 +0200
Subject: SGI UV: TLB shootdown using broadcast assist unit, fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fix:

arch/x86/kernel/tlb_uv.c: In function ‘uv_table_bases_init':
arch/x86/kernel/tlb_uv.c:612: error: ‘bau_tabsp' undeclared (first use in this function)
arch/x86/kernel/tlb_uv.c:612: error: (Each undeclared identifier is reported only once
arch/x86/kernel/tlb_uv.c:612: error: for each function it appears in.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tlb_uv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 7bdbf67a2d79..b362913f0199 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -609,7 +609,7 @@ static struct bau_control * __init uv_table_bases_init(int blade, int node)
 
 	uv_bau_table_bases[blade] = bau_tabp;
 
-	return bau_tabsp;
+	return bau_tabp;
 }
 
 /*
-- 
cgit v1.2.3


From b6df1b8bc1250191cfee15627697111c1cbda53f Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Thu, 19 Jun 2008 21:51:05 -0500
Subject: x86: fix stack overflow for large values of MAX_APICS

physid_mask_of_physid() causes a huge stack (12k) to be created if the
number of APICS is large. Replace physid_mask_of_physid() with a
new function that does not create large stacks. This is a problem only
on large x86_64 systems.

this paves the way to increase MAX_APICS.

Signed-off-by: Jack Steiner <steiner@sgi.com>
Cc: linux-mm@kvack.org
Cc: mingo@elte.hu
Cc: tglx@linutronix.de
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic_32.c | 2 +-
 arch/x86/kernel/apic_64.c | 2 +-
 arch/x86/kernel/smpboot.c | 5 ++---
 include/asm-x86/mpspec.h  | 7 +++++++
 4 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index d5767cb19d56..3f13a1aa07cd 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -1269,7 +1269,7 @@ int __init APIC_init_uniprocessor(void)
 #ifdef CONFIG_CRASH_DUMP
 	boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
 #endif
-	phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
+	physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
 
 	setup_local_APIC();
 
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 0633cfd0dc29..5e22f8d2d507 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -942,7 +942,7 @@ int __init APIC_init_uniprocessor(void)
 
 	verify_local_APIC();
 
-	phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
+	physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
 	apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
 
 	setup_local_APIC();
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 3e1cecedde42..664a5db36d40 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1091,10 +1091,9 @@ static __init void disable_smp(void)
 	smpboot_clear_io_apic_irqs();
 #endif
 	if (smp_found_config)
-		phys_cpu_present_map =
-				physid_mask_of_physid(boot_cpu_physical_apicid);
+		physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
 	else
-		phys_cpu_present_map = physid_mask_of_physid(0);
+		physid_set_mask_of_physid(0, &phys_cpu_present_map);
 	map_cpu_to_logical_apicid();
 	cpu_set(0, per_cpu(cpu_sibling_map, 0));
 	cpu_set(0, per_cpu(cpu_core_map, 0));
diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h
index 57a991b9c053..b69e7ba7bf1d 100644
--- a/include/asm-x86/mpspec.h
+++ b/include/asm-x86/mpspec.h
@@ -101,6 +101,7 @@ typedef struct physid_mask physid_mask_t;
 		__physid_mask;						\
 	})
 
+/* Note: will create very large stack frames if physid_mask_t is big */
 #define physid_mask_of_physid(physid)					\
 	({								\
 		physid_mask_t __physid_mask = PHYSID_MASK_NONE;		\
@@ -108,6 +109,12 @@ typedef struct physid_mask physid_mask_t;
 		__physid_mask;						\
 	})
 
+static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map)
+{
+	physids_clear(*map);
+	physid_set(physid, *map);
+}
+
 #define PHYSID_MASK_ALL		{ {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} }
 #define PHYSID_MASK_NONE	{ {[0 ... PHYSID_ARRAY_SIZE-1] = 0UL} }
 
-- 
cgit v1.2.3


From cef53278682eb2604cbd99de64cdb59a8b35235a Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Thu, 19 Jun 2008 11:16:24 -0500
Subject: x86, SGI UV: TLB shootdown using broadcast assist unit, v6

v6: 6/19 close the security hole in uv_ptc_proc_write())

  > Found a potential security hole while doing that:
  > static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
  >                              size_t count, loff_t *data)
  >     if (copy_from_user(optstr, user, count))
  >             return -EFAULT;
  >
  > is count guaranteed to never be larger than 64?

is fixed below.

It adds tlb_uv.o to the Makefile.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: mingo@elte.hu
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tlb_uv.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index b362913f0199..c503b7f04481 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -492,6 +492,8 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
 	long newmode;
 	char optstr[64];
 
+	if (count > 64)
+		return -EINVAL;
 	if (copy_from_user(optstr, user, count))
 		return -EFAULT;
 	optstr[count - 1] = '\0';
-- 
cgit v1.2.3


From e7eb8726d0e144f0925972c4ecee945e91a42753 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Mon, 23 Jun 2008 08:32:25 -0500
Subject: x86, SGI UV: uv_ptc_proc_write fix

Someone could write 0 bytes to /proc/sgi_uv/ptc_statistics,
causing
  optstr[count - 1] = '\0';
to write to who-knows-where.

(Andi Kleen noticed this need from a patch I sent for
 similar code in the ia64 world (sn2_ptc_proc_write()).)

(count less than zero is not possible here, as count is unsigned)

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tlb_uv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index c503b7f04481..d0fbb7712ab0 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -492,7 +492,7 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
 	long newmode;
 	char optstr[64];
 
-	if (count > 64)
+	if (count == 0 || count > sizeof(optstr))
 		return -EINVAL;
 	if (copy_from_user(optstr, user, count))
 		return -EFAULT;
-- 
cgit v1.2.3


From 1ecd27657b735128a728ebf0c31fce5e1456718a Mon Sep 17 00:00:00 2001
From: Bernhard Walle <bwalle@suse.de>
Date: Fri, 20 Jun 2008 15:38:22 +0200
Subject: x86: unify crashkernel reservation for 32 and 64 bit

This patch moves the reserve_crashkernel() to setup.c and removes the
architecture-specific version. Both versions were more or less the same.

I tested it on both x86-64 and i386, with CONFIG_KEXEC on and off (so
that it compiles).

Signed-off-by: Bernhard Walle <bwalle@suse.de>
Cc: yhlu.kernel@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c    | 61 ++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/setup_32.c | 53 ----------------------------------------
 arch/x86/kernel/setup_64.c | 40 ------------------------------
 include/asm-x86/setup.h    |  3 +++
 4 files changed, 64 insertions(+), 93 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ebb0a2bcdc08..c5330f601b68 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -3,6 +3,7 @@
 #include <linux/init.h>
 #include <linux/bootmem.h>
 #include <linux/percpu.h>
+#include <linux/kexec.h>
 #include <asm/smp.h>
 #include <asm/percpu.h>
 #include <asm/sections.h>
@@ -11,6 +12,7 @@
 #include <asm/topology.h>
 #include <asm/mpspec.h>
 #include <asm/apicdef.h>
+#include <asm/highmem.h>
 
 #ifdef CONFIG_X86_LOCAL_APIC
 unsigned int num_processors;
@@ -404,3 +406,62 @@ EXPORT_SYMBOL(node_to_cpumask);
 #endif /* CONFIG_DEBUG_PER_CPU_MAPS */
 
 #endif /* X86_64_NUMA */
+
+
+/*
+ * --------- Crashkernel reservation ------------------------------
+ */
+
+static inline unsigned long long get_total_mem(void)
+{
+	unsigned long long total;
+
+	total = max_low_pfn - min_low_pfn;
+#ifdef CONFIG_HIGHMEM
+	total += highend_pfn - highstart_pfn;
+#endif
+
+	return total << PAGE_SHIFT;
+}
+
+#ifdef CONFIG_KEXEC
+void __init reserve_crashkernel(void)
+{
+	unsigned long long total_mem;
+	unsigned long long crash_size, crash_base;
+	int ret;
+
+	total_mem = get_total_mem();
+
+	ret = parse_crashkernel(boot_command_line, total_mem,
+			&crash_size, &crash_base);
+	if (ret == 0 && crash_size > 0) {
+		if (crash_base <= 0) {
+			printk(KERN_INFO "crashkernel reservation failed - "
+					"you have to specify a base address\n");
+			return;
+		}
+
+		if (reserve_bootmem_generic(crash_base, crash_size,
+					BOOTMEM_EXCLUSIVE) < 0) {
+			printk(KERN_INFO "crashkernel reservation failed - "
+					"memory is in use\n");
+			return;
+		}
+
+		printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
+				"for crashkernel (System RAM: %ldMB)\n",
+				(unsigned long)(crash_size >> 20),
+				(unsigned long)(crash_base >> 20),
+				(unsigned long)(total_mem >> 20));
+
+		crashk_res.start = crash_base;
+		crashk_res.end   = crash_base + crash_size - 1;
+		insert_resource(&iomem_resource, &crashk_res);
+	}
+}
+#else
+void __init reserve_crashkernel(void)
+{}
+#endif
+
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index a9b19ad24edb..369d0fe1ff9c 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -429,59 +429,6 @@ extern unsigned long __init setup_memory(void);
 extern void zone_sizes_init(void);
 #endif /* !CONFIG_NEED_MULTIPLE_NODES */
 
-static inline unsigned long long get_total_mem(void)
-{
-	unsigned long long total;
-
-	total = max_low_pfn - min_low_pfn;
-#ifdef CONFIG_HIGHMEM
-	total += highend_pfn - highstart_pfn;
-#endif
-
-	return total << PAGE_SHIFT;
-}
-
-#ifdef CONFIG_KEXEC
-static void __init reserve_crashkernel(void)
-{
-	unsigned long long total_mem;
-	unsigned long long crash_size, crash_base;
-	int ret;
-
-	total_mem = get_total_mem();
-
-	ret = parse_crashkernel(boot_command_line, total_mem,
-			&crash_size, &crash_base);
-	if (ret == 0 && crash_size > 0) {
-		if (crash_base <= 0) {
-			printk(KERN_INFO "crashkernel reservation failed - "
-					"you have to specify a base address\n");
-			return;
-		}
-
-		if (reserve_bootmem_generic(crash_base, crash_size,
-					BOOTMEM_EXCLUSIVE) < 0) {
-			printk(KERN_INFO "crashkernel reservation failed - "
-					"memory is in use\n");
-			return;
-		}
-
-		printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
-				"for crashkernel (System RAM: %ldMB)\n",
-				(unsigned long)(crash_size >> 20),
-				(unsigned long)(crash_base >> 20),
-				(unsigned long)(total_mem >> 20));
-
-		crashk_res.start = crash_base;
-		crashk_res.end   = crash_base + crash_size - 1;
-		insert_resource(&iomem_resource, &crashk_res);
-	}
-}
-#else
-static inline void __init reserve_crashkernel(void)
-{}
-#endif
-
 #ifdef CONFIG_BLK_DEV_INITRD
 
 static bool do_relocate_initrd = false;
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 16ef53ab538a..504caeaffd58 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -228,46 +228,6 @@ static inline void copy_edd(void)
 }
 #endif
 
-#ifdef CONFIG_KEXEC
-static void __init reserve_crashkernel(void)
-{
-	unsigned long long total_mem;
-	unsigned long long crash_size, crash_base;
-	int ret;
-
-	total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
-
-	ret = parse_crashkernel(boot_command_line, total_mem,
-			&crash_size, &crash_base);
-	if (ret == 0 && crash_size) {
-		if (crash_base <= 0) {
-			printk(KERN_INFO "crashkernel reservation failed - "
-					"you have to specify a base address\n");
-			return;
-		}
-
-		if (reserve_bootmem_generic(crash_base, crash_size,
-					BOOTMEM_EXCLUSIVE) < 0) {
-			printk(KERN_INFO "crashkernel reservation failed - "
-					"memory is in use\n");
-			return;
-		}
-
-		printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
-				"for crashkernel (System RAM: %ldMB)\n",
-				(unsigned long)(crash_size >> 20),
-				(unsigned long)(crash_base >> 20),
-				(unsigned long)(total_mem >> 20));
-		crashk_res.start = crash_base;
-		crashk_res.end   = crash_base + crash_size - 1;
-		insert_resource(&iomem_resource, &crashk_res);
-	}
-}
-#else
-static inline void __init reserve_crashkernel(void)
-{}
-#endif
-
 /*
  * setup_arch - architecture-specific boot-time initializations
  *
diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h
index e14b6e73d266..b434bdd82ba7 100644
--- a/include/asm-x86/setup.h
+++ b/include/asm-x86/setup.h
@@ -8,6 +8,9 @@
 /* Interrupt control for vSMPowered x86_64 systems */
 void vsmp_init(void);
 
+/* Crashkernel reservation */
+void reserve_crashkernel(void);
+
 #ifndef CONFIG_PARAVIRT
 #define paravirt_post_allocator_init()	do {} while (0)
 #endif
-- 
cgit v1.2.3


From a7bf0bd5e6af7fe69342dabf2a3b721f0163469a Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 28 May 2008 15:02:14 +0100
Subject: build: add __page_aligned_data and __page_aligned_bss

Making a variable page-aligned by using
__attribute__((section(".data.page_aligned"))) is fragile because if
sizeof(variable) is not also a multiple of page size, it leaves
variables in the remainder of the section unaligned.

This patch introduces two new qualifiers, __page_aligned_data and
__page_aligned_bss to set the section *and* the alignment of
variables.  This makes page-aligned variables more robust because the
linker will make sure they're aligned properly.  Unfortunately it
requires *all* page-aligned data to use these macros...

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup64.c | 2 +-
 arch/x86/mm/ioremap.c     | 3 +--
 include/linux/linkage.h   | 4 ++++
 3 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c
index 631ea6cc01d8..fc1a56da8240 100644
--- a/arch/x86/kernel/setup64.c
+++ b/arch/x86/kernel/setup64.c
@@ -40,7 +40,7 @@ EXPORT_SYMBOL(_cpu_pda);
 
 struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
 
-char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
+char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
 
 unsigned long __supported_pte_mask __read_mostly = ~0UL;
 EXPORT_SYMBOL_GPL(__supported_pte_mask);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 416ea415f5c2..0561fde56a51 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -394,8 +394,7 @@ static int __init early_ioremap_debug_setup(char *str)
 early_param("early_ioremap_debug", early_ioremap_debug_setup);
 
 static __initdata int after_paging_init;
-static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
-		__section(.bss.page_aligned);
+static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
 
 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
 {
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index 2119610b24f8..9fd1f859021b 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -1,6 +1,7 @@
 #ifndef _LINUX_LINKAGE_H
 #define _LINUX_LINKAGE_H
 
+#include <linux/compiler.h>
 #include <asm/linkage.h>
 
 #ifdef __cplusplus
@@ -17,6 +18,9 @@
 # define asmregparm
 #endif
 
+#define __page_aligned_data	__section(.data.page_aligned) __aligned(PAGE_SIZE)
+#define __page_aligned_bss	__section(.bss.page_aligned) __aligned(PAGE_SIZE)
+
 /*
  * This is used by architectures to keep arguments on the stack
  * untouched by the compiler by keeping them live until the end.
-- 
cgit v1.2.3


From 9cf4f298e29abba25c16679fe7be70898223167e Mon Sep 17 00:00:00 2001
From: Glauber Costa <gcosta@redhat.com>
Date: Tue, 27 May 2008 18:22:54 -0700
Subject: x86: use stack_start in x86_64

call x86_64's init_rsp stack_start, just as i386 does.
Put a zeroed stack segment for consistency. With this,
we can eliminate one ugly ifdef in smpboot.c.

Signed-off-by: Glauber Costa <gcosta@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/sleep.c | 2 +-
 arch/x86/kernel/head_64.S    | 5 +++--
 arch/x86/kernel/smpboot.c    | 7 +------
 3 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 36af01f029ed..a81f468ab410 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -86,7 +86,7 @@ int acpi_save_state_mem(void)
 	saved_magic = 0x12345678;
 #else /* CONFIG_64BIT */
 	header->trampoline_segment = setup_trampoline() >> 4;
-	init_rsp = (unsigned long)temp_stack + 4096;
+	stack_start.sp = temp_stack + 4096;
 	initial_code = (unsigned long)wakeup_long64;
 	saved_magic = 0x123456789abcdef0;
 #endif /* CONFIG_64BIT */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 263b9d14753e..918a2711aff6 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -191,7 +191,7 @@ ENTRY(secondary_startup_64)
 	movq	%rax, %cr0
 
 	/* Setup a boot time stack */
-	movq init_rsp(%rip),%rsp
+	movq stack_start(%rip),%rsp
 
 	/* zero EFLAGS after setting rsp */
 	pushq $0
@@ -252,8 +252,9 @@ ENTRY(secondary_startup_64)
 	.quad	x86_64_start_kernel
 	__FINITDATA
 
-	ENTRY(init_rsp)
+	ENTRY(stack_start)
 	.quad  init_thread_union+THREAD_SIZE-8
+	.word  0
 
 bad_address:
 	jmp bad_address
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index d3ad4e09455b..a71e3cad5470 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -714,11 +714,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
 	 * target processor state.
 	 */
 	startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
-#ifdef CONFIG_X86_64
-			 (unsigned long)init_rsp);
-#else
 			 (unsigned long)stack_start.sp);
-#endif
 
 	/*
 	 * Run STARTUP IPI loop.
@@ -905,15 +901,14 @@ do_rest:
 	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
 	c_idle.idle->thread.ip = (unsigned long) start_secondary;
 	/* Stack for startup_32 can be just as for start_secondary onwards */
-	stack_start.sp = (void *) c_idle.idle->thread.sp;
 	irq_ctx_init(cpu);
 #else
 	cpu_pda(cpu)->pcurrent = c_idle.idle;
-	init_rsp = c_idle.idle->thread.sp;
 	load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread);
 	initial_code = (unsigned long)start_secondary;
 	clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
 #endif
+	stack_start.sp = (void *) c_idle.idle->thread.sp;
 
 	/* start_ip had better be page-aligned! */
 	start_ip = setup_trampoline();
-- 
cgit v1.2.3


From 736f12bff9d9e7b4e895c64f73b190c8383fc2a1 Mon Sep 17 00:00:00 2001
From: Glauber Costa <gcosta@redhat.com>
Date: Tue, 27 May 2008 20:14:51 -0700
Subject: x86: don't use gdt_page openly.

There's a macro available for that.

Signed-off-by: Glauber Costa <gcosta@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/traps_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index cf37d20b1ba7..dc7c05e5cfe7 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -1119,7 +1119,7 @@ void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
 
 unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
 {
-	struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
+	struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
 	unsigned long base = (kesp - uesp) & -THREAD_SIZE;
 	unsigned long new_kesp = kesp - base;
 	unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
-- 
cgit v1.2.3


From a939098afcfa5f81d3474782ec15c6d114e57763 Mon Sep 17 00:00:00 2001
From: Glauber Costa <gcosta@redhat.com>
Date: Wed, 28 May 2008 16:19:53 -0700
Subject: x86: move x86_64 gdt closer to i386

i386 and x86_64 used two different schemes for maintaining the gdt.
With this patch, x86_64 initial gdt table is defined in a .c file,
same way as i386 is now. Also, we call it "gdt_page", and the descriptor,
"early_gdt_descr". This way we achieve common naming, which can allow for
more code integration.

Signed-off-by: Glauber Costa <gcosta@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/head_64.S        | 48 +++++-----------------------------------
 arch/x86/kernel/setup64.c        |  5 +----
 arch/x86/kernel/setup_64.c       | 19 ++++++++++++++--
 arch/x86/kernel/smpboot.c        | 12 +++-------
 arch/x86/kernel/x8664_ksyms_64.c |  5 -----
 include/asm-x86/desc.h           | 24 +++++++++-----------
 include/asm-x86/segment.h        | 23 ++++++++++---------
 7 files changed, 48 insertions(+), 88 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 918a2711aff6..32f5a114d1a2 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -203,7 +203,7 @@ ENTRY(secondary_startup_64)
 	 * addresses where we're currently running on. We have to do that here
 	 * because in 32bit we couldn't load a 64bit linear address.
 	 */
-	lgdt	cpu_gdt_descr(%rip)
+	lgdt	early_gdt_descr(%rip)
 
 	/* set up data segments. actually 0 would do too */
 	movl $__KERNEL_DS,%eax
@@ -391,54 +391,16 @@ NEXT_PAGE(level2_spare_pgt)
 
 	.data
 	.align 16
-	.globl cpu_gdt_descr
-cpu_gdt_descr:
-	.word	gdt_end-cpu_gdt_table-1
-gdt:
-	.quad	cpu_gdt_table
-#ifdef CONFIG_SMP
-	.rept	NR_CPUS-1
-	.word	0
-	.quad	0
-	.endr
-#endif
+	.globl early_gdt_descr
+early_gdt_descr:
+	.word	GDT_ENTRIES*8-1
+	.quad   per_cpu__gdt_page
 
 ENTRY(phys_base)
 	/* This must match the first entry in level2_kernel_pgt */
 	.quad   0x0000000000000000
 
-/* We need valid kernel segments for data and code in long mode too
- * IRET will check the segment types  kkeil 2000/10/28
- * Also sysret mandates a special GDT layout 
- */
-		 		
-	.section .data.page_aligned, "aw"
-	.align PAGE_SIZE
-
-/* The TLS descriptors are currently at a different place compared to i386.
-   Hopefully nobody expects them at a fixed place (Wine?) */
 	
-ENTRY(cpu_gdt_table)
-	.quad	0x0000000000000000	/* NULL descriptor */
-	.quad	0x00cf9b000000ffff	/* __KERNEL32_CS */
-	.quad	0x00af9b000000ffff	/* __KERNEL_CS */
-	.quad	0x00cf93000000ffff	/* __KERNEL_DS */
-	.quad	0x00cffb000000ffff	/* __USER32_CS */
-	.quad	0x00cff3000000ffff	/* __USER_DS, __USER32_DS  */
-	.quad	0x00affb000000ffff	/* __USER_CS */
-	.quad	0x0			/* unused */
-	.quad	0,0			/* TSS */
-	.quad	0,0			/* LDT */
-	.quad   0,0,0			/* three TLS descriptors */ 
-	.quad	0x0000f40000000000	/* node/CPU stored in limit */
-gdt_end:	
-	/* asm/segment.h:GDT_ENTRIES must match this */	
-	/* This should be a multiple of the cache line size */
-	/* GDTs of other CPUs are now dynamically allocated */
-
-	/* zero the remaining page */
-	.fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
-
 	.section .bss, "aw", @nobits
 	.align L1_CACHE_BYTES
 ENTRY(idt_table)
diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c
index fc1a56da8240..70ff07186772 100644
--- a/arch/x86/kernel/setup64.c
+++ b/arch/x86/kernel/setup64.c
@@ -202,11 +202,8 @@ void __cpuinit cpu_init (void)
 	 * Initialize the per-CPU GDT with the boot GDT,
 	 * and set up the GDT descriptor:
 	 */
-	if (cpu)
-		memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
 
-	cpu_gdt_descr[cpu].size = GDT_SIZE;
-	load_gdt((const struct desc_ptr *)&cpu_gdt_descr[cpu]);
+	switch_to_new_gdt();
 	load_idt((const struct desc_ptr *)&idt_descr);
 
 	memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 504caeaffd58..a93300de4da9 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -81,8 +81,6 @@
 #define ARCH_SETUP
 #endif
 
-#include "cpu/cpu.h"
-
 /*
  * Machine setup..
  */
@@ -228,6 +226,23 @@ static inline void copy_edd(void)
 }
 #endif
 
+/* Overridden in paravirt.c if CONFIG_PARAVIRT */
+void __attribute__((weak)) __init memory_setup(void)
+{
+       machine_specific_memory_setup();
+}
+
+/* Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one. */
+void switch_to_new_gdt(void)
+{
+	struct desc_ptr gdt_descr;
+
+	gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
+	gdt_descr.size = GDT_SIZE - 1;
+	load_gdt(&gdt_descr);
+}
+
 /*
  * setup_arch - architecture-specific boot-time initializations
  *
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index a71e3cad5470..fe2bd515d6cc 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -849,14 +849,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
 		.done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
 	};
 	INIT_WORK(&c_idle.work, do_fork_idle);
-#ifdef CONFIG_X86_64
-	/* allocate memory for gdts of secondary cpus. Hotplug is considered */
-	if (!cpu_gdt_descr[cpu].address &&
-		!(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) {
-		printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu);
-		return -1;
-	}
 
+#ifdef CONFIG_X86_64
 	/* Allocate node local memory for AP pdas */
 	if (cpu > 0) {
 		boot_error = get_local_pda(cpu);
@@ -898,7 +892,6 @@ do_rest:
 #ifdef CONFIG_X86_32
 	per_cpu(current_task, cpu) = c_idle.idle;
 	init_gdt(cpu);
-	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
 	c_idle.idle->thread.ip = (unsigned long) start_secondary;
 	/* Stack for startup_32 can be just as for start_secondary onwards */
 	irq_ctx_init(cpu);
@@ -908,6 +901,7 @@ do_rest:
 	initial_code = (unsigned long)start_secondary;
 	clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
 #endif
+	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
 	stack_start.sp = (void *) c_idle.idle->thread.sp;
 
 	/* start_ip had better be page-aligned! */
@@ -1252,8 +1246,8 @@ void __init native_smp_prepare_boot_cpu(void)
 	int me = smp_processor_id();
 #ifdef CONFIG_X86_32
 	init_gdt(me);
-	switch_to_new_gdt();
 #endif
+	switch_to_new_gdt();
 	/* already set me in cpu_online_map in boot_cpu_init() */
 	cpu_set(me, cpu_callout_map);
 	per_cpu(cpu_state, me) = CPU_ONLINE;
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index f6c05d0410fb..2f306a826897 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -53,8 +53,3 @@ EXPORT_SYMBOL(init_level4_pgt);
 EXPORT_SYMBOL(load_gs_index);
 
 EXPORT_SYMBOL(_proxy_pda);
-
-#ifdef CONFIG_PARAVIRT
-/* Virtualized guests may want to use it */
-EXPORT_SYMBOL_GPL(cpu_gdt_descr);
-#endif
diff --git a/include/asm-x86/desc.h b/include/asm-x86/desc.h
index b3875d4b4fab..07f9f2b17be8 100644
--- a/include/asm-x86/desc.h
+++ b/include/asm-x86/desc.h
@@ -29,11 +29,17 @@ static inline void fill_ldt(struct desc_struct *desc,
 extern struct desc_ptr idt_descr;
 extern gate_desc idt_table[];
 
+struct gdt_page {
+	struct desc_struct gdt[GDT_ENTRIES];
+} __attribute__((aligned(PAGE_SIZE)));
+DECLARE_PER_CPU(struct gdt_page, gdt_page);
+
+static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+{
+	return per_cpu(gdt_page, cpu).gdt;
+}
+
 #ifdef CONFIG_X86_64
-extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
-extern struct desc_ptr cpu_gdt_descr[];
-/* the cpu gdt accessor */
-#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address)
 
 static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
 			     unsigned dpl, unsigned ist, unsigned seg)
@@ -51,16 +57,6 @@ static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
 }
 
 #else
-struct gdt_page {
-	struct desc_struct gdt[GDT_ENTRIES];
-} __attribute__((aligned(PAGE_SIZE)));
-DECLARE_PER_CPU(struct gdt_page, gdt_page);
-
-static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
-{
-	return per_cpu(gdt_page, cpu).gdt;
-}
-
 static inline void pack_gate(gate_desc *gate, unsigned char type,
 			     unsigned long base, unsigned dpl, unsigned flags,
 			     unsigned short seg)
diff --git a/include/asm-x86/segment.h b/include/asm-x86/segment.h
index ed5131dd7d92..dfc8601c0892 100644
--- a/include/asm-x86/segment.h
+++ b/include/asm-x86/segment.h
@@ -61,18 +61,14 @@
 #define GDT_ENTRY_TLS_MAX 	(GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
 
 #define GDT_ENTRY_DEFAULT_USER_CS	14
-#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
 
 #define GDT_ENTRY_DEFAULT_USER_DS	15
-#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
 
 #define GDT_ENTRY_KERNEL_BASE	12
 
 #define GDT_ENTRY_KERNEL_CS		(GDT_ENTRY_KERNEL_BASE + 0)
-#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
 
 #define GDT_ENTRY_KERNEL_DS		(GDT_ENTRY_KERNEL_BASE + 1)
-#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
 
 #define GDT_ENTRY_TSS			(GDT_ENTRY_KERNEL_BASE + 4)
 #define GDT_ENTRY_LDT			(GDT_ENTRY_KERNEL_BASE + 5)
@@ -139,10 +135,11 @@
 #else
 #include <asm/cache.h>
 
-#define __KERNEL_CS	0x10
-#define __KERNEL_DS	0x18
+#define GDT_ENTRY_KERNEL32_CS 1
+#define GDT_ENTRY_KERNEL_CS 2
+#define GDT_ENTRY_KERNEL_DS 3
 
-#define __KERNEL32_CS   0x08
+#define __KERNEL32_CS   (GDT_ENTRY_KERNEL32_CS * 8)
 
 /*
  * we cannot use the same code segment descriptor for user and kernel
@@ -150,10 +147,10 @@
  * The segment offset needs to contain a RPL. Grr. -AK
  * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
  */
-
-#define __USER32_CS   0x23   /* 4*8+3 */
-#define __USER_DS     0x2b   /* 5*8+3 */
-#define __USER_CS     0x33   /* 6*8+3 */
+#define GDT_ENTRY_DEFAULT_USER32_CS 4
+#define GDT_ENTRY_DEFAULT_USER_DS 5
+#define GDT_ENTRY_DEFAULT_USER_CS 6
+#define __USER32_CS   (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3)
 #define __USER32_DS	__USER_DS
 
 #define GDT_ENTRY_TSS 8	/* needs two entries */
@@ -175,6 +172,10 @@
 
 #endif
 
+#define __KERNEL_CS	(GDT_ENTRY_KERNEL_CS * 8)
+#define __KERNEL_DS	(GDT_ENTRY_KERNEL_DS * 8)
+#define __USER_DS     (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3)
+#define __USER_CS     (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3)
 #ifndef CONFIG_PARAVIRT
 #define get_kernel_rpl()  0
 #endif
-- 
cgit v1.2.3


From e3f77edfc1d0beb7b10f9f31d9e39206f7dbef7b Mon Sep 17 00:00:00 2001
From: Glauber Costa <gcosta@redhat.com>
Date: Wed, 28 May 2008 12:57:02 -0300
Subject: x86: use initial_code for i386

x86_64 jumps to whatever is written in "initial_code" symbol,
instead of a fixed address. Do it for i386 too. It will allow us
to integrate more of the smp boot code.

Signed-off-by: Glauber Costa <gcosta@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/head_32.S | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index b98b338aae1a..ffb73a5b609f 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -455,7 +455,10 @@ is386:	movl $2,%ecx		# set MP
 	jmp initialize_secondary # all other CPUs call initialize_secondary
 1:
 #endif /* CONFIG_SMP */
-	jmp i386_start_kernel
+	jmp *(initial_code)
+.align 4
+ENTRY(initial_code)
+	.long i386_start_kernel
 
 /*
  * We depend on ET to be correct. This checks for 287/387.
-- 
cgit v1.2.3


From 3e9704739daf46a8ba6593d749c67b5f7cd633d2 Mon Sep 17 00:00:00 2001
From: Glauber Costa <gcosta@redhat.com>
Date: Wed, 28 May 2008 13:01:54 -0300
Subject: x86: boot secondary cpus through initial_code

remove "initialize_secondary". Boot both architectures via
initial_code.

Signed-off-by: Glauber Costa <gcosta@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/head_32.S |  2 +-
 arch/x86/kernel/smpboot.c | 25 +------------------------
 2 files changed, 2 insertions(+), 25 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index ffb73a5b609f..f67e93441caf 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -452,7 +452,7 @@ is386:	movl $2,%ecx		# set MP
 	je   1f
 	movl $(__KERNEL_PERCPU), %eax
 	movl %eax,%fs		# set this cpu's percpu
-	jmp initialize_secondary # all other CPUs call initialize_secondary
+	movl (stack_start), %esp
 1:
 #endif /* CONFIG_SMP */
 	jmp *(initial_code)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index fe2bd515d6cc..2a0d39f3f2f1 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -349,28 +349,6 @@ static void __cpuinit start_secondary(void *unused)
 	cpu_idle();
 }
 
-#ifdef CONFIG_X86_32
-/*
- * Everything has been set up for the secondary
- * CPUs - they just need to reload everything
- * from the task structure
- * This function must not return.
- */
-void __devinit initialize_secondary(void)
-{
-	/*
-	 * We don't actually need to load the full TSS,
-	 * basically just the stack pointer and the ip.
-	 */
-
-	asm volatile(
-		"movl %0,%%esp\n\t"
-		"jmp *%1"
-		:
-		:"m" (current->thread.sp), "m" (current->thread.ip));
-}
-#endif
-
 static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_X86_32
@@ -892,16 +870,15 @@ do_rest:
 #ifdef CONFIG_X86_32
 	per_cpu(current_task, cpu) = c_idle.idle;
 	init_gdt(cpu);
-	c_idle.idle->thread.ip = (unsigned long) start_secondary;
 	/* Stack for startup_32 can be just as for start_secondary onwards */
 	irq_ctx_init(cpu);
 #else
 	cpu_pda(cpu)->pcurrent = c_idle.idle;
 	load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread);
-	initial_code = (unsigned long)start_secondary;
 	clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
 #endif
 	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
+	initial_code = (unsigned long)start_secondary;
 	stack_start.sp = (void *) c_idle.idle->thread.sp;
 
 	/* start_ip had better be page-aligned! */
-- 
cgit v1.2.3


From 0f385d1ddd0952e01a968bfa5512378ad23de6df Mon Sep 17 00:00:00 2001
From: Glauber Costa <gcosta@redhat.com>
Date: Wed, 28 May 2008 17:09:53 -0700
Subject: x86: clearing io_apic harmless for x86_64

so remove ifdef.

Signed-off-by: Glauber Costa <gcosta@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smpboot.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 2a0d39f3f2f1..a74a261c5626 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1052,9 +1052,8 @@ static __init void disable_smp(void)
 {
 	cpu_present_map = cpumask_of_cpu(0);
 	cpu_possible_map = cpumask_of_cpu(0);
-#ifdef CONFIG_X86_32
 	smpboot_clear_io_apic_irqs();
-#endif
+
 	if (smp_found_config)
 		physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
 	else
-- 
cgit v1.2.3


From 86e430edf462e872ecfab28d6b8619be5ab9c300 Mon Sep 17 00:00:00 2001
From: Glauber Costa <gcosta@redhat.com>
Date: Wed, 28 May 2008 20:05:46 -0700
Subject: x86: remove ifdef from stepping

The stepping won't affect x86_64, since there are not x86_64 k7's
or pentiums. So, although it adds to the binary size, remove the ifdef
for smoother integration

Signed-off-by: Glauber Costa <gcosta@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smpboot.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index a74a261c5626..6f9a31a18811 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -351,7 +351,6 @@ static void __cpuinit start_secondary(void *unused)
 
 static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
 {
-#ifdef CONFIG_X86_32
 	/*
 	 * Mask B, Pentium, but not Pentium MMX
 	 */
@@ -401,7 +400,6 @@ static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
 
 valid_k7:
 	;
-#endif
 }
 
 static void __cpuinit smp_checks(void)
-- 
cgit v1.2.3


From 3fde690011a84e19f98f77bfaa349b2119ddd2d2 Mon Sep 17 00:00:00 2001
From: Glauber Costa <gcosta@redhat.com>
Date: Wed, 28 May 2008 20:34:19 -0700
Subject: x86: change __setup_vector_irq with setup_vector_irq

We create a version of it for i386, and then take the CONFIG_X86_64
ifdef out of the game. We could create a __setup_vector_irq for i386,
but it would incur in an unnecessary lock taking. Moreover, it is better
practice to only export setup_vector_irq anyway.

Signed-off-by: Glauber Costa <gcosta@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_32.c |  5 +++++
 arch/x86/kernel/io_apic_64.c |  9 ++++++++-
 arch/x86/kernel/smpboot.c    | 11 ++---------
 include/asm-x86/hw_irq.h     |  2 +-
 4 files changed, 16 insertions(+), 11 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index fedb3b113ace..d6af301c822b 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -1207,6 +1207,11 @@ static int assign_irq_vector(int irq)
 
 	return vector;
 }
+
+void setup_vector_irq(int cpu)
+{
+}
+
 static struct irq_chip ioapic_chip;
 
 #define IOAPIC_AUTO	-1
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 2eba4f4c14ba..08c48750888a 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -801,7 +801,7 @@ static void __clear_irq_vector(int irq)
 	cpus_clear(cfg->domain);
 }
 
-void __setup_vector_irq(int cpu)
+static void __setup_vector_irq(int cpu)
 {
 	/* Initialize vector_irq on a new cpu */
 	/* This function must be called with vector_lock held */
@@ -824,6 +824,13 @@ void __setup_vector_irq(int cpu)
 	}
 }
 
+void setup_vector_irq(int cpu)
+{
+	spin_lock(&vector_lock);
+	__setup_vector_irq(smp_processor_id());
+	spin_unlock(&vector_lock);
+}
+
 
 static struct irq_chip ioapic_chip;
 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 6f9a31a18811..e09f3124738a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -329,15 +329,8 @@ static void __cpuinit start_secondary(void *unused)
 	 * smp_call_function().
 	 */
 	lock_ipi_call_lock();
-#ifdef CONFIG_X86_64
-	spin_lock(&vector_lock);
-
-	/* Setup the per cpu irq handling data structures */
-	__setup_vector_irq(smp_processor_id());
-	/*
-	 * Allow the master to continue.
-	 */
-	spin_unlock(&vector_lock);
+#ifdef CONFIG_X86_IO_APIC
+	setup_vector_irq(smp_processor_id());
 #endif
 	cpu_set(smp_processor_id(), cpu_online_map);
 	unlock_ipi_call_lock();
diff --git a/include/asm-x86/hw_irq.h b/include/asm-x86/hw_irq.h
index 1428b41dcbb9..18f067c310f7 100644
--- a/include/asm-x86/hw_irq.h
+++ b/include/asm-x86/hw_irq.h
@@ -97,9 +97,9 @@ extern void (*const interrupt[NR_IRQS])(void);
 #else
 typedef int vector_irq_t[NR_VECTORS];
 DECLARE_PER_CPU(vector_irq_t, vector_irq);
-extern void __setup_vector_irq(int cpu);
 extern spinlock_t vector_lock;
 #endif
+extern void setup_vector_irq(int cpu);
 
 #endif /* !ASSEMBLY_ */
 
-- 
cgit v1.2.3


From b5841765a2e735a38612c4e4a82170c33d701b3c Mon Sep 17 00:00:00 2001
From: Glauber Costa <gcosta@redhat.com>
Date: Wed, 28 May 2008 13:38:28 -0300
Subject: x86: provide connect_bsp_APIC for x86_64

Although it is not really needed, we provide it to get
closer to i386. ifdefs around it are removed in smpboot.c

Signed-off-by: Glauber Costa <gcosta@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic_64.c | 10 ++++++++++
 arch/x86/kernel/smpboot.c |  5 +----
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index d9d663ffa641..5279dc924381 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -918,6 +918,8 @@ int __init APIC_init_uniprocessor(void)
 
 	verify_local_APIC();
 
+	connect_bsp_APIC();
+
 	physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
 	apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
 
@@ -999,6 +1001,14 @@ asmlinkage void smp_error_interrupt(void)
 	irq_exit();
 }
 
+/**
+ *  * connect_bsp_APIC - attach the APIC to the interrupt system
+ *   */
+void __init connect_bsp_APIC(void)
+{
+	enable_apic_mode();
+}
+
 void disconnect_bsp_APIC(int virt_wire_setup)
 {
 	/* Go back to Virtual Wire compatibility mode */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index e09f3124738a..a569f06d789e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1116,9 +1116,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
 
 		localise_nmi_watchdog();
 
-#ifdef CONFIG_X86_32
 		connect_bsp_APIC();
-#endif
 		setup_local_APIC();
 		end_local_APIC_setup();
 		return -1;
@@ -1173,9 +1171,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 	}
 	preempt_enable();
 
-#ifdef CONFIG_X86_32
 	connect_bsp_APIC();
-#endif
+
 	/*
 	 * Switch from PIC to APIC mode.
 	 */
-- 
cgit v1.2.3


From 78e622705c69da9649ba87071d8de85054b62df8 Mon Sep 17 00:00:00 2001
From: Glauber Costa <gcosta@redhat.com>
Date: Wed, 4 Jun 2008 02:03:07 -0300
Subject: x86: change naming to match x86_64

Change unmap_cpu_to_logical_apicid to numa_remove_cpu.
Besides being shorter, it is the same name x86_64 uses. We
can save an ifdef in the code this way.

Signed-off-by: Glauber Costa <gcosta@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smpboot.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index a569f06d789e..b99c386af77d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -181,13 +181,12 @@ static void map_cpu_to_logical_apicid(void)
 	map_cpu_to_node(cpu, node);
 }
 
-static void unmap_cpu_to_logical_apicid(int cpu)
+static void numa_remove_cpu(int cpu)
 {
 	cpu_2_logical_apicid[cpu] = BAD_APICID;
 	unmap_cpu_to_node(cpu);
 }
 #else
-#define unmap_cpu_to_logical_apicid(cpu) do {} while (0)
 #define map_cpu_to_logical_apicid()  do {} while (0)
 #endif
 
@@ -946,10 +945,7 @@ restore_state:
 
 	if (boot_error) {
 		/* Try to put things back the way they were before ... */
-		unmap_cpu_to_logical_apicid(cpu);
-#ifdef CONFIG_X86_64
 		numa_remove_cpu(cpu); /* was set by numa_add_cpu */
-#endif
 		cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */
 		cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
 		cpu_clear(cpu, cpu_present_map);
@@ -1244,7 +1240,7 @@ void cpu_exit_clear(void)
 	cpu_clear(cpu, cpu_callout_map);
 	cpu_clear(cpu, cpu_callin_map);
 
-	unmap_cpu_to_logical_apicid(cpu);
+	numa_remove_cpu(cpu);
 }
 #  endif /* CONFIG_X86_32 */
 
-- 
cgit v1.2.3


From b553a1e0ff48bd66fd18f705370e47c0b4ecea61 Mon Sep 17 00:00:00 2001
From: Glauber Costa <gcosta@redhat.com>
Date: Wed, 4 Jun 2008 02:05:03 -0300
Subject: x86: remove cpu from maps

during cpu disable, take cpus out of all maps in i386, instead
of just the online map.

Signed-off-by: Glauber Costa <gcosta@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smpboot.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b99c386af77d..820c23dbe761 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1319,13 +1319,11 @@ __init void prefill_possible_map(void)
 static void __ref remove_cpu_from_maps(int cpu)
 {
 	cpu_clear(cpu, cpu_online_map);
-#ifdef CONFIG_X86_64
 	cpu_clear(cpu, cpu_callout_map);
 	cpu_clear(cpu, cpu_callin_map);
 	/* was set by cpu_init() */
 	clear_bit(cpu, (unsigned long *)&cpu_initialized);
 	numa_remove_cpu(cpu);
-#endif
 }
 
 int __cpu_disable(void)
-- 
cgit v1.2.3


From 1481a3dd42c21ac4a8b9497cb9f5df816d6b064f Mon Sep 17 00:00:00 2001
From: Glauber Costa <gcosta@redhat.com>
Date: Wed, 4 Jun 2008 15:35:03 -0300
Subject: x86: move cpu_exit_clear to process_32.c

Take it out of smpboot.c, and move it to process_32.c, closer
to its only user.

Signed-off-by: Glauber Costa <gcosta@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/process_32.c | 16 ++++++++++++++++
 arch/x86/kernel/smpboot.c    | 19 +------------------
 include/asm-x86/numa_32.h    |  1 +
 include/asm-x86/smp.h        |  1 -
 4 files changed, 18 insertions(+), 19 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index c2a11d77b1b5..9a139f6c9df3 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -74,6 +74,22 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
 
 #ifdef CONFIG_HOTPLUG_CPU
 #include <asm/nmi.h>
+
+static void cpu_exit_clear(void)
+{
+	int cpu = raw_smp_processor_id();
+
+	idle_task_exit();
+
+	cpu_uninit();
+	irq_ctx_exit(cpu);
+
+	cpu_clear(cpu, cpu_callout_map);
+	cpu_clear(cpu, cpu_callin_map);
+
+	numa_remove_cpu(cpu);
+}
+
 /* We don't actually take CPU down, just spin without interrupts. */
 static inline void play_dead(void)
 {
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 820c23dbe761..67af727f733a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -181,7 +181,7 @@ static void map_cpu_to_logical_apicid(void)
 	map_cpu_to_node(cpu, node);
 }
 
-static void numa_remove_cpu(int cpu)
+void numa_remove_cpu(int cpu)
 {
 	cpu_2_logical_apicid[cpu] = BAD_APICID;
 	unmap_cpu_to_node(cpu);
@@ -1227,23 +1227,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
 
 #ifdef CONFIG_HOTPLUG_CPU
 
-#  ifdef CONFIG_X86_32
-void cpu_exit_clear(void)
-{
-	int cpu = raw_smp_processor_id();
-
-	idle_task_exit();
-
-	cpu_uninit();
-	irq_ctx_exit(cpu);
-
-	cpu_clear(cpu, cpu_callout_map);
-	cpu_clear(cpu, cpu_callin_map);
-
-	numa_remove_cpu(cpu);
-}
-#  endif /* CONFIG_X86_32 */
-
 static void remove_siblinginfo(int cpu)
 {
 	int sibling;
diff --git a/include/asm-x86/numa_32.h b/include/asm-x86/numa_32.h
index a02674f64869..ed6c88eac848 100644
--- a/include/asm-x86/numa_32.h
+++ b/include/asm-x86/numa_32.h
@@ -2,6 +2,7 @@
 #define _ASM_X86_32_NUMA_H 1
 
 extern int pxm_to_nid(int pxm);
+extern void numa_remove_cpu(int cpu);
 
 #ifdef CONFIG_NUMA
 extern void __init remap_numa_kva(void);
diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h
index fc1007321ef6..fad45f6a193f 100644
--- a/include/asm-x86/smp.h
+++ b/include/asm-x86/smp.h
@@ -188,7 +188,6 @@ static inline int hard_smp_processor_id(void)
 #endif /* CONFIG_X86_LOCAL_APIC */
 
 #ifdef CONFIG_HOTPLUG_CPU
-extern void cpu_exit_clear(void);
 extern void cpu_uninit(void);
 #endif
 
-- 
cgit v1.2.3


From 7f6cbc905ee22c457e0dcd0bba9d4affbc290a6f Mon Sep 17 00:00:00 2001
From: Glauber Costa <gcosta@redhat.com>
Date: Wed, 4 Jun 2008 23:05:39 -0700
Subject: x86: take load_sp0 out of smpboot.c

there's no particular reason to do load_sp0 in different
places for i386 and x86_64. They should all be in cpu_init.
Right now, cpu_init itself is not integrated, but with this patch,
the code becomes closer to each other, making in easier to integrate
when the time comes.

Furthermore, although doing it in do_boot_cpu for x86_64 is fine, since it's
only a copy, load_sp0 should be executed in the cpu it refers to anyway.

Signed-off-by: Glauber Costa <gcosta@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup64.c | 1 +
 arch/x86/kernel/smpboot.c | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c
index 70ff07186772..151d3155ddf6 100644
--- a/arch/x86/kernel/setup64.c
+++ b/arch/x86/kernel/setup64.c
@@ -247,6 +247,7 @@ void __cpuinit cpu_init (void)
 		BUG();
 	enter_lazy_tlb(&init_mm, me);
 
+	load_sp0(t, &current->thread);
 	set_tss_desc(cpu, t);
 	load_TR_desc();
 	load_LDT(&init_mm.context);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 67af727f733a..3b48d1f4c7c3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -864,7 +864,6 @@ do_rest:
 	irq_ctx_init(cpu);
 #else
 	cpu_pda(cpu)->pcurrent = c_idle.idle;
-	load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread);
 	clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
 #endif
 	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
-- 
cgit v1.2.3


From 1ea598c29748a559a0086a84a016886d786e6272 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 13 Jun 2008 20:31:54 +0200
Subject: x86: fix sleep.c build error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fix:

arch/x86/kernel/acpi/sleep.c: In function ‘acpi_save_state_mem':
arch/x86/kernel/acpi/sleep.c:75: error: ‘stack_start' undeclared (first use in this function)
arch/x86/kernel/acpi/sleep.c:75: error: (Each undeclared identifier is reported
only once
arch/x86/kernel/acpi/sleep.c:75: error: for each function it appears in.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/sleep.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index a81f468ab410..e6a4b564ccaa 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -86,7 +86,9 @@ int acpi_save_state_mem(void)
 	saved_magic = 0x12345678;
 #else /* CONFIG_64BIT */
 	header->trampoline_segment = setup_trampoline() >> 4;
+#ifdef CONFIG_SMP
 	stack_start.sp = temp_stack + 4096;
+#endif
 	initial_code = (unsigned long)wakeup_long64;
 	saved_magic = 0x123456789abcdef0;
 #endif /* CONFIG_64BIT */
-- 
cgit v1.2.3


From 3c999f142665265afd0fe9190204dd051f17e505 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Fri, 20 Jun 2008 16:11:20 -0700
Subject: x86: check command line when CONFIG_X86_MPPARSE is not set, v2

if acpi=off, acpi=noirq and pci=noacpi, we need to disable apic.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: "Maciej W. Rozycki" <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/boot.c | 14 ++++++++++++++
 arch/x86/kernel/apic_32.c   |  2 +-
 arch/x86/kernel/setup.c     |  4 ++++
 arch/x86/kernel/setup_32.c  |  5 +++++
 arch/x86/kernel/setup_64.c  |  9 +++++++++
 include/asm-x86/apic.h      |  6 +++++-
 include/linux/acpi.h        |  6 ++++++
 7 files changed, 44 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 6516359922ba..5c0107602b62 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1787,6 +1787,20 @@ static int __init parse_pci(char *arg)
 }
 early_param("pci", parse_pci);
 
+int __init acpi_mps_check(void)
+{
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE)
+/* mptable code is not built-in*/
+	if (acpi_disabled || acpi_noirq) {
+		printk(KERN_WARNING "MPS support code is not built-in.\n"
+		       "Using acpi=off or acpi=noirq or pci=noacpi "
+		       "may have problem\n");
+		return 1;
+	}
+#endif
+	return 0;
+}
+
 #ifdef CONFIG_X86_IO_APIC
 static int __init parse_acpi_skip_timer_override(char *arg)
 {
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index dd8de26b2786..4932d7813bcd 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -57,7 +57,7 @@ unsigned long mp_lapic_addr;
  *
  * -1=force-disable, +1=force-enable
  */
-static int enable_local_apic __initdata;
+int enable_local_apic;
 
 /* Local APIC timer verification ok */
 static int local_apic_timer_verify_ok;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c5330f601b68..56aee55cf8dc 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -161,6 +161,10 @@ void __init setup_per_cpu_areas(void)
 	char *ptr;
 	int cpu;
 
+	/* no processor from mptable or madt */
+	if (!num_processors)
+		num_processors = 1;
+
 #ifdef CONFIG_HOTPLUG_CPU
 	prefill_possible_map();
 #else
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 369d0fe1ff9c..cad4e893df05 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -677,6 +677,11 @@ void __init setup_arch(char **cmdline_p)
 
 	parse_early_param();
 
+	if (acpi_mps_check()){
+		enable_local_apic = -1;
+		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+	}
+
 	finish_e820_parsing();
 
 	probe_roms();
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index a93300de4da9..175c696ec536 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -302,6 +302,11 @@ void __init setup_arch(char **cmdline_p)
 
 	parse_early_param();
 
+	if (acpi_mps_check()) {
+		disable_apic = 1;
+		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+	}
+
 #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
 	if (init_ohci1394_dma_early)
 		init_ohci1394_dma_on_all_controllers();
@@ -723,6 +728,10 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
 		cpu_devs[c->x86_vendor]->c_early_init(c);
 
 	validate_pat_support(c);
+
+	/* early_param could clear that, but recall get it set again */
+	if (disable_apic)
+		clear_cpu_cap(c, X86_FEATURE_APIC);
 }
 
 /*
diff --git a/include/asm-x86/apic.h b/include/asm-x86/apic.h
index 313bcaf4b6c3..9fe941cd843d 100644
--- a/include/asm-x86/apic.h
+++ b/include/asm-x86/apic.h
@@ -39,8 +39,12 @@ extern int apic_verbosity;
 extern int local_apic_timer_c2_ok;
 
 extern int ioapic_force;
-extern int disable_apic;
 
+#ifdef CONFIG_X86_64
+extern int disable_apic;
+#else
+extern int enable_local_apic;
+#endif
 /*
  * Basic functions accessing APICs.
  */
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 41f7ce7edd7a..0601075d09a1 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -82,6 +82,7 @@ char * __acpi_map_table (unsigned long phys_addr, unsigned long size);
 int early_acpi_boot_init(void);
 int acpi_boot_init (void);
 int acpi_boot_table_init (void);
+int acpi_mps_check (void);
 int acpi_numa_init (void);
 
 int acpi_table_init (void);
@@ -250,6 +251,11 @@ static inline int acpi_boot_table_init(void)
 	return 0;
 }
 
+static inline int acpi_mps_check(void)
+{
+	return 0;
+}
+
 static inline int acpi_check_resource_conflict(struct resource *res)
 {
 	return 0;
-- 
cgit v1.2.3


From ce38cc79964687d3c7e92663bc040552416fca27 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sat, 21 Jun 2008 01:14:27 -0700
Subject: x86: clean up init_amd()

1. move out calling of check_enable_amd_mmconf_dmi out of setup_64.c
   put it into init_amd(), so don't need to make extra dmi check for
   system with other cpus.
2. 15 --> 0xf

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/amd_64.c | 22 ++++++++++++++--------
 arch/x86/kernel/setup_64.c   |  4 ----
 2 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
index 30b7557c9641..958526d6a74a 100644
--- a/arch/x86/kernel/cpu/amd_64.c
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -131,7 +131,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	 * Errata 63 for SH-B3 steppings
 	 * Errata 122 for all steppings (F+ have it disabled by default)
 	 */
-	if (c->x86 == 15) {
+	if (c->x86 == 0xf) {
 		rdmsrl(MSR_K8_HWCR, value);
 		value |= 1 << 6;
 		wrmsrl(MSR_K8_HWCR, value);
@@ -143,10 +143,11 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	clear_cpu_cap(c, 0*32+31);
 
 	/* On C+ stepping K8 rep microcode works well for copy/memset */
-	level = cpuid_eax(1);
-	if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
-			     level >= 0x0f58))
-		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+	if (c->x86 == 0xf) {
+		level = cpuid_eax(1);
+		if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
+			set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+	}
 	if (c->x86 == 0x10 || c->x86 == 0x11)
 		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
 
@@ -157,7 +158,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	level = get_model_name(c);
 	if (!level) {
 		switch (c->x86) {
-		case 15:
+		case 0xf:
 			/* Should distinguish Models here, but this is only
 			   a fallback anyways. */
 			strcpy(c->x86_model_id, "Hammer");
@@ -176,14 +177,19 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	else
 		num_cache_leaves = 3;
 
-	if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
+	if (c->x86 >= 0xf && c->x86 <= 0x11)
 		set_cpu_cap(c, X86_FEATURE_K8);
 
 	/* MFENCE stops RDTSC speculation */
 	set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
 
-	if (c->x86 == 0x10)
+	if (c->x86 == 0x10) {
+		/* do this for boot cpu */
+		if (c == &boot_cpu_data)
+			check_enable_amd_mmconf_dmi();
+
 		fam10h_check_enable_mmcfg();
+	}
 
 	if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
 		unsigned long long tseg;
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 175c696ec536..c94464ab04ba 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -72,7 +72,6 @@
 #include <asm/topology.h>
 #include <asm/trampoline.h>
 #include <asm/pat.h>
-#include <asm/mmconfig.h>
 
 #include <mach_apic.h>
 #ifdef CONFIG_PARAVIRT
@@ -474,9 +473,6 @@ void __init setup_arch(char **cmdline_p)
 	conswitchp = &dummy_con;
 #endif
 #endif
-
-	/* do this before identify_cpu for boot cpu */
-	check_enable_amd_mmconf_dmi();
 }
 
 struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
-- 
cgit v1.2.3


From 04606618bb50c4ec754585a82732ea4facfe2bc9 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sat, 21 Jun 2008 01:38:41 -0700
Subject: x86: remove some acpi ifdefs in setup_32/64

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 5 +----
 arch/x86/kernel/setup_64.c | 4 ----
 2 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index cad4e893df05..e274ee6ff582 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -726,12 +726,10 @@ void __init setup_arch(char **cmdline_p)
 
 	io_delay_init();
 
-#ifdef CONFIG_ACPI
 	/*
 	 * Parse the ACPI tables for possible boot-time SMP configuration.
 	 */
 	acpi_boot_table_init();
-#endif
 
 #ifdef CONFIG_ACPI_NUMA
         /*
@@ -812,9 +810,8 @@ void __init setup_arch(char **cmdline_p)
 
 	early_quirks();
 
-#ifdef CONFIG_ACPI
 	acpi_boot_init();
-#endif
+
 #if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
 	if (smp_found_config)
 		get_smp_config();
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index c94464ab04ba..9b516eecada0 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -355,13 +355,11 @@ void __init setup_arch(char **cmdline_p)
 	kvmclock_init();
 #endif
 
-#ifdef CONFIG_ACPI
 	/*
 	 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
 	 * Call this early for SRAT node setup.
 	 */
 	acpi_boot_table_init();
-#endif
 
 	/* How many end-of-memory variables you have, grandma! */
 	max_low_pfn = end_pfn;
@@ -432,12 +430,10 @@ void __init setup_arch(char **cmdline_p)
 
 	early_quirks();
 
-#ifdef CONFIG_ACPI
 	/*
 	 * Read APIC and some other early information from ACPI tables.
 	 */
 	acpi_boot_init();
-#endif
 
 	init_cpu_to_node();
 
-- 
cgit v1.2.3


From f580366f77cc4e035a68369105fbeae5bf436b4c Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sat, 21 Jun 2008 03:24:19 -0700
Subject: x86: seperate funcs from setup_64 to cpu common_64.c

Signed-off-by: Yinghai Lu <yhlu.kernel@mail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/Makefile    |   2 +-
 arch/x86/kernel/cpu/common_64.c | 406 ++++++++++++++++++++
 arch/x86/kernel/setup_64.c      | 823 ----------------------------------------
 include/asm-x86/processor.h     |   1 +
 4 files changed, 408 insertions(+), 824 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/common_64.c
 delete mode 100644 arch/x86/kernel/setup_64.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 65b1be5fe9ce..ee76eaad3001 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -6,7 +6,7 @@ obj-y			:= intel_cacheinfo.o addon_cpuid_features.o
 obj-y			+= proc.o feature_names.o
 
 obj-$(CONFIG_X86_32)	+= common.o bugs.o
-obj-$(CONFIG_X86_64)	+= bugs_64.o
+obj-$(CONFIG_X86_64)	+= common_64.o bugs_64.o
 obj-$(CONFIG_X86_32)	+= amd.o
 obj-$(CONFIG_X86_64)	+= amd_64.o
 obj-$(CONFIG_X86_32)	+= cyrix.o
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
new file mode 100644
index 000000000000..b6f205063f74
--- /dev/null
+++ b/arch/x86/kernel/cpu/common_64.c
@@ -0,0 +1,406 @@
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/bootmem.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/msr.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+#include <asm/mtrr.h>
+#include <asm/mce.h>
+#include <asm/pat.h>
+#include <asm/numa.h>
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/mpspec.h>
+#include <asm/apic.h>
+#include <mach_apic.h>
+#endif
+
+#include "cpu.h"
+
+/* We need valid kernel segments for data and code in long mode too
+ * IRET will check the segment types  kkeil 2000/10/28
+ * Also sysret mandates a special GDT layout
+ */
+/* The TLS descriptors are currently at a different place compared to i386.
+   Hopefully nobody expects them at a fixed place (Wine?) */
+DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
+	[GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
+	[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
+	[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
+	[GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
+	[GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
+	[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
+} };
+EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+
+__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
+
+/* Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one. */
+void switch_to_new_gdt(void)
+{
+	struct desc_ptr gdt_descr;
+
+	gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
+	gdt_descr.size = GDT_SIZE - 1;
+	load_gdt(&gdt_descr);
+}
+
+struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+
+static void __cpuinit default_init(struct cpuinfo_x86 *c)
+{
+	display_cacheinfo(c);
+}
+
+static struct cpu_dev __cpuinitdata default_cpu = {
+	.c_init	= default_init,
+	.c_vendor = "Unknown",
+};
+static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
+
+int __cpuinit get_model_name(struct cpuinfo_x86 *c)
+{
+	unsigned int *v;
+
+	if (c->extended_cpuid_level < 0x80000004)
+		return 0;
+
+	v = (unsigned int *) c->x86_model_id;
+	cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
+	cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
+	cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
+	c->x86_model_id[48] = 0;
+	return 1;
+}
+
+
+void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
+{
+	unsigned int n, dummy, eax, ebx, ecx, edx;
+
+	n = c->extended_cpuid_level;
+
+	if (n >= 0x80000005) {
+		cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
+		printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
+		       "D cache %dK (%d bytes/line)\n",
+		       edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
+		c->x86_cache_size = (ecx>>24) + (edx>>24);
+		/* On K8 L1 TLB is inclusive, so don't count it */
+		c->x86_tlbsize = 0;
+	}
+
+	if (n >= 0x80000006) {
+		cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
+		ecx = cpuid_ecx(0x80000006);
+		c->x86_cache_size = ecx >> 16;
+		c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
+
+		printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
+		c->x86_cache_size, ecx & 0xFF);
+	}
+	if (n >= 0x80000008) {
+		cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
+		c->x86_virt_bits = (eax >> 8) & 0xff;
+		c->x86_phys_bits = eax & 0xff;
+	}
+}
+
+void __cpuinit detect_ht(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	u32 eax, ebx, ecx, edx;
+	int index_msb, core_bits;
+
+	cpuid(1, &eax, &ebx, &ecx, &edx);
+
+
+	if (!cpu_has(c, X86_FEATURE_HT))
+		return;
+	if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
+		goto out;
+
+	smp_num_siblings = (ebx & 0xff0000) >> 16;
+
+	if (smp_num_siblings == 1) {
+		printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
+	} else if (smp_num_siblings > 1) {
+
+		if (smp_num_siblings > NR_CPUS) {
+			printk(KERN_WARNING "CPU: Unsupported number of "
+			       "siblings %d", smp_num_siblings);
+			smp_num_siblings = 1;
+			return;
+		}
+
+		index_msb = get_count_order(smp_num_siblings);
+		c->phys_proc_id = phys_pkg_id(index_msb);
+
+		smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+
+		index_msb = get_count_order(smp_num_siblings);
+
+		core_bits = get_count_order(c->x86_max_cores);
+
+		c->cpu_core_id = phys_pkg_id(index_msb) &
+					       ((1 << core_bits) - 1);
+	}
+out:
+	if ((c->x86_max_cores * smp_num_siblings) > 1) {
+		printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
+		       c->phys_proc_id);
+		printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
+		       c->cpu_core_id);
+	}
+
+#endif
+}
+
+static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
+{
+	char *v = c->x86_vendor_id;
+	int i;
+	static int printed;
+
+	for (i = 0; i < X86_VENDOR_NUM; i++) {
+		if (cpu_devs[i]) {
+			if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
+			    (cpu_devs[i]->c_ident[1] &&
+			    !strcmp(v, cpu_devs[i]->c_ident[1]))) {
+				c->x86_vendor = i;
+				this_cpu = cpu_devs[i];
+				return;
+			}
+		}
+	}
+	if (!printed) {
+		printed++;
+		printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
+		printk(KERN_ERR "CPU: Your system may be unstable.\n");
+	}
+	c->x86_vendor = X86_VENDOR_UNKNOWN;
+}
+
+static void __init early_cpu_support_print(void)
+{
+	int i,j;
+	struct cpu_dev *cpu_devx;
+
+	printk("KERNEL supported cpus:\n");
+	for (i = 0; i < X86_VENDOR_NUM; i++) {
+		cpu_devx = cpu_devs[i];
+		if (!cpu_devx)
+			continue;
+		for (j = 0; j < 2; j++) {
+			if (!cpu_devx->c_ident[j])
+				continue;
+			printk("  %s %s\n", cpu_devx->c_vendor,
+				cpu_devx->c_ident[j]);
+		}
+	}
+}
+
+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
+
+void __init early_cpu_init(void)
+{
+        struct cpu_vendor_dev *cvdev;
+
+        for (cvdev = __x86cpuvendor_start ;
+             cvdev < __x86cpuvendor_end   ;
+             cvdev++)
+                cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
+	early_cpu_support_print();
+	early_identify_cpu(&boot_cpu_data);
+}
+
+/* Do some early cpuid on the boot CPU to get some parameter that are
+   needed before check_bugs. Everything advanced is in identify_cpu
+   below. */
+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
+{
+	u32 tfms, xlvl;
+
+	c->loops_per_jiffy = loops_per_jiffy;
+	c->x86_cache_size = -1;
+	c->x86_vendor = X86_VENDOR_UNKNOWN;
+	c->x86_model = c->x86_mask = 0;	/* So far unknown... */
+	c->x86_vendor_id[0] = '\0'; /* Unset */
+	c->x86_model_id[0] = '\0';  /* Unset */
+	c->x86_clflush_size = 64;
+	c->x86_cache_alignment = c->x86_clflush_size;
+	c->x86_max_cores = 1;
+	c->x86_coreid_bits = 0;
+	c->extended_cpuid_level = 0;
+	memset(&c->x86_capability, 0, sizeof c->x86_capability);
+
+	/* Get vendor name */
+	cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
+	      (unsigned int *)&c->x86_vendor_id[0],
+	      (unsigned int *)&c->x86_vendor_id[8],
+	      (unsigned int *)&c->x86_vendor_id[4]);
+
+	get_cpu_vendor(c);
+
+	/* Initialize the standard set of capabilities */
+	/* Note that the vendor-specific code below might override */
+
+	/* Intel-defined flags: level 0x00000001 */
+	if (c->cpuid_level >= 0x00000001) {
+		__u32 misc;
+		cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
+		      &c->x86_capability[0]);
+		c->x86 = (tfms >> 8) & 0xf;
+		c->x86_model = (tfms >> 4) & 0xf;
+		c->x86_mask = tfms & 0xf;
+		if (c->x86 == 0xf)
+			c->x86 += (tfms >> 20) & 0xff;
+		if (c->x86 >= 0x6)
+			c->x86_model += ((tfms >> 16) & 0xF) << 4;
+		if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
+			c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
+	} else {
+		/* Have CPUID level 0 only - unheard of */
+		c->x86 = 4;
+	}
+
+	c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
+#ifdef CONFIG_SMP
+	c->phys_proc_id = c->initial_apicid;
+#endif
+	/* AMD-defined flags: level 0x80000001 */
+	xlvl = cpuid_eax(0x80000000);
+	c->extended_cpuid_level = xlvl;
+	if ((xlvl & 0xffff0000) == 0x80000000) {
+		if (xlvl >= 0x80000001) {
+			c->x86_capability[1] = cpuid_edx(0x80000001);
+			c->x86_capability[6] = cpuid_ecx(0x80000001);
+		}
+		if (xlvl >= 0x80000004)
+			get_model_name(c); /* Default name */
+	}
+
+	/* Transmeta-defined flags: level 0x80860001 */
+	xlvl = cpuid_eax(0x80860000);
+	if ((xlvl & 0xffff0000) == 0x80860000) {
+		/* Don't set x86_cpuid_level here for now to not confuse. */
+		if (xlvl >= 0x80860001)
+			c->x86_capability[2] = cpuid_edx(0x80860001);
+	}
+
+	c->extended_cpuid_level = cpuid_eax(0x80000000);
+	if (c->extended_cpuid_level >= 0x80000007)
+		c->x86_power = cpuid_edx(0x80000007);
+
+	if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
+	    cpu_devs[c->x86_vendor]->c_early_init)
+		cpu_devs[c->x86_vendor]->c_early_init(c);
+
+	validate_pat_support(c);
+
+	/* early_param could clear that, but recall get it set again */
+	if (disable_apic)
+		clear_cpu_cap(c, X86_FEATURE_APIC);
+}
+
+/*
+ * This does the hard work of actually picking apart the CPU stuff...
+ */
+void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+{
+	int i;
+
+	early_identify_cpu(c);
+
+	init_scattered_cpuid_features(c);
+
+	c->apicid = phys_pkg_id(0);
+
+	/*
+	 * Vendor-specific initialization.  In this section we
+	 * canonicalize the feature flags, meaning if there are
+	 * features a certain CPU supports which CPUID doesn't
+	 * tell us, CPUID claiming incorrect flags, or other bugs,
+	 * we handle them here.
+	 *
+	 * At the end of this section, c->x86_capability better
+	 * indicate the features this CPU genuinely supports!
+	 */
+	if (this_cpu->c_init)
+		this_cpu->c_init(c);
+
+	detect_ht(c);
+
+	/*
+	 * On SMP, boot_cpu_data holds the common feature set between
+	 * all CPUs; so make sure that we indicate which features are
+	 * common between the CPUs.  The first time this routine gets
+	 * executed, c == &boot_cpu_data.
+	 */
+	if (c != &boot_cpu_data) {
+		/* AND the already accumulated flags with these */
+		for (i = 0; i < NCAPINTS; i++)
+			boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
+	}
+
+	/* Clear all flags overriden by options */
+	for (i = 0; i < NCAPINTS; i++)
+		c->x86_capability[i] &= ~cleared_cpu_caps[i];
+
+#ifdef CONFIG_X86_MCE
+	mcheck_init(c);
+#endif
+	select_idle_routine(c);
+
+#ifdef CONFIG_NUMA
+	numa_add_cpu(smp_processor_id());
+#endif
+
+}
+
+void __cpuinit identify_boot_cpu(void)
+{
+	identify_cpu(&boot_cpu_data);
+}
+
+void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
+{
+	BUG_ON(c == &boot_cpu_data);
+	identify_cpu(c);
+	mtrr_ap_init();
+}
+
+static __init int setup_noclflush(char *arg)
+{
+	setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
+	return 1;
+}
+__setup("noclflush", setup_noclflush);
+
+void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
+{
+	if (c->x86_model_id[0])
+		printk(KERN_CONT "%s", c->x86_model_id);
+
+	if (c->x86_mask || c->cpuid_level >= 0)
+		printk(KERN_CONT " stepping %02x\n", c->x86_mask);
+	else
+		printk(KERN_CONT "\n");
+}
+
+static __init int setup_disablecpuid(char *arg)
+{
+	int bit;
+	if (get_option(&arg, &bit) && bit < NCAPINTS*32)
+		setup_clear_cpu_cap(bit);
+	else
+		return 0;
+	return 1;
+}
+__setup("clearcpuid=", setup_disablecpuid);
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
deleted file mode 100644
index 9b516eecada0..000000000000
--- a/arch/x86/kernel/setup_64.c
+++ /dev/null
@@ -1,823 +0,0 @@
-/*
- *  Copyright (C) 1995  Linus Torvalds
- */
-
-/*
- * This file handles the architecture-dependent parts of initialization
- */
-
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/stddef.h>
-#include <linux/unistd.h>
-#include <linux/ptrace.h>
-#include <linux/slab.h>
-#include <linux/user.h>
-#include <linux/screen_info.h>
-#include <linux/ioport.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/initrd.h>
-#include <linux/highmem.h>
-#include <linux/bootmem.h>
-#include <linux/module.h>
-#include <asm/processor.h>
-#include <linux/console.h>
-#include <linux/seq_file.h>
-#include <linux/crash_dump.h>
-#include <linux/root_dev.h>
-#include <linux/pci.h>
-#include <asm/pci-direct.h>
-#include <linux/efi.h>
-#include <linux/acpi.h>
-#include <linux/kallsyms.h>
-#include <linux/edd.h>
-#include <linux/iscsi_ibft.h>
-#include <linux/mmzone.h>
-#include <linux/kexec.h>
-#include <linux/cpufreq.h>
-#include <linux/dmi.h>
-#include <linux/dma-mapping.h>
-#include <linux/ctype.h>
-#include <linux/sort.h>
-#include <linux/uaccess.h>
-#include <linux/init_ohci1394_dma.h>
-#include <linux/kvm_para.h>
-
-#include <asm/mtrr.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <asm/vsyscall.h>
-#include <asm/io.h>
-#include <asm/smp.h>
-#include <asm/msr.h>
-#include <asm/desc.h>
-#include <video/edid.h>
-#include <asm/e820.h>
-#include <asm/mpspec.h>
-#include <asm/dma.h>
-#include <asm/gart.h>
-#include <asm/mpspec.h>
-#include <asm/mmu_context.h>
-#include <asm/proto.h>
-#include <asm/setup.h>
-#include <asm/numa.h>
-#include <asm/sections.h>
-#include <asm/dmi.h>
-#include <asm/cacheflush.h>
-#include <asm/mce.h>
-#include <asm/ds.h>
-#include <asm/topology.h>
-#include <asm/trampoline.h>
-#include <asm/pat.h>
-
-#include <mach_apic.h>
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else
-#define ARCH_SETUP
-#endif
-
-/*
- * Machine setup..
- */
-
-struct cpuinfo_x86 boot_cpu_data __read_mostly;
-EXPORT_SYMBOL(boot_cpu_data);
-
-__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
-
-unsigned long mmu_cr4_features;
-
-/* Boot loader ID as an integer, for the benefit of proc_dointvec */
-int bootloader_type;
-
-unsigned long saved_video_mode;
-
-/*
- * Early DMI memory
- */
-int dmi_alloc_index;
-char dmi_alloc_data[DMI_MAX_DATA];
-
-/*
- * Setup options
- */
-struct screen_info screen_info;
-EXPORT_SYMBOL(screen_info);
-struct sys_desc_table_struct {
-	unsigned short length;
-	unsigned char table[0];
-};
-
-struct edid_info edid_info;
-EXPORT_SYMBOL_GPL(edid_info);
-
-extern int root_mountflags;
-
-static char __initdata command_line[COMMAND_LINE_SIZE];
-
-static struct resource standard_io_resources[] = {
-	{ .name = "dma1", .start = 0x00, .end = 0x1f,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "pic1", .start = 0x20, .end = 0x21,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "timer0", .start = 0x40, .end = 0x43,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "timer1", .start = 0x50, .end = 0x53,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "keyboard", .start = 0x60, .end = 0x60,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "keyboard", .start = 0x64, .end = 0x64,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "dma page reg", .start = 0x80, .end = 0x8f,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "pic2", .start = 0xa0, .end = 0xa1,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "dma2", .start = 0xc0, .end = 0xdf,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "fpu", .start = 0xf0, .end = 0xff,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO }
-};
-
-#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
-
-static struct resource data_resource = {
-	.name = "Kernel data",
-	.start = 0,
-	.end = 0,
-	.flags = IORESOURCE_RAM,
-};
-static struct resource code_resource = {
-	.name = "Kernel code",
-	.start = 0,
-	.end = 0,
-	.flags = IORESOURCE_RAM,
-};
-static struct resource bss_resource = {
-	.name = "Kernel bss",
-	.start = 0,
-	.end = 0,
-	.flags = IORESOURCE_RAM,
-};
-
-static void __init early_cpu_init(void);
-static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
-
-#ifdef CONFIG_PROC_VMCORE
-/* elfcorehdr= specifies the location of elf core header
- * stored by the crashed kernel. This option will be passed
- * by kexec loader to the capture kernel.
- */
-static int __init setup_elfcorehdr(char *arg)
-{
-	char *end;
-	if (!arg)
-		return -EINVAL;
-	elfcorehdr_addr = memparse(arg, &end);
-	return end > arg ? 0 : -EINVAL;
-}
-early_param("elfcorehdr", setup_elfcorehdr);
-#endif
-
-#ifndef CONFIG_NUMA
-static void __init
-contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
-{
-	unsigned long bootmap_size, bootmap;
-
-	bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
-	bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
-				 PAGE_SIZE);
-	if (bootmap == -1L)
-		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
-	bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
-	e820_register_active_regions(0, start_pfn, end_pfn);
-	free_bootmem_with_active_regions(0, end_pfn);
-	early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
-	reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
-}
-#endif
-
-#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
-struct edd edd;
-#ifdef CONFIG_EDD_MODULE
-EXPORT_SYMBOL(edd);
-#endif
-/**
- * copy_edd() - Copy the BIOS EDD information
- *              from boot_params into a safe place.
- *
- */
-static inline void copy_edd(void)
-{
-     memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
-	    sizeof(edd.mbr_signature));
-     memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
-     edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
-     edd.edd_info_nr = boot_params.eddbuf_entries;
-}
-#else
-static inline void copy_edd(void)
-{
-}
-#endif
-
-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
-void __attribute__((weak)) __init memory_setup(void)
-{
-       machine_specific_memory_setup();
-}
-
-/* Current gdt points %fs at the "master" per-cpu area: after this,
- * it's on the real one. */
-void switch_to_new_gdt(void)
-{
-	struct desc_ptr gdt_descr;
-
-	gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
-	gdt_descr.size = GDT_SIZE - 1;
-	load_gdt(&gdt_descr);
-}
-
-/*
- * setup_arch - architecture-specific boot-time initializations
- *
- * Note: On x86_64, fixmaps are ready for use even before this is called.
- */
-void __init setup_arch(char **cmdline_p)
-{
-	unsigned i;
-
-	printk(KERN_INFO "Command line: %s\n", boot_command_line);
-
-	ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
-	screen_info = boot_params.screen_info;
-	edid_info = boot_params.edid_info;
-	saved_video_mode = boot_params.hdr.vid_mode;
-	bootloader_type = boot_params.hdr.type_of_loader;
-
-#ifdef CONFIG_BLK_DEV_RAM
-	rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
-	rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
-	rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
-#endif
-#ifdef CONFIG_EFI
-	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
-		     "EL64", 4)) {
-		efi_enabled = 1;
-		efi_reserve_early();
-	}
-#endif
-
-	ARCH_SETUP
-
-	setup_memory_map();
-	copy_edd();
-
-	if (!boot_params.hdr.root_flags)
-		root_mountflags &= ~MS_RDONLY;
-	init_mm.start_code = (unsigned long) &_text;
-	init_mm.end_code = (unsigned long) &_etext;
-	init_mm.end_data = (unsigned long) &_edata;
-	init_mm.brk = (unsigned long) &_end;
-
-	code_resource.start = virt_to_phys(&_text);
-	code_resource.end = virt_to_phys(&_etext)-1;
-	data_resource.start = virt_to_phys(&_etext);
-	data_resource.end = virt_to_phys(&_edata)-1;
-	bss_resource.start = virt_to_phys(&__bss_start);
-	bss_resource.end = virt_to_phys(&__bss_stop)-1;
-
-	early_cpu_init();
-	early_identify_cpu(&boot_cpu_data);
-
-	strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
-	*cmdline_p = command_line;
-
-	parse_setup_data();
-
-	parse_early_param();
-
-	if (acpi_mps_check()) {
-		disable_apic = 1;
-		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
-	}
-
-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
-	if (init_ohci1394_dma_early)
-		init_ohci1394_dma_on_all_controllers();
-#endif
-
-	finish_e820_parsing();
-
-	/* after parse_early_param, so could debug it */
-	insert_resource(&iomem_resource, &code_resource);
-	insert_resource(&iomem_resource, &data_resource);
-	insert_resource(&iomem_resource, &bss_resource);
-
-	early_gart_iommu_check();
-
-	e820_register_active_regions(0, 0, -1UL);
-	/*
-	 * partially used pages are not usable - thus
-	 * we are rounding upwards:
-	 */
-	end_pfn = e820_end_of_ram();
-
-	/* pre allocte 4k for mptable mpc */
-	early_reserve_e820_mpc_new();
-	/* update e820 for memory not covered by WB MTRRs */
-	mtrr_bp_init();
-	if (mtrr_trim_uncached_memory(end_pfn)) {
-		remove_all_active_ranges();
-		e820_register_active_regions(0, 0, -1UL);
-		end_pfn = e820_end_of_ram();
-	}
-
-	num_physpages = end_pfn;
-
-	check_efer();
-
-	max_pfn_mapped = init_memory_mapping(0, (end_pfn << PAGE_SHIFT));
-	if (efi_enabled)
-		efi_init();
-
-	vsmp_init();
-
-	dmi_scan_machine();
-
-	io_delay_init();
-
-#ifdef CONFIG_KVM_CLOCK
-	kvmclock_init();
-#endif
-
-	/*
-	 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
-	 * Call this early for SRAT node setup.
-	 */
-	acpi_boot_table_init();
-
-	/* How many end-of-memory variables you have, grandma! */
-	max_low_pfn = end_pfn;
-	max_pfn = end_pfn;
-	high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
-
-	/* Remove active ranges so rediscovery with NUMA-awareness happens */
-	remove_all_active_ranges();
-
-#ifdef CONFIG_ACPI_NUMA
-	/*
-	 * Parse SRAT to discover nodes.
-	 */
-	acpi_numa_init();
-#endif
-
-#ifdef CONFIG_NUMA
-	numa_initmem_init(0, end_pfn);
-#else
-	contig_initmem_init(0, end_pfn);
-#endif
-
-	dma32_reserve_bootmem();
-
-#ifdef CONFIG_ACPI_SLEEP
-	/*
-	 * Reserve low memory region for sleep support.
-	 */
-       acpi_reserve_bootmem();
-#endif
-
-#ifdef CONFIG_X86_MPPARSE
-       /*
-	* Find and reserve possible boot-time SMP configuration:
-	*/
-	find_smp_config();
-#endif
-#ifdef CONFIG_BLK_DEV_INITRD
-	if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
-		unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
-		unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;
-		unsigned long ramdisk_end   = ramdisk_image + ramdisk_size;
-		unsigned long end_of_mem    = end_pfn << PAGE_SHIFT;
-
-		if (ramdisk_end <= end_of_mem) {
-			/*
-			 * don't need to reserve again, already reserved early
-			 * in x86_64_start_kernel, and early_res_to_bootmem
-			 * convert that to reserved in bootmem
-			 */
-			initrd_start = ramdisk_image + PAGE_OFFSET;
-			initrd_end = initrd_start+ramdisk_size;
-		} else {
-			free_bootmem(ramdisk_image, ramdisk_size);
-			printk(KERN_ERR "initrd extends beyond end of memory "
-			       "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
-			       ramdisk_end, end_of_mem);
-			initrd_start = 0;
-		}
-	}
-#endif
-	reserve_crashkernel();
-
-	reserve_ibft_region();
-
-	paging_init();
-	map_vsyscall();
-
-	early_quirks();
-
-	/*
-	 * Read APIC and some other early information from ACPI tables.
-	 */
-	acpi_boot_init();
-
-	init_cpu_to_node();
-
-#ifdef CONFIG_X86_MPPARSE
-	/*
-	 * get boot-time SMP configuration:
-	 */
-	if (smp_found_config)
-		get_smp_config();
-#endif
-	init_apic_mappings();
-	ioapic_init_mappings();
-
-	kvm_guest_init();
-
-	/*
-	 * We trust e820 completely. No explicit ROM probing in memory.
-	 */
-	e820_reserve_resources();
-	e820_mark_nosave_regions(end_pfn);
-
-	/* request I/O space for devices used on all i[345]86 PCs */
-	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
-		request_resource(&ioport_resource, &standard_io_resources[i]);
-
-	e820_setup_gap();
-
-#ifdef CONFIG_VT
-#if defined(CONFIG_VGA_CONSOLE)
-	if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
-		conswitchp = &vga_con;
-#elif defined(CONFIG_DUMMY_CONSOLE)
-	conswitchp = &dummy_con;
-#endif
-#endif
-}
-
-struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
-
-static void __cpuinit default_init(struct cpuinfo_x86 *c)
-{
-	display_cacheinfo(c);
-}
-
-static struct cpu_dev __cpuinitdata default_cpu = {
-	.c_init	= default_init,
-	.c_vendor = "Unknown",
-};
-static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
-
-int __cpuinit get_model_name(struct cpuinfo_x86 *c)
-{
-	unsigned int *v;
-
-	if (c->extended_cpuid_level < 0x80000004)
-		return 0;
-
-	v = (unsigned int *) c->x86_model_id;
-	cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
-	cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
-	cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
-	c->x86_model_id[48] = 0;
-	return 1;
-}
-
-
-void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
-{
-	unsigned int n, dummy, eax, ebx, ecx, edx;
-
-	n = c->extended_cpuid_level;
-
-	if (n >= 0x80000005) {
-		cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
-		printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
-		       "D cache %dK (%d bytes/line)\n",
-		       edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
-		c->x86_cache_size = (ecx>>24) + (edx>>24);
-		/* On K8 L1 TLB is inclusive, so don't count it */
-		c->x86_tlbsize = 0;
-	}
-
-	if (n >= 0x80000006) {
-		cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
-		ecx = cpuid_ecx(0x80000006);
-		c->x86_cache_size = ecx >> 16;
-		c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
-
-		printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
-		c->x86_cache_size, ecx & 0xFF);
-	}
-	if (n >= 0x80000008) {
-		cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
-		c->x86_virt_bits = (eax >> 8) & 0xff;
-		c->x86_phys_bits = eax & 0xff;
-	}
-}
-
-void __cpuinit detect_ht(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
-	u32 eax, ebx, ecx, edx;
-	int index_msb, core_bits;
-
-	cpuid(1, &eax, &ebx, &ecx, &edx);
-
-
-	if (!cpu_has(c, X86_FEATURE_HT))
-		return;
-	if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
-		goto out;
-
-	smp_num_siblings = (ebx & 0xff0000) >> 16;
-
-	if (smp_num_siblings == 1) {
-		printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
-	} else if (smp_num_siblings > 1) {
-
-		if (smp_num_siblings > NR_CPUS) {
-			printk(KERN_WARNING "CPU: Unsupported number of "
-			       "siblings %d", smp_num_siblings);
-			smp_num_siblings = 1;
-			return;
-		}
-
-		index_msb = get_count_order(smp_num_siblings);
-		c->phys_proc_id = phys_pkg_id(index_msb);
-
-		smp_num_siblings = smp_num_siblings / c->x86_max_cores;
-
-		index_msb = get_count_order(smp_num_siblings);
-
-		core_bits = get_count_order(c->x86_max_cores);
-
-		c->cpu_core_id = phys_pkg_id(index_msb) &
-					       ((1 << core_bits) - 1);
-	}
-out:
-	if ((c->x86_max_cores * smp_num_siblings) > 1) {
-		printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
-		       c->phys_proc_id);
-		printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
-		       c->cpu_core_id);
-	}
-
-#endif
-}
-
-static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
-{
-	char *v = c->x86_vendor_id;
-	int i;
-	static int printed;
-
-	for (i = 0; i < X86_VENDOR_NUM; i++) {
-		if (cpu_devs[i]) {
-			if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
-			    (cpu_devs[i]->c_ident[1] &&
-			    !strcmp(v, cpu_devs[i]->c_ident[1]))) {
-				c->x86_vendor = i;
-				this_cpu = cpu_devs[i];
-				return;
-			}
-		}
-	}
-	if (!printed) {
-		printed++;
-		printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
-		printk(KERN_ERR "CPU: Your system may be unstable.\n");
-	}
-	c->x86_vendor = X86_VENDOR_UNKNOWN;
-}
-
-static void __init early_cpu_support_print(void)
-{
-	int i,j;
-	struct cpu_dev *cpu_devx;
-
-	printk("KERNEL supported cpus:\n");
-	for (i = 0; i < X86_VENDOR_NUM; i++) {
-		cpu_devx = cpu_devs[i];
-		if (!cpu_devx)
-			continue;
-		for (j = 0; j < 2; j++) {
-			if (!cpu_devx->c_ident[j])
-				continue;
-			printk("  %s %s\n", cpu_devx->c_vendor,
-				cpu_devx->c_ident[j]);
-		}
-	}
-}
-
-static void __init early_cpu_init(void)
-{
-        struct cpu_vendor_dev *cvdev;
-
-        for (cvdev = __x86cpuvendor_start ;
-             cvdev < __x86cpuvendor_end   ;
-             cvdev++)
-                cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
-	early_cpu_support_print();
-}
-
-/* Do some early cpuid on the boot CPU to get some parameter that are
-   needed before check_bugs. Everything advanced is in identify_cpu
-   below. */
-static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
-{
-	u32 tfms, xlvl;
-
-	c->loops_per_jiffy = loops_per_jiffy;
-	c->x86_cache_size = -1;
-	c->x86_vendor = X86_VENDOR_UNKNOWN;
-	c->x86_model = c->x86_mask = 0;	/* So far unknown... */
-	c->x86_vendor_id[0] = '\0'; /* Unset */
-	c->x86_model_id[0] = '\0';  /* Unset */
-	c->x86_clflush_size = 64;
-	c->x86_cache_alignment = c->x86_clflush_size;
-	c->x86_max_cores = 1;
-	c->x86_coreid_bits = 0;
-	c->extended_cpuid_level = 0;
-	memset(&c->x86_capability, 0, sizeof c->x86_capability);
-
-	/* Get vendor name */
-	cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
-	      (unsigned int *)&c->x86_vendor_id[0],
-	      (unsigned int *)&c->x86_vendor_id[8],
-	      (unsigned int *)&c->x86_vendor_id[4]);
-
-	get_cpu_vendor(c);
-
-	/* Initialize the standard set of capabilities */
-	/* Note that the vendor-specific code below might override */
-
-	/* Intel-defined flags: level 0x00000001 */
-	if (c->cpuid_level >= 0x00000001) {
-		__u32 misc;
-		cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
-		      &c->x86_capability[0]);
-		c->x86 = (tfms >> 8) & 0xf;
-		c->x86_model = (tfms >> 4) & 0xf;
-		c->x86_mask = tfms & 0xf;
-		if (c->x86 == 0xf)
-			c->x86 += (tfms >> 20) & 0xff;
-		if (c->x86 >= 0x6)
-			c->x86_model += ((tfms >> 16) & 0xF) << 4;
-		if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
-			c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
-	} else {
-		/* Have CPUID level 0 only - unheard of */
-		c->x86 = 4;
-	}
-
-	c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
-#ifdef CONFIG_SMP
-	c->phys_proc_id = c->initial_apicid;
-#endif
-	/* AMD-defined flags: level 0x80000001 */
-	xlvl = cpuid_eax(0x80000000);
-	c->extended_cpuid_level = xlvl;
-	if ((xlvl & 0xffff0000) == 0x80000000) {
-		if (xlvl >= 0x80000001) {
-			c->x86_capability[1] = cpuid_edx(0x80000001);
-			c->x86_capability[6] = cpuid_ecx(0x80000001);
-		}
-		if (xlvl >= 0x80000004)
-			get_model_name(c); /* Default name */
-	}
-
-	/* Transmeta-defined flags: level 0x80860001 */
-	xlvl = cpuid_eax(0x80860000);
-	if ((xlvl & 0xffff0000) == 0x80860000) {
-		/* Don't set x86_cpuid_level here for now to not confuse. */
-		if (xlvl >= 0x80860001)
-			c->x86_capability[2] = cpuid_edx(0x80860001);
-	}
-
-	c->extended_cpuid_level = cpuid_eax(0x80000000);
-	if (c->extended_cpuid_level >= 0x80000007)
-		c->x86_power = cpuid_edx(0x80000007);
-
-	if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
-	    cpu_devs[c->x86_vendor]->c_early_init)
-		cpu_devs[c->x86_vendor]->c_early_init(c);
-
-	validate_pat_support(c);
-
-	/* early_param could clear that, but recall get it set again */
-	if (disable_apic)
-		clear_cpu_cap(c, X86_FEATURE_APIC);
-}
-
-/*
- * This does the hard work of actually picking apart the CPU stuff...
- */
-void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
-{
-	int i;
-
-	early_identify_cpu(c);
-
-	init_scattered_cpuid_features(c);
-
-	c->apicid = phys_pkg_id(0);
-
-	/*
-	 * Vendor-specific initialization.  In this section we
-	 * canonicalize the feature flags, meaning if there are
-	 * features a certain CPU supports which CPUID doesn't
-	 * tell us, CPUID claiming incorrect flags, or other bugs,
-	 * we handle them here.
-	 *
-	 * At the end of this section, c->x86_capability better
-	 * indicate the features this CPU genuinely supports!
-	 */
-	if (this_cpu->c_init)
-		this_cpu->c_init(c);
-
-	detect_ht(c);
-
-	/*
-	 * On SMP, boot_cpu_data holds the common feature set between
-	 * all CPUs; so make sure that we indicate which features are
-	 * common between the CPUs.  The first time this routine gets
-	 * executed, c == &boot_cpu_data.
-	 */
-	if (c != &boot_cpu_data) {
-		/* AND the already accumulated flags with these */
-		for (i = 0; i < NCAPINTS; i++)
-			boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
-	}
-
-	/* Clear all flags overriden by options */
-	for (i = 0; i < NCAPINTS; i++)
-		c->x86_capability[i] &= ~cleared_cpu_caps[i];
-
-#ifdef CONFIG_X86_MCE
-	mcheck_init(c);
-#endif
-	select_idle_routine(c);
-
-#ifdef CONFIG_NUMA
-	numa_add_cpu(smp_processor_id());
-#endif
-
-}
-
-void __cpuinit identify_boot_cpu(void)
-{
-	identify_cpu(&boot_cpu_data);
-}
-
-void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
-{
-	BUG_ON(c == &boot_cpu_data);
-	identify_cpu(c);
-	mtrr_ap_init();
-}
-
-static __init int setup_noclflush(char *arg)
-{
-	setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
-	return 1;
-}
-__setup("noclflush", setup_noclflush);
-
-void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
-{
-	if (c->x86_model_id[0])
-		printk(KERN_CONT "%s", c->x86_model_id);
-
-	if (c->x86_mask || c->cpuid_level >= 0)
-		printk(KERN_CONT " stepping %02x\n", c->x86_mask);
-	else
-		printk(KERN_CONT "\n");
-}
-
-static __init int setup_disablecpuid(char *arg)
-{
-	int bit;
-	if (get_option(&arg, &bit) && bit < NCAPINTS*32)
-		setup_clear_cpu_cap(bit);
-	else
-		return 0;
-	return 1;
-}
-__setup("clearcpuid=", setup_disablecpuid);
diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h
index 4ab2ede6f4b9..a5df34fb5231 100644
--- a/include/asm-x86/processor.h
+++ b/include/asm-x86/processor.h
@@ -153,6 +153,7 @@ static inline int hlt_works(int cpu)
 
 extern void cpu_detect(struct cpuinfo_x86 *c);
 
+extern void early_cpu_init(void);
 extern void identify_cpu(struct cpuinfo_x86 *);
 extern void identify_boot_cpu(void);
 extern void identify_secondary_cpu(struct cpuinfo_x86 *);
-- 
cgit v1.2.3


From 9a250347591da3e60b5ee53dd1d341732f081117 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sat, 21 Jun 2008 03:24:00 -0700
Subject: x86: change identify_cpu to static

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/common.c    | 2 +-
 arch/x86/kernel/cpu/common_64.c | 2 +-
 include/asm-x86/processor.h     | 1 -
 3 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d0463a946247..80ab20d4fa39 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -427,7 +427,7 @@ __setup("serialnumber", x86_serial_nr_setup);
 /*
  * This does the hard work of actually picking apart the CPU stuff...
  */
-void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 {
 	int i;
 
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
index b6f205063f74..48ba79961583 100644
--- a/arch/x86/kernel/cpu/common_64.c
+++ b/arch/x86/kernel/cpu/common_64.c
@@ -312,7 +312,7 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
 /*
  * This does the hard work of actually picking apart the CPU stuff...
  */
-void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 {
 	int i;
 
diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h
index a5df34fb5231..12b5ede14c7a 100644
--- a/include/asm-x86/processor.h
+++ b/include/asm-x86/processor.h
@@ -154,7 +154,6 @@ static inline int hlt_works(int cpu)
 extern void cpu_detect(struct cpuinfo_x86 *c);
 
 extern void early_cpu_init(void);
-extern void identify_cpu(struct cpuinfo_x86 *);
 extern void identify_boot_cpu(void);
 extern void identify_secondary_cpu(struct cpuinfo_x86 *);
 extern void print_cpu_info(struct cpuinfo_x86 *);
-- 
cgit v1.2.3


From 7a1fd9866cbb59a00006f1e0fd5726951b167c97 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sat, 21 Jun 2008 14:48:05 -0700
Subject: x86: add e820_remove_range

... so could add real hole in e820

agp check is using request_mem_region, and could fail if e820 is reserved...

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 35 +++++++++++++++++++++++++++++++++++
 include/asm-x86/e820.h |  2 ++
 2 files changed, 37 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 7b613d2efb04..e285ea38c8e5 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -429,6 +429,41 @@ u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
 	return real_updated_size;
 }
 
+/* make e820 not cover the range */
+u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
+			     int checktype)
+{
+	int i;
+	u64 real_removed_size = 0;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+		u64 final_start, final_end;
+
+		if (checktype && ei->type != old_type)
+			continue;
+		/* totally covered? */
+		if (ei->addr >= start &&
+		    (ei->addr + ei->size) <= (start + size)) {
+			real_removed_size += ei->size;
+			memset(ei, 0, sizeof(struct e820entry));
+			continue;
+		}
+		/* partially covered */
+		final_start = max(start, ei->addr);
+		final_end = min(start + size, ei->addr + ei->size);
+		if (final_start >= final_end)
+			continue;
+		real_removed_size += final_end - final_start;
+
+		ei->size -= final_end - final_start;
+		if (ei->addr < final_start)
+			continue;
+		ei->addr = final_end;
+	}
+	return real_removed_size;
+}
+
 void __init update_e820(void)
 {
 	int nr_map;
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index 0e92b6a2ea00..7c32df07bae4 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -67,6 +67,8 @@ sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, int *pnr_map);
 extern int copy_e820_map(struct e820entry *biosmap, int nr_map);
 extern u64 e820_update_range(u64 start, u64 size, unsigned old_type,
 			       unsigned new_type);
+extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type,
+			     int checktype);
 extern void update_e820(void);
 extern void e820_setup_gap(void);
 struct setup_data;
-- 
cgit v1.2.3


From a9c1182fbd349882fe912245d6e03cd30943be2d Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sat, 21 Jun 2008 15:39:41 -0700
Subject: x86: seperate probe_roms into another file

it is only needed for 32bit

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/Makefile        |   1 +
 arch/x86/kernel/probe_roms_32.c | 166 ++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/setup_32.c      | 146 -----------------------------------
 include/asm-x86/setup.h         |   1 +
 4 files changed, 168 insertions(+), 146 deletions(-)
 create mode 100644 arch/x86/kernel/probe_roms_32.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 9a71be827927..9c9ce75e3ae4 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -19,6 +19,7 @@ obj-y			:= process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
 obj-y			+= traps_$(BITS).o irq_$(BITS).o
 obj-y			+= time_$(BITS).o ioport.o ldt.o
 obj-y			+= setup_$(BITS).o i8259.o irqinit_$(BITS).o setup.o
+obj-$(CONFIG_X86_32)	+= probe_roms_32.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o setup64.o
diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms_32.c
new file mode 100644
index 000000000000..675a48c404a5
--- /dev/null
+++ b/arch/x86/kernel/probe_roms_32.c
@@ -0,0 +1,166 @@
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <linux/mmzone.h>
+#include <linux/ioport.h>
+#include <linux/seq_file.h>
+#include <linux/console.h>
+#include <linux/init.h>
+#include <linux/edd.h>
+#include <linux/dmi.h>
+#include <linux/pfn.h>
+#include <linux/pci.h>
+#include <asm/pci-direct.h>
+
+
+#include <asm/e820.h>
+#include <asm/mmzone.h>
+#include <asm/setup.h>
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <setup_arch.h>
+
+static struct resource system_rom_resource = {
+	.name	= "System ROM",
+	.start	= 0xf0000,
+	.end	= 0xfffff,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource extension_rom_resource = {
+	.name	= "Extension ROM",
+	.start	= 0xe0000,
+	.end	= 0xeffff,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource adapter_rom_resources[] = { {
+	.name 	= "Adapter ROM",
+	.start	= 0xc8000,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+} };
+
+static struct resource video_rom_resource = {
+	.name 	= "Video ROM",
+	.start	= 0xc0000,
+	.end	= 0xc7fff,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+#define ROMSIGNATURE 0xaa55
+
+static int __init romsignature(const unsigned char *rom)
+{
+	const unsigned short * const ptr = (const unsigned short *)rom;
+	unsigned short sig;
+
+	return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
+}
+
+static int __init romchecksum(const unsigned char *rom, unsigned long length)
+{
+	unsigned char sum, c;
+
+	for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
+		sum += c;
+	return !length && !sum;
+}
+
+void __init probe_roms(void)
+{
+	const unsigned char *rom;
+	unsigned long start, length, upper;
+	unsigned char c;
+	int i;
+
+	/* video rom */
+	upper = adapter_rom_resources[0].start;
+	for (start = video_rom_resource.start; start < upper; start += 2048) {
+		rom = isa_bus_to_virt(start);
+		if (!romsignature(rom))
+			continue;
+
+		video_rom_resource.start = start;
+
+		if (probe_kernel_address(rom + 2, c) != 0)
+			continue;
+
+		/* 0 < length <= 0x7f * 512, historically */
+		length = c * 512;
+
+		/* if checksum okay, trust length byte */
+		if (length && romchecksum(rom, length))
+			video_rom_resource.end = start + length - 1;
+
+		request_resource(&iomem_resource, &video_rom_resource);
+		break;
+	}
+
+	start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
+	if (start < upper)
+		start = upper;
+
+	/* system rom */
+	request_resource(&iomem_resource, &system_rom_resource);
+	upper = system_rom_resource.start;
+
+	/* check for extension rom (ignore length byte!) */
+	rom = isa_bus_to_virt(extension_rom_resource.start);
+	if (romsignature(rom)) {
+		length = extension_rom_resource.end - extension_rom_resource.start + 1;
+		if (romchecksum(rom, length)) {
+			request_resource(&iomem_resource, &extension_rom_resource);
+			upper = extension_rom_resource.start;
+		}
+	}
+
+	/* check for adapter roms on 2k boundaries */
+	for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
+		rom = isa_bus_to_virt(start);
+		if (!romsignature(rom))
+			continue;
+
+		if (probe_kernel_address(rom + 2, c) != 0)
+			continue;
+
+		/* 0 < length <= 0x7f * 512, historically */
+		length = c * 512;
+
+		/* but accept any length that fits if checksum okay */
+		if (!length || start + length > upper || !romchecksum(rom, length))
+			continue;
+
+		adapter_rom_resources[i].start = start;
+		adapter_rom_resources[i].end = start + length - 1;
+		request_resource(&iomem_resource, &adapter_rom_resources[i]);
+
+		start = adapter_rom_resources[i++].end & ~2047UL;
+	}
+}
+
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index e274ee6ff582..865838b11792 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -606,8 +606,6 @@ static void set_mca_bus(int x)
 static void set_mca_bus(int x) { }
 #endif
 
-static void probe_roms(void);
-
 /*
  * Determine if we were loaded by an EFI loader.  If so, then we have also been
  * passed the efi memmap, systab, etc., so we should use these data structures
@@ -843,147 +841,3 @@ void __init setup_arch(char **cmdline_p)
 #endif
 }
 
-static struct resource system_rom_resource = {
-	.name	= "System ROM",
-	.start	= 0xf0000,
-	.end	= 0xfffff,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource extension_rom_resource = {
-	.name	= "Extension ROM",
-	.start	= 0xe0000,
-	.end	= 0xeffff,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource adapter_rom_resources[] = { {
-	.name 	= "Adapter ROM",
-	.start	= 0xc8000,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-} };
-
-static struct resource video_rom_resource = {
-	.name 	= "Video ROM",
-	.start	= 0xc0000,
-	.end	= 0xc7fff,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-#define ROMSIGNATURE 0xaa55
-
-static int __init romsignature(const unsigned char *rom)
-{
-	const unsigned short * const ptr = (const unsigned short *)rom;
-	unsigned short sig;
-
-	return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
-}
-
-static int __init romchecksum(const unsigned char *rom, unsigned long length)
-{
-	unsigned char sum, c;
-
-	for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
-		sum += c;
-	return !length && !sum;
-}
-
-static void __init probe_roms(void)
-{
-	const unsigned char *rom;
-	unsigned long start, length, upper;
-	unsigned char c;
-	int i;
-
-	/* video rom */
-	upper = adapter_rom_resources[0].start;
-	for (start = video_rom_resource.start; start < upper; start += 2048) {
-		rom = isa_bus_to_virt(start);
-		if (!romsignature(rom))
-			continue;
-
-		video_rom_resource.start = start;
-
-		if (probe_kernel_address(rom + 2, c) != 0)
-			continue;
-
-		/* 0 < length <= 0x7f * 512, historically */
-		length = c * 512;
-
-		/* if checksum okay, trust length byte */
-		if (length && romchecksum(rom, length))
-			video_rom_resource.end = start + length - 1;
-
-		request_resource(&iomem_resource, &video_rom_resource);
-		break;
-	}
-
-	start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
-	if (start < upper)
-		start = upper;
-
-	/* system rom */
-	request_resource(&iomem_resource, &system_rom_resource);
-	upper = system_rom_resource.start;
-
-	/* check for extension rom (ignore length byte!) */
-	rom = isa_bus_to_virt(extension_rom_resource.start);
-	if (romsignature(rom)) {
-		length = extension_rom_resource.end - extension_rom_resource.start + 1;
-		if (romchecksum(rom, length)) {
-			request_resource(&iomem_resource, &extension_rom_resource);
-			upper = extension_rom_resource.start;
-		}
-	}
-
-	/* check for adapter roms on 2k boundaries */
-	for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
-		rom = isa_bus_to_virt(start);
-		if (!romsignature(rom))
-			continue;
-
-		if (probe_kernel_address(rom + 2, c) != 0)
-			continue;
-
-		/* 0 < length <= 0x7f * 512, historically */
-		length = c * 512;
-
-		/* but accept any length that fits if checksum okay */
-		if (!length || start + length > upper || !romchecksum(rom, length))
-			continue;
-
-		adapter_rom_resources[i].start = start;
-		adapter_rom_resources[i].end = start + length - 1;
-		request_resource(&iomem_resource, &adapter_rom_resources[i]);
-
-		start = adapter_rom_resources[i++].end & ~2047UL;
-	}
-}
-
diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h
index b434bdd82ba7..cf87d6d3675c 100644
--- a/include/asm-x86/setup.h
+++ b/include/asm-x86/setup.h
@@ -54,6 +54,7 @@ extern struct boot_params boot_params;
 #ifdef __i386__
 
 void __init i386_start_kernel(void);
+extern void probe_roms(void);
 
 extern unsigned long init_pg_tables_start;
 extern unsigned long init_pg_tables_end;
-- 
cgit v1.2.3


From 0f0124fa742da7c51e2e3c5ded7f5e5e06ddc195 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sat, 21 Jun 2008 16:25:37 -0700
Subject: x86: merge setup64.c into common_64.c

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/Makefile        |   2 +-
 arch/x86/kernel/cpu/common_64.c | 277 +++++++++++++++++++++++++++++++++++++-
 arch/x86/kernel/setup64.c       | 287 ----------------------------------------
 3 files changed, 277 insertions(+), 289 deletions(-)
 delete mode 100644 arch/x86/kernel/setup64.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 9c9ce75e3ae4..0a1987b4acc4 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -22,7 +22,7 @@ obj-y			+= setup_$(BITS).o i8259.o irqinit_$(BITS).o setup.o
 obj-$(CONFIG_X86_32)	+= probe_roms_32.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
-obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o setup64.o
+obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
 obj-y			+= alternative.o i8253.o pci-nommu.o
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
index 48ba79961583..9fb5b7caaa89 100644
--- a/arch/x86/kernel/cpu/common_64.c
+++ b/arch/x86/kernel/cpu/common_64.c
@@ -1,10 +1,17 @@
 #include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/bootmem.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/kgdb.h>
+#include <linux/topology.h>
 #include <linux/string.h>
 #include <linux/delay.h>
 #include <linux/smp.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
-#include <linux/bootmem.h>
 #include <asm/processor.h>
 #include <asm/i387.h>
 #include <asm/msr.h>
@@ -19,6 +26,15 @@
 #include <asm/apic.h>
 #include <mach_apic.h>
 #endif
+#include <asm/pda.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+#include <asm/atomic.h>
+#include <asm/proto.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+#include <asm/genapic.h>
 
 #include "cpu.h"
 
@@ -404,3 +420,262 @@ static __init int setup_disablecpuid(char *arg)
 	return 1;
 }
 __setup("clearcpuid=", setup_disablecpuid);
+
+#ifndef CONFIG_DEBUG_BOOT_PARAMS
+struct boot_params __initdata boot_params;
+#else
+struct boot_params boot_params;
+#endif
+
+cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
+
+struct x8664_pda **_cpu_pda __read_mostly;
+EXPORT_SYMBOL(_cpu_pda);
+
+struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
+
+char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
+
+unsigned long __supported_pte_mask __read_mostly = ~0UL;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+
+static int do_not_nx __cpuinitdata;
+
+/* noexec=on|off
+Control non executable mappings for 64bit processes.
+
+on	Enable(default)
+off	Disable
+*/
+static int __init nonx_setup(char *str)
+{
+	if (!str)
+		return -EINVAL;
+	if (!strncmp(str, "on", 2)) {
+		__supported_pte_mask |= _PAGE_NX;
+		do_not_nx = 0;
+	} else if (!strncmp(str, "off", 3)) {
+		do_not_nx = 1;
+		__supported_pte_mask &= ~_PAGE_NX;
+	}
+	return 0;
+}
+early_param("noexec", nonx_setup);
+
+int force_personality32;
+
+/* noexec32=on|off
+Control non executable heap for 32bit processes.
+To control the stack too use noexec=off
+
+on	PROT_READ does not imply PROT_EXEC for 32bit processes (default)
+off	PROT_READ implies PROT_EXEC
+*/
+static int __init nonx32_setup(char *str)
+{
+	if (!strcmp(str, "on"))
+		force_personality32 &= ~READ_IMPLIES_EXEC;
+	else if (!strcmp(str, "off"))
+		force_personality32 |= READ_IMPLIES_EXEC;
+	return 1;
+}
+__setup("noexec32=", nonx32_setup);
+
+void pda_init(int cpu)
+{
+	struct x8664_pda *pda = cpu_pda(cpu);
+
+	/* Setup up data that may be needed in __get_free_pages early */
+	asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
+	/* Memory clobbers used to order PDA accessed */
+	mb();
+	wrmsrl(MSR_GS_BASE, pda);
+	mb();
+
+	pda->cpunumber = cpu;
+	pda->irqcount = -1;
+	pda->kernelstack = (unsigned long)stack_thread_info() -
+				 PDA_STACKOFFSET + THREAD_SIZE;
+	pda->active_mm = &init_mm;
+	pda->mmu_state = 0;
+
+	if (cpu == 0) {
+		/* others are initialized in smpboot.c */
+		pda->pcurrent = &init_task;
+		pda->irqstackptr = boot_cpu_stack;
+	} else {
+		pda->irqstackptr = (char *)
+			__get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
+		if (!pda->irqstackptr)
+			panic("cannot allocate irqstack for cpu %d", cpu);
+
+		if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
+			pda->nodenumber = cpu_to_node(cpu);
+	}
+
+	pda->irqstackptr += IRQSTACKSIZE-64;
+}
+
+char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
+			   DEBUG_STKSZ]
+__attribute__((section(".bss.page_aligned")));
+
+extern asmlinkage void ignore_sysret(void);
+
+/* May not be marked __init: used by software suspend */
+void syscall_init(void)
+{
+	/*
+	 * LSTAR and STAR live in a bit strange symbiosis.
+	 * They both write to the same internal register. STAR allows to
+	 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
+	 */
+	wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32);
+	wrmsrl(MSR_LSTAR, system_call);
+	wrmsrl(MSR_CSTAR, ignore_sysret);
+
+#ifdef CONFIG_IA32_EMULATION
+	syscall32_cpu_init();
+#endif
+
+	/* Flags to clear on syscall */
+	wrmsrl(MSR_SYSCALL_MASK,
+	       X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
+}
+
+void __cpuinit check_efer(void)
+{
+	unsigned long efer;
+
+	rdmsrl(MSR_EFER, efer);
+	if (!(efer & EFER_NX) || do_not_nx)
+		__supported_pte_mask &= ~_PAGE_NX;
+}
+
+unsigned long kernel_eflags;
+
+/*
+ * Copies of the original ist values from the tss are only accessed during
+ * debugging, no special alignment required.
+ */
+DEFINE_PER_CPU(struct orig_ist, orig_ist);
+
+/*
+ * cpu_init() initializes state that is per-CPU. Some data is already
+ * initialized (naturally) in the bootstrap process, such as the GDT
+ * and IDT. We reload them nevertheless, this function acts as a
+ * 'CPU state barrier', nothing should get across.
+ * A lot of state is already set up in PDA init.
+ */
+void __cpuinit cpu_init(void)
+{
+	int cpu = stack_smp_processor_id();
+	struct tss_struct *t = &per_cpu(init_tss, cpu);
+	struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
+	unsigned long v;
+	char *estacks = NULL;
+	struct task_struct *me;
+	int i;
+
+	/* CPU 0 is initialised in head64.c */
+	if (cpu != 0)
+		pda_init(cpu);
+	else
+		estacks = boot_exception_stacks;
+
+	me = current;
+
+	if (cpu_test_and_set(cpu, cpu_initialized))
+		panic("CPU#%d already initialized!\n", cpu);
+
+	printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+
+	clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+
+	/*
+	 * Initialize the per-CPU GDT with the boot GDT,
+	 * and set up the GDT descriptor:
+	 */
+
+	switch_to_new_gdt();
+	load_idt((const struct desc_ptr *)&idt_descr);
+
+	memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
+	syscall_init();
+
+	wrmsrl(MSR_FS_BASE, 0);
+	wrmsrl(MSR_KERNEL_GS_BASE, 0);
+	barrier();
+
+	check_efer();
+
+	/*
+	 * set up and load the per-CPU TSS
+	 */
+	for (v = 0; v < N_EXCEPTION_STACKS; v++) {
+		static const unsigned int order[N_EXCEPTION_STACKS] = {
+			[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
+			[DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+		};
+		if (cpu) {
+			estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
+			if (!estacks)
+				panic("Cannot allocate exception stack %ld %d\n",
+				      v, cpu);
+		}
+		estacks += PAGE_SIZE << order[v];
+		orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
+	}
+
+	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+	/*
+	 * <= is required because the CPU will access up to
+	 * 8 bits beyond the end of the IO permission bitmap.
+	 */
+	for (i = 0; i <= IO_BITMAP_LONGS; i++)
+		t->io_bitmap[i] = ~0UL;
+
+	atomic_inc(&init_mm.mm_count);
+	me->active_mm = &init_mm;
+	if (me->mm)
+		BUG();
+	enter_lazy_tlb(&init_mm, me);
+
+	load_sp0(t, &current->thread);
+	set_tss_desc(cpu, t);
+	load_TR_desc();
+	load_LDT(&init_mm.context);
+
+#ifdef CONFIG_KGDB
+	/*
+	 * If the kgdb is connected no debug regs should be altered.  This
+	 * is only applicable when KGDB and a KGDB I/O module are built
+	 * into the kernel and you are using early debugging with
+	 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
+	 */
+	if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
+		arch_kgdb_ops.correct_hw_break();
+	else {
+#endif
+	/*
+	 * Clear all 6 debug registers:
+	 */
+
+	set_debugreg(0UL, 0);
+	set_debugreg(0UL, 1);
+	set_debugreg(0UL, 2);
+	set_debugreg(0UL, 3);
+	set_debugreg(0UL, 6);
+	set_debugreg(0UL, 7);
+#ifdef CONFIG_KGDB
+	/* If the kgdb is connected no debug regs should be altered. */
+	}
+#endif
+
+	fpu_init();
+
+	raw_local_save_flags(kernel_eflags);
+
+	if (is_uv_system())
+		uv_cpu_init();
+}
diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c
deleted file mode 100644
index 151d3155ddf6..000000000000
--- a/arch/x86/kernel/setup64.c
+++ /dev/null
@@ -1,287 +0,0 @@
-/* 
- * X86-64 specific CPU setup.
- * Copyright (C) 1995  Linus Torvalds
- * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
- * See setup.c for older changelog.
- */ 
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/bootmem.h>
-#include <linux/bitops.h>
-#include <linux/module.h>
-#include <linux/kgdb.h>
-#include <linux/topology.h>
-#include <asm/pda.h>
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/desc.h>
-#include <asm/atomic.h>
-#include <asm/mmu_context.h>
-#include <asm/smp.h>
-#include <asm/i387.h>
-#include <asm/percpu.h>
-#include <asm/proto.h>
-#include <asm/sections.h>
-#include <asm/setup.h>
-#include <asm/genapic.h>
-
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
-struct boot_params __initdata boot_params;
-#else
-struct boot_params boot_params;
-#endif
-
-cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
-
-struct x8664_pda **_cpu_pda __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
-
-struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
-
-char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
-
-unsigned long __supported_pte_mask __read_mostly = ~0UL;
-EXPORT_SYMBOL_GPL(__supported_pte_mask);
-
-static int do_not_nx __cpuinitdata = 0;
-
-/* noexec=on|off
-Control non executable mappings for 64bit processes.
-
-on	Enable(default)
-off	Disable
-*/ 
-static int __init nonx_setup(char *str)
-{
-	if (!str)
-		return -EINVAL;
-	if (!strncmp(str, "on", 2)) {
-                __supported_pte_mask |= _PAGE_NX; 
- 		do_not_nx = 0; 
-	} else if (!strncmp(str, "off", 3)) {
-		do_not_nx = 1;
-		__supported_pte_mask &= ~_PAGE_NX;
-        }
-	return 0;
-} 
-early_param("noexec", nonx_setup);
-
-int force_personality32 = 0; 
-
-/* noexec32=on|off
-Control non executable heap for 32bit processes.
-To control the stack too use noexec=off
-
-on	PROT_READ does not imply PROT_EXEC for 32bit processes (default)
-off	PROT_READ implies PROT_EXEC
-*/
-static int __init nonx32_setup(char *str)
-{
-	if (!strcmp(str, "on"))
-		force_personality32 &= ~READ_IMPLIES_EXEC;
-	else if (!strcmp(str, "off"))
-		force_personality32 |= READ_IMPLIES_EXEC;
-	return 1;
-}
-__setup("noexec32=", nonx32_setup);
-
-void pda_init(int cpu)
-{ 
-	struct x8664_pda *pda = cpu_pda(cpu);
-
-	/* Setup up data that may be needed in __get_free_pages early */
-	asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); 
-	/* Memory clobbers used to order PDA accessed */
-	mb();
-	wrmsrl(MSR_GS_BASE, pda);
-	mb();
-
-	pda->cpunumber = cpu; 
-	pda->irqcount = -1;
-	pda->kernelstack = 
-		(unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; 
-	pda->active_mm = &init_mm;
-	pda->mmu_state = 0;
-
-	if (cpu == 0) {
-		/* others are initialized in smpboot.c */
-		pda->pcurrent = &init_task;
-		pda->irqstackptr = boot_cpu_stack; 
-	} else {
-		pda->irqstackptr = (char *)
-			__get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
-		if (!pda->irqstackptr)
-			panic("cannot allocate irqstack for cpu %d", cpu); 
-
-		if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
-			pda->nodenumber = cpu_to_node(cpu);
-	}
-
-	pda->irqstackptr += IRQSTACKSIZE-64;
-} 
-
-char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
-__attribute__((section(".bss.page_aligned")));
-
-extern asmlinkage void ignore_sysret(void);
-
-/* May not be marked __init: used by software suspend */
-void syscall_init(void)
-{
-	/* 
-	 * LSTAR and STAR live in a bit strange symbiosis.
-	 * They both write to the same internal register. STAR allows to set CS/DS
-	 * but only a 32bit target. LSTAR sets the 64bit rip. 	 
-	 */ 
-	wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32); 
-	wrmsrl(MSR_LSTAR, system_call); 
-	wrmsrl(MSR_CSTAR, ignore_sysret);
-
-#ifdef CONFIG_IA32_EMULATION   		
-	syscall32_cpu_init ();
-#endif
-
-	/* Flags to clear on syscall */
-	wrmsrl(MSR_SYSCALL_MASK,
-	       X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
-}
-
-void __cpuinit check_efer(void)
-{
-	unsigned long efer;
-
-	rdmsrl(MSR_EFER, efer); 
-        if (!(efer & EFER_NX) || do_not_nx) { 
-                __supported_pte_mask &= ~_PAGE_NX; 
-        }       
-}
-
-unsigned long kernel_eflags;
-
-/*
- * Copies of the original ist values from the tss are only accessed during
- * debugging, no special alignment required.
- */
-DEFINE_PER_CPU(struct orig_ist, orig_ist);
-
-/*
- * cpu_init() initializes state that is per-CPU. Some data is already
- * initialized (naturally) in the bootstrap process, such as the GDT
- * and IDT. We reload them nevertheless, this function acts as a
- * 'CPU state barrier', nothing should get across.
- * A lot of state is already set up in PDA init.
- */
-void __cpuinit cpu_init (void)
-{
-	int cpu = stack_smp_processor_id();
-	struct tss_struct *t = &per_cpu(init_tss, cpu);
-	struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
-	unsigned long v; 
-	char *estacks = NULL; 
-	struct task_struct *me;
-	int i;
-
-	/* CPU 0 is initialised in head64.c */
-	if (cpu != 0) {
-		pda_init(cpu);
-	} else 
-		estacks = boot_exception_stacks; 
-
-	me = current;
-
-	if (cpu_test_and_set(cpu, cpu_initialized))
-		panic("CPU#%d already initialized!\n", cpu);
-
-	printk("Initializing CPU#%d\n", cpu);
-
-	clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
-
-	/*
-	 * Initialize the per-CPU GDT with the boot GDT,
-	 * and set up the GDT descriptor:
-	 */
-
-	switch_to_new_gdt();
-	load_idt((const struct desc_ptr *)&idt_descr);
-
-	memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
-	syscall_init();
-
-	wrmsrl(MSR_FS_BASE, 0);
-	wrmsrl(MSR_KERNEL_GS_BASE, 0);
-	barrier(); 
-
-	check_efer();
-
-	/*
-	 * set up and load the per-CPU TSS
-	 */
-	for (v = 0; v < N_EXCEPTION_STACKS; v++) {
-		static const unsigned int order[N_EXCEPTION_STACKS] = {
-			[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
-			[DEBUG_STACK - 1] = DEBUG_STACK_ORDER
-		};
-		if (cpu) {
-			estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
-			if (!estacks)
-				panic("Cannot allocate exception stack %ld %d\n",
-				      v, cpu); 
-		}
-		estacks += PAGE_SIZE << order[v];
-		orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
-	}
-
-	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
-	/*
-	 * <= is required because the CPU will access up to
-	 * 8 bits beyond the end of the IO permission bitmap.
-	 */
-	for (i = 0; i <= IO_BITMAP_LONGS; i++)
-		t->io_bitmap[i] = ~0UL;
-
-	atomic_inc(&init_mm.mm_count);
-	me->active_mm = &init_mm;
-	if (me->mm)
-		BUG();
-	enter_lazy_tlb(&init_mm, me);
-
-	load_sp0(t, &current->thread);
-	set_tss_desc(cpu, t);
-	load_TR_desc();
-	load_LDT(&init_mm.context);
-
-#ifdef CONFIG_KGDB
-	/*
-	 * If the kgdb is connected no debug regs should be altered.  This
-	 * is only applicable when KGDB and a KGDB I/O module are built
-	 * into the kernel and you are using early debugging with
-	 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
-	 */
-	if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
-		arch_kgdb_ops.correct_hw_break();
-	else {
-#endif
-	/*
-	 * Clear all 6 debug registers:
-	 */
-
-	set_debugreg(0UL, 0);
-	set_debugreg(0UL, 1);
-	set_debugreg(0UL, 2);
-	set_debugreg(0UL, 3);
-	set_debugreg(0UL, 6);
-	set_debugreg(0UL, 7);
-#ifdef CONFIG_KGDB
-	/* If the kgdb is connected no debug regs should be altered. */
-	}
-#endif
-
-	fpu_init(); 
-
-	raw_local_save_flags(kernel_eflags);
-
-	if (is_uv_system())
-		uv_cpu_init();
-}
-- 
cgit v1.2.3


From f81be876eaa9c71b3024c3dc05e4d1bf210cc255 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sat, 21 Jun 2008 19:16:52 -0700
Subject: x86: remove two duplicated funcs in setup_32.c

early_cpu_init is declared in processor.h
memory_setup is defined in e820.c

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 865838b11792..602a45c59ff6 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -206,7 +206,6 @@ struct ist_info ist_info;
 EXPORT_SYMBOL(ist_info);
 #endif
 
-extern void early_cpu_init(void);
 extern int root_mountflags;
 
 unsigned long saved_video_mode;
-- 
cgit v1.2.3


From ce97c40e28223c148e142bda7af48fd0f27c81f9 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sat, 21 Jun 2008 20:22:09 -0700
Subject: x86: move reserve_standard_io_resource to setup.c

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c    | 32 ++++++++++++++++++++++++++
 arch/x86/kernel/setup_32.c | 57 +---------------------------------------------
 include/asm-x86/setup.h    |  2 ++
 3 files changed, 35 insertions(+), 56 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 56aee55cf8dc..3b9ec81ba4fb 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -468,4 +468,36 @@ void __init reserve_crashkernel(void)
 void __init reserve_crashkernel(void)
 {}
 #endif
+static struct resource standard_io_resources[] = {
+	{ .name = "dma1", .start = 0x00, .end = 0x1f,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "pic1", .start = 0x20, .end = 0x21,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "timer0", .start = 0x40, .end = 0x43,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "timer1", .start = 0x50, .end = 0x53,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "keyboard", .start = 0x60, .end = 0x60,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "keyboard", .start = 0x64, .end = 0x64,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "dma page reg", .start = 0x80, .end = 0x8f,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "pic2", .start = 0xa0, .end = 0xa1,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "dma2", .start = 0xc0, .end = 0xdf,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "fpu", .start = 0xf0, .end = 0xff,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO }
+};
+
+void __init reserve_standard_io_resources(void)
+{
+	int i;
+
+	/* request I/O space for devices used on all i[345]86 PCs */
+	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+		request_resource(&ioport_resource, &standard_io_resources[i]);
+
+}
 
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 602a45c59ff6..2574ec8234c7 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -108,58 +108,6 @@ static struct resource video_ram_resource = {
 	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
 };
 
-static struct resource standard_io_resources[] = { {
-	.name	= "dma1",
-	.start	= 0x0000,
-	.end	= 0x001f,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name	= "pic1",
-	.start	= 0x0020,
-	.end	= 0x0021,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name   = "timer0",
-	.start	= 0x0040,
-	.end    = 0x0043,
-	.flags  = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name   = "timer1",
-	.start  = 0x0050,
-	.end    = 0x0053,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name	= "keyboard",
-	.start	= 0x0060,
-	.end	= 0x0060,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name	= "keyboard",
-	.start	= 0x0064,
-	.end	= 0x0064,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name	= "dma page reg",
-	.start	= 0x0080,
-	.end	= 0x008f,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name	= "pic2",
-	.start	= 0x00a0,
-	.end	= 0x00a1,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name	= "dma2",
-	.start	= 0x00c0,
-	.end	= 0x00df,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name	= "fpu",
-	.start	= 0x00f0,
-	.end	= 0x00ff,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-} };
-
 /* cpu data as detected by the assembly code in head.S */
 struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
 /* common cpu data for all cpus */
@@ -614,7 +562,6 @@ static void set_mca_bus(int x) { }
  */
 void __init setup_arch(char **cmdline_p)
 {
-	int i;
 	unsigned long max_low_pfn;
 
 	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
@@ -824,9 +771,7 @@ void __init setup_arch(char **cmdline_p)
 	e820_mark_nosave_regions(max_low_pfn);
 
 	request_resource(&iomem_resource, &video_ram_resource);
-	/* request I/O space for devices used on all i[345]86 PCs */
-	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
-		request_resource(&ioport_resource, &standard_io_resources[i]);
+	reserve_standard_io_resources();
 
 	e820_setup_gap();
 
diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h
index cf87d6d3675c..bb12a1619c12 100644
--- a/include/asm-x86/setup.h
+++ b/include/asm-x86/setup.h
@@ -38,6 +38,8 @@ void reserve_crashkernel(void);
 #ifndef __ASSEMBLY__
 #include <asm/bootparam.h>
 
+void reserve_standard_io_resources(void);
+
 #ifndef _SETUP
 
 /*
-- 
cgit v1.2.3


From 17b4cceb1feb2a8865ce47064dd3bd446063a5d5 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sat, 21 Jun 2008 21:02:20 -0700
Subject: x86: move elfcorehdr parsing to setup.c

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c    | 19 +++++++++++++++++++
 arch/x86/kernel/setup_32.c | 16 ----------------
 2 files changed, 19 insertions(+), 16 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3b9ec81ba4fb..5497fb9b00a0 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -4,6 +4,7 @@
 #include <linux/bootmem.h>
 #include <linux/percpu.h>
 #include <linux/kexec.h>
+#include <linux/crash_dump.h>
 #include <asm/smp.h>
 #include <asm/percpu.h>
 #include <asm/sections.h>
@@ -501,3 +502,21 @@ void __init reserve_standard_io_resources(void)
 
 }
 
+#ifdef CONFIG_PROC_VMCORE
+/* elfcorehdr= specifies the location of elf core header
+ * stored by the crashed kernel. This option will be passed
+ * by kexec loader to the capture kernel.
+ */
+static int __init setup_elfcorehdr(char *arg)
+{
+	char *end;
+	if (!arg)
+		return -EINVAL;
+	elfcorehdr_addr = memparse(arg, &end);
+	return end > arg ? 0 : -EINVAL;
+}
+early_param("elfcorehdr", setup_elfcorehdr);
+#endif
+
+
+
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 2574ec8234c7..d24ac268523f 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -42,7 +42,6 @@
 #include <linux/iscsi_ibft.h>
 #include <linux/nodemask.h>
 #include <linux/kexec.h>
-#include <linux/crash_dump.h>
 #include <linux/dmi.h>
 #include <linux/pfn.h>
 #include <linux/pci.h>
@@ -194,21 +193,6 @@ static inline void copy_edd(void)
 }
 #endif
 
-#ifdef CONFIG_PROC_VMCORE
-/* elfcorehdr= specifies the location of elf core header
- * stored by the crashed kernel.
- */
-static int __init parse_elfcorehdr(char *arg)
-{
-	if (!arg)
-		return -EINVAL;
-
-	elfcorehdr_addr = memparse(arg, &arg);
-	return 0;
-}
-early_param("elfcorehdr", parse_elfcorehdr);
-#endif /* CONFIG_PROC_VMCORE */
-
 /*
  * highmem=size forces highmem to be exactly 'size' bytes.
  * This works even on boxes that have no highmem otherwise.
-- 
cgit v1.2.3


From b2ac82a0909aea0d2620ba4c189f37c567c21fe5 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 22 Jun 2008 02:45:39 -0700
Subject: x86: introduce initmem_init for 32 bit

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 94 +---------------------------------------------
 arch/x86/mm/discontig_32.c |  4 +-
 arch/x86/mm/init_32.c      | 90 ++++++++++++++++++++++++++++++++++++++++++++
 include/asm-x86/page_32.h  |  5 +++
 4 files changed, 99 insertions(+), 94 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index d24ac268523f..190546bd3bd3 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -300,71 +300,11 @@ unsigned long __init find_max_low_pfn(void)
 	return max_low_pfn;
 }
 
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-static void __init setup_bootmem_allocator(void);
-static unsigned long __init setup_memory(void)
-{
-	/*
-	 * partially used pages are not usable - thus
-	 * we are rounding upwards:
-	 */
-	min_low_pfn = PFN_UP(init_pg_tables_end);
-
-	max_low_pfn = find_max_low_pfn();
-
-#ifdef CONFIG_HIGHMEM
-	highstart_pfn = highend_pfn = max_pfn;
-	if (max_pfn > max_low_pfn) {
-		highstart_pfn = max_low_pfn;
-	}
-	memory_present(0, 0, highend_pfn);
-	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
-		pages_to_mb(highend_pfn - highstart_pfn));
-	num_physpages = highend_pfn;
-	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
-#else
-	memory_present(0, 0, max_low_pfn);
-	num_physpages = max_low_pfn;
-	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
-#endif
-#ifdef CONFIG_FLATMEM
-	max_mapnr = num_physpages;
-#endif
-	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
-			pages_to_mb(max_low_pfn));
-
-	setup_bootmem_allocator();
-
-	return max_low_pfn;
-}
-
-static void __init zone_sizes_init(void)
-{
-	unsigned long max_zone_pfns[MAX_NR_ZONES];
-	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-	max_zone_pfns[ZONE_DMA] =
-		virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-	remove_all_active_ranges();
-#ifdef CONFIG_HIGHMEM
-	max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
-	e820_register_active_regions(0, 0, highend_pfn);
-#else
-	e820_register_active_regions(0, 0, max_low_pfn);
-#endif
-
-	free_area_init_nodes(max_zone_pfns);
-}
-#else
-extern unsigned long __init setup_memory(void);
-extern void zone_sizes_init(void);
-#endif /* !CONFIG_NEED_MULTIPLE_NODES */
-
 #ifdef CONFIG_BLK_DEV_INITRD
 
 static bool do_relocate_initrd = false;
 
-static void __init reserve_initrd(void)
+void __init reserve_initrd(void)
 {
 	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
 	u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
@@ -480,36 +420,6 @@ static void __init relocate_initrd(void)
 
 #endif /* CONFIG_BLK_DEV_INITRD */
 
-void __init setup_bootmem_allocator(void)
-{
-	int i;
-	unsigned long bootmap_size, bootmap;
-	/*
-	 * Initialize the boot-time allocator (with low memory only):
-	 */
-	bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
-	bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
-				 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
-				 PAGE_SIZE);
-	if (bootmap == -1L)
-		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
-	reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
-#ifdef CONFIG_BLK_DEV_INITRD
-	reserve_initrd();
-#endif
-	bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn);
-	printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
-		 max_pfn_mapped<<PAGE_SHIFT);
-	printk(KERN_INFO "  low ram: %08lx - %08lx\n",
-		 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
-	printk(KERN_INFO "  bootmap %08lx - %08lx\n",
-		 bootmap, bootmap + bootmap_size);
-	for_each_online_node(i)
-		free_bootmem_with_active_regions(i, max_low_pfn);
-	early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
-
-}
-
 /*
  * The node 0 pgdat is initialized before all of these because
  * it's needed for bootmem.  node>0 pgdats have their virtual
@@ -666,7 +576,7 @@ void __init setup_arch(char **cmdline_p)
         acpi_numa_init();
 #endif
 
-	max_low_pfn = setup_memory();
+	max_low_pfn = initmem_init(0, max_pfn);
 
 #ifdef CONFIG_ACPI_SLEEP
 	/*
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index a2f73ba42b8b..3e75be46c4f2 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -309,8 +309,8 @@ static void init_remap_allocator(int nid)
 		(ulong) node_remap_end_vaddr[nid]);
 }
 
-extern void setup_bootmem_allocator(void);
-unsigned long __init setup_memory(void)
+unsigned long __init initmem_init(unsigned long start_pfn,
+				  unsigned long end_pfn)
 {
 	int nid;
 	unsigned long system_start_pfn, system_max_low_pfn;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index a0484adbf59d..9bc8607d7980 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -540,6 +540,96 @@ static void __init set_nx(void)
 }
 #endif
 
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+extern unsigned long find_max_low_pfn(void);
+unsigned long __init initmem_init(unsigned long start_pfn,
+				  unsigned long end_pfn)
+{
+	/*
+	 * partially used pages are not usable - thus
+	 * we are rounding upwards:
+	 */
+	min_low_pfn = PFN_UP(init_pg_tables_end);
+
+	max_low_pfn = find_max_low_pfn();
+
+#ifdef CONFIG_HIGHMEM
+	highstart_pfn = highend_pfn = max_pfn;
+	if (max_pfn > max_low_pfn)
+		highstart_pfn = max_low_pfn;
+	memory_present(0, 0, highend_pfn);
+	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
+		pages_to_mb(highend_pfn - highstart_pfn));
+	num_physpages = highend_pfn;
+	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
+#else
+	memory_present(0, 0, max_low_pfn);
+	num_physpages = max_low_pfn;
+	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
+#endif
+#ifdef CONFIG_FLATMEM
+	max_mapnr = num_physpages;
+#endif
+	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
+			pages_to_mb(max_low_pfn));
+
+	setup_bootmem_allocator();
+
+	return max_low_pfn;
+}
+
+void __init zone_sizes_init(void)
+{
+	unsigned long max_zone_pfns[MAX_NR_ZONES];
+	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+	max_zone_pfns[ZONE_DMA] =
+		virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+	remove_all_active_ranges();
+#ifdef CONFIG_HIGHMEM
+	max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
+	e820_register_active_regions(0, 0, highend_pfn);
+#else
+	e820_register_active_regions(0, 0, max_low_pfn);
+#endif
+
+	free_area_init_nodes(max_zone_pfns);
+}
+#endif /* !CONFIG_NEED_MULTIPLE_NODES */
+
+extern void reserve_initrd(void);
+
+void __init setup_bootmem_allocator(void)
+{
+	int i;
+	unsigned long bootmap_size, bootmap;
+	/*
+	 * Initialize the boot-time allocator (with low memory only):
+	 */
+	bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
+	bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+				 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
+				 PAGE_SIZE);
+	if (bootmap == -1L)
+		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
+	reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
+#ifdef CONFIG_BLK_DEV_INITRD
+	reserve_initrd();
+#endif
+	bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn);
+	printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
+		 max_pfn_mapped<<PAGE_SHIFT);
+	printk(KERN_INFO "  low ram: %08lx - %08lx\n",
+		 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
+	printk(KERN_INFO "  bootmap %08lx - %08lx\n",
+		 bootmap, bootmap + bootmap_size);
+	for_each_online_node(i)
+		free_bootmem_with_active_regions(i, max_low_pfn);
+	early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
+
+}
+
+
 /*
  * paging_init() sets up the page tables - note that the first 8MB are
  * already mapped by head.S.
diff --git a/include/asm-x86/page_32.h b/include/asm-x86/page_32.h
index 73ed2e4ebf95..a0713d175f45 100644
--- a/include/asm-x86/page_32.h
+++ b/include/asm-x86/page_32.h
@@ -92,6 +92,11 @@ extern int sysctl_legacy_va_layout;
 #define VMALLOC_RESERVE		((unsigned long)__VMALLOC_RESERVE)
 #define MAXMEM			(-__PAGE_OFFSET - __VMALLOC_RESERVE)
 
+extern unsigned long initmem_init(unsigned long, unsigned long);
+extern void zone_sizes_init(void);
+extern void setup_bootmem_allocator(void);
+
+
 #ifdef CONFIG_X86_USE_3DNOW
 #include <asm/mmx.h>
 
-- 
cgit v1.2.3


From 225c37d71bc8b97eb2063e8eda153b383328b20b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 22 Jun 2008 02:46:58 -0700
Subject: x86: introduce reserve_initrd

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 16 ++++++++++------
 arch/x86/mm/init_32.c      |  6 ++----
 include/asm-x86/setup.h    |  2 ++
 3 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 190546bd3bd3..90b51047ce63 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -336,7 +336,7 @@ void __init reserve_initrd(void)
 		 * in i386_start_kernel
 		 */
 		initrd_start = ramdisk_image + PAGE_OFFSET;
-		initrd_end = initrd_start+ramdisk_size;
+		initrd_end = initrd_start + ramdisk_size;
 		return;
 	}
 
@@ -363,7 +363,7 @@ void __init reserve_initrd(void)
 
 #define MAX_MAP_CHUNK	(NR_FIX_BTMAPS << PAGE_SHIFT)
 
-static void __init relocate_initrd(void)
+static void __init post_reserve_initrd(void)
 {
 	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
 	u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
@@ -417,7 +417,13 @@ static void __init relocate_initrd(void)
 	/* need to free that, otherwise init highmem will reserve it again */
 	free_early(ramdisk_image, ramdisk_image+ramdisk_size);
 }
-
+#else
+void __init reserve_initrd(void)
+{
+}
+static void __init post_reserve_initrd(void)
+{
+}
 #endif /* CONFIG_BLK_DEV_INITRD */
 
 /*
@@ -632,9 +638,7 @@ void __init setup_arch(char **cmdline_p)
 	 * NOTE: at this point the bootmem allocator is fully available.
 	 */
 
-#ifdef CONFIG_BLK_DEV_INITRD
-	relocate_initrd();
-#endif
+	post_reserve_initrd();
 
 	remapped_pgdat_init();
 	sparse_init();
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 9bc8607d7980..98080782ee47 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -597,8 +597,6 @@ void __init zone_sizes_init(void)
 }
 #endif /* !CONFIG_NEED_MULTIPLE_NODES */
 
-extern void reserve_initrd(void);
-
 void __init setup_bootmem_allocator(void)
 {
 	int i;
@@ -613,9 +611,9 @@ void __init setup_bootmem_allocator(void)
 	if (bootmap == -1L)
 		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
 	reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
-#ifdef CONFIG_BLK_DEV_INITRD
+
 	reserve_initrd();
-#endif
+
 	bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn);
 	printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
 		 max_pfn_mapped<<PAGE_SHIFT);
diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h
index bb12a1619c12..8f85b2450562 100644
--- a/include/asm-x86/setup.h
+++ b/include/asm-x86/setup.h
@@ -39,6 +39,8 @@ void reserve_crashkernel(void);
 #include <asm/bootparam.h>
 
 void reserve_standard_io_resources(void);
+void reserve_initrd(void);
+
 
 #ifndef _SETUP
 
-- 
cgit v1.2.3


From 7f0be02c5ed1deb04c54c6a17f412e04f417df11 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 22 Jun 2008 17:37:54 -0700
Subject: x86: move boot_params declaring to setup.c

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/common_64.c | 6 ------
 arch/x86/kernel/setup.c         | 6 ++++++
 arch/x86/kernel/setup_32.c      | 6 ------
 3 files changed, 6 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
index 9fb5b7caaa89..39eaefbcec06 100644
--- a/arch/x86/kernel/cpu/common_64.c
+++ b/arch/x86/kernel/cpu/common_64.c
@@ -421,12 +421,6 @@ static __init int setup_disablecpuid(char *arg)
 }
 __setup("clearcpuid=", setup_disablecpuid);
 
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
-struct boot_params __initdata boot_params;
-#else
-struct boot_params boot_params;
-#endif
-
 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
 
 struct x8664_pda **_cpu_pda __read_mostly;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 5497fb9b00a0..5c0c4bb5727d 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -15,6 +15,12 @@
 #include <asm/apicdef.h>
 #include <asm/highmem.h>
 
+#ifndef CONFIG_DEBUG_BOOT_PARAMS
+struct boot_params __initdata boot_params;
+#else
+struct boot_params boot_params;
+#endif
+
 #ifdef CONFIG_X86_LOCAL_APIC
 unsigned int num_processors;
 unsigned disabled_cpus __cpuinitdata;
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 90b51047ce63..3149f434d7d0 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -163,12 +163,6 @@ unsigned long saved_video_mode;
 
 static char __initdata command_line[COMMAND_LINE_SIZE];
 
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
-struct boot_params __initdata boot_params;
-#else
-struct boot_params boot_params;
-#endif
-
 #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
 struct edd edd;
 #ifdef CONFIG_EDD_MODULE
-- 
cgit v1.2.3


From 90d967e0ef68f5312ed4b081d5c9312ff53c1c93 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Mon, 23 Jun 2008 21:00:45 +0200
Subject: x86: move find_max_low_pfn to init_32.c

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 77 ---------------------------------------
 arch/x86/mm/init_32.c      | 89 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 88 insertions(+), 78 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 3149f434d7d0..13155009ce94 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -129,9 +129,6 @@ unsigned int BIOS_revision;
 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
 int bootloader_type;
 
-/* user-defined highmem size */
-static unsigned int highmem_pages = -1;
-
 /*
  * Early DMI memory
  */
@@ -187,21 +184,6 @@ static inline void copy_edd(void)
 }
 #endif
 
-/*
- * highmem=size forces highmem to be exactly 'size' bytes.
- * This works even on boxes that have no highmem otherwise.
- * This also works to reduce highmem size on bigger boxes.
- */
-static int __init parse_highmem(char *arg)
-{
-	if (!arg)
-		return -EINVAL;
-
-	highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
-	return 0;
-}
-early_param("highmem", parse_highmem);
-
 /*
  * vmalloc=size forces the vmalloc area to be exactly 'size'
  * bytes. This can be used to increase (or decrease) the
@@ -235,65 +217,6 @@ static int __init parse_reservetop(char *arg)
 }
 early_param("reservetop", parse_reservetop);
 
-/*
- * Determine low and high memory ranges:
- */
-unsigned long __init find_max_low_pfn(void)
-{
-	unsigned long max_low_pfn;
-
-	max_low_pfn = max_pfn;
-	if (max_low_pfn > MAXMEM_PFN) {
-		if (highmem_pages == -1)
-			highmem_pages = max_pfn - MAXMEM_PFN;
-		if (highmem_pages + MAXMEM_PFN < max_pfn)
-			max_pfn = MAXMEM_PFN + highmem_pages;
-		if (highmem_pages + MAXMEM_PFN > max_pfn) {
-			printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
-			highmem_pages = 0;
-		}
-		max_low_pfn = MAXMEM_PFN;
-#ifndef CONFIG_HIGHMEM
-		/* Maximum memory usable is what is directly addressable */
-		printk(KERN_WARNING "Warning only %ldMB will be used.\n",
-					MAXMEM>>20);
-		if (max_pfn > MAX_NONPAE_PFN)
-			printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
-		else
-			printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
-		max_pfn = MAXMEM_PFN;
-#else /* !CONFIG_HIGHMEM */
-#ifndef CONFIG_HIGHMEM64G
-		if (max_pfn > MAX_NONPAE_PFN) {
-			max_pfn = MAX_NONPAE_PFN;
-			printk(KERN_WARNING "Warning only 4GB will be used.\n");
-			printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
-		}
-#endif /* !CONFIG_HIGHMEM64G */
-#endif /* !CONFIG_HIGHMEM */
-	} else {
-		if (highmem_pages == -1)
-			highmem_pages = 0;
-#ifdef CONFIG_HIGHMEM
-		if (highmem_pages >= max_pfn) {
-			printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
-			highmem_pages = 0;
-		}
-		if (highmem_pages) {
-			if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
-				printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
-				highmem_pages = 0;
-			}
-			max_low_pfn -= highmem_pages;
-		}
-#else
-		if (highmem_pages)
-			printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
-#endif
-	}
-	return max_low_pfn;
-}
-
 #ifdef CONFIG_BLK_DEV_INITRD
 
 static bool do_relocate_initrd = false;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 98080782ee47..d1017336f1b5 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -540,8 +540,95 @@ static void __init set_nx(void)
 }
 #endif
 
+/* user-defined highmem size */
+static unsigned int highmem_pages = -1;
+
+/*
+ * highmem=size forces highmem to be exactly 'size' bytes.
+ * This works even on boxes that have no highmem otherwise.
+ * This also works to reduce highmem size on bigger boxes.
+ */
+static int __init parse_highmem(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+
+	highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
+	return 0;
+}
+early_param("highmem", parse_highmem);
+
+/*
+ * Determine low and high memory ranges:
+ */
+unsigned long __init find_max_low_pfn(void)
+{
+	unsigned long max_low_pfn;
+
+	max_low_pfn = max_pfn;
+	if (max_low_pfn > MAXMEM_PFN) {
+		if (highmem_pages == -1)
+			highmem_pages = max_pfn - MAXMEM_PFN;
+		if (highmem_pages + MAXMEM_PFN < max_pfn)
+			max_pfn = MAXMEM_PFN + highmem_pages;
+		if (highmem_pages + MAXMEM_PFN > max_pfn) {
+			printk(KERN_WARNING "only %luMB highmem pages "
+				"available, ignoring highmem size of %uMB.\n",
+				pages_to_mb(max_pfn - MAXMEM_PFN),
+				pages_to_mb(highmem_pages));
+			highmem_pages = 0;
+		}
+		max_low_pfn = MAXMEM_PFN;
+#ifndef CONFIG_HIGHMEM
+		/* Maximum memory usable is what is directly addressable */
+		printk(KERN_WARNING "Warning only %ldMB will be used.\n",
+					MAXMEM>>20);
+		if (max_pfn > MAX_NONPAE_PFN)
+			printk(KERN_WARNING
+				 "Use a HIGHMEM64G enabled kernel.\n");
+		else
+			printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
+		max_pfn = MAXMEM_PFN;
+#else /* !CONFIG_HIGHMEM */
+#ifndef CONFIG_HIGHMEM64G
+		if (max_pfn > MAX_NONPAE_PFN) {
+			max_pfn = MAX_NONPAE_PFN;
+			printk(KERN_WARNING "Warning only 4GB will be used."
+				"Use a HIGHMEM64G enabled kernel.\n");
+		}
+#endif /* !CONFIG_HIGHMEM64G */
+#endif /* !CONFIG_HIGHMEM */
+	} else {
+		if (highmem_pages == -1)
+			highmem_pages = 0;
+#ifdef CONFIG_HIGHMEM
+		if (highmem_pages >= max_pfn) {
+			printk(KERN_ERR "highmem size specified (%uMB) is "
+				"bigger than pages available (%luMB)!.\n",
+				pages_to_mb(highmem_pages),
+				pages_to_mb(max_pfn));
+			highmem_pages = 0;
+		}
+		if (highmem_pages) {
+			if (max_low_pfn - highmem_pages <
+			    64*1024*1024/PAGE_SIZE){
+				printk(KERN_ERR "highmem size %uMB results in "
+				"smaller than 64MB lowmem, ignoring it.\n"
+					, pages_to_mb(highmem_pages));
+				highmem_pages = 0;
+			}
+			max_low_pfn -= highmem_pages;
+		}
+#else
+		if (highmem_pages)
+			printk(KERN_ERR "ignoring highmem size on non-highmem"
+					" kernel!\n");
+#endif
+	}
+	return max_low_pfn;
+}
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-extern unsigned long find_max_low_pfn(void);
 unsigned long __init initmem_init(unsigned long start_pfn,
 				  unsigned long end_pfn)
 {
-- 
cgit v1.2.3


From bef1568d9714f1162086c32583ba7984a7ca8e3e Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 22 Jun 2008 17:40:10 -0700
Subject: x86: move reservetop and vmalloc parsing to pgtable_32.c

also change reserve_top_address to __init attibute

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 33 ---------------------------------
 arch/x86/mm/pgtable_32.c   | 35 ++++++++++++++++++++++++++++++++++-
 2 files changed, 34 insertions(+), 34 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 13155009ce94..9a08490a3889 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -184,39 +184,6 @@ static inline void copy_edd(void)
 }
 #endif
 
-/*
- * vmalloc=size forces the vmalloc area to be exactly 'size'
- * bytes. This can be used to increase (or decrease) the
- * vmalloc area - the default is 128m.
- */
-static int __init parse_vmalloc(char *arg)
-{
-	if (!arg)
-		return -EINVAL;
-
-	__VMALLOC_RESERVE = memparse(arg, &arg);
-	return 0;
-}
-early_param("vmalloc", parse_vmalloc);
-
-/*
- * reservetop=size reserves a hole at the top of the kernel address space which
- * a hypervisor can load into later.  Needed for dynamically loaded hypervisors,
- * so relocating the fixmap can be done before paging initialization.
- */
-static int __init parse_reservetop(char *arg)
-{
-	unsigned long address;
-
-	if (!arg)
-		return -EINVAL;
-
-	address = memparse(arg, &arg);
-	reserve_top_address(address);
-	return 0;
-}
-early_param("reservetop", parse_reservetop);
-
 #ifdef CONFIG_BLK_DEV_INITRD
 
 static bool do_relocate_initrd = false;
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 0662f345212f..828907d001e8 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -152,7 +152,7 @@ EXPORT_SYMBOL(__FIXADDR_TOP);
  * Can be used to relocate the fixmap area and poke a hole in the top
  * of kernel address space to make room for a hypervisor.
  */
-void reserve_top_address(unsigned long reserve)
+void __init reserve_top_address(unsigned long reserve)
 {
 	BUG_ON(fixmaps_set > 0);
 	printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
@@ -160,3 +160,36 @@ void reserve_top_address(unsigned long reserve)
 	__FIXADDR_TOP = -reserve - PAGE_SIZE;
 	__VMALLOC_RESERVE += reserve;
 }
+
+/*
+ * vmalloc=size forces the vmalloc area to be exactly 'size'
+ * bytes. This can be used to increase (or decrease) the
+ * vmalloc area - the default is 128m.
+ */
+static int __init parse_vmalloc(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+
+	__VMALLOC_RESERVE = memparse(arg, &arg);
+	return 0;
+}
+early_param("vmalloc", parse_vmalloc);
+
+/*
+ * reservetop=size reserves a hole at the top of the kernel address space which
+ * a hypervisor can load into later.  Needed for dynamically loaded hypervisors,
+ * so relocating the fixmap can be done before paging initialization.
+ */
+static int __init parse_reservetop(char *arg)
+{
+	unsigned long address;
+
+	if (!arg)
+		return -EINVAL;
+
+	address = memparse(arg, &arg);
+	reserve_top_address(address);
+	return 0;
+}
+early_param("reservetop", parse_reservetop);
-- 
cgit v1.2.3


From 2ec65f8b89ea003c27ff7723525a2ee335a2b393 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Mon, 23 Jun 2008 03:05:30 -0700
Subject: x86: clean up using max_low_pfn on 32-bit

so that max_low_pfn is not changed after it is set.
so we can move that early and out of initmem_init.

could call find_low_pfn_range just after max_pfn is set.

also could move reserve_initrd out of setup_bootmem_allocator

so 32bit is more like 64bit.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 16 ++++++++++------
 arch/x86/mm/discontig_32.c | 22 +++++++---------------
 arch/x86/mm/init_32.c      | 25 +++++++++----------------
 include/asm-x86/page_32.h  |  3 ++-
 include/asm-x86/setup.h    |  2 --
 5 files changed, 28 insertions(+), 40 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 9a08490a3889..b42f570a5a56 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -188,13 +188,14 @@ static inline void copy_edd(void)
 
 static bool do_relocate_initrd = false;
 
-void __init reserve_initrd(void)
+static void __init reserve_initrd(void)
 {
 	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
 	u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
 	u64 ramdisk_end   = ramdisk_image + ramdisk_size;
 	u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
 	u64 ramdisk_here;
+	u64 ramdisk_target;
 
 	if (!boot_params.hdr.type_of_loader ||
 	    !ramdisk_image || !ramdisk_size)
@@ -202,7 +203,7 @@ void __init reserve_initrd(void)
 
 	initrd_start = 0;
 
-	if (ramdisk_size >= end_of_lowmem/2) {
+	if (ramdisk_size >= (end_of_lowmem>>1)) {
 		free_early(ramdisk_image, ramdisk_end);
 		printk(KERN_ERR "initrd too large to handle, "
 		       "disabling initrd\n");
@@ -225,7 +226,8 @@ void __init reserve_initrd(void)
 	}
 
 	/* We need to move the initrd down into lowmem */
-	ramdisk_here = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+	ramdisk_target = max_pfn_mapped<<PAGE_SHIFT;
+	ramdisk_here = find_e820_area(min(ramdisk_target, end_of_lowmem>>1),
 				 end_of_lowmem, ramdisk_size,
 				 PAGE_SIZE);
 
@@ -346,8 +348,6 @@ static void set_mca_bus(int x) { }
  */
 void __init setup_arch(char **cmdline_p)
 {
-	unsigned long max_low_pfn;
-
 	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
 	pre_setup_arch_hook();
 	early_cpu_init();
@@ -450,6 +450,10 @@ void __init setup_arch(char **cmdline_p)
 		max_pfn = e820_end_of_ram();
 	}
 
+	find_low_pfn_range();
+
+	reserve_initrd();
+
 	dmi_scan_machine();
 
 	io_delay_init();
@@ -466,7 +470,7 @@ void __init setup_arch(char **cmdline_p)
         acpi_numa_init();
 #endif
 
-	max_low_pfn = initmem_init(0, max_pfn);
+	initmem_init(0, max_pfn);
 
 #ifdef CONFIG_ACPI_SLEEP
 	/*
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 3e75be46c4f2..1dfff700264c 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -309,11 +309,10 @@ static void init_remap_allocator(int nid)
 		(ulong) node_remap_end_vaddr[nid]);
 }
 
-unsigned long __init initmem_init(unsigned long start_pfn,
+void __init initmem_init(unsigned long start_pfn,
 				  unsigned long end_pfn)
 {
 	int nid;
-	unsigned long system_start_pfn, system_max_low_pfn;
 	long kva_target_pfn;
 
 	/*
@@ -324,17 +323,11 @@ unsigned long __init initmem_init(unsigned long start_pfn,
 	 * and ZONE_HIGHMEM.
 	 */
 
-	/* call find_max_low_pfn at first, it could update max_pfn */
-	system_max_low_pfn = max_low_pfn = find_max_low_pfn();
-
 	remove_all_active_ranges();
 	get_memcfg_numa();
 
 	kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE);
 
-	/* partially used pages are not usable - thus round upwards */
-	system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
-
 	kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
 	do {
 		kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
@@ -357,19 +350,19 @@ unsigned long __init initmem_init(unsigned long start_pfn,
 		     "KVA PG");
 #ifdef CONFIG_HIGHMEM
 	highstart_pfn = highend_pfn = max_pfn;
-	if (max_pfn > system_max_low_pfn)
-		highstart_pfn = system_max_low_pfn;
+	if (max_pfn > max_low_pfn)
+		highstart_pfn = max_low_pfn;
 	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
 	       pages_to_mb(highend_pfn - highstart_pfn));
 	num_physpages = highend_pfn;
 	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
 #else
-	num_physpages = system_max_low_pfn;
-	high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1;
+	num_physpages = max_low_pfn;
+	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
 #endif
 	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
-			pages_to_mb(system_max_low_pfn));
-	printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", 
+			pages_to_mb(max_low_pfn));
+	printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n",
 			min_low_pfn, max_low_pfn, highstart_pfn);
 
 	printk("Low memory ends at vaddr %08lx\n",
@@ -387,7 +380,6 @@ unsigned long __init initmem_init(unsigned long start_pfn,
 	memset(NODE_DATA(0), 0, sizeof(struct pglist_data));
 	NODE_DATA(0)->bdata = &node0_bdata;
 	setup_bootmem_allocator();
-	return max_low_pfn;
 }
 
 void __init zone_sizes_init(void)
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index d1017336f1b5..27b829312944 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -561,9 +561,15 @@ early_param("highmem", parse_highmem);
 /*
  * Determine low and high memory ranges:
  */
-unsigned long __init find_max_low_pfn(void)
+void __init find_low_pfn_range(void)
 {
-	unsigned long max_low_pfn;
+	/* it could update max_pfn */
+
+	/*
+	 * partially used pages are not usable - thus
+	 * we are rounding upwards:
+	 */
+	min_low_pfn = PFN_UP(init_pg_tables_end);
 
 	max_low_pfn = max_pfn;
 	if (max_low_pfn > MAXMEM_PFN) {
@@ -625,21 +631,12 @@ unsigned long __init find_max_low_pfn(void)
 					" kernel!\n");
 #endif
 	}
-	return max_low_pfn;
 }
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-unsigned long __init initmem_init(unsigned long start_pfn,
+void __init initmem_init(unsigned long start_pfn,
 				  unsigned long end_pfn)
 {
-	/*
-	 * partially used pages are not usable - thus
-	 * we are rounding upwards:
-	 */
-	min_low_pfn = PFN_UP(init_pg_tables_end);
-
-	max_low_pfn = find_max_low_pfn();
-
 #ifdef CONFIG_HIGHMEM
 	highstart_pfn = highend_pfn = max_pfn;
 	if (max_pfn > max_low_pfn)
@@ -661,8 +658,6 @@ unsigned long __init initmem_init(unsigned long start_pfn,
 			pages_to_mb(max_low_pfn));
 
 	setup_bootmem_allocator();
-
-	return max_low_pfn;
 }
 
 void __init zone_sizes_init(void)
@@ -699,8 +694,6 @@ void __init setup_bootmem_allocator(void)
 		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
 	reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
 
-	reserve_initrd();
-
 	bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn);
 	printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
 		 max_pfn_mapped<<PAGE_SHIFT);
diff --git a/include/asm-x86/page_32.h b/include/asm-x86/page_32.h
index a0713d175f45..3810d14051e8 100644
--- a/include/asm-x86/page_32.h
+++ b/include/asm-x86/page_32.h
@@ -92,7 +92,8 @@ extern int sysctl_legacy_va_layout;
 #define VMALLOC_RESERVE		((unsigned long)__VMALLOC_RESERVE)
 #define MAXMEM			(-__PAGE_OFFSET - __VMALLOC_RESERVE)
 
-extern unsigned long initmem_init(unsigned long, unsigned long);
+extern void find_low_pfn_range(void);
+extern void initmem_init(unsigned long, unsigned long);
 extern void zone_sizes_init(void);
 extern void setup_bootmem_allocator(void);
 
diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h
index 8f85b2450562..bb12a1619c12 100644
--- a/include/asm-x86/setup.h
+++ b/include/asm-x86/setup.h
@@ -39,8 +39,6 @@ void reserve_crashkernel(void);
 #include <asm/bootparam.h>
 
 void reserve_standard_io_resources(void);
-void reserve_initrd(void);
-
 
 #ifndef _SETUP
 
-- 
cgit v1.2.3


From 3eb11edc1321e14a121deeb8b18c177750226eca Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 23 Jun 2008 22:19:22 +0200
Subject: x86: build fix

fix:

arch/x86/kernel/setup_32.c:409: error: 'enable_local_apic' undeclared (first use in this function)
arch/x86/kernel/setup_32.c:409: error: (Each undeclared identifier is reported only once
arch/x86/kernel/setup_32.c:409: error: for each function it appears in.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index b42f570a5a56..1e670372c191 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -406,7 +406,9 @@ void __init setup_arch(char **cmdline_p)
 	parse_early_param();
 
 	if (acpi_mps_check()){
+#ifdef CONFIG_X86_LOCAL_APIC
 		enable_local_apic = -1;
+#endif
 		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
 	}
 
-- 
cgit v1.2.3


From c09434571d4b1d8abf530ba4ce28cb868b45f2e5 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Mon, 23 Jun 2008 16:41:30 -0700
Subject: x86: numa32 pfn print out using hex instead

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/srat_32.c  | 31 +++++++++++++++++++------------
 arch/x86/mm/discontig_32.c | 29 +++++++++++++++--------------
 2 files changed, 34 insertions(+), 26 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c
index 5978023b799b..f41d67f8f831 100644
--- a/arch/x86/kernel/srat_32.c
+++ b/arch/x86/kernel/srat_32.c
@@ -93,7 +93,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
 
 	apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
 
-	printk("CPU 0x%02X in proximity domain 0x%02X\n",
+	printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
 		cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
 }
 
@@ -134,7 +134,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
 
 
 	if (num_memory_chunks >= MAXCHUNKS) {
-		printk("Too many mem chunks in SRAT. Ignoring %lld MBytes at %llx\n",
+		printk(KERN_WARNING "Too many mem chunks in SRAT."
+			" Ignoring %lld MBytes at %llx\n",
 			size/(1024*1024), paddr);
 		return;
 	}
@@ -155,7 +156,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
 
 	num_memory_chunks++;
 
-	printk("Memory range 0x%lX to 0x%lX (type 0x%X) in proximity domain 0x%02X %s\n",
+	printk(KERN_DEBUG "Memory range %08lx to %08lx (type %x)"
+			  " in proximity domain %02x %s\n",
 		start_pfn, end_pfn,
 		memory_affinity->memory_type,
 		pxm,
@@ -186,7 +188,7 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c
 	 * *possible* memory hotplug areas the same as normal RAM.
 	 */
 	if (memory_chunk->start_pfn >= max_pfn) {
-		printk (KERN_INFO "Ignoring SRAT pfns: 0x%08lx -> %08lx\n",
+		printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
 			memory_chunk->start_pfn, memory_chunk->end_pfn);
 		return;
 	}
@@ -212,7 +214,8 @@ int __init get_memcfg_from_srat(void)
 		goto out_fail;
 
 	if (num_memory_chunks == 0) {
-		printk("could not finy any ACPI SRAT memory areas.\n");
+		printk(KERN_WARNING
+			 "could not finy any ACPI SRAT memory areas.\n");
 		goto out_fail;
 	}
 
@@ -239,20 +242,23 @@ int __init get_memcfg_from_srat(void)
 	for (i = 0; i < num_memory_chunks; i++)
 		node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
 
-	printk("pxm bitmap: ");
+	printk(KERN_DEBUG "pxm bitmap: ");
 	for (i = 0; i < sizeof(pxm_bitmap); i++) {
-		printk("%02X ", pxm_bitmap[i]);
+		printk(KERN_CONT "%02x ", pxm_bitmap[i]);
 	}
-	printk("\n");
-	printk("Number of logical nodes in system = %d\n", num_online_nodes());
-	printk("Number of memory chunks in system = %d\n", num_memory_chunks);
+	printk(KERN_CONT "\n");
+	printk(KERN_DEBUG "Number of logical nodes in system = %d\n",
+			 num_online_nodes());
+	printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
+			 num_memory_chunks);
 
 	for (i = 0; i < MAX_APICID; i++)
 		apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
 
 	for (j = 0; j < num_memory_chunks; j++){
 		struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
-		printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
+		printk(KERN_DEBUG
+			"chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
 		       j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
 		node_read_chunk(chunk->nid, chunk);
 		e820_register_active_regions(chunk->nid, chunk->start_pfn,
@@ -268,6 +274,7 @@ int __init get_memcfg_from_srat(void)
 	}
 	return 1;
 out_fail:
-	printk("failed to get NUMA memory information from SRAT table\n");
+	printk(KERN_ERR "failed to get NUMA memory information from SRAT"
+			" table\n");
 	return 0;
 }
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 1dfff700264c..f5ae31935ca8 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -76,13 +76,13 @@ void memory_present(int nid, unsigned long start, unsigned long end)
 {
 	unsigned long pfn;
 
-	printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n",
+	printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n",
 			nid, start, end);
 	printk(KERN_DEBUG "  Setting physnode_map array to node %d for pfns:\n", nid);
 	printk(KERN_DEBUG "  ");
 	for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
 		physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
-		printk(KERN_CONT "%ld ", pfn);
+		printk(KERN_CONT "%lx ", pfn);
 	}
 	printk(KERN_CONT "\n");
 }
@@ -117,7 +117,7 @@ static unsigned long kva_pages;
  */
 int __init get_memcfg_numa_flat(void)
 {
-	printk("NUMA - single node, flat memory mode\n");
+	printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");
 
 	node_start_pfn[0] = 0;
 	node_end_pfn[0] = max_pfn;
@@ -233,7 +233,7 @@ static unsigned long calculate_numa_remap_pages(void)
 		 * The acpi/srat node info can show hot-add memroy zones
 		 * where memory could be added but not currently present.
 		 */
-		printk("node %d pfn: [%lx - %lx]\n",
+		printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
 			nid, node_start_pfn[nid], node_end_pfn[nid]);
 		if (node_start_pfn[nid] > max_pfn)
 			continue;
@@ -268,7 +268,8 @@ static unsigned long calculate_numa_remap_pages(void)
 		node_remap_size[nid] = size;
 		node_remap_offset[nid] = reserve_pages;
 		reserve_pages += size;
-		printk("Reserving %ld pages of KVA for lmem_map of node %d at %llx\n",
+		printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
+				  " node %d at %llx\n",
 				size, nid, node_kva_final>>PAGE_SHIFT);
 
 		/*
@@ -290,7 +291,7 @@ static unsigned long calculate_numa_remap_pages(void)
 		remove_active_range(nid, node_remap_start_pfn[nid],
 					 node_remap_start_pfn[nid] + size);
 	}
-	printk("Reserving total of %ld pages for numa KVA remap\n",
+	printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
 			reserve_pages);
 	return reserve_pages;
 }
@@ -304,7 +305,7 @@ static void init_remap_allocator(int nid)
 	node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
 		ALIGN(sizeof(pg_data_t), PAGE_SIZE);
 
-	printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
+	printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
 		(ulong) node_remap_start_vaddr[nid],
 		(ulong) node_remap_end_vaddr[nid]);
 }
@@ -340,9 +341,9 @@ void __init initmem_init(unsigned long start_pfn,
 	if (kva_start_pfn == -1UL)
 		panic("Can not get kva space\n");
 
-	printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
+	printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
 		kva_start_pfn, max_low_pfn);
-	printk("max_pfn = %ld\n", max_pfn);
+	printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
 
 	/* avoid clash with initrd */
 	reserve_early(kva_start_pfn<<PAGE_SHIFT,
@@ -362,17 +363,17 @@ void __init initmem_init(unsigned long start_pfn,
 #endif
 	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
 			pages_to_mb(max_low_pfn));
-	printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n",
-			min_low_pfn, max_low_pfn, highstart_pfn);
+	printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n",
+			max_low_pfn, highstart_pfn);
 
-	printk("Low memory ends at vaddr %08lx\n",
+	printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
 			(ulong) pfn_to_kaddr(max_low_pfn));
 	for_each_online_node(nid) {
 		init_remap_allocator(nid);
 
 		allocate_pgdat(nid);
 	}
-	printk("High memory starts at vaddr %08lx\n",
+	printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
 			(ulong) pfn_to_kaddr(highstart_pfn));
 	for_each_online_node(nid)
 		propagate_e820_map_node(nid);
@@ -413,7 +414,7 @@ void __init set_highmem_pages_init(void)
 		zone_end_pfn = zone_start_pfn + zone->spanned_pages;
 
 		nid = zone_to_nid(zone);
-		printk("Initializing %s for node %d (%08lx:%08lx)\n",
+		printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
 				zone->name, nid, zone_start_pfn, zone_end_pfn);
 
 		add_highpages_with_active_regions(nid, zone_start_pfn,
-- 
cgit v1.2.3


From 11cd0bc140b5d66566c9eb49c1058737888cd75c Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Mon, 23 Jun 2008 19:51:10 -0700
Subject: x86: move some func calling from setup_arch to paging_init

those function depend on paging setup pgtable, so they could access
the ram in bootmem region but just get mapped.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 34 ++--------------------------------
 arch/x86/mm/init_32.c      | 29 +++++++++++++++++++++++++++++
 include/asm-x86/setup.h    |  1 +
 3 files changed, 32 insertions(+), 32 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 1e670372c191..220d92faf0a9 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -249,7 +249,7 @@ static void __init reserve_initrd(void)
 
 #define MAX_MAP_CHUNK	(NR_FIX_BTMAPS << PAGE_SHIFT)
 
-static void __init post_reserve_initrd(void)
+void __init post_reserve_initrd(void)
 {
 	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
 	u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
@@ -307,29 +307,11 @@ static void __init post_reserve_initrd(void)
 void __init reserve_initrd(void)
 {
 }
-static void __init post_reserve_initrd(void)
+void __init post_reserve_initrd(void)
 {
 }
 #endif /* CONFIG_BLK_DEV_INITRD */
 
-/*
- * The node 0 pgdat is initialized before all of these because
- * it's needed for bootmem.  node>0 pgdats have their virtual
- * space allocated before the pagetables are in place to access
- * them, so they can't be cleared then.
- *
- * This should all compile down to nothing when NUMA is off.
- */
-static void __init remapped_pgdat_init(void)
-{
-	int nid;
-
-	for_each_online_node(nid) {
-		if (nid != 0)
-			memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
-	}
-}
-
 #ifdef CONFIG_MCA
 static void set_mca_bus(int x)
 {
@@ -524,18 +506,6 @@ void __init setup_arch(char **cmdline_p)
 		init_ohci1394_dma_on_all_controllers();
 #endif
 
-	/*
-	 * NOTE: at this point the bootmem allocator is fully available.
-	 */
-
-	post_reserve_initrd();
-
-	remapped_pgdat_init();
-	sparse_init();
-	zone_sizes_init();
-
-	paravirt_post_allocator_init();
-
 #ifdef CONFIG_X86_GENERICARCH
 	generic_apic_probe();
 #endif
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 9bb35cf8bc4c..20ca29591abe 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -705,6 +705,23 @@ void __init setup_bootmem_allocator(void)
 
 }
 
+/*
+ * The node 0 pgdat is initialized before all of these because
+ * it's needed for bootmem.  node>0 pgdats have their virtual
+ * space allocated before the pagetables are in place to access
+ * them, so they can't be cleared then.
+ *
+ * This should all compile down to nothing when NUMA is off.
+ */
+static void __init remapped_pgdat_init(void)
+{
+	int nid;
+
+	for_each_online_node(nid) {
+		if (nid != 0)
+			memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+	}
+}
 
 /*
  * paging_init() sets up the page tables - note that the first 8MB are
@@ -727,6 +744,18 @@ void __init paging_init(void)
 	__flush_tlb_all();
 
 	kmap_init();
+
+	/*
+	 * NOTE: at this point the bootmem allocator is fully available.
+	 */
+
+	post_reserve_initrd();
+
+	remapped_pgdat_init();
+	sparse_init();
+	zone_sizes_init();
+
+	paravirt_post_allocator_init();
 }
 
 /*
diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h
index bb12a1619c12..4ebb4ef14c06 100644
--- a/include/asm-x86/setup.h
+++ b/include/asm-x86/setup.h
@@ -39,6 +39,7 @@ void reserve_crashkernel(void);
 #include <asm/bootparam.h>
 
 void reserve_standard_io_resources(void);
+extern void post_reserve_initrd(void);
 
 #ifndef _SETUP
 
-- 
cgit v1.2.3


From 7465252ea0121c9cd28be68dfb86293a7554a111 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Mon, 23 Jun 2008 19:53:33 -0700
Subject: x86: setup_arch 32bit move efi check later

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 220d92faf0a9..52f4e01bb655 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -336,14 +336,6 @@ void __init setup_arch(char **cmdline_p)
 	early_ioremap_init();
 	reserve_setup_data();
 
-#ifdef CONFIG_EFI
-	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
-		     "EL32", 4)) {
-		efi_enabled = 1;
-		efi_reserve_early();
-	}
-#endif
-
 	ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
 	screen_info = boot_params.screen_info;
 	edid_info = boot_params.edid_info;
@@ -363,10 +355,17 @@ void __init setup_arch(char **cmdline_p)
 	rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
 	rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
 #endif
+#ifdef CONFIG_EFI
+	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+		     "EL32", 4)) {
+		efi_enabled = 1;
+		efi_reserve_early();
+	}
+#endif
+
 	ARCH_SETUP
 
 	setup_memory_map();
-
 	copy_edd();
 
 	if (!boot_params.hdr.root_flags)
-- 
cgit v1.2.3


From 9a2e59302668b9ac2fb2a2c9bca1fc793c5d0409 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Mon, 23 Jun 2008 19:54:23 -0700
Subject: x86: setup_arch 32bit move command line copying early

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 52f4e01bb655..243b2f7ca137 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -382,6 +382,9 @@ void __init setup_arch(char **cmdline_p)
 	bss_resource.start = virt_to_phys(&__bss_start);
 	bss_resource.end = virt_to_phys(&__bss_stop)-1;
 
+	strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+	*cmdline_p = command_line;
+
 	parse_setup_data();
 
 	parse_early_param();
@@ -402,9 +405,6 @@ void __init setup_arch(char **cmdline_p)
 	insert_resource(&iomem_resource, &data_resource);
 	insert_resource(&iomem_resource, &bss_resource);
 
-	strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
-	*cmdline_p = command_line;
-
 	if (efi_enabled)
 		efi_init();
 
-- 
cgit v1.2.3


From 295deae401fc5b6f215e876d93b40f25cb968c88 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Mon, 23 Jun 2008 19:55:05 -0700
Subject: x86: setup_arch 32bit move kvm_guest_init later

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 243b2f7ca137..bba8d57bd7d8 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -482,8 +482,6 @@ void __init setup_arch(char **cmdline_p)
 	 */
 	vmi_init();
 #endif
-	kvm_guest_init();
-
 	/*
 	 * NOTE: before this point _nobody_ is allowed to allocate
 	 * any memory using the bootmem allocator.  Although the
@@ -511,9 +509,15 @@ void __init setup_arch(char **cmdline_p)
 
 	early_quirks();
 
+	/*
+	 * Read APIC and some other early information from ACPI tables.
+	 */
 	acpi_boot_init();
 
 #if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
+	/*
+	 * get boot-time SMP configuration:
+	 */
 	if (smp_found_config)
 		get_smp_config();
 #endif
@@ -523,6 +527,7 @@ void __init setup_arch(char **cmdline_p)
 			"CONFIG_X86_PC cannot handle it.\nUse "
 			"CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
 #endif
+	kvm_guest_init();
 
 	e820_reserve_resources();
 	e820_mark_nosave_regions(max_low_pfn);
-- 
cgit v1.2.3


From 157fabf09594ab064b7ae92c81942af4b94663cb Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 22 Jun 2008 07:21:57 -0700
Subject: x86 boot: e820 code indentation fix

Fix indentation.  An earlier code merge got the
indentation of four lines of code off by a tab.

Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: "Yinghai Lu" <yhlu.kernel@gmail.com>
Cc: "Jack Steiner" <steiner@sgi.com>
Cc: "Mike Travis" <travis@sgi.com>
Cc: "Huang
Cc: Ying" <ying.huang@intel.com>
Cc: "Andi Kleen" <andi@firstfloor.org>
Cc: "Andrew Morton" <akpm@linux-foundation.org>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index e285ea38c8e5..8a8afbdeb34e 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -207,10 +207,10 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
 		struct e820entry *pbios; /* pointer to original bios entry */
 		unsigned long long addr; /* address for this change point */
 	};
-static struct change_member change_point_list[2*E820_X_MAX] __initdata;
-static struct change_member *change_point[2*E820_X_MAX] __initdata;
-static struct e820entry *overlap_list[E820_X_MAX] __initdata;
-static struct e820entry new_bios[E820_X_MAX] __initdata;
+	static struct change_member change_point_list[2*E820_X_MAX] __initdata;
+	static struct change_member *change_point[2*E820_X_MAX] __initdata;
+	static struct e820entry *overlap_list[E820_X_MAX] __initdata;
+	static struct e820entry new_bios[E820_X_MAX] __initdata;
 	struct change_member *change_tmp;
 	unsigned long current_type, last_type;
 	unsigned long long last_addr;
-- 
cgit v1.2.3


From 05486fa7e631a3be31a0bbc5a575a389a1609e94 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 22 Jun 2008 07:22:02 -0700
Subject: x86 boot: x86_64 efi compiler warning fix

Fix a compiler warning.  Rather than always casting a u32 to a pointer
(which generates a warning on x86_64 systems) instead separate the
x86_32 and x86_64 assignments entirely with ifdefs.  Until other recent
changes to this code, it used to have x86_64 separated like this.

Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: "Yinghai Lu" <yhlu.kernel@gmail.com>
Cc: "Jack Steiner" <steiner@sgi.com>
Cc: "Mike Travis" <travis@sgi.com>
Cc: "Huang
Cc: Ying" <ying.huang@intel.com>
Cc: "Andi Kleen" <andi@firstfloor.org>
Cc: "Andrew Morton" <akpm@linux-foundation.org>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/efi.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 473c89fe5073..a03ca36a302c 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -242,9 +242,11 @@ void __init efi_reserve_early(void)
 {
 	unsigned long pmap;
 
+#ifdef CONFIG_X86_32
 	pmap = boot_params.efi_info.efi_memmap;
-#ifdef CONFIG_X86_64
-	pmap += (__u64)boot_params.efi_info.efi_memmap_hi << 32;
+#else
+	pmap = (boot_params.efi_info.efi_memmap |
+		((__u64)boot_params.efi_info.efi_memmap_hi<<32));
 #endif
 	memmap.phys_map = (void *)pmap;
 	memmap.nr_map = boot_params.efi_info.efi_memmap_size /
@@ -284,10 +286,12 @@ void __init efi_init(void)
 	int i = 0;
 	void *tmp;
 
+#ifdef CONFIG_X86_32
 	efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab;
-#ifdef CONFIG_X86_64
-	efi_phys.systab = (void *)efi_phys.systab +
-		((__u64)boot_params.efi_info.efi_systab_hi<<32);
+#else
+	efi_phys.systab = (efi_system_table_t *)
+		(boot_params.efi_info.efi_systab |
+		 ((__u64)boot_params.efi_info.efi_systab_hi<<32));
 #endif
 
 	efi.systab = early_ioremap((unsigned long)efi_phys.systab,
-- 
cgit v1.2.3


From c4ba1320b7075e9ce33ad0afaef43ba13260b4c2 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 22 Jun 2008 07:22:07 -0700
Subject: x86 boot: allow overlapping early reserve memory ranges

Add support for overlapping early memory reservations.

In general, they still can't overlap, and will panic
with "Overlapping early reservations" if they do overlap.

But if a memory range is reserved with the new call:
    reserve_early_overlap_ok()
rather than with the usual call:
    reserve_early()
then subsequent early reservations are allowed to overlap.

This new reserve_early_overlap_ok() call is only used in one
place so far, which is the "BIOS reserved" reservation for the
the EBDA region, which out of Paranoia reserves more than what
the BIOS might have specified, and which thus might overlap with
another legitimate early memory reservation (such as, perhaps,
the EFI memmap.)

Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: "Yinghai Lu" <yhlu.kernel@gmail.com>
Cc: "Jack Steiner" <steiner@sgi.com>
Cc: "Mike Travis" <travis@sgi.com>
Cc: "Huang
Cc: Ying" <ying.huang@intel.com>
Cc: "Andi Kleen" <andi@firstfloor.org>
Cc: "Andrew Morton" <akpm@linux-foundation.org>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 140 +++++++++++++++++++++++++++++++++++++++++++++----
 arch/x86/kernel/head.c |   2 +-
 include/asm-x86/e820.h |   1 +
 3 files changed, 133 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 8a8afbdeb34e..600c9de237a0 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -604,6 +604,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
 struct early_res {
 	u64 start, end;
 	char name[16];
+	char overlap_ok;
 };
 static struct early_res early_res[MAX_EARLY_RES] __initdata = {
 	{ 0, PAGE_SIZE, "BIOS data page" },	/* BIOS data page */
@@ -640,7 +641,93 @@ static int __init find_overlapped_early(u64 start, u64 end)
 	return i;
 }
 
-void __init reserve_early(u64 start, u64 end, char *name)
+/*
+ * Drop the i-th range from the early reservation map,
+ * by copying any higher ranges down one over it, and
+ * clearing what had been the last slot.
+ */
+static void __init drop_range(int i)
+{
+	int j;
+
+	for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
+		;
+
+	memmove(&early_res[i], &early_res[i + 1],
+	       (j - 1 - i) * sizeof(struct early_res));
+
+	early_res[j - 1].end = 0;
+}
+
+/*
+ * Split any existing ranges that:
+ *  1) are marked 'overlap_ok', and
+ *  2) overlap with the stated range [start, end)
+ * into whatever portion (if any) of the existing range is entirely
+ * below or entirely above the stated range.  Drop the portion
+ * of the existing range that overlaps with the stated range,
+ * which will allow the caller of this routine to then add that
+ * stated range without conflicting with any existing range.
+ */
+static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
+{
+	int i;
+	struct early_res *r;
+	u64 lower_start, lower_end;
+	u64 upper_start, upper_end;
+	char name[16];
+
+	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+		r = &early_res[i];
+
+		/* Continue past non-overlapping ranges */
+		if (end <= r->start || start >= r->end)
+			continue;
+
+		/*
+		 * Leave non-ok overlaps as is; let caller
+		 * panic "Overlapping early reservations"
+		 * when it hits this overlap.
+		 */
+		if (!r->overlap_ok)
+			return;
+
+		/*
+		 * We have an ok overlap.  We will drop it from the early
+		 * reservation map, and add back in any non-overlapping
+		 * portions (lower or upper) as separate, overlap_ok,
+		 * non-overlapping ranges.
+		 */
+
+		/* 1. Note any non-overlapping (lower or upper) ranges. */
+		strncpy(name, r->name, sizeof(name) - 1);
+
+		lower_start = lower_end = 0;
+		upper_start = upper_end = 0;
+		if (r->start < start) {
+		 	lower_start = r->start;
+			lower_end = start;
+		}
+		if (r->end > end) {
+			upper_start = end;
+			upper_end = r->end;
+		}
+
+		/* 2. Drop the original ok overlapping range */
+		drop_range(i);
+
+		i--;		/* resume for-loop on copied down entry */
+
+		/* 3. Add back in any non-overlapping ranges. */
+		if (lower_end)
+			reserve_early_overlap_ok(lower_start, lower_end, name);
+		if (upper_end)
+			reserve_early_overlap_ok(upper_start, upper_end, name);
+	}
+}
+
+static void __init __reserve_early(u64 start, u64 end, char *name,
+						int overlap_ok)
 {
 	int i;
 	struct early_res *r;
@@ -656,14 +743,55 @@ void __init reserve_early(u64 start, u64 end, char *name)
 		      r->end - 1, r->name);
 	r->start = start;
 	r->end = end;
+	r->overlap_ok = overlap_ok;
 	if (name)
 		strncpy(r->name, name, sizeof(r->name) - 1);
 }
 
+/*
+ * A few early reservtations come here.
+ *
+ * The 'overlap_ok' in the name of this routine does -not- mean it
+ * is ok for these reservations to overlap an earlier reservation.
+ * Rather it means that it is ok for subsequent reservations to
+ * overlap this one.
+ *
+ * Use this entry point to reserve early ranges when you are doing
+ * so out of "Paranoia", reserving perhaps more memory than you need,
+ * just in case, and don't mind a subsequent overlapping reservation
+ * that is known to be needed.
+ *
+ * The drop_overlaps_that_are_ok() call here isn't really needed.
+ * It would be needed if we had two colliding 'overlap_ok'
+ * reservations, so that the second such would not panic on the
+ * overlap with the first.  We don't have any such as of this
+ * writing, but might as well tolerate such if it happens in
+ * the future.
+ */
+void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
+{
+	drop_overlaps_that_are_ok(start, end);
+	__reserve_early(start, end, name, 1);
+}
+
+/*
+ * Most early reservations come here.
+ *
+ * We first have drop_overlaps_that_are_ok() drop any pre-existing
+ * 'overlap_ok' ranges, so that we can then reserve this memory
+ * range without risk of panic'ing on an overlapping overlap_ok
+ * early reservation.
+ */
+void __init reserve_early(u64 start, u64 end, char *name)
+{
+	drop_overlaps_that_are_ok(start, end);
+	__reserve_early(start, end, name, 0);
+}
+
 void __init free_early(u64 start, u64 end)
 {
 	struct early_res *r;
-	int i, j;
+	int i;
 
 	i = find_overlapped_early(start, end);
 	r = &early_res[i];
@@ -671,13 +799,7 @@ void __init free_early(u64 start, u64 end)
 		panic("free_early on not reserved area: %llx-%llx!",
 			 start, end - 1);
 
-	for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
-		;
-
-	memmove(&early_res[i], &early_res[i + 1],
-	       (j - 1 - i) * sizeof(struct early_res));
-
-	early_res[j - 1].end = 0;
+	drop_range(i);
 }
 
 void __init early_res_to_bootmem(u64 start, u64 end)
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index a727c0b9819c..a6816be01cd6 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -51,7 +51,7 @@ void __init reserve_ebda_region(void)
 		lowmem = 0x9f000;
 
 	/* reserve all memory between lowmem and the 1MB mark */
-	reserve_early(lowmem, 0x100000, "BIOS reserved");
+	reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved");
 }
 
 void __init reserve_setup_data(void)
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index 7c32df07bae4..13fa5a076aa2 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -88,6 +88,7 @@ extern unsigned long end_user_pfn;
 extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
 extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
 extern void reserve_early(u64 start, u64 end, char *name);
+extern void reserve_early_overlap_ok(u64 start, u64 end, char *name);
 extern void free_early(u64 start, u64 end);
 extern void early_res_to_bootmem(u64 start, u64 end);
 extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
-- 
cgit v1.2.3


From e2fc252e0ce695b4c4abe27bb073c35bd0d73252 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 22 Jun 2008 07:22:12 -0700
Subject: x86 boot: show pfn addresses in hex not decimal in some kernel info
 printks

Page frame numbers (the portion of physical addresses above the low
order page offsets) are displayed in several kernel debug and info
prints in decimal, not hex.  Decimal addresse are unreadable.  Use hex.

Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: "Yinghai Lu" <yhlu.kernel@gmail.com>
Cc: "Jack Steiner" <steiner@sgi.com>
Cc: "Mike Travis" <travis@sgi.com>
Cc: "Huang
Cc: Ying" <ying.huang@intel.com>
Cc: "Andi Kleen" <andi@firstfloor.org>
Cc: "Andrew Morton" <akpm@linux-foundation.org>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 2 +-
 mm/page_alloc.c        | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 600c9de237a0..512f779fc6af 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -990,7 +990,7 @@ unsigned long __init e820_end_of_ram(void)
 	if (last_pfn > end_user_pfn)
 		last_pfn = end_user_pfn;
 
-	printk(KERN_INFO "last_pfn = %lu max_arch_pfn = %lu\n",
+	printk(KERN_INFO "last_pfn = 0x%lx max_arch_pfn = 0x%lx\n",
 			 last_pfn, max_arch_pfn);
 	return last_pfn;
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e25b6b24f844..c09c2c8d2c6a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3520,7 +3520,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
 {
 	int i;
 
-	printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
+	printk(KERN_DEBUG "Entering add_active_range(%d, 0x%lx, 0x%lx) "
 			  "%d entries of %d used\n",
 			  nid, start_pfn, end_pfn,
 			  nr_nodemap_entries, MAX_ACTIVE_REGIONS);
@@ -3936,7 +3936,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 	for (i = 0; i < MAX_NR_ZONES; i++) {
 		if (i == ZONE_MOVABLE)
 			continue;
-		printk("  %-8s %8lu -> %8lu\n",
+		printk("  %-8s 0x%8lx -> 0x%8lx\n",
 				zone_names[i],
 				arch_zone_lowest_possible_pfn[i],
 				arch_zone_highest_possible_pfn[i]);
@@ -3952,7 +3952,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 	/* Print out the early_node_map[] */
 	printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
 	for (i = 0; i < nr_nodemap_entries; i++)
-		printk("  %3d: %8lu -> %8lu\n", early_node_map[i].nid,
+		printk("  %3d: 0x%8lx -> 0x%8lx\n", early_node_map[i].nid,
 						early_node_map[i].start_pfn,
 						early_node_map[i].end_pfn);
 
-- 
cgit v1.2.3


From 47a486cc110fe77518c79a566b50a5c785c813ae Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Tue, 24 Jun 2008 22:52:03 +0200
Subject: x86: perfctr-watchdog.c - coding style cleanup

Just some code beautification. Nothing else.

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: macro@linux-mips.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perfctr-watchdog.c | 202 ++++++++++++++++++---------------
 1 file changed, 112 insertions(+), 90 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index ddda4b64f545..2e9bef6e3aa3 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -1,11 +1,15 @@
-/* local apic based NMI watchdog for various CPUs.
-   This file also handles reservation of performance counters for coordination
-   with other users (like oprofile).
-
-   Note that these events normally don't tick when the CPU idles. This means
-   the frequency varies with CPU load.
-
-   Original code for K7/P6 written by Keith Owens */
+/*
+ * local apic based NMI watchdog for various CPUs.
+ *
+ * This file also handles reservation of performance counters for coordination
+ * with other users (like oprofile).
+ *
+ * Note that these events normally don't tick when the CPU idles. This means
+ * the frequency varies with CPU load.
+ *
+ * Original code for K7/P6 written by Keith Owens
+ *
+ */
 
 #include <linux/percpu.h>
 #include <linux/module.h>
@@ -36,12 +40,16 @@ struct wd_ops {
 
 static const struct wd_ops *wd_ops;
 
-/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
- * offset from MSR_P4_BSU_ESCR0.  It will be the max for all platforms (for now)
+/*
+ * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
+ * offset from MSR_P4_BSU_ESCR0.
+ *
+ * It will be the max for all platforms (for now)
  */
 #define NMI_MAX_COUNTER_BITS 66
 
-/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
+/*
+ * perfctr_nmi_owner tracks the ownership of the perfctr registers:
  * evtsel_nmi_owner tracks the ownership of the event selection
  * - different performance counters/ event selection may be reserved for
  *   different subsystems this reservation system just tries to coordinate
@@ -73,8 +81,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
 	return 0;
 }
 
-/* converts an msr to an appropriate reservation bit */
-/* returns the bit offset of the event selection register */
+/*
+ * converts an msr to an appropriate reservation bit
+ * returns the bit offset of the event selection register
+ */
 static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
 {
 	/* returns the bit offset of the event selection register */
@@ -114,6 +124,7 @@ int avail_to_resrv_perfctr_nmi(unsigned int msr)
 
 	return (!test_bit(counter, perfctr_nmi_owner));
 }
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
 
 int reserve_perfctr_nmi(unsigned int msr)
 {
@@ -128,6 +139,7 @@ int reserve_perfctr_nmi(unsigned int msr)
 		return 1;
 	return 0;
 }
+EXPORT_SYMBOL(reserve_perfctr_nmi);
 
 void release_perfctr_nmi(unsigned int msr)
 {
@@ -140,6 +152,7 @@ void release_perfctr_nmi(unsigned int msr)
 
 	clear_bit(counter, perfctr_nmi_owner);
 }
+EXPORT_SYMBOL(release_perfctr_nmi);
 
 int reserve_evntsel_nmi(unsigned int msr)
 {
@@ -154,6 +167,7 @@ int reserve_evntsel_nmi(unsigned int msr)
 		return 1;
 	return 0;
 }
+EXPORT_SYMBOL(reserve_evntsel_nmi);
 
 void release_evntsel_nmi(unsigned int msr)
 {
@@ -166,11 +180,6 @@ void release_evntsel_nmi(unsigned int msr)
 
 	clear_bit(counter, evntsel_nmi_owner);
 }
-
-EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
-EXPORT_SYMBOL(reserve_perfctr_nmi);
-EXPORT_SYMBOL(release_perfctr_nmi);
-EXPORT_SYMBOL(reserve_evntsel_nmi);
 EXPORT_SYMBOL(release_evntsel_nmi);
 
 void disable_lapic_nmi_watchdog(void)
@@ -234,8 +243,8 @@ static unsigned int adjust_for_32bit_ctr(unsigned int hz)
 	return retval;
 }
 
-static void
-write_watchdog_counter(unsigned int perfctr_msr, const char *descr, unsigned nmi_hz)
+static void write_watchdog_counter(unsigned int perfctr_msr,
+				const char *descr, unsigned nmi_hz)
 {
 	u64 count = (u64)cpu_khz * 1000;
 
@@ -246,7 +255,7 @@ write_watchdog_counter(unsigned int perfctr_msr, const char *descr, unsigned nmi
 }
 
 static void write_watchdog_counter32(unsigned int perfctr_msr,
-		const char *descr, unsigned nmi_hz)
+				const char *descr, unsigned nmi_hz)
 {
 	u64 count = (u64)cpu_khz * 1000;
 
@@ -256,9 +265,10 @@ static void write_watchdog_counter32(unsigned int perfctr_msr,
 	wrmsr(perfctr_msr, (u32)(-count), 0);
 }
 
-/* AMD K7/K8/Family10h/Family11h support. AMD keeps this interface
-   nicely stable so there is not much variety */
-
+/*
+ * AMD K7/K8/Family10h/Family11h support.
+ * AMD keeps this interface nicely stable so there is not much variety
+ */
 #define K7_EVNTSEL_ENABLE	(1 << 22)
 #define K7_EVNTSEL_INT		(1 << 20)
 #define K7_EVNTSEL_OS		(1 << 17)
@@ -291,7 +301,7 @@ static int setup_k7_watchdog(unsigned nmi_hz)
 
 	wd->perfctr_msr = perfctr_msr;
 	wd->evntsel_msr = evntsel_msr;
-	wd->cccr_msr = 0;  //unused
+	wd->cccr_msr = 0;  /* unused */
 	return 1;
 }
 
@@ -327,18 +337,19 @@ static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
 }
 
 static const struct wd_ops k7_wd_ops = {
-	.reserve = single_msr_reserve,
-	.unreserve = single_msr_unreserve,
-	.setup = setup_k7_watchdog,
-	.rearm = single_msr_rearm,
-	.stop = single_msr_stop_watchdog,
-	.perfctr = MSR_K7_PERFCTR0,
-	.evntsel = MSR_K7_EVNTSEL0,
-	.checkbit = 1ULL<<47,
+	.reserve	= single_msr_reserve,
+	.unreserve	= single_msr_unreserve,
+	.setup		= setup_k7_watchdog,
+	.rearm		= single_msr_rearm,
+	.stop		= single_msr_stop_watchdog,
+	.perfctr	= MSR_K7_PERFCTR0,
+	.evntsel	= MSR_K7_EVNTSEL0,
+	.checkbit	= 1ULL << 47,
 };
 
-/* Intel Model 6 (PPro+,P2,P3,P-M,Core1) */
-
+/*
+ * Intel Model 6 (PPro+,P2,P3,P-M,Core1)
+ */
 #define P6_EVNTSEL0_ENABLE	(1 << 22)
 #define P6_EVNTSEL_INT		(1 << 20)
 #define P6_EVNTSEL_OS		(1 << 17)
@@ -374,52 +385,58 @@ static int setup_p6_watchdog(unsigned nmi_hz)
 
 	wd->perfctr_msr = perfctr_msr;
 	wd->evntsel_msr = evntsel_msr;
-	wd->cccr_msr = 0;  //unused
+	wd->cccr_msr = 0;  /* unused */
 	return 1;
 }
 
 static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
 {
-	/* P6 based Pentium M need to re-unmask
+	/*
+	 * P6 based Pentium M need to re-unmask
 	 * the apic vector but it doesn't hurt
 	 * other P6 variant.
-	 * ArchPerfom/Core Duo also needs this */
+	 * ArchPerfom/Core Duo also needs this
+	 */
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
+
 	/* P6/ARCH_PERFMON has 32 bit counter write */
 	write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz);
 }
 
 static const struct wd_ops p6_wd_ops = {
-	.reserve = single_msr_reserve,
-	.unreserve = single_msr_unreserve,
-	.setup = setup_p6_watchdog,
-	.rearm = p6_rearm,
-	.stop = single_msr_stop_watchdog,
-	.perfctr = MSR_P6_PERFCTR0,
-	.evntsel = MSR_P6_EVNTSEL0,
-	.checkbit = 1ULL<<39,
+	.reserve	= single_msr_reserve,
+	.unreserve	= single_msr_unreserve,
+	.setup		= setup_p6_watchdog,
+	.rearm		= p6_rearm,
+	.stop		= single_msr_stop_watchdog,
+	.perfctr	= MSR_P6_PERFCTR0,
+	.evntsel	= MSR_P6_EVNTSEL0,
+	.checkbit	= 1ULL << 39,
 };
 
-/* Intel P4 performance counters. By far the most complicated of all. */
-
-#define MSR_P4_MISC_ENABLE_PERF_AVAIL	(1<<7)
-#define P4_ESCR_EVENT_SELECT(N)	((N)<<25)
-#define P4_ESCR_OS		(1<<3)
-#define P4_ESCR_USR		(1<<2)
-#define P4_CCCR_OVF_PMI0	(1<<26)
-#define P4_CCCR_OVF_PMI1	(1<<27)
-#define P4_CCCR_THRESHOLD(N)	((N)<<20)
-#define P4_CCCR_COMPLEMENT	(1<<19)
-#define P4_CCCR_COMPARE		(1<<18)
-#define P4_CCCR_REQUIRED	(3<<16)
-#define P4_CCCR_ESCR_SELECT(N)	((N)<<13)
-#define P4_CCCR_ENABLE		(1<<12)
-#define P4_CCCR_OVF 		(1<<31)
-
-/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
-   CRU_ESCR0 (with any non-null event selector) through a complemented
-   max threshold. [IA32-Vol3, Section 14.9.9] */
+/*
+ * Intel P4 performance counters.
+ * By far the most complicated of all.
+ */
+#define MSR_P4_MISC_ENABLE_PERF_AVAIL	(1 << 7)
+#define P4_ESCR_EVENT_SELECT(N)	((N) << 25)
+#define P4_ESCR_OS		(1 << 3)
+#define P4_ESCR_USR		(1 << 2)
+#define P4_CCCR_OVF_PMI0	(1 << 26)
+#define P4_CCCR_OVF_PMI1	(1 << 27)
+#define P4_CCCR_THRESHOLD(N)	((N) << 20)
+#define P4_CCCR_COMPLEMENT	(1 << 19)
+#define P4_CCCR_COMPARE		(1 << 18)
+#define P4_CCCR_REQUIRED	(3 << 16)
+#define P4_CCCR_ESCR_SELECT(N)	((N) << 13)
+#define P4_CCCR_ENABLE		(1 << 12)
+#define P4_CCCR_OVF 		(1 << 31)
 
+/*
+ * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
+ * CRU_ESCR0 (with any non-null event selector) through a complemented
+ * max threshold. [IA32-Vol3, Section 14.9.9]
+ */
 static int setup_p4_watchdog(unsigned nmi_hz)
 {
 	unsigned int perfctr_msr, evntsel_msr, cccr_msr;
@@ -444,7 +461,8 @@ static int setup_p4_watchdog(unsigned nmi_hz)
 #endif
 		ht_num = 0;
 
-	/* performance counters are shared resources
+	/*
+	 * performance counters are shared resources
 	 * assign each hyperthread its own set
 	 * (re-use the ESCR0 register, seems safe
 	 * and keeps the cccr_val the same)
@@ -542,20 +560,21 @@ static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
 }
 
 static const struct wd_ops p4_wd_ops = {
-	.reserve = p4_reserve,
-	.unreserve = p4_unreserve,
-	.setup = setup_p4_watchdog,
-	.rearm = p4_rearm,
-	.stop = stop_p4_watchdog,
+	.reserve	= p4_reserve,
+	.unreserve	= p4_unreserve,
+	.setup		= setup_p4_watchdog,
+	.rearm		= p4_rearm,
+	.stop		= stop_p4_watchdog,
 	/* RED-PEN this is wrong for the other sibling */
-	.perfctr = MSR_P4_BPU_PERFCTR0,
-	.evntsel = MSR_P4_BSU_ESCR0,
-	.checkbit = 1ULL<<39,
+	.perfctr	= MSR_P4_BPU_PERFCTR0,
+	.evntsel	= MSR_P4_BSU_ESCR0,
+	.checkbit	= 1ULL << 39,
 };
 
-/* Watchdog using the Intel architected PerfMon. Used for Core2 and hopefully
-   all future Intel CPUs. */
-
+/*
+ * Watchdog using the Intel architected PerfMon.
+ * Used for Core2 and hopefully all future Intel CPUs.
+ */
 #define ARCH_PERFMON_NMI_EVENT_SEL	ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
 #define ARCH_PERFMON_NMI_EVENT_UMASK	ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
 
@@ -601,19 +620,19 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
 
 	wd->perfctr_msr = perfctr_msr;
 	wd->evntsel_msr = evntsel_msr;
-	wd->cccr_msr = 0;  //unused
+	wd->cccr_msr = 0;  /* unused */
 	intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
 	return 1;
 }
 
 static struct wd_ops intel_arch_wd_ops __read_mostly = {
-	.reserve = single_msr_reserve,
-	.unreserve = single_msr_unreserve,
-	.setup = setup_intel_arch_watchdog,
-	.rearm = p6_rearm,
-	.stop = single_msr_stop_watchdog,
-	.perfctr = MSR_ARCH_PERFMON_PERFCTR1,
-	.evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
+	.reserve	= single_msr_reserve,
+	.unreserve	= single_msr_unreserve,
+	.setup		= setup_intel_arch_watchdog,
+	.rearm		= p6_rearm,
+	.stop		= single_msr_stop_watchdog,
+	.perfctr	= MSR_ARCH_PERFMON_PERFCTR1,
+	.evntsel	= MSR_ARCH_PERFMON_EVENTSEL1,
 };
 
 static void probe_nmi_watchdog(void)
@@ -626,8 +645,10 @@ static void probe_nmi_watchdog(void)
 		wd_ops = &k7_wd_ops;
 		break;
 	case X86_VENDOR_INTEL:
-		/* Work around Core Duo (Yonah) errata AE49 where perfctr1
-		   doesn't have a working enable bit. */
+		/*
+		 * Work around Core Duo (Yonah) errata AE49 where perfctr1
+		 * doesn't have a working enable bit.
+		 */
 		if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) {
 			intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
 			intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
@@ -638,7 +659,7 @@ static void probe_nmi_watchdog(void)
 		}
 		switch (boot_cpu_data.x86) {
 		case 6:
-			if (boot_cpu_data.x86_model > 0xd)
+			if (boot_cpu_data.x86_model > 13)
 				return;
 
 			wd_ops = &p6_wd_ops;
@@ -699,10 +720,11 @@ int lapic_wd_event(unsigned nmi_hz)
 {
 	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
 	u64 ctr;
+
 	rdmsrl(wd->perfctr_msr, ctr);
-	if (ctr & wd_ops->checkbit) { /* perfctr still running? */
+	if (ctr & wd_ops->checkbit) /* perfctr still running? */
 		return 0;
-	}
+
 	wd_ops->rearm(wd, nmi_hz);
 	return 1;
 }
-- 
cgit v1.2.3


From 116f570e5d8347bee2614da2361eeadbb639ea50 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Tue, 24 Jun 2008 22:52:04 +0200
Subject: x86: nmi_watchdog - use nmi_watchdog variable for printing

Since it is possible NMI_ definitions could be changed
one day we better print out real nmi_watchdog value instead
of constant string.

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: macro@linux-mips.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic_32.c | 2 +-
 arch/x86/kernel/apic_64.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 4932d7813bcd..b5571fb0f77e 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -550,7 +550,7 @@ void __init setup_boot_APIC_clock(void)
 			lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
 		else
 			printk(KERN_WARNING "APIC timer registered as dummy,"
-			       " due to nmi_watchdog=1!\n");
+				" due to nmi_watchdog=%d!\n", nmi_watchdog);
 	}
 
 	/* Setup the lapic or request the broadcast */
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 5279dc924381..8c751f731bca 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -417,7 +417,7 @@ void __init setup_boot_APIC_clock(void)
 		lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
 	else
 		printk(KERN_WARNING "APIC timer registered as dummy,"
-		       " due to nmi_watchdog=1!\n");
+			" due to nmi_watchdog=%d!\n", nmi_watchdog);
 
 	setup_APIC_timer();
 }
-- 
cgit v1.2.3


From 2b6addad2d67a2d75ae10a1c8efd18d81d78ff82 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Tue, 24 Jun 2008 22:52:04 +0200
Subject: x86: nmi_watchdog - remove useless check

Since nmi_watchdog is unsigned variable we may
safely remove the check for negative value.

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: macro@linux-mips.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/nmi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 9ebf71323c9a..427c511e7adc 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -187,7 +187,7 @@ error:
 
 static int __init setup_nmi_watchdog(char *str)
 {
-	int nmi;
+	unsigned int nmi;
 
 	if (!strncmp(str, "panic", 5)) {
 		panic_on_timeout = 1;
@@ -199,7 +199,7 @@ static int __init setup_nmi_watchdog(char *str)
 
 	get_option(&str, &nmi);
 
-	if (nmi >= NMI_INVALID || nmi < NMI_NONE)
+	if (nmi >= NMI_INVALID)
 		return 0;
 
 	nmi_watchdog = nmi;
-- 
cgit v1.2.3


From c376d45432d935e6f1e0ff2d6be3734bcd3ba455 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Tue, 24 Jun 2008 22:52:05 +0200
Subject: x86: nmi_watchdog - use NMI_NONE by default

There is no need to keep NMI_DISABLED definition and use it
for nmi_watchdog by default. Here is the point why:

- IO-APIC and APIC chips are programmed for nmi_watchdog support at very
  early stage of kernel booting and not having nmi_watchdog specified as
  boot option lead only to nmi_watchdog becomes to NMI_NONE anyway
- enable nmi_watchdog thru /proc/sys/kernel/nmi if it was not specified at
  boot is not possible too (even having this sysfs entry)

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: macro@linux-mips.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic_64.c    |  1 -
 arch/x86/kernel/io_apic_64.c |  2 --
 arch/x86/kernel/nmi.c        | 26 +++-----------------------
 arch/x86/kernel/smpboot.c    |  1 -
 include/asm-x86/nmi.h        |  3 ---
 5 files changed, 3 insertions(+), 30 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 8c751f731bca..1e3d32e27c14 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -826,7 +826,6 @@ static void __cpuinit lapic_setup_esr(void)
 void __cpuinit end_local_APIC_setup(void)
 {
 	lapic_setup_esr();
-	nmi_watchdog_default();
 	setup_apic_nmi_watchdog(NULL);
 	apic_pm_activate();
 }
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 08c48750888a..2b4c40bc12c9 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -1729,7 +1729,6 @@ static inline void __init check_timer(void)
 		}
 		unmask_IO_APIC_irq(0);
 		if (!no_timer_check && timer_irq_works()) {
-			nmi_watchdog_default();
 			if (nmi_watchdog == NMI_IO_APIC) {
 				setup_nmi();
 				enable_8259A_irq(0);
@@ -1758,7 +1757,6 @@ static inline void __init check_timer(void)
 		if (timer_irq_works()) {
 			apic_printk(APIC_VERBOSE," works.\n");
 			timer_through_8259 = 1;
-			nmi_watchdog_default();
 			if (nmi_watchdog == NMI_IO_APIC) {
 				disable_8259A_irq(0);
 				setup_nmi();
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 427c511e7adc..32acda25e3cb 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -52,7 +52,7 @@ static cpumask_t backtrace_mask = CPU_MASK_NONE;
 atomic_t nmi_active = ATOMIC_INIT(0);		/* oprofile uses this */
 EXPORT_SYMBOL(nmi_active);
 
-unsigned int nmi_watchdog = NMI_DEFAULT;
+unsigned int nmi_watchdog = NMI_NONE;
 EXPORT_SYMBOL(nmi_watchdog);
 
 static int panic_on_timeout;
@@ -92,14 +92,6 @@ static inline unsigned int get_timer_irqs(int cpu)
 #endif
 }
 
-/* Run after command line and cpu_init init, but before all other checks */
-void nmi_watchdog_default(void)
-{
-	if (nmi_watchdog != NMI_DEFAULT)
-		return;
-	nmi_watchdog = NMI_NONE;
-}
-
 #ifdef CONFIG_SMP
 /*
  * The performance counters used by NMI_LOCAL_APIC don't trigger when
@@ -127,7 +119,7 @@ int __init check_nmi_watchdog(void)
 	unsigned int *prev_nmi_count;
 	int cpu;
 
-	if (nmi_watchdog == NMI_NONE || nmi_watchdog == NMI_DISABLED)
+	if (nmi_watchdog == NMI_NONE)
 		return 0;
 
 	if (!atomic_read(&nmi_active))
@@ -482,24 +474,12 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
 	if (!!old_state == !!nmi_watchdog_enabled)
 		return 0;
 
-	if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) {
+	if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_NONE) {
 		printk(KERN_WARNING
 			"NMI watchdog is permanently disabled\n");
 		return -EIO;
 	}
 
-	/* if nmi_watchdog is not set yet, then set it */
-	nmi_watchdog_default();
-
-#ifdef CONFIG_X86_32
-	if (nmi_watchdog == NMI_NONE) {
-		if (lapic_watchdog_ok())
-			nmi_watchdog = NMI_LOCAL_APIC;
-		else
-			nmi_watchdog = NMI_IO_APIC;
-	}
-#endif
-
 	if (nmi_watchdog == NMI_LOCAL_APIC) {
 		if (nmi_watchdog_enabled)
 			enable_lapic_nmi_watchdog();
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 3b48d1f4c7c3..3b19441d78b8 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1139,7 +1139,6 @@ static void __init smp_cpu_index_default(void)
 void __init native_smp_prepare_cpus(unsigned int max_cpus)
 {
 	preempt_disable();
-	nmi_watchdog_default();
 	smp_cpu_index_default();
 	current_cpu_data = boot_cpu_data;
 	cpu_callin_map = cpumask_of_cpu(0);
diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h
index f0e435dd38fb..1348e542360f 100644
--- a/include/asm-x86/nmi.h
+++ b/include/asm-x86/nmi.h
@@ -20,7 +20,6 @@ extern void default_do_nmi(struct pt_regs *);
 #endif
 
 extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
-extern void nmi_watchdog_default(void);
 extern int check_nmi_watchdog(void);
 extern int nmi_watchdog_enabled;
 extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
@@ -38,12 +37,10 @@ extern int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason);
 
 extern atomic_t nmi_active;
 extern unsigned int nmi_watchdog;
-#define NMI_DISABLED    -1
 #define NMI_NONE	0
 #define NMI_IO_APIC	1
 #define NMI_LOCAL_APIC	2
 #define NMI_INVALID	3
-#define NMI_DEFAULT	NMI_DISABLED
 
 struct ctl_table;
 struct file;
-- 
cgit v1.2.3


From 4de0043617f949fdac538fd59335e2150cd1b863 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Tue, 24 Jun 2008 22:52:06 +0200
Subject: x86: nmi_watchdog - introduce nmi_watchdog_active() helper

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: macro@linux-mips.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/nmi.c | 13 ++++---------
 include/asm-x86/nmi.h | 13 +++++++++++++
 2 files changed, 17 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 32acda25e3cb..8dfe9db87a9e 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -119,10 +119,7 @@ int __init check_nmi_watchdog(void)
 	unsigned int *prev_nmi_count;
 	int cpu;
 
-	if (nmi_watchdog == NMI_NONE)
-		return 0;
-
-	if (!atomic_read(&nmi_active))
+	if (!nmi_watchdog_active() || !atomic_read(&nmi_active))
 		return 0;
 
 	prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
@@ -317,8 +314,7 @@ void setup_apic_nmi_watchdog(void *unused)
 void stop_apic_nmi_watchdog(void *unused)
 {
 	/* only support LOCAL and IO APICs for now */
-	if (nmi_watchdog != NMI_LOCAL_APIC &&
-	    nmi_watchdog != NMI_IO_APIC)
+	if (!nmi_watchdog_active())
 		return;
 	if (__get_cpu_var(wd_enabled) == 0)
 		return;
@@ -348,8 +344,7 @@ static DEFINE_PER_CPU(int, nmi_touch);
 
 void touch_nmi_watchdog(void)
 {
-	if (nmi_watchdog == NMI_LOCAL_APIC ||
-		nmi_watchdog == NMI_IO_APIC) {
+	if (nmi_watchdog_active()) {
 		unsigned cpu;
 
 		/*
@@ -474,7 +469,7 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
 	if (!!old_state == !!nmi_watchdog_enabled)
 		return 0;
 
-	if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_NONE) {
+	if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) {
 		printk(KERN_WARNING
 			"NMI watchdog is permanently disabled\n");
 		return -EIO;
diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h
index 1348e542360f..21f8d0202a82 100644
--- a/include/asm-x86/nmi.h
+++ b/include/asm-x86/nmi.h
@@ -56,6 +56,19 @@ static inline void localise_nmi_watchdog(void)
 	if (nmi_watchdog == NMI_IO_APIC)
 		nmi_watchdog = NMI_LOCAL_APIC;
 }
+
+/* check if nmi_watchdog is active (ie was specified at boot) */
+static inline int nmi_watchdog_active(void)
+{
+	/*
+	 * actually it should be:
+	 * 	return (nmi_watchdog == NMI_LOCAL_APIC ||
+	 * 		nmi_watchdog == NMI_IO_APIC)
+	 * but since they are power of two we could use a
+	 * cheaper way --cvg
+	 */
+	return nmi_watchdog & 0x3;
+}
 #endif
 
 void lapic_watchdog_stop(void);
-- 
cgit v1.2.3


From ada857082317e6883cfcf7deb4e0c54d3c447cb0 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 25 Jun 2008 00:19:00 -0400
Subject: x86: remove open-coded save/load segment operations

This removes a pile of buggy open-coded implementations of savesegment
and loadsegment.

(They are buggy because they don't have memory barriers to prevent
them from being reordered with respect to memory accesses.)

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: xen-devel <xen-devel@lists.xensource.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/common_64.c |  3 ++-
 arch/x86/kernel/process_64.c    | 28 +++++++++++++++-------------
 2 files changed, 17 insertions(+), 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
index 39eaefbcec06..751850235291 100644
--- a/arch/x86/kernel/cpu/common_64.c
+++ b/arch/x86/kernel/cpu/common_64.c
@@ -480,7 +480,8 @@ void pda_init(int cpu)
 	struct x8664_pda *pda = cpu_pda(cpu);
 
 	/* Setup up data that may be needed in __get_free_pages early */
-	asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
+	loadsegment(fs, 0);
+	loadsegment(gs, 0);
 	/* Memory clobbers used to order PDA accessed */
 	mb();
 	wrmsrl(MSR_GS_BASE, pda);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 290183e9731a..ddc6fcc73dc6 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -335,10 +335,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
 	p->thread.fs = me->thread.fs;
 	p->thread.gs = me->thread.gs;
 
-	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
-	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
-	asm("mov %%es,%0" : "=m" (p->thread.es));
-	asm("mov %%ds,%0" : "=m" (p->thread.ds));
+	savesegment(gs, p->thread.gsindex);
+	savesegment(fs, p->thread.fsindex);
+	savesegment(es, p->thread.es);
+	savesegment(ds, p->thread.ds);
 
 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
@@ -377,7 +377,9 @@ out:
 void
 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 {
-	asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
+	loadsegment(fs, 0);
+	loadsegment(es, 0);
+	loadsegment(ds, 0);
 	load_gs_index(0);
 	regs->ip		= new_ip;
 	regs->sp		= new_sp;
@@ -550,11 +552,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 * Switch DS and ES.
 	 * This won't pick up thread selector changes, but I guess that is ok.
 	 */
-	asm volatile("mov %%es,%0" : "=m" (prev->es));
+	savesegment(es, prev->es);
 	if (unlikely(next->es | prev->es))
 		loadsegment(es, next->es); 
-	
-	asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
+
+	savesegment(ds, prev->ds);
 	if (unlikely(next->ds | prev->ds))
 		loadsegment(ds, next->ds);
 
@@ -565,7 +567,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 */
 	{ 
 		unsigned fsindex;
-		asm volatile("movl %%fs,%0" : "=r" (fsindex)); 
+		savesegment(fs, fsindex);
 		/* segment register != 0 always requires a reload. 
 		   also reload when it has changed. 
 		   when prev process used 64bit base always reload
@@ -586,7 +588,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	}
 	{ 
 		unsigned gsindex;
-		asm volatile("movl %%gs,%0" : "=r" (gsindex)); 
+		savesegment(gs, gsindex);
 		if (unlikely(gsindex | next->gsindex | prev->gs)) {
 			load_gs_index(next->gsindex);
 			if (gsindex)
@@ -767,7 +769,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 			set_32bit_tls(task, FS_TLS, addr);
 			if (doit) {
 				load_TLS(&task->thread, cpu);
-				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
+				loadsegment(fs, FS_TLS_SEL);
 			}
 			task->thread.fsindex = FS_TLS_SEL;
 			task->thread.fs = 0;
@@ -777,7 +779,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 			if (doit) {
 				/* set the selector to 0 to not confuse
 				   __switch_to */
-				asm volatile("movl %0,%%fs" :: "r" (0));
+				loadsegment(fs, 0);
 				ret = checking_wrmsrl(MSR_FS_BASE, addr);
 			}
 		}
@@ -800,7 +802,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 		if (task->thread.gsindex == GS_TLS_SEL)
 			base = read_32bit_tls(task, GS_TLS);
 		else if (doit) {
-			asm("movl %%gs,%0" : "=r" (gsindex));
+			savesegment(gs, gsindex);
 			if (gsindex)
 				rdmsrl(MSR_KERNEL_GS_BASE, base);
 			else
-- 
cgit v1.2.3


From fc8b8a60ffa7c89da58c75109dacf0b2798c7caf Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 25 Jun 2008 00:19:01 -0400
Subject: x86, 64-bit: use write_gdt_entry in vsyscall_set_cpu

Use write_gdt_entry to generate the special vgetcpu descriptor in the
vsyscall page.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: xen-devel <xen-devel@lists.xensource.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vsyscall_64.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 61efa2f7d564..c87cbd84c3e5 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -249,7 +249,7 @@ static ctl_table kernel_root_table2[] = {
    doesn't violate that. We'll find out if it does. */
 static void __cpuinit vsyscall_set_cpu(int cpu)
 {
-	unsigned long *d;
+	unsigned long d;
 	unsigned long node = 0;
 #ifdef CONFIG_NUMA
 	node = cpu_to_node(cpu);
@@ -260,11 +260,11 @@ static void __cpuinit vsyscall_set_cpu(int cpu)
 	/* Store cpu number in limit so that it can be loaded quickly
 	   in user space in vgetcpu.
 	   12 bits for the CPU and 8 bits for the node. */
-	d = (unsigned long *)(get_cpu_gdt_table(cpu) + GDT_ENTRY_PER_CPU);
-	*d = 0x0f40000000000ULL;
-	*d |= cpu;
-	*d |= (node & 0xf) << 12;
-	*d |= (node >> 4) << 48;
+	d = 0x0f40000000000ULL;
+	d |= cpu;
+	d |= (node & 0xf) << 12;
+	d |= (node >> 4) << 48;
+	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
 }
 
 static void __cpuinit cpu_vsyscall_init(void *arg)
-- 
cgit v1.2.3


From 4e29684c40f2a332ba4d05f6482d5807725d5624 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 24 Jun 2008 12:18:14 -0700
Subject: x86: introduce init_memory_mapping for 32bit #1

... so can we use mem below max_low_pfn earlier.

this allows us to move several functions more early instead of waiting
to after paging_init.

That includes moving relocate_initrd() earlier in the bootup, and kva
related early setup done in initmem_init. (in followup patches)

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c |  10 ++--
 arch/x86/mm/init_32.c      | 141 +++++++++++++++++++++++++++++++++++----------
 include/asm-x86/page_32.h  |   2 +
 3 files changed, 120 insertions(+), 33 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index bba8d57bd7d8..03007cada0d1 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -226,10 +226,8 @@ static void __init reserve_initrd(void)
 	}
 
 	/* We need to move the initrd down into lowmem */
-	ramdisk_target = max_pfn_mapped<<PAGE_SHIFT;
-	ramdisk_here = find_e820_area(min(ramdisk_target, end_of_lowmem>>1),
-				 end_of_lowmem, ramdisk_size,
-				 PAGE_SIZE);
+	ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size,
+					 PAGE_SIZE);
 
 	if (ramdisk_here == -1ULL)
 		panic("Cannot find place for new RAMDISK of size %lld\n",
@@ -433,8 +431,12 @@ void __init setup_arch(char **cmdline_p)
 		max_pfn = e820_end_of_ram();
 	}
 
+	/* max_low_pfn get updated here */
 	find_low_pfn_range();
 
+	/* max_pfn_mapped is updated here */
+	init_memory_mapping(0, (max_low_pfn << PAGE_SHIFT));
+
 	reserve_initrd();
 
 	dmi_scan_machine();
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 20ca29591abe..619058e6bff8 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -57,6 +57,27 @@ unsigned long highstart_pfn, highend_pfn;
 
 static noinline int do_test_wp_bit(void);
 
+
+static unsigned long __initdata table_start;
+static unsigned long __meminitdata table_end;
+static unsigned long __meminitdata table_top;
+
+static int __initdata after_init_bootmem;
+
+static __init void *alloc_low_page(unsigned long *phys)
+{
+	unsigned long pfn = table_end++;
+	void *adr;
+
+	if (pfn >= table_top)
+		panic("alloc_low_page: ran out of memory");
+
+	adr = __va(pfn * PAGE_SIZE);
+	memset(adr, 0, PAGE_SIZE);
+	*phys  = pfn * PAGE_SIZE;
+	return adr;
+}
+
 /*
  * Creates a middle page table and puts a pointer to it in the
  * given global directory entry. This only returns the gd entry
@@ -68,9 +89,12 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
 	pmd_t *pmd_table;
 
 #ifdef CONFIG_X86_PAE
+	unsigned long phys;
 	if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
-		pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
-
+		if (after_init_bootmem)
+			pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+		else
+			pmd_table = (pmd_t *)alloc_low_page(&phys);
 		paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
 		set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
 		pud = pud_offset(pgd, 0);
@@ -92,12 +116,16 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 	if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
 		pte_t *page_table = NULL;
 
+		if (after_init_bootmem) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
-		page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
+			page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
 #endif
-		if (!page_table) {
-			page_table =
+			if (!page_table)
+				page_table =
 				(pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+		} else {
+			unsigned long phys;
+			page_table = (pte_t *)alloc_low_page(&phys);
 		}
 
 		paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
@@ -155,7 +183,9 @@ static inline int is_kernel_text(unsigned long addr)
  * of max_low_pfn pages, by creating page tables starting from address
  * PAGE_OFFSET:
  */
-static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
+static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
+						unsigned long start,
+						unsigned long end)
 {
 	int pgd_idx, pmd_idx, pte_ofs;
 	unsigned long pfn;
@@ -163,18 +193,19 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 	pmd_t *pmd;
 	pte_t *pte;
 	unsigned pages_2m = 0, pages_4k = 0;
+	unsigned limit_pfn = end >> PAGE_SHIFT;
 
 	pgd_idx = pgd_index(PAGE_OFFSET);
 	pgd = pgd_base + pgd_idx;
-	pfn = 0;
+	pfn = start >> PAGE_SHIFT;
 
 	for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
 		pmd = one_md_table_init(pgd);
-		if (pfn >= max_low_pfn)
+		if (pfn >= limit_pfn)
 			continue;
 
 		for (pmd_idx = 0;
-		     pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
+		     pmd_idx < PTRS_PER_PMD && pfn < limit_pfn;
 		     pmd++, pmd_idx++) {
 			unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
 
@@ -418,20 +449,7 @@ static void __init pagetable_init(void)
 
 	paravirt_pagetable_setup_start(pgd_base);
 
-	/* Enable PSE if available */
-	if (cpu_has_pse)
-		set_in_cr4(X86_CR4_PSE);
-
-	/* Enable PGE if available */
-	if (cpu_has_pge) {
-		set_in_cr4(X86_CR4_PGE);
-		__PAGE_KERNEL |= _PAGE_GLOBAL;
-		__PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
-	}
-
-	kernel_physical_mapping_init(pgd_base);
 	remap_numa_kva();
-
 	/*
 	 * Fixed mappings, only the page table structure has to be
 	 * created - mappings will be set by set_fixmap():
@@ -703,6 +721,7 @@ void __init setup_bootmem_allocator(void)
 		free_bootmem_with_active_regions(i, max_low_pfn);
 	early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
 
+	after_init_bootmem = 1;
 }
 
 /*
@@ -723,6 +742,77 @@ static void __init remapped_pgdat_init(void)
 	}
 }
 
+static void __init find_early_table_space(unsigned long end)
+{
+	unsigned long puds, pmds, tables, start;
+
+	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
+	tables = PAGE_ALIGN(puds * sizeof(pud_t));
+
+	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+	tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
+
+	/*
+	 * RED-PEN putting page tables only on node 0 could
+	 * cause a hotspot and fill up ZONE_DMA. The page tables
+	 * need roughly 0.5KB per GB.
+	 */
+	start = 0x7000;
+	table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
+					tables, PAGE_SIZE);
+	if (table_start == -1UL)
+		panic("Cannot find space for the kernel page tables");
+
+	table_start >>= PAGE_SHIFT;
+	table_end = table_start;
+	table_top = table_start + (tables>>PAGE_SHIFT);
+
+	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
+		end, table_start << PAGE_SHIFT,
+		(table_start << PAGE_SHIFT) + tables);
+}
+
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+						unsigned long end)
+{
+	pgd_t *pgd_base = swapper_pg_dir;
+
+	/*
+	 * Find space for the kernel direct mapping tables.
+	 */
+	if (!after_init_bootmem)
+		find_early_table_space(end);
+
+#ifdef CONFIG_X86_PAE
+	set_nx();
+	if (nx_enabled)
+		printk(KERN_INFO "NX (Execute Disable) protection: active\n");
+#endif
+
+	/* Enable PSE if available */
+	if (cpu_has_pse)
+		set_in_cr4(X86_CR4_PSE);
+
+	/* Enable PGE if available */
+	if (cpu_has_pge) {
+		set_in_cr4(X86_CR4_PGE);
+		__PAGE_KERNEL |= _PAGE_GLOBAL;
+		__PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
+	}
+
+	kernel_physical_mapping_init(pgd_base, start, end);
+
+	load_cr3(swapper_pg_dir);
+
+	__flush_tlb_all();
+
+	if (!after_init_bootmem)
+		reserve_early(table_start << PAGE_SHIFT,
+				 table_end << PAGE_SHIFT, "PGTABLE");
+
+	return end >> PAGE_SHIFT;
+}
+
 /*
  * paging_init() sets up the page tables - note that the first 8MB are
  * already mapped by head.S.
@@ -732,15 +822,8 @@ static void __init remapped_pgdat_init(void)
  */
 void __init paging_init(void)
 {
-#ifdef CONFIG_X86_PAE
-	set_nx();
-	if (nx_enabled)
-		printk(KERN_INFO "NX (Execute Disable) protection: active\n");
-#endif
 	pagetable_init();
 
-	load_cr3(swapper_pg_dir);
-
 	__flush_tlb_all();
 
 	kmap_init();
diff --git a/include/asm-x86/page_32.h b/include/asm-x86/page_32.h
index 3810d14051e8..4ae1daba129b 100644
--- a/include/asm-x86/page_32.h
+++ b/include/asm-x86/page_32.h
@@ -93,6 +93,8 @@ extern int sysctl_legacy_va_layout;
 #define MAXMEM			(-__PAGE_OFFSET - __VMALLOC_RESERVE)
 
 extern void find_low_pfn_range(void);
+extern unsigned long init_memory_mapping(unsigned long start,
+					 unsigned long end);
 extern void initmem_init(unsigned long, unsigned long);
 extern void zone_sizes_init(void);
 extern void setup_bootmem_allocator(void);
-- 
cgit v1.2.3


From cfb0e53b05402f1ce65053677409a819c1798d34 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 24 Jun 2008 12:18:58 -0700
Subject: x86: introduce init_memory_mapping for 32bit #2

moving relocate_initrd early

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 22 +++++++---------------
 arch/x86/mm/init_32.c      |  3 ---
 include/asm-x86/setup.h    |  1 -
 3 files changed, 7 insertions(+), 19 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 03007cada0d1..4b953dc93883 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -186,7 +186,7 @@ static inline void copy_edd(void)
 
 #ifdef CONFIG_BLK_DEV_INITRD
 
-static bool do_relocate_initrd = false;
+static void __init relocate_initrd(void);
 
 static void __init reserve_initrd(void)
 {
@@ -195,7 +195,6 @@ static void __init reserve_initrd(void)
 	u64 ramdisk_end   = ramdisk_image + ramdisk_size;
 	u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
 	u64 ramdisk_here;
-	u64 ramdisk_target;
 
 	if (!boot_params.hdr.type_of_loader ||
 	    !ramdisk_image || !ramdisk_size)
@@ -242,12 +241,12 @@ static void __init reserve_initrd(void)
 	printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
 			 ramdisk_here, ramdisk_here + ramdisk_size);
 
-	do_relocate_initrd = true;
+	relocate_initrd();
 }
 
 #define MAX_MAP_CHUNK	(NR_FIX_BTMAPS << PAGE_SHIFT)
 
-void __init post_reserve_initrd(void)
+static void __init relocate_initrd(void)
 {
 	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
 	u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
@@ -256,9 +255,6 @@ void __init post_reserve_initrd(void)
 	unsigned long slop, clen, mapaddr;
 	char *p, *q;
 
-	if (!do_relocate_initrd)
-		return;
-
 	ramdisk_here = initrd_start - PAGE_OFFSET;
 
 	q = (char *)initrd_start;
@@ -269,10 +265,6 @@ void __init post_reserve_initrd(void)
 		p = (char *)__va(ramdisk_image);
 		memcpy(q, p, clen);
 		q += clen;
-		/* need to free these low pages...*/
-		printk(KERN_INFO "Freeing old partial RAMDISK %08llx-%08llx\n",
-			 ramdisk_image, ramdisk_image + clen - 1);
-		free_bootmem(ramdisk_image, clen);
 		ramdisk_image += clen;
 		ramdisk_size  -= clen;
 	}
@@ -298,16 +290,16 @@ void __init post_reserve_initrd(void)
 		ramdisk_image, ramdisk_image + ramdisk_size - 1,
 		ramdisk_here, ramdisk_here + ramdisk_size - 1);
 
-	/* need to free that, otherwise init highmem will reserve it again */
+	/*
+	 * need to free old one, otherwise init cross max_low_pfn could be
+	 * converted to bootmem
+	 */
 	free_early(ramdisk_image, ramdisk_image+ramdisk_size);
 }
 #else
 void __init reserve_initrd(void)
 {
 }
-void __init post_reserve_initrd(void)
-{
-}
 #endif /* CONFIG_BLK_DEV_INITRD */
 
 #ifdef CONFIG_MCA
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 619058e6bff8..ecccb055b085 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -831,9 +831,6 @@ void __init paging_init(void)
 	/*
 	 * NOTE: at this point the bootmem allocator is fully available.
 	 */
-
-	post_reserve_initrd();
-
 	remapped_pgdat_init();
 	sparse_init();
 	zone_sizes_init();
diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h
index 4ebb4ef14c06..bb12a1619c12 100644
--- a/include/asm-x86/setup.h
+++ b/include/asm-x86/setup.h
@@ -39,7 +39,6 @@ void reserve_crashkernel(void);
 #include <asm/bootparam.h>
 
 void reserve_standard_io_resources(void);
-extern void post_reserve_initrd(void);
 
 #ifndef _SETUP
 
-- 
cgit v1.2.3


From 976dd4dc99c3eaf45e3802ed46e3cc06a1ad8689 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 24 Jun 2008 14:55:32 -0700
Subject: x86: fix e820_update_range size when overlapping

before that we relay on sanitize_e820_map to remove the overlap.

but e820_update_range(,,E820_RESERVED, E820_RAM) will not work

this patch fix that

who is going to use this?

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 512f779fc6af..15b4393ff9bf 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -425,6 +425,11 @@ u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
 		e820_add_region(final_start, final_end - final_start,
 					 new_type);
 		real_updated_size += final_end - final_start;
+
+		ei->size -= final_end - final_start;
+		if (ei->addr < final_start)
+			continue;
+		ei->addr = final_end;
 	}
 	return real_updated_size;
 }
-- 
cgit v1.2.3


From 232b957ae93973a5f8619ef61b916744b747478c Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 24 Jun 2008 14:58:38 -0700
Subject: x86: change size if e820_update/remove_range

in case someone using crazy parameter while calling them.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 15b4393ff9bf..1b76b25b4d9a 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -405,6 +405,9 @@ u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
 
 	BUG_ON(old_type == new_type);
 
+	if (size > (ULLONG_MAX - start))
+		size = ULLONG_MAX - start;
+
 	for (i = 0; i < e820.nr_map; i++) {
 		struct e820entry *ei = &e820.map[i];
 		u64 final_start, final_end;
@@ -441,6 +444,9 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
 	int i;
 	u64 real_removed_size = 0;
 
+	if (size > (ULLONG_MAX - start))
+		size = ULLONG_MAX - start;
+
 	for (i = 0; i < e820.nr_map; i++) {
 		struct e820entry *ei = &e820.map[i];
 		u64 final_start, final_end;
-- 
cgit v1.2.3


From c987d12f8455b19b3b057d63bac3de161bd809fc Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 24 Jun 2008 22:14:09 -0700
Subject: x86: remove end_pfn in 64bit

and use max_pfn directly.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/aperture_64.c      |  6 ++++--
 arch/x86/kernel/e820.c             |  2 +-
 arch/x86/kernel/early-quirks.c     |  2 +-
 arch/x86/kernel/machine_kexec_64.c |  2 +-
 arch/x86/kernel/pci-calgary_64.c   |  4 ++--
 arch/x86/kernel/pci-dma.c          |  4 ++--
 arch/x86/kernel/pci-gart_64.c      |  4 ++--
 arch/x86/kernel/pci-swiotlb_64.c   |  2 +-
 arch/x86/mm/init_64.c              | 17 ++++++-----------
 arch/x86/mm/k8topology_64.c        |  4 ++--
 arch/x86/mm/numa_64.c              |  4 ++--
 arch/x86/mm/srat_64.c              |  2 +-
 arch/x86/power/hibernate_64.c      |  2 +-
 include/asm-x86/page_64.h          |  5 +++--
 14 files changed, 29 insertions(+), 31 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 600470d464fa..9f907806c1a5 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -407,7 +407,9 @@ void __init gart_iommu_hole_init(void)
 				    agp_aper_base == aper_base &&
 				    agp_aper_order == aper_order) {
 					/* the same between two setting from NB and agp */
-					if (!no_iommu && end_pfn > MAX_DMA32_PFN && !printed_gart_size_msg) {
+					if (!no_iommu &&
+					    max_pfn > MAX_DMA32_PFN &&
+					    !printed_gart_size_msg) {
 						printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n");
 						printk(KERN_ERR "please increase GART size in your BIOS setup\n");
 						printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n");
@@ -448,7 +450,7 @@ out:
 		/* Got the aperture from the AGP bridge */
 	} else if (swiotlb && !valid_agp) {
 		/* Do nothing */
-	} else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) ||
+	} else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
 		   force_iommu ||
 		   valid_agp ||
 		   fallback_aper_force) {
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 1b76b25b4d9a..3900ff51bc68 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -527,7 +527,7 @@ __init void e820_setup_gap(void)
 
 #ifdef CONFIG_X86_64
 	if (!found) {
-		gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
+		gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
 		printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
 		       "address range\n"
 		       KERN_ERR "PCI: Unassigned devices with 32bit resource "
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 84fd9f2a28ff..a4665f37cfc5 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -50,7 +50,7 @@ static void __init fix_hypertransport_config(int num, int slot, int func)
 static void __init via_bugs(int  num, int slot, int func)
 {
 #ifdef CONFIG_GART_IOMMU
-	if ((end_pfn > MAX_DMA32_PFN ||  force_iommu) &&
+	if ((max_pfn > MAX_DMA32_PFN ||  force_iommu) &&
 	    !gart_iommu_aperture_allowed) {
 		printk(KERN_INFO
 		       "Looks like a VIA chipset. Disabling IOMMU."
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 576a03db4511..7830dc4a8380 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -110,7 +110,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
 {
 	pgd_t *level4p;
 	level4p = (pgd_t *)__va(start_pgtable);
- 	return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+	return init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
 }
 
 static void set_idt(void *newidt, u16 limit)
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index e28ec497e142..6959b5c45df4 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1394,7 +1394,7 @@ void __init detect_calgary(void)
 		return;
 	}
 
-	specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE);
+	specified_table_size = determine_tce_table_size(max_pfn * PAGE_SIZE);
 
 	for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
 		struct calgary_bus_info *info = &bus_info[bus];
@@ -1459,7 +1459,7 @@ int __init calgary_iommu_init(void)
 	if (ret) {
 		printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
 		       "falling back to no_iommu\n", ret);
-		if (end_pfn > MAX_DMA32_PFN)
+		if (max_pfn > MAX_DMA32_PFN)
 			printk(KERN_ERR "WARNING more than 4GB of memory, "
 					"32bit PCI may malfunction.\n");
 		return ret;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index cb0bdf440715..8467ec2320f1 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -75,7 +75,7 @@ early_param("dma32_size", parse_dma32_size_opt);
 void __init dma32_reserve_bootmem(void)
 {
 	unsigned long size, align;
-	if (end_pfn <= MAX_DMA32_PFN)
+	if (max_pfn <= MAX_DMA32_PFN)
 		return;
 
 	/*
@@ -94,7 +94,7 @@ void __init dma32_reserve_bootmem(void)
 static void __init dma32_free_bootmem(void)
 {
 
-	if (end_pfn <= MAX_DMA32_PFN)
+	if (max_pfn <= MAX_DMA32_PFN)
 		return;
 
 	if (!dma32_bootmem_ptr)
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 021f3c684a62..d0d18db5d2a4 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -751,10 +751,10 @@ void __init gart_iommu_init(void)
 		return;
 
 	if (no_iommu ||
-	    (!force_iommu && end_pfn <= MAX_DMA32_PFN) ||
+	    (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
 	    !gart_iommu_aperture ||
 	    (no_agp && init_k8_gatt(&info) < 0)) {
-		if (end_pfn > MAX_DMA32_PFN) {
+		if (max_pfn > MAX_DMA32_PFN) {
 			printk(KERN_WARNING "More than 4GB of memory "
 			       	          "but GART IOMMU not available.\n"
 			       KERN_WARNING "falling back to iommu=soft.\n");
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 490da7f4b8d0..82299cd1d04d 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -38,7 +38,7 @@ const struct dma_mapping_ops swiotlb_dma_ops = {
 void __init pci_swiotlb_init(void)
 {
 	/* don't initialize swiotlb if iommu=off (no_iommu=1) */
-	if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN)
+	if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)
 	       swiotlb = 1;
 	if (swiotlb_force)
 		swiotlb = 1;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 6eced2f10734..d5d4b04d48a4 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -48,11 +48,6 @@
 #include <asm/numa.h>
 #include <asm/cacheflush.h>
 
-/*
- * PFN of last memory page.
- */
-unsigned long end_pfn;
-
 /*
  * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
  * The direct mapping extends to max_pfn_mapped, so that we can directly access
@@ -586,9 +581,9 @@ void __init paging_init(void)
 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
 	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
-	max_zone_pfns[ZONE_NORMAL] = end_pfn;
+	max_zone_pfns[ZONE_NORMAL] = max_pfn;
 
-	memory_present(0, 0, end_pfn);
+	memory_present(0, 0, max_pfn);
 	sparse_init();
 	free_area_init_nodes(max_zone_pfns);
 }
@@ -670,8 +665,8 @@ void __init mem_init(void)
 #else
 	totalram_pages = free_all_bootmem();
 #endif
-	reservedpages = end_pfn - totalram_pages -
-					absent_pages_in_range(0, end_pfn);
+	reservedpages = max_pfn - totalram_pages -
+					absent_pages_in_range(0, max_pfn);
 	after_bootmem = 1;
 
 	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
@@ -690,7 +685,7 @@ void __init mem_init(void)
 	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
 				"%ldk reserved, %ldk data, %ldk init)\n",
 		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
-		end_pfn << (PAGE_SHIFT-10),
+		max_pfn << (PAGE_SHIFT-10),
 		codesize >> 10,
 		reservedpages << (PAGE_SHIFT-10),
 		datasize >> 10,
@@ -784,7 +779,7 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
 #endif
 	unsigned long pfn = phys >> PAGE_SHIFT;
 
-	if (pfn >= end_pfn) {
+	if (pfn >= max_pfn) {
 		/*
 		 * This can happen with kdump kernels when accessing
 		 * firmware tables:
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 317573ec9256..41f1b5c00a1d 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -143,8 +143,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 		limit |= (1<<24)-1;
 		limit++;
 
-		if (limit > end_pfn << PAGE_SHIFT)
-			limit = end_pfn << PAGE_SHIFT;
+		if (limit > max_pfn << PAGE_SHIFT)
+			limit = max_pfn << PAGE_SHIFT;
 		if (limit <= base)
 			continue;
 
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 316e5f961ef0..b432d5781773 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -86,7 +86,7 @@ static int __init allocate_cachealigned_memnodemap(void)
 
 	addr = 0x8000;
 	nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
-	nodemap_addr = find_e820_area(addr, end_pfn<<PAGE_SHIFT,
+	nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
 				      nodemap_size, L1_CACHE_BYTES);
 	if (nodemap_addr == -1UL) {
 		printk(KERN_ERR
@@ -579,7 +579,7 @@ void __init paging_init(void)
 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
 	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
-	max_zone_pfns[ZONE_NORMAL] = end_pfn;
+	max_zone_pfns[ZONE_NORMAL] = max_pfn;
 
 	sparse_memory_present_with_active_regions(MAX_NUMNODES);
 	sparse_init();
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index b67f5a16755f..0fd67b81a8b6 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -299,7 +299,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
 			pxmram = 0;
 	}
 
-	e820ram = end_pfn - absent_pages_in_range(0, end_pfn);
+	e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
 	/* We seem to lose 3 pages somewhere. Allow a bit of slack. */
 	if ((long)(e820ram - pxmram) >= 1*1024*1024) {
 		printk(KERN_ERR
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index b542355e0e34..6dd000dd7933 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -83,7 +83,7 @@ static int set_up_temporary_mappings(void)
 
 	/* Set up the direct mapping from scratch */
 	start = (unsigned long)pfn_to_kaddr(0);
-	end = (unsigned long)pfn_to_kaddr(end_pfn);
+	end = (unsigned long)pfn_to_kaddr(max_pfn);
 
 	for (; start < end; start = next) {
 		pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC);
diff --git a/include/asm-x86/page_64.h b/include/asm-x86/page_64.h
index ac37643078e7..cac5b9e78265 100644
--- a/include/asm-x86/page_64.h
+++ b/include/asm-x86/page_64.h
@@ -58,7 +58,8 @@
 void clear_page(void *page);
 void copy_page(void *to, void *from);
 
-extern unsigned long end_pfn;
+/* duplicated to the one in bootmem.h */
+extern unsigned long max_pfn;
 extern unsigned long phys_base;
 
 extern unsigned long __phys_addr(unsigned long);
@@ -87,7 +88,7 @@ extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
 #endif	/* !__ASSEMBLY__ */
 
 #ifdef CONFIG_FLATMEM
-#define pfn_valid(pfn)          ((pfn) < end_pfn)
+#define pfn_valid(pfn)          ((pfn) < max_pfn)
 #endif
 
 
-- 
cgit v1.2.3


From 3381959da5a00ae8289cfbd28b0b6d228f2d1d46 Mon Sep 17 00:00:00 2001
From: Alok Kataria <akataria@vmware.com>
Date: Tue, 24 Jun 2008 11:48:30 -0700
Subject: x86: cleanup e820_setup_gap(), add e820_search_gap(), v2

This is a preparatory patch for the next patch in series.
Moves some code from e820_setup_gap to a new function e820_search_gap.
This patch is a part of a bug fix where we walk the ACPI table to calculate
a gap for PCI optional devices.

v1->v2: Patch on top of tip/master.
	Fixes a bug introduced in the last patch about the typeof "last".
	Also the new function e820_search_gap now returns if we found a gap in
	e820_map.

Signed-off-by: Alok N Kataria <akataria@vmware.com>
Cc: lenb@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 43 ++++++++++++++++++++++++++++---------------
 include/asm-x86/e820.h |  2 ++
 2 files changed, 30 insertions(+), 15 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 3900ff51bc68..22cfd665224c 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -488,26 +488,22 @@ void __init update_e820(void)
 }
 
 /*
- * Search for the biggest gap in the low 32 bits of the e820
- * memory space.  We pass this space to PCI to assign MMIO resources
- * for hotplug or unconfigured devices in.
- * Hopefully the BIOS let enough space left.
+ * Search for a gap in the e820 memory space from start_addr to 2^32.
  */
-__init void e820_setup_gap(void)
+__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
+		unsigned long start_addr)
 {
-	unsigned long gapstart, gapsize, round;
-	unsigned long long last;
-	int i;
+	unsigned long long last = 0x100000000ull;
+	int i = e820.nr_map;
 	int found = 0;
 
-	last = 0x100000000ull;
-	gapstart = 0x10000000;
-	gapsize = 0x400000;
-	i = e820.nr_map;
 	while (--i >= 0) {
 		unsigned long long start = e820.map[i].addr;
 		unsigned long long end = start + e820.map[i].size;
 
+		if (end < start_addr)
+			continue;
+
 		/*
 		 * Since "last" is at most 4GB, we know we'll
 		 * fit in 32 bits if this condition is true
@@ -515,15 +511,32 @@ __init void e820_setup_gap(void)
 		if (last > end) {
 			unsigned long gap = last - end;
 
-			if (gap > gapsize) {
-				gapsize = gap;
-				gapstart = end;
+			if (gap >= *gapsize) {
+				*gapsize = gap;
+				*gapstart = end;
 				found = 1;
 			}
 		}
 		if (start < last)
 			last = start;
 	}
+	return found;
+}
+
+/*
+ * Search for the biggest gap in the low 32 bits of the e820
+ * memory space.  We pass this space to PCI to assign MMIO resources
+ * for hotplug or unconfigured devices in.
+ * Hopefully the BIOS let enough space left.
+ */
+__init void e820_setup_gap(void)
+{
+	unsigned long gapstart, gapsize, round;
+	int found;
+
+	gapstart = 0x10000000;
+	gapsize = 0x400000;
+	found  = e820_search_gap(&gapstart, &gapsize, 0);
 
 #ifdef CONFIG_X86_64
 	if (!found) {
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index 13fa5a076aa2..f622685c9af8 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -71,6 +71,8 @@ extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type,
 			     int checktype);
 extern void update_e820(void);
 extern void e820_setup_gap(void);
+extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
+				unsigned long start_addr);
 struct setup_data;
 extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data);
 
-- 
cgit v1.2.3


From 5dab8ec139be215fbaba216fb4aea914d0f4dac5 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Wed, 25 Jun 2008 05:44:40 -0700
Subject: mm, generic, x86 boot: more tweaks to hex prints of some pfn
 addresses

Fix some problems with (and applies on top of) a previous patch:
  x86 boot: show pfn addresses in hex not decimal in some kernel info printks

Primarily change "0x%8lx" format, which displays with a right aligned
space filled hex number (spaces between the "0x" prefix and the number),
into "%0#10lx" format, which zero fills instead of space fills, and
which uses the printf flag '#' to request the "0x" prefix instead of
hard coding it.

Also replace some other "0x%lx" formats with "%#lx", making use of the
'#' printf flag again.

Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: "Yinghai Lu" <yhlu.kernel@gmail.com>
Cc: "Jack Steiner" <steiner@sgi.com>
Cc: "Mike Travis" <travis@sgi.com>
Cc: "Huang
Cc: Ying" <ying.huang@intel.com>
Cc: "Andi Kleen" <andi@firstfloor.org>
Cc: "Andrew Morton" <akpm@linux-foundation.org>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 2 +-
 mm/page_alloc.c        | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 22cfd665224c..1dcb66533dfc 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1014,7 +1014,7 @@ unsigned long __init e820_end_of_ram(void)
 	if (last_pfn > end_user_pfn)
 		last_pfn = end_user_pfn;
 
-	printk(KERN_INFO "last_pfn = 0x%lx max_arch_pfn = 0x%lx\n",
+	printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
 			 last_pfn, max_arch_pfn);
 	return last_pfn;
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b604c64a0337..f024b9b3a2a6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3520,7 +3520,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
 {
 	int i;
 
-	printk(KERN_DEBUG "Entering add_active_range(%d, 0x%lx, 0x%lx) "
+	printk(KERN_DEBUG "Entering add_active_range(%d, %#lx, %#lx) "
 			  "%d entries of %d used\n",
 			  nid, start_pfn, end_pfn,
 			  nr_nodemap_entries, MAX_ACTIVE_REGIONS);
@@ -3936,7 +3936,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 	for (i = 0; i < MAX_NR_ZONES; i++) {
 		if (i == ZONE_MOVABLE)
 			continue;
-		printk("  %-8s 0x%8lx -> 0x%8lx\n",
+		printk("  %-8s %0#10lx -> %0#10lx\n",
 				zone_names[i],
 				arch_zone_lowest_possible_pfn[i],
 				arch_zone_highest_possible_pfn[i]);
@@ -3952,7 +3952,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 	/* Print out the early_node_map[] */
 	printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
 	for (i = 0; i < nr_nodemap_entries; i++)
-		printk("  %3d: 0x%8lx -> 0x%8lx\n", early_node_map[i].nid,
+		printk("  %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
 						early_node_map[i].start_pfn,
 						early_node_map[i].end_pfn);
 
-- 
cgit v1.2.3


From 200001eb140ea33477965f2050bea0dac801974b Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Wed, 25 Jun 2008 05:44:46 -0700
Subject: x86 boot: only pick up additional EFI memmap if add_efi_memmap flag

Applies on top of the previous patch:
  x86 boot: add code to add BIOS provided EFI memory entries to kernel

Instead of always adding EFI memory map entries (if present) to the
memory map after initially finding either E820 BIOS memory map entries
and/or kernel command line memmap entries, -instead- only add such
additional EFI memory map entries if the kernel boot option:

    add_efi_memmap

is specified.

Requiring this 'add_efi_memmap' option is backward compatible with
kernels that didn't load such additional EFI memory map entries in
the first place, and it doesn't override a configuration that tries
to replace all E820 or EFI BIOS memory map entries with ones given
entirely on the kernel command line.

Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: "Yinghai Lu" <yhlu.kernel@gmail.com>
Cc: "Jack Steiner" <steiner@sgi.com>
Cc: "Mike Travis" <travis@sgi.com>
Cc: "Huang
Cc: Ying" <ying.huang@intel.com>
Cc: "Andi Kleen" <andi@firstfloor.org>
Cc: "Andrew Morton" <akpm@linux-foundation.org>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/kernel-parameters.txt |  3 +++
 Documentation/x86/x86_64/uefi.txt   |  4 ++++
 arch/x86/kernel/efi.c               | 16 ++++++++++++++--
 3 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index da13d6d1b2b0..795c487af8e4 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2150,6 +2150,9 @@ and is between 256 and 4096 characters. It is defined in the file
 	usbhid.mousepoll=
 			[USBHID] The interval which mice are to be polled at.
 
+	add_efi_memmap	[EFI; x86-32,X86-64] Include EFI memory map in
+			kernel's map of available physical RAM.
+
 	vdso=		[X86-32,SH,x86-64]
 			vdso=2: enable compat VDSO (default with COMPAT_VDSO)
 			vdso=1: enable VDSO (default)
diff --git a/Documentation/x86/x86_64/uefi.txt b/Documentation/x86/x86_64/uefi.txt
index 7d77120a5184..a5e2b4fdb170 100644
--- a/Documentation/x86/x86_64/uefi.txt
+++ b/Documentation/x86/x86_64/uefi.txt
@@ -36,3 +36,7 @@ Mechanics:
   services.
 	noefi		turn off all EFI runtime services
 	reboot_type=k	turn off EFI reboot runtime service
+- If the EFI memory map has additional entries not in the E820 map,
+  you can include those entries in the kernels memory map of available
+  physical RAM by using the following kernel command line parameter.
+	add_efi_memmap	include EFI memory map of available physical RAM
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index a03ca36a302c..94382faeadb6 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -64,6 +64,17 @@ static int __init setup_noefi(char *arg)
 }
 early_param("noefi", setup_noefi);
 
+int add_efi_memmap;
+EXPORT_SYMBOL(add_efi_memmap);
+
+static int __init setup_add_efi_memmap(char *arg)
+{
+	add_efi_memmap = 1;
+	return 0;
+}
+early_param("add_efi_memmap", setup_add_efi_memmap);
+
+
 static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
 {
 	return efi_call_virt2(get_time, tm, tc);
@@ -219,7 +230,7 @@ unsigned long efi_get_time(void)
  * (zeropage) memory map.
  */
 
-static void __init add_efi_memmap(void)
+static void __init do_add_efi_memmap(void)
 {
 	void *p;
 
@@ -406,7 +417,8 @@ void __init efi_init(void)
 	if (memmap.desc_size != sizeof(efi_memory_desc_t))
 		printk(KERN_WARNING "Kernel-defined memdesc"
 		       "doesn't match the one from EFI!\n");
-	add_efi_memmap();
+	if (add_efi_memmap)
+		do_add_efi_memmap();
 
 	/* Setup for EFI runtime service */
 	reboot_type = BOOT_EFI;
-- 
cgit v1.2.3


From b9d19f4a51447930db002409945ad40a7a373cb0 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Thu, 26 Jun 2008 00:44:56 -0700
Subject: x86: fix memory setup bug

interesting...

[    0.000000]   mapped low ram: 0 - 20000000
[    0.000000]   low ram: 00000000 - 1fff0000
[    0.000000]   bootmap 00002000 - 00006000

max_pfn_mapped > max_low_pfn?

it seems init_memory_mapping reveals an old bug.

please check attached test patch.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 4b953dc93883..392812d3c63c 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -427,7 +427,7 @@ void __init setup_arch(char **cmdline_p)
 	find_low_pfn_range();
 
 	/* max_pfn_mapped is updated here */
-	init_memory_mapping(0, (max_low_pfn << PAGE_SHIFT));
+	max_pfn_mapped = init_memory_mapping(0, (max_low_pfn << PAGE_SHIFT));
 
 	reserve_initrd();
 
-- 
cgit v1.2.3


From 378b39a4f91ab0846eb6e13d47ea812bc82b44d9 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 25 Jun 2008 17:48:14 -0700
Subject: x86: rename setup.c to setup_percpu.c

some functions need to be moved to setup_numa.c
after we merge setup32/64.c, some funcs need to be moved back to setup.c

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/Makefile       |   2 +-
 arch/x86/kernel/setup.c        | 528 -----------------------------------------
 arch/x86/kernel/setup_percpu.c | 528 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 529 insertions(+), 529 deletions(-)
 delete mode 100644 arch/x86/kernel/setup.c
 create mode 100644 arch/x86/kernel/setup_percpu.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0a1987b4acc4..5e1537b62534 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -18,7 +18,7 @@ CFLAGS_tsc_64.o		:= $(nostackp)
 obj-y			:= process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
 obj-y			+= traps_$(BITS).o irq_$(BITS).o
 obj-y			+= time_$(BITS).o ioport.o ldt.o
-obj-y			+= setup_$(BITS).o i8259.o irqinit_$(BITS).o setup.o
+obj-y			+= setup_$(BITS).o i8259.o irqinit_$(BITS).o setup_percpu.o
 obj-$(CONFIG_X86_32)	+= probe_roms_32.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
deleted file mode 100644
index 5c0c4bb5727d..000000000000
--- a/arch/x86/kernel/setup.c
+++ /dev/null
@@ -1,528 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/percpu.h>
-#include <linux/kexec.h>
-#include <linux/crash_dump.h>
-#include <asm/smp.h>
-#include <asm/percpu.h>
-#include <asm/sections.h>
-#include <asm/processor.h>
-#include <asm/setup.h>
-#include <asm/topology.h>
-#include <asm/mpspec.h>
-#include <asm/apicdef.h>
-#include <asm/highmem.h>
-
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
-struct boot_params __initdata boot_params;
-#else
-struct boot_params boot_params;
-#endif
-
-#ifdef CONFIG_X86_LOCAL_APIC
-unsigned int num_processors;
-unsigned disabled_cpus __cpuinitdata;
-/* Processor that is doing the boot up */
-unsigned int boot_cpu_physical_apicid = -1U;
-unsigned int max_physical_apicid;
-EXPORT_SYMBOL(boot_cpu_physical_apicid);
-
-/* Bitmask of physically existing CPUs */
-physid_mask_t phys_cpu_present_map;
-#endif
-
-/* map cpu index to physical APIC ID */
-DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
-DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
-
-#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
-#define	X86_64_NUMA	1
-
-/* map cpu index to node index */
-DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
-
-/* which logical CPUs are on which nodes */
-cpumask_t *node_to_cpumask_map;
-EXPORT_SYMBOL(node_to_cpumask_map);
-
-/* setup node_to_cpumask_map */
-static void __init setup_node_to_cpumask_map(void);
-
-#else
-static inline void setup_node_to_cpumask_map(void) { }
-#endif
-
-#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
-/*
- * Copy data used in early init routines from the initial arrays to the
- * per cpu data areas.  These arrays then become expendable and the
- * *_early_ptr's are zeroed indicating that the static arrays are gone.
- */
-static void __init setup_per_cpu_maps(void)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		per_cpu(x86_cpu_to_apicid, cpu) =
-				early_per_cpu_map(x86_cpu_to_apicid, cpu);
-		per_cpu(x86_bios_cpu_apicid, cpu) =
-				early_per_cpu_map(x86_bios_cpu_apicid, cpu);
-#ifdef X86_64_NUMA
-		per_cpu(x86_cpu_to_node_map, cpu) =
-				early_per_cpu_map(x86_cpu_to_node_map, cpu);
-#endif
-	}
-
-	/* indicate the early static arrays will soon be gone */
-	early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
-	early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
-#ifdef X86_64_NUMA
-	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
-#endif
-}
-
-#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
-cpumask_t *cpumask_of_cpu_map __read_mostly;
-EXPORT_SYMBOL(cpumask_of_cpu_map);
-
-/* requires nr_cpu_ids to be initialized */
-static void __init setup_cpumask_of_cpu(void)
-{
-	int i;
-
-	/* alloc_bootmem zeroes memory */
-	cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
-	for (i = 0; i < nr_cpu_ids; i++)
-		cpu_set(i, cpumask_of_cpu_map[i]);
-}
-#else
-static inline void setup_cpumask_of_cpu(void) { }
-#endif
-
-#ifdef CONFIG_X86_32
-/*
- * Great future not-so-futuristic plan: make i386 and x86_64 do it
- * the same way
- */
-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
-EXPORT_SYMBOL(__per_cpu_offset);
-static inline void setup_cpu_pda_map(void) { }
-
-#elif !defined(CONFIG_SMP)
-static inline void setup_cpu_pda_map(void) { }
-
-#else /* CONFIG_SMP && CONFIG_X86_64 */
-
-/*
- * Allocate cpu_pda pointer table and array via alloc_bootmem.
- */
-static void __init setup_cpu_pda_map(void)
-{
-	char *pda;
-	struct x8664_pda **new_cpu_pda;
-	unsigned long size;
-	int cpu;
-
-	size = roundup(sizeof(struct x8664_pda), cache_line_size());
-
-	/* allocate cpu_pda array and pointer table */
-	{
-		unsigned long tsize = nr_cpu_ids * sizeof(void *);
-		unsigned long asize = size * (nr_cpu_ids - 1);
-
-		tsize = roundup(tsize, cache_line_size());
-		new_cpu_pda = alloc_bootmem(tsize + asize);
-		pda = (char *)new_cpu_pda + tsize;
-	}
-
-	/* initialize pointer table to static pda's */
-	for_each_possible_cpu(cpu) {
-		if (cpu == 0) {
-			/* leave boot cpu pda in place */
-			new_cpu_pda[0] = cpu_pda(0);
-			continue;
-		}
-		new_cpu_pda[cpu] = (struct x8664_pda *)pda;
-		new_cpu_pda[cpu]->in_bootmem = 1;
-		pda += size;
-	}
-
-	/* point to new pointer table */
-	_cpu_pda = new_cpu_pda;
-}
-#endif
-
-/*
- * Great future plan:
- * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
- * Always point %gs to its beginning
- */
-void __init setup_per_cpu_areas(void)
-{
-	ssize_t size = PERCPU_ENOUGH_ROOM;
-	char *ptr;
-	int cpu;
-
-	/* no processor from mptable or madt */
-	if (!num_processors)
-		num_processors = 1;
-
-#ifdef CONFIG_HOTPLUG_CPU
-	prefill_possible_map();
-#else
-	nr_cpu_ids = num_processors;
-#endif
-
-	/* Setup cpu_pda map */
-	setup_cpu_pda_map();
-
-	/* Copy section for each CPU (we discard the original) */
-	size = PERCPU_ENOUGH_ROOM;
-	printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
-			  size);
-
-	for_each_possible_cpu(cpu) {
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-		ptr = alloc_bootmem_pages(size);
-#else
-		int node = early_cpu_to_node(cpu);
-		if (!node_online(node) || !NODE_DATA(node)) {
-			ptr = alloc_bootmem_pages(size);
-			printk(KERN_INFO
-			       "cpu %d has no node %d or node-local memory\n",
-				cpu, node);
-		}
-		else
-			ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
-#endif
-		per_cpu_offset(cpu) = ptr - __per_cpu_start;
-		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
-
-	}
-
-	printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
-		NR_CPUS, nr_cpu_ids, nr_node_ids);
-
-	/* Setup percpu data maps */
-	setup_per_cpu_maps();
-
-	/* Setup node to cpumask map */
-	setup_node_to_cpumask_map();
-
-	/* Setup cpumask_of_cpu map */
-	setup_cpumask_of_cpu();
-}
-
-#endif
-
-void __init parse_setup_data(void)
-{
-	struct setup_data *data;
-	u64 pa_data;
-
-	if (boot_params.hdr.version < 0x0209)
-		return;
-	pa_data = boot_params.hdr.setup_data;
-	while (pa_data) {
-		data = early_ioremap(pa_data, PAGE_SIZE);
-		switch (data->type) {
-		case SETUP_E820_EXT:
-			parse_e820_ext(data, pa_data);
-			break;
-		default:
-			break;
-		}
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
-		free_early(pa_data, pa_data+sizeof(*data)+data->len);
-#endif
-		pa_data = data->next;
-		early_iounmap(data, PAGE_SIZE);
-	}
-}
-
-#ifdef X86_64_NUMA
-
-/*
- * Allocate node_to_cpumask_map based on number of available nodes
- * Requires node_possible_map to be valid.
- *
- * Note: node_to_cpumask() is not valid until after this is done.
- */
-static void __init setup_node_to_cpumask_map(void)
-{
-	unsigned int node, num = 0;
-	cpumask_t *map;
-
-	/* setup nr_node_ids if not done yet */
-	if (nr_node_ids == MAX_NUMNODES) {
-		for_each_node_mask(node, node_possible_map)
-			num = node;
-		nr_node_ids = num + 1;
-	}
-
-	/* allocate the map */
-	map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
-
-	Dprintk(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n",
-		map, nr_node_ids);
-
-	/* node_to_cpumask() will now work */
-	node_to_cpumask_map = map;
-}
-
-void __cpuinit numa_set_node(int cpu, int node)
-{
-	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
-
-	if (cpu_pda(cpu) && node != NUMA_NO_NODE)
-		cpu_pda(cpu)->nodenumber = node;
-
-	if (cpu_to_node_map)
-		cpu_to_node_map[cpu] = node;
-
-	else if (per_cpu_offset(cpu))
-		per_cpu(x86_cpu_to_node_map, cpu) = node;
-
-	else
-		Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
-}
-
-void __cpuinit numa_clear_node(int cpu)
-{
-	numa_set_node(cpu, NUMA_NO_NODE);
-}
-
-#ifndef CONFIG_DEBUG_PER_CPU_MAPS
-
-void __cpuinit numa_add_cpu(int cpu)
-{
-	cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
-}
-
-void __cpuinit numa_remove_cpu(int cpu)
-{
-	cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
-}
-
-#else /* CONFIG_DEBUG_PER_CPU_MAPS */
-
-/*
- * --------- debug versions of the numa functions ---------
- */
-static void __cpuinit numa_set_cpumask(int cpu, int enable)
-{
-	int node = cpu_to_node(cpu);
-	cpumask_t *mask;
-	char buf[64];
-
-	if (node_to_cpumask_map == NULL) {
-		printk(KERN_ERR "node_to_cpumask_map NULL\n");
-		dump_stack();
-		return;
-	}
-
-	mask = &node_to_cpumask_map[node];
-	if (enable)
-		cpu_set(cpu, *mask);
-	else
-		cpu_clear(cpu, *mask);
-
-	cpulist_scnprintf(buf, sizeof(buf), *mask);
-	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
-		enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
- }
-
-void __cpuinit numa_add_cpu(int cpu)
-{
-	numa_set_cpumask(cpu, 1);
-}
-
-void __cpuinit numa_remove_cpu(int cpu)
-{
-	numa_set_cpumask(cpu, 0);
-}
-
-int cpu_to_node(int cpu)
-{
-	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
-		printk(KERN_WARNING
-			"cpu_to_node(%d): usage too early!\n", cpu);
-		dump_stack();
-		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
-	}
-	return per_cpu(x86_cpu_to_node_map, cpu);
-}
-EXPORT_SYMBOL(cpu_to_node);
-
-/*
- * Same function as cpu_to_node() but used if called before the
- * per_cpu areas are setup.
- */
-int early_cpu_to_node(int cpu)
-{
-	if (early_per_cpu_ptr(x86_cpu_to_node_map))
-		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
-
-	if (!per_cpu_offset(cpu)) {
-		printk(KERN_WARNING
-			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
-		dump_stack();
-		return NUMA_NO_NODE;
-	}
-	return per_cpu(x86_cpu_to_node_map, cpu);
-}
-
-/*
- * Returns a pointer to the bitmask of CPUs on Node 'node'.
- */
-cpumask_t *_node_to_cpumask_ptr(int node)
-{
-	if (node_to_cpumask_map == NULL) {
-		printk(KERN_WARNING
-			"_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n",
-			node);
-		dump_stack();
-		return &cpu_online_map;
-	}
-	BUG_ON(node >= nr_node_ids);
-	return &node_to_cpumask_map[node];
-}
-EXPORT_SYMBOL(_node_to_cpumask_ptr);
-
-/*
- * Returns a bitmask of CPUs on Node 'node'.
- */
-cpumask_t node_to_cpumask(int node)
-{
-	if (node_to_cpumask_map == NULL) {
-		printk(KERN_WARNING
-			"node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
-		dump_stack();
-		return cpu_online_map;
-	}
-	BUG_ON(node >= nr_node_ids);
-	return node_to_cpumask_map[node];
-}
-EXPORT_SYMBOL(node_to_cpumask);
-
-/*
- * --------- end of debug versions of the numa functions ---------
- */
-
-#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
-
-#endif /* X86_64_NUMA */
-
-
-/*
- * --------- Crashkernel reservation ------------------------------
- */
-
-static inline unsigned long long get_total_mem(void)
-{
-	unsigned long long total;
-
-	total = max_low_pfn - min_low_pfn;
-#ifdef CONFIG_HIGHMEM
-	total += highend_pfn - highstart_pfn;
-#endif
-
-	return total << PAGE_SHIFT;
-}
-
-#ifdef CONFIG_KEXEC
-void __init reserve_crashkernel(void)
-{
-	unsigned long long total_mem;
-	unsigned long long crash_size, crash_base;
-	int ret;
-
-	total_mem = get_total_mem();
-
-	ret = parse_crashkernel(boot_command_line, total_mem,
-			&crash_size, &crash_base);
-	if (ret == 0 && crash_size > 0) {
-		if (crash_base <= 0) {
-			printk(KERN_INFO "crashkernel reservation failed - "
-					"you have to specify a base address\n");
-			return;
-		}
-
-		if (reserve_bootmem_generic(crash_base, crash_size,
-					BOOTMEM_EXCLUSIVE) < 0) {
-			printk(KERN_INFO "crashkernel reservation failed - "
-					"memory is in use\n");
-			return;
-		}
-
-		printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
-				"for crashkernel (System RAM: %ldMB)\n",
-				(unsigned long)(crash_size >> 20),
-				(unsigned long)(crash_base >> 20),
-				(unsigned long)(total_mem >> 20));
-
-		crashk_res.start = crash_base;
-		crashk_res.end   = crash_base + crash_size - 1;
-		insert_resource(&iomem_resource, &crashk_res);
-	}
-}
-#else
-void __init reserve_crashkernel(void)
-{}
-#endif
-static struct resource standard_io_resources[] = {
-	{ .name = "dma1", .start = 0x00, .end = 0x1f,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "pic1", .start = 0x20, .end = 0x21,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "timer0", .start = 0x40, .end = 0x43,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "timer1", .start = 0x50, .end = 0x53,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "keyboard", .start = 0x60, .end = 0x60,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "keyboard", .start = 0x64, .end = 0x64,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "dma page reg", .start = 0x80, .end = 0x8f,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "pic2", .start = 0xa0, .end = 0xa1,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "dma2", .start = 0xc0, .end = 0xdf,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "fpu", .start = 0xf0, .end = 0xff,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO }
-};
-
-void __init reserve_standard_io_resources(void)
-{
-	int i;
-
-	/* request I/O space for devices used on all i[345]86 PCs */
-	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
-		request_resource(&ioport_resource, &standard_io_resources[i]);
-
-}
-
-#ifdef CONFIG_PROC_VMCORE
-/* elfcorehdr= specifies the location of elf core header
- * stored by the crashed kernel. This option will be passed
- * by kexec loader to the capture kernel.
- */
-static int __init setup_elfcorehdr(char *arg)
-{
-	char *end;
-	if (!arg)
-		return -EINVAL;
-	elfcorehdr_addr = memparse(arg, &end);
-	return end > arg ? 0 : -EINVAL;
-}
-early_param("elfcorehdr", setup_elfcorehdr);
-#endif
-
-
-
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
new file mode 100644
index 000000000000..5c0c4bb5727d
--- /dev/null
+++ b/arch/x86/kernel/setup_percpu.c
@@ -0,0 +1,528 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/percpu.h>
+#include <linux/kexec.h>
+#include <linux/crash_dump.h>
+#include <asm/smp.h>
+#include <asm/percpu.h>
+#include <asm/sections.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/topology.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+#include <asm/highmem.h>
+
+#ifndef CONFIG_DEBUG_BOOT_PARAMS
+struct boot_params __initdata boot_params;
+#else
+struct boot_params boot_params;
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+unsigned int num_processors;
+unsigned disabled_cpus __cpuinitdata;
+/* Processor that is doing the boot up */
+unsigned int boot_cpu_physical_apicid = -1U;
+unsigned int max_physical_apicid;
+EXPORT_SYMBOL(boot_cpu_physical_apicid);
+
+/* Bitmask of physically existing CPUs */
+physid_mask_t phys_cpu_present_map;
+#endif
+
+/* map cpu index to physical APIC ID */
+DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
+
+#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+#define	X86_64_NUMA	1
+
+/* map cpu index to node index */
+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+
+/* which logical CPUs are on which nodes */
+cpumask_t *node_to_cpumask_map;
+EXPORT_SYMBOL(node_to_cpumask_map);
+
+/* setup node_to_cpumask_map */
+static void __init setup_node_to_cpumask_map(void);
+
+#else
+static inline void setup_node_to_cpumask_map(void) { }
+#endif
+
+#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
+/*
+ * Copy data used in early init routines from the initial arrays to the
+ * per cpu data areas.  These arrays then become expendable and the
+ * *_early_ptr's are zeroed indicating that the static arrays are gone.
+ */
+static void __init setup_per_cpu_maps(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		per_cpu(x86_cpu_to_apicid, cpu) =
+				early_per_cpu_map(x86_cpu_to_apicid, cpu);
+		per_cpu(x86_bios_cpu_apicid, cpu) =
+				early_per_cpu_map(x86_bios_cpu_apicid, cpu);
+#ifdef X86_64_NUMA
+		per_cpu(x86_cpu_to_node_map, cpu) =
+				early_per_cpu_map(x86_cpu_to_node_map, cpu);
+#endif
+	}
+
+	/* indicate the early static arrays will soon be gone */
+	early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
+	early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
+#ifdef X86_64_NUMA
+	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
+#endif
+}
+
+#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
+cpumask_t *cpumask_of_cpu_map __read_mostly;
+EXPORT_SYMBOL(cpumask_of_cpu_map);
+
+/* requires nr_cpu_ids to be initialized */
+static void __init setup_cpumask_of_cpu(void)
+{
+	int i;
+
+	/* alloc_bootmem zeroes memory */
+	cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
+	for (i = 0; i < nr_cpu_ids; i++)
+		cpu_set(i, cpumask_of_cpu_map[i]);
+}
+#else
+static inline void setup_cpumask_of_cpu(void) { }
+#endif
+
+#ifdef CONFIG_X86_32
+/*
+ * Great future not-so-futuristic plan: make i386 and x86_64 do it
+ * the same way
+ */
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(__per_cpu_offset);
+static inline void setup_cpu_pda_map(void) { }
+
+#elif !defined(CONFIG_SMP)
+static inline void setup_cpu_pda_map(void) { }
+
+#else /* CONFIG_SMP && CONFIG_X86_64 */
+
+/*
+ * Allocate cpu_pda pointer table and array via alloc_bootmem.
+ */
+static void __init setup_cpu_pda_map(void)
+{
+	char *pda;
+	struct x8664_pda **new_cpu_pda;
+	unsigned long size;
+	int cpu;
+
+	size = roundup(sizeof(struct x8664_pda), cache_line_size());
+
+	/* allocate cpu_pda array and pointer table */
+	{
+		unsigned long tsize = nr_cpu_ids * sizeof(void *);
+		unsigned long asize = size * (nr_cpu_ids - 1);
+
+		tsize = roundup(tsize, cache_line_size());
+		new_cpu_pda = alloc_bootmem(tsize + asize);
+		pda = (char *)new_cpu_pda + tsize;
+	}
+
+	/* initialize pointer table to static pda's */
+	for_each_possible_cpu(cpu) {
+		if (cpu == 0) {
+			/* leave boot cpu pda in place */
+			new_cpu_pda[0] = cpu_pda(0);
+			continue;
+		}
+		new_cpu_pda[cpu] = (struct x8664_pda *)pda;
+		new_cpu_pda[cpu]->in_bootmem = 1;
+		pda += size;
+	}
+
+	/* point to new pointer table */
+	_cpu_pda = new_cpu_pda;
+}
+#endif
+
+/*
+ * Great future plan:
+ * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
+ * Always point %gs to its beginning
+ */
+void __init setup_per_cpu_areas(void)
+{
+	ssize_t size = PERCPU_ENOUGH_ROOM;
+	char *ptr;
+	int cpu;
+
+	/* no processor from mptable or madt */
+	if (!num_processors)
+		num_processors = 1;
+
+#ifdef CONFIG_HOTPLUG_CPU
+	prefill_possible_map();
+#else
+	nr_cpu_ids = num_processors;
+#endif
+
+	/* Setup cpu_pda map */
+	setup_cpu_pda_map();
+
+	/* Copy section for each CPU (we discard the original) */
+	size = PERCPU_ENOUGH_ROOM;
+	printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
+			  size);
+
+	for_each_possible_cpu(cpu) {
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+		ptr = alloc_bootmem_pages(size);
+#else
+		int node = early_cpu_to_node(cpu);
+		if (!node_online(node) || !NODE_DATA(node)) {
+			ptr = alloc_bootmem_pages(size);
+			printk(KERN_INFO
+			       "cpu %d has no node %d or node-local memory\n",
+				cpu, node);
+		}
+		else
+			ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
+#endif
+		per_cpu_offset(cpu) = ptr - __per_cpu_start;
+		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+
+	}
+
+	printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
+		NR_CPUS, nr_cpu_ids, nr_node_ids);
+
+	/* Setup percpu data maps */
+	setup_per_cpu_maps();
+
+	/* Setup node to cpumask map */
+	setup_node_to_cpumask_map();
+
+	/* Setup cpumask_of_cpu map */
+	setup_cpumask_of_cpu();
+}
+
+#endif
+
+void __init parse_setup_data(void)
+{
+	struct setup_data *data;
+	u64 pa_data;
+
+	if (boot_params.hdr.version < 0x0209)
+		return;
+	pa_data = boot_params.hdr.setup_data;
+	while (pa_data) {
+		data = early_ioremap(pa_data, PAGE_SIZE);
+		switch (data->type) {
+		case SETUP_E820_EXT:
+			parse_e820_ext(data, pa_data);
+			break;
+		default:
+			break;
+		}
+#ifndef CONFIG_DEBUG_BOOT_PARAMS
+		free_early(pa_data, pa_data+sizeof(*data)+data->len);
+#endif
+		pa_data = data->next;
+		early_iounmap(data, PAGE_SIZE);
+	}
+}
+
+#ifdef X86_64_NUMA
+
+/*
+ * Allocate node_to_cpumask_map based on number of available nodes
+ * Requires node_possible_map to be valid.
+ *
+ * Note: node_to_cpumask() is not valid until after this is done.
+ */
+static void __init setup_node_to_cpumask_map(void)
+{
+	unsigned int node, num = 0;
+	cpumask_t *map;
+
+	/* setup nr_node_ids if not done yet */
+	if (nr_node_ids == MAX_NUMNODES) {
+		for_each_node_mask(node, node_possible_map)
+			num = node;
+		nr_node_ids = num + 1;
+	}
+
+	/* allocate the map */
+	map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
+
+	Dprintk(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n",
+		map, nr_node_ids);
+
+	/* node_to_cpumask() will now work */
+	node_to_cpumask_map = map;
+}
+
+void __cpuinit numa_set_node(int cpu, int node)
+{
+	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
+
+	if (cpu_pda(cpu) && node != NUMA_NO_NODE)
+		cpu_pda(cpu)->nodenumber = node;
+
+	if (cpu_to_node_map)
+		cpu_to_node_map[cpu] = node;
+
+	else if (per_cpu_offset(cpu))
+		per_cpu(x86_cpu_to_node_map, cpu) = node;
+
+	else
+		Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
+}
+
+void __cpuinit numa_clear_node(int cpu)
+{
+	numa_set_node(cpu, NUMA_NO_NODE);
+}
+
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+
+void __cpuinit numa_add_cpu(int cpu)
+{
+	cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
+}
+
+#else /* CONFIG_DEBUG_PER_CPU_MAPS */
+
+/*
+ * --------- debug versions of the numa functions ---------
+ */
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+	int node = cpu_to_node(cpu);
+	cpumask_t *mask;
+	char buf[64];
+
+	if (node_to_cpumask_map == NULL) {
+		printk(KERN_ERR "node_to_cpumask_map NULL\n");
+		dump_stack();
+		return;
+	}
+
+	mask = &node_to_cpumask_map[node];
+	if (enable)
+		cpu_set(cpu, *mask);
+	else
+		cpu_clear(cpu, *mask);
+
+	cpulist_scnprintf(buf, sizeof(buf), *mask);
+	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+		enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
+ }
+
+void __cpuinit numa_add_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, 1);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, 0);
+}
+
+int cpu_to_node(int cpu)
+{
+	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
+		printk(KERN_WARNING
+			"cpu_to_node(%d): usage too early!\n", cpu);
+		dump_stack();
+		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+	}
+	return per_cpu(x86_cpu_to_node_map, cpu);
+}
+EXPORT_SYMBOL(cpu_to_node);
+
+/*
+ * Same function as cpu_to_node() but used if called before the
+ * per_cpu areas are setup.
+ */
+int early_cpu_to_node(int cpu)
+{
+	if (early_per_cpu_ptr(x86_cpu_to_node_map))
+		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+
+	if (!per_cpu_offset(cpu)) {
+		printk(KERN_WARNING
+			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
+		dump_stack();
+		return NUMA_NO_NODE;
+	}
+	return per_cpu(x86_cpu_to_node_map, cpu);
+}
+
+/*
+ * Returns a pointer to the bitmask of CPUs on Node 'node'.
+ */
+cpumask_t *_node_to_cpumask_ptr(int node)
+{
+	if (node_to_cpumask_map == NULL) {
+		printk(KERN_WARNING
+			"_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n",
+			node);
+		dump_stack();
+		return &cpu_online_map;
+	}
+	BUG_ON(node >= nr_node_ids);
+	return &node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(_node_to_cpumask_ptr);
+
+/*
+ * Returns a bitmask of CPUs on Node 'node'.
+ */
+cpumask_t node_to_cpumask(int node)
+{
+	if (node_to_cpumask_map == NULL) {
+		printk(KERN_WARNING
+			"node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
+		dump_stack();
+		return cpu_online_map;
+	}
+	BUG_ON(node >= nr_node_ids);
+	return node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(node_to_cpumask);
+
+/*
+ * --------- end of debug versions of the numa functions ---------
+ */
+
+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+
+#endif /* X86_64_NUMA */
+
+
+/*
+ * --------- Crashkernel reservation ------------------------------
+ */
+
+static inline unsigned long long get_total_mem(void)
+{
+	unsigned long long total;
+
+	total = max_low_pfn - min_low_pfn;
+#ifdef CONFIG_HIGHMEM
+	total += highend_pfn - highstart_pfn;
+#endif
+
+	return total << PAGE_SHIFT;
+}
+
+#ifdef CONFIG_KEXEC
+void __init reserve_crashkernel(void)
+{
+	unsigned long long total_mem;
+	unsigned long long crash_size, crash_base;
+	int ret;
+
+	total_mem = get_total_mem();
+
+	ret = parse_crashkernel(boot_command_line, total_mem,
+			&crash_size, &crash_base);
+	if (ret == 0 && crash_size > 0) {
+		if (crash_base <= 0) {
+			printk(KERN_INFO "crashkernel reservation failed - "
+					"you have to specify a base address\n");
+			return;
+		}
+
+		if (reserve_bootmem_generic(crash_base, crash_size,
+					BOOTMEM_EXCLUSIVE) < 0) {
+			printk(KERN_INFO "crashkernel reservation failed - "
+					"memory is in use\n");
+			return;
+		}
+
+		printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
+				"for crashkernel (System RAM: %ldMB)\n",
+				(unsigned long)(crash_size >> 20),
+				(unsigned long)(crash_base >> 20),
+				(unsigned long)(total_mem >> 20));
+
+		crashk_res.start = crash_base;
+		crashk_res.end   = crash_base + crash_size - 1;
+		insert_resource(&iomem_resource, &crashk_res);
+	}
+}
+#else
+void __init reserve_crashkernel(void)
+{}
+#endif
+static struct resource standard_io_resources[] = {
+	{ .name = "dma1", .start = 0x00, .end = 0x1f,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "pic1", .start = 0x20, .end = 0x21,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "timer0", .start = 0x40, .end = 0x43,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "timer1", .start = 0x50, .end = 0x53,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "keyboard", .start = 0x60, .end = 0x60,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "keyboard", .start = 0x64, .end = 0x64,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "dma page reg", .start = 0x80, .end = 0x8f,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "pic2", .start = 0xa0, .end = 0xa1,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "dma2", .start = 0xc0, .end = 0xdf,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "fpu", .start = 0xf0, .end = 0xff,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO }
+};
+
+void __init reserve_standard_io_resources(void)
+{
+	int i;
+
+	/* request I/O space for devices used on all i[345]86 PCs */
+	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+		request_resource(&ioport_resource, &standard_io_resources[i]);
+
+}
+
+#ifdef CONFIG_PROC_VMCORE
+/* elfcorehdr= specifies the location of elf core header
+ * stored by the crashed kernel. This option will be passed
+ * by kexec loader to the capture kernel.
+ */
+static int __init setup_elfcorehdr(char *arg)
+{
+	char *end;
+	if (!arg)
+		return -EINVAL;
+	elfcorehdr_addr = memparse(arg, &end);
+	return end > arg ? 0 : -EINVAL;
+}
+early_param("elfcorehdr", setup_elfcorehdr);
+#endif
+
+
+
-- 
cgit v1.2.3


From 08afc7c0dd8ecb04c7a6fe367102e410a74abbe6 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 25 Jun 2008 17:48:23 -0700
Subject: x86: we can use full bootmem after have init_memory_mapping

So remove outdated comments

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 392812d3c63c..5e919e2b99fc 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -476,15 +476,6 @@ void __init setup_arch(char **cmdline_p)
 	 */
 	vmi_init();
 #endif
-	/*
-	 * NOTE: before this point _nobody_ is allowed to allocate
-	 * any memory using the bootmem allocator.  Although the
-	 * allocator is now initialised only the first 8Mb of the kernel
-	 * virtual address space has been mapped.  All allocations before
-	 * paging_init() has completed must use the alloc_bootmem_low_pages()
-	 * variant (which allocates DMA'able memory) and care must be taken
-	 * not to exceed the 8Mb limit.
-	 */
 
 	paging_init();
 
-- 
cgit v1.2.3


From eb1379cb296f5aee348c2e04317d911bb84d9184 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 25 Jun 2008 17:49:26 -0700
Subject: x86: update reserve_initrd to support 64bit

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 109 +++++++++++++++++++++++----------------------
 1 file changed, 55 insertions(+), 54 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 5e919e2b99fc..aa81779db218 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -186,43 +186,18 @@ static inline void copy_edd(void)
 
 #ifdef CONFIG_BLK_DEV_INITRD
 
-static void __init relocate_initrd(void);
+#ifdef CONFIG_X86_32
 
-static void __init reserve_initrd(void)
+#define MAX_MAP_CHUNK	(NR_FIX_BTMAPS << PAGE_SHIFT)
+static void __init relocate_initrd(void)
 {
+
 	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
 	u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
-	u64 ramdisk_end   = ramdisk_image + ramdisk_size;
 	u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
 	u64 ramdisk_here;
-
-	if (!boot_params.hdr.type_of_loader ||
-	    !ramdisk_image || !ramdisk_size)
-		return;		/* No initrd provided by bootloader */
-
-	initrd_start = 0;
-
-	if (ramdisk_size >= (end_of_lowmem>>1)) {
-		free_early(ramdisk_image, ramdisk_end);
-		printk(KERN_ERR "initrd too large to handle, "
-		       "disabling initrd\n");
-		return;
-	}
-
-	printk(KERN_INFO "old RAMDISK: %08llx - %08llx\n", ramdisk_image,
-			ramdisk_end);
-
-
-	if (ramdisk_end <= end_of_lowmem) {
-		/* All in lowmem, easy case */
-		/*
-		 * don't need to reserve again, already reserved early
-		 * in i386_start_kernel
-		 */
-		initrd_start = ramdisk_image + PAGE_OFFSET;
-		initrd_end = initrd_start + ramdisk_size;
-		return;
-	}
+	unsigned long slop, clen, mapaddr;
+	char *p, *q;
 
 	/* We need to move the initrd down into lowmem */
 	ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size,
@@ -241,22 +216,6 @@ static void __init reserve_initrd(void)
 	printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
 			 ramdisk_here, ramdisk_here + ramdisk_size);
 
-	relocate_initrd();
-}
-
-#define MAX_MAP_CHUNK	(NR_FIX_BTMAPS << PAGE_SHIFT)
-
-static void __init relocate_initrd(void)
-{
-	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
-	u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
-	u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
-	u64 ramdisk_here;
-	unsigned long slop, clen, mapaddr;
-	char *p, *q;
-
-	ramdisk_here = initrd_start - PAGE_OFFSET;
-
 	q = (char *)initrd_start;
 
 	/* Copy any lowmem portion of the initrd */
@@ -286,18 +245,60 @@ static void __init relocate_initrd(void)
 	/* high pages is not converted by early_res_to_bootmem */
 	ramdisk_image = boot_params.hdr.ramdisk_image;
 	ramdisk_size  = boot_params.hdr.ramdisk_size;
-	printk(KERN_INFO "Copied RAMDISK from %016llx - %016llx to %08llx - %08llx\n",
+	printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
+		" %08llx - %08llx\n",
 		ramdisk_image, ramdisk_image + ramdisk_size - 1,
 		ramdisk_here, ramdisk_here + ramdisk_size - 1);
+}
+#endif
 
-	/*
-	 * need to free old one, otherwise init cross max_low_pfn could be
-	 * converted to bootmem
-	 */
-	free_early(ramdisk_image, ramdisk_image+ramdisk_size);
+static void __init reserve_initrd(void)
+{
+	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+	u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
+	u64 ramdisk_end   = ramdisk_image + ramdisk_size;
+	u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+
+	if (!boot_params.hdr.type_of_loader ||
+	    !ramdisk_image || !ramdisk_size)
+		return;		/* No initrd provided by bootloader */
+
+	initrd_start = 0;
+
+	if (ramdisk_size >= (end_of_lowmem>>1)) {
+		free_early(ramdisk_image, ramdisk_end);
+		printk(KERN_ERR "initrd too large to handle, "
+		       "disabling initrd\n");
+		return;
+	}
+
+	printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image,
+			ramdisk_end);
+
+
+	if (ramdisk_end <= end_of_lowmem) {
+		/* All in lowmem, easy case */
+		/*
+		 * don't need to reserve again, already reserved early
+		 * in i386_start_kernel
+		 */
+		initrd_start = ramdisk_image + PAGE_OFFSET;
+		initrd_end = initrd_start + ramdisk_size;
+		return;
+	}
+
+#ifdef CONFIG_X86_32
+	relocate_initrd();
+#else
+	printk(KERN_ERR "initrd extends beyond end of memory "
+	       "(0x%08llx > 0x%08llx)\ndisabling initrd\n",
+	       ramdisk_end, end_of_lowmem);
+	initrd_start = 0;
+#endif
+	free_early(ramdisk_image, ramdisk_end);
 }
 #else
-void __init reserve_initrd(void)
+static void __init reserve_initrd(void)
 {
 }
 #endif /* CONFIG_BLK_DEV_INITRD */
-- 
cgit v1.2.3


From 7dea23ecd17db6e42e19499db70d2fcfa5ca1ee2 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 25 Jun 2008 17:50:06 -0700
Subject: x86: put global variable for 32bit all together

those variables are not needed by 64 bit.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 57 +++++++++++++++++++++++++++-------------------
 1 file changed, 33 insertions(+), 24 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index aa81779db218..98f9199026cd 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -100,6 +100,8 @@ static struct resource bss_resource = {
 	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
 };
 
+
+#ifdef CONFIG_X86_32
 static struct resource video_ram_resource = {
 	.name	= "Video RAM area",
 	.start	= 0xa0000,
@@ -108,24 +110,47 @@ static struct resource video_ram_resource = {
 };
 
 /* cpu data as detected by the assembly code in head.S */
-struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
+struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
 /* common cpu data for all cpus */
-struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
+struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1};
 EXPORT_SYMBOL(boot_cpu_data);
+static void set_mca_bus(int x)
+{
+#ifdef CONFIG_MCA
+	MCA_bus = x;
+#endif
+}
 
 unsigned int def_to_bigsmp;
 
-#ifndef CONFIG_X86_PAE
-unsigned long mmu_cr4_features;
-#else
-unsigned long mmu_cr4_features = X86_CR4_PAE;
-#endif
-
 /* for MCA, but anyone else can use it if they want */
 unsigned int machine_id;
 unsigned int machine_submodel_id;
 unsigned int BIOS_revision;
 
+struct apm_info apm_info;
+EXPORT_SYMBOL(apm_info);
+
+#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
+	defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
+struct ist_info ist_info;
+EXPORT_SYMBOL(ist_info);
+#else
+struct ist_info ist_info;
+#endif
+
+#else
+struct cpuinfo_x86 boot_cpu_data __read_mostly;
+EXPORT_SYMBOL(boot_cpu_data);
+#endif
+
+
+#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
+unsigned long mmu_cr4_features;
+#else
+unsigned long mmu_cr4_features = X86_CR4_PAE;
+#endif
+
 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
 int bootloader_type;
 
@@ -140,15 +165,8 @@ char dmi_alloc_data[DMI_MAX_DATA];
  */
 struct screen_info screen_info;
 EXPORT_SYMBOL(screen_info);
-struct apm_info apm_info;
-EXPORT_SYMBOL(apm_info);
 struct edid_info edid_info;
 EXPORT_SYMBOL_GPL(edid_info);
-struct ist_info ist_info;
-#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
-	defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
-EXPORT_SYMBOL(ist_info);
-#endif
 
 extern int root_mountflags;
 
@@ -303,15 +321,6 @@ static void __init reserve_initrd(void)
 }
 #endif /* CONFIG_BLK_DEV_INITRD */
 
-#ifdef CONFIG_MCA
-static void set_mca_bus(int x)
-{
-	MCA_bus = x;
-}
-#else
-static void set_mca_bus(int x) { }
-#endif
-
 /*
  * Determine if we were loaded by an EFI loader.  If so, then we have also been
  * passed the efi memmap, systab, etc., so we should use these data structures
-- 
cgit v1.2.3


From 46d671b525102ae005dc5e8389ca67c86ae012b1 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 25 Jun 2008 17:51:29 -0700
Subject: x86: add extra includes for 64bit support

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 50 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 47 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 98f9199026cd..0b69483b0c3d 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -45,31 +45,75 @@
 #include <linux/dmi.h>
 #include <linux/pfn.h>
 #include <linux/pci.h>
+#include <asm/pci-direct.h>
 #include <linux/init_ohci1394_dma.h>
 #include <linux/kvm_para.h>
 
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/delay.h>
+#include <linux/highmem.h>
+
+#include <linux/kallsyms.h>
+#include <linux/edd.h>
+#include <linux/iscsi_ibft.h>
+#include <linux/kexec.h>
+#include <linux/cpufreq.h>
+#include <linux/dma-mapping.h>
+#include <linux/ctype.h>
+#include <linux/uaccess.h>
+
+#include <linux/percpu.h>
+#include <linux/crash_dump.h>
+
 #include <video/edid.h>
 
 #include <asm/mtrr.h>
 #include <asm/apic.h>
 #include <asm/e820.h>
 #include <asm/mpspec.h>
-#include <asm/mmzone.h>
 #include <asm/setup.h>
 #include <asm/arch_hooks.h>
 #include <asm/sections.h>
 #include <asm/dmi.h>
 #include <asm/io_apic.h>
 #include <asm/ist.h>
-#include <asm/io.h>
 #include <asm/vmi.h>
 #include <setup_arch.h>
 #include <asm/bios_ebda.h>
 #include <asm/cacheflush.h>
 #include <asm/processor.h>
-#include <asm/efi.h>
 #include <asm/bugs.h>
 
+#include <asm/system.h>
+#include <asm/vsyscall.h>
+#include <asm/smp.h>
+#include <asm/desc.h>
+#include <asm/dma.h>
+#include <asm/gart.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+
+#include <mach_apic.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define ARCH_SETUP
+#endif
+
+#include <asm/percpu.h>
+#include <asm/sections.h>
+#include <asm/topology.h>
+#include <asm/apicdef.h>
+#ifdef CONFIG_X86_32
+#include <asm/highmem.h>
+#endif
+
 /* This value is set up by the early boot code to point to the value
    immediately after the boot time page tables.  It contains a *physical*
    address, and must not be in the .bss segment! */
-- 
cgit v1.2.3


From 76934ed4b33b65096296d3e7b3c046fd020019fc Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 25 Jun 2008 17:52:35 -0700
Subject: x86: merge 64bit setup_arch into setup_32

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 99 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 91 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 0b69483b0c3d..f3177bae300d 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -372,26 +372,38 @@ static void __init reserve_initrd(void)
  * global efi_enabled. This allows the same kernel image to be used on existing
  * systems (with a traditional BIOS) as well as on EFI systems.
  */
+/*
+ * setup_arch - architecture-specific boot-time initializations
+ *
+ * Note: On x86_64, fixmaps are ready for use even before this is called.
+ */
+
 void __init setup_arch(char **cmdline_p)
 {
+#ifdef CONFIG_X86_32
 	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
 	pre_setup_arch_hook();
 	early_cpu_init();
 	early_ioremap_init();
 	reserve_setup_data();
+#else
+	printk(KERN_INFO "Command line: %s\n", boot_command_line);
+#endif
 
 	ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
 	screen_info = boot_params.screen_info;
 	edid_info = boot_params.edid_info;
+#ifdef CONFIG_X86_32
 	apm_info.bios = boot_params.apm_bios_info;
 	ist_info = boot_params.ist_info;
-	saved_video_mode = boot_params.hdr.vid_mode;
-	if( boot_params.sys_desc_table.length != 0 ) {
+	if (boot_params.sys_desc_table.length != 0) {
 		set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
 		machine_id = boot_params.sys_desc_table.table[0];
 		machine_submodel_id = boot_params.sys_desc_table.table[1];
 		BIOS_revision = boot_params.sys_desc_table.table[2];
 	}
+#endif
+	saved_video_mode = boot_params.hdr.vid_mode;
 	bootloader_type = boot_params.hdr.type_of_loader;
 
 #ifdef CONFIG_BLK_DEV_RAM
@@ -401,7 +413,12 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #ifdef CONFIG_EFI
 	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
-		     "EL32", 4)) {
+#ifdef CONFIG_X86_32
+		     "EL32",
+#else
+		     "EL64",
+#endif
+	 4)) {
 		efi_enabled = 1;
 		efi_reserve_early();
 	}
@@ -417,7 +434,11 @@ void __init setup_arch(char **cmdline_p)
 	init_mm.start_code = (unsigned long) _text;
 	init_mm.end_code = (unsigned long) _etext;
 	init_mm.end_data = (unsigned long) _edata;
+#ifdef CONFIG_X86_32
 	init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
+#else
+	init_mm.brk = (unsigned long) &_end;
+#endif
 
 	code_resource.start = virt_to_phys(_text);
 	code_resource.end = virt_to_phys(_etext)-1;
@@ -426,6 +447,9 @@ void __init setup_arch(char **cmdline_p)
 	bss_resource.start = virt_to_phys(&__bss_start);
 	bss_resource.end = virt_to_phys(&__bss_stop)-1;
 
+#ifdef CONFIG_X86_64
+	early_cpu_init();
+#endif
 	strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
 	*cmdline_p = command_line;
 
@@ -433,16 +457,27 @@ void __init setup_arch(char **cmdline_p)
 
 	parse_early_param();
 
-	if (acpi_mps_check()){
+	if (acpi_mps_check()) {
 #ifdef CONFIG_X86_LOCAL_APIC
+#ifdef CONFIG_X86_32
 		enable_local_apic = -1;
+#else
+		disable_apic = 1;
+#endif
 #endif
 		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
 	}
 
 	finish_e820_parsing();
 
+#ifdef CONFIG_X86_32
 	probe_roms();
+#else
+# ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+	if (init_ohci1394_dma_early)
+		init_ohci1394_dma_on_all_controllers();
+# endif
+#endif
 
 	/* after parse_early_param, so could debug it */
 	insert_resource(&iomem_resource, &code_resource);
@@ -452,6 +487,7 @@ void __init setup_arch(char **cmdline_p)
 	if (efi_enabled)
 		efi_init();
 
+#ifdef CONFIG_X86_32
 	if (ppro_with_ram_bug()) {
 		e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
 				  E820_RESERVED);
@@ -459,6 +495,9 @@ void __init setup_arch(char **cmdline_p)
 		printk(KERN_INFO "fixed physical RAM map:\n");
 		e820_print_map("bad_ppro");
 	}
+#else
+	early_gart_iommu_check();
+#endif
 
 	e820_register_active_regions(0, 0, -1UL);
 	/*
@@ -477,14 +516,32 @@ void __init setup_arch(char **cmdline_p)
 		max_pfn = e820_end_of_ram();
 	}
 
+#ifdef CONFIG_X86_32
 	/* max_low_pfn get updated here */
 	find_low_pfn_range();
+#else
+	num_physpages = max_pfn;
+
+	check_efer();
+
+	/* How many end-of-memory variables you have, grandma! */
+	/* need this before calling reserve_initrd */
+	max_low_pfn = max_pfn;
+	high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
+#endif
 
 	/* max_pfn_mapped is updated here */
+#ifdef CONFIG_X86_64
+	max_pfn_mapped =
+#endif
 	max_pfn_mapped = init_memory_mapping(0, (max_low_pfn << PAGE_SHIFT));
 
 	reserve_initrd();
 
+#ifdef CONFIG_X86_64
+	vsmp_init();
+#endif
+
 	dmi_scan_machine();
 
 	io_delay_init();
@@ -494,6 +551,11 @@ void __init setup_arch(char **cmdline_p)
 	 */
 	acpi_boot_table_init();
 
+#ifdef CONFIG_X86_64
+	/* Remove active ranges so rediscovery with NUMA-awareness happens */
+	remove_all_active_ranges();
+#endif
+
 #ifdef CONFIG_ACPI_NUMA
         /*
          * Parse SRAT to discover nodes.
@@ -503,13 +565,18 @@ void __init setup_arch(char **cmdline_p)
 
 	initmem_init(0, max_pfn);
 
+#ifdef CONFIG_X86_64
+	dma32_reserve_bootmem();
+#endif
+
 #ifdef CONFIG_ACPI_SLEEP
 	/*
 	 * Reserve low memory region for sleep support.
 	 */
 	acpi_reserve_bootmem();
 #endif
-#ifdef CONFIG_X86_FIND_SMP_CONFIG
+#if defined(CONFIG_X86_FIND_SMP_CONFIG) && defined(CONFIG_X86_32) || \
+    defined(CONFIG_X86_MPPARSE) && defined(CONFIG_X86_64)
 	/*
 	 * Find and reserve possible boot-time SMP configuration:
 	 */
@@ -523,7 +590,7 @@ void __init setup_arch(char **cmdline_p)
 	kvmclock_init();
 #endif
 
-#ifdef CONFIG_VMI
+#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
 	/*
 	 * Must be after max_low_pfn is determined, and before kernel
 	 * pagetables are setup.
@@ -533,11 +600,15 @@ void __init setup_arch(char **cmdline_p)
 
 	paging_init();
 
+#ifdef CONFIG_X86_64
+	map_vsyscall();
+#endif
+
 	/*
 	 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
 	 */
 
-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+#if defined(CONFIG_PROVIDE_OHCI1394_DMA_INIT) && defined(CONFIG_X86_32)
 	if (init_ohci1394_dma_early)
 		init_ohci1394_dma_on_all_controllers();
 #endif
@@ -553,6 +624,10 @@ void __init setup_arch(char **cmdline_p)
 	 */
 	acpi_boot_init();
 
+#ifdef CONFIG_X86_64
+	init_cpu_to_node();
+#endif
+
 #if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
 	/*
 	 * get boot-time SMP configuration:
@@ -560,18 +635,26 @@ void __init setup_arch(char **cmdline_p)
 	if (smp_found_config)
 		get_smp_config();
 #endif
-#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
+
+#ifdef CONFIG_X86_64
+	init_apic_mappings();
+	ioapic_init_mappings();
+#else
+# if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
 	if (def_to_bigsmp)
 		printk(KERN_WARNING "More than 8 CPUs detected and "
 			"CONFIG_X86_PC cannot handle it.\nUse "
 			"CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
+# endif
 #endif
 	kvm_guest_init();
 
 	e820_reserve_resources();
 	e820_mark_nosave_regions(max_low_pfn);
 
+#ifdef CONFIG_X86_32
 	request_resource(&iomem_resource, &video_ram_resource);
+#endif
 	reserve_standard_io_resources();
 
 	e820_setup_gap();
-- 
cgit v1.2.3


From f2f865fe6e6e40ddf37f887eb427263d83bb925d Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 25 Jun 2008 17:53:22 -0700
Subject: x86: space to tab in setup_arch

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_32.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index f3177bae300d..4d5f8a3eb7ea 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -557,10 +557,10 @@ void __init setup_arch(char **cmdline_p)
 #endif
 
 #ifdef CONFIG_ACPI_NUMA
-        /*
-         * Parse SRAT to discover nodes.
-         */
-        acpi_numa_init();
+	/*
+	 * Parse SRAT to discover nodes.
+	 */
+	acpi_numa_init();
 #endif
 
 	initmem_init(0, max_pfn);
-- 
cgit v1.2.3


From 55f262391a2365d657a00ed68edd1a51bca66af5 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 25 Jun 2008 17:54:23 -0700
Subject: x86: rename setup_32.c to setup.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

and let 64 bit use that instead of setup_64.c

[ mingo@elte.hu ]

x86: build fix

fix:

arch/x86/kernel/setup.c: In function ‘setup_arch':
arch/x86/kernel/setup.c:561: error: implicit declaration of function ‘efi_reserve_early'

and:

arch/x86/kernel/setup.c:766: error: implicit declaration of function 'init_cpu_to_node'

and:

arch/x86/kernel/setup.c:676: warning: operation on 'max_pfn_mapped' may be undefined

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/Makefile   |   2 +-
 arch/x86/kernel/setup.c    | 672 +++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/setup_32.c | 671 --------------------------------------------
 3 files changed, 673 insertions(+), 672 deletions(-)
 create mode 100644 arch/x86/kernel/setup.c
 delete mode 100644 arch/x86/kernel/setup_32.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5e1537b62534..212804e78787 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -18,7 +18,7 @@ CFLAGS_tsc_64.o		:= $(nostackp)
 obj-y			:= process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
 obj-y			+= traps_$(BITS).o irq_$(BITS).o
 obj-y			+= time_$(BITS).o ioport.o ldt.o
-obj-y			+= setup_$(BITS).o i8259.o irqinit_$(BITS).o setup_percpu.o
+obj-y			+= setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
 obj-$(CONFIG_X86_32)	+= probe_roms_32.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
new file mode 100644
index 000000000000..222d7038f2fd
--- /dev/null
+++ b/arch/x86/kernel/setup.c
@@ -0,0 +1,672 @@
+/*
+ *  Copyright (C) 1995  Linus Torvalds
+ *
+ *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ *
+ *  Memory region support
+ *	David Parsons <orc@pell.chi.il.us>, July-August 1999
+ *
+ *  Added E820 sanitization routine (removes overlapping memory regions);
+ *  Brian Moyle <bmoyle@mvista.com>, February 2001
+ *
+ * Moved CPU detection code to cpu/${cpu}.c
+ *    Patrick Mochel <mochel@osdl.org>, March 2002
+ *
+ *  Provisions for empty E820 memory regions (reported by certain BIOSes).
+ *  Alex Achenbach <xela@slit.de>, December 2002.
+ *
+ */
+
+/*
+ * This file handles the architecture-dependent parts of initialization
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/screen_info.h>
+#include <linux/ioport.h>
+#include <linux/acpi.h>
+#include <linux/apm_bios.h>
+#include <linux/initrd.h>
+#include <linux/bootmem.h>
+#include <linux/seq_file.h>
+#include <linux/console.h>
+#include <linux/mca.h>
+#include <linux/root_dev.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/efi.h>
+#include <linux/init.h>
+#include <linux/edd.h>
+#include <linux/iscsi_ibft.h>
+#include <linux/nodemask.h>
+#include <linux/kexec.h>
+#include <linux/dmi.h>
+#include <linux/pfn.h>
+#include <linux/pci.h>
+#include <asm/pci-direct.h>
+#include <linux/init_ohci1394_dma.h>
+#include <linux/kvm_para.h>
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/delay.h>
+#include <linux/highmem.h>
+
+#include <linux/kallsyms.h>
+#include <linux/edd.h>
+#include <linux/iscsi_ibft.h>
+#include <linux/kexec.h>
+#include <linux/cpufreq.h>
+#include <linux/dma-mapping.h>
+#include <linux/ctype.h>
+#include <linux/uaccess.h>
+
+#include <linux/percpu.h>
+#include <linux/crash_dump.h>
+
+#include <video/edid.h>
+
+#include <asm/mtrr.h>
+#include <asm/apic.h>
+#include <asm/e820.h>
+#include <asm/mpspec.h>
+#include <asm/setup.h>
+#include <asm/arch_hooks.h>
+#include <asm/efi.h>
+#include <asm/sections.h>
+#include <asm/dmi.h>
+#include <asm/io_apic.h>
+#include <asm/ist.h>
+#include <asm/vmi.h>
+#include <setup_arch.h>
+#include <asm/bios_ebda.h>
+#include <asm/cacheflush.h>
+#include <asm/processor.h>
+#include <asm/bugs.h>
+
+#include <asm/system.h>
+#include <asm/vsyscall.h>
+#include <asm/smp.h>
+#include <asm/desc.h>
+#include <asm/dma.h>
+#include <asm/gart.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+
+#include <mach_apic.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define ARCH_SETUP
+#endif
+
+#include <asm/percpu.h>
+#include <asm/sections.h>
+#include <asm/topology.h>
+#include <asm/apicdef.h>
+#ifdef CONFIG_X86_64
+#include <asm/numa_64.h>
+#endif
+#ifdef CONFIG_X86_32
+#include <asm/highmem.h>
+#endif
+
+/* This value is set up by the early boot code to point to the value
+   immediately after the boot time page tables.  It contains a *physical*
+   address, and must not be in the .bss segment! */
+unsigned long init_pg_tables_start __initdata = ~0UL;
+unsigned long init_pg_tables_end __initdata = ~0UL;
+
+/*
+ * Machine setup..
+ */
+static struct resource data_resource = {
+	.name	= "Kernel data",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource code_resource = {
+	.name	= "Kernel code",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource bss_resource = {
+	.name	= "Kernel bss",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+
+#ifdef CONFIG_X86_32
+static struct resource video_ram_resource = {
+	.name	= "Video RAM area",
+	.start	= 0xa0000,
+	.end	= 0xbffff,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+/* cpu data as detected by the assembly code in head.S */
+struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
+/* common cpu data for all cpus */
+struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1};
+EXPORT_SYMBOL(boot_cpu_data);
+static void set_mca_bus(int x)
+{
+#ifdef CONFIG_MCA
+	MCA_bus = x;
+#endif
+}
+
+unsigned int def_to_bigsmp;
+
+/* for MCA, but anyone else can use it if they want */
+unsigned int machine_id;
+unsigned int machine_submodel_id;
+unsigned int BIOS_revision;
+
+struct apm_info apm_info;
+EXPORT_SYMBOL(apm_info);
+
+#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
+	defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
+struct ist_info ist_info;
+EXPORT_SYMBOL(ist_info);
+#else
+struct ist_info ist_info;
+#endif
+
+#else
+struct cpuinfo_x86 boot_cpu_data __read_mostly;
+EXPORT_SYMBOL(boot_cpu_data);
+#endif
+
+
+#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
+unsigned long mmu_cr4_features;
+#else
+unsigned long mmu_cr4_features = X86_CR4_PAE;
+#endif
+
+/* Boot loader ID as an integer, for the benefit of proc_dointvec */
+int bootloader_type;
+
+/*
+ * Early DMI memory
+ */
+int dmi_alloc_index;
+char dmi_alloc_data[DMI_MAX_DATA];
+
+/*
+ * Setup options
+ */
+struct screen_info screen_info;
+EXPORT_SYMBOL(screen_info);
+struct edid_info edid_info;
+EXPORT_SYMBOL_GPL(edid_info);
+
+extern int root_mountflags;
+
+unsigned long saved_video_mode;
+
+#define RAMDISK_IMAGE_START_MASK	0x07FF
+#define RAMDISK_PROMPT_FLAG		0x8000
+#define RAMDISK_LOAD_FLAG		0x4000
+
+static char __initdata command_line[COMMAND_LINE_SIZE];
+
+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+struct edd edd;
+#ifdef CONFIG_EDD_MODULE
+EXPORT_SYMBOL(edd);
+#endif
+/**
+ * copy_edd() - Copy the BIOS EDD information
+ *              from boot_params into a safe place.
+ *
+ */
+static inline void copy_edd(void)
+{
+     memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
+	    sizeof(edd.mbr_signature));
+     memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
+     edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
+     edd.edd_info_nr = boot_params.eddbuf_entries;
+}
+#else
+static inline void copy_edd(void)
+{
+}
+#endif
+
+#ifdef CONFIG_BLK_DEV_INITRD
+
+#ifdef CONFIG_X86_32
+
+#define MAX_MAP_CHUNK	(NR_FIX_BTMAPS << PAGE_SHIFT)
+static void __init relocate_initrd(void)
+{
+
+	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+	u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
+	u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+	u64 ramdisk_here;
+	unsigned long slop, clen, mapaddr;
+	char *p, *q;
+
+	/* We need to move the initrd down into lowmem */
+	ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size,
+					 PAGE_SIZE);
+
+	if (ramdisk_here == -1ULL)
+		panic("Cannot find place for new RAMDISK of size %lld\n",
+			 ramdisk_size);
+
+	/* Note: this includes all the lowmem currently occupied by
+	   the initrd, we rely on that fact to keep the data intact. */
+	reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
+			 "NEW RAMDISK");
+	initrd_start = ramdisk_here + PAGE_OFFSET;
+	initrd_end   = initrd_start + ramdisk_size;
+	printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
+			 ramdisk_here, ramdisk_here + ramdisk_size);
+
+	q = (char *)initrd_start;
+
+	/* Copy any lowmem portion of the initrd */
+	if (ramdisk_image < end_of_lowmem) {
+		clen = end_of_lowmem - ramdisk_image;
+		p = (char *)__va(ramdisk_image);
+		memcpy(q, p, clen);
+		q += clen;
+		ramdisk_image += clen;
+		ramdisk_size  -= clen;
+	}
+
+	/* Copy the highmem portion of the initrd */
+	while (ramdisk_size) {
+		slop = ramdisk_image & ~PAGE_MASK;
+		clen = ramdisk_size;
+		if (clen > MAX_MAP_CHUNK-slop)
+			clen = MAX_MAP_CHUNK-slop;
+		mapaddr = ramdisk_image & PAGE_MASK;
+		p = early_ioremap(mapaddr, clen+slop);
+		memcpy(q, p+slop, clen);
+		early_iounmap(p, clen+slop);
+		q += clen;
+		ramdisk_image += clen;
+		ramdisk_size  -= clen;
+	}
+	/* high pages is not converted by early_res_to_bootmem */
+	ramdisk_image = boot_params.hdr.ramdisk_image;
+	ramdisk_size  = boot_params.hdr.ramdisk_size;
+	printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
+		" %08llx - %08llx\n",
+		ramdisk_image, ramdisk_image + ramdisk_size - 1,
+		ramdisk_here, ramdisk_here + ramdisk_size - 1);
+}
+#endif
+
+static void __init reserve_initrd(void)
+{
+	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+	u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
+	u64 ramdisk_end   = ramdisk_image + ramdisk_size;
+	u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+
+	if (!boot_params.hdr.type_of_loader ||
+	    !ramdisk_image || !ramdisk_size)
+		return;		/* No initrd provided by bootloader */
+
+	initrd_start = 0;
+
+	if (ramdisk_size >= (end_of_lowmem>>1)) {
+		free_early(ramdisk_image, ramdisk_end);
+		printk(KERN_ERR "initrd too large to handle, "
+		       "disabling initrd\n");
+		return;
+	}
+
+	printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image,
+			ramdisk_end);
+
+
+	if (ramdisk_end <= end_of_lowmem) {
+		/* All in lowmem, easy case */
+		/*
+		 * don't need to reserve again, already reserved early
+		 * in i386_start_kernel
+		 */
+		initrd_start = ramdisk_image + PAGE_OFFSET;
+		initrd_end = initrd_start + ramdisk_size;
+		return;
+	}
+
+#ifdef CONFIG_X86_32
+	relocate_initrd();
+#else
+	printk(KERN_ERR "initrd extends beyond end of memory "
+	       "(0x%08llx > 0x%08llx)\ndisabling initrd\n",
+	       ramdisk_end, end_of_lowmem);
+	initrd_start = 0;
+#endif
+	free_early(ramdisk_image, ramdisk_end);
+}
+#else
+static void __init reserve_initrd(void)
+{
+}
+#endif /* CONFIG_BLK_DEV_INITRD */
+
+/*
+ * Determine if we were loaded by an EFI loader.  If so, then we have also been
+ * passed the efi memmap, systab, etc., so we should use these data structures
+ * for initialization.  Note, the efi init code path is determined by the
+ * global efi_enabled. This allows the same kernel image to be used on existing
+ * systems (with a traditional BIOS) as well as on EFI systems.
+ */
+/*
+ * setup_arch - architecture-specific boot-time initializations
+ *
+ * Note: On x86_64, fixmaps are ready for use even before this is called.
+ */
+
+void __init setup_arch(char **cmdline_p)
+{
+#ifdef CONFIG_X86_32
+	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
+	pre_setup_arch_hook();
+	early_cpu_init();
+	early_ioremap_init();
+	reserve_setup_data();
+#else
+	printk(KERN_INFO "Command line: %s\n", boot_command_line);
+#endif
+
+	ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
+	screen_info = boot_params.screen_info;
+	edid_info = boot_params.edid_info;
+#ifdef CONFIG_X86_32
+	apm_info.bios = boot_params.apm_bios_info;
+	ist_info = boot_params.ist_info;
+	if (boot_params.sys_desc_table.length != 0) {
+		set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
+		machine_id = boot_params.sys_desc_table.table[0];
+		machine_submodel_id = boot_params.sys_desc_table.table[1];
+		BIOS_revision = boot_params.sys_desc_table.table[2];
+	}
+#endif
+	saved_video_mode = boot_params.hdr.vid_mode;
+	bootloader_type = boot_params.hdr.type_of_loader;
+
+#ifdef CONFIG_BLK_DEV_RAM
+	rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
+	rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
+	rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
+#endif
+#ifdef CONFIG_EFI
+	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+#ifdef CONFIG_X86_32
+		     "EL32",
+#else
+		     "EL64",
+#endif
+	 4)) {
+		efi_enabled = 1;
+		efi_reserve_early();
+	}
+#endif
+
+	ARCH_SETUP
+
+	setup_memory_map();
+	copy_edd();
+
+	if (!boot_params.hdr.root_flags)
+		root_mountflags &= ~MS_RDONLY;
+	init_mm.start_code = (unsigned long) _text;
+	init_mm.end_code = (unsigned long) _etext;
+	init_mm.end_data = (unsigned long) _edata;
+#ifdef CONFIG_X86_32
+	init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
+#else
+	init_mm.brk = (unsigned long) &_end;
+#endif
+
+	code_resource.start = virt_to_phys(_text);
+	code_resource.end = virt_to_phys(_etext)-1;
+	data_resource.start = virt_to_phys(_etext);
+	data_resource.end = virt_to_phys(_edata)-1;
+	bss_resource.start = virt_to_phys(&__bss_start);
+	bss_resource.end = virt_to_phys(&__bss_stop)-1;
+
+#ifdef CONFIG_X86_64
+	early_cpu_init();
+#endif
+	strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+	*cmdline_p = command_line;
+
+	parse_setup_data();
+
+	parse_early_param();
+
+	if (acpi_mps_check()) {
+#ifdef CONFIG_X86_LOCAL_APIC
+#ifdef CONFIG_X86_32
+		enable_local_apic = -1;
+#else
+		disable_apic = 1;
+#endif
+#endif
+		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+	}
+
+	finish_e820_parsing();
+
+#ifdef CONFIG_X86_32
+	probe_roms();
+#else
+# ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+	if (init_ohci1394_dma_early)
+		init_ohci1394_dma_on_all_controllers();
+# endif
+#endif
+
+	/* after parse_early_param, so could debug it */
+	insert_resource(&iomem_resource, &code_resource);
+	insert_resource(&iomem_resource, &data_resource);
+	insert_resource(&iomem_resource, &bss_resource);
+
+	if (efi_enabled)
+		efi_init();
+
+#ifdef CONFIG_X86_32
+	if (ppro_with_ram_bug()) {
+		e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
+				  E820_RESERVED);
+		sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+		printk(KERN_INFO "fixed physical RAM map:\n");
+		e820_print_map("bad_ppro");
+	}
+#else
+	early_gart_iommu_check();
+#endif
+
+	e820_register_active_regions(0, 0, -1UL);
+	/*
+	 * partially used pages are not usable - thus
+	 * we are rounding upwards:
+	 */
+	max_pfn = e820_end_of_ram();
+
+	/* preallocate 4k for mptable mpc */
+	early_reserve_e820_mpc_new();
+	/* update e820 for memory not covered by WB MTRRs */
+	mtrr_bp_init();
+	if (mtrr_trim_uncached_memory(max_pfn)) {
+		remove_all_active_ranges();
+		e820_register_active_regions(0, 0, -1UL);
+		max_pfn = e820_end_of_ram();
+	}
+
+#ifdef CONFIG_X86_32
+	/* max_low_pfn get updated here */
+	find_low_pfn_range();
+#else
+	num_physpages = max_pfn;
+
+	check_efer();
+
+	/* How many end-of-memory variables you have, grandma! */
+	/* need this before calling reserve_initrd */
+	max_low_pfn = max_pfn;
+	high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
+#endif
+
+	/* max_pfn_mapped is updated here */
+	max_pfn_mapped = init_memory_mapping(0, (max_low_pfn << PAGE_SHIFT));
+
+	reserve_initrd();
+
+#ifdef CONFIG_X86_64
+	vsmp_init();
+#endif
+
+	dmi_scan_machine();
+
+	io_delay_init();
+
+	/*
+	 * Parse the ACPI tables for possible boot-time SMP configuration.
+	 */
+	acpi_boot_table_init();
+
+#ifdef CONFIG_X86_64
+	/* Remove active ranges so rediscovery with NUMA-awareness happens */
+	remove_all_active_ranges();
+#endif
+
+#ifdef CONFIG_ACPI_NUMA
+	/*
+	 * Parse SRAT to discover nodes.
+	 */
+	acpi_numa_init();
+#endif
+
+	initmem_init(0, max_pfn);
+
+#ifdef CONFIG_X86_64
+	dma32_reserve_bootmem();
+#endif
+
+#ifdef CONFIG_ACPI_SLEEP
+	/*
+	 * Reserve low memory region for sleep support.
+	 */
+	acpi_reserve_bootmem();
+#endif
+#if defined(CONFIG_X86_FIND_SMP_CONFIG) && defined(CONFIG_X86_32) || \
+    defined(CONFIG_X86_MPPARSE) && defined(CONFIG_X86_64)
+	/*
+	 * Find and reserve possible boot-time SMP configuration:
+	 */
+	find_smp_config();
+#endif
+	reserve_crashkernel();
+
+	reserve_ibft_region();
+
+#ifdef CONFIG_KVM_CLOCK
+	kvmclock_init();
+#endif
+
+#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
+	/*
+	 * Must be after max_low_pfn is determined, and before kernel
+	 * pagetables are setup.
+	 */
+	vmi_init();
+#endif
+
+	paging_init();
+
+#ifdef CONFIG_X86_64
+	map_vsyscall();
+#endif
+
+	/*
+	 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
+	 */
+
+#if defined(CONFIG_PROVIDE_OHCI1394_DMA_INIT) && defined(CONFIG_X86_32)
+	if (init_ohci1394_dma_early)
+		init_ohci1394_dma_on_all_controllers();
+#endif
+
+#ifdef CONFIG_X86_GENERICARCH
+	generic_apic_probe();
+#endif
+
+	early_quirks();
+
+	/*
+	 * Read APIC and some other early information from ACPI tables.
+	 */
+	acpi_boot_init();
+
+#ifdef CONFIG_X86_64
+	init_cpu_to_node();
+#endif
+
+#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
+	/*
+	 * get boot-time SMP configuration:
+	 */
+	if (smp_found_config)
+		get_smp_config();
+#endif
+
+#ifdef CONFIG_X86_64
+	init_apic_mappings();
+	ioapic_init_mappings();
+#else
+# if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
+	if (def_to_bigsmp)
+		printk(KERN_WARNING "More than 8 CPUs detected and "
+			"CONFIG_X86_PC cannot handle it.\nUse "
+			"CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
+# endif
+#endif
+	kvm_guest_init();
+
+	e820_reserve_resources();
+	e820_mark_nosave_regions(max_low_pfn);
+
+#ifdef CONFIG_X86_32
+	request_resource(&iomem_resource, &video_ram_resource);
+#endif
+	reserve_standard_io_resources();
+
+	e820_setup_gap();
+
+#ifdef CONFIG_VT
+#if defined(CONFIG_VGA_CONSOLE)
+	if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
+		conswitchp = &vga_con;
+#elif defined(CONFIG_DUMMY_CONSOLE)
+	conswitchp = &dummy_con;
+#endif
+#endif
+}
+
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
deleted file mode 100644
index 4d5f8a3eb7ea..000000000000
--- a/arch/x86/kernel/setup_32.c
+++ /dev/null
@@ -1,671 +0,0 @@
-/*
- *  Copyright (C) 1995  Linus Torvalds
- *
- *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
- *
- *  Memory region support
- *	David Parsons <orc@pell.chi.il.us>, July-August 1999
- *
- *  Added E820 sanitization routine (removes overlapping memory regions);
- *  Brian Moyle <bmoyle@mvista.com>, February 2001
- *
- * Moved CPU detection code to cpu/${cpu}.c
- *    Patrick Mochel <mochel@osdl.org>, March 2002
- *
- *  Provisions for empty E820 memory regions (reported by certain BIOSes).
- *  Alex Achenbach <xela@slit.de>, December 2002.
- *
- */
-
-/*
- * This file handles the architecture-dependent parts of initialization
- */
-
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/screen_info.h>
-#include <linux/ioport.h>
-#include <linux/acpi.h>
-#include <linux/apm_bios.h>
-#include <linux/initrd.h>
-#include <linux/bootmem.h>
-#include <linux/seq_file.h>
-#include <linux/console.h>
-#include <linux/mca.h>
-#include <linux/root_dev.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
-#include <linux/efi.h>
-#include <linux/init.h>
-#include <linux/edd.h>
-#include <linux/iscsi_ibft.h>
-#include <linux/nodemask.h>
-#include <linux/kexec.h>
-#include <linux/dmi.h>
-#include <linux/pfn.h>
-#include <linux/pci.h>
-#include <asm/pci-direct.h>
-#include <linux/init_ohci1394_dma.h>
-#include <linux/kvm_para.h>
-
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/stddef.h>
-#include <linux/unistd.h>
-#include <linux/ptrace.h>
-#include <linux/slab.h>
-#include <linux/user.h>
-#include <linux/delay.h>
-#include <linux/highmem.h>
-
-#include <linux/kallsyms.h>
-#include <linux/edd.h>
-#include <linux/iscsi_ibft.h>
-#include <linux/kexec.h>
-#include <linux/cpufreq.h>
-#include <linux/dma-mapping.h>
-#include <linux/ctype.h>
-#include <linux/uaccess.h>
-
-#include <linux/percpu.h>
-#include <linux/crash_dump.h>
-
-#include <video/edid.h>
-
-#include <asm/mtrr.h>
-#include <asm/apic.h>
-#include <asm/e820.h>
-#include <asm/mpspec.h>
-#include <asm/setup.h>
-#include <asm/arch_hooks.h>
-#include <asm/sections.h>
-#include <asm/dmi.h>
-#include <asm/io_apic.h>
-#include <asm/ist.h>
-#include <asm/vmi.h>
-#include <setup_arch.h>
-#include <asm/bios_ebda.h>
-#include <asm/cacheflush.h>
-#include <asm/processor.h>
-#include <asm/bugs.h>
-
-#include <asm/system.h>
-#include <asm/vsyscall.h>
-#include <asm/smp.h>
-#include <asm/desc.h>
-#include <asm/dma.h>
-#include <asm/gart.h>
-#include <asm/mmu_context.h>
-#include <asm/proto.h>
-
-#include <mach_apic.h>
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else
-#define ARCH_SETUP
-#endif
-
-#include <asm/percpu.h>
-#include <asm/sections.h>
-#include <asm/topology.h>
-#include <asm/apicdef.h>
-#ifdef CONFIG_X86_32
-#include <asm/highmem.h>
-#endif
-
-/* This value is set up by the early boot code to point to the value
-   immediately after the boot time page tables.  It contains a *physical*
-   address, and must not be in the .bss segment! */
-unsigned long init_pg_tables_start __initdata = ~0UL;
-unsigned long init_pg_tables_end __initdata = ~0UL;
-
-/*
- * Machine setup..
- */
-static struct resource data_resource = {
-	.name	= "Kernel data",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource code_resource = {
-	.name	= "Kernel code",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource bss_resource = {
-	.name	= "Kernel bss",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-
-#ifdef CONFIG_X86_32
-static struct resource video_ram_resource = {
-	.name	= "Video RAM area",
-	.start	= 0xa0000,
-	.end	= 0xbffff,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-/* cpu data as detected by the assembly code in head.S */
-struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
-/* common cpu data for all cpus */
-struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1};
-EXPORT_SYMBOL(boot_cpu_data);
-static void set_mca_bus(int x)
-{
-#ifdef CONFIG_MCA
-	MCA_bus = x;
-#endif
-}
-
-unsigned int def_to_bigsmp;
-
-/* for MCA, but anyone else can use it if they want */
-unsigned int machine_id;
-unsigned int machine_submodel_id;
-unsigned int BIOS_revision;
-
-struct apm_info apm_info;
-EXPORT_SYMBOL(apm_info);
-
-#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
-	defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
-struct ist_info ist_info;
-EXPORT_SYMBOL(ist_info);
-#else
-struct ist_info ist_info;
-#endif
-
-#else
-struct cpuinfo_x86 boot_cpu_data __read_mostly;
-EXPORT_SYMBOL(boot_cpu_data);
-#endif
-
-
-#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
-unsigned long mmu_cr4_features;
-#else
-unsigned long mmu_cr4_features = X86_CR4_PAE;
-#endif
-
-/* Boot loader ID as an integer, for the benefit of proc_dointvec */
-int bootloader_type;
-
-/*
- * Early DMI memory
- */
-int dmi_alloc_index;
-char dmi_alloc_data[DMI_MAX_DATA];
-
-/*
- * Setup options
- */
-struct screen_info screen_info;
-EXPORT_SYMBOL(screen_info);
-struct edid_info edid_info;
-EXPORT_SYMBOL_GPL(edid_info);
-
-extern int root_mountflags;
-
-unsigned long saved_video_mode;
-
-#define RAMDISK_IMAGE_START_MASK	0x07FF
-#define RAMDISK_PROMPT_FLAG		0x8000
-#define RAMDISK_LOAD_FLAG		0x4000
-
-static char __initdata command_line[COMMAND_LINE_SIZE];
-
-#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
-struct edd edd;
-#ifdef CONFIG_EDD_MODULE
-EXPORT_SYMBOL(edd);
-#endif
-/**
- * copy_edd() - Copy the BIOS EDD information
- *              from boot_params into a safe place.
- *
- */
-static inline void copy_edd(void)
-{
-     memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
-	    sizeof(edd.mbr_signature));
-     memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
-     edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
-     edd.edd_info_nr = boot_params.eddbuf_entries;
-}
-#else
-static inline void copy_edd(void)
-{
-}
-#endif
-
-#ifdef CONFIG_BLK_DEV_INITRD
-
-#ifdef CONFIG_X86_32
-
-#define MAX_MAP_CHUNK	(NR_FIX_BTMAPS << PAGE_SHIFT)
-static void __init relocate_initrd(void)
-{
-
-	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
-	u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
-	u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
-	u64 ramdisk_here;
-	unsigned long slop, clen, mapaddr;
-	char *p, *q;
-
-	/* We need to move the initrd down into lowmem */
-	ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size,
-					 PAGE_SIZE);
-
-	if (ramdisk_here == -1ULL)
-		panic("Cannot find place for new RAMDISK of size %lld\n",
-			 ramdisk_size);
-
-	/* Note: this includes all the lowmem currently occupied by
-	   the initrd, we rely on that fact to keep the data intact. */
-	reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
-			 "NEW RAMDISK");
-	initrd_start = ramdisk_here + PAGE_OFFSET;
-	initrd_end   = initrd_start + ramdisk_size;
-	printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
-			 ramdisk_here, ramdisk_here + ramdisk_size);
-
-	q = (char *)initrd_start;
-
-	/* Copy any lowmem portion of the initrd */
-	if (ramdisk_image < end_of_lowmem) {
-		clen = end_of_lowmem - ramdisk_image;
-		p = (char *)__va(ramdisk_image);
-		memcpy(q, p, clen);
-		q += clen;
-		ramdisk_image += clen;
-		ramdisk_size  -= clen;
-	}
-
-	/* Copy the highmem portion of the initrd */
-	while (ramdisk_size) {
-		slop = ramdisk_image & ~PAGE_MASK;
-		clen = ramdisk_size;
-		if (clen > MAX_MAP_CHUNK-slop)
-			clen = MAX_MAP_CHUNK-slop;
-		mapaddr = ramdisk_image & PAGE_MASK;
-		p = early_ioremap(mapaddr, clen+slop);
-		memcpy(q, p+slop, clen);
-		early_iounmap(p, clen+slop);
-		q += clen;
-		ramdisk_image += clen;
-		ramdisk_size  -= clen;
-	}
-	/* high pages is not converted by early_res_to_bootmem */
-	ramdisk_image = boot_params.hdr.ramdisk_image;
-	ramdisk_size  = boot_params.hdr.ramdisk_size;
-	printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
-		" %08llx - %08llx\n",
-		ramdisk_image, ramdisk_image + ramdisk_size - 1,
-		ramdisk_here, ramdisk_here + ramdisk_size - 1);
-}
-#endif
-
-static void __init reserve_initrd(void)
-{
-	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
-	u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
-	u64 ramdisk_end   = ramdisk_image + ramdisk_size;
-	u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
-
-	if (!boot_params.hdr.type_of_loader ||
-	    !ramdisk_image || !ramdisk_size)
-		return;		/* No initrd provided by bootloader */
-
-	initrd_start = 0;
-
-	if (ramdisk_size >= (end_of_lowmem>>1)) {
-		free_early(ramdisk_image, ramdisk_end);
-		printk(KERN_ERR "initrd too large to handle, "
-		       "disabling initrd\n");
-		return;
-	}
-
-	printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image,
-			ramdisk_end);
-
-
-	if (ramdisk_end <= end_of_lowmem) {
-		/* All in lowmem, easy case */
-		/*
-		 * don't need to reserve again, already reserved early
-		 * in i386_start_kernel
-		 */
-		initrd_start = ramdisk_image + PAGE_OFFSET;
-		initrd_end = initrd_start + ramdisk_size;
-		return;
-	}
-
-#ifdef CONFIG_X86_32
-	relocate_initrd();
-#else
-	printk(KERN_ERR "initrd extends beyond end of memory "
-	       "(0x%08llx > 0x%08llx)\ndisabling initrd\n",
-	       ramdisk_end, end_of_lowmem);
-	initrd_start = 0;
-#endif
-	free_early(ramdisk_image, ramdisk_end);
-}
-#else
-static void __init reserve_initrd(void)
-{
-}
-#endif /* CONFIG_BLK_DEV_INITRD */
-
-/*
- * Determine if we were loaded by an EFI loader.  If so, then we have also been
- * passed the efi memmap, systab, etc., so we should use these data structures
- * for initialization.  Note, the efi init code path is determined by the
- * global efi_enabled. This allows the same kernel image to be used on existing
- * systems (with a traditional BIOS) as well as on EFI systems.
- */
-/*
- * setup_arch - architecture-specific boot-time initializations
- *
- * Note: On x86_64, fixmaps are ready for use even before this is called.
- */
-
-void __init setup_arch(char **cmdline_p)
-{
-#ifdef CONFIG_X86_32
-	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
-	pre_setup_arch_hook();
-	early_cpu_init();
-	early_ioremap_init();
-	reserve_setup_data();
-#else
-	printk(KERN_INFO "Command line: %s\n", boot_command_line);
-#endif
-
-	ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
-	screen_info = boot_params.screen_info;
-	edid_info = boot_params.edid_info;
-#ifdef CONFIG_X86_32
-	apm_info.bios = boot_params.apm_bios_info;
-	ist_info = boot_params.ist_info;
-	if (boot_params.sys_desc_table.length != 0) {
-		set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
-		machine_id = boot_params.sys_desc_table.table[0];
-		machine_submodel_id = boot_params.sys_desc_table.table[1];
-		BIOS_revision = boot_params.sys_desc_table.table[2];
-	}
-#endif
-	saved_video_mode = boot_params.hdr.vid_mode;
-	bootloader_type = boot_params.hdr.type_of_loader;
-
-#ifdef CONFIG_BLK_DEV_RAM
-	rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
-	rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
-	rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
-#endif
-#ifdef CONFIG_EFI
-	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
-#ifdef CONFIG_X86_32
-		     "EL32",
-#else
-		     "EL64",
-#endif
-	 4)) {
-		efi_enabled = 1;
-		efi_reserve_early();
-	}
-#endif
-
-	ARCH_SETUP
-
-	setup_memory_map();
-	copy_edd();
-
-	if (!boot_params.hdr.root_flags)
-		root_mountflags &= ~MS_RDONLY;
-	init_mm.start_code = (unsigned long) _text;
-	init_mm.end_code = (unsigned long) _etext;
-	init_mm.end_data = (unsigned long) _edata;
-#ifdef CONFIG_X86_32
-	init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
-#else
-	init_mm.brk = (unsigned long) &_end;
-#endif
-
-	code_resource.start = virt_to_phys(_text);
-	code_resource.end = virt_to_phys(_etext)-1;
-	data_resource.start = virt_to_phys(_etext);
-	data_resource.end = virt_to_phys(_edata)-1;
-	bss_resource.start = virt_to_phys(&__bss_start);
-	bss_resource.end = virt_to_phys(&__bss_stop)-1;
-
-#ifdef CONFIG_X86_64
-	early_cpu_init();
-#endif
-	strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
-	*cmdline_p = command_line;
-
-	parse_setup_data();
-
-	parse_early_param();
-
-	if (acpi_mps_check()) {
-#ifdef CONFIG_X86_LOCAL_APIC
-#ifdef CONFIG_X86_32
-		enable_local_apic = -1;
-#else
-		disable_apic = 1;
-#endif
-#endif
-		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
-	}
-
-	finish_e820_parsing();
-
-#ifdef CONFIG_X86_32
-	probe_roms();
-#else
-# ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
-	if (init_ohci1394_dma_early)
-		init_ohci1394_dma_on_all_controllers();
-# endif
-#endif
-
-	/* after parse_early_param, so could debug it */
-	insert_resource(&iomem_resource, &code_resource);
-	insert_resource(&iomem_resource, &data_resource);
-	insert_resource(&iomem_resource, &bss_resource);
-
-	if (efi_enabled)
-		efi_init();
-
-#ifdef CONFIG_X86_32
-	if (ppro_with_ram_bug()) {
-		e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
-				  E820_RESERVED);
-		sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
-		printk(KERN_INFO "fixed physical RAM map:\n");
-		e820_print_map("bad_ppro");
-	}
-#else
-	early_gart_iommu_check();
-#endif
-
-	e820_register_active_regions(0, 0, -1UL);
-	/*
-	 * partially used pages are not usable - thus
-	 * we are rounding upwards:
-	 */
-	max_pfn = e820_end_of_ram();
-
-	/* preallocate 4k for mptable mpc */
-	early_reserve_e820_mpc_new();
-	/* update e820 for memory not covered by WB MTRRs */
-	mtrr_bp_init();
-	if (mtrr_trim_uncached_memory(max_pfn)) {
-		remove_all_active_ranges();
-		e820_register_active_regions(0, 0, -1UL);
-		max_pfn = e820_end_of_ram();
-	}
-
-#ifdef CONFIG_X86_32
-	/* max_low_pfn get updated here */
-	find_low_pfn_range();
-#else
-	num_physpages = max_pfn;
-
-	check_efer();
-
-	/* How many end-of-memory variables you have, grandma! */
-	/* need this before calling reserve_initrd */
-	max_low_pfn = max_pfn;
-	high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
-#endif
-
-	/* max_pfn_mapped is updated here */
-#ifdef CONFIG_X86_64
-	max_pfn_mapped =
-#endif
-	max_pfn_mapped = init_memory_mapping(0, (max_low_pfn << PAGE_SHIFT));
-
-	reserve_initrd();
-
-#ifdef CONFIG_X86_64
-	vsmp_init();
-#endif
-
-	dmi_scan_machine();
-
-	io_delay_init();
-
-	/*
-	 * Parse the ACPI tables for possible boot-time SMP configuration.
-	 */
-	acpi_boot_table_init();
-
-#ifdef CONFIG_X86_64
-	/* Remove active ranges so rediscovery with NUMA-awareness happens */
-	remove_all_active_ranges();
-#endif
-
-#ifdef CONFIG_ACPI_NUMA
-	/*
-	 * Parse SRAT to discover nodes.
-	 */
-	acpi_numa_init();
-#endif
-
-	initmem_init(0, max_pfn);
-
-#ifdef CONFIG_X86_64
-	dma32_reserve_bootmem();
-#endif
-
-#ifdef CONFIG_ACPI_SLEEP
-	/*
-	 * Reserve low memory region for sleep support.
-	 */
-	acpi_reserve_bootmem();
-#endif
-#if defined(CONFIG_X86_FIND_SMP_CONFIG) && defined(CONFIG_X86_32) || \
-    defined(CONFIG_X86_MPPARSE) && defined(CONFIG_X86_64)
-	/*
-	 * Find and reserve possible boot-time SMP configuration:
-	 */
-	find_smp_config();
-#endif
-	reserve_crashkernel();
-
-	reserve_ibft_region();
-
-#ifdef CONFIG_KVM_CLOCK
-	kvmclock_init();
-#endif
-
-#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
-	/*
-	 * Must be after max_low_pfn is determined, and before kernel
-	 * pagetables are setup.
-	 */
-	vmi_init();
-#endif
-
-	paging_init();
-
-#ifdef CONFIG_X86_64
-	map_vsyscall();
-#endif
-
-	/*
-	 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
-	 */
-
-#if defined(CONFIG_PROVIDE_OHCI1394_DMA_INIT) && defined(CONFIG_X86_32)
-	if (init_ohci1394_dma_early)
-		init_ohci1394_dma_on_all_controllers();
-#endif
-
-#ifdef CONFIG_X86_GENERICARCH
-	generic_apic_probe();
-#endif
-
-	early_quirks();
-
-	/*
-	 * Read APIC and some other early information from ACPI tables.
-	 */
-	acpi_boot_init();
-
-#ifdef CONFIG_X86_64
-	init_cpu_to_node();
-#endif
-
-#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
-	/*
-	 * get boot-time SMP configuration:
-	 */
-	if (smp_found_config)
-		get_smp_config();
-#endif
-
-#ifdef CONFIG_X86_64
-	init_apic_mappings();
-	ioapic_init_mappings();
-#else
-# if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
-	if (def_to_bigsmp)
-		printk(KERN_WARNING "More than 8 CPUs detected and "
-			"CONFIG_X86_PC cannot handle it.\nUse "
-			"CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
-# endif
-#endif
-	kvm_guest_init();
-
-	e820_reserve_resources();
-	e820_mark_nosave_regions(max_low_pfn);
-
-#ifdef CONFIG_X86_32
-	request_resource(&iomem_resource, &video_ram_resource);
-#endif
-	reserve_standard_io_resources();
-
-	e820_setup_gap();
-
-#ifdef CONFIG_VT
-#if defined(CONFIG_VGA_CONSOLE)
-	if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
-		conswitchp = &vga_con;
-#elif defined(CONFIG_DUMMY_CONSOLE)
-	conswitchp = &dummy_con;
-#endif
-#endif
-}
-
-- 
cgit v1.2.3


From 217b8ce89088dd47e7176686ff34573f75a624e9 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 25 Jun 2008 17:55:20 -0700
Subject: x86: move boot_params back to setup.c

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c        | 6 ++++++
 arch/x86/kernel/setup_percpu.c | 6 ------
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 222d7038f2fd..f24a8fe8a7e4 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -118,6 +118,12 @@
 #include <asm/highmem.h>
 #endif
 
+#ifndef CONFIG_DEBUG_BOOT_PARAMS
+struct boot_params __initdata boot_params;
+#else
+struct boot_params boot_params;
+#endif
+
 /* This value is set up by the early boot code to point to the value
    immediately after the boot time page tables.  It contains a *physical*
    address, and must not be in the .bss segment! */
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 5c0c4bb5727d..5497fb9b00a0 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -15,12 +15,6 @@
 #include <asm/apicdef.h>
 #include <asm/highmem.h>
 
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
-struct boot_params __initdata boot_params;
-#else
-struct boot_params boot_params;
-#endif
-
 #ifdef CONFIG_X86_LOCAL_APIC
 unsigned int num_processors;
 unsigned disabled_cpus __cpuinitdata;
-- 
cgit v1.2.3


From 257b0fde99df0160db03e529dbfb3a4e46c07a88 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 25 Jun 2008 17:56:22 -0700
Subject: x86: move parse_setup_data back to setup.c

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c        | 25 +++++++++++++++++++++++++
 arch/x86/kernel/setup_percpu.c | 25 -------------------------
 2 files changed, 25 insertions(+), 25 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f24a8fe8a7e4..4b00c8347bb7 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -375,6 +375,31 @@ static void __init reserve_initrd(void)
 }
 #endif /* CONFIG_BLK_DEV_INITRD */
 
+void __init parse_setup_data(void)
+{
+	struct setup_data *data;
+	u64 pa_data;
+
+	if (boot_params.hdr.version < 0x0209)
+		return;
+	pa_data = boot_params.hdr.setup_data;
+	while (pa_data) {
+		data = early_ioremap(pa_data, PAGE_SIZE);
+		switch (data->type) {
+		case SETUP_E820_EXT:
+			parse_e820_ext(data, pa_data);
+			break;
+		default:
+			break;
+		}
+#ifndef CONFIG_DEBUG_BOOT_PARAMS
+		free_early(pa_data, pa_data+sizeof(*data)+data->len);
+#endif
+		pa_data = data->next;
+		early_iounmap(data, PAGE_SIZE);
+	}
+}
+
 /*
  * Determine if we were loaded by an EFI loader.  If so, then we have also been
  * passed the efi memmap, systab, etc., so we should use these data structures
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 5497fb9b00a0..2f807503e030 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -214,31 +214,6 @@ void __init setup_per_cpu_areas(void)
 
 #endif
 
-void __init parse_setup_data(void)
-{
-	struct setup_data *data;
-	u64 pa_data;
-
-	if (boot_params.hdr.version < 0x0209)
-		return;
-	pa_data = boot_params.hdr.setup_data;
-	while (pa_data) {
-		data = early_ioremap(pa_data, PAGE_SIZE);
-		switch (data->type) {
-		case SETUP_E820_EXT:
-			parse_e820_ext(data, pa_data);
-			break;
-		default:
-			break;
-		}
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
-		free_early(pa_data, pa_data+sizeof(*data)+data->len);
-#endif
-		pa_data = data->next;
-		early_iounmap(data, PAGE_SIZE);
-	}
-}
-
 #ifdef X86_64_NUMA
 
 /*
-- 
cgit v1.2.3


From ccb4defa71744f086822950d8fa64a17c4e6eb04 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 25 Jun 2008 17:57:13 -0700
Subject: x86: move back crashkernel back to setup.c

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c        | 58 ++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/setup_percpu.c | 57 -----------------------------------------
 2 files changed, 58 insertions(+), 57 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4b00c8347bb7..75d35401a8c7 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -400,6 +400,64 @@ void __init parse_setup_data(void)
 	}
 }
 
+/*
+ * --------- Crashkernel reservation ------------------------------
+ */
+
+#ifdef CONFIG_KEXEC
+static inline unsigned long long get_total_mem(void)
+{
+	unsigned long long total;
+
+	total = max_low_pfn - min_low_pfn;
+#ifdef CONFIG_HIGHMEM
+	total += highend_pfn - highstart_pfn;
+#endif
+
+	return total << PAGE_SHIFT;
+}
+
+void __init reserve_crashkernel(void)
+{
+	unsigned long long total_mem;
+	unsigned long long crash_size, crash_base;
+	int ret;
+
+	total_mem = get_total_mem();
+
+	ret = parse_crashkernel(boot_command_line, total_mem,
+			&crash_size, &crash_base);
+	if (ret == 0 && crash_size > 0) {
+		if (crash_base <= 0) {
+			printk(KERN_INFO "crashkernel reservation failed - "
+					"you have to specify a base address\n");
+			return;
+		}
+
+		if (reserve_bootmem_generic(crash_base, crash_size,
+					BOOTMEM_EXCLUSIVE) < 0) {
+			printk(KERN_INFO "crashkernel reservation failed - "
+					"memory is in use\n");
+			return;
+		}
+
+		printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
+				"for crashkernel (System RAM: %ldMB)\n",
+				(unsigned long)(crash_size >> 20),
+				(unsigned long)(crash_base >> 20),
+				(unsigned long)(total_mem >> 20));
+
+		crashk_res.start = crash_base;
+		crashk_res.end   = crash_base + crash_size - 1;
+		insert_resource(&iomem_resource, &crashk_res);
+	}
+}
+#else
+void __init reserve_crashkernel(void)
+{
+}
+#endif
+
 /*
  * Determine if we were loaded by an EFI loader.  If so, then we have also been
  * passed the efi memmap, systab, etc., so we should use these data structures
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 2f807503e030..a3a19ab0edef 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -387,63 +387,6 @@ EXPORT_SYMBOL(node_to_cpumask);
 
 #endif /* X86_64_NUMA */
 
-
-/*
- * --------- Crashkernel reservation ------------------------------
- */
-
-static inline unsigned long long get_total_mem(void)
-{
-	unsigned long long total;
-
-	total = max_low_pfn - min_low_pfn;
-#ifdef CONFIG_HIGHMEM
-	total += highend_pfn - highstart_pfn;
-#endif
-
-	return total << PAGE_SHIFT;
-}
-
-#ifdef CONFIG_KEXEC
-void __init reserve_crashkernel(void)
-{
-	unsigned long long total_mem;
-	unsigned long long crash_size, crash_base;
-	int ret;
-
-	total_mem = get_total_mem();
-
-	ret = parse_crashkernel(boot_command_line, total_mem,
-			&crash_size, &crash_base);
-	if (ret == 0 && crash_size > 0) {
-		if (crash_base <= 0) {
-			printk(KERN_INFO "crashkernel reservation failed - "
-					"you have to specify a base address\n");
-			return;
-		}
-
-		if (reserve_bootmem_generic(crash_base, crash_size,
-					BOOTMEM_EXCLUSIVE) < 0) {
-			printk(KERN_INFO "crashkernel reservation failed - "
-					"memory is in use\n");
-			return;
-		}
-
-		printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
-				"for crashkernel (System RAM: %ldMB)\n",
-				(unsigned long)(crash_size >> 20),
-				(unsigned long)(crash_base >> 20),
-				(unsigned long)(total_mem >> 20));
-
-		crashk_res.start = crash_base;
-		crashk_res.end   = crash_base + crash_size - 1;
-		insert_resource(&iomem_resource, &crashk_res);
-	}
-}
-#else
-void __init reserve_crashkernel(void)
-{}
-#endif
 static struct resource standard_io_resources[] = {
 	{ .name = "dma1", .start = 0x00, .end = 0x1f,
 		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-- 
cgit v1.2.3


From bdba0e700c86fa2f152b1fe37b001c9e9c65d2b7 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 25 Jun 2008 17:58:02 -0700
Subject: x86: move reserve_standard_io_resources back to setup.c

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c        | 33 +++++++++++++++++++++++++++++++++
 arch/x86/kernel/setup_percpu.c | 32 --------------------------------
 2 files changed, 33 insertions(+), 32 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 75d35401a8c7..62647b04fab5 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -458,6 +458,39 @@ void __init reserve_crashkernel(void)
 }
 #endif
 
+static struct resource standard_io_resources[] = {
+	{ .name = "dma1", .start = 0x00, .end = 0x1f,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "pic1", .start = 0x20, .end = 0x21,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "timer0", .start = 0x40, .end = 0x43,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "timer1", .start = 0x50, .end = 0x53,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "keyboard", .start = 0x60, .end = 0x60,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "keyboard", .start = 0x64, .end = 0x64,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "dma page reg", .start = 0x80, .end = 0x8f,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "pic2", .start = 0xa0, .end = 0xa1,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "dma2", .start = 0xc0, .end = 0xdf,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
+	{ .name = "fpu", .start = 0xf0, .end = 0xff,
+		.flags = IORESOURCE_BUSY | IORESOURCE_IO }
+};
+
+void __init reserve_standard_io_resources(void)
+{
+	int i;
+
+	/* request I/O space for devices used on all i[345]86 PCs */
+	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+		request_resource(&ioport_resource, &standard_io_resources[i]);
+
+}
+
 /*
  * Determine if we were loaded by an EFI loader.  If so, then we have also been
  * passed the efi memmap, systab, etc., so we should use these data structures
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index a3a19ab0edef..ccf329dc81be 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -387,38 +387,6 @@ EXPORT_SYMBOL(node_to_cpumask);
 
 #endif /* X86_64_NUMA */
 
-static struct resource standard_io_resources[] = {
-	{ .name = "dma1", .start = 0x00, .end = 0x1f,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "pic1", .start = 0x20, .end = 0x21,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "timer0", .start = 0x40, .end = 0x43,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "timer1", .start = 0x50, .end = 0x53,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "keyboard", .start = 0x60, .end = 0x60,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "keyboard", .start = 0x64, .end = 0x64,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "dma page reg", .start = 0x80, .end = 0x8f,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "pic2", .start = 0xa0, .end = 0xa1,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "dma2", .start = 0xc0, .end = 0xdf,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
-	{ .name = "fpu", .start = 0xf0, .end = 0xff,
-		.flags = IORESOURCE_BUSY | IORESOURCE_IO }
-};
-
-void __init reserve_standard_io_resources(void)
-{
-	int i;
-
-	/* request I/O space for devices used on all i[345]86 PCs */
-	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
-		request_resource(&ioport_resource, &standard_io_resources[i]);
-
-}
 
 #ifdef CONFIG_PROC_VMCORE
 /* elfcorehdr= specifies the location of elf core header
-- 
cgit v1.2.3


From 0196bcbb150786d54a50e3074013020570a59d31 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 25 Jun 2008 17:58:55 -0700
Subject: x86: move parse elfvorehdr back to setup.c

Signed-off-by: Yinghai <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c        | 16 ++++++++++++++++
 arch/x86/kernel/setup_percpu.c | 19 -------------------
 2 files changed, 16 insertions(+), 19 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 62647b04fab5..08efab538a24 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -491,6 +491,22 @@ void __init reserve_standard_io_resources(void)
 
 }
 
+#ifdef CONFIG_PROC_VMCORE
+/* elfcorehdr= specifies the location of elf core header
+ * stored by the crashed kernel. This option will be passed
+ * by kexec loader to the capture kernel.
+ */
+static int __init setup_elfcorehdr(char *arg)
+{
+	char *end;
+	if (!arg)
+		return -EINVAL;
+	elfcorehdr_addr = memparse(arg, &end);
+	return end > arg ? 0 : -EINVAL;
+}
+early_param("elfcorehdr", setup_elfcorehdr);
+#endif
+
 /*
  * Determine if we were loaded by an EFI loader.  If so, then we have also been
  * passed the efi memmap, systab, etc., so we should use these data structures
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ccf329dc81be..7068f95cccc6 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -387,22 +387,3 @@ EXPORT_SYMBOL(node_to_cpumask);
 
 #endif /* X86_64_NUMA */
 
-
-#ifdef CONFIG_PROC_VMCORE
-/* elfcorehdr= specifies the location of elf core header
- * stored by the crashed kernel. This option will be passed
- * by kexec loader to the capture kernel.
- */
-static int __init setup_elfcorehdr(char *arg)
-{
-	char *end;
-	if (!arg)
-		return -EINVAL;
-	elfcorehdr_addr = memparse(arg, &end);
-	return end > arg ? 0 : -EINVAL;
-}
-early_param("elfcorehdr", setup_elfcorehdr);
-#endif
-
-
-
-- 
cgit v1.2.3


From d1b20afec356085a202d7832d47bfb89303ea901 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 25 Jun 2008 17:59:41 -0700
Subject: x86: make x86_find_smp_config depends on 64 bit too

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig        | 1 -
 arch/x86/kernel/setup.c | 3 +--
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 25ff7548d3da..9441e46cb49b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -233,7 +233,6 @@ config SMP
 config X86_FIND_SMP_CONFIG
 	def_bool y
 	depends on X86_MPPARSE || X86_VOYAGER || X86_VISWS
-	depends on X86_32
 
 if ACPI
 config X86_MPPARSE
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 08efab538a24..7690547ac285 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -714,8 +714,7 @@ void __init setup_arch(char **cmdline_p)
 	 */
 	acpi_reserve_bootmem();
 #endif
-#if defined(CONFIG_X86_FIND_SMP_CONFIG) && defined(CONFIG_X86_32) || \
-    defined(CONFIG_X86_MPPARSE) && defined(CONFIG_X86_64)
+#ifdef CONFIG_X86_FIND_SMP_CONFIG
 	/*
 	 * Find and reserve possible boot-time SMP configuration:
 	 */
-- 
cgit v1.2.3


From 29f784e369a914b5926e01a0b0caae0b47f6452a Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 25 Jun 2008 18:00:22 -0700
Subject: x86: change some functions in setup.c to static

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c     | 8 ++++----
 include/asm-x86/bootparam.h | 1 -
 include/asm-x86/setup.h     | 5 -----
 3 files changed, 4 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 7690547ac285..35022dc10428 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -375,7 +375,7 @@ static void __init reserve_initrd(void)
 }
 #endif /* CONFIG_BLK_DEV_INITRD */
 
-void __init parse_setup_data(void)
+static void __init parse_setup_data(void)
 {
 	struct setup_data *data;
 	u64 pa_data;
@@ -417,7 +417,7 @@ static inline unsigned long long get_total_mem(void)
 	return total << PAGE_SHIFT;
 }
 
-void __init reserve_crashkernel(void)
+static void __init reserve_crashkernel(void)
 {
 	unsigned long long total_mem;
 	unsigned long long crash_size, crash_base;
@@ -453,7 +453,7 @@ void __init reserve_crashkernel(void)
 	}
 }
 #else
-void __init reserve_crashkernel(void)
+static void __init reserve_crashkernel(void)
 {
 }
 #endif
@@ -481,7 +481,7 @@ static struct resource standard_io_resources[] = {
 		.flags = IORESOURCE_BUSY | IORESOURCE_IO }
 };
 
-void __init reserve_standard_io_resources(void)
+static void __init reserve_standard_io_resources(void)
 {
 	int i;
 
diff --git a/include/asm-x86/bootparam.h b/include/asm-x86/bootparam.h
index 55ae9d0c4255..6eeba3b2812b 100644
--- a/include/asm-x86/bootparam.h
+++ b/include/asm-x86/bootparam.h
@@ -109,6 +109,5 @@ struct boot_params {
 } __attribute__((packed));
 
 void reserve_setup_data(void);
-void parse_setup_data(void);
 
 #endif /* _ASM_BOOTPARAM_H */
diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h
index bb12a1619c12..269ba7fe21d1 100644
--- a/include/asm-x86/setup.h
+++ b/include/asm-x86/setup.h
@@ -8,9 +8,6 @@
 /* Interrupt control for vSMPowered x86_64 systems */
 void vsmp_init(void);
 
-/* Crashkernel reservation */
-void reserve_crashkernel(void);
-
 #ifndef CONFIG_PARAVIRT
 #define paravirt_post_allocator_init()	do {} while (0)
 #endif
@@ -38,8 +35,6 @@ void reserve_crashkernel(void);
 #ifndef __ASSEMBLY__
 #include <asm/bootparam.h>
 
-void reserve_standard_io_resources(void);
-
 #ifndef _SETUP
 
 /*
-- 
cgit v1.2.3


From 5092301c7250d7459b79fe40dae43eca3b518e92 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 25 Jun 2008 18:02:06 -0700
Subject: x86: we only have init_pg_tables_end for 32bit

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 35022dc10428..f2e314b5b5fa 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -124,12 +124,6 @@ struct boot_params __initdata boot_params;
 struct boot_params boot_params;
 #endif
 
-/* This value is set up by the early boot code to point to the value
-   immediately after the boot time page tables.  It contains a *physical*
-   address, and must not be in the .bss segment! */
-unsigned long init_pg_tables_start __initdata = ~0UL;
-unsigned long init_pg_tables_end __initdata = ~0UL;
-
 /*
  * Machine setup..
  */
@@ -156,6 +150,12 @@ static struct resource bss_resource = {
 
 
 #ifdef CONFIG_X86_32
+/* This value is set up by the early boot code to point to the value
+   immediately after the boot time page tables.  It contains a *physical*
+   address, and must not be in the .bss segment! */
+unsigned long init_pg_tables_start __initdata = ~0UL;
+unsigned long init_pg_tables_end __initdata = ~0UL;
+
 static struct resource video_ram_resource = {
 	.name	= "Video RAM area",
 	.start	= 0xa0000,
-- 
cgit v1.2.3


From 3442682a54a9f44047efe526869aa52bd74fe16e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 26 Jun 2008 12:29:29 +0200
Subject: x86: remove extra newline from setup.c

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f2e314b5b5fa..172a83e57ee7 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -806,4 +806,3 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #endif
 }
-
-- 
cgit v1.2.3


From 330ddd20894f99a2b956ad59cf0cfdba188bde63 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 26 Jun 2008 12:40:35 +0200
Subject: x86: build fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fix:

In file included from arch/x86/kernel/setup.c:118:
include/asm/highmem.h:64: error: expected identifier or ‘(' before ‘do'
include/asm/highmem.h:64: error: expected identifier or ‘(' before ‘while'
include/asm/highmem.h:67: error: expected identifier or ‘(' before ‘do'
include/asm/highmem.h:67: error: expected identifier or ‘(' before ‘while'

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 172a83e57ee7..63dbb5e8f7ee 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -114,9 +114,6 @@
 #ifdef CONFIG_X86_64
 #include <asm/numa_64.h>
 #endif
-#ifdef CONFIG_X86_32
-#include <asm/highmem.h>
-#endif
 
 #ifndef CONFIG_DEBUG_BOOT_PARAMS
 struct boot_params __initdata boot_params;
-- 
cgit v1.2.3


From eba0045ff87bab465d3c80c289f3bf709c1800f5 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 25 Jun 2008 00:19:12 -0400
Subject: x86/paravirt: add a pgd_alloc/free hooks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add hooks which are called at pgd_alloc/free time.  The pgd_alloc hook
may return an error code, which if non-zero, causes the pgd allocation
to be failed.  The hooks may be used to allocate/free auxillary
per-pgd information.

also fix:

> * Ingo Molnar <mingo@elte.hu> wrote:
>
>  include/asm/pgalloc.h: In function ‘paravirt_pgd_free':
>  include/asm/pgalloc.h:14: error: parameter name omitted
>  arch/x86/kernel/entry_64.S: In file included from
>  arch/x86/kernel/traps_64.c:51:include/asm/pgalloc.h: In function ‘paravirt_pgd_free':
>  include/asm/pgalloc.h:14: error: parameter name omitted

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: xen-devel <xen-devel@lists.xensource.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/paravirt.c |  4 ++++
 arch/x86/mm/pgtable.c      | 13 ++++++++-----
 arch/x86/xen/enlighten.c   |  4 ++++
 include/asm-x86/paravirt.h | 19 ++++++++++++++++++-
 include/asm-x86/pgalloc.h  |  4 ++++
 5 files changed, 38 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e9b504537212..78c9a1b9e6b0 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -30,6 +30,7 @@
 #include <asm/setup.h>
 #include <asm/arch_hooks.h>
 #include <asm/time.h>
+#include <asm/pgalloc.h>
 #include <asm/irq.h>
 #include <asm/delay.h>
 #include <asm/fixmap.h>
@@ -366,6 +367,9 @@ struct pv_mmu_ops pv_mmu_ops = {
 	.flush_tlb_single = native_flush_tlb_single,
 	.flush_tlb_others = native_flush_tlb_others,
 
+	.pgd_alloc = __paravirt_pgd_alloc,
+	.pgd_free = paravirt_nop,
+
 	.alloc_pte = paravirt_nop,
 	.alloc_pmd = paravirt_nop,
 	.alloc_pmd_clone = paravirt_nop,
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 45b99ac39480..418c4432fb39 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -215,13 +215,15 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 
 	/* so that alloc_pmd can use it */
 	mm->pgd = pgd;
-	if (pgd)
+	if (pgd) {
 		pgd_ctor(pgd);
 
-	if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
-		pgd_dtor(pgd);
-		free_page((unsigned long)pgd);
-		pgd = NULL;
+		if (paravirt_pgd_alloc(mm) != 0 ||
+		    !pgd_prepopulate_pmd(mm, pgd)) {
+			pgd_dtor(pgd);
+			free_page((unsigned long)pgd);
+			pgd = NULL;
+		}
 	}
 
 	return pgd;
@@ -231,6 +233,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 	pgd_mop_up_pmds(mm, pgd);
 	pgd_dtor(pgd);
+	paravirt_pgd_free(mm, pgd);
 	free_page((unsigned long)pgd);
 }
 
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 76ad1efaf09e..d62f14e20708 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -45,6 +45,7 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/reboot.h>
+#include <asm/pgalloc.h>
 
 #include "xen-ops.h"
 #include "mmu.h"
@@ -1153,6 +1154,9 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
 	.pte_update = paravirt_nop,
 	.pte_update_defer = paravirt_nop,
 
+	.pgd_alloc = __paravirt_pgd_alloc,
+	.pgd_free = paravirt_nop,
+
 	.alloc_pte = xen_alloc_pte_init,
 	.release_pte = xen_release_pte_init,
 	.alloc_pmd = xen_alloc_pte_init,
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
index 41f5447efd88..5467e2cff4bc 100644
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -219,7 +219,14 @@ struct pv_mmu_ops {
 	void (*flush_tlb_others)(const cpumask_t *cpus, struct mm_struct *mm,
 				 unsigned long va);
 
-	/* Hooks for allocating/releasing pagetable pages */
+	/* Hooks for allocating and freeing a pagetable top-level */
+	int  (*pgd_alloc)(struct mm_struct *mm);
+	void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd);
+
+	/*
+	 * Hooks for allocating/releasing pagetable pages when they're
+	 * attached to a pagetable
+	 */
 	void (*alloc_pte)(struct mm_struct *mm, u32 pfn);
 	void (*alloc_pmd)(struct mm_struct *mm, u32 pfn);
 	void (*alloc_pmd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count);
@@ -925,6 +932,16 @@ static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
 	PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va);
 }
 
+static inline int paravirt_pgd_alloc(struct mm_struct *mm)
+{
+	return PVOP_CALL1(int, pv_mmu_ops.pgd_alloc, mm);
+}
+
+static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+	PVOP_VCALL2(pv_mmu_ops.pgd_free, mm, pgd);
+}
+
 static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned pfn)
 {
 	PVOP_VCALL2(pv_mmu_ops.alloc_pte, mm, pfn);
diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h
index 91e4641f3f31..d63ea431cb3b 100644
--- a/include/asm-x86/pgalloc.h
+++ b/include/asm-x86/pgalloc.h
@@ -5,9 +5,13 @@
 #include <linux/mm.h>		/* for struct page */
 #include <linux/pagemap.h>
 
+static inline int  __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
+#define paravirt_pgd_alloc(mm)	__paravirt_pgd_alloc(mm)
+static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {}
 static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn)	{}
 static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn)	{}
 static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
-- 
cgit v1.2.3


From a6523748bddd38bcec11431f57502090b6014a96 Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Wed, 25 Jun 2008 00:19:16 -0400
Subject: paravirt/x86, 64-bit: move __PAGE_OFFSET to leave a space for
 hypervisor

Set __PAGE_OFFSET to the most negative possible address +
16*PGDIR_SIZE.  The gap is to allow a space for a hypervisor to fit.
The gap is more or less arbitrary, but it's what Xen needs.

When booting native, kernel/head_64.S has a set of compile-time
generated pagetables used at boot time.  This patch removes their
absolutely hard-coded layout, and makes it parameterised on
__PAGE_OFFSET (and __START_KERNEL_map).

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: xen-devel <xen-devel@lists.xensource.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/head_64.S | 17 ++++++++++++-----
 include/asm-x86/page_64.h |  8 +++++++-
 2 files changed, 19 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 32f5a114d1a2..38279fab093d 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -32,6 +32,13 @@
  *
  */
 
+#define pud_index(x)	(((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
+
+L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET)
+L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET)
+L4_START_KERNEL = pgd_index(__START_KERNEL_map)
+L3_START_KERNEL = pud_index(__START_KERNEL_map)
+
 	.text
 	.section .text.head
 	.code64
@@ -77,8 +84,8 @@ startup_64:
 	/* Fixup the physical addresses in the page table
 	 */
 	addq	%rbp, init_level4_pgt + 0(%rip)
-	addq	%rbp, init_level4_pgt + (258*8)(%rip)
-	addq	%rbp, init_level4_pgt + (511*8)(%rip)
+	addq	%rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip)
+	addq	%rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip)
 
 	addq	%rbp, level3_ident_pgt + 0(%rip)
 
@@ -338,9 +345,9 @@ ENTRY(name)
 	 */
 NEXT_PAGE(init_level4_pgt)
 	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-	.fill	257,8,0
+	.org	init_level4_pgt + L4_PAGE_OFFSET*8, 0
 	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-	.fill	252,8,0
+	.org	init_level4_pgt + L4_START_KERNEL*8, 0
 	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
 	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
 
@@ -349,7 +356,7 @@ NEXT_PAGE(level3_ident_pgt)
 	.fill	511,8,0
 
 NEXT_PAGE(level3_kernel_pgt)
-	.fill	510,8,0
+	.fill	L3_START_KERNEL,8,0
 	/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
 	.quad	level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
 	.quad	level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
diff --git a/include/asm-x86/page_64.h b/include/asm-x86/page_64.h
index cac5b9e78265..010d12db80dc 100644
--- a/include/asm-x86/page_64.h
+++ b/include/asm-x86/page_64.h
@@ -26,7 +26,13 @@
 #define PUD_PAGE_SIZE		(_AC(1, UL) << PUD_SHIFT)
 #define PUD_PAGE_MASK		(~(PUD_PAGE_SIZE-1))
 
-#define __PAGE_OFFSET           _AC(0xffff810000000000, UL)
+/*
+ * Set __PAGE_OFFSET to the most negative possible address +
+ * PGDIR_SIZE*16 (pgd slot 272).  The gap is to allow a space for a
+ * hypervisor to fit.  Choosing 16 slots here is arbitrary, but it's
+ * what Xen requires.
+ */
+#define __PAGE_OFFSET           _AC(0xffff880000000000, UL)
 
 #define __PHYSICAL_START	CONFIG_PHYSICAL_START
 #define __KERNEL_ALIGN		0x200000
-- 
cgit v1.2.3


From f97013fd8f17120182aa247f360e4d2069a9db9c Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 25 Jun 2008 00:19:18 -0400
Subject: x86, 64-bit: split x86_64_start_kernel

Split x86_64_start_kernel() into two pieces:

   The first essentially cleans up after head_64.S.  It clears the
   bss, zaps low identity mappings, sets up some early exception
   handlers.

   The second part preserves the boot data, reserves the kernel's
   text/data/bss, pagetables and ramdisk, and then starts the kernel
   proper.

This split is so that Xen can call the second part to do the set up it
needs done.  It doesn't need any of the first part setups, because it
doesn't boot via head_64.S, and its redundant or actively damaging.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: xen-devel <xen-devel@lists.xensource.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/head64.c | 5 +++++
 include/asm-x86/setup.h  | 2 ++
 2 files changed, 7 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index c970929bb15d..f684e3b3de4e 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -108,6 +108,11 @@ void __init x86_64_start_kernel(char * real_mode_data)
 
 	early_printk("Kernel really alive\n");
 
+	x86_64_start_reservations(real_mode_data);
+}
+
+void __init x86_64_start_reservations(char *real_mode_data)
+{
 	copy_bootdata(__va(real_mode_data));
 
 	reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h
index 42bc62b4b70c..1d121c632d9e 100644
--- a/include/asm-x86/setup.h
+++ b/include/asm-x86/setup.h
@@ -58,6 +58,8 @@ extern unsigned long init_pg_tables_end;
 
 #else
 void __init x86_64_start_kernel(char *real_mode);
+void __init x86_64_start_reservations(char *real_mode_data);
+
 #endif /* __i386__ */
 #endif /* _SETUP */
 #endif /* __ASSEMBLY__ */
-- 
cgit v1.2.3


From 3fe0a63efd4437f6438ce5f2708929b1108873b6 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 25 Jun 2008 00:19:23 -0400
Subject: x86, 64-bit: __switch_to(): move arch_leave_lazy_cpu_mode() to the
 right place

We must leave lazy mode before switching the %fs and %gs selectors.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: xen-devel <xen-devel@lists.xensource.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/process_64.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ddc6fcc73dc6..488eaca47bd8 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -562,6 +562,15 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	load_TLS(next, cpu);
 
+	/*
+	 * Leave lazy mode, flushing any hypercalls made here.
+	 * This must be done before restoring TLS segments so
+	 * the GDT and LDT are properly updated, and must be
+	 * done before math_state_restore, so the TS bit is up
+	 * to date.
+	 */
+	arch_leave_lazy_cpu_mode();
+
 	/* 
 	 * Switch FS and GS.
 	 */
-- 
cgit v1.2.3


From 478de5a9d691dd0c048ddce62dbec23722515636 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 25 Jun 2008 00:19:24 -0400
Subject: x86: save %fs and %gs before load_TLS() and
 arch_leave_lazy_cpu_mode()

We must do this because load_TLS() may need to clear %fs and %gs.
(e.g. under Xen).

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: xen-devel <xen-devel@lists.xensource.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/process_64.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 488eaca47bd8..db5eb963e4df 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -538,6 +538,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 				 *next = &next_p->thread;
 	int cpu = smp_processor_id();
 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
+	unsigned fsindex, gsindex;
 
 	/* we're going to use this soon, after a few expensive things */
 	if (next_p->fpu_counter>5)
@@ -560,6 +561,15 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	if (unlikely(next->ds | prev->ds))
 		loadsegment(ds, next->ds);
 
+
+	/* We must save %fs and %gs before load_TLS() because
+	 * %fs and %gs may be cleared by load_TLS().
+	 *
+	 * (e.g. xen_load_tls())
+	 */
+	savesegment(fs, fsindex);
+	savesegment(gs, gsindex);
+
 	load_TLS(next, cpu);
 
 	/*
@@ -575,8 +585,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 * Switch FS and GS.
 	 */
 	{ 
-		unsigned fsindex;
-		savesegment(fs, fsindex);
 		/* segment register != 0 always requires a reload. 
 		   also reload when it has changed. 
 		   when prev process used 64bit base always reload
@@ -594,10 +602,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 		if (next->fs) 
 			wrmsrl(MSR_FS_BASE, next->fs); 
 		prev->fsindex = fsindex;
-	}
-	{ 
-		unsigned gsindex;
-		savesegment(gs, gsindex);
+
 		if (unlikely(gsindex | next->gsindex | prev->gs)) {
 			load_gs_index(next->gsindex);
 			if (gsindex)
-- 
cgit v1.2.3


From e04e0a630d8b5c621b3a8e70ff20db737d3a5728 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 25 Jun 2008 00:19:25 -0400
Subject: x86: use __KERNEL_DS as SS when returning to a kernel thread

This is needed when the kernel is running on RING3, such as under Xen.
x86_64 has a weird feature that makes it #GP on iret when SS is a null
descriptor.

This need to be tested on bare metal to make sure it doesn't cause any
problems. AMD specs say SS is always ignored (except on iret?).

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: xen-devel <xen-devel@lists.xensource.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_64.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index ff15ab552280..6d1101469e97 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -104,7 +104,7 @@ ENTRY(native_irq_enable_syscall_ret)
 	.macro FAKE_STACK_FRAME child_rip
 	/* push in order ss, rsp, eflags, cs, rip */
 	xorl %eax, %eax
-	pushq %rax /* ss */
+	pushq $__KERNEL_DS /* ss */
 	CFI_ADJUST_CFA_OFFSET	8
 	/*CFI_REL_OFFSET	ss,0*/
 	pushq %rax /* rsp */
-- 
cgit v1.2.3


From d75cd22fdd5f7d203fb60014d426942df33dd9a6 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 25 Jun 2008 00:19:26 -0400
Subject: x86/paravirt: split sysret and sysexit

Don't conflate sysret and sysexit; they're different instructions with
different semantics, and may be in use at the same time (at least
within the same kernel, depending on whether its an Intel or AMD
system).

sysexit - just return to userspace, does no register restoration of
    any kind; must explicitly atomically enable interrupts.

sysret - reloads flags from r11, so no need to explicitly enable
    interrupts on 64-bit, responsible for restoring usermode %gs

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citirx.com>
Cc: xen-devel <xen-devel@lists.xensource.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/asm-offsets_32.c    |  2 +-
 arch/x86/kernel/asm-offsets_64.c    |  2 +-
 arch/x86/kernel/entry_32.S          |  8 ++++----
 arch/x86/kernel/entry_64.S          |  4 ++--
 arch/x86/kernel/paravirt.c          | 12 +++++++++---
 arch/x86/kernel/paravirt_patch_32.c |  4 ++--
 arch/x86/kernel/paravirt_patch_64.c |  4 ++--
 arch/x86/kernel/vmi_32.c            |  4 ++--
 arch/x86/xen/enlighten.c            |  2 +-
 include/asm-x86/irqflags.h          |  4 ++--
 include/asm-x86/paravirt.h          | 15 ++++++++++-----
 11 files changed, 36 insertions(+), 25 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 92588083950f..6649d09ad88f 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -111,7 +111,7 @@ void foo(void)
 	OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
 	OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
 	OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
-	OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret);
+	OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
 	OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
 #endif
 
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index f126c05d6170..27ac2deca465 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -62,7 +62,7 @@ int main(void)
 	OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
 	OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
 	OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
-	OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret);
+	OFFSET(PV_CPU_usersp_sysret, pv_cpu_ops, usersp_sysret);
 	OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
 	OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
 #endif
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 159a1c76d2bd..53393c306e11 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -58,7 +58,7 @@
  * for paravirtualization.  The following will never clobber any registers:
  *   INTERRUPT_RETURN (aka. "iret")
  *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
- *   ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
+ *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
  *
  * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
  * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
@@ -349,7 +349,7 @@ sysenter_past_esp:
 	xorl %ebp,%ebp
 	TRACE_IRQS_ON
 1:	mov  PT_FS(%esp), %fs
-	ENABLE_INTERRUPTS_SYSCALL_RET
+	ENABLE_INTERRUPTS_SYSEXIT
 	CFI_ENDPROC
 .pushsection .fixup,"ax"
 2:	movl $0,PT_FS(%esp)
@@ -874,10 +874,10 @@ ENTRY(native_iret)
 .previous
 END(native_iret)
 
-ENTRY(native_irq_enable_syscall_ret)
+ENTRY(native_irq_enable_sysexit)
 	sti
 	sysexit
-END(native_irq_enable_syscall_ret)
+END(native_irq_enable_sysexit)
 #endif
 
 KPROBE_ENTRY(int3)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 6d1101469e97..0056bc4c61a9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -59,7 +59,7 @@
 #endif	
 
 #ifdef CONFIG_PARAVIRT
-ENTRY(native_irq_enable_syscall_ret)
+ENTRY(native_usersp_sysret)
 	movq	%gs:pda_oldrsp,%rsp
 	swapgs
 	sysretq
@@ -275,7 +275,7 @@ sysret_check:
 	CFI_REGISTER	rip,rcx
 	RESTORE_ARGS 0,-ARG_SKIP,1
 	/*CFI_REGISTER	rflags,r11*/
-	ENABLE_INTERRUPTS_SYSCALL_RET
+	USERSP_SYSRET
 
 	CFI_RESTORE_STATE
 	/* Handle reschedules */
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 78c9a1b9e6b0..565ee7a990ea 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -140,7 +140,8 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
 		/* If the operation is a nop, then nop the callsite */
 		ret = paravirt_patch_nop();
 	else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
-		 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret))
+		 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
+		 type == PARAVIRT_PATCH(pv_cpu_ops.usersp_sysret))
 		/* If operation requires a jmp, then jmp */
 		ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
 	else
@@ -191,7 +192,8 @@ static void native_flush_tlb_single(unsigned long addr)
 
 /* These are in entry.S */
 extern void native_iret(void);
-extern void native_irq_enable_syscall_ret(void);
+extern void native_irq_enable_sysexit(void);
+extern void native_usersp_sysret(void);
 
 static int __init print_banner(void)
 {
@@ -327,7 +329,11 @@ struct pv_cpu_ops pv_cpu_ops = {
 	.write_idt_entry = native_write_idt_entry,
 	.load_sp0 = native_load_sp0,
 
-	.irq_enable_syscall_ret = native_irq_enable_syscall_ret,
+#ifdef CONFIG_X86_32
+	.irq_enable_sysexit = native_irq_enable_sysexit,
+#else
+	.usersp_sysret = native_usersp_sysret,
+#endif
 	.iret = native_iret,
 	.swapgs = native_swapgs,
 
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 82fc5fcab4f4..58262218781b 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -5,7 +5,7 @@ DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
 DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
 DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
 DEF_NATIVE(pv_cpu_ops, iret, "iret");
-DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "sti; sysexit");
+DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit");
 DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
 DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
 DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
@@ -29,7 +29,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		PATCH_SITE(pv_irq_ops, restore_fl);
 		PATCH_SITE(pv_irq_ops, save_fl);
 		PATCH_SITE(pv_cpu_ops, iret);
-		PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret);
+		PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
 		PATCH_SITE(pv_mmu_ops, read_cr2);
 		PATCH_SITE(pv_mmu_ops, read_cr3);
 		PATCH_SITE(pv_mmu_ops, write_cr3);
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 7d904e138d7e..4a170552b852 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -15,7 +15,7 @@ DEF_NATIVE(pv_cpu_ops, clts, "clts");
 DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
 
 /* the three commands give us more control to how to return from a syscall */
-DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "movq %gs:" __stringify(pda_oldrsp) ", %rsp; swapgs; sysretq;");
+DEF_NATIVE(pv_cpu_ops, usersp_sysret, "movq %gs:" __stringify(pda_oldrsp) ", %rsp; swapgs; sysretq;");
 DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
 
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
@@ -35,7 +35,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		PATCH_SITE(pv_irq_ops, irq_enable);
 		PATCH_SITE(pv_irq_ops, irq_disable);
 		PATCH_SITE(pv_cpu_ops, iret);
-		PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret);
+		PATCH_SITE(pv_cpu_ops, usersp_sysret);
 		PATCH_SITE(pv_cpu_ops, swapgs);
 		PATCH_SITE(pv_mmu_ops, read_cr2);
 		PATCH_SITE(pv_mmu_ops, read_cr3);
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 956f38927aa7..946bf13b44ab 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -151,7 +151,7 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
 					      insns, ip);
 		case PARAVIRT_PATCH(pv_cpu_ops.iret):
 			return patch_internal(VMI_CALL_IRET, len, insns, ip);
-		case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret):
+		case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
 			return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
 		default:
 			break;
@@ -896,7 +896,7 @@ static inline int __init activate_vmi(void)
 	 * the backend.  They are performance critical anyway, so requiring
 	 * a patch is not a big problem.
 	 */
-	pv_cpu_ops.irq_enable_syscall_ret = (void *)0xfeedbab0;
+	pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
 	pv_cpu_ops.iret = (void *)0xbadbab0;
 
 #ifdef CONFIG_SMP
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index d62f14e20708..119c88fa769d 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1089,7 +1089,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 	.read_pmc = native_read_pmc,
 
 	.iret = xen_iret,
-	.irq_enable_syscall_ret = xen_sysexit,
+	.irq_enable_sysexit = xen_sysexit,
 
 	.load_tr_desc = paravirt_nop,
 	.set_ldt = xen_set_ldt,
diff --git a/include/asm-x86/irqflags.h b/include/asm-x86/irqflags.h
index c242527f970e..99ee5256a7e3 100644
--- a/include/asm-x86/irqflags.h
+++ b/include/asm-x86/irqflags.h
@@ -112,13 +112,13 @@ static inline unsigned long __raw_local_irq_save(void)
 
 #ifdef CONFIG_X86_64
 #define INTERRUPT_RETURN	iretq
-#define ENABLE_INTERRUPTS_SYSCALL_RET			\
+#define USERSP_SYSRET					\
 			movq	%gs:pda_oldrsp, %rsp;	\
 			swapgs;				\
 			sysretq;
 #else
 #define INTERRUPT_RETURN		iret
-#define ENABLE_INTERRUPTS_SYSCALL_RET	sti; sysexit
+#define ENABLE_INTERRUPTS_SYSEXIT	sti; sysexit
 #define GET_CR0_INTO_EAX		movl %cr0, %eax
 #endif
 
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
index 82cdcde4b222..2668903b70f5 100644
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -141,8 +141,9 @@ struct pv_cpu_ops {
 	u64 (*read_pmc)(int counter);
 	unsigned long long (*read_tscp)(unsigned int *aux);
 
-	/* These two are jmp to, not actually called. */
-	void (*irq_enable_syscall_ret)(void);
+	/* These three are jmp to, not actually called. */
+	void (*irq_enable_sysexit)(void);
+	void (*usersp_sysret)(void);
 	void (*iret)(void);
 
 	void (*swapgs)(void);
@@ -1480,10 +1481,10 @@ static inline unsigned long __raw_local_irq_save(void)
 		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable);	\
 		  PV_RESTORE_REGS;)
 
-#define ENABLE_INTERRUPTS_SYSCALL_RET					\
-	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_syscall_ret),\
+#define ENABLE_INTERRUPTS_SYSEXIT					\
+	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit),	\
 		  CLBR_NONE,						\
-		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_syscall_ret))
+		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
 
 
 #ifdef CONFIG_X86_32
@@ -1504,6 +1505,10 @@ static inline unsigned long __raw_local_irq_save(void)
 	movq %rax, %rcx;				\
 	xorq %rax, %rax;
 
+#define USERSP_SYSRET							\
+	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usersp_sysret),		\
+		  CLBR_NONE,						\
+		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usersp_sysret))
 #endif
 
 #endif /* __ASSEMBLY__ */
-- 
cgit v1.2.3


From c7245da6ae7e5208504ff027c4e0eec69b788651 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 25 Jun 2008 00:19:27 -0400
Subject: x86/paravirt, 64-bit: don't restore user rsp within sysret

There's no need to combine restoring the user rsp within the sysret
pvop, so split it out.  This makes the pvop's semantics closer to the
machine instruction.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citirx.com>
Cc: xen-devel <xen-devel@lists.xensource.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/asm-offsets_64.c    | 2 +-
 arch/x86/kernel/entry_64.S          | 6 +++---
 arch/x86/kernel/paravirt.c          | 6 +++---
 arch/x86/kernel/paravirt_patch_64.c | 4 ++--
 include/asm-x86/irqflags.h          | 3 +--
 include/asm-x86/paravirt.h          | 8 ++++----
 6 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 27ac2deca465..a19aba8c5bb3 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -62,7 +62,7 @@ int main(void)
 	OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
 	OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
 	OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
-	OFFSET(PV_CPU_usersp_sysret, pv_cpu_ops, usersp_sysret);
+	OFFSET(PV_CPU_usergs_sysret, pv_cpu_ops, usergs_sysret);
 	OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
 	OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
 #endif
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 0056bc4c61a9..18447a373fbd 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -59,8 +59,7 @@
 #endif	
 
 #ifdef CONFIG_PARAVIRT
-ENTRY(native_usersp_sysret)
-	movq	%gs:pda_oldrsp,%rsp
+ENTRY(native_usergs_sysret)
 	swapgs
 	sysretq
 #endif /* CONFIG_PARAVIRT */
@@ -275,7 +274,8 @@ sysret_check:
 	CFI_REGISTER	rip,rcx
 	RESTORE_ARGS 0,-ARG_SKIP,1
 	/*CFI_REGISTER	rflags,r11*/
-	USERSP_SYSRET
+	movq	%gs:pda_oldrsp, %rsp
+	USERGS_SYSRET
 
 	CFI_RESTORE_STATE
 	/* Handle reschedules */
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 565ee7a990ea..b0b17f0bc7e9 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -141,7 +141,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
 		ret = paravirt_patch_nop();
 	else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
 		 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
-		 type == PARAVIRT_PATCH(pv_cpu_ops.usersp_sysret))
+		 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret))
 		/* If operation requires a jmp, then jmp */
 		ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
 	else
@@ -193,7 +193,7 @@ static void native_flush_tlb_single(unsigned long addr)
 /* These are in entry.S */
 extern void native_iret(void);
 extern void native_irq_enable_sysexit(void);
-extern void native_usersp_sysret(void);
+extern void native_usergs_sysret(void);
 
 static int __init print_banner(void)
 {
@@ -332,7 +332,7 @@ struct pv_cpu_ops pv_cpu_ops = {
 #ifdef CONFIG_X86_32
 	.irq_enable_sysexit = native_irq_enable_sysexit,
 #else
-	.usersp_sysret = native_usersp_sysret,
+	.usergs_sysret = native_usergs_sysret,
 #endif
 	.iret = native_iret,
 	.swapgs = native_swapgs,
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 4a170552b852..d4c0712a3e64 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -15,7 +15,7 @@ DEF_NATIVE(pv_cpu_ops, clts, "clts");
 DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
 
 /* the three commands give us more control to how to return from a syscall */
-DEF_NATIVE(pv_cpu_ops, usersp_sysret, "movq %gs:" __stringify(pda_oldrsp) ", %rsp; swapgs; sysretq;");
+DEF_NATIVE(pv_cpu_ops, usergs_sysret, "swapgs; sysretq;");
 DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
 
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
@@ -35,7 +35,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		PATCH_SITE(pv_irq_ops, irq_enable);
 		PATCH_SITE(pv_irq_ops, irq_disable);
 		PATCH_SITE(pv_cpu_ops, iret);
-		PATCH_SITE(pv_cpu_ops, usersp_sysret);
+		PATCH_SITE(pv_cpu_ops, usergs_sysret);
 		PATCH_SITE(pv_cpu_ops, swapgs);
 		PATCH_SITE(pv_mmu_ops, read_cr2);
 		PATCH_SITE(pv_mmu_ops, read_cr3);
diff --git a/include/asm-x86/irqflags.h b/include/asm-x86/irqflags.h
index 99ee5256a7e3..544836c96b61 100644
--- a/include/asm-x86/irqflags.h
+++ b/include/asm-x86/irqflags.h
@@ -112,8 +112,7 @@ static inline unsigned long __raw_local_irq_save(void)
 
 #ifdef CONFIG_X86_64
 #define INTERRUPT_RETURN	iretq
-#define USERSP_SYSRET					\
-			movq	%gs:pda_oldrsp, %rsp;	\
+#define USERGS_SYSRET					\
 			swapgs;				\
 			sysretq;
 #else
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
index 2668903b70f5..dad5b4186f51 100644
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -143,7 +143,7 @@ struct pv_cpu_ops {
 
 	/* These three are jmp to, not actually called. */
 	void (*irq_enable_sysexit)(void);
-	void (*usersp_sysret)(void);
+	void (*usergs_sysret)(void);
 	void (*iret)(void);
 
 	void (*swapgs)(void);
@@ -1505,10 +1505,10 @@ static inline unsigned long __raw_local_irq_save(void)
 	movq %rax, %rcx;				\
 	xorq %rax, %rax;
 
-#define USERSP_SYSRET							\
-	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usersp_sysret),		\
+#define USERGS_SYSRET							\
+	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret),		\
 		  CLBR_NONE,						\
-		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usersp_sysret))
+		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret))
 #endif
 
 #endif /* __ASSEMBLY__ */
-- 
cgit v1.2.3


From 2be29982a08009c731307f4a39053b70ac4700da Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 25 Jun 2008 00:19:28 -0400
Subject: x86/paravirt: add sysret/sysexit pvops for returning to 32-bit
 compatibility userspace

In a 64-bit system, we need separate sysret/sysexit operations to
return to a 32-bit userspace.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citirx.com>
Cc: xen-devel <xen-devel@lists.xensource.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/ia32/ia32entry.S           | 21 ++++++++++----
 arch/x86/kernel/asm-offsets_64.c    |  4 ++-
 arch/x86/kernel/entry_64.S          |  4 +--
 arch/x86/kernel/paravirt.c          | 12 ++++----
 arch/x86/kernel/paravirt_patch_64.c |  9 ++++--
 include/asm-x86/irqflags.h          | 14 ++++++++--
 include/asm-x86/paravirt.h          | 56 +++++++++++++++++++++++++++++--------
 7 files changed, 89 insertions(+), 31 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 3aefbce2de48..2a4c42427d9a 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -61,6 +61,19 @@
 	CFI_UNDEFINED	r15
 	.endm
 
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_usergs_sysret32)
+	swapgs
+	sysretl
+ENDPROC(native_usergs_sysret32)
+
+ENTRY(native_irq_enable_sysexit)
+	swapgs
+	sti
+	sysexit
+ENDPROC(native_irq_enable_sysexit)
+#endif
+
 /*
  * 32bit SYSENTER instruction entry.
  *
@@ -151,10 +164,7 @@ sysenter_do_call:
 	CFI_ADJUST_CFA_OFFSET -8
 	CFI_REGISTER rsp,rcx
 	TRACE_IRQS_ON
-	swapgs
-	sti		/* sti only takes effect after the next instruction */
-	/* sysexit */
-	.byte	0xf, 0x35
+	ENABLE_INTERRUPTS_SYSEXIT32
 
 sysenter_tracesys:
 	CFI_RESTORE_STATE
@@ -254,8 +264,7 @@ cstar_do_call:
 	TRACE_IRQS_ON
 	movl RSP-ARGOFFSET(%rsp),%esp
 	CFI_RESTORE rsp
-	swapgs
-	sysretl
+	USERGS_SYSRET32
 	
 cstar_tracesys:	
 	CFI_RESTORE_STATE
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index a19aba8c5bb3..06c451af979a 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -62,7 +62,9 @@ int main(void)
 	OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
 	OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
 	OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
-	OFFSET(PV_CPU_usergs_sysret, pv_cpu_ops, usergs_sysret);
+	OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32);
+	OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
+	OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
 	OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
 	OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
 #endif
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 18447a373fbd..880ffe510a11 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -59,7 +59,7 @@
 #endif	
 
 #ifdef CONFIG_PARAVIRT
-ENTRY(native_usergs_sysret)
+ENTRY(native_usergs_sysret64)
 	swapgs
 	sysretq
 #endif /* CONFIG_PARAVIRT */
@@ -275,7 +275,7 @@ sysret_check:
 	RESTORE_ARGS 0,-ARG_SKIP,1
 	/*CFI_REGISTER	rflags,r11*/
 	movq	%gs:pda_oldrsp, %rsp
-	USERGS_SYSRET
+	USERGS_SYSRET64
 
 	CFI_RESTORE_STATE
 	/* Handle reschedules */
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index b0b17f0bc7e9..bf1067e89cad 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -141,7 +141,8 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
 		ret = paravirt_patch_nop();
 	else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
 		 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
-		 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret))
+		 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
+		 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64))
 		/* If operation requires a jmp, then jmp */
 		ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
 	else
@@ -193,7 +194,8 @@ static void native_flush_tlb_single(unsigned long addr)
 /* These are in entry.S */
 extern void native_iret(void);
 extern void native_irq_enable_sysexit(void);
-extern void native_usergs_sysret(void);
+extern void native_usergs_sysret32(void);
+extern void native_usergs_sysret64(void);
 
 static int __init print_banner(void)
 {
@@ -329,10 +331,10 @@ struct pv_cpu_ops pv_cpu_ops = {
 	.write_idt_entry = native_write_idt_entry,
 	.load_sp0 = native_load_sp0,
 
-#ifdef CONFIG_X86_32
 	.irq_enable_sysexit = native_irq_enable_sysexit,
-#else
-	.usergs_sysret = native_usergs_sysret,
+#ifdef CONFIG_X86_64
+	.usergs_sysret32 = native_usergs_sysret32,
+	.usergs_sysret64 = native_usergs_sysret64,
 #endif
 	.iret = native_iret,
 	.swapgs = native_swapgs,
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index d4c0712a3e64..061d01df9ae6 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -14,8 +14,9 @@ DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
 DEF_NATIVE(pv_cpu_ops, clts, "clts");
 DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
 
-/* the three commands give us more control to how to return from a syscall */
-DEF_NATIVE(pv_cpu_ops, usergs_sysret, "swapgs; sysretq;");
+DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "swapgs; sti; sysexit");
+DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
+DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl");
 DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
 
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
@@ -35,7 +36,9 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		PATCH_SITE(pv_irq_ops, irq_enable);
 		PATCH_SITE(pv_irq_ops, irq_disable);
 		PATCH_SITE(pv_cpu_ops, iret);
-		PATCH_SITE(pv_cpu_ops, usergs_sysret);
+		PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
+		PATCH_SITE(pv_cpu_ops, usergs_sysret32);
+		PATCH_SITE(pv_cpu_ops, usergs_sysret64);
 		PATCH_SITE(pv_cpu_ops, swapgs);
 		PATCH_SITE(pv_mmu_ops, read_cr2);
 		PATCH_SITE(pv_mmu_ops, read_cr3);
diff --git a/include/asm-x86/irqflags.h b/include/asm-x86/irqflags.h
index 544836c96b61..ea9bd2635d59 100644
--- a/include/asm-x86/irqflags.h
+++ b/include/asm-x86/irqflags.h
@@ -112,9 +112,17 @@ static inline unsigned long __raw_local_irq_save(void)
 
 #ifdef CONFIG_X86_64
 #define INTERRUPT_RETURN	iretq
-#define USERGS_SYSRET					\
-			swapgs;				\
-			sysretq;
+#define USERGS_SYSRET64				\
+	swapgs;					\
+	sysretq;
+#define USERGS_SYSRET32				\
+	swapgs;					\
+	sysretl
+#define ENABLE_INTERRUPTS_SYSEXIT32		\
+	swapgs;					\
+	sti;					\
+	sysexit
+
 #else
 #define INTERRUPT_RETURN		iret
 #define ENABLE_INTERRUPTS_SYSEXIT	sti; sysexit
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
index dad5b4186f51..33f72f8fe757 100644
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -141,9 +141,32 @@ struct pv_cpu_ops {
 	u64 (*read_pmc)(int counter);
 	unsigned long long (*read_tscp)(unsigned int *aux);
 
-	/* These three are jmp to, not actually called. */
+	/*
+	 * Atomically enable interrupts and return to userspace.  This
+	 * is only ever used to return to 32-bit processes; in a
+	 * 64-bit kernel, it's used for 32-on-64 compat processes, but
+	 * never native 64-bit processes.  (Jump, not call.)
+	 */
 	void (*irq_enable_sysexit)(void);
-	void (*usergs_sysret)(void);
+
+	/*
+	 * Switch to usermode gs and return to 64-bit usermode using
+	 * sysret.  Only used in 64-bit kernels to return to 64-bit
+	 * processes.  Usermode register state, including %rsp, must
+	 * already be restored.
+	 */
+	void (*usergs_sysret64)(void);
+
+	/*
+	 * Switch to usermode gs and return to 32-bit usermode using
+	 * sysret.  Used to return to 32-on-64 compat processes.
+	 * Other usermode register state, including %esp, must already
+	 * be restored.
+	 */
+	void (*usergs_sysret32)(void);
+
+	/* Normal iret.  Jump to this with the standard iret stack
+	   frame set up. */
 	void (*iret)(void);
 
 	void (*swapgs)(void);
@@ -1481,18 +1504,24 @@ static inline unsigned long __raw_local_irq_save(void)
 		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable);	\
 		  PV_RESTORE_REGS;)
 
-#define ENABLE_INTERRUPTS_SYSEXIT					\
-	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit),	\
+#define USERGS_SYSRET32							\
+	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32),	\
 		  CLBR_NONE,						\
-		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
-
+		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret32))
 
 #ifdef CONFIG_X86_32
 #define GET_CR0_INTO_EAX				\
 	push %ecx; push %edx;				\
 	call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0);	\
 	pop %edx; pop %ecx
-#else
+
+#define ENABLE_INTERRUPTS_SYSEXIT					\
+	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit),	\
+		  CLBR_NONE,						\
+		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
+
+
+#else	/* !CONFIG_X86_32 */
 #define SWAPGS								\
 	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE,	\
 		  PV_SAVE_REGS;						\
@@ -1505,11 +1534,16 @@ static inline unsigned long __raw_local_irq_save(void)
 	movq %rax, %rcx;				\
 	xorq %rax, %rax;
 
-#define USERGS_SYSRET							\
-	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret),		\
+#define USERGS_SYSRET64							\
+	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64),	\
 		  CLBR_NONE,						\
-		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret))
-#endif
+		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
+
+#define ENABLE_INTERRUPTS_SYSEXIT32					\
+	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit),	\
+		  CLBR_NONE,						\
+		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
+#endif	/* CONFIG_X86_32 */
 
 #endif /* __ASSEMBLY__ */
 #endif /* CONFIG_PARAVIRT */
-- 
cgit v1.2.3


From fab58420ac0007a452b540cfb07923225ea4f48d Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 25 Jun 2008 00:19:31 -0400
Subject: x86/paravirt, 64-bit: add adjust_exception_frame

64-bit Xen pushes a couple of extra words onto an exception frame.
Add a hook to deal with them.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: xen-devel <xen-devel@lists.xensource.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/asm-offsets_64.c | 1 +
 arch/x86/kernel/entry_64.S       | 2 ++
 arch/x86/kernel/paravirt.c       | 3 +++
 arch/x86/xen/enlighten.c         | 3 +++
 include/asm-x86/paravirt.h       | 9 +++++++++
 include/asm-x86/processor.h      | 2 ++
 6 files changed, 20 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 06c451af979a..3295e7c08fe7 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -61,6 +61,7 @@ int main(void)
 	OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
 	OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
 	OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
+	OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
 	OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
 	OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32);
 	OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 880ffe510a11..70fe13a1c41d 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -736,6 +736,7 @@ END(spurious_interrupt)
  */ 		
 	.macro zeroentry sym
 	INTR_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	pushq $0	/* push error code/oldrax */ 
 	CFI_ADJUST_CFA_OFFSET 8
 	pushq %rax	/* push real oldrax to the rdi slot */ 
@@ -748,6 +749,7 @@ END(spurious_interrupt)
 
 	.macro errorentry sym
 	XCPT_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	pushq %rax
 	CFI_ADJUST_CFA_OFFSET 8
 	CFI_REL_OFFSET rax,0
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index bf1067e89cad..b20c369cb89d 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -296,6 +296,9 @@ struct pv_irq_ops pv_irq_ops = {
 	.irq_enable = native_irq_enable,
 	.safe_halt = native_safe_halt,
 	.halt = native_halt,
+#ifdef CONFIG_X86_64
+	.adjust_exception_frame = paravirt_nop,
+#endif
 };
 
 struct pv_cpu_ops pv_cpu_ops = {
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 119c88fa769d..3b980831602c 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1123,6 +1123,9 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {
 	.irq_enable = xen_irq_enable,
 	.safe_halt = xen_safe_halt,
 	.halt = xen_halt,
+#ifdef CONFIG_X86_64
+	.adjust_exception_frame = paravirt_nop,
+#endif
 };
 
 static const struct pv_apic_ops xen_apic_ops __initdata = {
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
index 3286a0c63b42..3dc223da200b 100644
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -189,6 +189,10 @@ struct pv_irq_ops {
 	void (*irq_enable)(void);
 	void (*safe_halt)(void);
 	void (*halt)(void);
+
+#ifdef CONFIG_X86_64
+	void (*adjust_exception_frame)(void);
+#endif
 };
 
 struct pv_apic_ops {
@@ -1544,6 +1548,11 @@ static inline unsigned long __raw_local_irq_save(void)
 	movq %rax, %rcx;				\
 	xorq %rax, %rax;
 
+#define PARAVIRT_ADJUST_EXCEPTION_FRAME					\
+	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \
+		  CLBR_NONE,						\
+		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_adjust_exception_frame))
+
 #define USERGS_SYSRET64							\
 	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64),	\
 		  CLBR_NONE,						\
diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h
index 12b5ede14c7a..df2459f5ebbb 100644
--- a/include/asm-x86/processor.h
+++ b/include/asm-x86/processor.h
@@ -532,6 +532,8 @@ static inline void load_sp0(struct tss_struct *tss,
 
 #define set_iopl_mask native_set_iopl_mask
 #define SWAPGS	swapgs
+
+#define PARAVIRT_ADJUST_EXCEPTION_FRAME	/*  */
 #endif /* CONFIG_PARAVIRT */
 
 /*
-- 
cgit v1.2.3


From 9f9d489a3e78b49d897734eaaf9dea568dbea66e Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 25 Jun 2008 00:19:32 -0400
Subject: x86/paravirt, 64-bit: make load_gs_index() a paravirt operation

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: xen-devel <xen-devel@lists.xensource.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_64.S |  4 ++--
 arch/x86/kernel/paravirt.c |  3 +++
 include/asm-x86/elf.h      |  2 +-
 include/asm-x86/paravirt.h | 10 ++++++++++
 include/asm-x86/system.h   |  3 ++-
 5 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 70fe13a1c41d..07d69f262337 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -944,7 +944,7 @@ KPROBE_END(error_entry)
 	
        /* Reload gs selector with exception handling */
        /* edi:  new selector */ 
-ENTRY(load_gs_index)
+ENTRY(native_load_gs_index)
 	CFI_STARTPROC
 	pushf
 	CFI_ADJUST_CFA_OFFSET 8
@@ -958,7 +958,7 @@ gs_change:
 	CFI_ADJUST_CFA_OFFSET -8
         ret
 	CFI_ENDPROC
-ENDPROC(load_gs_index)
+ENDPROC(native_load_gs_index)
        
         .section __ex_table,"a"
         .align 8
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index b20c369cb89d..27819e3e4245 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -329,6 +329,9 @@ struct pv_cpu_ops pv_cpu_ops = {
 	.store_idt = native_store_idt,
 	.store_tr = native_store_tr,
 	.load_tls = native_load_tls,
+#ifdef CONFIG_X86_64
+	.load_gs_index = native_load_gs_index,
+#endif
 	.write_ldt_entry = native_write_ldt_entry,
 	.write_gdt_entry = native_write_gdt_entry,
 	.write_idt_entry = native_write_idt_entry,
diff --git a/include/asm-x86/elf.h b/include/asm-x86/elf.h
index 8f232dc5b5fe..7be4733c793e 100644
--- a/include/asm-x86/elf.h
+++ b/include/asm-x86/elf.h
@@ -83,9 +83,9 @@ extern unsigned int vdso_enabled;
 	(((x)->e_machine == EM_386) || ((x)->e_machine == EM_486))
 
 #include <asm/processor.h>
+#include <asm/system.h>
 
 #ifdef CONFIG_X86_32
-#include <asm/system.h>		/* for savesegment */
 #include <asm/desc.h>
 
 #define elf_check_arch(x)	elf_check_arch_ia32(x)
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
index 3dc223da200b..6d8966f9d190 100644
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -115,6 +115,9 @@ struct pv_cpu_ops {
 	void (*set_ldt)(const void *desc, unsigned entries);
 	unsigned long (*store_tr)(void);
 	void (*load_tls)(struct thread_struct *t, unsigned int cpu);
+#ifdef CONFIG_X86_64
+	void (*load_gs_index)(unsigned int idx);
+#endif
 	void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum,
 				const void *desc);
 	void (*write_gdt_entry)(struct desc_struct *,
@@ -845,6 +848,13 @@ static inline void load_TLS(struct thread_struct *t, unsigned cpu)
 	PVOP_VCALL2(pv_cpu_ops.load_tls, t, cpu);
 }
 
+#ifdef CONFIG_X86_64
+static inline void load_gs_index(unsigned int gs)
+{
+	PVOP_VCALL1(pv_cpu_ops.load_gs_index, gs);
+}
+#endif
+
 static inline void write_ldt_entry(struct desc_struct *dt, int entry,
 				   const void *desc)
 {
diff --git a/include/asm-x86/system.h b/include/asm-x86/system.h
index f686aa6abfe9..c4946c5964bf 100644
--- a/include/asm-x86/system.h
+++ b/include/asm-x86/system.h
@@ -136,7 +136,7 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t" \
 #define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
 #define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
 
-extern void load_gs_index(unsigned);
+extern void native_load_gs_index(unsigned);
 
 /*
  * Load a segment. Fall back on loading the zero
@@ -282,6 +282,7 @@ static inline void native_wbinvd(void)
 #ifdef CONFIG_X86_64
 #define read_cr8()	(native_read_cr8())
 #define write_cr8(x)	(native_write_cr8(x))
+#define load_gs_index   native_load_gs_index
 #endif
 
 /* Clear the 'TS' bit */
-- 
cgit v1.2.3


From 611dfd7819e525b45f39ff15e0faf5f23551c113 Mon Sep 17 00:00:00 2001
From: Bernhard Walle <bwalle@suse.de>
Date: Wed, 25 Jun 2008 21:39:16 +0200
Subject: x86: limit E820 map when a user-defined memory map is specified

This patch brings back limiting of the E820 map when a user-defined
E820 map is specified. While the behaviour of i386 (32 bit) was to limit
the E820 map (and /proc/iomem), the behaviour of x86-64 (64 bit) was not to
limit.

That patch limits the E820 map again for both x86 architectures.

Code was tested for compilation and booting on a 32 bit and 64 bit system.

Signed-off-by: Bernhard Walle <bwalle@suse.de>
Acked-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: kexec@lists.infradead.org
Cc: vgoyal@redhat.com
Cc: Bernhard Walle <bwalle@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 1dcb66533dfc..7b7685b78852 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1117,6 +1117,9 @@ static int __init parse_memopt(char *p)
 
 	mem_size = memparse(p, &p);
 	end_user_pfn = mem_size>>PAGE_SHIFT;
+	e820_update_range(mem_size, ULLONG_MAX - mem_size,
+		E820_RAM, E820_RESERVED);
+
 	return 0;
 }
 early_param("mem", parse_memopt);
@@ -1161,6 +1164,8 @@ static int __init parse_memmap_opt(char *p)
 		e820_add_region(start_at, mem_size, E820_RESERVED);
 	} else {
 		end_user_pfn = (mem_size >> PAGE_SHIFT);
+		e820_update_range(mem_size, ULLONG_MAX - mem_size,
+			E820_RAM, E820_RESERVED);
 	}
 	return *p == '\0' ? 0 : -EINVAL;
 }
-- 
cgit v1.2.3


From 042623bbabae168246ad8a37693f0ecb6c450aea Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 25 Jun 2008 19:52:15 -0700
Subject: x86: clean up ARCH_SETUP

asm-x86/paravirt.h already have protection with CONFIG_PARAVIRT inside

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c                   | 8 ++++----
 include/asm-x86/mach-default/setup_arch.h | 4 ----
 include/asm-x86/mach-visws/setup_arch.h   | 2 --
 3 files changed, 4 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 63dbb5e8f7ee..161609c6925e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -101,11 +101,7 @@
 #include <asm/proto.h>
 
 #include <mach_apic.h>
-#ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
-#else
-#define ARCH_SETUP
-#endif
 
 #include <asm/percpu.h>
 #include <asm/sections.h>
@@ -115,6 +111,10 @@
 #include <asm/numa_64.h>
 #endif
 
+#ifndef ARCH_SETUP
+#define ARCH_SETUP
+#endif
+
 #ifndef CONFIG_DEBUG_BOOT_PARAMS
 struct boot_params __initdata boot_params;
 #else
diff --git a/include/asm-x86/mach-default/setup_arch.h b/include/asm-x86/mach-default/setup_arch.h
index 605e3ccb991b..38846208b548 100644
--- a/include/asm-x86/mach-default/setup_arch.h
+++ b/include/asm-x86/mach-default/setup_arch.h
@@ -1,7 +1,3 @@
 /* Hook to call BIOS initialisation function */
 
 /* no action for generic */
-
-#ifndef ARCH_SETUP
-#define ARCH_SETUP
-#endif
diff --git a/include/asm-x86/mach-visws/setup_arch.h b/include/asm-x86/mach-visws/setup_arch.h
index 33f700ef6831..b8f5dd829dba 100644
--- a/include/asm-x86/mach-visws/setup_arch.h
+++ b/include/asm-x86/mach-visws/setup_arch.h
@@ -4,5 +4,3 @@ extern unsigned long sgivwfb_mem_phys;
 extern unsigned long sgivwfb_mem_size;
 
 /* no action for visws */
-
-#define ARCH_SETUP
-- 
cgit v1.2.3


From e7b3789524eecc96213dd69d6686efd429235051 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 25 Jun 2008 21:51:28 -0700
Subject: x86: move fix mapping page table range early

do that in init_memory_mapping

also remove one init_ohci1394_dma_on_all_controllers

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c | 23 +++++++++--------------
 arch/x86/mm/init_32.c   | 15 +++++++++++----
 2 files changed, 20 insertions(+), 18 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 161609c6925e..bf528b23750a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -611,11 +611,6 @@ void __init setup_arch(char **cmdline_p)
 
 #ifdef CONFIG_X86_32
 	probe_roms();
-#else
-# ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
-	if (init_ohci1394_dma_early)
-		init_ohci1394_dma_on_all_controllers();
-# endif
 #endif
 
 	/* after parse_early_param, so could debug it */
@@ -672,6 +667,15 @@ void __init setup_arch(char **cmdline_p)
 	/* max_pfn_mapped is updated here */
 	max_pfn_mapped = init_memory_mapping(0, (max_low_pfn << PAGE_SHIFT));
 
+	/*
+	 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
+	 */
+
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+	if (init_ohci1394_dma_early)
+		init_ohci1394_dma_on_all_controllers();
+#endif
+
 	reserve_initrd();
 
 #ifdef CONFIG_X86_64
@@ -739,15 +743,6 @@ void __init setup_arch(char **cmdline_p)
 	map_vsyscall();
 #endif
 
-	/*
-	 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
-	 */
-
-#if defined(CONFIG_PROVIDE_OHCI1394_DMA_INIT) && defined(CONFIG_X86_32)
-	if (init_ohci1394_dma_early)
-		init_ohci1394_dma_on_all_controllers();
-#endif
-
 #ifdef CONFIG_X86_GENERICARCH
 	generic_apic_probe();
 #endif
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 156000de6e62..b9cf7f705302 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -442,13 +442,10 @@ void __init native_pagetable_setup_done(pgd_t *base)
  * be partially populated, and so it avoids stomping on any existing
  * mappings.
  */
-static void __init pagetable_init(void)
+static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
 {
-	pgd_t *pgd_base = swapper_pg_dir;
 	unsigned long vaddr, end;
 
-	paravirt_pagetable_setup_start(pgd_base);
-
 	/*
 	 * Fixed mappings, only the page table structure has to be
 	 * created - mappings will be set by set_fixmap():
@@ -458,6 +455,13 @@ static void __init pagetable_init(void)
 	end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
 	page_table_range_init(vaddr, end, pgd_base);
 	early_ioremap_reset();
+}
+
+static void __init pagetable_init(void)
+{
+	pgd_t *pgd_base = swapper_pg_dir;
+
+	paravirt_pagetable_setup_start(pgd_base);
 
 	permanent_kmaps_init(pgd_base);
 
@@ -788,6 +792,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 
 	kernel_physical_mapping_init(pgd_base, start, end);
 
+	early_ioremap_page_table_range_init(pgd_base);
+
 	load_cr3(swapper_pg_dir);
 
 	__flush_tlb_all();
@@ -799,6 +805,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	return end >> PAGE_SHIFT;
 }
 
+
 /*
  * paging_init() sets up the page tables - note that the first 8MB are
  * already mapped by head.S.
-- 
cgit v1.2.3


From ab67715c7201be2fe729888a09007b6ba5bb2326 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Fri, 27 Jun 2008 15:36:54 -0700
Subject: x86: early res print out alignment v2

v2: fix print info to cont

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 7b7685b78852..fa77cb4185c3 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -828,16 +828,26 @@ void __init free_early(u64 start, u64 end)
 
 void __init early_res_to_bootmem(u64 start, u64 end)
 {
-	int i;
+	int i, count;
 	u64 final_start, final_end;
-	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+
+	count  = 0;
+	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
+		count++;
+
+	printk(KERN_INFO "(%d early reservations) ==> bootmem\n", count);
+	for (i = 0; i < count; i++) {
 		struct early_res *r = &early_res[i];
+		printk(KERN_INFO "  #%d [ %010llx - %010llx ] %16s", i,
+			r->start, r->end, r->name);
 		final_start = max(start, r->start);
 		final_end = min(end, r->end);
-		if (final_start >= final_end)
+		if (final_start >= final_end) {
+			printk(KERN_CONT "\n");
 			continue;
-		printk(KERN_INFO "  early res: %d [%llx-%llx] %s\n", i,
-			final_start, final_end - 1, r->name);
+		}
+		printk(KERN_CONT " ===> [ %010llx - %010llx ]\n",
+			final_start, final_end);
 		reserve_bootmem_generic(final_start, final_end - final_start,
 				BOOTMEM_DEFAULT);
 	}
-- 
cgit v1.2.3


From f3294a33e765d8308c3e17b951a13e0db9cf5f00 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Fri, 27 Jun 2008 01:41:56 -0700
Subject: x86: let setup_arch call init_apic_mappings for 32bit

instead of calling it from trap_init()

also move init ioapic mapping out of apic_32.c

so 32 bit do same as 64 bit

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic_32.c    | 30 ------------------------------
 arch/x86/kernel/io_apic_32.c | 32 ++++++++++++++++++++++++++++++++
 arch/x86/kernel/setup.c      |  6 ++----
 arch/x86/kernel/traps_32.c   |  4 ----
 include/asm-x86/apic.h       |  1 +
 include/asm-x86/io_apic.h    |  1 +
 6 files changed, 36 insertions(+), 38 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index b5571fb0f77e..8fbad8ed0ebe 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -1197,36 +1197,6 @@ void __init init_apic_mappings(void)
 	if (boot_cpu_physical_apicid == -1U)
 		boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
 
-#ifdef CONFIG_X86_IO_APIC
-	{
-		unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
-		int i;
-
-		for (i = 0; i < nr_ioapics; i++) {
-			if (smp_found_config) {
-				ioapic_phys = mp_ioapics[i].mp_apicaddr;
-				if (!ioapic_phys) {
-					printk(KERN_ERR
-					       "WARNING: bogus zero IO-APIC "
-					       "address found in MPTABLE, "
-					       "disabling IO/APIC support!\n");
-					smp_found_config = 0;
-					skip_ioapic_setup = 1;
-					goto fake_ioapic_page;
-				}
-			} else {
-fake_ioapic_page:
-				ioapic_phys = (unsigned long)
-					      alloc_bootmem_pages(PAGE_SIZE);
-				ioapic_phys = __pa(ioapic_phys);
-			}
-			set_fixmap_nocache(idx, ioapic_phys);
-			printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
-			       __fix_to_virt(idx), ioapic_phys);
-			idx++;
-		}
-	}
-#endif
 }
 
 /*
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index d6af301c822b..337ec3438a8f 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -25,6 +25,7 @@
 #include <linux/init.h>
 #include <linux/delay.h>
 #include <linux/sched.h>
+#include <linux/bootmem.h>
 #include <linux/mc146818rtc.h>
 #include <linux/compiler.h>
 #include <linux/acpi.h>
@@ -2852,3 +2853,34 @@ static int __init parse_noapic(char *arg)
 	return 0;
 }
 early_param("noapic", parse_noapic);
+
+void __init ioapic_init_mappings(void)
+{
+	unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
+	int i;
+
+	for (i = 0; i < nr_ioapics; i++) {
+		if (smp_found_config) {
+			ioapic_phys = mp_ioapics[i].mp_apicaddr;
+			if (!ioapic_phys) {
+				printk(KERN_ERR
+				       "WARNING: bogus zero IO-APIC "
+				       "address found in MPTABLE, "
+				       "disabling IO/APIC support!\n");
+				smp_found_config = 0;
+				skip_ioapic_setup = 1;
+				goto fake_ioapic_page;
+			}
+		} else {
+fake_ioapic_page:
+			ioapic_phys = (unsigned long)
+				      alloc_bootmem_pages(PAGE_SIZE);
+			ioapic_phys = __pa(ioapic_phys);
+		}
+		set_fixmap_nocache(idx, ioapic_phys);
+		printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
+		       __fix_to_virt(idx), ioapic_phys);
+		idx++;
+	}
+}
+
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index bf528b23750a..4716460607b4 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -766,16 +766,14 @@ void __init setup_arch(char **cmdline_p)
 		get_smp_config();
 #endif
 
-#ifdef CONFIG_X86_64
 	init_apic_mappings();
 	ioapic_init_mappings();
-#else
-# if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
+
+#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) && defined(CONFIG_X86_32)
 	if (def_to_bigsmp)
 		printk(KERN_WARNING "More than 8 CPUs detected and "
 			"CONFIG_X86_PC cannot handle it.\nUse "
 			"CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
-# endif
 #endif
 	kvm_guest_init();
 
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index dc7c05e5cfe7..f60feee83253 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -1198,10 +1198,6 @@ void __init trap_init(void)
 	early_iounmap(p, 4);
 #endif
 
-#ifdef CONFIG_X86_LOCAL_APIC
-	init_apic_mappings();
-#endif
-	set_trap_gate(0,  &divide_error);
 	set_intr_gate(1,  &debug);
 	set_intr_gate(2,  &nmi);
 	set_system_intr_gate(3, &int3); /* int3/4 can be called from all */
diff --git a/include/asm-x86/apic.h b/include/asm-x86/apic.h
index 9fe941cd843d..6ae6ae68d4cd 100644
--- a/include/asm-x86/apic.h
+++ b/include/asm-x86/apic.h
@@ -135,6 +135,7 @@ extern int apic_is_clustered_box(void);
 #else /* !CONFIG_X86_LOCAL_APIC */
 static inline void lapic_shutdown(void) { }
 #define local_apic_timer_c2_ok		1
+static inline void init_apic_mappings(void) { }
 
 #endif /* !CONFIG_X86_LOCAL_APIC */
 
diff --git a/include/asm-x86/io_apic.h b/include/asm-x86/io_apic.h
index 8b1f5684842e..14f82bbcb5fd 100644
--- a/include/asm-x86/io_apic.h
+++ b/include/asm-x86/io_apic.h
@@ -186,6 +186,7 @@ extern void ioapic_init_mappings(void);
 #else  /* !CONFIG_X86_IO_APIC */
 #define io_apic_assign_pci_irqs 0
 static const int timer_through_8259 = 0;
+static inline void ioapic_init_mappings(void) { }
 #endif
 
 #endif
-- 
cgit v1.2.3


From b4df32f4aeef8794d0135fc8dc250acb44cfee60 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sat, 28 Jun 2008 17:49:59 -0700
Subject: x86: fix warning in e820_reserve_resources with 32bit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

when 64bit resource is not enabled, we get:

arch/x86/kernel/e820.c: In function ‘e820_reserve_resources’:
arch/x86/kernel/e820.c:1217: warning: comparison is always false due to limited range of data type

because res->start/end is resource_t aka u32. it will overflow.

fix it with temp end of u64

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index fa77cb4185c3..ba5ac880ea1e 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1202,6 +1202,7 @@ void __init e820_reserve_resources(void)
 {
 	int i;
 	struct resource *res;
+	u64 end;
 
 	res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
 	for (i = 0; i < e820.nr_map; i++) {
@@ -1211,14 +1212,16 @@ void __init e820_reserve_resources(void)
 		case E820_NVS:	res->name = "ACPI Non-volatile Storage"; break;
 		default:	res->name = "reserved";
 		}
-		res->start = e820.map[i].addr;
-		res->end = res->start + e820.map[i].size - 1;
+		end = e820.map[i].addr + e820.map[i].size - 1;
 #ifndef CONFIG_RESOURCES_64BIT
-		if (res->end > 0x100000000ULL) {
+		if (end > 0x100000000ULL) {
 			res++;
 			continue;
 		}
 #endif
+		res->start = e820.map[i].addr;
+		res->end = end;
+
 		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 		insert_resource(&iomem_resource, res);
 		res++;
-- 
cgit v1.2.3


From 914bebfad42c417b84bda8920a3073d236007fde Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 29 Jun 2008 00:06:37 -0700
Subject: x86: use disable_apic in 32bit

change the enable_local_apic to static force_enable_local_apic for 32bit

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic_32.c | 15 ++++++++-------
 arch/x86/kernel/setup.c   |  4 ----
 include/asm-x86/apic.h    |  4 ----
 3 files changed, 8 insertions(+), 15 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 8fbad8ed0ebe..6dea8306d8c0 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -55,9 +55,10 @@ unsigned long mp_lapic_addr;
 /*
  * Knob to control our willingness to enable the local APIC.
  *
- * -1=force-disable, +1=force-enable
+ * +1=force-enable
  */
-int enable_local_apic;
+static int force_enable_local_apic;
+int disable_apic;
 
 /* Local APIC timer verification ok */
 static int local_apic_timer_verify_ok;
@@ -1099,7 +1100,7 @@ static int __init detect_init_APIC(void)
 	u32 h, l, features;
 
 	/* Disabled by kernel option? */
-	if (enable_local_apic < 0)
+	if (disable_apic)
 		return -1;
 
 	switch (boot_cpu_data.x86_vendor) {
@@ -1122,7 +1123,7 @@ static int __init detect_init_APIC(void)
 		 * Over-ride BIOS and try to enable the local APIC only if
 		 * "lapic" specified.
 		 */
-		if (enable_local_apic <= 0) {
+		if (!force_enable_local_apic) {
 			printk(KERN_INFO "Local APIC disabled by BIOS -- "
 			       "you can enable it with \"lapic\"\n");
 			return -1;
@@ -1208,7 +1209,7 @@ int apic_version[MAX_APICS];
 
 int __init APIC_init_uniprocessor(void)
 {
-	if (enable_local_apic < 0)
+	if (disable_apic)
 		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
 
 	if (!smp_found_config && !cpu_has_apic)
@@ -1682,14 +1683,14 @@ static void apic_pm_activate(void) { }
  */
 static int __init parse_lapic(char *arg)
 {
-	enable_local_apic = 1;
+	force_enable_local_apic = 1;
 	return 0;
 }
 early_param("lapic", parse_lapic);
 
 static int __init parse_nolapic(char *arg)
 {
-	enable_local_apic = -1;
+	disable_apic = 1;
 	clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
 	return 0;
 }
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4716460607b4..fb318edd8bc6 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -598,11 +598,7 @@ void __init setup_arch(char **cmdline_p)
 
 	if (acpi_mps_check()) {
 #ifdef CONFIG_X86_LOCAL_APIC
-#ifdef CONFIG_X86_32
-		enable_local_apic = -1;
-#else
 		disable_apic = 1;
-#endif
 #endif
 		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
 	}
diff --git a/include/asm-x86/apic.h b/include/asm-x86/apic.h
index 6ae6ae68d4cd..a29807737d3d 100644
--- a/include/asm-x86/apic.h
+++ b/include/asm-x86/apic.h
@@ -40,11 +40,7 @@ extern int local_apic_timer_c2_ok;
 
 extern int ioapic_force;
 
-#ifdef CONFIG_X86_64
 extern int disable_apic;
-#else
-extern int enable_local_apic;
-#endif
 /*
  * Basic functions accessing APICs.
  */
-- 
cgit v1.2.3


From 1a98fd14f44cfade4af3e6ed96ba55065fa17ee4 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Sun, 29 Jun 2008 20:02:44 -0700
Subject: x86: setup_arch() && early_ioremap_init()

Looks like the setup.c unification missed the early_ioremap init from
the early_ioremap unification.  Unconditionally call early_ioremap_init().

needed for "x86/paravirt: groundwork for 64-bit Xen support".

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Mark McLoughlin <markmc@redhat.com>
Cc: xen-devel <xen-devel@lists.xensource.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index fb318edd8bc6..caec79fb83a3 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -523,12 +523,13 @@ void __init setup_arch(char **cmdline_p)
 	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
 	pre_setup_arch_hook();
 	early_cpu_init();
-	early_ioremap_init();
 	reserve_setup_data();
 #else
 	printk(KERN_INFO "Command line: %s\n", boot_command_line);
 #endif
 
+	early_ioremap_init();
+
 	ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
 	screen_info = boot_params.screen_info;
 	edid_info = boot_params.edid_info;
-- 
cgit v1.2.3


From 102d0a4b56d94e9b7eedfdfb488400271235543f Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Mon, 30 Jun 2008 11:10:53 -0700
Subject: x86, paravirt, 64-bit: fix compile errors with IA32_EMULATION off

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/paravirt.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 27819e3e4245..e7e5652f65bc 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -337,9 +337,13 @@ struct pv_cpu_ops pv_cpu_ops = {
 	.write_idt_entry = native_write_idt_entry,
 	.load_sp0 = native_load_sp0,
 
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
 	.irq_enable_sysexit = native_irq_enable_sysexit,
+#endif
 #ifdef CONFIG_X86_64
+#ifdef CONFIG_IA32_EMULATION
 	.usergs_sysret32 = native_usergs_sysret32,
+#endif
 	.usergs_sysret64 = native_usergs_sysret64,
 #endif
 	.iret = native_iret,
-- 
cgit v1.2.3


From 28bb22379513ca3cac9d13766064a219c5fc21a9 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Mon, 30 Jun 2008 16:20:54 -0700
Subject: x86: move reserve_setup_data to setup.c

Ying Huang would like setup_data to be reserved, but not included in the
no save range.

Here we try to modify the e820 table to reserve that range early.
also add that in early_res in case bootloader messes up with the ramdisk.

other solution would be
1. add early_res_to_highmem...
2. early_res_to_e820...
but they could reserve another type memory wrongly, if early_res has some
resource reserved early, and not needed later, but it is not removed from
early_res in time. Like the RAMDISK (already handled).

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: andi@firstfloor.org
Tested-by: Huang, Ying <ying.huang@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c      |  4 +++-
 arch/x86/kernel/head.c      | 18 ------------------
 arch/x86/kernel/head64.c    |  1 -
 arch/x86/kernel/setup.c     | 34 ++++++++++++++++++++++++++++------
 include/asm-x86/bootparam.h |  2 --
 include/asm-x86/e820.h      |  3 +++
 6 files changed, 34 insertions(+), 28 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index ba5ac880ea1e..e03b89ac8f2b 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -120,6 +120,7 @@ void __init e820_print_map(char *who)
 		       (e820.map[i].addr + e820.map[i].size));
 		switch (e820.map[i].type) {
 		case E820_RAM:
+		case E820_RESERVED_KERN:
 			printk(KERN_CONT "(usable)\n");
 			break;
 		case E820_RESERVED:
@@ -611,7 +612,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
 			register_nosave_region(pfn, PFN_UP(ei->addr));
 
 		pfn = PFN_DOWN(ei->addr + ei->size);
-		if (ei->type != E820_RAM)
+		if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
 			register_nosave_region(PFN_UP(ei->addr), pfn);
 
 		if (pfn >= limit_pfn)
@@ -1207,6 +1208,7 @@ void __init e820_reserve_resources(void)
 	res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
 	for (i = 0; i < e820.nr_map; i++) {
 		switch (e820.map[i].type) {
+		case E820_RESERVED_KERN:
 		case E820_RAM:	res->name = "System RAM"; break;
 		case E820_ACPI:	res->name = "ACPI Tables"; break;
 		case E820_NVS:	res->name = "ACPI Non-volatile Storage"; break;
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index a6816be01cd6..3e66bd364a9d 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -53,21 +53,3 @@ void __init reserve_ebda_region(void)
 	/* reserve all memory between lowmem and the 1MB mark */
 	reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved");
 }
-
-void __init reserve_setup_data(void)
-{
-	struct setup_data *data;
-	u64 pa_data;
-	char buf[32];
-
-	if (boot_params.hdr.version < 0x0209)
-		return;
-	pa_data = boot_params.hdr.setup_data;
-	while (pa_data) {
-		data = early_ioremap(pa_data, sizeof(*data));
-		sprintf(buf, "setup data %x", data->type);
-		reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
-		pa_data = data->next;
-		early_iounmap(data, sizeof(*data));
-	}
-}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index f684e3b3de4e..c97819829146 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -128,7 +128,6 @@ void __init x86_64_start_reservations(char *real_mode_data)
 #endif
 
 	reserve_ebda_region();
-	reserve_setup_data();
 
 	/*
 	 * At this point everything still needed from the boot loader
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index caec79fb83a3..2ca12d4c88fb 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -389,14 +389,34 @@ static void __init parse_setup_data(void)
 		default:
 			break;
 		}
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
-		free_early(pa_data, pa_data+sizeof(*data)+data->len);
-#endif
 		pa_data = data->next;
 		early_iounmap(data, PAGE_SIZE);
 	}
 }
 
+static void __init reserve_setup_data(void)
+{
+	struct setup_data *data;
+	u64 pa_data;
+	char buf[32];
+
+	if (boot_params.hdr.version < 0x0209)
+		return;
+	pa_data = boot_params.hdr.setup_data;
+	while (pa_data) {
+		data = early_ioremap(pa_data, sizeof(*data));
+		sprintf(buf, "setup data %x", data->type);
+		reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
+		e820_update_range(pa_data, sizeof(*data)+data->len,
+			 E820_RAM, E820_RESERVED_KERN);
+		pa_data = data->next;
+		early_iounmap(data, sizeof(*data));
+	}
+	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+	printk(KERN_INFO "extended physical RAM map:\n");
+	e820_print_map("reserve setup_data");
+}
+
 /*
  * --------- Crashkernel reservation ------------------------------
  */
@@ -523,7 +543,6 @@ void __init setup_arch(char **cmdline_p)
 	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
 	pre_setup_arch_hook();
 	early_cpu_init();
-	reserve_setup_data();
 #else
 	printk(KERN_INFO "Command line: %s\n", boot_command_line);
 #endif
@@ -567,6 +586,8 @@ void __init setup_arch(char **cmdline_p)
 	ARCH_SETUP
 
 	setup_memory_map();
+	parse_setup_data();
+
 	copy_edd();
 
 	if (!boot_params.hdr.root_flags)
@@ -593,10 +614,11 @@ void __init setup_arch(char **cmdline_p)
 	strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
 	*cmdline_p = command_line;
 
-	parse_setup_data();
-
 	parse_early_param();
 
+	/* after early param, so could get panic from serial */
+	reserve_setup_data();
+
 	if (acpi_mps_check()) {
 #ifdef CONFIG_X86_LOCAL_APIC
 		disable_apic = 1;
diff --git a/include/asm-x86/bootparam.h b/include/asm-x86/bootparam.h
index 6eeba3b2812b..ae22bdf0ab14 100644
--- a/include/asm-x86/bootparam.h
+++ b/include/asm-x86/bootparam.h
@@ -108,6 +108,4 @@ struct boot_params {
 	__u8  _pad9[276];				/* 0xeec */
 } __attribute__((packed));
 
-void reserve_setup_data(void);
-
 #endif /* _ASM_BOOTPARAM_H */
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index f622685c9af8..45e904fe4076 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -44,6 +44,9 @@
 #define E820_ACPI	3
 #define E820_NVS	4
 
+/* reserved RAM used by kernel itself */
+#define E820_RESERVED_KERN        128
+
 #ifndef __ASSEMBLY__
 struct e820entry {
 	__u64 addr;	/* start of memory segment */
-- 
cgit v1.2.3


From cd5dce2fb023a6f0168344b7dd8adec30017458e Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Mon, 30 Jun 2008 16:04:48 -0700
Subject: x86: fix CPA self-test for "x86/paravirt: groundwork for 64-bit Xen
 support"

Ingo Molnar wrote:
> -tip auto-testing found pagetable corruption (CPA self-test failure):
>
> [   32.956015] CPA self-test:
> [   32.958822]  4k 2048 large 508 gb 0 x 2556[ffff880000000000-ffff88003fe00000] miss 0
> [   32.964000] CPA ffff88001d54e000: bad pte 1d4000e3
> [   32.968000] CPA ffff88001d54e000: unexpected level 2
> [   32.972000] CPA ffff880022c5d000: bad pte 22c000e3
> [   32.976000] CPA ffff880022c5d000: unexpected level 2
> [   32.980000] CPA ffff8800200ce000: bad pte 200000e3
> [   32.984000] CPA ffff8800200ce000: unexpected level 2
> [   32.988000] CPA ffff8800210f0000: bad pte 210000e3
>
> config and full log can be found at:
>
>  http://redhat.com/~mingo/misc/config-Mon_Jun_30_11_11_51_CEST_2008.bad
>  http://redhat.com/~mingo/misc/log-Mon_Jun_30_11_11_51_CEST_2008.bad

Phew.  OK, I've worked this out.  Short version is that's it's a false
alarm, and there was no real failure here.  Long version:

    * I changed the code to create the physical mapping pagetables to
      reuse any existing mapping rather than replace it.   Specifically,
      reusing an pud pointed to by the pgd caused this symptom to appear.
    * The specific PUD being reused is the one created statically in
      head_64.S, which creates an initial 1GB mapping.
    * That mapping doesn't have _PAGE_GLOBAL set on it, due to the
      inconsistency between __PAGE_* and PAGE_*.
    * The CPA test attempts to clear _PAGE_GLOBAL, and then checks to
      see that the resulting range is 1) shattered into 4k pages, and 2)
      has no _PAGE_GLOBAL.
    * However, since it didn't have _PAGE_GLOBAL on that range to start
      with, change_page_attr_clear() had nothing to do, and didn't
      bother shattering the range,
    * resulting in the reported messages

The simple fix is to set _PAGE_GLOBAL in level2_ident_pgt.

An additional fix to make CPA testing more robust by using some other
pagetable bit (one of the unused available-to-software ones).  This
would solve spurious CPA test warnings under Xen which uses _PAGE_GLOBAL
for its own purposes (ie, not under guest control).

Also, we should revisit the use of _PAGE_GLOBAL in asm-x86/pgtable.h,
and use it consistently, and drop MAKE_GLOBAL.  The first time I
proposed it it caused breakages in the very early CPA code; with luck
that's all fixed now.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Mark McLoughlin <markmc@redhat.com>
Cc: xen-devel <xen-devel@lists.xensource.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/head_64.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 38279fab093d..4336f98456ea 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -374,7 +374,7 @@ NEXT_PAGE(level2_ident_pgt)
 	/* Since I easily can, map the first 1G.
 	 * Don't set NX because code runs from these pages.
 	 */
-	PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
+	PMDS(0, __PAGE_KERNEL_LARGE_EXEC | _PAGE_GLOBAL, PTRS_PER_PMD)
 
 NEXT_PAGE(level2_kernel_pgt)
 	/*
-- 
cgit v1.2.3


From 6a2f47ca27fad36f99e8478a3807d4b8c7db80e7 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Fri, 27 Jun 2008 10:10:13 -0700
Subject: x86: add check for node passed to node_to_cpumask, v3

  * When CONFIG_DEBUG_PER_CPU_MAPS is set, the node passed to
    node_to_cpumask and node_to_cpumask_ptr should be validated.
    If invalid, then a dump_stack is performed and a zero cpumask
    is returned.

v2: Slightly different version to remove a compiler warning.
v3: Redone to reflect moving setup.c -> setup_percpu.c

Signed-off-by: Mike Travis <travis@sgi.com>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
Cc: "akpm@linux-foundation.org" <akpm@linux-foundation.org>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_percpu.c | 26 +++++++++++++++++++++++---
 include/asm-x86/topology.h     |  7 ++++++-
 2 files changed, 29 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 7068f95cccc6..43aca2de624a 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -346,6 +346,10 @@ int early_cpu_to_node(int cpu)
 	return per_cpu(x86_cpu_to_node_map, cpu);
 }
 
+
+/* empty cpumask */
+static const cpumask_t cpu_mask_none;
+
 /*
  * Returns a pointer to the bitmask of CPUs on Node 'node'.
  */
@@ -358,13 +362,23 @@ cpumask_t *_node_to_cpumask_ptr(int node)
 		dump_stack();
 		return &cpu_online_map;
 	}
-	BUG_ON(node >= nr_node_ids);
-	return &node_to_cpumask_map[node];
+	if (node >= nr_node_ids) {
+		printk(KERN_WARNING
+			"_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n",
+			node, nr_node_ids);
+		dump_stack();
+		return (cpumask_t *)&cpu_mask_none;
+	}
+	return (cpumask_t *)&node_to_cpumask_map[node];
 }
 EXPORT_SYMBOL(_node_to_cpumask_ptr);
 
 /*
  * Returns a bitmask of CPUs on Node 'node'.
+ *
+ * Side note: this function creates the returned cpumask on the stack
+ * so with a high NR_CPUS count, excessive stack space is used.  The
+ * node_to_cpumask_ptr function should be used whenever possible.
  */
 cpumask_t node_to_cpumask(int node)
 {
@@ -374,7 +388,13 @@ cpumask_t node_to_cpumask(int node)
 		dump_stack();
 		return cpu_online_map;
 	}
-	BUG_ON(node >= nr_node_ids);
+	if (node >= nr_node_ids) {
+		printk(KERN_WARNING
+			"node_to_cpumask(%d): node > nr_node_ids(%d)\n",
+			node, nr_node_ids);
+		dump_stack();
+		return cpu_mask_none;
+	}
 	return node_to_cpumask_map[node];
 }
 EXPORT_SYMBOL(node_to_cpumask);
diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h
index 1f97758de4ab..98e5f17ea856 100644
--- a/include/asm-x86/topology.h
+++ b/include/asm-x86/topology.h
@@ -57,7 +57,12 @@ static inline int cpu_to_node(int cpu)
 }
 #define early_cpu_to_node(cpu)	cpu_to_node(cpu)
 
-/* Returns a bitmask of CPUs on Node 'node'. */
+/* Returns a bitmask of CPUs on Node 'node'.
+ *
+ * Side note: this function creates the returned cpumask on the stack
+ * so with a high NR_CPUS count, excessive stack space is used.  The
+ * node_to_cpumask_ptr function should be used whenever possible.
+ */
 static inline cpumask_t node_to_cpumask(int node)
 {
 	return node_to_cpumask_map[node];
-- 
cgit v1.2.3


From fd6493e16625b92a506fba13deda31c0be5f1cd4 Mon Sep 17 00:00:00 2001
From: Alok Kataria <akataria@vmware.com>
Date: Wed, 25 Jun 2008 11:02:42 -0700
Subject: x86: cleanup e820_setup_gap(), v2

e820_search_gap also take a end_addr parameter to limit search from
start_addr to end_addr.

Signed-off-by: AloK N Kataria <akataria@vmware.com>
Acked-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: "lenb@kernel.org" <lenb@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 12 +++++++-----
 include/asm-x86/e820.h |  2 +-
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index e03b89ac8f2b..d3f6c5f6d3b1 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -487,17 +487,19 @@ void __init update_e820(void)
 	printk(KERN_INFO "modified physical RAM map:\n");
 	e820_print_map("modified");
 }
-
+#define MAX_GAP_END 0x100000000ull
 /*
- * Search for a gap in the e820 memory space from start_addr to 2^32.
+ * Search for a gap in the e820 memory space from start_addr to end_addr.
  */
 __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
-		unsigned long start_addr)
+		unsigned long start_addr, unsigned long long end_addr)
 {
-	unsigned long long last = 0x100000000ull;
+	unsigned long long last;
 	int i = e820.nr_map;
 	int found = 0;
 
+	last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
+
 	while (--i >= 0) {
 		unsigned long long start = e820.map[i].addr;
 		unsigned long long end = start + e820.map[i].size;
@@ -537,7 +539,7 @@ __init void e820_setup_gap(void)
 
 	gapstart = 0x10000000;
 	gapsize = 0x400000;
-	found  = e820_search_gap(&gapstart, &gapsize, 0);
+	found  = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END);
 
 #ifdef CONFIG_X86_64
 	if (!found) {
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index 45e904fe4076..05a7dc33334c 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -75,7 +75,7 @@ extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type,
 extern void update_e820(void);
 extern void e820_setup_gap(void);
 extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
-				unsigned long start_addr);
+			unsigned long start_addr, unsigned long long end_addr);
 struct setup_data;
 extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data);
 
-- 
cgit v1.2.3


From 32105f7fd8faa7bc3d101dcc3eabc0ae1ac375a7 Mon Sep 17 00:00:00 2001
From: Bernhard Walle <bwalle@suse.de>
Date: Thu, 26 Jun 2008 21:54:08 +0200
Subject: x86: find offset for crashkernel reservation automatically

This patch removes the need of the crashkernel=...@offset parameter to define
a fixed offset for crashkernel reservation. That feature can be used together
with a relocatable kernel where the kexec-tools relocate the kernel and
get the actual offset from /proc/iomem.

The use case is a kernel where the .text+.data+.bss is after 16M physical
memory (debug kernel with lockdep on x86_64 can cause that) which caused a
major pain in autoconfiguration in our distribution.

Also, that patch unifies crashdump architectures a bit since IA64 has
that semantics from the very beginning of the kdump port.

Signed-off-by: Bernhard Walle <bwalle@suse.de>
Cc: vgoyal@redhat.com
Cc: Bernhard Walle <bwalle@suse.de>
Cc: kexec@lists.infradead.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c | 70 ++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 52 insertions(+), 18 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 2ca12d4c88fb..b3469898717e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -422,6 +422,34 @@ static void __init reserve_setup_data(void)
  */
 
 #ifdef CONFIG_KEXEC
+
+/**
+ * Reserve @size bytes of crashkernel memory at any suitable offset.
+ *
+ * @size: Size of the crashkernel memory to reserve.
+ * Returns the base address on success, and -1ULL on failure.
+ */
+unsigned long long find_and_reserve_crashkernel(unsigned long long size)
+{
+	const unsigned long long alignment = 16<<20; 	/* 16M */
+	unsigned long long start = 0LL;
+
+	while (1) {
+		int ret;
+
+		start = find_e820_area(start, ULONG_MAX, size, alignment);
+		if (start == -1ULL)
+			return start;
+
+		/* try to reserve it */
+		ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
+		if (ret >= 0)
+			return start;
+
+		start += alignment;
+	}
+}
+
 static inline unsigned long long get_total_mem(void)
 {
 	unsigned long long total;
@@ -444,30 +472,36 @@ static void __init reserve_crashkernel(void)
 
 	ret = parse_crashkernel(boot_command_line, total_mem,
 			&crash_size, &crash_base);
-	if (ret == 0 && crash_size > 0) {
-		if (crash_base <= 0) {
-			printk(KERN_INFO "crashkernel reservation failed - "
-					"you have to specify a base address\n");
+	if (ret != 0 || crash_size <= 0)
+		return;
+
+	/* 0 means: find the address automatically */
+	if (crash_base <= 0) {
+		crash_base = find_and_reserve_crashkernel(crash_size);
+		if (crash_base == -1ULL) {
+			pr_info("crashkernel reservation failed. "
+				"No suitable area found.\n");
 			return;
 		}
-
-		if (reserve_bootmem_generic(crash_base, crash_size,
-					BOOTMEM_EXCLUSIVE) < 0) {
-			printk(KERN_INFO "crashkernel reservation failed - "
-					"memory is in use\n");
+	} else {
+		ret = reserve_bootmem_generic(crash_base, crash_size,
+					BOOTMEM_EXCLUSIVE);
+		if (ret < 0) {
+			pr_info("crashkernel reservation failed - "
+				"memory is in use\n");
 			return;
 		}
+	}
 
-		printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
-				"for crashkernel (System RAM: %ldMB)\n",
-				(unsigned long)(crash_size >> 20),
-				(unsigned long)(crash_base >> 20),
-				(unsigned long)(total_mem >> 20));
+	printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
+			"for crashkernel (System RAM: %ldMB)\n",
+			(unsigned long)(crash_size >> 20),
+			(unsigned long)(crash_base >> 20),
+			(unsigned long)(total_mem >> 20));
 
-		crashk_res.start = crash_base;
-		crashk_res.end   = crash_base + crash_size - 1;
-		insert_resource(&iomem_resource, &crashk_res);
-	}
+	crashk_res.start = crash_base;
+	crashk_res.end   = crash_base + crash_size - 1;
+	insert_resource(&iomem_resource, &crashk_res);
 }
 #else
 static void __init reserve_crashkernel(void)
-- 
cgit v1.2.3


From dc8e8120ad291074a5fb93cfb0418466c62f6019 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 1 Jul 2008 20:02:16 -0700
Subject: x86: change copy_e820_map to append_e820_map

so it has a more meaningful name.
also change it to static.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 11 ++++++-----
 include/asm-x86/e820.h |  1 -
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index d3f6c5f6d3b1..b01fa0d0dc7c 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -360,7 +360,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
 	return 0;
 }
 
-static int __init __copy_e820_map(struct e820entry *biosmap, int nr_map)
+static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
 {
 	while (nr_map) {
 		u64 start = biosmap->addr;
@@ -389,13 +389,13 @@ static int __init __copy_e820_map(struct e820entry *biosmap, int nr_map)
  * will have given us a memory map that we can use to properly
  * set up memory.  If we aren't, we'll fake a memory map.
  */
-int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
+static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
 {
 	/* Only one memory region (or negative)? Ignore it */
 	if (nr_map < 2)
 		return -1;
 
-	return __copy_e820_map(biosmap, nr_map);
+	return __append_e820_map(biosmap, nr_map);
 }
 
 u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
@@ -583,7 +583,7 @@ void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data)
 	if (map_len > PAGE_SIZE)
 		sdata = early_ioremap(pa_data, map_len);
 	extmap = (struct e820entry *)(sdata->data);
-	__copy_e820_map(extmap, entries);
+	__append_e820_map(extmap, entries);
 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
 	if (map_len > PAGE_SIZE)
 		early_iounmap(sdata, map_len);
@@ -1247,7 +1247,8 @@ char *__init default_machine_specific_memory_setup(void)
 			ARRAY_SIZE(boot_params.e820_map),
 			&new_nr);
 	boot_params.e820_entries = new_nr;
-	if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) {
+	if (append_e820_map(boot_params.e820_map, boot_params.e820_entries)
+	  < 0) {
 		u64 mem_size;
 
 		/* compare results from other methods and take the greater */
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index 05a7dc33334c..4b26604b3c19 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -67,7 +67,6 @@ extern void e820_add_region(u64 start, u64 size, int type);
 extern void e820_print_map(char *who);
 extern int
 sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, int *pnr_map);
-extern int copy_e820_map(struct e820entry *biosmap, int nr_map);
 extern u64 e820_update_range(u64 start, u64 size, unsigned old_type,
 			       unsigned new_type);
 extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type,
-- 
cgit v1.2.3


From 4fcc545a7479135332f511a54611820c9f4208a0 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 1 Jul 2008 20:03:11 -0700
Subject: x86: make early_res_to_bootmem print out less 80 width chars

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index b01fa0d0dc7c..d0335853ff52 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -841,7 +841,7 @@ void __init early_res_to_bootmem(u64 start, u64 end)
 	printk(KERN_INFO "(%d early reservations) ==> bootmem\n", count);
 	for (i = 0; i < count; i++) {
 		struct early_res *r = &early_res[i];
-		printk(KERN_INFO "  #%d [ %010llx - %010llx ] %16s", i,
+		printk(KERN_INFO "  #%d [%010llx - %010llx] %16s", i,
 			r->start, r->end, r->name);
 		final_start = max(start, r->start);
 		final_end = min(end, r->end);
@@ -849,7 +849,7 @@ void __init early_res_to_bootmem(u64 start, u64 end)
 			printk(KERN_CONT "\n");
 			continue;
 		}
-		printk(KERN_CONT " ===> [ %010llx - %010llx ]\n",
+		printk(KERN_CONT " ==> [%010llx - %010llx]\n",
 			final_start, final_end);
 		reserve_bootmem_generic(final_start, final_end - final_start,
 				BOOTMEM_DEFAULT);
-- 
cgit v1.2.3


From d9a81b4411d53196c4535c3a1258cb03d945c718 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 1 Jul 2008 20:04:10 -0700
Subject: x86: do not printout if we do not find setup_data

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b3469898717e..4ac01d0ce624 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -399,6 +399,7 @@ static void __init reserve_setup_data(void)
 	struct setup_data *data;
 	u64 pa_data;
 	char buf[32];
+	int found = 0;
 
 	if (boot_params.hdr.version < 0x0209)
 		return;
@@ -409,9 +410,13 @@ static void __init reserve_setup_data(void)
 		reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
 		e820_update_range(pa_data, sizeof(*data)+data->len,
 			 E820_RAM, E820_RESERVED_KERN);
+		found = 1;
 		pa_data = data->next;
 		early_iounmap(data, sizeof(*data));
 	}
+	if (!found)
+		return;
+
 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
 	printk(KERN_INFO "extended physical RAM map:\n");
 	e820_print_map("reserve setup_data");
-- 
cgit v1.2.3


From cb95a13a8ace8612ecab042a838e5aab2ec14ef0 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 2 Jul 2008 00:31:02 -0700
Subject: x86: merge zones_sizes_init for numa and non numa on 32-bit

move out e820_register_active_regions from non numa zones_sizes_init()
and remove numa version zones_sizes_init().

and let 32 bit call remove_all_active_ranges() in setup_arch() directly
like 64-bit

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c    |  2 --
 arch/x86/mm/discontig_32.c | 16 ----------------
 arch/x86/mm/init_32.c      | 10 ++++------
 include/asm-x86/page_32.h  |  1 -
 4 files changed, 4 insertions(+), 25 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4ac01d0ce624..d5de157b02ae 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -749,10 +749,8 @@ void __init setup_arch(char **cmdline_p)
 	 */
 	acpi_boot_table_init();
 
-#ifdef CONFIG_X86_64
 	/* Remove active ranges so rediscovery with NUMA-awareness happens */
 	remove_all_active_ranges();
-#endif
 
 #ifdef CONFIG_ACPI_NUMA
 	/*
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index e2f0dd13a451..fa389d20383e 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -327,7 +327,6 @@ void __init initmem_init(unsigned long start_pfn,
 	 * and ZONE_HIGHMEM.
 	 */
 
-	remove_all_active_ranges();
 	get_memcfg_numa();
 
 	kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE);
@@ -390,21 +389,6 @@ void __init initmem_init(unsigned long start_pfn,
 	setup_bootmem_allocator();
 }
 
-void __init zone_sizes_init(void)
-{
-	unsigned long max_zone_pfns[MAX_NR_ZONES];
-	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-	max_zone_pfns[ZONE_DMA] =
-		virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-#ifdef CONFIG_HIGHMEM
-	max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
-#endif
-
-	free_area_init_nodes(max_zone_pfns);
-	return;
-}
-
 void __init set_highmem_pages_init(void)
 {
 #ifdef CONFIG_HIGHMEM
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index aa5e37c9f4b4..8efe872b9617 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -660,12 +660,14 @@ void __init initmem_init(unsigned long start_pfn,
 	if (max_pfn > max_low_pfn)
 		highstart_pfn = max_low_pfn;
 	memory_present(0, 0, highend_pfn);
+	e820_register_active_regions(0, 0, highend_pfn);
 	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
 		pages_to_mb(highend_pfn - highstart_pfn));
 	num_physpages = highend_pfn;
 	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
 #else
 	memory_present(0, 0, max_low_pfn);
+	e820_register_active_regions(0, 0, max_low_pfn);
 	num_physpages = max_low_pfn;
 	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
 #endif
@@ -677,25 +679,21 @@ void __init initmem_init(unsigned long start_pfn,
 
 	setup_bootmem_allocator();
 }
+#endif /* !CONFIG_NEED_MULTIPLE_NODES */
 
-void __init zone_sizes_init(void)
+static void __init zone_sizes_init(void)
 {
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 	max_zone_pfns[ZONE_DMA] =
 		virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
 	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-	remove_all_active_ranges();
 #ifdef CONFIG_HIGHMEM
 	max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
-	e820_register_active_regions(0, 0, highend_pfn);
-#else
-	e820_register_active_regions(0, 0, max_low_pfn);
 #endif
 
 	free_area_init_nodes(max_zone_pfns);
 }
-#endif /* !CONFIG_NEED_MULTIPLE_NODES */
 
 void __init setup_bootmem_allocator(void)
 {
diff --git a/include/asm-x86/page_32.h b/include/asm-x86/page_32.h
index 4ae1daba129b..ab8528793f08 100644
--- a/include/asm-x86/page_32.h
+++ b/include/asm-x86/page_32.h
@@ -96,7 +96,6 @@ extern void find_low_pfn_range(void);
 extern unsigned long init_memory_mapping(unsigned long start,
 					 unsigned long end);
 extern void initmem_init(unsigned long, unsigned long);
-extern void zone_sizes_init(void);
 extern void setup_bootmem_allocator(void);
 
 
-- 
cgit v1.2.3


From 5f4765f96eebee6a0adc4009758b597ba48a0a3a Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 2 Jul 2008 18:53:44 -0700
Subject: x86: move init_cpu_to_node after get_smp_config

when acpi=off, cpu_to_apicid is ready after get_smp_config
so need to move init_cpu_to_node after it.

otherwise, we will get wrong cpu->node mapping, and it will rely on
amd_detect_cmp() to correct it - but that is too late as
setup_per_cpu_data is already called before that so  we will get
per_cpu_data on the wrong node.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d5de157b02ae..f52a6fb90238 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -810,10 +810,6 @@ void __init setup_arch(char **cmdline_p)
 	 */
 	acpi_boot_init();
 
-#ifdef CONFIG_X86_64
-	init_cpu_to_node();
-#endif
-
 #if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
 	/*
 	 * get boot-time SMP configuration:
@@ -822,6 +818,10 @@ void __init setup_arch(char **cmdline_p)
 		get_smp_config();
 #endif
 
+#ifdef CONFIG_X86_64
+	init_cpu_to_node();
+#endif
+
 	init_apic_mappings();
 	ioapic_init_mappings();
 
-- 
cgit v1.2.3


From 329513a35d1a2b6b28d54f5c2c0dde4face8200b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 2 Jul 2008 18:54:40 -0700
Subject: x86: move prefill_possible_map calling early

call it right after we are done with MADT/mptable handling, instead of
doing that in setup_per_cpu_areas() later on...

this way for_possible_cpu() can be used early.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c        |  1 +
 arch/x86/kernel/setup_percpu.c | 10 ----------
 arch/x86/kernel/smpboot.c      |  8 ++++++++
 include/asm-x86/smp.h          |  4 ++++
 4 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f52a6fb90238..cfcfbefee0b9 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -818,6 +818,7 @@ void __init setup_arch(char **cmdline_p)
 		get_smp_config();
 #endif
 
+	prefill_possible_map();
 #ifdef CONFIG_X86_64
 	init_cpu_to_node();
 #endif
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 43aca2de624a..5fc310f746fc 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -162,16 +162,6 @@ void __init setup_per_cpu_areas(void)
 	char *ptr;
 	int cpu;
 
-	/* no processor from mptable or madt */
-	if (!num_processors)
-		num_processors = 1;
-
-#ifdef CONFIG_HOTPLUG_CPU
-	prefill_possible_map();
-#else
-	nr_cpu_ids = num_processors;
-#endif
-
 	/* Setup cpu_pda map */
 	setup_cpu_pda_map();
 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 3b19441d78b8..e1200b202ed7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1278,12 +1278,20 @@ __init void prefill_possible_map(void)
 	int i;
 	int possible;
 
+	/* no processor from mptable or madt */
+	if (!num_processors)
+		num_processors = 1;
+
+#ifdef CONFIG_HOTPLUG_CPU
 	if (additional_cpus == -1) {
 		if (disabled_cpus > 0)
 			additional_cpus = disabled_cpus;
 		else
 			additional_cpus = 0;
 	}
+#else
+	additional_cpus = 0;
+#endif
 	possible = num_processors + additional_cpus;
 	if (possible > NR_CPUS)
 		possible = NR_CPUS;
diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h
index fad45f6a193f..b324a0645a78 100644
--- a/include/asm-x86/smp.h
+++ b/include/asm-x86/smp.h
@@ -119,6 +119,10 @@ static inline int num_booting_cpus(void)
 {
 	return cpus_weight(cpu_callout_map);
 }
+#else
+static inline void prefill_possible_map(void)
+{
+}
 #endif /* CONFIG_SMP */
 
 extern unsigned disabled_cpus __cpuinitdata;
-- 
cgit v1.2.3


From aea5f9f89bae5d5f9eb3fe3cddedbbfb82e6e44f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 4 Jul 2008 12:16:55 +0200
Subject: x86: fix "x86: let setup_arch call init_apic_mappings for 32bit"

add back this line lost from trap_init():

        set_trap_gate(0,  &divide_error);

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/traps_32.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index f60feee83253..d7cc292691ff 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -1198,6 +1198,7 @@ void __init trap_init(void)
 	early_iounmap(p, 4);
 #endif
 
+	set_trap_gate(0,  &divide_error);
 	set_intr_gate(1,  &debug);
 	set_intr_gate(2,  &nmi);
 	set_system_intr_gate(3, &int3); /* int3/4 can be called from all */
-- 
cgit v1.2.3


From 8490638cf0fb3975f7636c5268f27d5daf4eaaa5 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Tue, 1 Jul 2008 16:46:35 -0700
Subject: x86: always set _PAGE_GLOBAL in _PAGE_KERNEL* flags

Consistently set _PAGE_GLOBAL in _PAGE_KERNEL flags.  This makes 32-
and 64-bit code consistent, and removes some special cases where
__PAGE_KERNEL* did not have _PAGE_GLOBAL set, causing confusion as a
result of the inconsistencies.

This patch only affects x86-64, which generally always supports PGD.
The x86-32 patch is next.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/head_64.S |  4 ++--
 include/asm-x86/pgtable.h | 32 +++++++++++++-------------------
 2 files changed, 15 insertions(+), 21 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 4336f98456ea..b07ac7b217cb 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -374,7 +374,7 @@ NEXT_PAGE(level2_ident_pgt)
 	/* Since I easily can, map the first 1G.
 	 * Don't set NX because code runs from these pages.
 	 */
-	PMDS(0, __PAGE_KERNEL_LARGE_EXEC | _PAGE_GLOBAL, PTRS_PER_PMD)
+	PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
 
 NEXT_PAGE(level2_kernel_pgt)
 	/*
@@ -387,7 +387,7 @@ NEXT_PAGE(level2_kernel_pgt)
 	 *  If you want to increase this then increase MODULES_VADDR
 	 *  too.)
 	 */
-	PMDS(0, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL,
+	PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
 		KERNEL_IMAGE_SIZE/PMD_SIZE)
 
 NEXT_PAGE(level2_spare_pgt)
diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h
index 85f851a0b998..a1c0009e81b7 100644
--- a/include/asm-x86/pgtable.h
+++ b/include/asm-x86/pgtable.h
@@ -88,7 +88,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
 #endif	/* __ASSEMBLY__ */
 #else
 #define __PAGE_KERNEL_EXEC						\
-	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
+	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL)
 #define __PAGE_KERNEL		(__PAGE_KERNEL_EXEC | _PAGE_NX)
 #endif
 
@@ -103,24 +103,18 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
 #define __PAGE_KERNEL_LARGE		(__PAGE_KERNEL | _PAGE_PSE)
 #define __PAGE_KERNEL_LARGE_EXEC	(__PAGE_KERNEL_EXEC | _PAGE_PSE)
 
-#ifdef CONFIG_X86_32
-# define MAKE_GLOBAL(x)			__pgprot((x))
-#else
-# define MAKE_GLOBAL(x)			__pgprot((x) | _PAGE_GLOBAL)
-#endif
-
-#define PAGE_KERNEL			MAKE_GLOBAL(__PAGE_KERNEL)
-#define PAGE_KERNEL_RO			MAKE_GLOBAL(__PAGE_KERNEL_RO)
-#define PAGE_KERNEL_EXEC		MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
-#define PAGE_KERNEL_RX			MAKE_GLOBAL(__PAGE_KERNEL_RX)
-#define PAGE_KERNEL_WC			MAKE_GLOBAL(__PAGE_KERNEL_WC)
-#define PAGE_KERNEL_NOCACHE		MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
-#define PAGE_KERNEL_UC_MINUS		MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
-#define PAGE_KERNEL_EXEC_NOCACHE	MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
-#define PAGE_KERNEL_LARGE		MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
-#define PAGE_KERNEL_LARGE_EXEC		MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)
-#define PAGE_KERNEL_VSYSCALL		MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
-#define PAGE_KERNEL_VSYSCALL_NOCACHE	MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
+#define PAGE_KERNEL			__pgprot(__PAGE_KERNEL)
+#define PAGE_KERNEL_RO			__pgprot(__PAGE_KERNEL_RO)
+#define PAGE_KERNEL_EXEC		__pgprot(__PAGE_KERNEL_EXEC)
+#define PAGE_KERNEL_RX			__pgprot(__PAGE_KERNEL_RX)
+#define PAGE_KERNEL_WC			__pgprot(__PAGE_KERNEL_WC)
+#define PAGE_KERNEL_NOCACHE		__pgprot(__PAGE_KERNEL_NOCACHE)
+#define PAGE_KERNEL_UC_MINUS		__pgprot(__PAGE_KERNEL_UC_MINUS)
+#define PAGE_KERNEL_EXEC_NOCACHE	__pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
+#define PAGE_KERNEL_LARGE		__pgprot(__PAGE_KERNEL_LARGE)
+#define PAGE_KERNEL_LARGE_EXEC		__pgprot(__PAGE_KERNEL_LARGE_EXEC)
+#define PAGE_KERNEL_VSYSCALL		__pgprot(__PAGE_KERNEL_VSYSCALL)
+#define PAGE_KERNEL_VSYSCALL_NOCACHE	__pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
 
 /*         xwr */
 #define __P000	PAGE_NONE
-- 
cgit v1.2.3


From 6247943d8ab699b57653afd453a4940cca70ef8a Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 6 Jul 2008 11:42:11 -0700
Subject: x86: remove acpi_srat config v2

use ACPI_NUMA directly

and move srat_32.c to mm/

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig          |   5 -
 arch/x86/kernel/Makefile  |   1 -
 arch/x86/kernel/srat_32.c | 280 ----------------------------------------------
 arch/x86/mm/Makefile      |   3 +-
 arch/x86/mm/srat_32.c     | 280 ++++++++++++++++++++++++++++++++++++++++++++++
 include/asm-x86/srat.h    |   2 +-
 6 files changed, 283 insertions(+), 288 deletions(-)
 delete mode 100644 arch/x86/kernel/srat_32.c
 create mode 100644 arch/x86/mm/srat_32.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d8dddbc5e349..bb0c0d0f6db7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -455,11 +455,6 @@ config MEMTEST
 		memtest=4, mean do 4 test patterns.
 	  If you are unsure how to answer this question, answer Y.
 
-config ACPI_SRAT
-	def_bool y
-	depends on X86_32 && ACPI && NUMA && X86_GENERICARCH
-	select ACPI_NUMA
-
 config X86_SUMMIT_NUMA
 	def_bool y
 	depends on X86_32 && NUMA && X86_GENERICARCH
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 212804e78787..54829e2b5160 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -64,7 +64,6 @@ obj-$(CONFIG_X86_SUMMIT_NUMA)	+= summit_32.o
 obj-y				+= vsmp_64.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
 obj-$(CONFIG_MODULES)		+= module_$(BITS).o
-obj-$(CONFIG_ACPI_SRAT) 	+= srat_32.o
 obj-$(CONFIG_EFI) 		+= efi.o efi_$(BITS).o efi_stub_$(BITS).o
 obj-$(CONFIG_DOUBLEFAULT) 	+= doublefault_32.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c
deleted file mode 100644
index f41d67f8f831..000000000000
--- a/arch/x86/kernel/srat_32.c
+++ /dev/null
@@ -1,280 +0,0 @@
-/*
- * Some of the code in this file has been gleaned from the 64 bit 
- * discontigmem support code base.
- *
- * Copyright (C) 2002, IBM Corp.
- *
- * All rights reserved.          
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to Pat Gaughen <gone@us.ibm.com>
- */
-#include <linux/mm.h>
-#include <linux/bootmem.h>
-#include <linux/mmzone.h>
-#include <linux/acpi.h>
-#include <linux/nodemask.h>
-#include <asm/srat.h>
-#include <asm/topology.h>
-#include <asm/smp.h>
-#include <asm/e820.h>
-
-/*
- * proximity macros and definitions
- */
-#define NODE_ARRAY_INDEX(x)	((x) / 8)	/* 8 bits/char */
-#define NODE_ARRAY_OFFSET(x)	((x) % 8)	/* 8 bits/char */
-#define BMAP_SET(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
-#define BMAP_TEST(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
-/* bitmap length; _PXM is at most 255 */
-#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) 
-static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN];	/* bitmap of proximity domains */
-
-#define MAX_CHUNKS_PER_NODE	3
-#define MAXCHUNKS		(MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
-struct node_memory_chunk_s {
-	unsigned long	start_pfn;
-	unsigned long	end_pfn;
-	u8	pxm;		// proximity domain of node
-	u8	nid;		// which cnode contains this chunk?
-	u8	bank;		// which mem bank on this node
-};
-static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
-
-static int __initdata num_memory_chunks; /* total number of memory chunks */
-static u8 __initdata apicid_to_pxm[MAX_APICID];
-
-int numa_off __initdata;
-int acpi_numa __initdata;
-
-static __init void bad_srat(void)
-{
-        printk(KERN_ERR "SRAT: SRAT not used.\n");
-        acpi_numa = -1;
-	num_memory_chunks = 0;
-}
-
-static __init inline int srat_disabled(void)
-{
-	return numa_off || acpi_numa < 0;
-}
-
-/* Identify CPU proximity domains */
-void __init
-acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
-{
-	if (srat_disabled())
-		return;
-	if (cpu_affinity->header.length !=
-	     sizeof(struct acpi_srat_cpu_affinity)) {
-		bad_srat();
-		return;
-	}
-
-	if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
-		return;		/* empty entry */
-
-	/* mark this node as "seen" in node bitmap */
-	BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
-
-	apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
-
-	printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
-		cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
-}
-
-/*
- * Identify memory proximity domains and hot-remove capabilities.
- * Fill node memory chunk list structure.
- */
-void __init
-acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
-{
-	unsigned long long paddr, size;
-	unsigned long start_pfn, end_pfn;
-	u8 pxm;
-	struct node_memory_chunk_s *p, *q, *pend;
-
-	if (srat_disabled())
-		return;
-	if (memory_affinity->header.length !=
-	     sizeof(struct acpi_srat_mem_affinity)) {
-		bad_srat();
-		return;
-	}
-
-	if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
-		return;		/* empty entry */
-
-	pxm = memory_affinity->proximity_domain & 0xff;
-
-	/* mark this node as "seen" in node bitmap */
-	BMAP_SET(pxm_bitmap, pxm);
-
-	/* calculate info for memory chunk structure */
-	paddr = memory_affinity->base_address;
-	size = memory_affinity->length;
-
-	start_pfn = paddr >> PAGE_SHIFT;
-	end_pfn = (paddr + size) >> PAGE_SHIFT;
-
-
-	if (num_memory_chunks >= MAXCHUNKS) {
-		printk(KERN_WARNING "Too many mem chunks in SRAT."
-			" Ignoring %lld MBytes at %llx\n",
-			size/(1024*1024), paddr);
-		return;
-	}
-
-	/* Insertion sort based on base address */
-	pend = &node_memory_chunk[num_memory_chunks];
-	for (p = &node_memory_chunk[0]; p < pend; p++) {
-		if (start_pfn < p->start_pfn)
-			break;
-	}
-	if (p < pend) {
-		for (q = pend; q >= p; q--)
-			*(q + 1) = *q;
-	}
-	p->start_pfn = start_pfn;
-	p->end_pfn = end_pfn;
-	p->pxm = pxm;
-
-	num_memory_chunks++;
-
-	printk(KERN_DEBUG "Memory range %08lx to %08lx (type %x)"
-			  " in proximity domain %02x %s\n",
-		start_pfn, end_pfn,
-		memory_affinity->memory_type,
-		pxm,
-		((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
-		 "enabled and removable" : "enabled" ) );
-}
-
-/* Callback for SLIT parsing */
-void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
-{
-}
-
-void acpi_numa_arch_fixup(void)
-{
-}
-/*
- * The SRAT table always lists ascending addresses, so can always
- * assume that the first "start" address that you see is the real
- * start of the node, and that the current "end" address is after
- * the previous one.
- */
-static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
-{
-	/*
-	 * Only add present memory as told by the e820.
-	 * There is no guarantee from the SRAT that the memory it
-	 * enumerates is present at boot time because it represents
-	 * *possible* memory hotplug areas the same as normal RAM.
-	 */
-	if (memory_chunk->start_pfn >= max_pfn) {
-		printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
-			memory_chunk->start_pfn, memory_chunk->end_pfn);
-		return;
-	}
-	if (memory_chunk->nid != nid)
-		return;
-
-	if (!node_has_online_mem(nid))
-		node_start_pfn[nid] = memory_chunk->start_pfn;
-
-	if (node_start_pfn[nid] > memory_chunk->start_pfn)
-		node_start_pfn[nid] = memory_chunk->start_pfn;
-
-	if (node_end_pfn[nid] < memory_chunk->end_pfn)
-		node_end_pfn[nid] = memory_chunk->end_pfn;
-}
-
-int __init get_memcfg_from_srat(void)
-{
-	int i, j, nid;
-
-
-	if (srat_disabled())
-		goto out_fail;
-
-	if (num_memory_chunks == 0) {
-		printk(KERN_WARNING
-			 "could not finy any ACPI SRAT memory areas.\n");
-		goto out_fail;
-	}
-
-	/* Calculate total number of nodes in system from PXM bitmap and create
-	 * a set of sequential node IDs starting at zero.  (ACPI doesn't seem
-	 * to specify the range of _PXM values.)
-	 */
-	/*
-	 * MCD - we no longer HAVE to number nodes sequentially.  PXM domain
-	 * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
-	 * 32, so we will continue numbering them in this manner until MAX_NUMNODES
-	 * approaches MAX_PXM_DOMAINS for i386.
-	 */
-	nodes_clear(node_online_map);
-	for (i = 0; i < MAX_PXM_DOMAINS; i++) {
-		if (BMAP_TEST(pxm_bitmap, i)) {
-			int nid = acpi_map_pxm_to_node(i);
-			node_set_online(nid);
-		}
-	}
-	BUG_ON(num_online_nodes() == 0);
-
-	/* set cnode id in memory chunk structure */
-	for (i = 0; i < num_memory_chunks; i++)
-		node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
-
-	printk(KERN_DEBUG "pxm bitmap: ");
-	for (i = 0; i < sizeof(pxm_bitmap); i++) {
-		printk(KERN_CONT "%02x ", pxm_bitmap[i]);
-	}
-	printk(KERN_CONT "\n");
-	printk(KERN_DEBUG "Number of logical nodes in system = %d\n",
-			 num_online_nodes());
-	printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
-			 num_memory_chunks);
-
-	for (i = 0; i < MAX_APICID; i++)
-		apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
-
-	for (j = 0; j < num_memory_chunks; j++){
-		struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
-		printk(KERN_DEBUG
-			"chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
-		       j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
-		node_read_chunk(chunk->nid, chunk);
-		e820_register_active_regions(chunk->nid, chunk->start_pfn,
-					     min(chunk->end_pfn, max_pfn));
-	}
-
-	for_each_online_node(nid) {
-		unsigned long start = node_start_pfn[nid];
-		unsigned long end = min(node_end_pfn[nid], max_pfn);
-
-		memory_present(nid, start, end);
-		node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
-	}
-	return 1;
-out_fail:
-	printk(KERN_ERR "failed to get NUMA memory information from SRAT"
-			" table\n");
-	return 0;
-}
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index b7b3e4c7cfc9..c107641cd39b 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -13,5 +13,6 @@ obj-$(CONFIG_NUMA)		+= discontig_32.o
 else
 obj-$(CONFIG_NUMA)		+= numa_64.o
 obj-$(CONFIG_K8_NUMA)		+= k8topology_64.o
-obj-$(CONFIG_ACPI_NUMA)		+= srat_64.o
 endif
+obj-$(CONFIG_ACPI_NUMA)		+= srat_$(BITS).o
+
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
new file mode 100644
index 000000000000..f41d67f8f831
--- /dev/null
+++ b/arch/x86/mm/srat_32.c
@@ -0,0 +1,280 @@
+/*
+ * Some of the code in this file has been gleaned from the 64 bit 
+ * discontigmem support code base.
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.          
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to Pat Gaughen <gone@us.ibm.com>
+ */
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <linux/mmzone.h>
+#include <linux/acpi.h>
+#include <linux/nodemask.h>
+#include <asm/srat.h>
+#include <asm/topology.h>
+#include <asm/smp.h>
+#include <asm/e820.h>
+
+/*
+ * proximity macros and definitions
+ */
+#define NODE_ARRAY_INDEX(x)	((x) / 8)	/* 8 bits/char */
+#define NODE_ARRAY_OFFSET(x)	((x) % 8)	/* 8 bits/char */
+#define BMAP_SET(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
+#define BMAP_TEST(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
+/* bitmap length; _PXM is at most 255 */
+#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) 
+static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN];	/* bitmap of proximity domains */
+
+#define MAX_CHUNKS_PER_NODE	3
+#define MAXCHUNKS		(MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
+struct node_memory_chunk_s {
+	unsigned long	start_pfn;
+	unsigned long	end_pfn;
+	u8	pxm;		// proximity domain of node
+	u8	nid;		// which cnode contains this chunk?
+	u8	bank;		// which mem bank on this node
+};
+static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
+
+static int __initdata num_memory_chunks; /* total number of memory chunks */
+static u8 __initdata apicid_to_pxm[MAX_APICID];
+
+int numa_off __initdata;
+int acpi_numa __initdata;
+
+static __init void bad_srat(void)
+{
+        printk(KERN_ERR "SRAT: SRAT not used.\n");
+        acpi_numa = -1;
+	num_memory_chunks = 0;
+}
+
+static __init inline int srat_disabled(void)
+{
+	return numa_off || acpi_numa < 0;
+}
+
+/* Identify CPU proximity domains */
+void __init
+acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
+{
+	if (srat_disabled())
+		return;
+	if (cpu_affinity->header.length !=
+	     sizeof(struct acpi_srat_cpu_affinity)) {
+		bad_srat();
+		return;
+	}
+
+	if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
+		return;		/* empty entry */
+
+	/* mark this node as "seen" in node bitmap */
+	BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
+
+	apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
+
+	printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
+		cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
+}
+
+/*
+ * Identify memory proximity domains and hot-remove capabilities.
+ * Fill node memory chunk list structure.
+ */
+void __init
+acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
+{
+	unsigned long long paddr, size;
+	unsigned long start_pfn, end_pfn;
+	u8 pxm;
+	struct node_memory_chunk_s *p, *q, *pend;
+
+	if (srat_disabled())
+		return;
+	if (memory_affinity->header.length !=
+	     sizeof(struct acpi_srat_mem_affinity)) {
+		bad_srat();
+		return;
+	}
+
+	if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
+		return;		/* empty entry */
+
+	pxm = memory_affinity->proximity_domain & 0xff;
+
+	/* mark this node as "seen" in node bitmap */
+	BMAP_SET(pxm_bitmap, pxm);
+
+	/* calculate info for memory chunk structure */
+	paddr = memory_affinity->base_address;
+	size = memory_affinity->length;
+
+	start_pfn = paddr >> PAGE_SHIFT;
+	end_pfn = (paddr + size) >> PAGE_SHIFT;
+
+
+	if (num_memory_chunks >= MAXCHUNKS) {
+		printk(KERN_WARNING "Too many mem chunks in SRAT."
+			" Ignoring %lld MBytes at %llx\n",
+			size/(1024*1024), paddr);
+		return;
+	}
+
+	/* Insertion sort based on base address */
+	pend = &node_memory_chunk[num_memory_chunks];
+	for (p = &node_memory_chunk[0]; p < pend; p++) {
+		if (start_pfn < p->start_pfn)
+			break;
+	}
+	if (p < pend) {
+		for (q = pend; q >= p; q--)
+			*(q + 1) = *q;
+	}
+	p->start_pfn = start_pfn;
+	p->end_pfn = end_pfn;
+	p->pxm = pxm;
+
+	num_memory_chunks++;
+
+	printk(KERN_DEBUG "Memory range %08lx to %08lx (type %x)"
+			  " in proximity domain %02x %s\n",
+		start_pfn, end_pfn,
+		memory_affinity->memory_type,
+		pxm,
+		((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
+		 "enabled and removable" : "enabled" ) );
+}
+
+/* Callback for SLIT parsing */
+void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
+{
+}
+
+void acpi_numa_arch_fixup(void)
+{
+}
+/*
+ * The SRAT table always lists ascending addresses, so can always
+ * assume that the first "start" address that you see is the real
+ * start of the node, and that the current "end" address is after
+ * the previous one.
+ */
+static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
+{
+	/*
+	 * Only add present memory as told by the e820.
+	 * There is no guarantee from the SRAT that the memory it
+	 * enumerates is present at boot time because it represents
+	 * *possible* memory hotplug areas the same as normal RAM.
+	 */
+	if (memory_chunk->start_pfn >= max_pfn) {
+		printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
+			memory_chunk->start_pfn, memory_chunk->end_pfn);
+		return;
+	}
+	if (memory_chunk->nid != nid)
+		return;
+
+	if (!node_has_online_mem(nid))
+		node_start_pfn[nid] = memory_chunk->start_pfn;
+
+	if (node_start_pfn[nid] > memory_chunk->start_pfn)
+		node_start_pfn[nid] = memory_chunk->start_pfn;
+
+	if (node_end_pfn[nid] < memory_chunk->end_pfn)
+		node_end_pfn[nid] = memory_chunk->end_pfn;
+}
+
+int __init get_memcfg_from_srat(void)
+{
+	int i, j, nid;
+
+
+	if (srat_disabled())
+		goto out_fail;
+
+	if (num_memory_chunks == 0) {
+		printk(KERN_WARNING
+			 "could not finy any ACPI SRAT memory areas.\n");
+		goto out_fail;
+	}
+
+	/* Calculate total number of nodes in system from PXM bitmap and create
+	 * a set of sequential node IDs starting at zero.  (ACPI doesn't seem
+	 * to specify the range of _PXM values.)
+	 */
+	/*
+	 * MCD - we no longer HAVE to number nodes sequentially.  PXM domain
+	 * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
+	 * 32, so we will continue numbering them in this manner until MAX_NUMNODES
+	 * approaches MAX_PXM_DOMAINS for i386.
+	 */
+	nodes_clear(node_online_map);
+	for (i = 0; i < MAX_PXM_DOMAINS; i++) {
+		if (BMAP_TEST(pxm_bitmap, i)) {
+			int nid = acpi_map_pxm_to_node(i);
+			node_set_online(nid);
+		}
+	}
+	BUG_ON(num_online_nodes() == 0);
+
+	/* set cnode id in memory chunk structure */
+	for (i = 0; i < num_memory_chunks; i++)
+		node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
+
+	printk(KERN_DEBUG "pxm bitmap: ");
+	for (i = 0; i < sizeof(pxm_bitmap); i++) {
+		printk(KERN_CONT "%02x ", pxm_bitmap[i]);
+	}
+	printk(KERN_CONT "\n");
+	printk(KERN_DEBUG "Number of logical nodes in system = %d\n",
+			 num_online_nodes());
+	printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
+			 num_memory_chunks);
+
+	for (i = 0; i < MAX_APICID; i++)
+		apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
+
+	for (j = 0; j < num_memory_chunks; j++){
+		struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
+		printk(KERN_DEBUG
+			"chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
+		       j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
+		node_read_chunk(chunk->nid, chunk);
+		e820_register_active_regions(chunk->nid, chunk->start_pfn,
+					     min(chunk->end_pfn, max_pfn));
+	}
+
+	for_each_online_node(nid) {
+		unsigned long start = node_start_pfn[nid];
+		unsigned long end = min(node_end_pfn[nid], max_pfn);
+
+		memory_present(nid, start, end);
+		node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
+	}
+	return 1;
+out_fail:
+	printk(KERN_ERR "failed to get NUMA memory information from SRAT"
+			" table\n");
+	return 0;
+}
diff --git a/include/asm-x86/srat.h b/include/asm-x86/srat.h
index 456fe0b5a921..774c919dc232 100644
--- a/include/asm-x86/srat.h
+++ b/include/asm-x86/srat.h
@@ -27,7 +27,7 @@
 #ifndef _ASM_SRAT_H_
 #define _ASM_SRAT_H_
 
-#ifdef CONFIG_ACPI_SRAT
+#ifdef CONFIG_ACPI_NUMA
 extern int get_memcfg_from_srat(void);
 #else
 static inline int get_memcfg_from_srat(void)
-- 
cgit v1.2.3


From 5dfcf14d5b28174f94cbe9b4fb35d415db61c64a Mon Sep 17 00:00:00 2001
From: Bernhard Walle <bwalle@suse.de>
Date: Fri, 27 Jun 2008 13:12:55 +0200
Subject: x86: use FIRMWARE_MEMMAP on x86/E820

This patch uses the /sys/firmware/memmap interface provided in the last patch
on the x86 architecture when E820 is used. The patch copies the E820
memory map very early, and registers the E820 map afterwards via
firmware_map_add_early().

Signed-off-by: Bernhard Walle <bwalle@suse.de>
Acked-by: Greg KH <gregkh@suse.de>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: kexec@lists.infradead.org
Cc: yhlu.kernel@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 44 +++++++++++++++++++++++++++++++++++++-------
 include/asm-x86/e820.h |  2 ++
 2 files changed, 39 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index d0335853ff52..fc1d579f212b 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -19,6 +19,7 @@
 #include <linux/mm.h>
 #include <linux/pfn.h>
 #include <linux/suspend.h>
+#include <linux/firmware-map.h>
 
 #include <asm/pgtable.h>
 #include <asm/page.h>
@@ -27,7 +28,22 @@
 #include <asm/setup.h>
 #include <asm/trampoline.h>
 
+/*
+ * The e820 map is the map that gets modified e.g. with command line parameters
+ * and that is also registered with modifications in the kernel resource tree
+ * with the iomem_resource as parent.
+ *
+ * The e820_saved is directly saved after the BIOS-provided memory map is
+ * copied. It doesn't get modified afterwards. It's registered for the
+ * /sys/firmware/memmap interface.
+ *
+ * That memory map is not modified and is used as base for kexec. The kexec'd
+ * kernel should get the same memory map as the firmware provides. Then the
+ * user can e.g. boot the original kernel with mem=1G while still booting the
+ * next kernel with full memory.
+ */
 struct e820map e820;
+struct e820map e820_saved;
 
 /* For PCI or other memory-mapped resources */
 unsigned long pci_mem_start = 0xaeedbabe;
@@ -1198,6 +1214,17 @@ void __init finish_e820_parsing(void)
 	}
 }
 
+static inline const char *e820_type_to_string(int e820_type)
+{
+	switch (e820_type) {
+	case E820_RESERVED_KERN:
+	case E820_RAM:	return "System RAM";
+	case E820_ACPI:	return "ACPI Tables";
+	case E820_NVS:	return "ACPI Non-volatile Storage";
+	default:	return "reserved";
+	}
+}
+
 /*
  * Mark e820 reserved areas as busy for the resource manager.
  */
@@ -1209,13 +1236,6 @@ void __init e820_reserve_resources(void)
 
 	res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
 	for (i = 0; i < e820.nr_map; i++) {
-		switch (e820.map[i].type) {
-		case E820_RESERVED_KERN:
-		case E820_RAM:	res->name = "System RAM"; break;
-		case E820_ACPI:	res->name = "ACPI Tables"; break;
-		case E820_NVS:	res->name = "ACPI Non-volatile Storage"; break;
-		default:	res->name = "reserved";
-		}
 		end = e820.map[i].addr + e820.map[i].size - 1;
 #ifndef CONFIG_RESOURCES_64BIT
 		if (end > 0x100000000ULL) {
@@ -1223,6 +1243,7 @@ void __init e820_reserve_resources(void)
 			continue;
 		}
 #endif
+		res->name = e820_type_to_string(e820.map[i].type);
 		res->start = e820.map[i].addr;
 		res->end = end;
 
@@ -1230,6 +1251,13 @@ void __init e820_reserve_resources(void)
 		insert_resource(&iomem_resource, res);
 		res++;
 	}
+
+	for (i = 0; i < e820_saved.nr_map; i++) {
+		struct e820entry *entry = &e820_saved.map[i];
+		firmware_map_add_early(entry->addr,
+			entry->addr + entry->size - 1,
+			e820_type_to_string(entry->type));
+	}
 }
 
 char *__init default_machine_specific_memory_setup(void)
@@ -1266,6 +1294,8 @@ char *__init default_machine_specific_memory_setup(void)
 		e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
 	}
 
+	memcpy(&e820_saved, &e820, sizeof(struct e820map));
+
 	/* In case someone cares... */
 	return who;
 }
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index 4b26604b3c19..a20d0a7f5892 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -59,7 +59,9 @@ struct e820map {
 	struct e820entry map[E820_X_MAX];
 };
 
+/* see comment in arch/x86/kernel/e820.c */
 extern struct e820map e820;
+extern struct e820map e820_saved;
 
 extern int e820_any_mapped(u64 start, u64 end, unsigned type);
 extern int e820_all_mapped(u64 start, u64 end, unsigned type);
-- 
cgit v1.2.3


From 0be15526beb4c228e0477221c62ec8ab0fc7440f Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Thu, 3 Jul 2008 11:35:37 -0700
Subject: x86: move saving e820_saved to setup_memory_map

so other path that will override memory_setup or
machine_specific_memory_setup could have e820_saved too.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Bernhard Walle <bwalle@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index fc1d579f212b..13e32986cb5f 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1294,8 +1294,6 @@ char *__init default_machine_specific_memory_setup(void)
 		e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
 	}
 
-	memcpy(&e820_saved, &e820, sizeof(struct e820map));
-
 	/* In case someone cares... */
 	return who;
 }
@@ -1313,8 +1311,12 @@ char * __init __attribute__((weak)) memory_setup(void)
 
 void __init setup_memory_map(void)
 {
+	char *who;
+
+	who = memory_setup();
+	memcpy(&e820_saved, &e820, sizeof(struct e820map));
 	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
-	e820_print_map(memory_setup());
+	e820_print_map(who);
 }
 
 #ifdef CONFIG_X86_64
-- 
cgit v1.2.3


From a0a0becd2da0ba0d7f0204a61d1905926cdb163d Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Thu, 3 Jul 2008 11:37:13 -0700
Subject: x86: make e820_saved have update from setup_data

seperate reserve_setup_data into e820_reserved_setup_data,
and reserve_early_setup_data.

So could use e820_reserved_setup_data to backup e820 with setup_data.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Bernhard Walle <bwalle@suse.de>
Cc: Ying Huang <ying.huang@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index cfcfbefee0b9..bea8ae77d059 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -394,11 +394,10 @@ static void __init parse_setup_data(void)
 	}
 }
 
-static void __init reserve_setup_data(void)
+static void __init e820_reserve_setup_data(void)
 {
 	struct setup_data *data;
 	u64 pa_data;
-	char buf[32];
 	int found = 0;
 
 	if (boot_params.hdr.version < 0x0209)
@@ -406,8 +405,6 @@ static void __init reserve_setup_data(void)
 	pa_data = boot_params.hdr.setup_data;
 	while (pa_data) {
 		data = early_ioremap(pa_data, sizeof(*data));
-		sprintf(buf, "setup data %x", data->type);
-		reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
 		e820_update_range(pa_data, sizeof(*data)+data->len,
 			 E820_RAM, E820_RESERVED_KERN);
 		found = 1;
@@ -418,10 +415,29 @@ static void __init reserve_setup_data(void)
 		return;
 
 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+	memcpy(&e820_saved, &e820, sizeof(struct e820map));
 	printk(KERN_INFO "extended physical RAM map:\n");
 	e820_print_map("reserve setup_data");
 }
 
+static void __init reserve_early_setup_data(void)
+{
+	struct setup_data *data;
+	u64 pa_data;
+	char buf[32];
+
+	if (boot_params.hdr.version < 0x0209)
+		return;
+	pa_data = boot_params.hdr.setup_data;
+	while (pa_data) {
+		data = early_ioremap(pa_data, sizeof(*data));
+		sprintf(buf, "setup data %x", data->type);
+		reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
+		pa_data = data->next;
+		early_iounmap(data, sizeof(*data));
+	}
+}
+
 /*
  * --------- Crashkernel reservation ------------------------------
  */
@@ -626,6 +642,8 @@ void __init setup_arch(char **cmdline_p)
 
 	setup_memory_map();
 	parse_setup_data();
+	/* update the e820_saved too */
+	e820_reserve_setup_data();
 
 	copy_edd();
 
@@ -656,7 +674,7 @@ void __init setup_arch(char **cmdline_p)
 	parse_early_param();
 
 	/* after early param, so could get panic from serial */
-	reserve_setup_data();
+	reserve_early_setup_data();
 
 	if (acpi_mps_check()) {
 #ifdef CONFIG_X86_LOCAL_APIC
-- 
cgit v1.2.3


From fc9036ea1a4b14229788e6df3936b451a6abac98 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Thu, 3 Jul 2008 11:39:00 -0700
Subject: x86: let early_reserve_e820 update e820_saved too

so when it is called after early_param, e820_saved get updated too.
esp for mpc update.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Bernhard Walle <bwalle@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 13e32986cb5f..e07d4019e266 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -414,8 +414,9 @@ static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
 	return __append_e820_map(biosmap, nr_map);
 }
 
-u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
-				unsigned new_type)
+static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
+					u64 size, unsigned old_type,
+					unsigned new_type)
 {
 	int i;
 	u64 real_updated_size = 0;
@@ -426,7 +427,7 @@ u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
 		size = ULLONG_MAX - start;
 
 	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
+		struct e820entry *ei = &e820x->map[i];
 		u64 final_start, final_end;
 		if (ei->type != old_type)
 			continue;
@@ -454,6 +455,19 @@ u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
 	return real_updated_size;
 }
 
+u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
+			     unsigned new_type)
+{
+	return e820_update_range_map(&e820, start, size, old_type, new_type);
+}
+
+static u64 __init e820_update_range_saved(u64 start, u64 size,
+					  unsigned old_type, unsigned new_type)
+{
+	return e820_update_range_map(&e820_saved, start, size, old_type,
+				     new_type);
+}
+
 /* make e820 not cover the range */
 u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
 			     int checktype)
@@ -503,6 +517,15 @@ void __init update_e820(void)
 	printk(KERN_INFO "modified physical RAM map:\n");
 	e820_print_map("modified");
 }
+static void __init update_e820_saved(void)
+{
+	int nr_map;
+
+	nr_map = e820_saved.nr_map;
+	if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
+		return;
+	e820_saved.nr_map = nr_map;
+}
 #define MAX_GAP_END 0x100000000ull
 /*
  * Search for a gap in the e820 memory space from start_addr to end_addr.
@@ -1007,8 +1030,10 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
 
 	addr = round_down(start + size - sizet, align);
 	e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
+	e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
 	printk(KERN_INFO "update e820 for early_reserve_e820\n");
 	update_e820();
+	update_e820_saved();
 
 	return addr;
 }
-- 
cgit v1.2.3


From 83f5d894ca5280bfcd904dfeb1347c2da2b19aac Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Tue, 1 Jul 2008 14:45:38 -0500
Subject: x86: map UV chipset space - UV support

Create page table entries to map the SGI UV chipset GRU. local MMR &
global MMR ranges.

Signed-off-by: Jack Steiner <steiner@sgi.com>
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/genx2apic_uv_x.c | 75 +++++++++++++++++++++++++++++++++++++++-
 include/asm-x86/uv/uv_hub.h      |  2 ++
 include/asm-x86/uv/uv_mmrs.h     | 46 ++++++++++++++++++++++++
 3 files changed, 122 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 45e84acca8a9..711f11c30b06 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -8,6 +8,7 @@
  * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
  */
 
+#include <linux/kernel.h>
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <linux/string.h>
@@ -20,6 +21,7 @@
 #include <asm/smp.h>
 #include <asm/ipi.h>
 #include <asm/genapic.h>
+#include <asm/pgtable.h>
 #include <asm/uv/uv_mmrs.h>
 #include <asm/uv/uv_hub.h>
 
@@ -208,14 +210,79 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
 	BUG();
 }
 
+static __init void map_low_mmrs(void)
+{
+	init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
+	init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
+}
+
+enum map_type {map_wb, map_uc};
+
+static void map_high(char *id, unsigned long base, int shift, enum map_type map_type)
+{
+	unsigned long bytes, paddr;
+
+	paddr = base << shift;
+	bytes = (1UL << shift);
+	printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
+	       					paddr + bytes);
+	if (map_type == map_uc)
+		init_extra_mapping_uc(paddr, bytes);
+	else
+		init_extra_mapping_wb(paddr, bytes);
+
+}
+static __init void map_gru_high(int max_pnode)
+{
+	union uvh_rh_gam_gru_overlay_config_mmr_u gru;
+	int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
+
+	gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
+	if (gru.s.enable)
+		map_high("GRU", gru.s.base, shift, map_wb);
+}
+
+static __init void map_config_high(int max_pnode)
+{
+	union uvh_rh_gam_cfg_overlay_config_mmr_u cfg;
+	int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT;
+
+	cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR);
+	if (cfg.s.enable)
+		map_high("CONFIG", cfg.s.base, shift, map_uc);
+}
+
+static __init void map_mmr_high(int max_pnode)
+{
+	union uvh_rh_gam_mmr_overlay_config_mmr_u mmr;
+	int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
+
+	mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
+	if (mmr.s.enable)
+		map_high("MMR", mmr.s.base, shift, map_uc);
+}
+
+static __init void map_mmioh_high(int max_pnode)
+{
+	union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
+	int shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+
+	mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
+	if (mmioh.s.enable)
+		map_high("MMIOH", mmioh.s.base, shift, map_uc);
+}
+
 static __init void uv_system_init(void)
 {
 	union uvh_si_addr_map_config_u m_n_config;
 	union uvh_node_id_u node_id;
 	unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
 	int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
+	int max_pnode = 0;
 	unsigned long mmr_base, present;
 
+	map_low_mmrs();
+
 	m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
 	m_val = m_n_config.s.m_skt;
 	n_val = m_n_config.s.n_skt;
@@ -281,12 +348,18 @@ static __init void uv_system_init(void)
 		uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */
 		uv_node_to_blade[nid] = blade;
 		uv_cpu_to_blade[cpu] = blade;
+		max_pnode = max(pnode, max_pnode);
 
-		printk(KERN_DEBUG "UV cpu %d, apicid 0x%x, pnode %d, nid %d, "
+		printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, "
 			"lcpu %d, blade %d\n",
 			cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid,
 			lcpu, blade);
 	}
+
+	map_gru_high(max_pnode);
+	map_mmr_high(max_pnode);
+	map_config_high(max_pnode);
+	map_mmioh_high(max_pnode);
 }
 
 /*
diff --git a/include/asm-x86/uv/uv_hub.h b/include/asm-x86/uv/uv_hub.h
index 65004881de5f..a4ef26e5850b 100644
--- a/include/asm-x86/uv/uv_hub.h
+++ b/include/asm-x86/uv/uv_hub.h
@@ -149,6 +149,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
 #define UV_LOCAL_MMR_BASE		0xf4000000UL
 #define UV_GLOBAL_MMR32_BASE		0xf8000000UL
 #define UV_GLOBAL_MMR64_BASE		(uv_hub_info->global_mmr_base)
+#define UV_LOCAL_MMR_SIZE		(64UL * 1024 * 1024)
+#define UV_GLOBAL_MMR32_SIZE		(64UL * 1024 * 1024)
 
 #define UV_GLOBAL_MMR32_PNODE_SHIFT	15
 #define UV_GLOBAL_MMR64_PNODE_SHIFT	26
diff --git a/include/asm-x86/uv/uv_mmrs.h b/include/asm-x86/uv/uv_mmrs.h
index ac9846076521..37113f554a97 100644
--- a/include/asm-x86/uv/uv_mmrs.h
+++ b/include/asm-x86/uv/uv_mmrs.h
@@ -712,6 +712,26 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
     } s;
 };
 
+/* ========================================================================= */
+/*                    UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR                      */
+/* ========================================================================= */
+#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR 0x1600020UL
+
+#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT 26
+#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_rh_gam_cfg_overlay_config_mmr_u {
+    unsigned long	v;
+    struct uvh_rh_gam_cfg_overlay_config_mmr_s {
+	unsigned long	rsvd_0_25: 26;  /*    */
+	unsigned long	base   : 20;  /* RW */
+	unsigned long	rsvd_46_62: 17;  /*    */
+	unsigned long	enable :  1;  /* RW */
+    } s;
+};
+
 /* ========================================================================= */
 /*                    UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR                      */
 /* ========================================================================= */
@@ -739,6 +759,32 @@ union uvh_rh_gam_gru_overlay_config_mmr_u {
     } s;
 };
 
+/* ========================================================================= */
+/*                   UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR                     */
+/* ========================================================================= */
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL
+
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_rh_gam_mmioh_overlay_config_mmr_u {
+    unsigned long	v;
+    struct uvh_rh_gam_mmioh_overlay_config_mmr_s {
+	unsigned long	rsvd_0_29: 30;  /*    */
+	unsigned long	base   : 16;  /* RW */
+	unsigned long	m_io   :  6;  /* RW */
+	unsigned long	n_io   :  4;  /* RW */
+	unsigned long	rsvd_56_62:  7;  /*    */
+	unsigned long	enable :  1;  /* RW */
+    } s;
+};
+
 /* ========================================================================= */
 /*                    UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR                      */
 /* ========================================================================= */
-- 
cgit v1.2.3


From 746f2eb790e75676ddc3b816ba18bac4179cc744 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Tue, 1 Jul 2008 21:43:52 +0400
Subject: x86: apic_32.c - add lapic resource

Add lapic resource into kernel resource map and mark it as busy

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: "Maciej W. Rozycki" <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
CC: Maciej W. Rozycki <macro@linux-mips.org>
---
 arch/x86/kernel/apic_32.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 6dea8306d8c0..3e947208b9d9 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -82,6 +82,11 @@ int pic_mode;
 /* Have we found an MP table */
 int smp_found_config;
 
+static struct resource lapic_resource = {
+	.name = "Local APIC",
+	.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
+};
+
 static unsigned int calibration_result;
 
 static int lapic_next_event(unsigned long delta,
@@ -1720,3 +1725,21 @@ static int __init apic_set_verbosity(char *str)
 }
 __setup("apic=", apic_set_verbosity);
 
+static int __init lapic_insert_resource(void)
+{
+	if (!apic_phys)
+		return -1;
+
+	/* Put local APIC into the resource map. */
+	lapic_resource.start = apic_phys;
+	lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
+	insert_resource(&iomem_resource, &lapic_resource);
+
+	return 0;
+}
+
+/*
+ * need call insert after e820_reserve_resources()
+ * that is using request_resource
+ */
+late_initcall(lapic_insert_resource);
-- 
cgit v1.2.3


From 0ef95533326a7b37d16025af9edc0c18e644b346 Mon Sep 17 00:00:00 2001
From: Alok Kataria <akataria@vmware.com>
Date: Tue, 1 Jul 2008 11:43:18 -0700
Subject: x86: merge sched_clock handling

Move the basic global variable definitions and sched_clock handling in the
common "tsc.c" file.

 - Unify notsc kernel command line handling for 32 bit and 64bit.
 - Functional changes for 64bit.
        - "tsc_disabled" is updated if "notsc" is passed at boottime.
        - Fallback to jiffies for sched_clock, incase notsc is passed on
	  commandline.

Signed-off-by: Alok N Kataria <akataria@vmware.com>
Signed-off-by: Dan Hecht <dhecht@vmware.com>
Cc: Dan Hecht <dhecht@vmware.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/Makefile  |  2 +-
 arch/x86/kernel/time_32.c |  3 --
 arch/x86/kernel/tsc.c     | 86 +++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/tsc_32.c  | 86 ++---------------------------------------------
 arch/x86/kernel/tsc_64.c  | 67 ++++++------------------------------
 5 files changed, 100 insertions(+), 144 deletions(-)
 create mode 100644 arch/x86/kernel/tsc.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 54829e2b5160..ca904ee17252 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -26,7 +26,7 @@ obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
 obj-y			+= alternative.o i8253.o pci-nommu.o
-obj-y			+= tsc_$(BITS).o io_delay.o rtc.o
+obj-y			+= tsc_$(BITS).o io_delay.o rtc.o tsc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
 obj-y				+= process.o
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 5f29f12da50c..059ca6ee59b4 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -39,9 +39,6 @@
 
 #include "do_timer.h"
 
-unsigned int cpu_khz;	/* Detected as we calibrate the TSC */
-EXPORT_SYMBOL(cpu_khz);
-
 int timer_ack;
 
 unsigned long profile_pc(struct pt_regs *regs)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
new file mode 100644
index 000000000000..5d0be778fadd
--- /dev/null
+++ b/arch/x86/kernel/tsc.c
@@ -0,0 +1,86 @@
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+
+unsigned int cpu_khz;           /* TSC clocks / usec, not used here */
+EXPORT_SYMBOL(cpu_khz);
+unsigned int tsc_khz;
+EXPORT_SYMBOL(tsc_khz);
+
+/*
+ * TSC can be unstable due to cpufreq or due to unsynced TSCs
+ */
+int tsc_unstable;
+
+/* native_sched_clock() is called before tsc_init(), so
+   we must start with the TSC soft disabled to prevent
+   erroneous rdtsc usage on !cpu_has_tsc processors */
+int tsc_disabled = -1;
+
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ */
+u64 native_sched_clock(void)
+{
+	u64 this_offset;
+
+	/*
+	 * Fall back to jiffies if there's no TSC available:
+	 * ( But note that we still use it if the TSC is marked
+	 *   unstable. We do this because unlike Time Of Day,
+	 *   the scheduler clock tolerates small errors and it's
+	 *   very important for it to be as fast as the platform
+	 *   can achive it. )
+	 */
+	if (unlikely(tsc_disabled)) {
+		/* No locking but a rare wrong value is not a big deal: */
+		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
+	}
+
+	/* read the Time Stamp Counter: */
+	rdtscll(this_offset);
+
+	/* return the value in ns */
+	return cycles_2_ns(this_offset);
+}
+
+/* We need to define a real function for sched_clock, to override the
+   weak default version */
+#ifdef CONFIG_PARAVIRT
+unsigned long long sched_clock(void)
+{
+	return paravirt_sched_clock();
+}
+#else
+unsigned long long
+sched_clock(void) __attribute__((alias("native_sched_clock")));
+#endif
+
+int check_tsc_unstable(void)
+{
+	return tsc_unstable;
+}
+EXPORT_SYMBOL_GPL(check_tsc_unstable);
+
+#ifdef CONFIG_X86_TSC
+int __init notsc_setup(char *str)
+{
+	printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
+			"cannot disable TSC completely.\n");
+	tsc_disabled = 1;
+	return 1;
+}
+#else
+/*
+ * disable flag for tsc. Takes effect by clearing the TSC cpu flag
+ * in cpu/common.c
+ */
+int __init notsc_setup(char *str)
+{
+	setup_clear_cpu_cap(X86_FEATURE_TSC);
+	return 1;
+}
+#endif
+
+__setup("notsc", notsc_setup);
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
index 6240922e497c..dc8990056d75 100644
--- a/arch/x86/kernel/tsc_32.c
+++ b/arch/x86/kernel/tsc_32.c
@@ -15,52 +15,8 @@
 
 #include "mach_timer.h"
 
-/* native_sched_clock() is called before tsc_init(), so
-   we must start with the TSC soft disabled to prevent
-   erroneous rdtsc usage on !cpu_has_tsc processors */
-static int tsc_disabled = -1;
-
-/*
- * On some systems the TSC frequency does not
- * change with the cpu frequency. So we need
- * an extra value to store the TSC freq
- */
-unsigned int tsc_khz;
-EXPORT_SYMBOL_GPL(tsc_khz);
-
-#ifdef CONFIG_X86_TSC
-static int __init tsc_setup(char *str)
-{
-	printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
-	       "cannot disable TSC completely.\n");
-	tsc_disabled = 1;
-	return 1;
-}
-#else
-/*
- * disable flag for tsc. Takes effect by clearing the TSC cpu flag
- * in cpu/common.c
- */
-static int __init tsc_setup(char *str)
-{
-	setup_clear_cpu_cap(X86_FEATURE_TSC);
-	return 1;
-}
-#endif
-
-__setup("notsc", tsc_setup);
-
-/*
- * code to mark and check if the TSC is unstable
- * due to cpufreq or due to unsynced TSCs
- */
-static int tsc_unstable;
-
-int check_tsc_unstable(void)
-{
-	return tsc_unstable;
-}
-EXPORT_SYMBOL_GPL(check_tsc_unstable);
+extern int tsc_unstable;
+extern int tsc_disabled;
 
 /* Accelerators for sched_clock()
  * convert from cycles(64bits) => nanoseconds (64bits)
@@ -109,44 +65,6 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 	local_irq_restore(flags);
 }
 
-/*
- * Scheduler clock - returns current time in nanosec units.
- */
-unsigned long long native_sched_clock(void)
-{
-	unsigned long long this_offset;
-
-	/*
-	 * Fall back to jiffies if there's no TSC available:
-	 * ( But note that we still use it if the TSC is marked
-	 *   unstable. We do this because unlike Time Of Day,
-	 *   the scheduler clock tolerates small errors and it's
-	 *   very important for it to be as fast as the platform
-	 *   can achive it. )
-	 */
-	if (unlikely(tsc_disabled))
-		/* No locking but a rare wrong value is not a big deal: */
-		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
-
-	/* read the Time Stamp Counter: */
-	rdtscll(this_offset);
-
-	/* return the value in ns */
-	return cycles_2_ns(this_offset);
-}
-
-/* We need to define a real function for sched_clock, to override the
-   weak default version */
-#ifdef CONFIG_PARAVIRT
-unsigned long long sched_clock(void)
-{
-	return paravirt_sched_clock();
-}
-#else
-unsigned long long sched_clock(void)
-	__attribute__((alias("native_sched_clock")));
-#endif
-
 unsigned long native_calculate_cpu_khz(void)
 {
 	unsigned long long start, end;
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
index 9898fb01edfd..69cbe4c9f050 100644
--- a/arch/x86/kernel/tsc_64.c
+++ b/arch/x86/kernel/tsc_64.c
@@ -13,12 +13,8 @@
 #include <asm/timer.h>
 #include <asm/vgtod.h>
 
-static int notsc __initdata = 0;
-
-unsigned int cpu_khz;		/* TSC clocks / usec, not used here */
-EXPORT_SYMBOL(cpu_khz);
-unsigned int tsc_khz;
-EXPORT_SYMBOL(tsc_khz);
+extern int tsc_unstable;
+extern int tsc_disabled;
 
 /* Accelerators for sched_clock()
  * convert from cycles(64bits) => nanoseconds (64bits)
@@ -41,6 +37,7 @@ EXPORT_SYMBOL(tsc_khz);
  *
  *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
  */
+
 DEFINE_PER_CPU(unsigned long, cyc2ns);
 
 static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
@@ -63,41 +60,6 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 	local_irq_restore(flags);
 }
 
-unsigned long long native_sched_clock(void)
-{
-	unsigned long a = 0;
-
-	/* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
-	 * which means it is not completely exact and may not be monotonous
-	 * between CPUs. But the errors should be too small to matter for
-	 * scheduling purposes.
-	 */
-
-	rdtscll(a);
-	return cycles_2_ns(a);
-}
-
-/* We need to define a real function for sched_clock, to override the
-   weak default version */
-#ifdef CONFIG_PARAVIRT
-unsigned long long sched_clock(void)
-{
-	return paravirt_sched_clock();
-}
-#else
-unsigned long long
-sched_clock(void) __attribute__((alias("native_sched_clock")));
-#endif
-
-
-static int tsc_unstable;
-
-int check_tsc_unstable(void)
-{
-	return tsc_unstable;
-}
-EXPORT_SYMBOL_GPL(check_tsc_unstable);
-
 #ifdef CONFIG_CPU_FREQ
 
 /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
@@ -281,14 +243,6 @@ __cpuinit int unsynchronized_tsc(void)
 	return num_present_cpus() > 1;
 }
 
-int __init notsc_setup(char *s)
-{
-	notsc = 1;
-	return 1;
-}
-
-__setup("notsc", notsc_setup);
-
 static struct clocksource clocksource_tsc;
 
 /*
@@ -346,12 +300,13 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable);
 
 void __init init_tsc_clocksource(void)
 {
-	if (!notsc) {
-		clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
-							clocksource_tsc.shift);
-		if (check_tsc_unstable())
-			clocksource_tsc.rating = 0;
+	if (tsc_disabled > 0)
+		return;
 
-		clocksource_register(&clocksource_tsc);
-	}
+	clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
+			clocksource_tsc.shift);
+	if (check_tsc_unstable())
+		clocksource_tsc.rating = 0;
+
+	clocksource_register(&clocksource_tsc);
 }
-- 
cgit v1.2.3


From bfc0f5947afa5e3a13e55867f4478c8a92c11dca Mon Sep 17 00:00:00 2001
From: Alok Kataria <akataria@vmware.com>
Date: Tue, 1 Jul 2008 11:43:24 -0700
Subject: x86: merge tsc calibration

Merge the tsc calibration code for the 32bit and 64bit kernel.
The paravirtualized calculate_cpu_khz for 64bit now points to the correct
tsc_calibrate code as in 32bit.
Original native_calculate_cpu_khz for 64 bit is now called as calibrate_cpu.

Also moved the recalibrate_cpu_khz function in the common file.
Note that this function is called only from powernow K7 cpu freq driver.

Signed-off-by: Alok N Kataria <akataria@vmware.com>
Signed-off-by: Dan Hecht <dhecht@vmware.com>
Cc: Dan Hecht <dhecht@vmware.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/time_64.c |  26 ++++++---
 arch/x86/kernel/tsc.c     | 131 ++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/tsc_32.c  |  74 +-------------------------
 arch/x86/kernel/tsc_64.c  |  94 +--------------------------------
 include/asm-x86/hpet.h    |   2 +-
 include/asm-x86/tsc.h     |   1 -
 6 files changed, 154 insertions(+), 174 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index 39ae8511a137..c6ac4dad41f6 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -56,7 +56,7 @@ static irqreturn_t timer_event_interrupt(int irq, void *dev_id)
 /* calibrate_cpu is used on systems with fixed rate TSCs to determine
  * processor frequency */
 #define TICK_COUNT 100000000
-unsigned long __init native_calculate_cpu_khz(void)
+static unsigned long __init calibrate_cpu(void)
 {
 	int tsc_start, tsc_now;
 	int i, no_ctr_free;
@@ -114,14 +114,18 @@ void __init hpet_time_init(void)
 	setup_irq(0, &irq0);
 }
 
+extern void set_cyc2ns_scale(unsigned long cpu_khz, int cpu);
+
 void __init time_init(void)
 {
-	tsc_calibrate();
+	int cpu;
+
+	cpu_khz = calculate_cpu_khz();
+	tsc_khz = cpu_khz;
 
-	cpu_khz = tsc_khz;
 	if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
-		(boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
-		cpu_khz = calculate_cpu_khz();
+			(boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
+		cpu_khz = calibrate_cpu();
 
 	lpj_fine = ((unsigned long)tsc_khz * 1000)/HZ;
 
@@ -134,7 +138,17 @@ void __init time_init(void)
 		vgetcpu_mode = VGETCPU_LSL;
 
 	printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
-		cpu_khz / 1000, cpu_khz % 1000);
+			cpu_khz / 1000, cpu_khz % 1000);
+
+	/*
+	 * Secondary CPUs do not run through tsc_init(), so set up
+	 * all the scale factors for all CPUs, assuming the same
+	 * speed as the bootup CPU. (cpufreq notifiers will fix this
+	 * up if their speed diverges)
+	 */
+	for_each_possible_cpu(cpu)
+		set_cyc2ns_scale(cpu_khz, cpu);
+
 	init_tsc_clocksource();
 	late_time_init = choose_time_init();
 }
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 5d0be778fadd..e6ee14533c75 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1,7 +1,11 @@
+#include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/timer.h>
+#include <linux/acpi_pmtmr.h>
+
+#include <asm/hpet.h>
 
 unsigned int cpu_khz;           /* TSC clocks / usec, not used here */
 EXPORT_SYMBOL(cpu_khz);
@@ -84,3 +88,130 @@ int __init notsc_setup(char *str)
 #endif
 
 __setup("notsc", notsc_setup);
+
+#define MAX_RETRIES     5
+#define SMI_TRESHOLD    50000
+
+/*
+ * Read TSC and the reference counters. Take care of SMI disturbance
+ */
+static u64 __init tsc_read_refs(u64 *pm, u64 *hpet)
+{
+	u64 t1, t2;
+	int i;
+
+	for (i = 0; i < MAX_RETRIES; i++) {
+		t1 = get_cycles();
+		if (hpet)
+			*hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
+		else
+			*pm = acpi_pm_read_early();
+		t2 = get_cycles();
+		if ((t2 - t1) < SMI_TRESHOLD)
+			return t2;
+	}
+	return ULLONG_MAX;
+}
+
+/**
+ * tsc_calibrate - calibrate the tsc on boot
+ */
+static unsigned int __init tsc_calibrate(void)
+{
+	unsigned long flags;
+	u64 tsc1, tsc2, tr1, tr2, delta, pm1, pm2, hpet1, hpet2;
+	int hpet = is_hpet_enabled();
+	unsigned int tsc_khz_val = 0;
+
+	local_irq_save(flags);
+
+	tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL);
+
+	outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+
+	outb(0xb0, 0x43);
+	outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
+	outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42);
+	tr1 = get_cycles();
+	while ((inb(0x61) & 0x20) == 0);
+	tr2 = get_cycles();
+
+	tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL);
+
+	local_irq_restore(flags);
+
+	/*
+	 * Preset the result with the raw and inaccurate PIT
+	 * calibration value
+	 */
+	delta = (tr2 - tr1);
+	do_div(delta, 50);
+	tsc_khz_val = delta;
+
+	/* hpet or pmtimer available ? */
+	if (!hpet && !pm1 && !pm2) {
+		printk(KERN_INFO "TSC calibrated against PIT\n");
+		goto out;
+	}
+
+	/* Check, whether the sampling was disturbed by an SMI */
+	if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) {
+		printk(KERN_WARNING "TSC calibration disturbed by SMI, "
+				"using PIT calibration result\n");
+		goto out;
+	}
+
+	tsc2 = (tsc2 - tsc1) * 1000000LL;
+
+	if (hpet) {
+		printk(KERN_INFO "TSC calibrated against HPET\n");
+		if (hpet2 < hpet1)
+			hpet2 += 0x100000000ULL;
+		hpet2 -= hpet1;
+		tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
+		do_div(tsc1, 1000000);
+	} else {
+		printk(KERN_INFO "TSC calibrated against PM_TIMER\n");
+		if (pm2 < pm1)
+			pm2 += (u64)ACPI_PM_OVRRUN;
+		pm2 -= pm1;
+		tsc1 = pm2 * 1000000000LL;
+		do_div(tsc1, PMTMR_TICKS_PER_SEC);
+	}
+
+	do_div(tsc2, tsc1);
+	tsc_khz_val = tsc2;
+
+out:
+	return tsc_khz_val;
+}
+
+unsigned long native_calculate_cpu_khz(void)
+{
+	return tsc_calibrate();
+}
+
+#ifdef CONFIG_X86_32
+/* Only called from the Powernow K7 cpu freq driver */
+int recalibrate_cpu_khz(void)
+{
+#ifndef CONFIG_SMP
+	unsigned long cpu_khz_old = cpu_khz;
+
+	if (cpu_has_tsc) {
+		cpu_khz = calculate_cpu_khz();
+		tsc_khz = cpu_khz;
+		cpu_data(0).loops_per_jiffy =
+			cpufreq_scale(cpu_data(0).loops_per_jiffy,
+					cpu_khz_old, cpu_khz);
+		return 0;
+	} else
+		return -ENODEV;
+#else
+	return -ENODEV;
+#endif
+}
+
+EXPORT_SYMBOL(recalibrate_cpu_khz);
+
+#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
index dc8990056d75..40c0aafb358d 100644
--- a/arch/x86/kernel/tsc_32.c
+++ b/arch/x86/kernel/tsc_32.c
@@ -42,7 +42,7 @@ extern int tsc_disabled;
 
 DEFINE_PER_CPU(unsigned long, cyc2ns);
 
-static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
+void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 {
 	unsigned long long tsc_now, ns_now;
 	unsigned long flags, *scale;
@@ -65,78 +65,6 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 	local_irq_restore(flags);
 }
 
-unsigned long native_calculate_cpu_khz(void)
-{
-	unsigned long long start, end;
-	unsigned long count;
-	u64 delta64 = (u64)ULLONG_MAX;
-	int i;
-	unsigned long flags;
-
-	local_irq_save(flags);
-
-	/* run 3 times to ensure the cache is warm and to get an accurate reading */
-	for (i = 0; i < 3; i++) {
-		mach_prepare_counter();
-		rdtscll(start);
-		mach_countup(&count);
-		rdtscll(end);
-
-		/*
-		 * Error: ECTCNEVERSET
-		 * The CTC wasn't reliable: we got a hit on the very first read,
-		 * or the CPU was so fast/slow that the quotient wouldn't fit in
-		 * 32 bits..
-		 */
-		if (count <= 1)
-			continue;
-
-		/* cpu freq too slow: */
-		if ((end - start) <= CALIBRATE_TIME_MSEC)
-			continue;
-
-		/*
-		 * We want the minimum time of all runs in case one of them
-		 * is inaccurate due to SMI or other delay
-		 */
-		delta64 = min(delta64, (end - start));
-	}
-
-	/* cpu freq too fast (or every run was bad): */
-	if (delta64 > (1ULL<<32))
-		goto err;
-
-	delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */
-	do_div(delta64,CALIBRATE_TIME_MSEC);
-
-	local_irq_restore(flags);
-	return (unsigned long)delta64;
-err:
-	local_irq_restore(flags);
-	return 0;
-}
-
-int recalibrate_cpu_khz(void)
-{
-#ifndef CONFIG_SMP
-	unsigned long cpu_khz_old = cpu_khz;
-
-	if (cpu_has_tsc) {
-		cpu_khz = calculate_cpu_khz();
-		tsc_khz = cpu_khz;
-		cpu_data(0).loops_per_jiffy =
-			cpufreq_scale(cpu_data(0).loops_per_jiffy,
-					cpu_khz_old, cpu_khz);
-		return 0;
-	} else
-		return -ENODEV;
-#else
-	return -ENODEV;
-#endif
-}
-
-EXPORT_SYMBOL(recalibrate_cpu_khz);
-
 #ifdef CONFIG_CPU_FREQ
 
 /*
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
index 69cbe4c9f050..c852ff9bd5d4 100644
--- a/arch/x86/kernel/tsc_64.c
+++ b/arch/x86/kernel/tsc_64.c
@@ -40,7 +40,7 @@ extern int tsc_disabled;
 
 DEFINE_PER_CPU(unsigned long, cyc2ns);
 
-static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
+void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 {
 	unsigned long long tsc_now, ns_now;
 	unsigned long flags, *scale;
@@ -130,98 +130,6 @@ core_initcall(cpufreq_tsc);
 
 #endif
 
-#define MAX_RETRIES	5
-#define SMI_TRESHOLD	50000
-
-/*
- * Read TSC and the reference counters. Take care of SMI disturbance
- */
-static unsigned long __init tsc_read_refs(unsigned long *pm,
-					  unsigned long *hpet)
-{
-	unsigned long t1, t2;
-	int i;
-
-	for (i = 0; i < MAX_RETRIES; i++) {
-		t1 = get_cycles();
-		if (hpet)
-			*hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
-		else
-			*pm = acpi_pm_read_early();
-		t2 = get_cycles();
-		if ((t2 - t1) < SMI_TRESHOLD)
-			return t2;
-	}
-	return ULONG_MAX;
-}
-
-/**
- * tsc_calibrate - calibrate the tsc on boot
- */
-void __init tsc_calibrate(void)
-{
-	unsigned long flags, tsc1, tsc2, tr1, tr2, pm1, pm2, hpet1, hpet2;
-	int hpet = is_hpet_enabled(), cpu;
-
-	local_irq_save(flags);
-
-	tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL);
-
-	outb((inb(0x61) & ~0x02) | 0x01, 0x61);
-
-	outb(0xb0, 0x43);
-	outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
-	outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42);
-	tr1 = get_cycles();
-	while ((inb(0x61) & 0x20) == 0);
-	tr2 = get_cycles();
-
-	tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL);
-
-	local_irq_restore(flags);
-
-	/*
-	 * Preset the result with the raw and inaccurate PIT
-	 * calibration value
-	 */
-	tsc_khz = (tr2 - tr1) / 50;
-
-	/* hpet or pmtimer available ? */
-	if (!hpet && !pm1 && !pm2) {
-		printk(KERN_INFO "TSC calibrated against PIT\n");
-		goto out;
-	}
-
-	/* Check, whether the sampling was disturbed by an SMI */
-	if (tsc1 == ULONG_MAX || tsc2 == ULONG_MAX) {
-		printk(KERN_WARNING "TSC calibration disturbed by SMI, "
-		       "using PIT calibration result\n");
-		goto out;
-	}
-
-	tsc2 = (tsc2 - tsc1) * 1000000L;
-
-	if (hpet) {
-		printk(KERN_INFO "TSC calibrated against HPET\n");
-		if (hpet2 < hpet1)
-			hpet2 += 0x100000000UL;
-		hpet2 -= hpet1;
-		tsc1 = (hpet2 * hpet_readl(HPET_PERIOD)) / 1000000;
-	} else {
-		printk(KERN_INFO "TSC calibrated against PM_TIMER\n");
-		if (pm2 < pm1)
-			pm2 += ACPI_PM_OVRRUN;
-		pm2 -= pm1;
-		tsc1 = (pm2 * 1000000000) / PMTMR_TICKS_PER_SEC;
-	}
-
-	tsc_khz = tsc2 / tsc1;
-
-out:
-	for_each_possible_cpu(cpu)
-		set_cyc2ns_scale(tsc_khz, cpu);
-}
-
 /*
  * Make an educated guess if the TSC is trustworthy and synchronized
  * over all CPUs.
diff --git a/include/asm-x86/hpet.h b/include/asm-x86/hpet.h
index 6a9b4ac59bf7..82f1ac641bd7 100644
--- a/include/asm-x86/hpet.h
+++ b/include/asm-x86/hpet.h
@@ -86,8 +86,8 @@ extern void hpet_unregister_irq_handler(rtc_irq_handler handler);
 #else /* CONFIG_HPET_TIMER */
 
 static inline int hpet_enable(void) { return 0; }
-static inline unsigned long hpet_readl(unsigned long a) { return 0; }
 static inline int is_hpet_enabled(void) { return 0; }
+#define hpet_readl(a) 0
 
 #endif
 #endif /* ASM_X86_HPET_H */
diff --git a/include/asm-x86/tsc.h b/include/asm-x86/tsc.h
index 548873ab5fc1..761054d7fefb 100644
--- a/include/asm-x86/tsc.h
+++ b/include/asm-x86/tsc.h
@@ -58,7 +58,6 @@ int check_tsc_unstable(void);
 extern void check_tsc_sync_source(int cpu);
 extern void check_tsc_sync_target(void);
 
-extern void tsc_calibrate(void);
 extern int notsc_setup(char *);
 
 #endif
-- 
cgit v1.2.3


From 2dbe06faf37b39f9ecffc054dd173b2a1dc2adcd Mon Sep 17 00:00:00 2001
From: Alok Kataria <akataria@vmware.com>
Date: Tue, 1 Jul 2008 11:43:31 -0700
Subject: x86: merge the TSC cpu-freq code

Unify the TSC cpufreq code.

Signed-off-by: Alok N Kataria <akataria@vmware.com>
Signed-off-by: Dan Hecht <dhecht@vmware.com>
Cc: Dan Hecht <dhecht@vmware.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tsc.c    | 114 +++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/tsc_32.c | 113 ----------------------------------------------
 arch/x86/kernel/tsc_64.c | 114 -----------------------------------------------
 3 files changed, 114 insertions(+), 227 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index e6ee14533c75..595f78a22212 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -4,6 +4,7 @@
 #include <linux/module.h>
 #include <linux/timer.h>
 #include <linux/acpi_pmtmr.h>
+#include <linux/cpufreq.h>
 
 #include <asm/hpet.h>
 
@@ -215,3 +216,116 @@ int recalibrate_cpu_khz(void)
 EXPORT_SYMBOL(recalibrate_cpu_khz);
 
 #endif /* CONFIG_X86_32 */
+
+/* Accelerators for sched_clock()
+ * convert from cycles(64bits) => nanoseconds (64bits)
+ *  basic equation:
+ *              ns = cycles / (freq / ns_per_sec)
+ *              ns = cycles * (ns_per_sec / freq)
+ *              ns = cycles * (10^9 / (cpu_khz * 10^3))
+ *              ns = cycles * (10^6 / cpu_khz)
+ *
+ *      Then we use scaling math (suggested by george@mvista.com) to get:
+ *              ns = cycles * (10^6 * SC / cpu_khz) / SC
+ *              ns = cycles * cyc2ns_scale / SC
+ *
+ *      And since SC is a constant power of two, we can convert the div
+ *  into a shift.
+ *
+ *  We can use khz divisor instead of mhz to keep a better precision, since
+ *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ *  (mathieu.desnoyers@polymtl.ca)
+ *
+ *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+
+DEFINE_PER_CPU(unsigned long, cyc2ns);
+
+void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
+{
+	unsigned long long tsc_now, ns_now;
+	unsigned long flags, *scale;
+
+	local_irq_save(flags);
+	sched_clock_idle_sleep_event();
+
+	scale = &per_cpu(cyc2ns, cpu);
+
+	rdtscll(tsc_now);
+	ns_now = __cycles_2_ns(tsc_now);
+
+	if (cpu_khz)
+		*scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
+
+	sched_clock_idle_wakeup_event(0);
+	local_irq_restore(flags);
+}
+
+#ifdef CONFIG_CPU_FREQ
+
+/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
+ * changes.
+ *
+ * RED-PEN: On SMP we assume all CPUs run with the same frequency.  It's
+ * not that important because current Opteron setups do not support
+ * scaling on SMP anyroads.
+ *
+ * Should fix up last_tsc too. Currently gettimeofday in the
+ * first tick after the change will be slightly wrong.
+ */
+
+static unsigned int  ref_freq;
+static unsigned long loops_per_jiffy_ref;
+static unsigned long tsc_khz_ref;
+
+static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+				void *data)
+{
+	struct cpufreq_freqs *freq = data;
+	unsigned long *lpj, dummy;
+
+	if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
+		return 0;
+
+	lpj = &dummy;
+	if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+#ifdef CONFIG_SMP
+		lpj = &cpu_data(freq->cpu).loops_per_jiffy;
+#else
+	lpj = &boot_cpu_data.loops_per_jiffy;
+#endif
+
+	if (!ref_freq) {
+		ref_freq = freq->old;
+		loops_per_jiffy_ref = *lpj;
+		tsc_khz_ref = tsc_khz;
+	}
+	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
+			(val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
+			(val == CPUFREQ_RESUMECHANGE)) {
+		*lpj = 	cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
+
+		tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
+		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+			mark_tsc_unstable("cpufreq changes");
+	}
+
+	set_cyc2ns_scale(tsc_khz_ref, freq->cpu);
+
+	return 0;
+}
+
+static struct notifier_block time_cpufreq_notifier_block = {
+	.notifier_call  = time_cpufreq_notifier
+};
+
+static int __init cpufreq_tsc(void)
+{
+	cpufreq_register_notifier(&time_cpufreq_notifier_block,
+				CPUFREQ_TRANSITION_NOTIFIER);
+	return 0;
+}
+
+core_initcall(cpufreq_tsc);
+
+#endif /* CONFIG_CPU_FREQ */
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
index 40c0aafb358d..bbc153d36f84 100644
--- a/arch/x86/kernel/tsc_32.c
+++ b/arch/x86/kernel/tsc_32.c
@@ -18,119 +18,6 @@
 extern int tsc_unstable;
 extern int tsc_disabled;
 
-/* Accelerators for sched_clock()
- * convert from cycles(64bits) => nanoseconds (64bits)
- *  basic equation:
- *		ns = cycles / (freq / ns_per_sec)
- *		ns = cycles * (ns_per_sec / freq)
- *		ns = cycles * (10^9 / (cpu_khz * 10^3))
- *		ns = cycles * (10^6 / cpu_khz)
- *
- *	Then we use scaling math (suggested by george@mvista.com) to get:
- *		ns = cycles * (10^6 * SC / cpu_khz) / SC
- *		ns = cycles * cyc2ns_scale / SC
- *
- *	And since SC is a constant power of two, we can convert the div
- *  into a shift.
- *
- *  We can use khz divisor instead of mhz to keep a better precision, since
- *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
- *  (mathieu.desnoyers@polymtl.ca)
- *
- *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
- */
-
-DEFINE_PER_CPU(unsigned long, cyc2ns);
-
-void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
-{
-	unsigned long long tsc_now, ns_now;
-	unsigned long flags, *scale;
-
-	local_irq_save(flags);
-	sched_clock_idle_sleep_event();
-
-	scale = &per_cpu(cyc2ns, cpu);
-
-	rdtscll(tsc_now);
-	ns_now = __cycles_2_ns(tsc_now);
-
-	if (cpu_khz)
-		*scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
-
-	/*
-	 * Start smoothly with the new frequency:
-	 */
-	sched_clock_idle_wakeup_event(0);
-	local_irq_restore(flags);
-}
-
-#ifdef CONFIG_CPU_FREQ
-
-/*
- * if the CPU frequency is scaled, TSC-based delays will need a different
- * loops_per_jiffy value to function properly.
- */
-static unsigned int ref_freq;
-static unsigned long loops_per_jiffy_ref;
-static unsigned long cpu_khz_ref;
-
-static int
-time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
-{
-	struct cpufreq_freqs *freq = data;
-
-	if (!ref_freq) {
-		if (!freq->old){
-			ref_freq = freq->new;
-			return 0;
-		}
-		ref_freq = freq->old;
-		loops_per_jiffy_ref = cpu_data(freq->cpu).loops_per_jiffy;
-		cpu_khz_ref = cpu_khz;
-	}
-
-	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
-	    (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
-	    (val == CPUFREQ_RESUMECHANGE)) {
-		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
-			cpu_data(freq->cpu).loops_per_jiffy =
-				cpufreq_scale(loops_per_jiffy_ref,
-						ref_freq, freq->new);
-
-		if (cpu_khz) {
-
-			if (num_online_cpus() == 1)
-				cpu_khz = cpufreq_scale(cpu_khz_ref,
-						ref_freq, freq->new);
-			if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
-				tsc_khz = cpu_khz;
-				set_cyc2ns_scale(cpu_khz, freq->cpu);
-				/*
-				 * TSC based sched_clock turns
-				 * to junk w/ cpufreq
-				 */
-				mark_tsc_unstable("cpufreq changes");
-			}
-		}
-	}
-
-	return 0;
-}
-
-static struct notifier_block time_cpufreq_notifier_block = {
-	.notifier_call	= time_cpufreq_notifier
-};
-
-static int __init cpufreq_tsc(void)
-{
-	return cpufreq_register_notifier(&time_cpufreq_notifier_block,
-					 CPUFREQ_TRANSITION_NOTIFIER);
-}
-core_initcall(cpufreq_tsc);
-
-#endif
-
 /* clock source code */
 
 static struct clocksource clocksource_tsc;
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
index c852ff9bd5d4..80a274b018c2 100644
--- a/arch/x86/kernel/tsc_64.c
+++ b/arch/x86/kernel/tsc_64.c
@@ -16,120 +16,6 @@
 extern int tsc_unstable;
 extern int tsc_disabled;
 
-/* Accelerators for sched_clock()
- * convert from cycles(64bits) => nanoseconds (64bits)
- *  basic equation:
- *		ns = cycles / (freq / ns_per_sec)
- *		ns = cycles * (ns_per_sec / freq)
- *		ns = cycles * (10^9 / (cpu_khz * 10^3))
- *		ns = cycles * (10^6 / cpu_khz)
- *
- *	Then we use scaling math (suggested by george@mvista.com) to get:
- *		ns = cycles * (10^6 * SC / cpu_khz) / SC
- *		ns = cycles * cyc2ns_scale / SC
- *
- *	And since SC is a constant power of two, we can convert the div
- *  into a shift.
- *
- *  We can use khz divisor instead of mhz to keep a better precision, since
- *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
- *  (mathieu.desnoyers@polymtl.ca)
- *
- *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
- */
-
-DEFINE_PER_CPU(unsigned long, cyc2ns);
-
-void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
-{
-	unsigned long long tsc_now, ns_now;
-	unsigned long flags, *scale;
-
-	local_irq_save(flags);
-	sched_clock_idle_sleep_event();
-
-	scale = &per_cpu(cyc2ns, cpu);
-
-	rdtscll(tsc_now);
-	ns_now = __cycles_2_ns(tsc_now);
-
-	if (cpu_khz)
-		*scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
-
-	sched_clock_idle_wakeup_event(0);
-	local_irq_restore(flags);
-}
-
-#ifdef CONFIG_CPU_FREQ
-
-/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
- * changes.
- *
- * RED-PEN: On SMP we assume all CPUs run with the same frequency.  It's
- * not that important because current Opteron setups do not support
- * scaling on SMP anyroads.
- *
- * Should fix up last_tsc too. Currently gettimeofday in the
- * first tick after the change will be slightly wrong.
- */
-
-static unsigned int  ref_freq;
-static unsigned long loops_per_jiffy_ref;
-static unsigned long tsc_khz_ref;
-
-static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
-				 void *data)
-{
-	struct cpufreq_freqs *freq = data;
-	unsigned long *lpj, dummy;
-
-	if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
-		return 0;
-
-	lpj = &dummy;
-	if (!(freq->flags & CPUFREQ_CONST_LOOPS))
-#ifdef CONFIG_SMP
-		lpj = &cpu_data(freq->cpu).loops_per_jiffy;
-#else
-		lpj = &boot_cpu_data.loops_per_jiffy;
-#endif
-
-	if (!ref_freq) {
-		ref_freq = freq->old;
-		loops_per_jiffy_ref = *lpj;
-		tsc_khz_ref = tsc_khz;
-	}
-	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
-		(val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
-		(val == CPUFREQ_RESUMECHANGE)) {
-		*lpj =
-		cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
-
-		tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
-		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
-			mark_tsc_unstable("cpufreq changes");
-	}
-
-	set_cyc2ns_scale(tsc_khz_ref, freq->cpu);
-
-	return 0;
-}
-
-static struct notifier_block time_cpufreq_notifier_block = {
-	.notifier_call  = time_cpufreq_notifier
-};
-
-static int __init cpufreq_tsc(void)
-{
-	cpufreq_register_notifier(&time_cpufreq_notifier_block,
-				  CPUFREQ_TRANSITION_NOTIFIER);
-	return 0;
-}
-
-core_initcall(cpufreq_tsc);
-
-#endif
-
 /*
  * Make an educated guess if the TSC is trustworthy and synchronized
  * over all CPUs.
-- 
cgit v1.2.3


From 8fbbc4b45ce3e4c0eeb15004c79c72b6896a79c2 Mon Sep 17 00:00:00 2001
From: Alok Kataria <akataria@vmware.com>
Date: Tue, 1 Jul 2008 11:43:34 -0700
Subject: x86: merge tsc_init and clocksource code

Unify the clocksource code.
Unify the tsc_init code.

Signed-off-by: Alok N Kataria <akataria@vmware.com>
Signed-off-by: Dan Hecht <dhecht@vmware.com>
Cc: Dan Hecht <dhecht@vmware.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/Makefile  |   2 +-
 arch/x86/kernel/time_64.c |  32 +------
 arch/x86/kernel/tsc.c     | 212 +++++++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kernel/tsc_32.c  | 188 ----------------------------------------
 arch/x86/kernel/tsc_64.c  | 106 -----------------------
 include/asm-x86/apic.h    |   7 +-
 include/asm-x86/delay.h   |   4 +
 include/asm-x86/time.h    |   2 +
 include/asm-x86/tsc.h     |   1 -
 9 files changed, 224 insertions(+), 330 deletions(-)
 delete mode 100644 arch/x86/kernel/tsc_32.c
 delete mode 100644 arch/x86/kernel/tsc_64.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index ca904ee17252..59b14c940a28 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -26,7 +26,7 @@ obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
 obj-y			+= alternative.o i8253.o pci-nommu.o
-obj-y			+= tsc_$(BITS).o io_delay.o rtc.o tsc.o
+obj-y			+= tsc.o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
 obj-y				+= process.o
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index c6ac4dad41f6..e3d49c553af2 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -56,7 +56,7 @@ static irqreturn_t timer_event_interrupt(int irq, void *dev_id)
 /* calibrate_cpu is used on systems with fixed rate TSCs to determine
  * processor frequency */
 #define TICK_COUNT 100000000
-static unsigned long __init calibrate_cpu(void)
+unsigned long __init calibrate_cpu(void)
 {
 	int tsc_start, tsc_now;
 	int i, no_ctr_free;
@@ -114,41 +114,13 @@ void __init hpet_time_init(void)
 	setup_irq(0, &irq0);
 }
 
-extern void set_cyc2ns_scale(unsigned long cpu_khz, int cpu);
-
 void __init time_init(void)
 {
-	int cpu;
-
-	cpu_khz = calculate_cpu_khz();
-	tsc_khz = cpu_khz;
-
-	if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
-			(boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
-		cpu_khz = calibrate_cpu();
-
-	lpj_fine = ((unsigned long)tsc_khz * 1000)/HZ;
-
-	if (unsynchronized_tsc())
-		mark_tsc_unstable("TSCs unsynchronized");
-
+	tsc_init();
 	if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
 		vgetcpu_mode = VGETCPU_RDTSCP;
 	else
 		vgetcpu_mode = VGETCPU_LSL;
 
-	printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
-			cpu_khz / 1000, cpu_khz % 1000);
-
-	/*
-	 * Secondary CPUs do not run through tsc_init(), so set up
-	 * all the scale factors for all CPUs, assuming the same
-	 * speed as the bootup CPU. (cpufreq notifiers will fix this
-	 * up if their speed diverges)
-	 */
-	for_each_possible_cpu(cpu)
-		set_cyc2ns_scale(cpu_khz, cpu);
-
-	init_tsc_clocksource();
 	late_time_init = choose_time_init();
 }
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 595f78a22212..94c16bdd5696 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -5,8 +5,16 @@
 #include <linux/timer.h>
 #include <linux/acpi_pmtmr.h>
 #include <linux/cpufreq.h>
+#include <linux/dmi.h>
+#include <linux/delay.h>
+#include <linux/clocksource.h>
+#include <linux/percpu.h>
 
 #include <asm/hpet.h>
+#include <asm/timer.h>
+#include <asm/vgtod.h>
+#include <asm/time.h>
+#include <asm/delay.h>
 
 unsigned int cpu_khz;           /* TSC clocks / usec, not used here */
 EXPORT_SYMBOL(cpu_khz);
@@ -16,12 +24,12 @@ EXPORT_SYMBOL(tsc_khz);
 /*
  * TSC can be unstable due to cpufreq or due to unsynced TSCs
  */
-int tsc_unstable;
+static int tsc_unstable;
 
 /* native_sched_clock() is called before tsc_init(), so
    we must start with the TSC soft disabled to prevent
    erroneous rdtsc usage on !cpu_has_tsc processors */
-int tsc_disabled = -1;
+static int tsc_disabled = -1;
 
 /*
  * Scheduler clock - returns current time in nanosec units.
@@ -241,7 +249,7 @@ EXPORT_SYMBOL(recalibrate_cpu_khz);
 
 DEFINE_PER_CPU(unsigned long, cyc2ns);
 
-void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
+static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 {
 	unsigned long long tsc_now, ns_now;
 	unsigned long flags, *scale;
@@ -329,3 +337,201 @@ static int __init cpufreq_tsc(void)
 core_initcall(cpufreq_tsc);
 
 #endif /* CONFIG_CPU_FREQ */
+
+/* clocksource code */
+
+static struct clocksource clocksource_tsc;
+
+/*
+ * We compare the TSC to the cycle_last value in the clocksource
+ * structure to avoid a nasty time-warp. This can be observed in a
+ * very small window right after one CPU updated cycle_last under
+ * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
+ * is smaller than the cycle_last reference value due to a TSC which
+ * is slighty behind. This delta is nowhere else observable, but in
+ * that case it results in a forward time jump in the range of hours
+ * due to the unsigned delta calculation of the time keeping core
+ * code, which is necessary to support wrapping clocksources like pm
+ * timer.
+ */
+static cycle_t read_tsc(void)
+{
+	cycle_t ret = (cycle_t)get_cycles();
+
+	return ret >= clocksource_tsc.cycle_last ?
+		ret : clocksource_tsc.cycle_last;
+}
+
+static cycle_t __vsyscall_fn vread_tsc(void)
+{
+	cycle_t ret = (cycle_t)vget_cycles();
+
+	return ret >= __vsyscall_gtod_data.clock.cycle_last ?
+		ret : __vsyscall_gtod_data.clock.cycle_last;
+}
+
+static struct clocksource clocksource_tsc = {
+	.name                   = "tsc",
+	.rating                 = 300,
+	.read                   = read_tsc,
+	.mask                   = CLOCKSOURCE_MASK(64),
+	.shift                  = 22,
+	.flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
+				  CLOCK_SOURCE_MUST_VERIFY,
+#ifdef CONFIG_X86_64
+	.vread                  = vread_tsc,
+#endif
+};
+
+void mark_tsc_unstable(char *reason)
+{
+	if (!tsc_unstable) {
+		tsc_unstable = 1;
+		printk("Marking TSC unstable due to %s\n", reason);
+		/* Change only the rating, when not registered */
+		if (clocksource_tsc.mult)
+			clocksource_change_rating(&clocksource_tsc, 0);
+		else
+			clocksource_tsc.rating = 0;
+	}
+}
+
+EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+
+static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
+{
+	printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
+			d->ident);
+	tsc_unstable = 1;
+	return 0;
+}
+
+/* List of systems that have known TSC problems */
+static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
+	{
+		.callback = dmi_mark_tsc_unstable,
+		.ident = "IBM Thinkpad 380XD",
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
+			DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
+		},
+	},
+	{}
+};
+
+/*
+ * Geode_LX - the OLPC CPU has a possibly a very reliable TSC
+ */
+#ifdef CONFIG_MGEODE_LX
+/* RTSC counts during suspend */
+#define RTSC_SUSP 0x100
+
+static void __init check_geode_tsc_reliable(void)
+{
+	unsigned long res_low, res_high;
+
+	rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
+	if (res_low & RTSC_SUSP)
+		clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+}
+#else
+static inline void check_geode_tsc_reliable(void) { }
+#endif
+
+/*
+ * Make an educated guess if the TSC is trustworthy and synchronized
+ * over all CPUs.
+ */
+__cpuinit int unsynchronized_tsc(void)
+{
+	if (!cpu_has_tsc || tsc_unstable)
+		return 1;
+
+#ifdef CONFIG_SMP
+	if (apic_is_clustered_box())
+		return 1;
+#endif
+
+	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+		return 0;
+	/*
+	 * Intel systems are normally all synchronized.
+	 * Exceptions must mark TSC as unstable:
+	 */
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
+		/* assume multi socket systems are not synchronized: */
+		if (num_possible_cpus() > 1)
+			tsc_unstable = 1;
+	}
+
+	return tsc_unstable;
+}
+
+static void __init init_tsc_clocksource(void)
+{
+	clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
+			clocksource_tsc.shift);
+	/* lower the rating if we already know its unstable: */
+	if (check_tsc_unstable()) {
+		clocksource_tsc.rating = 0;
+		clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
+	}
+	clocksource_register(&clocksource_tsc);
+}
+
+void __init tsc_init(void)
+{
+	u64 lpj;
+	int cpu;
+
+	if (!cpu_has_tsc)
+		return;
+
+	cpu_khz = calculate_cpu_khz();
+	tsc_khz = cpu_khz;
+
+	if (!cpu_khz) {
+		mark_tsc_unstable("could not calculate TSC khz");
+		return;
+	}
+
+#ifdef CONFIG_X86_64
+	if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
+			(boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
+		cpu_khz = calibrate_cpu();
+#endif
+
+	lpj = ((u64)tsc_khz * 1000);
+	do_div(lpj, HZ);
+	lpj_fine = lpj;
+
+	printk("Detected %lu.%03lu MHz processor.\n",
+			(unsigned long)cpu_khz / 1000,
+			(unsigned long)cpu_khz % 1000);
+
+	/*
+	 * Secondary CPUs do not run through tsc_init(), so set up
+	 * all the scale factors for all CPUs, assuming the same
+	 * speed as the bootup CPU. (cpufreq notifiers will fix this
+	 * up if their speed diverges)
+	 */
+	for_each_possible_cpu(cpu)
+		set_cyc2ns_scale(cpu_khz, cpu);
+
+	if (tsc_disabled > 0)
+		return;
+
+	/* now allow native_sched_clock() to use rdtsc */
+	tsc_disabled = 0;
+
+	use_tsc_delay();
+	/* Check and install the TSC clocksource */
+	dmi_check_system(bad_tsc_dmi_table);
+
+	if (unsynchronized_tsc())
+		mark_tsc_unstable("TSCs unsynchronized");
+
+	check_geode_tsc_reliable();
+	init_tsc_clocksource();
+}
+
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
deleted file mode 100644
index bbc153d36f84..000000000000
--- a/arch/x86/kernel/tsc_32.c
+++ /dev/null
@@ -1,188 +0,0 @@
-#include <linux/sched.h>
-#include <linux/clocksource.h>
-#include <linux/workqueue.h>
-#include <linux/delay.h>
-#include <linux/cpufreq.h>
-#include <linux/jiffies.h>
-#include <linux/init.h>
-#include <linux/dmi.h>
-#include <linux/percpu.h>
-
-#include <asm/delay.h>
-#include <asm/tsc.h>
-#include <asm/io.h>
-#include <asm/timer.h>
-
-#include "mach_timer.h"
-
-extern int tsc_unstable;
-extern int tsc_disabled;
-
-/* clock source code */
-
-static struct clocksource clocksource_tsc;
-
-/*
- * We compare the TSC to the cycle_last value in the clocksource
- * structure to avoid a nasty time-warp issue. This can be observed in
- * a very small window right after one CPU updated cycle_last under
- * xtime lock and the other CPU reads a TSC value which is smaller
- * than the cycle_last reference value due to a TSC which is slighty
- * behind. This delta is nowhere else observable, but in that case it
- * results in a forward time jump in the range of hours due to the
- * unsigned delta calculation of the time keeping core code, which is
- * necessary to support wrapping clocksources like pm timer.
- */
-static cycle_t read_tsc(void)
-{
-	cycle_t ret;
-
-	rdtscll(ret);
-
-	return ret >= clocksource_tsc.cycle_last ?
-		ret : clocksource_tsc.cycle_last;
-}
-
-static struct clocksource clocksource_tsc = {
-	.name			= "tsc",
-	.rating			= 300,
-	.read			= read_tsc,
-	.mask			= CLOCKSOURCE_MASK(64),
-	.mult			= 0, /* to be set */
-	.shift			= 22,
-	.flags			= CLOCK_SOURCE_IS_CONTINUOUS |
-				  CLOCK_SOURCE_MUST_VERIFY,
-};
-
-void mark_tsc_unstable(char *reason)
-{
-	if (!tsc_unstable) {
-		tsc_unstable = 1;
-		printk("Marking TSC unstable due to: %s.\n", reason);
-		/* Can be called before registration */
-		if (clocksource_tsc.mult)
-			clocksource_change_rating(&clocksource_tsc, 0);
-		else
-			clocksource_tsc.rating = 0;
-	}
-}
-EXPORT_SYMBOL_GPL(mark_tsc_unstable);
-
-static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
-{
-	printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
-	       d->ident);
-	tsc_unstable = 1;
-	return 0;
-}
-
-/* List of systems that have known TSC problems */
-static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
-	{
-	 .callback = dmi_mark_tsc_unstable,
-	 .ident = "IBM Thinkpad 380XD",
-	 .matches = {
-		     DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
-		     DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
-		     },
-	 },
-	 {}
-};
-
-/*
- * Make an educated guess if the TSC is trustworthy and synchronized
- * over all CPUs.
- */
-__cpuinit int unsynchronized_tsc(void)
-{
-	if (!cpu_has_tsc || tsc_unstable)
-		return 1;
-
-	/* Anything with constant TSC should be synchronized */
-	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
-		return 0;
-
-	/*
-	 * Intel systems are normally all synchronized.
-	 * Exceptions must mark TSC as unstable:
-	 */
-	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
-		/* assume multi socket systems are not synchronized: */
-		if (num_possible_cpus() > 1)
-			tsc_unstable = 1;
-	}
-	return tsc_unstable;
-}
-
-/*
- * Geode_LX - the OLPC CPU has a possibly a very reliable TSC
- */
-#ifdef CONFIG_MGEODE_LX
-/* RTSC counts during suspend */
-#define RTSC_SUSP 0x100
-
-static void __init check_geode_tsc_reliable(void)
-{
-	unsigned long res_low, res_high;
-
-	rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
-	if (res_low & RTSC_SUSP)
-		clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
-}
-#else
-static inline void check_geode_tsc_reliable(void) { }
-#endif
-
-
-void __init tsc_init(void)
-{
-	int cpu;
-	u64 lpj;
-
-	if (!cpu_has_tsc || tsc_disabled > 0)
-		return;
-
-	cpu_khz = calculate_cpu_khz();
-	tsc_khz = cpu_khz;
-
-	if (!cpu_khz) {
-		mark_tsc_unstable("could not calculate TSC khz");
-		return;
-	}
-
-	lpj = ((u64)tsc_khz * 1000);
-	do_div(lpj, HZ);
-	lpj_fine = lpj;
-
-	/* now allow native_sched_clock() to use rdtsc */
-	tsc_disabled = 0;
-
-	printk("Detected %lu.%03lu MHz processor.\n",
-				(unsigned long)cpu_khz / 1000,
-				(unsigned long)cpu_khz % 1000);
-
-	/*
-	 * Secondary CPUs do not run through tsc_init(), so set up
-	 * all the scale factors for all CPUs, assuming the same
-	 * speed as the bootup CPU. (cpufreq notifiers will fix this
-	 * up if their speed diverges)
-	 */
-	for_each_possible_cpu(cpu)
-		set_cyc2ns_scale(cpu_khz, cpu);
-
-	use_tsc_delay();
-
-	/* Check and install the TSC clocksource */
-	dmi_check_system(bad_tsc_dmi_table);
-
-	unsynchronized_tsc();
-	check_geode_tsc_reliable();
-	clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
-						    clocksource_tsc.shift);
-	/* lower the rating if we already know its unstable: */
-	if (check_tsc_unstable()) {
-		clocksource_tsc.rating = 0;
-		clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
-	}
-	clocksource_register(&clocksource_tsc);
-}
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
deleted file mode 100644
index 80a274b018c2..000000000000
--- a/arch/x86/kernel/tsc_64.c
+++ /dev/null
@@ -1,106 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/clocksource.h>
-#include <linux/time.h>
-#include <linux/acpi.h>
-#include <linux/cpufreq.h>
-#include <linux/acpi_pmtmr.h>
-
-#include <asm/hpet.h>
-#include <asm/timex.h>
-#include <asm/timer.h>
-#include <asm/vgtod.h>
-
-extern int tsc_unstable;
-extern int tsc_disabled;
-
-/*
- * Make an educated guess if the TSC is trustworthy and synchronized
- * over all CPUs.
- */
-__cpuinit int unsynchronized_tsc(void)
-{
-	if (tsc_unstable)
-		return 1;
-
-#ifdef CONFIG_SMP
-	if (apic_is_clustered_box())
-		return 1;
-#endif
-
-	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
-		return 0;
-
-	/* Assume multi socket systems are not synchronized */
-	return num_present_cpus() > 1;
-}
-
-static struct clocksource clocksource_tsc;
-
-/*
- * We compare the TSC to the cycle_last value in the clocksource
- * structure to avoid a nasty time-warp. This can be observed in a
- * very small window right after one CPU updated cycle_last under
- * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
- * is smaller than the cycle_last reference value due to a TSC which
- * is slighty behind. This delta is nowhere else observable, but in
- * that case it results in a forward time jump in the range of hours
- * due to the unsigned delta calculation of the time keeping core
- * code, which is necessary to support wrapping clocksources like pm
- * timer.
- */
-static cycle_t read_tsc(void)
-{
-	cycle_t ret = (cycle_t)get_cycles();
-
-	return ret >= clocksource_tsc.cycle_last ?
-		ret : clocksource_tsc.cycle_last;
-}
-
-static cycle_t __vsyscall_fn vread_tsc(void)
-{
-	cycle_t ret = (cycle_t)vget_cycles();
-
-	return ret >= __vsyscall_gtod_data.clock.cycle_last ?
-		ret : __vsyscall_gtod_data.clock.cycle_last;
-}
-
-static struct clocksource clocksource_tsc = {
-	.name			= "tsc",
-	.rating			= 300,
-	.read			= read_tsc,
-	.mask			= CLOCKSOURCE_MASK(64),
-	.shift			= 22,
-	.flags			= CLOCK_SOURCE_IS_CONTINUOUS |
-				  CLOCK_SOURCE_MUST_VERIFY,
-	.vread			= vread_tsc,
-};
-
-void mark_tsc_unstable(char *reason)
-{
-	if (!tsc_unstable) {
-		tsc_unstable = 1;
-		printk("Marking TSC unstable due to %s\n", reason);
-		/* Change only the rating, when not registered */
-		if (clocksource_tsc.mult)
-			clocksource_change_rating(&clocksource_tsc, 0);
-		else
-			clocksource_tsc.rating = 0;
-	}
-}
-EXPORT_SYMBOL_GPL(mark_tsc_unstable);
-
-void __init init_tsc_clocksource(void)
-{
-	if (tsc_disabled > 0)
-		return;
-
-	clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
-			clocksource_tsc.shift);
-	if (check_tsc_unstable())
-		clocksource_tsc.rating = 0;
-
-	clocksource_register(&clocksource_tsc);
-}
diff --git a/include/asm-x86/apic.h b/include/asm-x86/apic.h
index a29807737d3d..4e2c1e517f06 100644
--- a/include/asm-x86/apic.h
+++ b/include/asm-x86/apic.h
@@ -121,12 +121,17 @@ extern void enable_NMI_through_LVT0(void);
  */
 #ifdef CONFIG_X86_64
 extern void early_init_lapic_mapping(void);
+extern int apic_is_clustered_box(void);
+#else
+static inline int apic_is_clustered_box(void)
+{
+	return 0;
+}
 #endif
 
 extern u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask);
 extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask);
 
-extern int apic_is_clustered_box(void);
 
 #else /* !CONFIG_X86_LOCAL_APIC */
 static inline void lapic_shutdown(void) { }
diff --git a/include/asm-x86/delay.h b/include/asm-x86/delay.h
index 409a649204aa..bb80880c834b 100644
--- a/include/asm-x86/delay.h
+++ b/include/asm-x86/delay.h
@@ -26,6 +26,10 @@ extern void __delay(unsigned long loops);
 	((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
 	__ndelay(n))
 
+#ifdef CONFIG_X86_32
 void use_tsc_delay(void);
+#else
+#define use_tsc_delay() {}
+#endif
 
 #endif /* _ASM_X86_DELAY_H */
diff --git a/include/asm-x86/time.h b/include/asm-x86/time.h
index bce72d7a958c..a17fa473e91d 100644
--- a/include/asm-x86/time.h
+++ b/include/asm-x86/time.h
@@ -56,4 +56,6 @@ static inline int native_set_wallclock(unsigned long nowtime)
 
 #endif /* CONFIG_PARAVIRT */
 
+extern unsigned long __init calibrate_cpu(void);
+
 #endif
diff --git a/include/asm-x86/tsc.h b/include/asm-x86/tsc.h
index 761054d7fefb..cb6f6ee45b8f 100644
--- a/include/asm-x86/tsc.h
+++ b/include/asm-x86/tsc.h
@@ -48,7 +48,6 @@ static __always_inline cycles_t vget_cycles(void)
 extern void tsc_init(void);
 extern void mark_tsc_unstable(char *reason);
 extern int unsynchronized_tsc(void);
-extern void init_tsc_clocksource(void);
 int check_tsc_unstable(void);
 
 /*
-- 
cgit v1.2.3


From e93ef949fd9a3f237aedfb8e64414b28980530b8 Mon Sep 17 00:00:00 2001
From: Alok Kataria <akataria@vmware.com>
Date: Tue, 1 Jul 2008 11:43:36 -0700
Subject: x86: rename paravirtualized TSC functions

Rename the paravirtualized calculate_cpu_khz to calibrate_tsc.
In all cases, we actually calibrate_tsc and use that as the cpu_khz value.

Signed-off-by: Alok N Kataria <akataria@vmware.com>
Signed-off-by: Dan Hecht <dhecht@vmware.com>
Cc: Dan Hecht <dhecht@vmware.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/paravirt.c    |  2 +-
 arch/x86/kernel/tsc.c         | 18 +++++++-----------
 arch/x86/kernel/vmi_32.c      |  2 +-
 arch/x86/kernel/vmiclock_32.c |  4 ++--
 arch/x86/lguest/boot.c        |  4 ++--
 arch/x86/xen/enlighten.c      |  2 +-
 arch/x86/xen/time.c           |  4 ++--
 arch/x86/xen/xen-ops.h        |  2 +-
 include/asm-x86/paravirt.h    |  4 ++--
 include/asm-x86/timer.h       |  4 ++--
 include/asm-x86/vmi_time.h    |  2 +-
 11 files changed, 22 insertions(+), 26 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e7e5652f65bc..e0f571d58c19 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -285,7 +285,7 @@ struct pv_time_ops pv_time_ops = {
 	.get_wallclock = native_get_wallclock,
 	.set_wallclock = native_set_wallclock,
 	.sched_clock = native_sched_clock,
-	.get_cpu_khz = native_calculate_cpu_khz,
+	.get_tsc_khz = native_calibrate_tsc,
 };
 
 struct pv_irq_ops pv_irq_ops = {
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 94c16bdd5696..3c36f92160c9 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -123,9 +123,9 @@ static u64 __init tsc_read_refs(u64 *pm, u64 *hpet)
 }
 
 /**
- * tsc_calibrate - calibrate the tsc on boot
+ * native_calibrate_tsc - calibrate the tsc on boot
  */
-static unsigned int __init tsc_calibrate(void)
+unsigned long native_calibrate_tsc(void)
 {
 	unsigned long flags;
 	u64 tsc1, tsc2, tr1, tr2, delta, pm1, pm2, hpet1, hpet2;
@@ -195,10 +195,6 @@ out:
 	return tsc_khz_val;
 }
 
-unsigned long native_calculate_cpu_khz(void)
-{
-	return tsc_calibrate();
-}
 
 #ifdef CONFIG_X86_32
 /* Only called from the Powernow K7 cpu freq driver */
@@ -208,8 +204,8 @@ int recalibrate_cpu_khz(void)
 	unsigned long cpu_khz_old = cpu_khz;
 
 	if (cpu_has_tsc) {
-		cpu_khz = calculate_cpu_khz();
-		tsc_khz = cpu_khz;
+		tsc_khz = calibrate_tsc();
+		cpu_khz = tsc_khz;
 		cpu_data(0).loops_per_jiffy =
 			cpufreq_scale(cpu_data(0).loops_per_jiffy,
 					cpu_khz_old, cpu_khz);
@@ -487,10 +483,10 @@ void __init tsc_init(void)
 	if (!cpu_has_tsc)
 		return;
 
-	cpu_khz = calculate_cpu_khz();
-	tsc_khz = cpu_khz;
+	tsc_khz = calibrate_tsc();
+	cpu_khz = tsc_khz;
 
-	if (!cpu_khz) {
+	if (!tsc_khz) {
 		mark_tsc_unstable("could not calculate TSC khz");
 		return;
 	}
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 946bf13b44ab..b15346092b7b 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -932,7 +932,7 @@ static inline int __init activate_vmi(void)
 		pv_apic_ops.setup_secondary_clock = vmi_time_ap_init;
 #endif
 		pv_time_ops.sched_clock = vmi_sched_clock;
- 		pv_time_ops.get_cpu_khz = vmi_cpu_khz;
+		pv_time_ops.get_tsc_khz = vmi_tsc_khz;
 
 		/* We have true wallclock functions; disable CMOS clock sync */
 		no_sync_cmos_clock = 1;
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index ba7d19e102b1..6953859fe289 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -69,8 +69,8 @@ unsigned long long vmi_sched_clock(void)
 	return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
 }
 
-/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */
-unsigned long vmi_cpu_khz(void)
+/* paravirt_ops.get_tsc_khz = vmi_tsc_khz */
+unsigned long vmi_tsc_khz(void)
 {
 	unsigned long long khz;
 	khz = vmi_timer_ops.get_cycle_frequency();
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index e72cf0793fbe..50dad44fb542 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -607,7 +607,7 @@ static unsigned long lguest_get_wallclock(void)
  * what speed it runs at, or 0 if it's unusable as a reliable clock source.
  * This matches what we want here: if we return 0 from this function, the x86
  * TSC clock will give up and not register itself. */
-static unsigned long lguest_cpu_khz(void)
+static unsigned long lguest_tsc_khz(void)
 {
 	return lguest_data.tsc_khz;
 }
@@ -998,7 +998,7 @@ __init void lguest_init(void)
 	/* time operations */
 	pv_time_ops.get_wallclock = lguest_get_wallclock;
 	pv_time_ops.time_init = lguest_time_init;
-	pv_time_ops.get_cpu_khz = lguest_cpu_khz;
+	pv_time_ops.get_tsc_khz = lguest_tsc_khz;
 
 	/* Now is a good time to look at the implementations of these functions
 	 * before returning to the rest of lguest_init(). */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 3b980831602c..dcd4e51f2f16 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1062,7 +1062,7 @@ static const struct pv_time_ops xen_time_ops __initdata = {
 
 	.set_wallclock = xen_set_wallclock,
 	.get_wallclock = xen_get_wallclock,
-	.get_cpu_khz = xen_cpu_khz,
+	.get_tsc_khz = xen_tsc_khz,
 	.sched_clock = xen_sched_clock,
 };
 
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 64f0038b9558..685b77470fc3 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -197,8 +197,8 @@ unsigned long long xen_sched_clock(void)
 }
 
 
-/* Get the CPU speed from Xen */
-unsigned long xen_cpu_khz(void)
+/* Get the TSC speed from Xen */
+unsigned long xen_tsc_khz(void)
 {
 	u64 xen_khz = 1000000ULL << 32;
 	const struct pvclock_vcpu_time_info *info =
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9a055592a307..d852ddbb3448 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -32,7 +32,7 @@ void __init xen_build_dynamic_phys_to_machine(void);
 
 void xen_setup_timer(int cpu);
 void xen_setup_cpu_clockevents(void);
-unsigned long xen_cpu_khz(void);
+unsigned long xen_tsc_khz(void);
 void __init xen_time_init(void);
 unsigned long xen_get_wallclock(void);
 int xen_set_wallclock(unsigned long time);
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
index 6d8966f9d190..ef5e8ec6a6ab 100644
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -84,7 +84,7 @@ struct pv_time_ops {
 	int (*set_wallclock)(unsigned long);
 
 	unsigned long long (*sched_clock)(void);
-	unsigned long (*get_cpu_khz)(void);
+	unsigned long (*get_tsc_khz)(void);
 };
 
 struct pv_cpu_ops {
@@ -779,7 +779,7 @@ static inline unsigned long long paravirt_sched_clock(void)
 {
 	return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
 }
-#define calculate_cpu_khz() (pv_time_ops.get_cpu_khz())
+#define calibrate_tsc() (pv_time_ops.get_tsc_khz())
 
 static inline unsigned long long paravirt_read_pmc(int counter)
 {
diff --git a/include/asm-x86/timer.h b/include/asm-x86/timer.h
index 4f6fcb050c11..fb2a4ddddf3d 100644
--- a/include/asm-x86/timer.h
+++ b/include/asm-x86/timer.h
@@ -7,14 +7,14 @@
 #define TICK_SIZE (tick_nsec / 1000)
 
 unsigned long long native_sched_clock(void);
-unsigned long native_calculate_cpu_khz(void);
+unsigned long native_calibrate_tsc(void);
 
 extern int timer_ack;
 extern int no_timer_check;
 extern int recalibrate_cpu_khz(void);
 
 #ifndef CONFIG_PARAVIRT
-#define calculate_cpu_khz() native_calculate_cpu_khz()
+#define calibrate_tsc() native_calibrate_tsc()
 #endif
 
 /* Accelerators for sched_clock()
diff --git a/include/asm-x86/vmi_time.h b/include/asm-x86/vmi_time.h
index 478188130328..c3118c385156 100644
--- a/include/asm-x86/vmi_time.h
+++ b/include/asm-x86/vmi_time.h
@@ -50,7 +50,7 @@ extern void __init vmi_time_init(void);
 extern unsigned long vmi_get_wallclock(void);
 extern int vmi_set_wallclock(unsigned long now);
 extern unsigned long long vmi_sched_clock(void);
-extern unsigned long vmi_cpu_khz(void);
+extern unsigned long vmi_tsc_khz(void);
 
 #ifdef CONFIG_X86_LOCAL_APIC
 extern void __devinit vmi_time_bsp_init(void);
-- 
cgit v1.2.3


From a8c1be9d2e78d8608892c86791837acf12da4bf6 Mon Sep 17 00:00:00 2001
From: Alexander van Heukelum <heukelum@mailshack.com>
Date: Wed, 2 Jul 2008 01:29:44 +0200
Subject: x86: initial changes to unify traps_32.c and traps_64.c

This patch does not change the generated object files.

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/traps_32.c |  78 ++++++------
 arch/x86/kernel/traps_64.c | 309 ++++++++++++++++++++++-----------------------
 2 files changed, 191 insertions(+), 196 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index d7cc292691ff..92439698e489 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -1,5 +1,6 @@
 /*
  *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
  *
  *  Pentium III FXSR, SSE support
  *	Gareth Hughes <gareth@valinux.com>, May 2000
@@ -130,7 +131,8 @@ void printk_address(unsigned long address, int reliable)
 #endif
 }
 
-static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
+static inline int valid_stack_ptr(struct thread_info *tinfo,
+			void *p, unsigned int size)
 {
 	return	p > (void *)tinfo &&
 		p <= (void *)tinfo + THREAD_SIZE - size;
@@ -138,14 +140,14 @@ static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned s
 
 /* The form of the top of the frame on the stack */
 struct stack_frame {
-	struct stack_frame	*next_frame;
-	unsigned long		return_address;
+	struct stack_frame *next_frame;
+	unsigned long return_address;
 };
 
 static inline unsigned long
 print_context_stack(struct thread_info *tinfo,
-		    unsigned long *stack, unsigned long bp,
-		    const struct stacktrace_ops *ops, void *data)
+		unsigned long *stack, unsigned long bp,
+		const struct stacktrace_ops *ops, void *data)
 {
 	struct stack_frame *frame = (struct stack_frame *)bp;
 
@@ -167,8 +169,6 @@ print_context_stack(struct thread_info *tinfo,
 	return bp;
 }
 
-#define MSG(msg)		ops->warning(data, msg)
-
 void dump_trace(struct task_struct *task, struct pt_regs *regs,
 		unsigned long *stack, unsigned long bp,
 		const struct stacktrace_ops *ops, void *data)
@@ -178,7 +178,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 
 	if (!stack) {
 		unsigned long dummy;
-
 		stack = &dummy;
 		if (task != current)
 			stack = (unsigned long *)task->thread.sp;
@@ -196,7 +195,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 	}
 #endif
 
-	while (1) {
+	for (;;) {
 		struct thread_info *context;
 
 		context = (struct thread_info *)
@@ -248,10 +247,10 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
 }
 
 static const struct stacktrace_ops print_trace_ops = {
-	.warning		= print_trace_warning,
-	.warning_symbol		= print_trace_warning_symbol,
-	.stack			= print_trace_stack,
-	.address		= print_trace_address,
+	.warning = print_trace_warning,
+	.warning_symbol = print_trace_warning_symbol,
+	.stack = print_trace_stack,
+	.address = print_trace_address,
 };
 
 static void
@@ -351,15 +350,14 @@ void show_registers(struct pt_regs *regs)
 		printk(KERN_EMERG "Code: ");
 
 		ip = (u8 *)regs->ip - code_prologue;
-		if (ip < (u8 *)PAGE_OFFSET ||
-			probe_kernel_address(ip, c)) {
+		if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
 			/* try starting at EIP */
 			ip = (u8 *)regs->ip;
 			code_len = code_len - code_prologue + 1;
 		}
 		for (i = 0; i < code_len; i++, ip++) {
 			if (ip < (u8 *)PAGE_OFFSET ||
-				probe_kernel_address(ip, c)) {
+					probe_kernel_address(ip, c)) {
 				printk(" Bad EIP value.");
 				break;
 			}
@@ -546,7 +544,7 @@ void do_##name(struct pt_regs *regs, long error_code)			\
 {									\
 	trace_hardirqs_fixup();						\
 	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr)	\
-						== NOTIFY_STOP)		\
+							== NOTIFY_STOP)	\
 		return;							\
 	do_trap(trapnr, signr, str, 0, regs, error_code, NULL);		\
 }
@@ -562,7 +560,7 @@ void do_##name(struct pt_regs *regs, long error_code)			\
 	info.si_code = sicode;						\
 	info.si_addr = (void __user *)siaddr;				\
 	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr)	\
-						== NOTIFY_STOP)		\
+							== NOTIFY_STOP)	\
 		return;							\
 	do_trap(trapnr, signr, str, 0, regs, error_code, &info);	\
 }
@@ -571,7 +569,7 @@ void do_##name(struct pt_regs *regs, long error_code)			\
 void do_##name(struct pt_regs *regs, long error_code)			\
 {									\
 	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr)	\
-						== NOTIFY_STOP)		\
+							== NOTIFY_STOP)	\
 		return;							\
 	do_trap(trapnr, signr, str, 1, regs, error_code, NULL);		\
 }
@@ -586,22 +584,22 @@ void do_##name(struct pt_regs *regs, long error_code)			\
 	info.si_addr = (void __user *)siaddr;				\
 	trace_hardirqs_fixup();						\
 	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr)	\
-						== NOTIFY_STOP)		\
+							== NOTIFY_STOP)	\
 		return;							\
 	do_trap(trapnr, signr, str, 1, regs, error_code, &info);	\
 }
 
-DO_VM86_ERROR_INFO(0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->ip)
+DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
 #ifndef CONFIG_KPROBES
 DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
 #endif
 DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
 DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO(6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
-DO_ERROR(9, SIGFPE,  "coprocessor segment overrun", coprocessor_segment_overrun)
+DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
+DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
-DO_ERROR(11, SIGBUS,  "segment not present", segment_not_present)
-DO_ERROR(12, SIGBUS,  "stack segment", stack_segment)
+DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
+DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
 DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
 
@@ -799,7 +797,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 
 	if (!(reason & 0xc0)) {
 		if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
-							== NOTIFY_STOP)
+								== NOTIFY_STOP)
 			return;
 #ifdef CONFIG_X86_LOCAL_APIC
 		/*
@@ -818,6 +816,8 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 	}
 	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
 		return;
+
+	/* AK: following checks seem to be broken on modern chipsets. FIXME */
 	if (reason & 0x80)
 		mem_parity_error(reason, regs);
 	if (reason & 0x40)
@@ -915,7 +915,7 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
 	tsk->thread.debugctlmsr = 0;
 
 	if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
-					SIGTRAP) == NOTIFY_STOP)
+						SIGTRAP) == NOTIFY_STOP)
 		return;
 	/* It's safe to allow irq's after DR6 has been saved */
 	if (regs->flags & X86_EFLAGS_IF)
@@ -997,7 +997,7 @@ void math_error(void __user *ip)
 	 * C1 reg you need in case of a stack fault, 0x040 is the stack
 	 * fault bit.  We should only be taking one exception at a time,
 	 * so if this combination doesn't produce any single exception,
-	 * then we have a bad program that isn't syncronizing its FPU usage
+	 * then we have a bad program that isn't synchronizing its FPU usage
 	 * and it will suffer the consequences since we won't be able to
 	 * fully reproduce the context of the exception
 	 */
@@ -1006,7 +1006,7 @@ void math_error(void __user *ip)
 	switch (swd & ~cwd & 0x3f) {
 	case 0x000: /* No unmasked exception */
 		return;
-	default:    /* Multiple exceptions */
+	default: /* Multiple exceptions */
 		break;
 	case 0x001: /* Invalid Op */
 		/*
@@ -1198,16 +1198,16 @@ void __init trap_init(void)
 	early_iounmap(p, 4);
 #endif
 
-	set_trap_gate(0,  &divide_error);
-	set_intr_gate(1,  &debug);
-	set_intr_gate(2,  &nmi);
-	set_system_intr_gate(3, &int3); /* int3/4 can be called from all */
-	set_system_gate(4, &overflow);
-	set_trap_gate(5,  &bounds);
-	set_trap_gate(6,  &invalid_op);
-	set_trap_gate(7,  &device_not_available);
-	set_task_gate(8,  GDT_ENTRY_DOUBLEFAULT_TSS);
-	set_trap_gate(9,  &coprocessor_segment_overrun);
+	set_trap_gate(0, &divide_error);
+	set_intr_gate(1, &debug);
+	set_intr_gate(2, &nmi);
+	set_system_intr_gate(3, &int3); /* int3 can be called from all */
+	set_system_gate(4, &overflow); /* int4 can be called from all */
+	set_trap_gate(5, &bounds);
+	set_trap_gate(6, &invalid_op);
+	set_trap_gate(7, &device_not_available);
+	set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
+	set_trap_gate(9, &coprocessor_segment_overrun);
 	set_trap_gate(10, &invalid_TSS);
 	set_trap_gate(11, &segment_not_present);
 	set_trap_gate(12, &stack_segment);
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 80ba6d37bfe0..686074e6caf9 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -205,8 +205,6 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
 	return NULL;
 }
 
-#define MSG(txt) ops->warning(data, txt)
-
 /*
  * x86-64 can have up to three kernel stacks: 
  * process stack
@@ -233,11 +231,11 @@ struct stack_frame {
 	unsigned long return_address;
 };
 
-
-static inline unsigned long print_context_stack(struct thread_info *tinfo,
-				unsigned long *stack, unsigned long bp,
-				const struct stacktrace_ops *ops, void *data,
-				unsigned long *end)
+static inline unsigned long
+print_context_stack(struct thread_info *tinfo,
+		unsigned long *stack, unsigned long bp,
+		const struct stacktrace_ops *ops, void *data,
+		unsigned long *end)
 {
 	struct stack_frame *frame = (struct stack_frame *)bp;
 
@@ -259,7 +257,7 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo,
 	return bp;
 }
 
-void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
+void dump_trace(struct task_struct *task, struct pt_regs *regs,
 		unsigned long *stack, unsigned long bp,
 		const struct stacktrace_ops *ops, void *data)
 {
@@ -268,31 +266,29 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
 	unsigned used = 0;
 	struct thread_info *tinfo;
 
-	if (!tsk)
-		tsk = current;
-	tinfo = task_thread_info(tsk);
+	if (!task)
+		task = current;
+	tinfo = task_thread_info(task);
 
 	if (!stack) {
 		unsigned long dummy;
 		stack = &dummy;
-		if (tsk && tsk != current)
-			stack = (unsigned long *)tsk->thread.sp;
+		if (task && task != current)
+			stack = (unsigned long *)task->thread.sp;
 	}
 
 #ifdef CONFIG_FRAME_POINTER
 	if (!bp) {
-		if (tsk == current) {
+		if (task == current) {
 			/* Grab bp right from our regs */
-			asm("movq %%rbp, %0" : "=r" (bp):);
+			asm("movq %%rbp, %0" : "=r" (bp) :);
 		} else {
 			/* bp is the last reg pushed by switch_to */
-			bp = *(unsigned long *) tsk->thread.sp;
+			bp = *(unsigned long *) task->thread.sp;
 		}
 	}
 #endif
 
-
-
 	/*
 	 * Print function call entries in all stacks, starting at the
 	 * current stack address. If the stacks consist of nested
@@ -382,18 +378,17 @@ static const struct stacktrace_ops print_trace_ops = {
 	.address = print_trace_address,
 };
 
-void
-show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
-		unsigned long bp)
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *stack, unsigned long bp)
 {
 	printk("\nCall Trace:\n");
-	dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
+	dump_trace(task, regs, stack, bp, &print_trace_ops, NULL);
 	printk("\n");
 }
 
 static void
-_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
-							unsigned long bp)
+_show_stack(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *sp, unsigned long bp)
 {
 	unsigned long *stack;
 	int i;
@@ -405,14 +400,14 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
 	// back trace for this cpu.
 
 	if (sp == NULL) {
-		if (tsk)
-			sp = (unsigned long *)tsk->thread.sp;
+		if (task)
+			sp = (unsigned long *)task->thread.sp;
 		else
 			sp = (unsigned long *)&sp;
 	}
 
 	stack = sp;
-	for(i=0; i < kstack_depth_to_print; i++) {
+	for (i = 0; i < kstack_depth_to_print; i++) {
 		if (stack >= irqstack && stack <= irqstack_end) {
 			if (stack == irqstack_end) {
 				stack = (unsigned long *) (irqstack_end[-1]);
@@ -427,12 +422,12 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
 		printk(" %016lx", *stack++);
 		touch_nmi_watchdog();
 	}
-	show_trace(tsk, regs, sp, bp);
+	show_trace(task, regs, sp, bp);
 }
 
-void show_stack(struct task_struct *tsk, unsigned long * sp)
+void show_stack(struct task_struct *task, unsigned long *sp)
 {
-	_show_stack(tsk, NULL, sp, 0);
+	_show_stack(task, NULL, sp, 0);
 }
 
 /*
@@ -440,7 +435,7 @@ void show_stack(struct task_struct *tsk, unsigned long * sp)
  */
 void dump_stack(void)
 {
-	unsigned long dummy;
+	unsigned long stack;
 	unsigned long bp = 0;
 
 #ifdef CONFIG_FRAME_POINTER
@@ -453,7 +448,7 @@ void dump_stack(void)
 		init_utsname()->release,
 		(int)strcspn(init_utsname()->version, " "),
 		init_utsname()->version);
-	show_trace(NULL, NULL, &dummy, bp);
+	show_trace(NULL, NULL, &stack, bp);
 }
 
 EXPORT_SYMBOL(dump_stack);
@@ -488,7 +483,7 @@ void show_registers(struct pt_regs *regs)
 		printk(KERN_EMERG "Code: ");
 		if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
 			/* try starting at RIP */
-			ip = (u8 *) regs->ip;
+			ip = (u8 *)regs->ip;
 			code_len = code_len - code_prologue + 1;
 		}
 		for (i = 0; i < code_len; i++, ip++) {
@@ -504,7 +499,7 @@ void show_registers(struct pt_regs *regs)
 		}
 	}
 	printk("\n");
-}	
+}
 
 int is_valid_bugaddr(unsigned long ip)
 {
@@ -576,8 +571,10 @@ int __kprobes __die(const char * str, struct pt_regs * regs, long err)
 	printk("DEBUG_PAGEALLOC");
 #endif
 	printk("\n");
-	if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+	if (notify_die(DIE_OOPS, str, regs, err,
+			current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
 		return 1;
+
 	show_registers(regs);
 	add_taint(TAINT_DIE);
 	/* Executive summary in case the oops scrolled away */
@@ -589,7 +586,7 @@ int __kprobes __die(const char * str, struct pt_regs * regs, long err)
 	return 0;
 }
 
-void die(const char * str, struct pt_regs * regs, long err)
+void die(const char * str, struct pt_regs *regs, long err)
 {
 	unsigned long flags = oops_begin();
 
@@ -606,8 +603,7 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic)
 {
 	unsigned long flags;
 
-	if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) ==
-	    NOTIFY_STOP)
+	if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
 		return;
 
 	flags = oops_begin();
@@ -629,9 +625,9 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic)
 	do_exit(SIGBUS);
 }
 
-static void __kprobes do_trap(int trapnr, int signr, char *str,
-			      struct pt_regs * regs, long error_code,
-			      siginfo_t *info)
+static void __kprobes
+do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
+	long error_code, siginfo_t *info)
 {
 	struct task_struct *tsk = current;
 
@@ -676,38 +672,38 @@ static void __kprobes do_trap(int trapnr, int signr, char *str,
 }
 
 #define DO_ERROR(trapnr, signr, str, name) \
-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
-{ \
-	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
-							== NOTIFY_STOP) \
-		return; \
+asmlinkage void do_##name(struct pt_regs * regs, long error_code)	\
+{									\
+	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr)	\
+							== NOTIFY_STOP)	\
+		return;							\
 	conditional_sti(regs);						\
-	do_trap(trapnr, signr, str, regs, error_code, NULL); \
+	do_trap(trapnr, signr, str, regs, error_code, NULL);		\
 }
 
-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
-{ \
-	siginfo_t info; \
-	info.si_signo = signr; \
-	info.si_errno = 0; \
-	info.si_code = sicode; \
-	info.si_addr = (void __user *)siaddr; \
-	trace_hardirqs_fixup(); \
-	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
-							== NOTIFY_STOP) \
-		return; \
+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr)		\
+asmlinkage void do_##name(struct pt_regs * regs, long error_code)	\
+{									\
+	siginfo_t info;							\
+	info.si_signo = signr;						\
+	info.si_errno = 0;						\
+	info.si_code = sicode;						\
+	info.si_addr = (void __user *)siaddr;				\
+	trace_hardirqs_fixup();						\
+	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr)	\
+							== NOTIFY_STOP)	\
+		return;							\
 	conditional_sti(regs);						\
-	do_trap(trapnr, signr, str, regs, error_code, &info); \
+	do_trap(trapnr, signr, str, regs, error_code, &info);		\
 }
 
-DO_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->ip)
-DO_ERROR( 4, SIGSEGV, "overflow", overflow)
-DO_ERROR( 5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
-DO_ERROR( 9, SIGFPE,  "coprocessor segment overrun", coprocessor_segment_overrun)
+DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
+DO_ERROR(4, SIGSEGV, "overflow", overflow)
+DO_ERROR(5, SIGSEGV, "bounds", bounds)
+DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
+DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
-DO_ERROR(11, SIGBUS,  "segment not present", segment_not_present)
+DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
 
 /* Runs on IST stack */
@@ -775,14 +771,14 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
 }
 
 static notrace __kprobes void
-mem_parity_error(unsigned char reason, struct pt_regs * regs)
+mem_parity_error(unsigned char reason, struct pt_regs *regs)
 {
 	printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
 		reason);
 	printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
 
 #if defined(CONFIG_EDAC)
-	if(edac_handler_set()) {
+	if (edac_handler_set()) {
 		edac_atomic_assert_error();
 		return;
 	}
@@ -799,7 +795,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
 }
 
 static notrace __kprobes void
-io_check_error(unsigned char reason, struct pt_regs * regs)
+io_check_error(unsigned char reason, struct pt_regs *regs)
 {
 	printk("NMI: IOCK error (debug interrupt?)\n");
 	show_registers(regs);
@@ -836,7 +832,7 @@ asmlinkage notrace  __kprobes void default_do_nmi(struct pt_regs *regs)
 
 	cpu = smp_processor_id();
 
-	/* Only the BSP gets external NMIs from the system.  */
+	/* Only the BSP gets external NMIs from the system. */
 	if (!cpu)
 		reason = get_nmi_reason();
 
@@ -848,18 +844,17 @@ asmlinkage notrace  __kprobes void default_do_nmi(struct pt_regs *regs)
 		 * Ok, so this is none of the documented NMI sources,
 		 * so it must be the NMI watchdog.
 		 */
-		if (nmi_watchdog_tick(regs,reason))
+		if (nmi_watchdog_tick(regs, reason))
 			return;
-		if (!do_nmi_callback(regs,cpu))
+		if (!do_nmi_callback(regs, cpu))
 			unknown_nmi_error(reason, regs);
 
 		return;
 	}
 	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
-		return; 
+		return;
 
 	/* AK: following checks seem to be broken on modern chipsets. FIXME */
-
 	if (reason & 0x80)
 		mem_parity_error(reason, regs);
 	if (reason & 0x40)
@@ -870,9 +865,12 @@ asmlinkage notrace __kprobes void
 do_nmi(struct pt_regs *regs, long error_code)
 {
 	nmi_enter();
+
 	add_pda(__nmi_count, 1);
+
 	if (!ignore_nmis)
 		default_do_nmi(regs);
+
 	nmi_exit();
 }
 
@@ -889,13 +887,14 @@ void restart_nmi(void)
 }
 
 /* runs on IST stack. */
-asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
+asmlinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
 {
 	trace_hardirqs_fixup();
 
-	if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
+	if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
+			== NOTIFY_STOP)
 		return;
-	}
+
 	preempt_conditional_sti(regs);
 	do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
 	preempt_conditional_cli(regs);
@@ -948,21 +947,19 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
 
 	/* Mask out spurious debug traps due to lazy DR7 setting */
 	if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
-		if (!tsk->thread.debugreg7) { 
+		if (!tsk->thread.debugreg7)
 			goto clear_dr7;
-		}
 	}
 
 	tsk->thread.debugreg6 = condition;
 
-
 	/*
 	 * Single-stepping through TF: make sure we ignore any events in
 	 * kernel space (but re-enable TF when returning to user mode).
 	 */
 	if (condition & DR_STEP) {
-                if (!user_mode(regs))
-                       goto clear_TF_reenable;
+		if (!user_mode(regs))
+			goto clear_TF_reenable;
 	}
 
 	/* Ok, finally something we can handle */
@@ -975,7 +972,7 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
 	force_sig_info(SIGTRAP, &info, tsk);
 
 clear_dr7:
-	set_debugreg(0UL, 7);
+	set_debugreg(0, 7);
 	preempt_conditional_cli(regs);
 	return;
 
@@ -983,6 +980,7 @@ clear_TF_reenable:
 	set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
 	regs->flags &= ~X86_EFLAGS_TF;
 	preempt_conditional_cli(regs);
+	return;
 }
 
 static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
@@ -1005,7 +1003,7 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
 asmlinkage void do_coprocessor_error(struct pt_regs *regs)
 {
 	void __user *ip = (void __user *)(regs->ip);
-	struct task_struct * task;
+	struct task_struct *task;
 	siginfo_t info;
 	unsigned short cwd, swd;
 
@@ -1038,30 +1036,30 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs)
 	cwd = get_fpu_cwd(task);
 	swd = get_fpu_swd(task);
 	switch (swd & ~cwd & 0x3f) {
-		case 0x000:
-		default:
-			break;
-		case 0x001: /* Invalid Op */
-			/*
-			 * swd & 0x240 == 0x040: Stack Underflow
-			 * swd & 0x240 == 0x240: Stack Overflow
-			 * User must clear the SF bit (0x40) if set
-			 */
-			info.si_code = FPE_FLTINV;
-			break;
-		case 0x002: /* Denormalize */
-		case 0x010: /* Underflow */
-			info.si_code = FPE_FLTUND;
-			break;
-		case 0x004: /* Zero Divide */
-			info.si_code = FPE_FLTDIV;
-			break;
-		case 0x008: /* Overflow */
-			info.si_code = FPE_FLTOVF;
-			break;
-		case 0x020: /* Precision */
-			info.si_code = FPE_FLTRES;
-			break;
+	case 0x000: /* No unmasked exception */
+	default: /* Multiple exceptions */
+		break;
+	case 0x001: /* Invalid Op */
+		/*
+		 * swd & 0x240 == 0x040: Stack Underflow
+		 * swd & 0x240 == 0x240: Stack Overflow
+		 * User must clear the SF bit (0x40) if set
+		 */
+		info.si_code = FPE_FLTINV;
+		break;
+	case 0x002: /* Denormalize */
+	case 0x010: /* Underflow */
+		info.si_code = FPE_FLTUND;
+		break;
+	case 0x004: /* Zero Divide */
+		info.si_code = FPE_FLTDIV;
+		break;
+	case 0x008: /* Overflow */
+		info.si_code = FPE_FLTOVF;
+		break;
+	case 0x020: /* Precision */
+		info.si_code = FPE_FLTRES;
+		break;
 	}
 	force_sig_info(SIGFPE, &info, task);
 }
@@ -1074,7 +1072,7 @@ asmlinkage void bad_intr(void)
 asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
 {
 	void __user *ip = (void __user *)(regs->ip);
-	struct task_struct * task;
+	struct task_struct *task;
 	siginfo_t info;
 	unsigned short mxcsr;
 
@@ -1102,25 +1100,25 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
 	 */
 	mxcsr = get_fpu_mxcsr(task);
 	switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
-		case 0x000:
-		default:
-			break;
-		case 0x001: /* Invalid Op */
-			info.si_code = FPE_FLTINV;
-			break;
-		case 0x002: /* Denormalize */
-		case 0x010: /* Underflow */
-			info.si_code = FPE_FLTUND;
-			break;
-		case 0x004: /* Zero Divide */
-			info.si_code = FPE_FLTDIV;
-			break;
-		case 0x008: /* Overflow */
-			info.si_code = FPE_FLTOVF;
-			break;
-		case 0x020: /* Precision */
-			info.si_code = FPE_FLTRES;
-			break;
+	case 0x000:
+	default:
+		break;
+	case 0x001: /* Invalid Op */
+		info.si_code = FPE_FLTINV;
+		break;
+	case 0x002: /* Denormalize */
+	case 0x010: /* Underflow */
+		info.si_code = FPE_FLTUND;
+		break;
+	case 0x004: /* Zero Divide */
+		info.si_code = FPE_FLTDIV;
+		break;
+	case 0x008: /* Overflow */
+		info.si_code = FPE_FLTOVF;
+		break;
+	case 0x020: /* Precision */
+		info.si_code = FPE_FLTRES;
+		break;
 	}
 	force_sig_info(SIGFPE, &info, task);
 }
@@ -1138,7 +1136,7 @@ asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
 }
 
 /*
- *  'math_state_restore()' saves the current math information in the
+ * 'math_state_restore()' saves the current math information in the
  * old math state array, and gets the new ones from the current task
  *
  * Careful.. There are problems with IBM-designed IRQ13 behaviour.
@@ -1163,7 +1161,7 @@ asmlinkage void math_state_restore(void)
 		local_irq_disable();
 	}
 
-	clts();			/* Allow maths ops (or we recurse) */
+	clts();				/* Allow maths ops (or we recurse) */
 	restore_fpu_checking(&me->thread.xstate->fxsave);
 	task_thread_info(me)->status |= TS_USEDFPU;
 	me->fpu_counter++;
@@ -1172,64 +1170,61 @@ EXPORT_SYMBOL_GPL(math_state_restore);
 
 void __init trap_init(void)
 {
-	set_intr_gate(0,&divide_error);
-	set_intr_gate_ist(1,&debug,DEBUG_STACK);
-	set_intr_gate_ist(2,&nmi,NMI_STACK);
- 	set_system_gate_ist(3,&int3,DEBUG_STACK); /* int3 can be called from all */
-	set_system_gate(4,&overflow);	/* int4 can be called from all */
-	set_intr_gate(5,&bounds);
-	set_intr_gate(6,&invalid_op);
-	set_intr_gate(7,&device_not_available);
-	set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK);
-	set_intr_gate(9,&coprocessor_segment_overrun);
-	set_intr_gate(10,&invalid_TSS);
-	set_intr_gate(11,&segment_not_present);
-	set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK);
-	set_intr_gate(13,&general_protection);
-	set_intr_gate(14,&page_fault);
-	set_intr_gate(15,&spurious_interrupt_bug);
-	set_intr_gate(16,&coprocessor_error);
-	set_intr_gate(17,&alignment_check);
+	set_intr_gate(0, &divide_error);
+	set_intr_gate_ist(1, &debug, DEBUG_STACK);
+	set_intr_gate_ist(2, &nmi, NMI_STACK);
+ 	set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */
+	set_system_gate(4, &overflow); /* int4 can be called from all */
+	set_intr_gate(5, &bounds);
+	set_intr_gate(6, &invalid_op);
+	set_intr_gate(7, &device_not_available);
+	set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
+	set_intr_gate(9, &coprocessor_segment_overrun);
+	set_intr_gate(10, &invalid_TSS);
+	set_intr_gate(11, &segment_not_present);
+	set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
+	set_intr_gate(13, &general_protection);
+	set_intr_gate(14, &page_fault);
+	set_intr_gate(15, &spurious_interrupt_bug);
+	set_intr_gate(16, &coprocessor_error);
+	set_intr_gate(17, &alignment_check);
 #ifdef CONFIG_X86_MCE
-	set_intr_gate_ist(18,&machine_check, MCE_STACK); 
+	set_intr_gate_ist(18, &machine_check, MCE_STACK);
 #endif
-	set_intr_gate(19,&simd_coprocessor_error);
+	set_intr_gate(19, &simd_coprocessor_error);
 
 #ifdef CONFIG_IA32_EMULATION
 	set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
 #endif
-       
 	/*
 	 * initialize the per thread extended state:
 	 */
         init_thread_xstate();
 	/*
-	 * Should be a barrier for any external CPU state.
+	 * Should be a barrier for any external CPU state:
 	 */
 	cpu_init();
 }
 
-
 static int __init oops_setup(char *s)
-{ 
+{
 	if (!s)
 		return -EINVAL;
 	if (!strcmp(s, "panic"))
 		panic_on_oops = 1;
 	return 0;
-} 
+}
 early_param("oops", oops_setup);
 
 static int __init kstack_setup(char *s)
 {
 	if (!s)
 		return -EINVAL;
-	kstack_depth_to_print = simple_strtoul(s,NULL,0);
+	kstack_depth_to_print = simple_strtoul(s, NULL, 0);
 	return 0;
 }
 early_param("kstack", kstack_setup);
 
-
 static int __init code_bytes_setup(char *s)
 {
 	code_bytes = simple_strtoul(s, NULL, 0);
-- 
cgit v1.2.3


From badc76527f7e29302f0bde3d366c59101fb2ab87 Mon Sep 17 00:00:00 2001
From: Alexander van Heukelum <heukelum@mailshack.com>
Date: Wed, 2 Jul 2008 01:30:30 +0200
Subject: x86: traps_xx: shuffle headers and globals

Reorder headers and collect globals in traps_32.c and traps_64.c

Code size and data size are unaffected by the changes. Code
itself is changed due to different ordering of data and bss.
The bss segment changed size due to a change in the packing
of the variables.

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
Acked-by: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/traps_32.c |  9 +++----
 arch/x86/kernel/traps_64.c | 61 +++++++++++++++++++++++-----------------------
 2 files changed, 33 insertions(+), 37 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index 92439698e489..5339af459a38 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -61,8 +61,6 @@
 
 #include "mach_traps.h"
 
-int panic_on_unrecovered_nmi;
-
 DECLARE_BITMAP(used_vectors, NR_VECTORS);
 EXPORT_SYMBOL_GPL(used_vectors);
 
@@ -99,8 +97,11 @@ asmlinkage void alignment_check(void);
 asmlinkage void spurious_interrupt_bug(void);
 asmlinkage void machine_check(void);
 
+int panic_on_unrecovered_nmi;
 int kstack_depth_to_print = 24;
 static unsigned int code_bytes = 64;
+static int ignore_nmis;
+static int die_counter;
 
 void printk_address(unsigned long address, int reliable)
 {
@@ -382,8 +383,6 @@ int is_valid_bugaddr(unsigned long ip)
 	return ud2 == 0x0b0f;
 }
 
-static int die_counter;
-
 int __kprobes __die(const char *str, struct pt_regs *regs, long err)
 {
 	unsigned short ss;
@@ -829,8 +828,6 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 	reassert_nmi();
 }
 
-static int ignore_nmis;
-
 notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
 {
 	int cpu;
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 686074e6caf9..03d63b0f5b4b 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -10,49 +10,49 @@
  * 'Traps.c' handles hardware traps and faults after we have saved some
  * state in 'entry.S'.
  */
-#include <linux/sched.h>
+#include <linux/moduleparam.h>
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/spinlock.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/utsname.h>
+#include <linux/kdebug.h>
 #include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
 #include <linux/string.h>
+#include <linux/unwind.h>
+#include <linux/delay.h>
 #include <linux/errno.h>
-#include <linux/ptrace.h>
+#include <linux/kexec.h>
+#include <linux/sched.h>
 #include <linux/timer.h>
-#include <linux/mm.h>
 #include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/spinlock.h>
-#include <linux/interrupt.h>
-#include <linux/kallsyms.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/nmi.h>
-#include <linux/kprobes.h>
-#include <linux/kexec.h>
-#include <linux/unwind.h>
-#include <linux/uaccess.h>
 #include <linux/bug.h>
-#include <linux/kdebug.h>
-#include <linux/utsname.h>
-
-#include <mach_traps.h>
+#include <linux/nmi.h>
+#include <linux/mm.h>
 
 #if defined(CONFIG_EDAC)
 #include <linux/edac.h>
 #endif
 
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/atomic.h>
+#include <asm/stacktrace.h>
+#include <asm/processor.h>
 #include <asm/debugreg.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/unwind.h>
 #include <asm/desc.h>
 #include <asm/i387.h>
-#include <asm/processor.h>
-#include <asm/unwind.h>
+#include <asm/nmi.h>
 #include <asm/smp.h>
+#include <asm/io.h>
 #include <asm/pgalloc.h>
-#include <asm/pda.h>
 #include <asm/proto.h>
-#include <asm/nmi.h>
-#include <asm/stacktrace.h>
+#include <asm/pda.h>
+
+#include <mach_traps.h>
 
 asmlinkage void divide_error(void);
 asmlinkage void debug(void);
@@ -72,12 +72,14 @@ asmlinkage void page_fault(void);
 asmlinkage void coprocessor_error(void);
 asmlinkage void simd_coprocessor_error(void);
 asmlinkage void alignment_check(void);
-asmlinkage void machine_check(void);
 asmlinkage void spurious_interrupt_bug(void);
+asmlinkage void machine_check(void);
 
 int panic_on_unrecovered_nmi;
+int kstack_depth_to_print = 12;
 static unsigned int code_bytes = 64;
-static unsigned ignore_nmis;
+static int ignore_nmis;
+static int die_counter;
 
 static inline void conditional_sti(struct pt_regs *regs)
 {
@@ -101,8 +103,6 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 	dec_preempt_count();
 }
 
-int kstack_depth_to_print = 12;
-
 void printk_address(unsigned long address, int reliable)
 {
 #ifdef CONFIG_KALLSYMS
@@ -559,7 +559,6 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
 
 int __kprobes __die(const char * str, struct pt_regs * regs, long err)
 {
-	static int die_counter;
 	printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
 #ifdef CONFIG_PREEMPT
 	printk("PREEMPT ");
-- 
cgit v1.2.3


From e423f49fc8ccd761618748d7139638d8ddc1d16f Mon Sep 17 00:00:00 2001
From: Alexander van Heukelum <heukelum@mailshack.com>
Date: Wed, 2 Jul 2008 01:31:03 +0200
Subject: x86: traps_xx: modify __die

if (cond) block -> if (!cond) goto end_of_block

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
Acked-by: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/traps_32.c | 30 +++++++++++++-----------------
 arch/x86/kernel/traps_64.c |  4 ++--
 2 files changed, 15 insertions(+), 19 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index 5339af459a38..5afd94a01b98 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -399,26 +399,22 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
 	printk("DEBUG_PAGEALLOC");
 #endif
 	printk("\n");
-
 	if (notify_die(DIE_OOPS, str, regs, err,
-			current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) {
-
-		show_registers(regs);
-		/* Executive summary in case the oops scrolled away */
-		sp = (unsigned long) (&regs->sp);
-		savesegment(ss, ss);
-		if (user_mode(regs)) {
-			sp = regs->sp;
-			ss = regs->ss & 0xffff;
-		}
-		printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
-		print_symbol("%s", regs->ip);
-		printk(" SS:ESP %04x:%08lx\n", ss, sp);
+			current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+		return 1;
 
-		return 0;
+	show_registers(regs);
+	/* Executive summary in case the oops scrolled away */
+	sp = (unsigned long) (&regs->sp);
+	savesegment(ss, ss);
+	if (user_mode(regs)) {
+		sp = regs->sp;
+		ss = regs->ss & 0xffff;
 	}
-
-	return 1;
+	printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
+	print_symbol("%s", regs->ip);
+	printk(" SS:ESP %04x:%08lx\n", ss, sp);
+	return 0;
 }
 
 /*
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 03d63b0f5b4b..019a06fcfbdc 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -557,9 +557,9 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
 	do_exit(signr);
 }
 
-int __kprobes __die(const char * str, struct pt_regs * regs, long err)
+int __kprobes __die(const char *str, struct pt_regs *regs, long err)
 {
-	printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
+	printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
 #ifdef CONFIG_PREEMPT
 	printk("PREEMPT ");
 #endif
-- 
cgit v1.2.3


From a7bbb0ce1d1f956c8491b925c74767adf654f373 Mon Sep 17 00:00:00 2001
From: Alexander van Heukelum <heukelum@mailshack.com>
Date: Wed, 2 Jul 2008 01:31:34 +0200
Subject: x86: traps_xx: modify do_trap

if (cond) block -> if (!cond) goto end_of_block

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
Acked-by: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/traps_64.c | 56 +++++++++++++++++++++++-----------------------
 1 file changed, 28 insertions(+), 28 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 019a06fcfbdc..1c2d533fe7ff 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -630,38 +630,38 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
 {
 	struct task_struct *tsk = current;
 
-	if (user_mode(regs)) {
-		/*
-		 * We want error_code and trap_no set for userspace
-		 * faults and kernelspace faults which result in
-		 * die(), but not kernelspace faults which are fixed
-		 * up.  die() gives the process no chance to handle
-		 * the signal and notice the kernel fault information,
-		 * so that won't result in polluting the information
-		 * about previously queued, but not yet delivered,
-		 * faults.  See also do_general_protection below.
-		 */
-		tsk->thread.error_code = error_code;
-		tsk->thread.trap_no = trapnr;
-
-		if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
-		    printk_ratelimit()) {
-			printk(KERN_INFO
-			       "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
-			       tsk->comm, tsk->pid, str,
-			       regs->ip, regs->sp, error_code);
-			print_vma_addr(" in ", regs->ip);
-			printk("\n");
-		}
+	if (!user_mode(regs))
+		goto kernel_trap;
 
-		if (info)
-			force_sig_info(signr, info, tsk);
-		else
-			force_sig(signr, tsk);
-		return;
+	/*
+	 * We want error_code and trap_no set for userspace faults and
+	 * kernelspace faults which result in die(), but not
+	 * kernelspace faults which are fixed up.  die() gives the
+	 * process no chance to handle the signal and notice the
+	 * kernel fault information, so that won't result in polluting
+	 * the information about previously queued, but not yet
+	 * delivered, faults.  See also do_general_protection below.
+	 */
+	tsk->thread.error_code = error_code;
+	tsk->thread.trap_no = trapnr;
+
+	if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
+	    printk_ratelimit()) {
+		printk(KERN_INFO
+		       "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
+		       tsk->comm, tsk->pid, str,
+		       regs->ip, regs->sp, error_code);
+		print_vma_addr(" in ", regs->ip);
+		printk("\n");
 	}
 
+	if (info)
+		force_sig_info(signr, info, tsk);
+	else
+		force_sig(signr, tsk);
+	return;
 
+kernel_trap:
 	if (!fixup_exception(regs)) {
 		tsk->thread.error_code = error_code;
 		tsk->thread.trap_no = trapnr;
-- 
cgit v1.2.3


From 13485ab55bc77f02024e80bcca0374950b0becb3 Mon Sep 17 00:00:00 2001
From: Alexander van Heukelum <heukelum@mailshack.com>
Date: Wed, 2 Jul 2008 01:32:04 +0200
Subject: x86: traps_xx: restructure do_general_protection()

 - if (cond) block -> if (!cond) goto end_of_block
 - local caching of current

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
Acked-by: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/traps_32.c | 36 ++++++++++++++++++++----------------
 arch/x86/kernel/traps_64.c | 41 ++++++++++++++++++++++-------------------
 2 files changed, 42 insertions(+), 35 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index 5afd94a01b98..c9a0120e510c 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -598,8 +598,10 @@ DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
 DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
 
-void __kprobes do_general_protection(struct pt_regs *regs, long error_code)
+void __kprobes
+do_general_protection(struct pt_regs *regs, long error_code)
 {
+	struct task_struct *tsk;
 	struct thread_struct *thread;
 	struct tss_struct *tss;
 	int cpu;
@@ -640,23 +642,24 @@ void __kprobes do_general_protection(struct pt_regs *regs, long error_code)
 	if (regs->flags & X86_VM_MASK)
 		goto gp_in_vm86;
 
+	tsk = current;
 	if (!user_mode(regs))
 		goto gp_in_kernel;
 
-	current->thread.error_code = error_code;
-	current->thread.trap_no = 13;
+	tsk->thread.error_code = error_code;
+	tsk->thread.trap_no = 13;
 
-	if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
-	    printk_ratelimit()) {
+	if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+			printk_ratelimit()) {
 		printk(KERN_INFO
-		    "%s[%d] general protection ip:%lx sp:%lx error:%lx",
-		    current->comm, task_pid_nr(current),
-		    regs->ip, regs->sp, error_code);
+			"%s[%d] general protection ip:%lx sp:%lx error:%lx",
+			tsk->comm, task_pid_nr(tsk),
+			regs->ip, regs->sp, error_code);
 		print_vma_addr(" in ", regs->ip);
 		printk("\n");
 	}
 
-	force_sig(SIGSEGV, current);
+	force_sig(SIGSEGV, tsk);
 	return;
 
 gp_in_vm86:
@@ -665,14 +668,15 @@ gp_in_vm86:
 	return;
 
 gp_in_kernel:
-	if (!fixup_exception(regs)) {
-		current->thread.error_code = error_code;
-		current->thread.trap_no = 13;
-		if (notify_die(DIE_GPF, "general protection fault", regs,
+	if (fixup_exception(regs))
+		return;
+
+	tsk->thread.error_code = error_code;
+	tsk->thread.trap_no = 13;
+	if (notify_die(DIE_GPF, "general protection fault", regs,
 				error_code, 13, SIGSEGV) == NOTIFY_STOP)
-			return;
-		die("general protection fault", regs, error_code);
-	}
+		return;
+	die("general protection fault", regs, error_code);
 }
 
 static notrace __kprobes void
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 1c2d533fe7ff..bc21ddc97a02 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -733,31 +733,34 @@ asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
 		die(str, regs, error_code);
 }
 
-asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
-						long error_code)
+asmlinkage void __kprobes
+do_general_protection(struct pt_regs *regs, long error_code)
 {
-	struct task_struct *tsk = current;
+	struct task_struct *tsk;
 
 	conditional_sti(regs);
 
-	if (user_mode(regs)) {
-		tsk->thread.error_code = error_code;
-		tsk->thread.trap_no = 13;
-
-		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
-		    printk_ratelimit()) {
-			printk(KERN_INFO
-		       "%s[%d] general protection ip:%lx sp:%lx error:%lx",
-			       tsk->comm, tsk->pid,
-			       regs->ip, regs->sp, error_code);
-			print_vma_addr(" in ", regs->ip);
-			printk("\n");
-		}
+	tsk = current;
+	if (!user_mode(regs))
+		goto gp_in_kernel;
 
-		force_sig(SIGSEGV, tsk);
-		return;
-	} 
+	tsk->thread.error_code = error_code;
+	tsk->thread.trap_no = 13;
+
+	if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+			printk_ratelimit()) {
+		printk(KERN_INFO
+			"%s[%d] general protection ip:%lx sp:%lx error:%lx",
+			tsk->comm, tsk->pid,
+			regs->ip, regs->sp, error_code);
+		print_vma_addr(" in ", regs->ip);
+		printk("\n");
+	}
+
+	force_sig(SIGSEGV, tsk);
+	return;
 
+gp_in_kernel:
 	if (fixup_exception(regs))
 		return;
 
-- 
cgit v1.2.3


From abd348072798aa88d48fe9f182ac3440fcb7ae47 Mon Sep 17 00:00:00 2001
From: Alexander van Heukelum <heukelum@mailshack.com>
Date: Wed, 2 Jul 2008 18:39:01 +0200
Subject: x86: traps_xx: modify default_do_nmi

 - local caching of smp_processor_id() in default_do_nmi()
 - v2: do not split default_do_nmi over two lines

On Wed, Jul 02, 2008 at 08:12:20PM +0400, Cyrill Gorcunov wrote:
> | -static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
> | +static notrace __kprobes void
> | +default_do_nmi(struct pt_regs *regs)
> | [ ... ]
> | -asmlinkage notrace  __kprobes void default_do_nmi(struct pt_regs *regs)
> | +asmlinkage notrace __kprobes void
> | +default_do_nmi(struct pt_regs *regs)
>
> Hi Alexander, good done, thanks! But why did you split default_do_nmi
> definition by two lines? I think it would be better to keep them as it
> was before, ie by a single line
>
> 	static notrace __kprobes void default_do_nmi(struct pt_regs *regs)

Thanks! Here is the replacement patch with default_do_nmi left on
a single line.

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
Acked-by: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/traps_32.c | 9 ++++++---
 arch/x86/kernel/traps_64.c | 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index c9a0120e510c..fb8e3cce7bfb 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -789,9 +789,12 @@ void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
 static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 {
 	unsigned char reason = 0;
+	int cpu;
+
+	cpu = smp_processor_id();
 
-	/* Only the BSP gets external NMIs from the system: */
-	if (!smp_processor_id())
+	/* Only the BSP gets external NMIs from the system. */
+	if (!cpu)
 		reason = get_nmi_reason();
 
 	if (!(reason & 0xc0)) {
@@ -805,7 +808,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 		 */
 		if (nmi_watchdog_tick(regs, reason))
 			return;
-		if (!do_nmi_callback(regs, smp_processor_id()))
+		if (!do_nmi_callback(regs, cpu))
 			unknown_nmi_error(reason, regs);
 #else
 		unknown_nmi_error(reason, regs);
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index bc21ddc97a02..9b9245f81b9f 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -827,7 +827,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
 
 /* Runs on IST stack. This code must keep interrupts off all the time.
    Nested NMIs are prevented by the CPU. */
-asmlinkage notrace  __kprobes void default_do_nmi(struct pt_regs *regs)
+asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 {
 	unsigned char reason = 0;
 	int cpu;
-- 
cgit v1.2.3


From 7b4fd4bb2e7fad2c41afab4683a44ab77f6f35d0 Mon Sep 17 00:00:00 2001
From: Alexander van Heukelum <heukelum@mailshack.com>
Date: Wed, 2 Jul 2008 01:33:14 +0200
Subject: x86: traps_xx: various small changes

 - order of local variable declarations
 - minor code changes

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
Acked-by: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/traps_32.c | 15 +++++++--------
 arch/x86/kernel/traps_64.c | 20 +++++++++++---------
 2 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index fb8e3cce7bfb..8a768973c4f0 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -106,13 +106,13 @@ static int die_counter;
 void printk_address(unsigned long address, int reliable)
 {
 #ifdef CONFIG_KALLSYMS
-	char namebuf[KSYM_NAME_LEN];
 	unsigned long offset = 0;
 	unsigned long symsize;
 	const char *symname;
-	char reliab[4] = "";
-	char *delim = ":";
 	char *modname;
+	char *delim = ":";
+	char namebuf[KSYM_NAME_LEN];
+	char reliab[4] = "";
 
 	symname = kallsyms_lookup(address, &symsize, &offset,
 					&modname, namebuf);
@@ -135,8 +135,8 @@ void printk_address(unsigned long address, int reliable)
 static inline int valid_stack_ptr(struct thread_info *tinfo,
 			void *p, unsigned int size)
 {
-	return	p > (void *)tinfo &&
-		p <= (void *)tinfo + THREAD_SIZE - size;
+	void *t = tinfo;
+	return	p > t && p <= t + THREAD_SIZE - size;
 }
 
 /* The form of the top of the frame on the stack */
@@ -976,9 +976,8 @@ clear_TF_reenable:
 void math_error(void __user *ip)
 {
 	struct task_struct *task;
-	unsigned short cwd;
-	unsigned short swd;
 	siginfo_t info;
+	unsigned short cwd, swd;
 
 	/*
 	 * Save the info for the exception handler and clear the error.
@@ -1042,8 +1041,8 @@ void do_coprocessor_error(struct pt_regs *regs, long error_code)
 static void simd_math_error(void __user *ip)
 {
 	struct task_struct *task;
-	unsigned short mxcsr;
 	siginfo_t info;
+	unsigned short mxcsr;
 
 	/*
 	 * Save the info for the exception handler and clear the error.
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 9b9245f81b9f..74e992957ff6 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -268,7 +268,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 
 	if (!task)
 		task = current;
-	tinfo = task_thread_info(task);
 
 	if (!stack) {
 		unsigned long dummy;
@@ -294,6 +293,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 	 * current stack address. If the stacks consist of nested
 	 * exceptions
 	 */
+	tinfo = task_thread_info(task);
 	for (;;) {
 		char *id;
 		unsigned long *estack_end;
@@ -435,8 +435,8 @@ void show_stack(struct task_struct *task, unsigned long *sp)
  */
 void dump_stack(void)
 {
-	unsigned long stack;
 	unsigned long bp = 0;
+	unsigned long stack;
 
 #ifdef CONFIG_FRAME_POINTER
 	if (!bp)
@@ -459,12 +459,8 @@ void show_registers(struct pt_regs *regs)
 	unsigned long sp;
 	const int cpu = smp_processor_id();
 	struct task_struct *cur = cpu_pda(cpu)->pcurrent;
-	u8 *ip;
-	unsigned int code_prologue = code_bytes * 43 / 64;
-	unsigned int code_len = code_bytes;
 
 	sp = regs->sp;
-	ip = (u8 *) regs->ip - code_prologue;
 	printk("CPU %d ", cpu);
 	__show_regs(regs);
 	printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
@@ -475,12 +471,18 @@ void show_registers(struct pt_regs *regs)
 	 * time of the fault..
 	 */
 	if (!user_mode(regs)) {
+		unsigned int code_prologue = code_bytes * 43 / 64;
+		unsigned int code_len = code_bytes;
 		unsigned char c;
+		u8 *ip;
+
 		printk("Stack: ");
 		_show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
 		printk("\n");
 
 		printk(KERN_EMERG "Code: ");
+
+		ip = (u8 *)regs->ip - code_prologue;
 		if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
 			/* try starting at RIP */
 			ip = (u8 *)regs->ip;
@@ -585,7 +587,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
 	return 0;
 }
 
-void die(const char * str, struct pt_regs *regs, long err)
+void die(const char *str, struct pt_regs *regs, long err)
 {
 	unsigned long flags = oops_begin();
 
@@ -927,8 +929,8 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
 asmlinkage void __kprobes do_debug(struct pt_regs * regs,
 				   unsigned long error_code)
 {
-	unsigned long condition;
 	struct task_struct *tsk = current;
+	unsigned long condition;
 	siginfo_t info;
 
 	trace_hardirqs_fixup();
@@ -1201,7 +1203,7 @@ void __init trap_init(void)
 	/*
 	 * initialize the per thread extended state:
 	 */
-        init_thread_xstate();
+	init_thread_xstate();
 	/*
 	 * Should be a barrier for any external CPU state:
 	 */
-- 
cgit v1.2.3


From 6f585e01614f38195599c1bc5379d3c9dae6be47 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Fri, 4 Jul 2008 12:36:21 +0200
Subject: arch/x86/kernel/smpboot.c: fix warning

arch/x86/kernel/smpboot.c: In function 'do_boot_cpu':
arch/x86/kernel/smpboot.c:943: warning: label 'restore_state' defined but not used

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smpboot.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index e1200b202ed7..f35c2d8016ac 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -939,9 +939,9 @@ do_rest:
 				inquire_remote_apic(apicid);
 		}
 	}
-
+#ifdef CONFIG_X86_64
 restore_state:
-
+#endif
 	if (boot_error) {
 		/* Try to put things back the way they were before ... */
 		numa_remove_cpu(cpu); /* was set by numa_add_cpu */
-- 
cgit v1.2.3


From 0a4d8a472f645d99f86303db1462b64e371b090d Mon Sep 17 00:00:00 2001
From: Glauber Costa <gcosta@redhat.com>
Date: Tue, 24 Jun 2008 09:34:08 -0300
Subject: x86: provide delay loop for x86_64.

This is for consistency with i386. We call use_tsc_delay()
at tsc initialization for x86_64, so we'll be always using it.

Signed-off-by: Glauber Costa <gcosta@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tsc.c   |  1 +
 arch/x86/lib/delay_64.c | 44 ++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 41 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 3c36f92160c9..4a775d001957 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -513,6 +513,7 @@ void __init tsc_init(void)
 	 */
 	for_each_possible_cpu(cpu)
 		set_cyc2ns_scale(cpu_khz, cpu);
+	use_tsc_delay();
 
 	if (tsc_disabled > 0)
 		return;
diff --git a/arch/x86/lib/delay_64.c b/arch/x86/lib/delay_64.c
index 4c441be92641..d0326d07c845 100644
--- a/arch/x86/lib/delay_64.c
+++ b/arch/x86/lib/delay_64.c
@@ -22,13 +22,28 @@
 #include <asm/smp.h>
 #endif
 
-int __devinit read_current_timer(unsigned long *timer_value)
+/* simple loop based delay: */
+static void delay_loop(unsigned long loops)
 {
-	rdtscll(*timer_value);
-	return 0;
+	asm volatile(
+		"	test %0,%0	\n"
+		"	jz 3f		\n"
+		"	jmp 1f		\n"
+
+		".align 16		\n"
+		"1:	jmp 2f		\n"
+
+		".align 16		\n"
+		"2:	dec %0		\n"
+		"	jnz 2b		\n"
+		"3:	dec %0		\n"
+
+		: /* we don't need output */
+		:"a" (loops)
+	);
 }
 
-void __delay(unsigned long loops)
+static void delay_tsc(unsigned long loops)
 {
 	unsigned bclock, now;
 	int cpu;
@@ -63,6 +78,27 @@ void __delay(unsigned long loops)
 	}
 	preempt_enable();
 }
+
+static void (*delay_fn)(unsigned long) = delay_loop;
+
+void use_tsc_delay(void)
+{
+	delay_fn = delay_tsc;
+}
+
+int __devinit read_current_timer(unsigned long *timer_value)
+{
+	if (delay_fn == delay_tsc) {
+		rdtscll(*timer_value);
+		return 0;
+	}
+	return -1;
+}
+
+void __delay(unsigned long loops)
+{
+	delay_fn(loops);
+}
 EXPORT_SYMBOL(__delay);
 
 inline void __const_udelay(unsigned long xloops)
-- 
cgit v1.2.3


From 26ccb8a7183eed424ff9c874c83af20dafe7cdef Mon Sep 17 00:00:00 2001
From: Glauber Costa <gcosta@redhat.com>
Date: Tue, 24 Jun 2008 11:19:35 -0300
Subject: x86: rename threadinfo to TI.

This is for consistency with i386.

Signed-off-by: Glauber Costa <gcosta@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/ia32/ia32entry.S        | 25 ++++++++++++++-----------
 arch/x86/kernel/asm-offsets_64.c |  2 +-
 arch/x86/kernel/entry_64.S       | 23 ++++++++++++-----------
 arch/x86/lib/copy_user_64.S      |  4 ++--
 arch/x86/lib/getuser_64.S        |  8 ++++----
 arch/x86/lib/putuser_64.S        |  8 ++++----
 6 files changed, 37 insertions(+), 33 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 24e4d4928d65..20371d0635e4 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -116,7 +116,7 @@ ENTRY(ia32_sysenter_target)
 	pushfq
 	CFI_ADJUST_CFA_OFFSET 8
 	/*CFI_REL_OFFSET rflags,0*/
-	movl	8*3-THREAD_SIZE+threadinfo_sysenter_return(%rsp), %r10d
+	movl	8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d
 	CFI_REGISTER rip,r10
 	pushq	$__USER32_CS
 	CFI_ADJUST_CFA_OFFSET 8
@@ -136,8 +136,9 @@ ENTRY(ia32_sysenter_target)
  	.quad 1b,ia32_badarg
  	.previous	
 	GET_THREAD_INFO(%r10)
-	orl    $TS_COMPAT,threadinfo_status(%r10)
-	testl  $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+	orl    $TS_COMPAT,TI_status(%r10)
+	testl  $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
+		 TI_flags(%r10)
 	CFI_REMEMBER_STATE
 	jnz  sysenter_tracesys
 sysenter_do_call:	
@@ -149,9 +150,9 @@ sysenter_do_call:
 	GET_THREAD_INFO(%r10)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl	$_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
+	testl	$_TIF_ALLWORK_MASK,TI_flags(%r10)
 	jnz	int_ret_from_sys_call
-	andl    $~TS_COMPAT,threadinfo_status(%r10)
+	andl    $~TS_COMPAT,TI_status(%r10)
 	/* clear IF, that popfq doesn't enable interrupts early */
 	andl  $~0x200,EFLAGS-R11(%rsp) 
 	movl	RIP-R11(%rsp),%edx		/* User %eip */
@@ -240,8 +241,9 @@ ENTRY(ia32_cstar_target)
 	.quad 1b,ia32_badarg
 	.previous	
 	GET_THREAD_INFO(%r10)
-	orl   $TS_COMPAT,threadinfo_status(%r10)
-	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+	orl   $TS_COMPAT,TI_status(%r10)
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
+		TI_flags(%r10)
 	CFI_REMEMBER_STATE
 	jnz   cstar_tracesys
 cstar_do_call:	
@@ -253,9 +255,9 @@ cstar_do_call:
 	GET_THREAD_INFO(%r10)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
+	testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
 	jnz  int_ret_from_sys_call
-	andl $~TS_COMPAT,threadinfo_status(%r10)
+	andl $~TS_COMPAT,TI_status(%r10)
 	RESTORE_ARGS 1,-ARG_SKIP,1,1,1
 	movl RIP-ARGOFFSET(%rsp),%ecx
 	CFI_REGISTER rip,rcx
@@ -333,8 +335,9 @@ ENTRY(ia32_syscall)
 	   this could be a problem. */
 	SAVE_ARGS 0,0,1
 	GET_THREAD_INFO(%r10)
-	orl   $TS_COMPAT,threadinfo_status(%r10)
-	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+	orl   $TS_COMPAT,TI_status(%r10)
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
+		TI_flags(%r10)
 	jnz ia32_tracesys
 ia32_do_syscall:	
 	cmpl $(IA32_NR_syscalls-1),%eax
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 3295e7c08fe7..bacf5deeec2d 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -34,7 +34,7 @@ int main(void)
 	ENTRY(pid);
 	BLANK();
 #undef ENTRY
-#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry))
+#define ENTRY(entry) DEFINE(TI_ ## entry, offsetof(struct thread_info, entry))
 	ENTRY(flags);
 	ENTRY(addr_limit);
 	ENTRY(preempt_count);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 07d69f262337..466b9284ed2f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -168,13 +168,13 @@ ENTRY(ret_from_fork)
 	CFI_ADJUST_CFA_OFFSET -4
 	call schedule_tail
 	GET_THREAD_INFO(%rcx)
-	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
 	jnz rff_trace
 rff_action:	
 	RESTORE_REST
 	testl $3,CS-ARGOFFSET(%rsp)	# from kernel_thread?
 	je   int_ret_from_sys_call
-	testl $_TIF_IA32,threadinfo_flags(%rcx)
+	testl $_TIF_IA32,TI_flags(%rcx)
 	jnz  int_ret_from_sys_call
 	RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
 	jmp ret_from_sys_call
@@ -243,7 +243,8 @@ ENTRY(system_call_after_swapgs)
 	movq  %rcx,RIP-ARGOFFSET(%rsp)
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
 	GET_THREAD_INFO(%rcx)
-	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
+		TI_flags(%rcx)
 	jnz tracesys
 	cmpq $__NR_syscall_max,%rax
 	ja badsys
@@ -262,7 +263,7 @@ sysret_check:
 	GET_THREAD_INFO(%rcx)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	movl threadinfo_flags(%rcx),%edx
+	movl TI_flags(%rcx),%edx
 	andl %edi,%edx
 	jnz  sysret_careful 
 	CFI_REMEMBER_STATE
@@ -347,10 +348,10 @@ int_ret_from_sys_call:
 int_with_check:
 	LOCKDEP_SYS_EXIT_IRQ
 	GET_THREAD_INFO(%rcx)
-	movl threadinfo_flags(%rcx),%edx
+	movl TI_flags(%rcx),%edx
 	andl %edi,%edx
 	jnz   int_careful
-	andl    $~TS_COMPAT,threadinfo_status(%rcx)
+	andl    $~TS_COMPAT,TI_status(%rcx)
 	jmp   retint_swapgs
 
 	/* Either reschedule or signal or syscall exit tracking needed. */
@@ -558,7 +559,7 @@ retint_with_reschedule:
 	movl $_TIF_WORK_MASK,%edi
 retint_check:
 	LOCKDEP_SYS_EXIT_IRQ
-	movl threadinfo_flags(%rcx),%edx
+	movl TI_flags(%rcx),%edx
 	andl %edi,%edx
 	CFI_REMEMBER_STATE
 	jnz  retint_careful
@@ -654,9 +655,9 @@ retint_signal:
 	/* Returning to kernel space. Check if we need preemption */
 	/* rcx:	 threadinfo. interrupts off. */
 ENTRY(retint_kernel)
-	cmpl $0,threadinfo_preempt_count(%rcx)
+	cmpl $0,TI_preempt_count(%rcx)
 	jnz  retint_restore_args
-	bt  $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
+	bt  $TIF_NEED_RESCHED,TI_flags(%rcx)
 	jnc  retint_restore_args
 	bt   $9,EFLAGS-ARGOFFSET(%rsp)	/* interrupts off? */
 	jnc  retint_restore_args
@@ -819,7 +820,7 @@ paranoid_restore\trace:
 	jmp irq_return
 paranoid_userspace\trace:
 	GET_THREAD_INFO(%rcx)
-	movl threadinfo_flags(%rcx),%ebx
+	movl TI_flags(%rcx),%ebx
 	andl $_TIF_WORK_MASK,%ebx
 	jz paranoid_swapgs\trace
 	movq %rsp,%rdi			/* &pt_regs */
@@ -917,7 +918,7 @@ error_exit:
 	testl %eax,%eax
 	jne  retint_kernel
 	LOCKDEP_SYS_EXIT_IRQ
-	movl  threadinfo_flags(%rcx),%edx
+	movl  TI_flags(%rcx),%edx
 	movl  $_TIF_WORK_MASK,%edi
 	andl  %edi,%edx
 	jnz  retint_careful
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index ee1c3f635157..7eaaf0123b4d 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -40,7 +40,7 @@ ENTRY(copy_to_user)
 	movq %rdi,%rcx
 	addq %rdx,%rcx
 	jc  bad_to_user
-	cmpq threadinfo_addr_limit(%rax),%rcx
+	cmpq TI_addr_limit(%rax),%rcx
 	jae bad_to_user
 	xorl %eax,%eax	/* clear zero flag */
 	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
@@ -65,7 +65,7 @@ ENTRY(copy_from_user)
 	movq %rsi,%rcx
 	addq %rdx,%rcx
 	jc  bad_from_user
-	cmpq threadinfo_addr_limit(%rax),%rcx
+	cmpq TI_addr_limit(%rax),%rcx
 	jae  bad_from_user
 	movl $1,%ecx	/* set zero flag */
 	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
diff --git a/arch/x86/lib/getuser_64.S b/arch/x86/lib/getuser_64.S
index df37d3a9ba2a..0ec7890f9dcc 100644
--- a/arch/x86/lib/getuser_64.S
+++ b/arch/x86/lib/getuser_64.S
@@ -37,7 +37,7 @@
 ENTRY(__get_user_1)
 	CFI_STARTPROC
 	GET_THREAD_INFO(%rdx)
-	cmpq threadinfo_addr_limit(%rdx),%rax
+	cmpq TI_addr_limit(%rdx),%rax
 	jae bad_get_user
 1:	movzb (%rax),%edx
 	xorl %eax,%eax
@@ -50,7 +50,7 @@ ENTRY(__get_user_2)
 	addq $1,%rax
 	jc bad_get_user
 	GET_THREAD_INFO(%rdx)
-	cmpq threadinfo_addr_limit(%rdx),%rax
+	cmpq TI_addr_limit(%rdx),%rax
 	jae bad_get_user
 2:	movzwl -1(%rax),%edx
 	xorl %eax,%eax
@@ -63,7 +63,7 @@ ENTRY(__get_user_4)
 	addq $3,%rax
 	jc bad_get_user
 	GET_THREAD_INFO(%rdx)
-	cmpq threadinfo_addr_limit(%rdx),%rax
+	cmpq TI_addr_limit(%rdx),%rax
 	jae bad_get_user
 3:	movl -3(%rax),%edx
 	xorl %eax,%eax
@@ -76,7 +76,7 @@ ENTRY(__get_user_8)
 	addq $7,%rax
 	jc bad_get_user
 	GET_THREAD_INFO(%rdx)
-	cmpq threadinfo_addr_limit(%rdx),%rax
+	cmpq TI_addr_limit(%rdx),%rax
 	jae	bad_get_user
 4:	movq -7(%rax),%rdx
 	xorl %eax,%eax
diff --git a/arch/x86/lib/putuser_64.S b/arch/x86/lib/putuser_64.S
index 4989f5a8fa9b..940796fa0d98 100644
--- a/arch/x86/lib/putuser_64.S
+++ b/arch/x86/lib/putuser_64.S
@@ -35,7 +35,7 @@
 ENTRY(__put_user_1)
 	CFI_STARTPROC
 	GET_THREAD_INFO(%r8)
-	cmpq threadinfo_addr_limit(%r8),%rcx
+	cmpq TI_addr_limit(%r8),%rcx
 	jae bad_put_user
 1:	movb %dl,(%rcx)
 	xorl %eax,%eax
@@ -48,7 +48,7 @@ ENTRY(__put_user_2)
 	GET_THREAD_INFO(%r8)
 	addq $1,%rcx
 	jc 20f
-	cmpq threadinfo_addr_limit(%r8),%rcx
+	cmpq TI_addr_limit(%r8),%rcx
 	jae 20f
 	decq %rcx
 2:	movw %dx,(%rcx)
@@ -64,7 +64,7 @@ ENTRY(__put_user_4)
 	GET_THREAD_INFO(%r8)
 	addq $3,%rcx
 	jc 30f
-	cmpq threadinfo_addr_limit(%r8),%rcx
+	cmpq TI_addr_limit(%r8),%rcx
 	jae 30f
 	subq $3,%rcx
 3:	movl %edx,(%rcx)
@@ -80,7 +80,7 @@ ENTRY(__put_user_8)
 	GET_THREAD_INFO(%r8)
 	addq $7,%rcx
 	jc 40f
-	cmpq threadinfo_addr_limit(%r8),%rcx
+	cmpq TI_addr_limit(%r8),%rcx
 	jae 40f
 	subq $7,%rcx
 4:	movq %rdx,(%rcx)
-- 
cgit v1.2.3


From 2dc807b37b7b8c7df445513ad2b415df4ebcaf6d Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 8 Jul 2008 18:56:38 -0700
Subject: x86: make max_pfn cover acpi table below 4g

When system have 4g less ram installed, and acpi table sit
near end of ram, make max_pfn cover them too,
so 64bit kernel don't need to mess up fixmap.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: "Suresh Siddha" <suresh.b.siddha@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c  | 18 ++++++++++++------
 arch/x86/kernel/setup.c | 13 +++----------
 include/asm-x86/e820.h  |  2 +-
 3 files changed, 16 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index e07d4019e266..2e08619a9c5c 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1056,12 +1056,20 @@ unsigned long __initdata end_user_pfn = MAX_ARCH_PFN;
 /*
  * Find the highest page frame number we have available
  */
-unsigned long __init e820_end_of_ram(void)
+unsigned long __init e820_end(void)
 {
-	unsigned long last_pfn;
+	int i;
+	unsigned long last_pfn = 0;
 	unsigned long max_arch_pfn = MAX_ARCH_PFN;
 
-	last_pfn = find_max_pfn_with_active_regions();
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+		unsigned long end_pfn;
+
+		end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
+		if (end_pfn > last_pfn)
+			last_pfn = end_pfn;
+	}
 
 	if (last_pfn > max_arch_pfn)
 		last_pfn = max_arch_pfn;
@@ -1192,9 +1200,7 @@ static int __init parse_memmap_opt(char *p)
 		 * the real mem size before original memory map is
 		 * reset.
 		 */
-		e820_register_active_regions(0, 0, -1UL);
-		saved_max_pfn = e820_end_of_ram();
-		remove_all_active_ranges();
+		saved_max_pfn = e820_end();
 #endif
 		e820.nr_map = 0;
 		userdef = 1;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index bea8ae77d059..a7c3471ea17c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -709,22 +709,18 @@ void __init setup_arch(char **cmdline_p)
 	early_gart_iommu_check();
 #endif
 
-	e820_register_active_regions(0, 0, -1UL);
 	/*
 	 * partially used pages are not usable - thus
 	 * we are rounding upwards:
 	 */
-	max_pfn = e820_end_of_ram();
+	max_pfn = e820_end();
 
 	/* preallocate 4k for mptable mpc */
 	early_reserve_e820_mpc_new();
 	/* update e820 for memory not covered by WB MTRRs */
 	mtrr_bp_init();
-	if (mtrr_trim_uncached_memory(max_pfn)) {
-		remove_all_active_ranges();
-		e820_register_active_regions(0, 0, -1UL);
-		max_pfn = e820_end_of_ram();
-	}
+	if (mtrr_trim_uncached_memory(max_pfn))
+		max_pfn = e820_end();
 
 #ifdef CONFIG_X86_32
 	/* max_low_pfn get updated here */
@@ -767,9 +763,6 @@ void __init setup_arch(char **cmdline_p)
 	 */
 	acpi_boot_table_init();
 
-	/* Remove active ranges so rediscovery with NUMA-awareness happens */
-	remove_all_active_ranges();
-
 #ifdef CONFIG_ACPI_NUMA
 	/*
 	 * Parse SRAT to discover nodes.
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index a20d0a7f5892..78c03d7bf441 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -99,7 +99,7 @@ extern void free_early(u64 start, u64 end);
 extern void early_res_to_bootmem(u64 start, u64 end);
 extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
 
-extern unsigned long e820_end_of_ram(void);
+extern unsigned long e820_end(void);
 extern int e820_find_active_region(const struct e820entry *ei,
 				  unsigned long start_pfn,
 				  unsigned long last_pfn,
-- 
cgit v1.2.3


From e2079c43861f71b2deb78ee20e247ad954fdd67e Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 8 Jul 2008 16:12:26 +0200
Subject: x86: fix C1E && nx6325 stability problem

The problems are that, with the ACPI vs timer overring issue _fixed_,
after using the box for some time (between several seconds and 1 hour, at
random) processes get very high CPU loads (once I've got X using 107% of
the CPU, for example) and the system becomes unresponsive, as though there
were interrupts lost or something similar.

Andreas Herrman reproduced similar problems:

> Ok, now I've reproduced the stability problem.
> - Using tip/master,
> - reverting e38502eb8aa82314d5ab0eba45f50e6790dadd88 and
> - applying your patch from this posting
>   http://marc.info/?l=linux-kernel&m=121539354224562&w=4
>
> Starting X, firefox, gimp, tuxpaint and doing some drawing in tuxpaint
> results in a slow system. Drawing is almost not possible anymore --
> Selections of new colors, cursors etc. is performed with huge delay
> if it's performed at all.
>
> BTW, the code sets up timer IRQ as Virtual Wire IRQ:
>
> Jul  8 14:57:58 kodscha IO-APIC (apicid-pin) 2-22, 2-23 not connected.
> Jul  8 14:57:58 kodscha ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
> Jul  8 14:57:58 kodscha ...trying to set up timer as Virtual Wire IRQ... works.
>
> and both INT0 and INT2 of IOAPIC are masked:
>
> Jul  8 14:57:58 kodscha NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect:
> Jul  8 14:57:58 kodscha 00 000 1    0    0   0   0    0    0    00
> Jul  8 14:57:58 kodscha 01 003 0    0    0   0   0    1    1    31
> Jul  8 14:57:58 kodscha 02 003 1    0    0   0   0    0    0    30
>
> I've also seen strange CPU utilization -- with syslog-ng:
>
> top - 15:33:06 up 35 min,  4 users,  load average: 1.70, 0.68, 0.37
> Tasks:  64 total,   4 running,  60 sleeping,   0 stopped,   0 zombie
> Cpu0  :  0.0%us,100.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
> Cpu1  :  6.4%us, 87.2%sy,  0.0%ni,  5.8%id,  0.0%wa,  0.6%hi,  0.0%si,  0.0%st
> Mem:    895384k total,   283568k used,   611816k free,    35492k buffers
> Swap:  1959920k total,        0k used,  1959920k free,   163044k cached
>
>   PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
>  4632 root      20   0 17216  800  580 S  104  0.1   0:34.22 syslog-ng
> 28505 root      20   0  205m  11m 4024 S    6  1.3   0:21.16 X
> 28518 root      20   0 56292 5652 4492 S    1  0.6   0:01.80 fluxbox
>     1 root      20   0  3724  608  508 S    0  0.1   0:00.36 init
>
> So far I have no clue why C1E-idle in conjunction with virtual wire
> mode causes this strange behaviour.
>
> ... and I start to think about the root cause of all this.
>
> I've performed similar tests under X with the IRQ0/INT0 configuration and
> I did not see above symptoms.

So lets fall back to the IRQ0/INT0 configuration on this box.

This basically restores the dont-use-the-lapic-timer exception mechanism
that was unconditional on this box prior commit 8750bf5 ("x86: add C1E
aware idle function").

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/boot.c  | 43 +++++++++++++++++++++++++++++++++++++------
 arch/x86/kernel/io_apic_32.c | 10 ++++++++++
 arch/x86/kernel/io_apic_64.c | 10 ++++++++++
 include/asm-x86/genapic_32.h |  1 +
 include/asm-x86/genapic_64.h |  2 ++
 5 files changed, 60 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 5c0107602b62..e1f01394b681 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1373,8 +1373,6 @@ static void __init acpi_process_madt(void)
 	return;
 }
 
-#ifdef __i386__
-
 static int __init disable_acpi_irq(const struct dmi_system_id *d)
 {
 	if (!acpi_force) {
@@ -1435,6 +1433,17 @@ dmi_disable_irq0_through_ioapic(const struct dmi_system_id *d)
 	return 0;
 }
 
+/*
+ * Force ignoring BIOS IRQ0 pin2 override
+ */
+static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
+{
+	pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident);
+	acpi_skip_timer_override = 1;
+	force_mask_ioapic_irq_2();
+	return 0;
+}
+
 /*
  * If your system is blacklisted here, but you find that acpi=force
  * works for you, please contact acpi-devel@sourceforge.net
@@ -1628,11 +1637,35 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
 		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
 		     },
 	 },
+	/*
+	 * HP laptops which use a DSDT reporting as HP/SB400/10000,
+	 * which includes some code which overrides all temperature
+	 * trip points to 16C if the INTIN2 input of the I/O APIC
+	 * is enabled.  This input is incorrectly designated the
+	 * ISA IRQ 0 via an interrupt source override even though
+	 * it is wired to the output of the master 8259A and INTIN0
+	 * is not connected at all.  Force ignoring BIOS IRQ0 pin2
+	 * override in that cases.
+	 */
+	{
+	 .callback = dmi_ignore_irq0_timer_override,
+	 .ident = "HP NX6125 laptop",
+	 .matches = {
+		     DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6125"),
+		     },
+	 },
+	{
+	 .callback = dmi_ignore_irq0_timer_override,
+	 .ident = "HP NX6325 laptop",
+	 .matches = {
+		     DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
+		     },
+	 },
 	{}
 };
 
-#endif				/* __i386__ */
-
 /*
  * acpi_boot_table_init() and acpi_boot_init()
  *  called from setup_arch(), always.
@@ -1660,9 +1693,7 @@ int __init acpi_boot_table_init(void)
 {
 	int error;
 
-#ifdef __i386__
 	dmi_check_system(acpi_dmi_table);
-#endif
 
 	/*
 	 * If acpi_disabled, bail out
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 337ec3438a8f..6b220b9dcbb3 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -59,6 +59,13 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 static DEFINE_SPINLOCK(ioapic_lock);
 static DEFINE_SPINLOCK(vector_lock);
 
+static bool mask_ioapic_irq_2 __initdata;
+
+void __init force_mask_ioapic_irq_2(void)
+{
+	mask_ioapic_irq_2 = true;
+}
+
 int timer_through_8259 __initdata;
 
 /*
@@ -2172,6 +2179,9 @@ static inline void __init check_timer(void)
 	printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
 		vector, apic1, pin1, apic2, pin2);
 
+	if (mask_ioapic_irq_2)
+		mask_IO_APIC_irq(2);
+
 	/*
 	 * Some BIOS writers are clueless and report the ExtINTA
 	 * I/O APIC input from the cascaded 8259A as the timer
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 2b4c40bc12c9..0494cdb270c5 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -94,6 +94,13 @@ static int no_timer_check;
 
 static int disable_timer_pin_1 __initdata;
 
+static bool mask_ioapic_irq_2 __initdata;
+
+void __init force_mask_ioapic_irq_2(void)
+{
+	mask_ioapic_irq_2 = true;
+}
+
 int timer_through_8259 __initdata;
 
 /* Where if anywhere is the i8259 connect in external int mode */
@@ -1698,6 +1705,9 @@ static inline void __init check_timer(void)
 	apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
 		cfg->vector, apic1, pin1, apic2, pin2);
 
+	if (mask_ioapic_irq_2)
+		mask_IO_APIC_irq(2);
+
 	/*
 	 * Some BIOS writers are clueless and report the ExtINTA
 	 * I/O APIC input from the cascaded 8259A as the timer
diff --git a/include/asm-x86/genapic_32.h b/include/asm-x86/genapic_32.h
index b02ea6e17de8..8d4c8bdb9065 100644
--- a/include/asm-x86/genapic_32.h
+++ b/include/asm-x86/genapic_32.h
@@ -119,5 +119,6 @@ enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
 #define is_uv_system()			0
 #define uv_wakeup_secondary(a, b)	1
 
+extern void force_mask_ioapic_irq_2(void);
 
 #endif
diff --git a/include/asm-x86/genapic_64.h b/include/asm-x86/genapic_64.h
index 0f8504627c41..082ad020e412 100644
--- a/include/asm-x86/genapic_64.h
+++ b/include/asm-x86/genapic_64.h
@@ -46,4 +46,6 @@ extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip);
 
 extern void setup_apic_routing(void);
 
+extern void force_mask_ioapic_irq_2(void);
+
 #endif
-- 
cgit v1.2.3


From 183fe065652dbd64953afa9f389327e23e97967f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 9 Jul 2008 11:32:10 +0200
Subject: x86: build fix for "x86: fix C1E && nx6325 stability problem"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fix:

arch/x86/kernel/acpi/boot.c: In function ‘dmi_ignore_irq0_timer_override’:
arch/x86/kernel/acpi/boot.c:1443: error: implicit declaration of function ‘force_mask_ioapic_irq_2’

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/boot.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index e1f01394b681..bf7b4f7f60e1 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -37,6 +37,7 @@
 #include <asm/pgtable.h>
 #include <asm/io_apic.h>
 #include <asm/apic.h>
+#include <asm/genapic.h>
 #include <asm/io.h>
 #include <asm/mpspec.h>
 #include <asm/smp.h>
-- 
cgit v1.2.3


From c22d4c1885130db9c07f6441ab461208a1ba16b2 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 9 Jul 2008 03:01:14 -0700
Subject: x86: make e820_end return max ram type only for 32 bit

to avoid warning from find_low_pfn_range for high pages size etc

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 2e08619a9c5c..292ebc7fe4d0 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1066,6 +1066,11 @@ unsigned long __init e820_end(void)
 		struct e820entry *ei = &e820.map[i];
 		unsigned long end_pfn;
 
+#ifdef CONFIG_X86_32
+		if (ei->type != E820_RAM)
+			continue;
+#endif
+
 		end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
 		if (end_pfn > last_pfn)
 			last_pfn = end_pfn;
-- 
cgit v1.2.3


From a737abd11ac4eb9f4226fa8c9f1d9b5be12a96c1 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Sat, 5 Jul 2008 15:53:39 +0400
Subject: x86: e820 memmap - add checking for NULL early param

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: akpm@linux-foundation.org
Cc: andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 292ebc7fe4d0..66fd5bd78318 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1198,6 +1198,9 @@ static int __init parse_memmap_opt(char *p)
 	char *oldp;
 	u64 start_at, mem_size;
 
+	if (!p)
+		return -EINVAL;
+
 	if (!strcmp(p, "exactmap")) {
 #ifdef CONFIG_CRASH_DUMP
 		/*
-- 
cgit v1.2.3


From f34fa82b19581affffb14f8ad9bdad9b5ab4daf5 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 9 Jul 2008 20:16:36 -0700
Subject: x86, acpi: merge __acpi_map_table

and let 64-bit to fall back to use fixmap too.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/boot.c | 23 ++++++-----------------
 include/asm-x86/fixmap_64.h |  5 +++++
 2 files changed, 11 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index bf7b4f7f60e1..a31a579a47ca 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -109,21 +109,6 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
  */
 enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
 
-#ifdef	CONFIG_X86_64
-
-/* rely on all ACPI tables being in the direct mapping */
-char *__init __acpi_map_table(unsigned long phys_addr, unsigned long size)
-{
-	if (!phys_addr || !size)
-		return NULL;
-
-	if (phys_addr+size <= (max_pfn_mapped << PAGE_SHIFT) + PAGE_SIZE)
-		return __va(phys_addr);
-
-	return NULL;
-}
-
-#else
 
 /*
  * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
@@ -142,11 +127,15 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
 	unsigned long base, offset, mapped_size;
 	int idx;
 
-	if (phys + size < 8 * 1024 * 1024)
+	if (!phys || !size)
+		return NULL;
+
+	if (phys+size <= (max_pfn_mapped << PAGE_SHIFT))
 		return __va(phys);
 
 	offset = phys & (PAGE_SIZE - 1);
 	mapped_size = PAGE_SIZE - offset;
+	clear_fixmap(FIX_ACPI_END);
 	set_fixmap(FIX_ACPI_END, phys);
 	base = fix_to_virt(FIX_ACPI_END);
 
@@ -158,13 +147,13 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
 		if (--idx < FIX_ACPI_BEGIN)
 			return NULL;	/* cannot handle this */
 		phys += PAGE_SIZE;
+		clear_fixmap(idx);
 		set_fixmap(idx, phys);
 		mapped_size += PAGE_SIZE;
 	}
 
 	return ((unsigned char *)base + offset);
 }
-#endif
 
 #ifdef CONFIG_PCI_MMCONFIG
 /* The physical address of the MMCONFIG aperture.  Set from ACPI tables. */
diff --git a/include/asm-x86/fixmap_64.h b/include/asm-x86/fixmap_64.h
index 1a0b61eb02ff..6a4789d57e6c 100644
--- a/include/asm-x86/fixmap_64.h
+++ b/include/asm-x86/fixmap_64.h
@@ -12,6 +12,7 @@
 #define _ASM_FIXMAP_64_H
 
 #include <linux/kernel.h>
+#include <asm/acpi.h>
 #include <asm/apicdef.h>
 #include <asm/page.h>
 #include <asm/vsyscall.h>
@@ -49,6 +50,10 @@ enum fixed_addresses {
 #ifdef CONFIG_PARAVIRT
 	FIX_PARAVIRT_BOOTMAP,
 #endif
+#ifdef CONFIG_ACPI
+	FIX_ACPI_BEGIN,
+	FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
+#endif
 #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
 	FIX_OHCI1394_BASE,
 #endif
-- 
cgit v1.2.3


From 3d43ecd286e442792f2e899e6e06eb23ab3d99f6 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 9 Jul 2008 20:17:50 -0700
Subject: x86: make e820_end return end_of_ram again for 64bit

even on 64bit systems with less than 4G RAM, we can now use fixmap
to handle acpi SIT near end of ram.

change e820_end to e820_end_of_ram again?
or e820_ram_pfn?

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 66fd5bd78318..9836a079cfd9 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1066,10 +1066,8 @@ unsigned long __init e820_end(void)
 		struct e820entry *ei = &e820.map[i];
 		unsigned long end_pfn;
 
-#ifdef CONFIG_X86_32
 		if (ei->type != E820_RAM)
 			continue;
-#endif
 
 		end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
 		if (end_pfn > last_pfn)
-- 
cgit v1.2.3


From 2179bab7d431ab8ed539e19e029b1f6a231f4ed3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 10 Jul 2008 10:59:59 +0200
Subject: Revert "x86: fix IO APIC breakage on HP nx6325, v2"

This reverts commit a74a1cc3df0be89658bc735c8aed80c8392e2c15.

This was just temporary diagnostics commit - not needed now that we've
got the final fix.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_64.c | 28 ++--------------------------
 1 file changed, 2 insertions(+), 26 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 0494cdb270c5..83c953af9cd8 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -373,26 +373,6 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
 	entry->pin = pin;
 }
 
-/*
- * Reroute an IRQ to a different pin.
- */
-static void __init replace_pin_at_irq(unsigned int irq,
-				      int oldapic, int oldpin,
-				      int newapic, int newpin)
-{
-	struct irq_pin_list *entry = irq_2_pin + irq;
-
-	while (1) {
-		if (entry->apic == oldapic && entry->pin == oldpin) {
-			entry->apic = newapic;
-			entry->pin = newpin;
-		}
-		if (!entry->next)
-			break;
-		entry = irq_2_pin + entry->next;
-	}
-}
-
 
 #define DO_ACTION(name,R,ACTION, FINAL)					\
 									\
@@ -1724,11 +1704,6 @@ static inline void __init check_timer(void)
 		apic2 = apic1;
 	}
 
-	replace_pin_at_irq(0, 0, 0, apic1, pin1);
-	apic1 = 0;
-	pin1 = 0;
-	setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
-
 	if (pin1 != -1) {
 		/*
 		 * Ok, does IRQ0 through the IOAPIC work?
@@ -1760,9 +1735,10 @@ static inline void __init check_timer(void)
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+		/* replace_pin_at_irq(0, apic1, pin1, apic2, pin2); */
 		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
 		unmask_IO_APIC_irq(0);
+		clear_IO_APIC_pin(apic2, pin2);
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 			apic_printk(APIC_VERBOSE," works.\n");
-- 
cgit v1.2.3


From c9076b63191ec799ba6848ce5603fff109da57d2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 10 Jul 2008 11:00:50 +0200
Subject: Revert "x86: fix IO APIC breakage on HP nx6325"

This reverts commit 90221a61a71b7ad659d8741cf1e404506b174982.

This too was just temporary diagnostics - not needed now that we've
got the final fix via:

| commit e2079c43861f71b2deb78ee20e247ad954fdd67e
| Author: Rafael J. Wysocki <rjw@sisk.pl>
| Date:   Tue Jul 8 16:12:26 2008 +0200
|
|     x86: fix C1E && nx6325 stability problem

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_64.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 83c953af9cd8..f930885ac667 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -1738,7 +1738,6 @@ static inline void __init check_timer(void)
 		/* replace_pin_at_irq(0, apic1, pin1, apic2, pin2); */
 		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
 		unmask_IO_APIC_irq(0);
-		clear_IO_APIC_pin(apic2, pin2);
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 			apic_printk(APIC_VERBOSE," works.\n");
-- 
cgit v1.2.3


From 0b9f4f49e2abe787673de8f1f56f053fb30fec24 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Tue, 1 Jul 2008 01:19:31 +0100
Subject: x86: I/O APIC: Add a 64-bit variation of replace_pin_at_irq()

When an interrupt is rerouted to a different I/O APIC pin the relevant
entry of the irq_2_pin list should get updated accordingly so that
operations are performed on the correct redirection entry.

This is already done by the 32-bit variation of the code and here is a
complementing 64-bit implementation.  Should make someone's decision less
tough when merging the two. ;)

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_64.c | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index f930885ac667..848411753c7c 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -373,6 +373,26 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
 	entry->pin = pin;
 }
 
+/*
+ * Reroute an IRQ to a different pin.
+ */
+static void __init replace_pin_at_irq(unsigned int irq,
+				      int oldapic, int oldpin,
+				      int newapic, int newpin)
+{
+	struct irq_pin_list *entry = irq_2_pin + irq;
+
+	while (1) {
+		if (entry->apic == oldapic && entry->pin == oldpin) {
+			entry->apic = newapic;
+			entry->pin = newpin;
+		}
+		if (!entry->next)
+			break;
+		entry = irq_2_pin + entry->next;
+	}
+}
+
 
 #define DO_ACTION(name,R,ACTION, FINAL)					\
 									\
@@ -1735,7 +1755,7 @@ static inline void __init check_timer(void)
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		/* replace_pin_at_irq(0, apic1, pin1, apic2, pin2); */
+		replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
 		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
 		unmask_IO_APIC_irq(0);
 		enable_8259A_irq(0);
-- 
cgit v1.2.3


From a13b04af713bfa60d44cbac956ab00d3a5793de7 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Thu, 29 May 2008 10:05:08 -0700
Subject: x86 microcode: firmware data is const

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 arch/x86/kernel/microcode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
index 69729e38b78a..649dfd761f23 100644
--- a/arch/x86/kernel/microcode.c
+++ b/arch/x86/kernel/microcode.c
@@ -488,7 +488,7 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
 #define microcode_dev_exit() do { } while(0)
 #endif
 
-static long get_next_ucode_from_buffer(void **mc, void *buf,
+static long get_next_ucode_from_buffer(void **mc, const u8 *buf,
 	unsigned long size, long offset)
 {
 	microcode_header_t *mc_header;
@@ -522,7 +522,7 @@ static int cpu_request_microcode(int cpu)
 	char name[30];
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 	const struct firmware *firmware;
-	void *buf;
+	const u8 *buf;
 	unsigned long size;
 	long offset = 0;
 	int error;
-- 
cgit v1.2.3


From 3b33553badcde952adcf3b3ba5faae38d7d85071 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 10 Jul 2008 17:30:40 +0200
Subject: x86: add early quirk support

Add early quirks support.

In preparation of enabling the generic architecture to boot on a VISWS.

This will allow us to remove the VISWS subarch and all its complications.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c    | 11 +++++++++++
 arch/x86/kernel/mpparse.c | 20 ++++++++++++++++++--
 arch/x86/kernel/setup.c   |  1 +
 include/asm-x86/setup.h   | 17 +++++++++++++++++
 4 files changed, 47 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 9836a079cfd9..269d367d2ace 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1297,6 +1297,11 @@ void __init e820_reserve_resources(void)
 	}
 }
 
+/*
+ * Non-standard memory setup can be specified via this quirk:
+ */
+char * (*arch_memory_setup_quirk)(void);
+
 char *__init default_machine_specific_memory_setup(void)
 {
 	char *who = "BIOS-e820";
@@ -1337,6 +1342,12 @@ char *__init default_machine_specific_memory_setup(void)
 
 char *__init __attribute__((weak)) machine_specific_memory_setup(void)
 {
+	if (arch_memory_setup_quirk) {
+		char *who = arch_memory_setup_quirk();
+
+		if (who)
+			return who;
+	}
 	return default_machine_specific_memory_setup();
 }
 
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 8b6b1e05c306..3b25e49380c6 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -725,13 +725,23 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
 
 static struct intel_mp_floating *mpf_found;
 
+/*
+ * Machine specific quirk for finding the SMP config before other setup
+ * activities destroy the table:
+ */
+int (*mach_get_smp_config_quirk)(unsigned int early);
+
 /*
  * Scan the memory blocks for an SMP configuration block.
  */
-static void __init __get_smp_config(unsigned early)
+static void __init __get_smp_config(unsigned int early)
 {
 	struct intel_mp_floating *mpf = mpf_found;
 
+	if (mach_get_smp_config_quirk) {
+		if (mach_get_smp_config_quirk(early))
+			return;
+	}
 	if (acpi_lapic && early)
 		return;
 	/*
@@ -889,10 +899,16 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 	return 0;
 }
 
-static void __init __find_smp_config(unsigned reserve)
+int (*mach_find_smp_config_quirk)(unsigned int reserve);
+
+static void __init __find_smp_config(unsigned int reserve)
 {
 	unsigned int address;
 
+	if (mach_find_smp_config_quirk) {
+		if (mach_find_smp_config_quirk(reserve))
+			return;
+	}
 	/*
 	 * FIXME: Linux assumes you have 640K of base ram..
 	 * this continues the error...
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index a7c3471ea17c..cb3db406247c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -596,6 +596,7 @@ void __init setup_arch(char **cmdline_p)
 {
 #ifdef CONFIG_X86_32
 	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
+	visws_early_detect();
 	pre_setup_arch_hook();
 	early_cpu_init();
 #else
diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h
index 1d121c632d9e..1ad7eae0d9be 100644
--- a/include/asm-x86/setup.h
+++ b/include/asm-x86/setup.h
@@ -8,6 +8,23 @@
 /* Interrupt control for vSMPowered x86_64 systems */
 void vsmp_init(void);
 
+#ifdef CONFIG_X86_VISWS
+extern void visws_early_detect(void);
+#else
+static inline void visws_early_detect(void) { }
+#endif
+
+/*
+ * Any setup quirks to be performed?
+ */
+extern int (*arch_time_init_quirk)(void);
+extern int (*arch_pre_intr_init_quirk)(void);
+extern int (*arch_intr_init_quirk)(void);
+extern int (*arch_trap_init_quirk)(void);
+extern char * (*arch_memory_setup_quirk)(void);
+extern int (*mach_get_smp_config_quirk)(unsigned int early);
+extern int (*mach_find_smp_config_quirk)(unsigned int reserve);
+
 #ifndef CONFIG_PARAVIRT
 #define paravirt_post_allocator_init()	do {} while (0)
 #endif
-- 
cgit v1.2.3


From 22d5c67c5b0476e463ce4b632ba9ec3953d33a5f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 10 Jul 2008 16:29:28 +0200
Subject: x86, VisWS: turn into generic arch, make VisWS boot on a regular PC

first step: make the VISWS subarch boot on a regular PC.

We take various shortcuts for that. We copy the generic arch setup file over
into the VISWS setup file.

This is the only step that is not expected to boot on a real VISWS.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic_32.c        |   2 +-
 arch/x86/mach-visws/reboot.c     |   4 +-
 arch/x86/mach-visws/setup.c      | 249 +++++++++++++++++----------------------
 arch/x86/mach-visws/traps.c      |   6 +-
 arch/x86/mach-visws/visws_apic.c |   3 +
 arch/x86/pci/Makefile            |   2 +-
 arch/x86/pci/visws.c             |   6 +-
 drivers/pci/Makefile             |   2 +-
 8 files changed, 119 insertions(+), 155 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 3e947208b9d9..3e58b676d23b 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -974,7 +974,7 @@ void __cpuinit setup_local_APIC(void)
 	 * Double-check whether this APIC is really registered.
 	 */
 	if (!apic_id_registered())
-		BUG();
+		WARN_ON_ONCE(1);
 
 	/*
 	 * Intel recommends to set DFR, LDR and TPR before enabling
diff --git a/arch/x86/mach-visws/reboot.c b/arch/x86/mach-visws/reboot.c
index 99332abfad42..79c26eab8824 100644
--- a/arch/x86/mach-visws/reboot.c
+++ b/arch/x86/mach-visws/reboot.c
@@ -33,7 +33,7 @@ void machine_restart(char * __unused)
 void machine_power_off(void)
 {
 	unsigned short pm_status;
-	extern unsigned int pci_bus0;
+/*	extern unsigned int pci_bus0; */
 
 	while ((pm_status = inw(PMSTS_PORT)) & 0x100)
 		outw(pm_status, PMSTS_PORT);
@@ -45,7 +45,7 @@ void machine_power_off(void)
 #define PCI_CONF1_ADDRESS(bus, devfn, reg) \
 	(0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3))
 
-	outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8);
+/*	outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8); */
 	outl(PIIX_SPECIAL_STOP, 0xCFC);
 }
 
diff --git a/arch/x86/mach-visws/setup.c b/arch/x86/mach-visws/setup.c
index d67868ec9b7f..2f5e277686b8 100644
--- a/arch/x86/mach-visws/setup.c
+++ b/arch/x86/mach-visws/setup.c
@@ -1,183 +1,144 @@
 /*
- *  Unmaintained SGI Visual Workstation support.
- *  Split out from setup.c by davej@suse.de
+ *	Machine specific setup for generic
  */
 
 #include <linux/smp.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
-#include <linux/module.h>
-
-#include <asm/fixmap.h>
+#include <asm/acpi.h>
 #include <asm/arch_hooks.h>
-#include <asm/io.h>
 #include <asm/e820.h>
 #include <asm/setup.h>
-#include "cobalt.h"
-#include "piix4.h"
-
-int no_broadcast;
-
-char visws_board_type = -1;
-char visws_board_rev = -1;
-
-void __init visws_get_board_type_and_rev(void)
-{
-	int raw;
-
-	visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG)
-							 >> PIIX_GPI_BD_SHIFT;
-	/*
-	 * Get Board rev.
-	 * First, we have to initialize the 307 part to allow us access
-	 * to the GPIO registers.  Let's map them at 0x0fc0 which is right
-	 * after the PIIX4 PM section.
-	 */
-	outb_p(SIO_DEV_SEL, SIO_INDEX);
-	outb_p(SIO_GP_DEV, SIO_DATA);	/* Talk to GPIO regs. */
-
-	outb_p(SIO_DEV_MSB, SIO_INDEX);
-	outb_p(SIO_GP_MSB, SIO_DATA);	/* MSB of GPIO base address */
-
-	outb_p(SIO_DEV_LSB, SIO_INDEX);
-	outb_p(SIO_GP_LSB, SIO_DATA);	/* LSB of GPIO base address */
-
-	outb_p(SIO_DEV_ENB, SIO_INDEX);
-	outb_p(1, SIO_DATA);		/* Enable GPIO registers. */
-
-	/*
-	 * Now, we have to map the power management section to write
-	 * a bit which enables access to the GPIO registers.
-	 * What lunatic came up with this shit?
-	 */
-	outb_p(SIO_DEV_SEL, SIO_INDEX);
-	outb_p(SIO_PM_DEV, SIO_DATA);	/* Talk to GPIO regs. */
 
-	outb_p(SIO_DEV_MSB, SIO_INDEX);
-	outb_p(SIO_PM_MSB, SIO_DATA);	/* MSB of PM base address */
-
-	outb_p(SIO_DEV_LSB, SIO_INDEX);
-	outb_p(SIO_PM_LSB, SIO_DATA);	/* LSB of PM base address */
-
-	outb_p(SIO_DEV_ENB, SIO_INDEX);
-	outb_p(1, SIO_DATA);		/* Enable PM registers. */
-
-	/*
-	 * Now, write the PM register which enables the GPIO registers.
-	 */
-	outb_p(SIO_PM_FER2, SIO_PM_INDEX);
-	outb_p(SIO_PM_GP_EN, SIO_PM_DATA);
-
-	/*
-	 * Now, initialize the GPIO registers.
-	 * We want them all to be inputs which is the
-	 * power on default, so let's leave them alone.
-	 * So, let's just read the board rev!
-	 */
-	raw = inb_p(SIO_GP_DATA1);
-	raw &= 0x7f;	/* 7 bits of valid board revision ID. */
-
-	if (visws_board_type == VISWS_320) {
-		if (raw < 0x6) {
-			visws_board_rev = 4;
-		} else if (raw < 0xc) {
-			visws_board_rev = 5;
-		} else {
-			visws_board_rev = 6;
-		}
-	} else if (visws_board_type == VISWS_540) {
-			visws_board_rev = 2;
-		} else {
-			visws_board_rev = raw;
-		}
-
-	printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n",
-	       (visws_board_type == VISWS_320 ? "320" :
-	       (visws_board_type == VISWS_540 ? "540" :
-		"unknown")), visws_board_rev);
-}
+#ifdef CONFIG_HOTPLUG_CPU
+#define DEFAULT_SEND_IPI	(1)
+#else
+#define DEFAULT_SEND_IPI	(0)
+#endif
 
+int no_broadcast=DEFAULT_SEND_IPI;
+
+/**
+ * pre_intr_init_hook - initialisation prior to setting up interrupt vectors
+ *
+ * Description:
+ *	Perform any necessary interrupt initialisation prior to setting up
+ *	the "ordinary" interrupt call gates.  For legacy reasons, the ISA
+ *	interrupts should be initialised here if the machine emulates a PC
+ *	in any way.
+ **/
 void __init pre_intr_init_hook(void)
 {
-	init_VISWS_APIC_irqs();
+	init_ISA_irqs();
 }
 
+/*
+ * IRQ2 is cascade interrupt to second interrupt controller
+ */
+static struct irqaction irq2 = {
+	.handler = no_action,
+	.mask = CPU_MASK_NONE,
+	.name = "cascade",
+};
+
+/**
+ * intr_init_hook - post gate setup interrupt initialisation
+ *
+ * Description:
+ *	Fill in any interrupts that may have been left out by the general
+ *	init_IRQ() routine.  interrupts having to do with the machine rather
+ *	than the devices on the I/O bus (like APIC interrupts in intel MP
+ *	systems) are started here.
+ **/
 void __init intr_init_hook(void)
 {
 #ifdef CONFIG_X86_LOCAL_APIC
 	apic_intr_init();
 #endif
+
+	if (!acpi_ioapic)
+		setup_irq(2, &irq2);
+}
+
+/**
+ * pre_setup_arch_hook - hook called prior to any setup_arch() execution
+ *
+ * Description:
+ *	generally used to activate any machine specific identification
+ *	routines that may be needed before setup_arch() runs.  On VISWS
+ *	this is used to get the board revision and type.
+ **/
+void __init pre_setup_arch_hook(void)
+{
 }
 
-void __init pre_setup_arch_hook()
+/**
+ * trap_init_hook - initialise system specific traps
+ *
+ * Description:
+ *	Called as the final act of trap_init().  Used in VISWS to initialise
+ *	the various board specific APIC traps.
+ **/
+void __init trap_init_hook(void)
 {
-	visws_get_board_type_and_rev();
 }
 
-static struct irqaction irq0 = {
-	.handler =	timer_interrupt,
-	.flags =	IRQF_DISABLED | IRQF_IRQPOLL,
-	.name =		"timer",
+static struct irqaction irq0  = {
+	.handler = timer_interrupt,
+	.flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL,
+	.mask = CPU_MASK_NONE,
+	.name = "timer"
 };
 
+/**
+ * time_init_hook - do any specific initialisations for the system timer.
+ *
+ * Description:
+ *	Must plug the system timer interrupt source at HZ into the IRQ listed
+ *	in irq_vectors.h:TIMER_IRQ
+ **/
 void __init time_init_hook(void)
 {
-	printk(KERN_INFO "Starting Cobalt Timer system clock\n");
-
-	/* Set the countdown value */
-	co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ);
-
-	/* Start the timer */
-	co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN);
-
-	/* Enable (unmask) the timer interrupt */
-	co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK);
-
-	/* Wire cpu IDT entry to s/w handler (and Cobalt APIC to IDT) */
+	irq0.mask = cpumask_of_cpu(0);
 	setup_irq(0, &irq0);
 }
 
-/* Hook for machine specific memory setup. */
-
-#define MB (1024 * 1024)
-
-unsigned long sgivwfb_mem_phys;
-unsigned long sgivwfb_mem_size;
-EXPORT_SYMBOL(sgivwfb_mem_phys);
-EXPORT_SYMBOL(sgivwfb_mem_size);
-
-long long mem_size __initdata = 0;
-
-char * __init machine_specific_memory_setup(void)
+#ifdef CONFIG_MCA
+/**
+ * mca_nmi_hook - hook into MCA specific NMI chain
+ *
+ * Description:
+ *	The MCA (Microchannel Architecture) has an NMI chain for NMI sources
+ *	along the MCA bus.  Use this to hook into that chain if you will need
+ *	it.
+ **/
+void mca_nmi_hook(void)
 {
-	long long gfx_mem_size = 8 * MB;
+	/* If I recall correctly, there's a whole bunch of other things that
+	 * we can do to check for NMI problems, but that's all I know about
+	 * at the moment.
+	 */
 
-	mem_size = boot_params.alt_mem_k;
+	printk("NMI generated from unknown source!\n");
+}
+#endif
 
-	if (!mem_size) {
-		printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n");
-		mem_size = 128 * MB;
-	}
+static __init int no_ipi_broadcast(char *str)
+{
+	get_option(&str, &no_broadcast);
+	printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
+											"IPI Broadcast");
+	return 1;
+}
 
-	/*
-	 * this hardcodes the graphics memory to 8 MB
-	 * it really should be sized dynamically (or at least
-	 * set as a boot param)
-	 */
-	if (!sgivwfb_mem_size) {
-		printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n");
-		sgivwfb_mem_size = 8 * MB;
-	}
+__setup("no_ipi_broadcast=", no_ipi_broadcast);
 
-	/*
-	 * Trim to nearest MB
-	 */
-	sgivwfb_mem_size &= ~((1 << 20) - 1);
-	sgivwfb_mem_phys = mem_size - gfx_mem_size;
+static int __init print_ipi_mode(void)
+{
+	printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
+											"Shortcut");
+	return 0;
+}
 
-	e820_add_region(0, LOWMEMSIZE(), E820_RAM);
-	e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM);
-	e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED);
+late_initcall(print_ipi_mode);
 
-	return "PROM";
-}
diff --git a/arch/x86/mach-visws/traps.c b/arch/x86/mach-visws/traps.c
index bfac6ba10f8a..dd1f5fa94210 100644
--- a/arch/x86/mach-visws/traps.c
+++ b/arch/x86/mach-visws/traps.c
@@ -25,13 +25,13 @@ static __init void lithium_init(void)
 	if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
 	    (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
 		printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A');
-		panic("This machine is not SGI Visual Workstation 320/540");
+/*		panic("This machine is not SGI Visual Workstation 320/540"); */
 	}
 
 	if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
 	    (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
 		printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B');
-		panic("This machine is not SGI Visual Workstation 320/540");
+/*		panic("This machine is not SGI Visual Workstation 320/540"); */
 	}
 
 	li_pcia_write16(LI_PCI_INTEN, ALLDEVS);
@@ -62,7 +62,7 @@ static __init void cobalt_init(void)
 		co_apic_read(CO_APIC_ID));
 }
 
-void __init trap_init_hook(void)
+void __init trap_init_hook_dontuse(void)
 {
 	lithium_init();
 	cobalt_init();
diff --git a/arch/x86/mach-visws/visws_apic.c b/arch/x86/mach-visws/visws_apic.c
index d8b2cfd85d92..8b1cd8e01cb4 100644
--- a/arch/x86/mach-visws/visws_apic.c
+++ b/arch/x86/mach-visws/visws_apic.c
@@ -25,6 +25,9 @@
 
 #include "cobalt.h"
 
+char visws_board_type = -1;
+char visws_board_rev = -1;
+
 static DEFINE_SPINLOCK(cobalt_lock);
 
 /*
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 99d9f095e4d4..fa0164d80bbd 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -11,7 +11,7 @@ pci-y				+= legacy.o irq.o
 
 # Careful: VISWS overrule the pci-y above. The colons are
 # therefor correct. This needs a proper fix by distangling the code.
-pci-$(CONFIG_X86_VISWS)		:= visws.o fixup.o
+#pci-$(CONFIG_X86_VISWS)		:= visws.o irq.o fixup.o
 
 pci-$(CONFIG_X86_NUMAQ)		+= numa.o
 
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c
index 16e52063ecb3..343ccf668d14 100644
--- a/arch/x86/pci/visws.c
+++ b/arch/x86/pci/visws.c
@@ -16,10 +16,10 @@
 static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; }
 static void pci_visws_disable_irq(struct pci_dev *dev) { }
 
-int (*pcibios_enable_irq)(struct pci_dev *dev) = &pci_visws_enable_irq;
-void (*pcibios_disable_irq)(struct pci_dev *dev) = &pci_visws_disable_irq;
+/* int (*pcibios_enable_irq)(struct pci_dev *dev) = &pci_visws_enable_irq; */
+/* void (*pcibios_disable_irq)(struct pci_dev *dev) = &pci_visws_disable_irq; */
 
-void __init pcibios_penalize_isa_irq(int irq, int active) {}
+/* void __init pcibios_penalize_isa_irq(int irq, int active) {} */
 
 
 unsigned int pci_bus0, pci_bus1;
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 4d1ce2e7361e..3452cf59eed0 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -37,7 +37,7 @@ obj-$(CONFIG_SUPERH) += setup-bus.o setup-irq.o
 obj-$(CONFIG_PPC32) += setup-irq.o
 obj-$(CONFIG_PPC) += setup-bus.o
 obj-$(CONFIG_MIPS) += setup-bus.o setup-irq.o
-obj-$(CONFIG_X86_VISWS) += setup-irq.o
+#obj-$(CONFIG_X86_VISWS) += setup-irq.o
 obj-$(CONFIG_MN10300) += setup-bus.o
 
 #
-- 
cgit v1.2.3


From 652536367b727251bfeba72189a17a040accbc2d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 10 Jul 2008 15:50:37 +0200
Subject: x86, VisWS: turn into generic arch, copy visws files

copy arch/x86/mach-visws/setup_visws.c, apic_visws.c and traps_visws.c
files to arch/x86/kernel/, in preparation of the switchover to a
non-subarch setup for VISWS.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic_visws.c  | 295 +++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/setup_visws.c | 331 ++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/traps_visws.c |  71 +++++++++
 3 files changed, 697 insertions(+)
 create mode 100644 arch/x86/kernel/apic_visws.c
 create mode 100644 arch/x86/kernel/setup_visws.c
 create mode 100644 arch/x86/kernel/traps_visws.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic_visws.c b/arch/x86/kernel/apic_visws.c
new file mode 100644
index 000000000000..6c02c8d09932
--- /dev/null
+++ b/arch/x86/kernel/apic_visws.c
@@ -0,0 +1,295 @@
+/*
+ *	Copyright (C) 1999 Bent Hagemark, Ingo Molnar
+ *
+ *  SGI Visual Workstation interrupt controller
+ *
+ *  The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC
+ *  which serves as the main interrupt controller in the system.  Non-legacy
+ *  hardware in the system uses this controller directly.  Legacy devices
+ *  are connected to the PIIX4 which in turn has its 8259(s) connected to
+ *  a of the Cobalt APIC entry.
+ *
+ *  09/02/2000 - Updated for 2.4 by jbarnes@sgi.com
+ *
+ *  25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru>
+ */
+
+#include <linux/kernel_stat.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/i8259.h>
+#include <asm/irq_vectors.h>
+#include <asm/visws/cobalt.h>
+
+static DEFINE_SPINLOCK(cobalt_lock);
+
+/*
+ * Set the given Cobalt APIC Redirection Table entry to point
+ * to the given IDT vector/index.
+ */
+static inline void co_apic_set(int entry, int irq)
+{
+	co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR));
+	co_apic_write(CO_APIC_HI(entry), 0);
+}
+
+/*
+ * Cobalt (IO)-APIC functions to handle PCI devices.
+ */
+static inline int co_apic_ide0_hack(void)
+{
+	extern char visws_board_type;
+	extern char visws_board_rev;
+
+	if (visws_board_type == VISWS_320 && visws_board_rev == 5)
+		return 5;
+	return CO_APIC_IDE0;
+}
+
+static int is_co_apic(unsigned int irq)
+{
+	if (IS_CO_APIC(irq))
+		return CO_APIC(irq);
+
+	switch (irq) {
+		case 0: return CO_APIC_CPU;
+		case CO_IRQ_IDE0: return co_apic_ide0_hack();
+		case CO_IRQ_IDE1: return CO_APIC_IDE1;
+		default: return -1;
+	}
+}
+
+
+/*
+ * This is the SGI Cobalt (IO-)APIC:
+ */
+
+static void enable_cobalt_irq(unsigned int irq)
+{
+	co_apic_set(is_co_apic(irq), irq);
+}
+
+static void disable_cobalt_irq(unsigned int irq)
+{
+	int entry = is_co_apic(irq);
+
+	co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK);
+	co_apic_read(CO_APIC_LO(entry));
+}
+
+/*
+ * "irq" really just serves to identify the device.  Here is where we
+ * map this to the Cobalt APIC entry where it's physically wired.
+ * This is called via request_irq -> setup_irq -> irq_desc->startup()
+ */
+static unsigned int startup_cobalt_irq(unsigned int irq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cobalt_lock, flags);
+	if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
+		irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
+	enable_cobalt_irq(irq);
+	spin_unlock_irqrestore(&cobalt_lock, flags);
+	return 0;
+}
+
+static void ack_cobalt_irq(unsigned int irq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cobalt_lock, flags);
+	disable_cobalt_irq(irq);
+	apic_write(APIC_EOI, APIC_EIO_ACK);
+	spin_unlock_irqrestore(&cobalt_lock, flags);
+}
+
+static void end_cobalt_irq(unsigned int irq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cobalt_lock, flags);
+	if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS)))
+		enable_cobalt_irq(irq);
+	spin_unlock_irqrestore(&cobalt_lock, flags);
+}
+
+static struct irq_chip cobalt_irq_type = {
+	.typename =	"Cobalt-APIC",
+	.startup =	startup_cobalt_irq,
+	.shutdown =	disable_cobalt_irq,
+	.enable =	enable_cobalt_irq,
+	.disable =	disable_cobalt_irq,
+	.ack =		ack_cobalt_irq,
+	.end =		end_cobalt_irq,
+};
+
+
+/*
+ * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt
+ * -- not the manner expected by the code in i8259.c.
+ *
+ * there is a 'master' physical interrupt source that gets sent to
+ * the CPU. But in the chipset there are various 'virtual' interrupts
+ * waiting to be handled. We represent this to Linux through a 'master'
+ * interrupt controller type, and through a special virtual interrupt-
+ * controller. Device drivers only see the virtual interrupt sources.
+ */
+static unsigned int startup_piix4_master_irq(unsigned int irq)
+{
+	init_8259A(0);
+
+	return startup_cobalt_irq(irq);
+}
+
+static void end_piix4_master_irq(unsigned int irq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cobalt_lock, flags);
+	enable_cobalt_irq(irq);
+	spin_unlock_irqrestore(&cobalt_lock, flags);
+}
+
+static struct irq_chip piix4_master_irq_type = {
+	.typename =	"PIIX4-master",
+	.startup =	startup_piix4_master_irq,
+	.ack =		ack_cobalt_irq,
+	.end =		end_piix4_master_irq,
+};
+
+
+static struct irq_chip piix4_virtual_irq_type = {
+	.typename =	"PIIX4-virtual",
+	.shutdown =	disable_8259A_irq,
+	.enable =	enable_8259A_irq,
+	.disable =	disable_8259A_irq,
+};
+
+
+/*
+ * PIIX4-8259 master/virtual functions to handle interrupt requests
+ * from legacy devices: floppy, parallel, serial, rtc.
+ *
+ * None of these get Cobalt APIC entries, neither do they have IDT
+ * entries. These interrupts are purely virtual and distributed from
+ * the 'master' interrupt source: CO_IRQ_8259.
+ *
+ * When the 8259 interrupts its handler figures out which of these
+ * devices is interrupting and dispatches to its handler.
+ *
+ * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/
+ * enable_irq gets the right irq. This 'master' irq is never directly
+ * manipulated by any driver.
+ */
+static irqreturn_t piix4_master_intr(int irq, void *dev_id)
+{
+	int realirq;
+	irq_desc_t *desc;
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+
+	/* Find out what's interrupting in the PIIX4 master 8259 */
+	outb(0x0c, 0x20);		/* OCW3 Poll command */
+	realirq = inb(0x20);
+
+	/*
+	 * Bit 7 == 0 means invalid/spurious
+	 */
+	if (unlikely(!(realirq & 0x80)))
+		goto out_unlock;
+
+	realirq &= 7;
+
+	if (unlikely(realirq == 2)) {
+		outb(0x0c, 0xa0);
+		realirq = inb(0xa0);
+
+		if (unlikely(!(realirq & 0x80)))
+			goto out_unlock;
+
+		realirq = (realirq & 7) + 8;
+	}
+
+	/* mask and ack interrupt */
+	cached_irq_mask |= 1 << realirq;
+	if (unlikely(realirq > 7)) {
+		inb(0xa1);
+		outb(cached_slave_mask, 0xa1);
+		outb(0x60 + (realirq & 7), 0xa0);
+		outb(0x60 + 2, 0x20);
+	} else {
+		inb(0x21);
+		outb(cached_master_mask, 0x21);
+		outb(0x60 + realirq, 0x20);
+	}
+
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+
+	desc = irq_desc + realirq;
+
+	/*
+	 * handle this 'virtual interrupt' as a Cobalt one now.
+	 */
+	kstat_cpu(smp_processor_id()).irqs[realirq]++;
+
+	if (likely(desc->action != NULL))
+		handle_IRQ_event(realirq, desc->action);
+
+	if (!(desc->status & IRQ_DISABLED))
+		enable_8259A_irq(realirq);
+
+	return IRQ_HANDLED;
+
+out_unlock:
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+	return IRQ_NONE;
+}
+
+static struct irqaction master_action = {
+	.handler =	piix4_master_intr,
+	.name =		"PIIX4-8259",
+};
+
+static struct irqaction cascade_action = {
+	.handler = 	no_action,
+	.name =		"cascade",
+};
+
+
+void init_VISWS_APIC_irqs(void)
+{
+	int i;
+
+	for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
+		irq_desc[i].status = IRQ_DISABLED;
+		irq_desc[i].action = 0;
+		irq_desc[i].depth = 1;
+
+		if (i == 0) {
+			irq_desc[i].chip = &cobalt_irq_type;
+		}
+		else if (i == CO_IRQ_IDE0) {
+			irq_desc[i].chip = &cobalt_irq_type;
+		}
+		else if (i == CO_IRQ_IDE1) {
+			irq_desc[i].chip = &cobalt_irq_type;
+		}
+		else if (i == CO_IRQ_8259) {
+			irq_desc[i].chip = &piix4_master_irq_type;
+		}
+		else if (i < CO_IRQ_APIC0) {
+			irq_desc[i].chip = &piix4_virtual_irq_type;
+		}
+		else if (IS_CO_APIC(i)) {
+			irq_desc[i].chip = &cobalt_irq_type;
+		}
+	}
+
+	setup_irq(CO_IRQ_8259, &master_action);
+	setup_irq(2, &cascade_action);
+}
diff --git a/arch/x86/kernel/setup_visws.c b/arch/x86/kernel/setup_visws.c
new file mode 100644
index 000000000000..e95e9499c8cd
--- /dev/null
+++ b/arch/x86/kernel/setup_visws.c
@@ -0,0 +1,331 @@
+/*
+ *  Unmaintained SGI Visual Workstation support.
+ *  Split out from setup.c by davej@suse.de
+ */
+
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/visws/cobalt.h>
+#include <asm/visws/piix4.h>
+#include <asm/arch_hooks.h>
+#include <asm/fixmap.h>
+#include <asm/reboot.h>
+#include <asm/setup.h>
+#include <asm/e820.h>
+#include <asm/smp.h>
+#include <asm/io.h>
+
+#include <mach_ipi.h>
+
+#include "mach_apic.h"
+
+#include <linux/init.h>
+#include <linux/smp.h>
+
+char visws_board_type	= -1;
+char visws_board_rev	= -1;
+
+int is_visws_box(void)
+{
+	return visws_board_type >= 0;
+}
+
+static int __init visws_time_init_quirk(void)
+{
+	printk(KERN_INFO "Starting Cobalt Timer system clock\n");
+
+	/* Set the countdown value */
+	co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ);
+
+	/* Start the timer */
+	co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN);
+
+	/* Enable (unmask) the timer interrupt */
+	co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK);
+
+	/*
+	 * Zero return means the generic timer setup code will set up
+	 * the standard vector:
+	 */
+	return 0;
+}
+
+static int __init visws_pre_intr_init_quirk(void)
+{
+	init_VISWS_APIC_irqs();
+
+	/*
+	 * We dont want ISA irqs to be set up by the generic code:
+	 */
+	return 1;
+}
+
+/* Quirk for machine specific memory setup. */
+
+#define MB (1024 * 1024)
+
+unsigned long sgivwfb_mem_phys;
+unsigned long sgivwfb_mem_size;
+EXPORT_SYMBOL(sgivwfb_mem_phys);
+EXPORT_SYMBOL(sgivwfb_mem_size);
+
+long long mem_size __initdata = 0;
+
+static char * __init visws_memory_setup_quirk(void)
+{
+	long long gfx_mem_size = 8 * MB;
+
+	mem_size = boot_params.alt_mem_k;
+
+	if (!mem_size) {
+		printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n");
+		mem_size = 128 * MB;
+	}
+
+	/*
+	 * this hardcodes the graphics memory to 8 MB
+	 * it really should be sized dynamically (or at least
+	 * set as a boot param)
+	 */
+	if (!sgivwfb_mem_size) {
+		printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n");
+		sgivwfb_mem_size = 8 * MB;
+	}
+
+	/*
+	 * Trim to nearest MB
+	 */
+	sgivwfb_mem_size &= ~((1 << 20) - 1);
+	sgivwfb_mem_phys = mem_size - gfx_mem_size;
+
+	e820_add_region(0, LOWMEMSIZE(), E820_RAM);
+	e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM);
+	e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED);
+
+	return "PROM";
+}
+
+static void visws_machine_emergency_restart(void)
+{
+	/*
+	 * Visual Workstations restart after this
+	 * register is poked on the PIIX4
+	 */
+	outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT);
+}
+
+static void visws_machine_power_off(void)
+{
+	unsigned short pm_status;
+/*	extern unsigned int pci_bus0; */
+
+	while ((pm_status = inw(PMSTS_PORT)) & 0x100)
+		outw(pm_status, PMSTS_PORT);
+
+	outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT);
+
+	mdelay(10);
+
+#define PCI_CONF1_ADDRESS(bus, devfn, reg) \
+	(0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3))
+
+/*	outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8); */
+	outl(PIIX_SPECIAL_STOP, 0xCFC);
+}
+
+static int __init visws_get_smp_config_quirk(unsigned int early)
+{
+	/*
+	 * Prevent MP-table parsing by the generic code:
+	 */
+	return 1;
+}
+
+extern unsigned int __cpuinitdata maxcpus;
+
+/*
+ * The Visual Workstation is Intel MP compliant in the hardware
+ * sense, but it doesn't have a BIOS(-configuration table).
+ * No problem for Linux.
+ */
+
+static void __init MP_processor_info (struct mpc_config_processor *m)
+{
+	int ver, logical_apicid;
+	physid_mask_t apic_cpus;
+
+	if (!(m->mpc_cpuflag & CPU_ENABLED))
+		return;
+
+	logical_apicid = m->mpc_apicid;
+	printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n",
+	       m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
+	       m->mpc_apicid,
+	       (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
+	       (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
+	       m->mpc_apicver);
+
+	if (m->mpc_cpuflag & CPU_BOOTPROCESSOR)
+		boot_cpu_physical_apicid = m->mpc_apicid;
+
+	ver = m->mpc_apicver;
+	if ((ver >= 0x14 && m->mpc_apicid >= 0xff) || m->mpc_apicid >= 0xf) {
+		printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
+			m->mpc_apicid, MAX_APICS);
+		return;
+	}
+
+	apic_cpus = apicid_to_cpu_present(m->mpc_apicid);
+	physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
+	/*
+	 * Validate version
+	 */
+	if (ver == 0x0) {
+		printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! "
+			"fixing up to 0x10. (tell your hw vendor)\n",
+			m->mpc_apicid);
+		ver = 0x10;
+	}
+	apic_version[m->mpc_apicid] = ver;
+}
+
+int __init visws_find_smp_config_quirk(unsigned int reserve)
+{
+	struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
+	unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
+
+	if (ncpus > CO_CPU_MAX) {
+		printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n",
+			ncpus, mp);
+
+		ncpus = CO_CPU_MAX;
+	}
+
+	if (ncpus > maxcpus)
+		ncpus = maxcpus;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	smp_found_config = 1;
+#endif
+	while (ncpus--)
+		MP_processor_info(mp++);
+
+	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+	return 1;
+}
+
+extern int visws_trap_init_quirk(void);
+
+void __init visws_early_detect(void)
+{
+	int raw;
+
+	visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG)
+							 >> PIIX_GPI_BD_SHIFT;
+
+	if (visws_board_type < 0)
+		return;
+
+	/*
+	 * Install special quirks for timer, interrupt and memory setup:
+	 */
+	arch_time_init_quirk		= visws_time_init_quirk;
+	arch_pre_intr_init_quirk	= visws_pre_intr_init_quirk;
+	arch_memory_setup_quirk		= visws_memory_setup_quirk;
+
+	/*
+	 * Fall back to generic behavior for traps:
+	 */
+	arch_intr_init_quirk		= NULL;
+	arch_trap_init_quirk		= visws_trap_init_quirk;
+
+	/*
+	 * Install reboot quirks:
+	 */
+	pm_power_off			= visws_machine_power_off;
+	machine_ops.emergency_restart	= visws_machine_emergency_restart;
+
+	/*
+	 * Do not use broadcast IPIs:
+	 */
+	no_broadcast = 0;
+
+	/*
+	 * Override generic MP-table parsing:
+	 */
+	mach_get_smp_config_quirk	= visws_get_smp_config_quirk;
+	mach_find_smp_config_quirk	= visws_find_smp_config_quirk;
+
+	/*
+	 * Get Board rev.
+	 * First, we have to initialize the 307 part to allow us access
+	 * to the GPIO registers.  Let's map them at 0x0fc0 which is right
+	 * after the PIIX4 PM section.
+	 */
+	outb_p(SIO_DEV_SEL, SIO_INDEX);
+	outb_p(SIO_GP_DEV, SIO_DATA);	/* Talk to GPIO regs. */
+
+	outb_p(SIO_DEV_MSB, SIO_INDEX);
+	outb_p(SIO_GP_MSB, SIO_DATA);	/* MSB of GPIO base address */
+
+	outb_p(SIO_DEV_LSB, SIO_INDEX);
+	outb_p(SIO_GP_LSB, SIO_DATA);	/* LSB of GPIO base address */
+
+	outb_p(SIO_DEV_ENB, SIO_INDEX);
+	outb_p(1, SIO_DATA);		/* Enable GPIO registers. */
+
+	/*
+	 * Now, we have to map the power management section to write
+	 * a bit which enables access to the GPIO registers.
+	 * What lunatic came up with this shit?
+	 */
+	outb_p(SIO_DEV_SEL, SIO_INDEX);
+	outb_p(SIO_PM_DEV, SIO_DATA);	/* Talk to GPIO regs. */
+
+	outb_p(SIO_DEV_MSB, SIO_INDEX);
+	outb_p(SIO_PM_MSB, SIO_DATA);	/* MSB of PM base address */
+
+	outb_p(SIO_DEV_LSB, SIO_INDEX);
+	outb_p(SIO_PM_LSB, SIO_DATA);	/* LSB of PM base address */
+
+	outb_p(SIO_DEV_ENB, SIO_INDEX);
+	outb_p(1, SIO_DATA);		/* Enable PM registers. */
+
+	/*
+	 * Now, write the PM register which enables the GPIO registers.
+	 */
+	outb_p(SIO_PM_FER2, SIO_PM_INDEX);
+	outb_p(SIO_PM_GP_EN, SIO_PM_DATA);
+
+	/*
+	 * Now, initialize the GPIO registers.
+	 * We want them all to be inputs which is the
+	 * power on default, so let's leave them alone.
+	 * So, let's just read the board rev!
+	 */
+	raw = inb_p(SIO_GP_DATA1);
+	raw &= 0x7f;	/* 7 bits of valid board revision ID. */
+
+	if (visws_board_type == VISWS_320) {
+		if (raw < 0x6) {
+			visws_board_rev = 4;
+		} else if (raw < 0xc) {
+			visws_board_rev = 5;
+		} else {
+			visws_board_rev = 6;
+		}
+	} else if (visws_board_type == VISWS_540) {
+			visws_board_rev = 2;
+		} else {
+			visws_board_rev = raw;
+		}
+
+	printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n",
+	       (visws_board_type == VISWS_320 ? "320" :
+	       (visws_board_type == VISWS_540 ? "540" :
+		"unknown")), visws_board_rev);
+}
diff --git a/arch/x86/kernel/traps_visws.c b/arch/x86/kernel/traps_visws.c
new file mode 100644
index 000000000000..e5e6492c2676
--- /dev/null
+++ b/arch/x86/kernel/traps_visws.c
@@ -0,0 +1,71 @@
+/* VISWS traps */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/arch_hooks.h>
+#include <asm/visws/cobalt.h>
+#include <asm/visws/lithium.h>
+
+
+#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4)
+#define BCD (LI_INTB | LI_INTC | LI_INTD)
+#define ALLDEVS (A01234 | BCD)
+
+static __init void lithium_init(void)
+{
+	set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS);
+	set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS);
+
+	if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
+	    (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
+		printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A');
+/*		panic("This machine is not SGI Visual Workstation 320/540"); */
+	}
+
+	if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
+	    (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
+		printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B');
+/*		panic("This machine is not SGI Visual Workstation 320/540"); */
+	}
+
+	li_pcia_write16(LI_PCI_INTEN, ALLDEVS);
+	li_pcib_write16(LI_PCI_INTEN, ALLDEVS);
+}
+
+static __init void cobalt_init(void)
+{
+	/*
+	 * On normal SMP PC this is used only with SMP, but we have to
+	 * use it and set it up here to start the Cobalt clock
+	 */
+	set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
+	setup_local_APIC();
+	printk(KERN_INFO "Local APIC Version %#x, ID %#x\n",
+		(unsigned int)apic_read(APIC_LVR),
+		(unsigned int)apic_read(APIC_ID));
+
+	set_fixmap(FIX_CO_CPU, CO_CPU_PHYS);
+	set_fixmap(FIX_CO_APIC, CO_APIC_PHYS);
+	printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n",
+		co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID));
+
+	/* Enable Cobalt APIC being careful to NOT change the ID! */
+	co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE);
+
+	printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n",
+		co_apic_read(CO_APIC_ID));
+}
+
+int __init visws_trap_init_quirk(void)
+{
+	lithium_init();
+	cobalt_init();
+
+	return 1;
+}
-- 
cgit v1.2.3


From 1b84e1c81f56e13c7d81b47c85eda15d94624e43 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 10 Jul 2008 15:55:27 +0200
Subject: x86, VisWS: turn into generic arch, flip over VISWS to generic arch

this is the big move: flip over VISWS to generic arch support.

From this commit on CONFIG_X86_VISWS is just another (default-disabled)
option that turns on certain quirks - no other complications.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig         | 26 +++++++++++++-------------
 arch/x86/Makefile        |  4 ++--
 arch/x86/kernel/Makefile |  1 +
 3 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c8cb800b4b61..1805b24ba1a2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -232,7 +232,7 @@ config SMP
 
 config X86_FIND_SMP_CONFIG
 	def_bool y
-	depends on X86_MPPARSE || X86_VOYAGER || X86_VISWS
+	depends on X86_MPPARSE || X86_VOYAGER
 
 if ACPI
 config X86_MPPARSE
@@ -281,18 +281,6 @@ config X86_VOYAGER
 	  If you do not specifically know you have a Voyager based machine,
 	  say N here, otherwise the kernel you build will not be bootable.
 
-config X86_VISWS
-	bool "SGI 320/540 (Visual Workstation)"
-	depends on X86_32 && PCI
-	help
-	  The SGI Visual Workstation series is an IA32-based workstation
-	  based on SGI systems chips with some legacy PC hardware attached.
-
-	  Say Y here to create a kernel to run on the SGI 320 or 540.
-
-	  A kernel compiled for the Visual Workstation will not run on PCs
-	  and vice versa. See <file:Documentation/sgi-visws.txt> for details.
-
 config X86_GENERICARCH
        bool "Generic architecture"
 	depends on X86_32
@@ -363,6 +351,18 @@ config X86_VSMP
 
 endchoice
 
+config X86_VISWS
+	bool "SGI 320/540 (Visual Workstation)"
+	depends on X86_32 && PCI
+	help
+	  The SGI Visual Workstation series is an IA32-based workstation
+	  based on SGI systems chips with some legacy PC hardware attached.
+
+	  Say Y here to create a kernel to run on the SGI 320 or 540.
+
+	  A kernel compiled for the Visual Workstation will run on general
+	  PCs as well. See <file:Documentation/sgi-visws.txt> for details.
+
 config SCHED_NO_NO_OMIT_FRAME_POINTER
 	def_bool y
 	prompt "Single-depth WCHAN output"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b03d24b44bf9..ce04b031f68b 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -114,8 +114,8 @@ mflags-$(CONFIG_X86_VOYAGER)	:= -Iinclude/asm-x86/mach-voyager
 mcore-$(CONFIG_X86_VOYAGER)	:= arch/x86/mach-voyager/
 
 # VISWS subarch support
-mflags-$(CONFIG_X86_VISWS)	:= -Iinclude/asm-x86/mach-visws
-mcore-$(CONFIG_X86_VISWS)	:= arch/x86/mach-visws/
+#mflags-$(CONFIG_X86_VISWS)	:= -Iinclude/asm-x86/mach-visws
+#mcore-$(CONFIG_X86_VISWS)	:= arch/x86/mach-visws/
 
 # generic subarchitecture
 mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 59b14c940a28..08b7561cd5a9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -19,6 +19,7 @@ obj-y			:= process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
 obj-y			+= traps_$(BITS).o irq_$(BITS).o
 obj-y			+= time_$(BITS).o ioport.o ldt.o
 obj-y			+= setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
+obj-$(CONFIG_X86_VISWS)	+= setup_visws.o traps_visws.o apic_visws.o
 obj-$(CONFIG_X86_32)	+= probe_roms_32.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
-- 
cgit v1.2.3


From 54ce7f99065cadcbb627f47f2321b76f099fda28 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 10 Jul 2008 16:14:56 +0200
Subject: x86, VisWS: turn into generic arch, IO-APIC setup fix

skip IO-APIC setup on a VISWS if it's enabled.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_visws.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_visws.c b/arch/x86/kernel/setup_visws.c
index e95e9499c8cd..e19b91c47e90 100644
--- a/arch/x86/kernel/setup_visws.c
+++ b/arch/x86/kernel/setup_visws.c
@@ -260,6 +260,13 @@ void __init visws_early_detect(void)
 	mach_get_smp_config_quirk	= visws_get_smp_config_quirk;
 	mach_find_smp_config_quirk	= visws_find_smp_config_quirk;
 
+#ifdef CONFIG_X86_IO_APIC
+	/*
+	 * Turn off IO-APIC detection and initialization:
+	 */
+	skip_ioapic_setup		= 1;
+#endif
+
 	/*
 	 * Get Board rev.
 	 * First, we have to initialize the 307 part to allow us access
-- 
cgit v1.2.3


From 0cecf92db84694cfed08329271e8ae6316e811eb Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 10 Jul 2008 16:20:32 +0200
Subject: x86, VisWS: turn into generic arch, clean up

rename setup_visws.c to visws_quirks.c.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/Makefile       |   2 +-
 arch/x86/kernel/setup_visws.c  | 338 -----------------------------------------
 arch/x86/kernel/visws_quirks.c | 338 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 339 insertions(+), 339 deletions(-)
 delete mode 100644 arch/x86/kernel/setup_visws.c
 create mode 100644 arch/x86/kernel/visws_quirks.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 08b7561cd5a9..2ef0fbd3488f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -19,7 +19,7 @@ obj-y			:= process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
 obj-y			+= traps_$(BITS).o irq_$(BITS).o
 obj-y			+= time_$(BITS).o ioport.o ldt.o
 obj-y			+= setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
-obj-$(CONFIG_X86_VISWS)	+= setup_visws.o traps_visws.o apic_visws.o
+obj-$(CONFIG_X86_VISWS)	+= visws_quirks.o traps_visws.o apic_visws.o
 obj-$(CONFIG_X86_32)	+= probe_roms_32.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
diff --git a/arch/x86/kernel/setup_visws.c b/arch/x86/kernel/setup_visws.c
deleted file mode 100644
index e19b91c47e90..000000000000
--- a/arch/x86/kernel/setup_visws.c
+++ /dev/null
@@ -1,338 +0,0 @@
-/*
- *  Unmaintained SGI Visual Workstation support.
- *  Split out from setup.c by davej@suse.de
- */
-
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/visws/cobalt.h>
-#include <asm/visws/piix4.h>
-#include <asm/arch_hooks.h>
-#include <asm/fixmap.h>
-#include <asm/reboot.h>
-#include <asm/setup.h>
-#include <asm/e820.h>
-#include <asm/smp.h>
-#include <asm/io.h>
-
-#include <mach_ipi.h>
-
-#include "mach_apic.h"
-
-#include <linux/init.h>
-#include <linux/smp.h>
-
-char visws_board_type	= -1;
-char visws_board_rev	= -1;
-
-int is_visws_box(void)
-{
-	return visws_board_type >= 0;
-}
-
-static int __init visws_time_init_quirk(void)
-{
-	printk(KERN_INFO "Starting Cobalt Timer system clock\n");
-
-	/* Set the countdown value */
-	co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ);
-
-	/* Start the timer */
-	co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN);
-
-	/* Enable (unmask) the timer interrupt */
-	co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK);
-
-	/*
-	 * Zero return means the generic timer setup code will set up
-	 * the standard vector:
-	 */
-	return 0;
-}
-
-static int __init visws_pre_intr_init_quirk(void)
-{
-	init_VISWS_APIC_irqs();
-
-	/*
-	 * We dont want ISA irqs to be set up by the generic code:
-	 */
-	return 1;
-}
-
-/* Quirk for machine specific memory setup. */
-
-#define MB (1024 * 1024)
-
-unsigned long sgivwfb_mem_phys;
-unsigned long sgivwfb_mem_size;
-EXPORT_SYMBOL(sgivwfb_mem_phys);
-EXPORT_SYMBOL(sgivwfb_mem_size);
-
-long long mem_size __initdata = 0;
-
-static char * __init visws_memory_setup_quirk(void)
-{
-	long long gfx_mem_size = 8 * MB;
-
-	mem_size = boot_params.alt_mem_k;
-
-	if (!mem_size) {
-		printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n");
-		mem_size = 128 * MB;
-	}
-
-	/*
-	 * this hardcodes the graphics memory to 8 MB
-	 * it really should be sized dynamically (or at least
-	 * set as a boot param)
-	 */
-	if (!sgivwfb_mem_size) {
-		printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n");
-		sgivwfb_mem_size = 8 * MB;
-	}
-
-	/*
-	 * Trim to nearest MB
-	 */
-	sgivwfb_mem_size &= ~((1 << 20) - 1);
-	sgivwfb_mem_phys = mem_size - gfx_mem_size;
-
-	e820_add_region(0, LOWMEMSIZE(), E820_RAM);
-	e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM);
-	e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED);
-
-	return "PROM";
-}
-
-static void visws_machine_emergency_restart(void)
-{
-	/*
-	 * Visual Workstations restart after this
-	 * register is poked on the PIIX4
-	 */
-	outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT);
-}
-
-static void visws_machine_power_off(void)
-{
-	unsigned short pm_status;
-/*	extern unsigned int pci_bus0; */
-
-	while ((pm_status = inw(PMSTS_PORT)) & 0x100)
-		outw(pm_status, PMSTS_PORT);
-
-	outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT);
-
-	mdelay(10);
-
-#define PCI_CONF1_ADDRESS(bus, devfn, reg) \
-	(0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3))
-
-/*	outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8); */
-	outl(PIIX_SPECIAL_STOP, 0xCFC);
-}
-
-static int __init visws_get_smp_config_quirk(unsigned int early)
-{
-	/*
-	 * Prevent MP-table parsing by the generic code:
-	 */
-	return 1;
-}
-
-extern unsigned int __cpuinitdata maxcpus;
-
-/*
- * The Visual Workstation is Intel MP compliant in the hardware
- * sense, but it doesn't have a BIOS(-configuration table).
- * No problem for Linux.
- */
-
-static void __init MP_processor_info (struct mpc_config_processor *m)
-{
-	int ver, logical_apicid;
-	physid_mask_t apic_cpus;
-
-	if (!(m->mpc_cpuflag & CPU_ENABLED))
-		return;
-
-	logical_apicid = m->mpc_apicid;
-	printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n",
-	       m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
-	       m->mpc_apicid,
-	       (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
-	       (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
-	       m->mpc_apicver);
-
-	if (m->mpc_cpuflag & CPU_BOOTPROCESSOR)
-		boot_cpu_physical_apicid = m->mpc_apicid;
-
-	ver = m->mpc_apicver;
-	if ((ver >= 0x14 && m->mpc_apicid >= 0xff) || m->mpc_apicid >= 0xf) {
-		printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
-			m->mpc_apicid, MAX_APICS);
-		return;
-	}
-
-	apic_cpus = apicid_to_cpu_present(m->mpc_apicid);
-	physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
-	/*
-	 * Validate version
-	 */
-	if (ver == 0x0) {
-		printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! "
-			"fixing up to 0x10. (tell your hw vendor)\n",
-			m->mpc_apicid);
-		ver = 0x10;
-	}
-	apic_version[m->mpc_apicid] = ver;
-}
-
-int __init visws_find_smp_config_quirk(unsigned int reserve)
-{
-	struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
-	unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
-
-	if (ncpus > CO_CPU_MAX) {
-		printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n",
-			ncpus, mp);
-
-		ncpus = CO_CPU_MAX;
-	}
-
-	if (ncpus > maxcpus)
-		ncpus = maxcpus;
-
-#ifdef CONFIG_X86_LOCAL_APIC
-	smp_found_config = 1;
-#endif
-	while (ncpus--)
-		MP_processor_info(mp++);
-
-	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
-	return 1;
-}
-
-extern int visws_trap_init_quirk(void);
-
-void __init visws_early_detect(void)
-{
-	int raw;
-
-	visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG)
-							 >> PIIX_GPI_BD_SHIFT;
-
-	if (visws_board_type < 0)
-		return;
-
-	/*
-	 * Install special quirks for timer, interrupt and memory setup:
-	 */
-	arch_time_init_quirk		= visws_time_init_quirk;
-	arch_pre_intr_init_quirk	= visws_pre_intr_init_quirk;
-	arch_memory_setup_quirk		= visws_memory_setup_quirk;
-
-	/*
-	 * Fall back to generic behavior for traps:
-	 */
-	arch_intr_init_quirk		= NULL;
-	arch_trap_init_quirk		= visws_trap_init_quirk;
-
-	/*
-	 * Install reboot quirks:
-	 */
-	pm_power_off			= visws_machine_power_off;
-	machine_ops.emergency_restart	= visws_machine_emergency_restart;
-
-	/*
-	 * Do not use broadcast IPIs:
-	 */
-	no_broadcast = 0;
-
-	/*
-	 * Override generic MP-table parsing:
-	 */
-	mach_get_smp_config_quirk	= visws_get_smp_config_quirk;
-	mach_find_smp_config_quirk	= visws_find_smp_config_quirk;
-
-#ifdef CONFIG_X86_IO_APIC
-	/*
-	 * Turn off IO-APIC detection and initialization:
-	 */
-	skip_ioapic_setup		= 1;
-#endif
-
-	/*
-	 * Get Board rev.
-	 * First, we have to initialize the 307 part to allow us access
-	 * to the GPIO registers.  Let's map them at 0x0fc0 which is right
-	 * after the PIIX4 PM section.
-	 */
-	outb_p(SIO_DEV_SEL, SIO_INDEX);
-	outb_p(SIO_GP_DEV, SIO_DATA);	/* Talk to GPIO regs. */
-
-	outb_p(SIO_DEV_MSB, SIO_INDEX);
-	outb_p(SIO_GP_MSB, SIO_DATA);	/* MSB of GPIO base address */
-
-	outb_p(SIO_DEV_LSB, SIO_INDEX);
-	outb_p(SIO_GP_LSB, SIO_DATA);	/* LSB of GPIO base address */
-
-	outb_p(SIO_DEV_ENB, SIO_INDEX);
-	outb_p(1, SIO_DATA);		/* Enable GPIO registers. */
-
-	/*
-	 * Now, we have to map the power management section to write
-	 * a bit which enables access to the GPIO registers.
-	 * What lunatic came up with this shit?
-	 */
-	outb_p(SIO_DEV_SEL, SIO_INDEX);
-	outb_p(SIO_PM_DEV, SIO_DATA);	/* Talk to GPIO regs. */
-
-	outb_p(SIO_DEV_MSB, SIO_INDEX);
-	outb_p(SIO_PM_MSB, SIO_DATA);	/* MSB of PM base address */
-
-	outb_p(SIO_DEV_LSB, SIO_INDEX);
-	outb_p(SIO_PM_LSB, SIO_DATA);	/* LSB of PM base address */
-
-	outb_p(SIO_DEV_ENB, SIO_INDEX);
-	outb_p(1, SIO_DATA);		/* Enable PM registers. */
-
-	/*
-	 * Now, write the PM register which enables the GPIO registers.
-	 */
-	outb_p(SIO_PM_FER2, SIO_PM_INDEX);
-	outb_p(SIO_PM_GP_EN, SIO_PM_DATA);
-
-	/*
-	 * Now, initialize the GPIO registers.
-	 * We want them all to be inputs which is the
-	 * power on default, so let's leave them alone.
-	 * So, let's just read the board rev!
-	 */
-	raw = inb_p(SIO_GP_DATA1);
-	raw &= 0x7f;	/* 7 bits of valid board revision ID. */
-
-	if (visws_board_type == VISWS_320) {
-		if (raw < 0x6) {
-			visws_board_rev = 4;
-		} else if (raw < 0xc) {
-			visws_board_rev = 5;
-		} else {
-			visws_board_rev = 6;
-		}
-	} else if (visws_board_type == VISWS_540) {
-			visws_board_rev = 2;
-		} else {
-			visws_board_rev = raw;
-		}
-
-	printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n",
-	       (visws_board_type == VISWS_320 ? "320" :
-	       (visws_board_type == VISWS_540 ? "540" :
-		"unknown")), visws_board_rev);
-}
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
new file mode 100644
index 000000000000..e19b91c47e90
--- /dev/null
+++ b/arch/x86/kernel/visws_quirks.c
@@ -0,0 +1,338 @@
+/*
+ *  Unmaintained SGI Visual Workstation support.
+ *  Split out from setup.c by davej@suse.de
+ */
+
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/visws/cobalt.h>
+#include <asm/visws/piix4.h>
+#include <asm/arch_hooks.h>
+#include <asm/fixmap.h>
+#include <asm/reboot.h>
+#include <asm/setup.h>
+#include <asm/e820.h>
+#include <asm/smp.h>
+#include <asm/io.h>
+
+#include <mach_ipi.h>
+
+#include "mach_apic.h"
+
+#include <linux/init.h>
+#include <linux/smp.h>
+
+char visws_board_type	= -1;
+char visws_board_rev	= -1;
+
+int is_visws_box(void)
+{
+	return visws_board_type >= 0;
+}
+
+static int __init visws_time_init_quirk(void)
+{
+	printk(KERN_INFO "Starting Cobalt Timer system clock\n");
+
+	/* Set the countdown value */
+	co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ);
+
+	/* Start the timer */
+	co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN);
+
+	/* Enable (unmask) the timer interrupt */
+	co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK);
+
+	/*
+	 * Zero return means the generic timer setup code will set up
+	 * the standard vector:
+	 */
+	return 0;
+}
+
+static int __init visws_pre_intr_init_quirk(void)
+{
+	init_VISWS_APIC_irqs();
+
+	/*
+	 * We dont want ISA irqs to be set up by the generic code:
+	 */
+	return 1;
+}
+
+/* Quirk for machine specific memory setup. */
+
+#define MB (1024 * 1024)
+
+unsigned long sgivwfb_mem_phys;
+unsigned long sgivwfb_mem_size;
+EXPORT_SYMBOL(sgivwfb_mem_phys);
+EXPORT_SYMBOL(sgivwfb_mem_size);
+
+long long mem_size __initdata = 0;
+
+static char * __init visws_memory_setup_quirk(void)
+{
+	long long gfx_mem_size = 8 * MB;
+
+	mem_size = boot_params.alt_mem_k;
+
+	if (!mem_size) {
+		printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n");
+		mem_size = 128 * MB;
+	}
+
+	/*
+	 * this hardcodes the graphics memory to 8 MB
+	 * it really should be sized dynamically (or at least
+	 * set as a boot param)
+	 */
+	if (!sgivwfb_mem_size) {
+		printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n");
+		sgivwfb_mem_size = 8 * MB;
+	}
+
+	/*
+	 * Trim to nearest MB
+	 */
+	sgivwfb_mem_size &= ~((1 << 20) - 1);
+	sgivwfb_mem_phys = mem_size - gfx_mem_size;
+
+	e820_add_region(0, LOWMEMSIZE(), E820_RAM);
+	e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM);
+	e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED);
+
+	return "PROM";
+}
+
+static void visws_machine_emergency_restart(void)
+{
+	/*
+	 * Visual Workstations restart after this
+	 * register is poked on the PIIX4
+	 */
+	outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT);
+}
+
+static void visws_machine_power_off(void)
+{
+	unsigned short pm_status;
+/*	extern unsigned int pci_bus0; */
+
+	while ((pm_status = inw(PMSTS_PORT)) & 0x100)
+		outw(pm_status, PMSTS_PORT);
+
+	outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT);
+
+	mdelay(10);
+
+#define PCI_CONF1_ADDRESS(bus, devfn, reg) \
+	(0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3))
+
+/*	outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8); */
+	outl(PIIX_SPECIAL_STOP, 0xCFC);
+}
+
+static int __init visws_get_smp_config_quirk(unsigned int early)
+{
+	/*
+	 * Prevent MP-table parsing by the generic code:
+	 */
+	return 1;
+}
+
+extern unsigned int __cpuinitdata maxcpus;
+
+/*
+ * The Visual Workstation is Intel MP compliant in the hardware
+ * sense, but it doesn't have a BIOS(-configuration table).
+ * No problem for Linux.
+ */
+
+static void __init MP_processor_info (struct mpc_config_processor *m)
+{
+	int ver, logical_apicid;
+	physid_mask_t apic_cpus;
+
+	if (!(m->mpc_cpuflag & CPU_ENABLED))
+		return;
+
+	logical_apicid = m->mpc_apicid;
+	printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n",
+	       m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
+	       m->mpc_apicid,
+	       (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
+	       (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
+	       m->mpc_apicver);
+
+	if (m->mpc_cpuflag & CPU_BOOTPROCESSOR)
+		boot_cpu_physical_apicid = m->mpc_apicid;
+
+	ver = m->mpc_apicver;
+	if ((ver >= 0x14 && m->mpc_apicid >= 0xff) || m->mpc_apicid >= 0xf) {
+		printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
+			m->mpc_apicid, MAX_APICS);
+		return;
+	}
+
+	apic_cpus = apicid_to_cpu_present(m->mpc_apicid);
+	physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
+	/*
+	 * Validate version
+	 */
+	if (ver == 0x0) {
+		printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! "
+			"fixing up to 0x10. (tell your hw vendor)\n",
+			m->mpc_apicid);
+		ver = 0x10;
+	}
+	apic_version[m->mpc_apicid] = ver;
+}
+
+int __init visws_find_smp_config_quirk(unsigned int reserve)
+{
+	struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
+	unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
+
+	if (ncpus > CO_CPU_MAX) {
+		printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n",
+			ncpus, mp);
+
+		ncpus = CO_CPU_MAX;
+	}
+
+	if (ncpus > maxcpus)
+		ncpus = maxcpus;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	smp_found_config = 1;
+#endif
+	while (ncpus--)
+		MP_processor_info(mp++);
+
+	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+	return 1;
+}
+
+extern int visws_trap_init_quirk(void);
+
+void __init visws_early_detect(void)
+{
+	int raw;
+
+	visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG)
+							 >> PIIX_GPI_BD_SHIFT;
+
+	if (visws_board_type < 0)
+		return;
+
+	/*
+	 * Install special quirks for timer, interrupt and memory setup:
+	 */
+	arch_time_init_quirk		= visws_time_init_quirk;
+	arch_pre_intr_init_quirk	= visws_pre_intr_init_quirk;
+	arch_memory_setup_quirk		= visws_memory_setup_quirk;
+
+	/*
+	 * Fall back to generic behavior for traps:
+	 */
+	arch_intr_init_quirk		= NULL;
+	arch_trap_init_quirk		= visws_trap_init_quirk;
+
+	/*
+	 * Install reboot quirks:
+	 */
+	pm_power_off			= visws_machine_power_off;
+	machine_ops.emergency_restart	= visws_machine_emergency_restart;
+
+	/*
+	 * Do not use broadcast IPIs:
+	 */
+	no_broadcast = 0;
+
+	/*
+	 * Override generic MP-table parsing:
+	 */
+	mach_get_smp_config_quirk	= visws_get_smp_config_quirk;
+	mach_find_smp_config_quirk	= visws_find_smp_config_quirk;
+
+#ifdef CONFIG_X86_IO_APIC
+	/*
+	 * Turn off IO-APIC detection and initialization:
+	 */
+	skip_ioapic_setup		= 1;
+#endif
+
+	/*
+	 * Get Board rev.
+	 * First, we have to initialize the 307 part to allow us access
+	 * to the GPIO registers.  Let's map them at 0x0fc0 which is right
+	 * after the PIIX4 PM section.
+	 */
+	outb_p(SIO_DEV_SEL, SIO_INDEX);
+	outb_p(SIO_GP_DEV, SIO_DATA);	/* Talk to GPIO regs. */
+
+	outb_p(SIO_DEV_MSB, SIO_INDEX);
+	outb_p(SIO_GP_MSB, SIO_DATA);	/* MSB of GPIO base address */
+
+	outb_p(SIO_DEV_LSB, SIO_INDEX);
+	outb_p(SIO_GP_LSB, SIO_DATA);	/* LSB of GPIO base address */
+
+	outb_p(SIO_DEV_ENB, SIO_INDEX);
+	outb_p(1, SIO_DATA);		/* Enable GPIO registers. */
+
+	/*
+	 * Now, we have to map the power management section to write
+	 * a bit which enables access to the GPIO registers.
+	 * What lunatic came up with this shit?
+	 */
+	outb_p(SIO_DEV_SEL, SIO_INDEX);
+	outb_p(SIO_PM_DEV, SIO_DATA);	/* Talk to GPIO regs. */
+
+	outb_p(SIO_DEV_MSB, SIO_INDEX);
+	outb_p(SIO_PM_MSB, SIO_DATA);	/* MSB of PM base address */
+
+	outb_p(SIO_DEV_LSB, SIO_INDEX);
+	outb_p(SIO_PM_LSB, SIO_DATA);	/* LSB of PM base address */
+
+	outb_p(SIO_DEV_ENB, SIO_INDEX);
+	outb_p(1, SIO_DATA);		/* Enable PM registers. */
+
+	/*
+	 * Now, write the PM register which enables the GPIO registers.
+	 */
+	outb_p(SIO_PM_FER2, SIO_PM_INDEX);
+	outb_p(SIO_PM_GP_EN, SIO_PM_DATA);
+
+	/*
+	 * Now, initialize the GPIO registers.
+	 * We want them all to be inputs which is the
+	 * power on default, so let's leave them alone.
+	 * So, let's just read the board rev!
+	 */
+	raw = inb_p(SIO_GP_DATA1);
+	raw &= 0x7f;	/* 7 bits of valid board revision ID. */
+
+	if (visws_board_type == VISWS_320) {
+		if (raw < 0x6) {
+			visws_board_rev = 4;
+		} else if (raw < 0xc) {
+			visws_board_rev = 5;
+		} else {
+			visws_board_rev = 6;
+		}
+	} else if (visws_board_type == VISWS_540) {
+			visws_board_rev = 2;
+		} else {
+			visws_board_rev = raw;
+		}
+
+	printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n",
+	       (visws_board_type == VISWS_320 ? "320" :
+	       (visws_board_type == VISWS_540 ? "540" :
+		"unknown")), visws_board_rev);
+}
-- 
cgit v1.2.3


From 26dd9fcfc2abc298e3c60597bbe6405826aabf91 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 10 Jul 2008 16:21:38 +0200
Subject: x86, VisWS: turn into generic arch, clean up

merge traps_visws.c and apic_visws.c into visws_quirks.c.

(no code changed)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/Makefile       |   2 +-
 arch/x86/kernel/apic_visws.c   | 295 --------------------------------
 arch/x86/kernel/traps_visws.c  |  71 --------
 arch/x86/kernel/visws_quirks.c | 373 ++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 372 insertions(+), 369 deletions(-)
 delete mode 100644 arch/x86/kernel/apic_visws.c
 delete mode 100644 arch/x86/kernel/traps_visws.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 2ef0fbd3488f..f77dd6774bba 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -19,7 +19,7 @@ obj-y			:= process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
 obj-y			+= traps_$(BITS).o irq_$(BITS).o
 obj-y			+= time_$(BITS).o ioport.o ldt.o
 obj-y			+= setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
-obj-$(CONFIG_X86_VISWS)	+= visws_quirks.o traps_visws.o apic_visws.o
+obj-$(CONFIG_X86_VISWS)	+= visws_quirks.o
 obj-$(CONFIG_X86_32)	+= probe_roms_32.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
diff --git a/arch/x86/kernel/apic_visws.c b/arch/x86/kernel/apic_visws.c
deleted file mode 100644
index 6c02c8d09932..000000000000
--- a/arch/x86/kernel/apic_visws.c
+++ /dev/null
@@ -1,295 +0,0 @@
-/*
- *	Copyright (C) 1999 Bent Hagemark, Ingo Molnar
- *
- *  SGI Visual Workstation interrupt controller
- *
- *  The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC
- *  which serves as the main interrupt controller in the system.  Non-legacy
- *  hardware in the system uses this controller directly.  Legacy devices
- *  are connected to the PIIX4 which in turn has its 8259(s) connected to
- *  a of the Cobalt APIC entry.
- *
- *  09/02/2000 - Updated for 2.4 by jbarnes@sgi.com
- *
- *  25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru>
- */
-
-#include <linux/kernel_stat.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-
-#include <asm/io.h>
-#include <asm/apic.h>
-#include <asm/i8259.h>
-#include <asm/irq_vectors.h>
-#include <asm/visws/cobalt.h>
-
-static DEFINE_SPINLOCK(cobalt_lock);
-
-/*
- * Set the given Cobalt APIC Redirection Table entry to point
- * to the given IDT vector/index.
- */
-static inline void co_apic_set(int entry, int irq)
-{
-	co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR));
-	co_apic_write(CO_APIC_HI(entry), 0);
-}
-
-/*
- * Cobalt (IO)-APIC functions to handle PCI devices.
- */
-static inline int co_apic_ide0_hack(void)
-{
-	extern char visws_board_type;
-	extern char visws_board_rev;
-
-	if (visws_board_type == VISWS_320 && visws_board_rev == 5)
-		return 5;
-	return CO_APIC_IDE0;
-}
-
-static int is_co_apic(unsigned int irq)
-{
-	if (IS_CO_APIC(irq))
-		return CO_APIC(irq);
-
-	switch (irq) {
-		case 0: return CO_APIC_CPU;
-		case CO_IRQ_IDE0: return co_apic_ide0_hack();
-		case CO_IRQ_IDE1: return CO_APIC_IDE1;
-		default: return -1;
-	}
-}
-
-
-/*
- * This is the SGI Cobalt (IO-)APIC:
- */
-
-static void enable_cobalt_irq(unsigned int irq)
-{
-	co_apic_set(is_co_apic(irq), irq);
-}
-
-static void disable_cobalt_irq(unsigned int irq)
-{
-	int entry = is_co_apic(irq);
-
-	co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK);
-	co_apic_read(CO_APIC_LO(entry));
-}
-
-/*
- * "irq" really just serves to identify the device.  Here is where we
- * map this to the Cobalt APIC entry where it's physically wired.
- * This is called via request_irq -> setup_irq -> irq_desc->startup()
- */
-static unsigned int startup_cobalt_irq(unsigned int irq)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&cobalt_lock, flags);
-	if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
-		irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
-	enable_cobalt_irq(irq);
-	spin_unlock_irqrestore(&cobalt_lock, flags);
-	return 0;
-}
-
-static void ack_cobalt_irq(unsigned int irq)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&cobalt_lock, flags);
-	disable_cobalt_irq(irq);
-	apic_write(APIC_EOI, APIC_EIO_ACK);
-	spin_unlock_irqrestore(&cobalt_lock, flags);
-}
-
-static void end_cobalt_irq(unsigned int irq)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&cobalt_lock, flags);
-	if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS)))
-		enable_cobalt_irq(irq);
-	spin_unlock_irqrestore(&cobalt_lock, flags);
-}
-
-static struct irq_chip cobalt_irq_type = {
-	.typename =	"Cobalt-APIC",
-	.startup =	startup_cobalt_irq,
-	.shutdown =	disable_cobalt_irq,
-	.enable =	enable_cobalt_irq,
-	.disable =	disable_cobalt_irq,
-	.ack =		ack_cobalt_irq,
-	.end =		end_cobalt_irq,
-};
-
-
-/*
- * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt
- * -- not the manner expected by the code in i8259.c.
- *
- * there is a 'master' physical interrupt source that gets sent to
- * the CPU. But in the chipset there are various 'virtual' interrupts
- * waiting to be handled. We represent this to Linux through a 'master'
- * interrupt controller type, and through a special virtual interrupt-
- * controller. Device drivers only see the virtual interrupt sources.
- */
-static unsigned int startup_piix4_master_irq(unsigned int irq)
-{
-	init_8259A(0);
-
-	return startup_cobalt_irq(irq);
-}
-
-static void end_piix4_master_irq(unsigned int irq)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&cobalt_lock, flags);
-	enable_cobalt_irq(irq);
-	spin_unlock_irqrestore(&cobalt_lock, flags);
-}
-
-static struct irq_chip piix4_master_irq_type = {
-	.typename =	"PIIX4-master",
-	.startup =	startup_piix4_master_irq,
-	.ack =		ack_cobalt_irq,
-	.end =		end_piix4_master_irq,
-};
-
-
-static struct irq_chip piix4_virtual_irq_type = {
-	.typename =	"PIIX4-virtual",
-	.shutdown =	disable_8259A_irq,
-	.enable =	enable_8259A_irq,
-	.disable =	disable_8259A_irq,
-};
-
-
-/*
- * PIIX4-8259 master/virtual functions to handle interrupt requests
- * from legacy devices: floppy, parallel, serial, rtc.
- *
- * None of these get Cobalt APIC entries, neither do they have IDT
- * entries. These interrupts are purely virtual and distributed from
- * the 'master' interrupt source: CO_IRQ_8259.
- *
- * When the 8259 interrupts its handler figures out which of these
- * devices is interrupting and dispatches to its handler.
- *
- * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/
- * enable_irq gets the right irq. This 'master' irq is never directly
- * manipulated by any driver.
- */
-static irqreturn_t piix4_master_intr(int irq, void *dev_id)
-{
-	int realirq;
-	irq_desc_t *desc;
-	unsigned long flags;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-
-	/* Find out what's interrupting in the PIIX4 master 8259 */
-	outb(0x0c, 0x20);		/* OCW3 Poll command */
-	realirq = inb(0x20);
-
-	/*
-	 * Bit 7 == 0 means invalid/spurious
-	 */
-	if (unlikely(!(realirq & 0x80)))
-		goto out_unlock;
-
-	realirq &= 7;
-
-	if (unlikely(realirq == 2)) {
-		outb(0x0c, 0xa0);
-		realirq = inb(0xa0);
-
-		if (unlikely(!(realirq & 0x80)))
-			goto out_unlock;
-
-		realirq = (realirq & 7) + 8;
-	}
-
-	/* mask and ack interrupt */
-	cached_irq_mask |= 1 << realirq;
-	if (unlikely(realirq > 7)) {
-		inb(0xa1);
-		outb(cached_slave_mask, 0xa1);
-		outb(0x60 + (realirq & 7), 0xa0);
-		outb(0x60 + 2, 0x20);
-	} else {
-		inb(0x21);
-		outb(cached_master_mask, 0x21);
-		outb(0x60 + realirq, 0x20);
-	}
-
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-
-	desc = irq_desc + realirq;
-
-	/*
-	 * handle this 'virtual interrupt' as a Cobalt one now.
-	 */
-	kstat_cpu(smp_processor_id()).irqs[realirq]++;
-
-	if (likely(desc->action != NULL))
-		handle_IRQ_event(realirq, desc->action);
-
-	if (!(desc->status & IRQ_DISABLED))
-		enable_8259A_irq(realirq);
-
-	return IRQ_HANDLED;
-
-out_unlock:
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-	return IRQ_NONE;
-}
-
-static struct irqaction master_action = {
-	.handler =	piix4_master_intr,
-	.name =		"PIIX4-8259",
-};
-
-static struct irqaction cascade_action = {
-	.handler = 	no_action,
-	.name =		"cascade",
-};
-
-
-void init_VISWS_APIC_irqs(void)
-{
-	int i;
-
-	for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
-		irq_desc[i].status = IRQ_DISABLED;
-		irq_desc[i].action = 0;
-		irq_desc[i].depth = 1;
-
-		if (i == 0) {
-			irq_desc[i].chip = &cobalt_irq_type;
-		}
-		else if (i == CO_IRQ_IDE0) {
-			irq_desc[i].chip = &cobalt_irq_type;
-		}
-		else if (i == CO_IRQ_IDE1) {
-			irq_desc[i].chip = &cobalt_irq_type;
-		}
-		else if (i == CO_IRQ_8259) {
-			irq_desc[i].chip = &piix4_master_irq_type;
-		}
-		else if (i < CO_IRQ_APIC0) {
-			irq_desc[i].chip = &piix4_virtual_irq_type;
-		}
-		else if (IS_CO_APIC(i)) {
-			irq_desc[i].chip = &cobalt_irq_type;
-		}
-	}
-
-	setup_irq(CO_IRQ_8259, &master_action);
-	setup_irq(2, &cascade_action);
-}
diff --git a/arch/x86/kernel/traps_visws.c b/arch/x86/kernel/traps_visws.c
deleted file mode 100644
index e5e6492c2676..000000000000
--- a/arch/x86/kernel/traps_visws.c
+++ /dev/null
@@ -1,71 +0,0 @@
-/* VISWS traps */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/pci_ids.h>
-
-#include <asm/io.h>
-#include <asm/apic.h>
-#include <asm/arch_hooks.h>
-#include <asm/visws/cobalt.h>
-#include <asm/visws/lithium.h>
-
-
-#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4)
-#define BCD (LI_INTB | LI_INTC | LI_INTD)
-#define ALLDEVS (A01234 | BCD)
-
-static __init void lithium_init(void)
-{
-	set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS);
-	set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS);
-
-	if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
-	    (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
-		printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A');
-/*		panic("This machine is not SGI Visual Workstation 320/540"); */
-	}
-
-	if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
-	    (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
-		printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B');
-/*		panic("This machine is not SGI Visual Workstation 320/540"); */
-	}
-
-	li_pcia_write16(LI_PCI_INTEN, ALLDEVS);
-	li_pcib_write16(LI_PCI_INTEN, ALLDEVS);
-}
-
-static __init void cobalt_init(void)
-{
-	/*
-	 * On normal SMP PC this is used only with SMP, but we have to
-	 * use it and set it up here to start the Cobalt clock
-	 */
-	set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
-	setup_local_APIC();
-	printk(KERN_INFO "Local APIC Version %#x, ID %#x\n",
-		(unsigned int)apic_read(APIC_LVR),
-		(unsigned int)apic_read(APIC_ID));
-
-	set_fixmap(FIX_CO_CPU, CO_CPU_PHYS);
-	set_fixmap(FIX_CO_APIC, CO_APIC_PHYS);
-	printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n",
-		co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID));
-
-	/* Enable Cobalt APIC being careful to NOT change the ID! */
-	co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE);
-
-	printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n",
-		co_apic_read(CO_APIC_ID));
-}
-
-int __init visws_trap_init_quirk(void)
-{
-	lithium_init();
-	cobalt_init();
-
-	return 1;
-}
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index e19b91c47e90..5c715c24ab55 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -1,8 +1,22 @@
 /*
- *  Unmaintained SGI Visual Workstation support.
+ *  SGI Visual Workstation support and quirks, unmaintained.
+ *
  *  Split out from setup.c by davej@suse.de
+ *
+ *	Copyright (C) 1999 Bent Hagemark, Ingo Molnar
+ *
+ *  SGI Visual Workstation interrupt controller
+ *
+ *  The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC
+ *  which serves as the main interrupt controller in the system.  Non-legacy
+ *  hardware in the system uses this controller directly.  Legacy devices
+ *  are connected to the PIIX4 which in turn has its 8259(s) connected to
+ *  a of the Cobalt APIC entry.
+ *
+ *  09/02/2000 - Updated for 2.4 by jbarnes@sgi.com
+ *
+ *  25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru>
  */
-
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/init.h>
@@ -25,6 +39,30 @@
 #include <linux/init.h>
 #include <linux/smp.h>
 
+#include <linux/kernel_stat.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/i8259.h>
+#include <asm/irq_vectors.h>
+#include <asm/visws/cobalt.h>
+#include <asm/visws/lithium.h>
+#include <asm/visws/piix4.h>
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/arch_hooks.h>
+#include <asm/visws/cobalt.h>
+#include <asm/visws/lithium.h>
+
 char visws_board_type	= -1;
 char visws_board_rev	= -1;
 
@@ -336,3 +374,334 @@ void __init visws_early_detect(void)
 	       (visws_board_type == VISWS_540 ? "540" :
 		"unknown")), visws_board_rev);
 }
+
+#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4)
+#define BCD (LI_INTB | LI_INTC | LI_INTD)
+#define ALLDEVS (A01234 | BCD)
+
+static __init void lithium_init(void)
+{
+	set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS);
+	set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS);
+
+	if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
+	    (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
+		printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A');
+/*		panic("This machine is not SGI Visual Workstation 320/540"); */
+	}
+
+	if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
+	    (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
+		printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B');
+/*		panic("This machine is not SGI Visual Workstation 320/540"); */
+	}
+
+	li_pcia_write16(LI_PCI_INTEN, ALLDEVS);
+	li_pcib_write16(LI_PCI_INTEN, ALLDEVS);
+}
+
+static __init void cobalt_init(void)
+{
+	/*
+	 * On normal SMP PC this is used only with SMP, but we have to
+	 * use it and set it up here to start the Cobalt clock
+	 */
+	set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
+	setup_local_APIC();
+	printk(KERN_INFO "Local APIC Version %#x, ID %#x\n",
+		(unsigned int)apic_read(APIC_LVR),
+		(unsigned int)apic_read(APIC_ID));
+
+	set_fixmap(FIX_CO_CPU, CO_CPU_PHYS);
+	set_fixmap(FIX_CO_APIC, CO_APIC_PHYS);
+	printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n",
+		co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID));
+
+	/* Enable Cobalt APIC being careful to NOT change the ID! */
+	co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE);
+
+	printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n",
+		co_apic_read(CO_APIC_ID));
+}
+
+int __init visws_trap_init_quirk(void)
+{
+	lithium_init();
+	cobalt_init();
+
+	return 1;
+}
+
+/*
+ * IRQ controller / APIC support:
+ */
+
+static DEFINE_SPINLOCK(cobalt_lock);
+
+/*
+ * Set the given Cobalt APIC Redirection Table entry to point
+ * to the given IDT vector/index.
+ */
+static inline void co_apic_set(int entry, int irq)
+{
+	co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR));
+	co_apic_write(CO_APIC_HI(entry), 0);
+}
+
+/*
+ * Cobalt (IO)-APIC functions to handle PCI devices.
+ */
+static inline int co_apic_ide0_hack(void)
+{
+	extern char visws_board_type;
+	extern char visws_board_rev;
+
+	if (visws_board_type == VISWS_320 && visws_board_rev == 5)
+		return 5;
+	return CO_APIC_IDE0;
+}
+
+static int is_co_apic(unsigned int irq)
+{
+	if (IS_CO_APIC(irq))
+		return CO_APIC(irq);
+
+	switch (irq) {
+		case 0: return CO_APIC_CPU;
+		case CO_IRQ_IDE0: return co_apic_ide0_hack();
+		case CO_IRQ_IDE1: return CO_APIC_IDE1;
+		default: return -1;
+	}
+}
+
+
+/*
+ * This is the SGI Cobalt (IO-)APIC:
+ */
+
+static void enable_cobalt_irq(unsigned int irq)
+{
+	co_apic_set(is_co_apic(irq), irq);
+}
+
+static void disable_cobalt_irq(unsigned int irq)
+{
+	int entry = is_co_apic(irq);
+
+	co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK);
+	co_apic_read(CO_APIC_LO(entry));
+}
+
+/*
+ * "irq" really just serves to identify the device.  Here is where we
+ * map this to the Cobalt APIC entry where it's physically wired.
+ * This is called via request_irq -> setup_irq -> irq_desc->startup()
+ */
+static unsigned int startup_cobalt_irq(unsigned int irq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cobalt_lock, flags);
+	if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
+		irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
+	enable_cobalt_irq(irq);
+	spin_unlock_irqrestore(&cobalt_lock, flags);
+	return 0;
+}
+
+static void ack_cobalt_irq(unsigned int irq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cobalt_lock, flags);
+	disable_cobalt_irq(irq);
+	apic_write(APIC_EOI, APIC_EIO_ACK);
+	spin_unlock_irqrestore(&cobalt_lock, flags);
+}
+
+static void end_cobalt_irq(unsigned int irq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cobalt_lock, flags);
+	if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS)))
+		enable_cobalt_irq(irq);
+	spin_unlock_irqrestore(&cobalt_lock, flags);
+}
+
+static struct irq_chip cobalt_irq_type = {
+	.typename =	"Cobalt-APIC",
+	.startup =	startup_cobalt_irq,
+	.shutdown =	disable_cobalt_irq,
+	.enable =	enable_cobalt_irq,
+	.disable =	disable_cobalt_irq,
+	.ack =		ack_cobalt_irq,
+	.end =		end_cobalt_irq,
+};
+
+
+/*
+ * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt
+ * -- not the manner expected by the code in i8259.c.
+ *
+ * there is a 'master' physical interrupt source that gets sent to
+ * the CPU. But in the chipset there are various 'virtual' interrupts
+ * waiting to be handled. We represent this to Linux through a 'master'
+ * interrupt controller type, and through a special virtual interrupt-
+ * controller. Device drivers only see the virtual interrupt sources.
+ */
+static unsigned int startup_piix4_master_irq(unsigned int irq)
+{
+	init_8259A(0);
+
+	return startup_cobalt_irq(irq);
+}
+
+static void end_piix4_master_irq(unsigned int irq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cobalt_lock, flags);
+	enable_cobalt_irq(irq);
+	spin_unlock_irqrestore(&cobalt_lock, flags);
+}
+
+static struct irq_chip piix4_master_irq_type = {
+	.typename =	"PIIX4-master",
+	.startup =	startup_piix4_master_irq,
+	.ack =		ack_cobalt_irq,
+	.end =		end_piix4_master_irq,
+};
+
+
+static struct irq_chip piix4_virtual_irq_type = {
+	.typename =	"PIIX4-virtual",
+	.shutdown =	disable_8259A_irq,
+	.enable =	enable_8259A_irq,
+	.disable =	disable_8259A_irq,
+};
+
+
+/*
+ * PIIX4-8259 master/virtual functions to handle interrupt requests
+ * from legacy devices: floppy, parallel, serial, rtc.
+ *
+ * None of these get Cobalt APIC entries, neither do they have IDT
+ * entries. These interrupts are purely virtual and distributed from
+ * the 'master' interrupt source: CO_IRQ_8259.
+ *
+ * When the 8259 interrupts its handler figures out which of these
+ * devices is interrupting and dispatches to its handler.
+ *
+ * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/
+ * enable_irq gets the right irq. This 'master' irq is never directly
+ * manipulated by any driver.
+ */
+static irqreturn_t piix4_master_intr(int irq, void *dev_id)
+{
+	int realirq;
+	irq_desc_t *desc;
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+
+	/* Find out what's interrupting in the PIIX4 master 8259 */
+	outb(0x0c, 0x20);		/* OCW3 Poll command */
+	realirq = inb(0x20);
+
+	/*
+	 * Bit 7 == 0 means invalid/spurious
+	 */
+	if (unlikely(!(realirq & 0x80)))
+		goto out_unlock;
+
+	realirq &= 7;
+
+	if (unlikely(realirq == 2)) {
+		outb(0x0c, 0xa0);
+		realirq = inb(0xa0);
+
+		if (unlikely(!(realirq & 0x80)))
+			goto out_unlock;
+
+		realirq = (realirq & 7) + 8;
+	}
+
+	/* mask and ack interrupt */
+	cached_irq_mask |= 1 << realirq;
+	if (unlikely(realirq > 7)) {
+		inb(0xa1);
+		outb(cached_slave_mask, 0xa1);
+		outb(0x60 + (realirq & 7), 0xa0);
+		outb(0x60 + 2, 0x20);
+	} else {
+		inb(0x21);
+		outb(cached_master_mask, 0x21);
+		outb(0x60 + realirq, 0x20);
+	}
+
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+
+	desc = irq_desc + realirq;
+
+	/*
+	 * handle this 'virtual interrupt' as a Cobalt one now.
+	 */
+	kstat_cpu(smp_processor_id()).irqs[realirq]++;
+
+	if (likely(desc->action != NULL))
+		handle_IRQ_event(realirq, desc->action);
+
+	if (!(desc->status & IRQ_DISABLED))
+		enable_8259A_irq(realirq);
+
+	return IRQ_HANDLED;
+
+out_unlock:
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+	return IRQ_NONE;
+}
+
+static struct irqaction master_action = {
+	.handler =	piix4_master_intr,
+	.name =		"PIIX4-8259",
+};
+
+static struct irqaction cascade_action = {
+	.handler = 	no_action,
+	.name =		"cascade",
+};
+
+
+void init_VISWS_APIC_irqs(void)
+{
+	int i;
+
+	for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
+		irq_desc[i].status = IRQ_DISABLED;
+		irq_desc[i].action = 0;
+		irq_desc[i].depth = 1;
+
+		if (i == 0) {
+			irq_desc[i].chip = &cobalt_irq_type;
+		}
+		else if (i == CO_IRQ_IDE0) {
+			irq_desc[i].chip = &cobalt_irq_type;
+		}
+		else if (i == CO_IRQ_IDE1) {
+			irq_desc[i].chip = &cobalt_irq_type;
+		}
+		else if (i == CO_IRQ_8259) {
+			irq_desc[i].chip = &piix4_master_irq_type;
+		}
+		else if (i < CO_IRQ_APIC0) {
+			irq_desc[i].chip = &piix4_virtual_irq_type;
+		}
+		else if (IS_CO_APIC(i)) {
+			irq_desc[i].chip = &cobalt_irq_type;
+		}
+	}
+
+	setup_irq(CO_IRQ_8259, &master_action);
+	setup_irq(2, &cascade_action);
+}
-- 
cgit v1.2.3


From b10e9ad0f1d0dc62bd444dd6761a6527bfe98959 Mon Sep 17 00:00:00 2001
From: Daniel Guilak <guilak@linux.vnet.ibm.com>
Date: Thu, 10 Jul 2008 09:39:32 -0700
Subject: arch/x86/kernel/.gitignore: Added vmlinux.lds to .gitignore file
 because it shouldn't be tracked.

Signed-off-by: Daniel Guilak <daniel@danielguilak.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/.gitignore | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/.gitignore b/arch/x86/kernel/.gitignore
index 4ea38a39aed4..08f4fd731469 100644
--- a/arch/x86/kernel/.gitignore
+++ b/arch/x86/kernel/.gitignore
@@ -1,2 +1,3 @@
 vsyscall.lds
 vsyscall_32.lds
+vmlinux.lds
-- 
cgit v1.2.3


From f78cb9b1cfe618e8b4cc0c118f187f54ed102f45 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 10 Jul 2008 19:39:55 +0200
Subject: x86, VisWS: build fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fix:

arch/x86/kernel/visws_quirks.c: In function ‘visws_early_detect’:
arch/x86/kernel/visws_quirks.c:293: error: ‘no_broadcast’ undeclared (first use in this function)
arch/x86/kernel/visws_quirks.c:293: error: (Each undeclared identifier is reported only once
arch/x86/kernel/visws_quirks.c:293: error: for each function it appears in.)
make[1]: *** [arch/x86/kernel/visws_quirks.o] Error 1
make: *** [arch/x86/kernel/visws_quirks.o] Error 2

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/visws_quirks.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 5c715c24ab55..e94bdb6add1d 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -57,6 +57,8 @@
 #include <linux/pci.h>
 #include <linux/pci_ids.h>
 
+extern int no_broadcast;
+
 #include <asm/io.h>
 #include <asm/apic.h>
 #include <asm/arch_hooks.h>
-- 
cgit v1.2.3


From e54afe38630e3b577968428f48ed8ef1e13a2a15 Mon Sep 17 00:00:00 2001
From: Glauber Costa <gcosta@redhat.com>
Date: Thu, 10 Jul 2008 14:01:47 -0300
Subject: x86: remove duplicate call to use_tsc_delay

Integration generated a duplicate call to use_tsc_delay.
Particularly, the one that is done before we check for general
tsc usability seems wrong.

Signed-off-by: Glauber Costa <gcosta@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tsc.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 4a775d001957..3c36f92160c9 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -513,7 +513,6 @@ void __init tsc_init(void)
 	 */
 	for_each_possible_cpu(cpu)
 		set_cyc2ns_scale(cpu_khz, cpu);
-	use_tsc_delay();
 
 	if (tsc_disabled > 0)
 		return;
-- 
cgit v1.2.3


From 69a7704d7a80b2563278a0c55c2ca6d6202280dc Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Thu, 10 Jul 2008 04:17:00 -0700
Subject: x86: e820: user-defined memory maps: remove the range instead of
 update it to reserved

also let mem= to print out modified e820 map too

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Bernhard Walle <bwalle@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 9836a079cfd9..3451e0b3f324 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1165,6 +1165,8 @@ static void early_panic(char *msg)
 	panic(msg);
 }
 
+static int userdef __initdata;
+
 /* "mem=nopentium" disables the 4MB page tables. */
 static int __init parse_memopt(char *p)
 {
@@ -1180,17 +1182,15 @@ static int __init parse_memopt(char *p)
 	}
 #endif
 
+	userdef = 1;
 	mem_size = memparse(p, &p);
 	end_user_pfn = mem_size>>PAGE_SHIFT;
-	e820_update_range(mem_size, ULLONG_MAX - mem_size,
-		E820_RAM, E820_RESERVED);
+	e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
 
 	return 0;
 }
 early_param("mem", parse_memopt);
 
-static int userdef __initdata;
-
 static int __init parse_memmap_opt(char *p)
 {
 	char *oldp;
@@ -1230,8 +1230,7 @@ static int __init parse_memmap_opt(char *p)
 		e820_add_region(start_at, mem_size, E820_RESERVED);
 	} else {
 		end_user_pfn = (mem_size >> PAGE_SHIFT);
-		e820_update_range(mem_size, ULLONG_MAX - mem_size,
-			E820_RAM, E820_RESERVED);
+		e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
 	}
 	return *p == '\0' ? 0 : -EINVAL;
 }
-- 
cgit v1.2.3


From f361a450bf1ad14e2b003217dbf3958638631265 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Thu, 10 Jul 2008 20:38:26 -0700
Subject: x86: introduce max_low_pfn_mapped for 64-bit

when more than 4g memory is installed, don't map the big hole below 4g.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/boot.c  |  2 +-
 arch/x86/kernel/cpu/amd_64.c | 10 +++++++---
 arch/x86/kernel/e820.c       | 23 ++++++++++++++++++++---
 arch/x86/kernel/efi.c        |  2 +-
 arch/x86/kernel/setup.c      | 22 ++++++++++++++++++----
 arch/x86/mm/init_32.c        |  1 +
 arch/x86/mm/init_64.c        |  1 +
 arch/x86/mm/pageattr.c       | 19 +++++++++++++++++--
 arch/x86/mm/pat.c            |  3 ++-
 arch/x86/pci/i386.c          |  4 +++-
 include/asm-x86/e820.h       |  3 ++-
 include/asm-x86/page.h       |  1 +
 12 files changed, 74 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index a31a579a47ca..9c981c4a3644 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -130,7 +130,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
 	if (!phys || !size)
 		return NULL;
 
-	if (phys+size <= (max_pfn_mapped << PAGE_SHIFT))
+	if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT))
 		return __va(phys);
 
 	offset = phys & (PAGE_SIZE - 1);
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
index 958526d6a74a..bd182b7616ee 100644
--- a/arch/x86/kernel/cpu/amd_64.c
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -199,10 +199,14 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 		 * Don't do it for gbpages because there seems very little
 		 * benefit in doing so.
 		 */
-		if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
-		    (tseg >> PMD_SHIFT) <
-			(max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
+		if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
+		    if ((tseg>>PMD_SHIFT) <
+				(max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
+			((tseg>>PMD_SHIFT) <
+				(max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
+			 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
 			set_memory_4k((unsigned long)__va(tseg), 1);
+		}
 	}
 }
 
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 3451e0b3f324..9f5002e0b35c 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1056,7 +1056,7 @@ unsigned long __initdata end_user_pfn = MAX_ARCH_PFN;
 /*
  * Find the highest page frame number we have available
  */
-unsigned long __init e820_end(void)
+static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
 {
 	int i;
 	unsigned long last_pfn = 0;
@@ -1064,12 +1064,21 @@ unsigned long __init e820_end(void)
 
 	for (i = 0; i < e820.nr_map; i++) {
 		struct e820entry *ei = &e820.map[i];
+		unsigned long start_pfn;
 		unsigned long end_pfn;
 
-		if (ei->type != E820_RAM)
+		if (ei->type != type)
 			continue;
 
+		start_pfn = ei->addr >> PAGE_SHIFT;
 		end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
+
+		if (start_pfn >= limit_pfn)
+			continue;
+		if (end_pfn > limit_pfn) {
+			last_pfn = limit_pfn;
+			break;
+		}
 		if (end_pfn > last_pfn)
 			last_pfn = end_pfn;
 	}
@@ -1083,7 +1092,15 @@ unsigned long __init e820_end(void)
 			 last_pfn, max_arch_pfn);
 	return last_pfn;
 }
+unsigned long __init e820_end_of_ram_pfn(void)
+{
+	return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
+}
 
+unsigned long __init e820_end_of_low_ram_pfn(void)
+{
+	return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
+}
 /*
  * Finds an active region in the address range from start_pfn to last_pfn and
  * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
@@ -1206,7 +1223,7 @@ static int __init parse_memmap_opt(char *p)
 		 * the real mem size before original memory map is
 		 * reset.
 		 */
-		saved_max_pfn = e820_end();
+		saved_max_pfn = e820_end_of_ram_pfn();
 #endif
 		e820.nr_map = 0;
 		userdef = 1;
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 94382faeadb6..06cc8d4254b1 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -473,7 +473,7 @@ void __init efi_enter_virtual_mode(void)
 		size = md->num_pages << EFI_PAGE_SHIFT;
 		end = md->phys_addr + size;
 
-		if (PFN_UP(end) <= max_pfn_mapped)
+		if (PFN_UP(end) <= max_low_pfn_mapped)
 			va = __va(md->phys_addr);
 		else
 			va = efi_ioremap(md->phys_addr, size);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index a7c3471ea17c..86fc2d624270 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -713,14 +713,14 @@ void __init setup_arch(char **cmdline_p)
 	 * partially used pages are not usable - thus
 	 * we are rounding upwards:
 	 */
-	max_pfn = e820_end();
+	max_pfn = e820_end_of_ram_pfn();
 
 	/* preallocate 4k for mptable mpc */
 	early_reserve_e820_mpc_new();
 	/* update e820 for memory not covered by WB MTRRs */
 	mtrr_bp_init();
 	if (mtrr_trim_uncached_memory(max_pfn))
-		max_pfn = e820_end();
+		max_pfn = e820_end_of_ram_pfn();
 
 #ifdef CONFIG_X86_32
 	/* max_low_pfn get updated here */
@@ -732,12 +732,26 @@ void __init setup_arch(char **cmdline_p)
 
 	/* How many end-of-memory variables you have, grandma! */
 	/* need this before calling reserve_initrd */
-	max_low_pfn = max_pfn;
+	if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
+		max_low_pfn = e820_end_of_low_ram_pfn();
+	else
+		max_low_pfn = max_pfn;
+
 	high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
 #endif
 
 	/* max_pfn_mapped is updated here */
-	max_pfn_mapped = init_memory_mapping(0, (max_low_pfn << PAGE_SHIFT));
+	max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
+	max_pfn_mapped = max_low_pfn_mapped;
+
+#ifdef CONFIG_X86_64
+	if (max_pfn > max_low_pfn) {
+		max_pfn_mapped = init_memory_mapping(1UL<<32,
+						     max_pfn<<PAGE_SHIFT);
+		/* can we preseve max_low_pfn ?*/
+		max_low_pfn = max_pfn;
+	}
+#endif
 
 	/*
 	 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index b5a0fd5f4c5f..029e8cffca9e 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -50,6 +50,7 @@
 
 unsigned int __VMALLOC_RESERVE = 128 << 20;
 
+unsigned long max_low_pfn_mapped;
 unsigned long max_pfn_mapped;
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 48548ef7ddf8..122bcef222fc 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -53,6 +53,7 @@
  * The direct mapping extends to max_pfn_mapped, so that we can directly access
  * apertures, ACPI and other tables without having to play with fixmaps.
  */
+unsigned long max_low_pfn_mapped;
 unsigned long max_pfn_mapped;
 
 static unsigned long dma_reserve __initdata;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index afd40054d157..0389cb8f6b1a 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -536,8 +536,14 @@ static int split_large_page(pte_t *kpte, unsigned long address)
 		set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
 
 	if (address >= (unsigned long)__va(0) &&
+		address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
+		split_page_count(level);
+
+#ifdef CONFIG_X86_64
+	if (address >= (unsigned long)__va(1UL<<32) &&
 		address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
 		split_page_count(level);
+#endif
 
 	/*
 	 * Install the new, split up pagetable. Important details here:
@@ -655,12 +661,21 @@ static int cpa_process_alias(struct cpa_data *cpa)
 	if (cpa->pfn > max_pfn_mapped)
 		return 0;
 
+#ifdef CONFIG_X86_64
+	if (cpa->pfn > max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
+		return 0;
+#endif
 	/*
 	 * No need to redo, when the primary call touched the direct
 	 * mapping already:
 	 */
-	if (!within(cpa->vaddr, PAGE_OFFSET,
-		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
+	if (!(within(cpa->vaddr, PAGE_OFFSET,
+		    PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
+#ifdef CONFIG_X86_64
+		|| within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
+		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
+#endif
+	)) {
 
 		alias_cpa = *cpa;
 		alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index a885a1019b8a..749766c3c5cd 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -449,7 +449,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 	if (retval < 0)
 		return 0;
 
-	if (pfn <= max_pfn_mapped &&
+	if (((pfn <= max_low_pfn_mapped) ||
+	     (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn <= max_pfn_mapped)) &&
 	    ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
 		free_memtype(offset, offset + size);
 		printk(KERN_INFO
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 6ccd7a108cd4..5281e343dd9f 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -334,7 +334,9 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 		flags = new_flags;
 	}
 
-	if (vma->vm_pgoff <= max_pfn_mapped &&
+	if (((vma->vm_pgoff <= max_low_pfn_mapped) ||
+	     (vma->vm_pgoff >= (1UL<<(32 - PAGE_SHIFT)) &&
+	      vma->vm_pgoff <= max_pfn_mapped)) &&
 	    ioremap_change_attr((unsigned long)__va(addr), len, flags)) {
 		free_memtype(addr, addr + len);
 		return -EINVAL;
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index 78c03d7bf441..33e793e991d0 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -99,7 +99,8 @@ extern void free_early(u64 start, u64 end);
 extern void early_res_to_bootmem(u64 start, u64 end);
 extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
 
-extern unsigned long e820_end(void);
+extern unsigned long e820_end_of_ram_pfn(void);
+extern unsigned long e820_end_of_low_ram_pfn(void);
 extern int e820_find_active_region(const struct e820entry *ei,
 				  unsigned long start_pfn,
 				  unsigned long last_pfn,
diff --git a/include/asm-x86/page.h b/include/asm-x86/page.h
index b52ed85f32f5..28d7b4533b1a 100644
--- a/include/asm-x86/page.h
+++ b/include/asm-x86/page.h
@@ -61,6 +61,7 @@ extern void map_devmem(unsigned long pfn, unsigned long size,
 extern void unmap_devmem(unsigned long pfn, unsigned long size,
 			 pgprot_t vma_prot);
 
+extern unsigned long max_low_pfn_mapped;
 extern unsigned long max_pfn_mapped;
 
 struct page;
-- 
cgit v1.2.3


From 3d0decc4f49e8645cd6369b02ed076bebd3d61ad Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 11 Jul 2008 15:09:15 +0200
Subject: x86: fix tsc unification buglet with ftrace and stackprotector

Yinghai Lu reported crashes on 64-bit x86:

 BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
 IP: [<ffffffff80253b17>] hrtick_start_fair+0x89/0x173
 [...]

And with a long session of debugging and a lot of difficulty, tracked it down
to this commit:

 --------------->
 8fbbc4b45ce3e4c0eeb15004c79c72b6896a79c2 is first bad commit
 commit 8fbbc4b45ce3e4c0eeb15004c79c72b6896a79c2
 Author: Alok Kataria <akataria@vmware.com>
 Date:   Tue Jul 1 11:43:34 2008 -0700

     x86: merge tsc_init and clocksource code
 <--------------

The problem is that the TSC unification missed these Makefile rules
in arch/x86/kernel/Makefile:

  # Do not profile debug and lowlevel utilities
  CFLAGS_REMOVE_tsc_64.o = -pg
  CFLAGS_REMOVE_tsc_32.o = -pg
  ...
  CFLAGS_tsc_64.o         := $(nostackp)
  ...

which rules make sure that various instrumentation and debugging
facilities are disabled for code that might end up in a VDSO - such as
the TSC code.

Reported-and-bisected-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Conflicts:

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 59b14c940a28..4033d8dc745b 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -13,7 +13,7 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_vsyscall_64.o	:= $(PROFILING) -g0 $(nostackp)
 CFLAGS_hpet.o		:= $(nostackp)
-CFLAGS_tsc_64.o		:= $(nostackp)
+CFLAGS_tsc.o		:= $(nostackp)
 
 obj-y			:= process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
 obj-y			+= traps_$(BITS).o irq_$(BITS).o
-- 
cgit v1.2.3


From 8d28aab59fe939be40efae870ced0b05caa259fb Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Thu, 10 Jul 2008 16:22:56 -0700
Subject: x86_64: add pseudo-features for 32-bit compat syscall

Add pseudo-feature bits to describe whether the CPU supports sysenter
and/or syscall from ia32-compat userspace.  This removes a hardcoded
test in vdso32-setup.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/centaur_64.c | 2 ++
 arch/x86/kernel/cpu/common_64.c  | 3 +++
 arch/x86/kernel/cpu/intel_64.c   | 2 ++
 include/asm-x86/cpufeature.h     | 4 ++--
 4 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
index 13526fd5cce1..2026d2119cdb 100644
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ b/arch/x86/kernel/cpu/centaur_64.c
@@ -10,6 +10,8 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
 {
 	if (c->x86 == 0x6 && c->x86_model >= 0xf)
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
+	set_cpu_cap(c, X86_FEATURE_SYSENTER32);
 }
 
 static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
index 751850235291..36537ab9e56a 100644
--- a/arch/x86/kernel/cpu/common_64.c
+++ b/arch/x86/kernel/cpu/common_64.c
@@ -314,6 +314,9 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
 	if (c->extended_cpuid_level >= 0x80000007)
 		c->x86_power = cpuid_edx(0x80000007);
 
+	/* Assume all 64-bit CPUs support 32-bit syscall */
+	set_cpu_cap(c, X86_FEATURE_SYSCALL32);
+
 	if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
 	    cpu_devs[c->x86_vendor]->c_early_init)
 		cpu_devs[c->x86_vendor]->c_early_init(c);
diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c
index fcb1cc9d75ca..02f773399e39 100644
--- a/arch/x86/kernel/cpu/intel_64.c
+++ b/arch/x86/kernel/cpu/intel_64.c
@@ -12,6 +12,8 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
 	    (c->x86 == 0x6 && c->x86_model >= 0x0e))
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
+	set_cpu_cap(c, X86_FEATURE_SYSENTER32);
 }
 
 /*
diff --git a/include/asm-x86/cpufeature.h b/include/asm-x86/cpufeature.h
index 84a56da397b1..75ef959db329 100644
--- a/include/asm-x86/cpufeature.h
+++ b/include/asm-x86/cpufeature.h
@@ -74,8 +74,8 @@
 #define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
 #define X86_FEATURE_PEBS	(3*32+12)  /* Precise-Event Based Sampling */
 #define X86_FEATURE_BTS		(3*32+13)  /* Branch Trace Store */
-/* 14 free */
-/* 15 free */
+#define X86_FEATURE_SYSCALL32	(3*32+14)  /* syscall in ia32 userspace */
+#define X86_FEATURE_SYSENTER32	(3*32+15)  /* sysenter in ia32 userspace */
 #define X86_FEATURE_REP_GOOD	(3*32+16) /* rep microcode works well on this CPU */
 #define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* Mfence synchronizes RDTSC */
 #define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* Lfence synchronizes RDTSC */
-- 
cgit v1.2.3


From 557d7d4e294ee6fb1db0cb6c1ec97a1c908b880d Mon Sep 17 00:00:00 2001
From: Glauber Costa <gcosta@redhat.com>
Date: Thu, 10 Jul 2008 15:09:20 -0300
Subject: x86: use matching CFI_ENDPROC

The RING0_INT_FRAME macro defines a CFI_STARTPROC.
So we should really be using CFI_ENDPROC after it.

Signed-off-by: Glauber Costa <gcosta@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_32.S | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 53393c306e11..cfe28a715434 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1024,6 +1024,7 @@ ENTRY(xen_sysenter_target)
 	RING0_INT_FRAME
 	addl $5*4, %esp		/* remove xen-provided frame */
 	jmp sysenter_past_esp
+	CFI_ENDPROC
 
 ENTRY(xen_hypervisor_callback)
 	CFI_STARTPROC
-- 
cgit v1.2.3


From 1baea6e2fea6f235b21f32a322cb6cb43ffdb704 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Fri, 11 Jul 2008 19:34:36 +0100
Subject: x86: L-APIC: Set IRQ0 as edge-triggered

 IRQ0 is edge-triggered, but the "8259A Virtual Wire" through the local
APIC configuration in the 32-bit version uses the "fasteoi" handler
suitable for level-triggered APIC interrupt.  Rewrite code so that the
"edge" handler is used.  The 64-bit version uses different code and is
unaffected.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Matthew Garrett <mjg59@srcf.ucam.org>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_32.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 6b220b9dcbb3..50e1131a6ec6 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -2052,7 +2052,7 @@ static struct irq_chip lapic_chip __read_mostly = {
 	.name		= "local-APIC",
 	.mask		= mask_lapic_irq,
 	.unmask		= unmask_lapic_irq,
-	.eoi		= ack_apic,
+	.ack		= ack_apic,
 };
 
 static void __init setup_nmi(void)
@@ -2257,8 +2257,8 @@ static inline void __init check_timer(void)
 
 	printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
 
-	set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
-				      "fasteoi");
+	set_irq_chip_and_handler_name(0, &lapic_chip, handle_edge_irq,
+				      "edge");
 	apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector);	/* Fixed mode */
 	enable_8259A_irq(0);
 
-- 
cgit v1.2.3


From c88ac1df4885ce0d762cfeff0e7d5b83725c1e5c Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Fri, 11 Jul 2008 19:35:17 +0100
Subject: x86: L-APIC: Always fully configure IRQ0

Unlike the 32-bit one, the 64-bit variation of the LVT0 setup code for
the "8259A Virtual Wire" through the local APIC timer configuration does
not fully configure the relevant irq_chip structure.  Instead it relies on
the preceding I/O APIC code to have set it up, which does not happen if
the I/O APIC variants have not been tried.

The patch includes corresponding changes to the 32-bit variation too
which make them both the same, barring a small syntactic difference
involving sequence of functions in the source.  That should work as an aid
with the upcoming merge.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Matthew Garrett <mjg59@srcf.ucam.org>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_32.c | 15 +++++++++++----
 arch/x86/kernel/io_apic_64.c | 29 +++++++++++++++--------------
 2 files changed, 26 insertions(+), 18 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 50e1131a6ec6..74d49e04de58 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -2027,7 +2027,7 @@ static inline void init_IO_APIC_traps(void)
  * The local APIC irq-chip implementation:
  */
 
-static void ack_apic(unsigned int irq)
+static void ack_lapic_irq(unsigned int irq)
 {
 	ack_APIC_irq();
 }
@@ -2052,9 +2052,17 @@ static struct irq_chip lapic_chip __read_mostly = {
 	.name		= "local-APIC",
 	.mask		= mask_lapic_irq,
 	.unmask		= unmask_lapic_irq,
-	.ack		= ack_apic,
+	.ack		= ack_lapic_irq,
 };
 
+static void lapic_register_intr(int irq, int vector)
+{
+	irq_desc[irq].status &= ~IRQ_LEVEL;
+	set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
+				      "edge");
+	set_intr_gate(vector, interrupt[irq]);
+}
+
 static void __init setup_nmi(void)
 {
 	/*
@@ -2257,8 +2265,7 @@ static inline void __init check_timer(void)
 
 	printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
 
-	set_irq_chip_and_handler_name(0, &lapic_chip, handle_edge_irq,
-				      "edge");
+	lapic_register_intr(0, vector);
 	apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector);	/* Fixed mode */
 	enable_8259A_irq(0);
 
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 848411753c7c..07ebcd305fb7 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -1561,7 +1561,7 @@ static inline void init_IO_APIC_traps(void)
 	}
 }
 
-static void enable_lapic_irq (unsigned int irq)
+static void unmask_lapic_irq(unsigned int irq)
 {
 	unsigned long v;
 
@@ -1569,7 +1569,7 @@ static void enable_lapic_irq (unsigned int irq)
 	apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
 }
 
-static void disable_lapic_irq (unsigned int irq)
+static void mask_lapic_irq(unsigned int irq)
 {
 	unsigned long v;
 
@@ -1582,19 +1582,20 @@ static void ack_lapic_irq (unsigned int irq)
 	ack_APIC_irq();
 }
 
-static void end_lapic_irq (unsigned int i) { /* nothing */ }
-
-static struct hw_interrupt_type lapic_irq_type __read_mostly = {
-	.name = "local-APIC",
-	.typename = "local-APIC-edge",
-	.startup = NULL, /* startup_irq() not used for IRQ0 */
-	.shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
-	.enable = enable_lapic_irq,
-	.disable = disable_lapic_irq,
-	.ack = ack_lapic_irq,
-	.end = end_lapic_irq,
+static struct irq_chip lapic_chip __read_mostly = {
+	.name		= "local-APIC",
+	.mask		= mask_lapic_irq,
+	.unmask		= unmask_lapic_irq,
+	.ack		= ack_lapic_irq,
 };
 
+static void lapic_register_intr(int irq)
+{
+	irq_desc[irq].status &= ~IRQ_LEVEL;
+	set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
+				      "edge");
+}
+
 static void __init setup_nmi(void)
 {
 	/*
@@ -1784,7 +1785,7 @@ static inline void __init check_timer(void)
 
 	apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
 
-	irq_desc[0].chip = &lapic_irq_type;
+	lapic_register_intr(0);
 	apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);	/* Fixed mode */
 	enable_8259A_irq(0);
 
-- 
cgit v1.2.3


From af174783b9251f0afd4bb78927221bcaaa65d3ac Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Fri, 11 Jul 2008 19:35:23 +0100
Subject: x86: I/O APIC: Never configure IRQ2

There is no such entity as ISA IRQ2.  The ACPI spec does not make it
explicitly clear, but does not preclude it either -- all it says is ISA
legacy interrupts are identity mapped by default (subject to overrides),
but it does not state whether IRQ2 exists or not.  As a result if there is
no IRQ0 override, then IRQ2 is normally initialised as an ISA interrupt,
which implies an edge-triggered line, which is unmasked by default as this
is what we do for edge-triggered I/O APIC interrupts so as not to miss an
edge.

To the best of my knowledge it is useless, as IRQ2 has not been in use
since the PC/AT as back then it was taken by the 8259A cascade interrupt
to the slave, with the line position in the slot rerouted to newly-created
IRQ9.  No device could thus make use of this line with the pair of 8259A
chips.  Now in theory INTIN2 of the I/O APIC may be usable, but the
interrupt of the device wired to it would not be available in the PIC mode
at all, so I seriously doubt if anybody decided to reuse it for a regular
device.

However there are two common uses of INTIN2.  One is for IRQ0, with an
ACPI interrupt override (or its equivalent in the MP table).  But in this
case IRQ2 is gone entirely with INTIN0 left vacant.  The other one is for
an 8959A ExtINTA cascade.  In this case IRQ0 goes to INTIN0 and if ACPI is
used INTIN2 is assumed to be IRQ2 (there is no override and ACPI has no
way to report ExtINTA interrupts).  This is where a problem happens.

The problem is INTIN2 is configured as a native APIC interrupt, with a
vector assigned and the mask cleared.  And the line may indeed get active
and inject interrupts if the master 8959A has its timer interrupt enabled
(it might happen for other interrupts too, but they are normally masked in
the process of rerouting them to the I/O APIC).  There are two cases where
it will happen:

* When the I/O APIC NMI watchdog is enabled.  This is actually a misnomer
  as the watchdog pulses are delivered through the 8259A to the LINT0
  inputs of all the local APICs in the system.  The implication is the
  output of the master 8259A goes high and low repeatedly, signalling
  interrupts to INTIN2 which is enabled too!

  [The origin of the name is I think for a brief period during the
  development we had a capability in our code to configure the watchdog to
  use an I/O APIC input; that would be INTIN2 in this scenario.]

* When the native route of IRQ0 via INTIN0 fails for whatever reason -- as
  it happens with the system considered here.  In this scenario the timer
  pulse is delivered through the 8259A to LINT0 input of the local APIC of
  the bootstrap processor, quite similarly to how is done for the watchdog
  described above.  The result is, again, INTIN2 receives these pulses
  too.  Rafael's system used to escape this scenario, because an incorrect
  IRQ0 override would occupy INTIN2 and prevent it from being unmasked.

My conclusion is IRQ2 should be excluded from configuration in all the
cases and the current exception for ACPI systems should be lifted.  The
reason being the exception not only being useless, but harmful as well.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Matthew Garrett <mjg59@srcf.ucam.org>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_32.c | 25 ++++++++++++++++---------
 arch/x86/kernel/io_apic_64.c | 25 ++++++++++++++++---------
 2 files changed, 32 insertions(+), 18 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 74d49e04de58..c50adb84ea6f 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -2297,11 +2297,21 @@ out:
 }
 
 /*
- *
- * IRQ's that are handled by the PIC in the MPS IOAPIC case.
- * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
- *   Linux doesn't really care, as it's not actually used
- *   for any interrupt handling anyway.
+ * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
+ * to devices.  However there may be an I/O APIC pin available for
+ * this interrupt regardless.  The pin may be left unconnected, but
+ * typically it will be reused as an ExtINT cascade interrupt for
+ * the master 8259A.  In the MPS case such a pin will normally be
+ * reported as an ExtINT interrupt in the MP table.  With ACPI
+ * there is no provision for ExtINT interrupts, and in the absence
+ * of an override it would be treated as an ordinary ISA I/O APIC
+ * interrupt, that is edge-triggered and unmasked by default.  We
+ * used to do this, but it caused problems on some systems because
+ * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
+ * the same ExtINT cascade interrupt to drive the local APIC of the
+ * bootstrap processor.  Therefore we refrain from routing IRQ2 to
+ * the I/O APIC in all cases now.  No actual device should request
+ * it anyway.  --macro
  */
 #define PIC_IRQS	(1 << PIC_CASCADE_IR)
 
@@ -2315,10 +2325,7 @@ void __init setup_IO_APIC(void)
 
 	enable_IO_APIC();
 
-	if (acpi_ioapic)
-		io_apic_irqs = ~0;	/* all IRQs go through IOAPIC */
-	else
-		io_apic_irqs = ~PIC_IRQS;
+	io_apic_irqs = ~PIC_IRQS;
 
 	printk("ENABLING IO-APIC IRQs\n");
 
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 07ebcd305fb7..9e645cba11c4 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -1823,11 +1823,21 @@ static int __init notimercheck(char *s)
 __setup("no_timer_check", notimercheck);
 
 /*
- *
- * IRQs that are handled by the PIC in the MPS IOAPIC case.
- * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
- *   Linux doesn't really care, as it's not actually used
- *   for any interrupt handling anyway.
+ * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
+ * to devices.  However there may be an I/O APIC pin available for
+ * this interrupt regardless.  The pin may be left unconnected, but
+ * typically it will be reused as an ExtINT cascade interrupt for
+ * the master 8259A.  In the MPS case such a pin will normally be
+ * reported as an ExtINT interrupt in the MP table.  With ACPI
+ * there is no provision for ExtINT interrupts, and in the absence
+ * of an override it would be treated as an ordinary ISA I/O APIC
+ * interrupt, that is edge-triggered and unmasked by default.  We
+ * used to do this, but it caused problems on some systems because
+ * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
+ * the same ExtINT cascade interrupt to drive the local APIC of the
+ * bootstrap processor.  Therefore we refrain from routing IRQ2 to
+ * the I/O APIC in all cases now.  No actual device should request
+ * it anyway.  --macro
  */
 #define PIC_IRQS	(1<<2)
 
@@ -1838,10 +1848,7 @@ void __init setup_IO_APIC(void)
 	 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
 	 */
 
-	if (acpi_ioapic)
-		io_apic_irqs = ~0;	/* all IRQs go through IOAPIC */
-	else
-		io_apic_irqs = ~PIC_IRQS;
+	io_apic_irqs = ~PIC_IRQS;
 
 	apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
 
-- 
cgit v1.2.3


From 5b4d2386c23e5de553fce002892c7691a989b350 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Fri, 11 Jul 2008 19:47:15 +0100
Subject: x86: Recover timer_ack lost in the merge of the NMI watchdog

In the course of the recent unification of the NMI watchdog an assignment
to timer_ack to switch off unnecesary POLL commands to the 8259A in the
case of a watchdog failure has been accidentally removed.  The statement
used to be limited to the 32-bit variation as since the rewrite of the
timer code it has been relevant for the 82489DX only.  This change brings
it back.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/nmi.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 8dfe9db87a9e..716b89284be0 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -171,6 +171,9 @@ int __init check_nmi_watchdog(void)
 error:
 	if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259)
 		disable_8259A_irq(0);
+#ifdef CONFIG_X86_32
+	timer_ack = 0;
+#endif
 	return -1;
 }
 
-- 
cgit v1.2.3


From da1f29f5dfcc8641f9ff6f3deaa9f03b57e63229 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sat, 12 Jul 2008 02:50:15 +0200
Subject: x86: remove conflicting nx6325 and nx6125 quirks

We have two conflicting DMA-based quirks in there for the same set of
boxes (HP nx6325 and nx6125) and one of them actually breaks my box.

So remove the extra code.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: =?iso-8859-1?q?T=F6r=F6k_Edwin?= <edwintorok@gmail.com>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/boot.c | 47 ---------------------------------------------
 1 file changed, 47 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 9c981c4a3644..785700a08e9d 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -84,8 +84,6 @@ int acpi_lapic;
 int acpi_ioapic;
 int acpi_strict;
 
-static int disable_irq0_through_ioapic __initdata;
-
 u8 acpi_sci_flags __initdata;
 int acpi_sci_override_gsi __initdata;
 int acpi_skip_timer_override __initdata;
@@ -982,10 +980,6 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 	int pin;
 	struct mp_config_intsrc mp_irq;
 
-	/* Skip the 8254 timer interrupt (IRQ 0) if requested.  */
-	if (bus_irq == 0 && disable_irq0_through_ioapic)
-		return;
-
 	/*
 	 * Convert 'gsi' to 'ioapic.pin'.
 	 */
@@ -1052,10 +1046,6 @@ void __init mp_config_acpi_legacy_irqs(void)
 	for (i = 0; i < 16; i++) {
 		int idx;
 
-		/* Skip the 8254 timer interrupt (IRQ 0) if requested.  */
-		if (i == 0 && disable_irq0_through_ioapic)
-			continue;
-
 		for (idx = 0; idx < mp_irq_entries; idx++) {
 			struct mp_config_intsrc *irq = mp_irqs + idx;
 
@@ -1412,17 +1402,6 @@ static int __init force_acpi_ht(const struct dmi_system_id *d)
 	return 0;
 }
 
-/*
- * Don't register any I/O APIC entries for the 8254 timer IRQ.
- */
-static int __init
-dmi_disable_irq0_through_ioapic(const struct dmi_system_id *d)
-{
-	pr_notice("%s detected: disabling IRQ 0 through I/O APIC\n", d->ident);
-	disable_irq0_through_ioapic = 1;
-	return 0;
-}
-
 /*
  * Force ignoring BIOS IRQ0 pin2 override
  */
@@ -1601,32 +1580,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
 		     DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
 		     },
 	 },
-	/*
-	 * HP laptops which use a DSDT reporting as HP/SB400/10000,
-	 * which includes some code which overrides all temperature
-	 * trip points to 16C if the INTIN2 input of the I/O APIC
-	 * is enabled.  This input is incorrectly designated the
-	 * ISA IRQ 0 via an interrupt source override even though
-	 * it is wired to the output of the master 8259A and INTIN0
-	 * is not connected at all.  Abandon any attempts to route
-	 * IRQ 0 through the I/O APIC therefore.
-	 */
-	{
-	 .callback = dmi_disable_irq0_through_ioapic,
-	 .ident = "HP NX6125 laptop",
-	 .matches = {
-		     DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
-		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6125"),
-		     },
-	 },
-	{
-	 .callback = dmi_disable_irq0_through_ioapic,
-	 .ident = "HP NX6325 laptop",
-	 .matches = {
-		     DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
-		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
-		     },
-	 },
 	/*
 	 * HP laptops which use a DSDT reporting as HP/SB400/10000,
 	 * which includes some code which overrides all temperature
-- 
cgit v1.2.3


From eca91e7838ec92e8c12122849ec7539c4765b689 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Thu, 10 Jul 2008 14:50:39 -0700
Subject: x86_64: fix delayed signals

On three of the several paths in entry_64.S that call
do_notify_resume() on the way back to user mode, we fail to properly
check again for newly-arrived work that requires another call to
do_notify_resume() before going to user mode.  These paths set the
mask to check only _TIF_NEED_RESCHED, but this is wrong.  The other
paths that lead to do_notify_resume() do this correctly already, and
entry_32.S does it correctly in all cases.

All paths back to user mode have to check all the _TIF_WORK_MASK
flags at the last possible stage, with interrupts disabled.
Otherwise, we miss any flags (TIF_SIGPENDING for example) that were
set any time after we entered do_notify_resume().  More work flags
can be set (or left set) synchronously inside do_notify_resume(), as
TIF_SIGPENDING can be, or asynchronously by interrupts or other CPUs
(which then send an asynchronous interrupt).

There are many different scenarios that could hit this bug, most of
them races.  The simplest one to demonstrate does not require any
race: when one signal has done handler setup at the check before
returning from a syscall, and there is another signal pending that
should be handled.  The second signal's handler should interrupt the
first signal handler before it actually starts (so the interrupted PC
is still at the handler's entry point).  Instead, it runs away until
the next kernel entry (next syscall, tick, etc).

This test behaves correctly on 32-bit kernels, and fails on 64-bit
(either 32-bit or 64-bit test binary).  With this fix, it works.

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <signal.h>
    #include <string.h>
    #include <sys/ucontext.h>

    #ifndef REG_RIP
    #define REG_RIP REG_EIP
    #endif

    static sig_atomic_t hit1, hit2;

    static void
    handler (int sig, siginfo_t *info, void *ctx)
    {
      ucontext_t *uc = ctx;

      if ((void *) uc->uc_mcontext.gregs[REG_RIP] == &handler)
        {
          if (sig == SIGUSR1)
            hit1 = 1;
          else
            hit2 = 1;
        }

      printf ("%s at %#lx\n", strsignal (sig),
              uc->uc_mcontext.gregs[REG_RIP]);
    }

    int
    main (void)
    {
      struct sigaction sa;
      sigset_t set;

      sigemptyset (&sa.sa_mask);
      sa.sa_flags = SA_SIGINFO;
      sa.sa_sigaction = &handler;

      if (sigaction (SIGUSR1, &sa, NULL)
          || sigaction (SIGUSR2, &sa, NULL))
        return 2;

      sigemptyset (&set);
      sigaddset (&set, SIGUSR1);
      sigaddset (&set, SIGUSR2);
      if (sigprocmask (SIG_BLOCK, &set, NULL))
        return 3;

      printf ("main at %p, handler at %p\n", &main, &handler);

      raise (SIGUSR1);
      raise (SIGUSR2);

      if (sigprocmask (SIG_UNBLOCK, &set, NULL))
        return 4;

      if (hit1 + hit2 == 1)
        {
          puts ("PASS");
          return 0;
        }

      puts ("FAIL");
      return 1;
    }

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_64.S | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 466b9284ed2f..bb4e22f4892f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -306,7 +306,7 @@ sysret_signal:
 	leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
 	xorl %esi,%esi # oldset -> arg2
 	call ptregscall_common
-1:	movl $_TIF_NEED_RESCHED,%edi
+1:	movl $_TIF_WORK_MASK,%edi
 	/* Use IRET because user could have changed frame. This
 	   works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
 	DISABLE_INTERRUPTS(CLBR_NONE)
@@ -394,7 +394,7 @@ int_signal:
 	movq %rsp,%rdi		# &ptregs -> arg1
 	xorl %esi,%esi		# oldset -> arg2
 	call do_notify_resume
-1:	movl $_TIF_NEED_RESCHED,%edi	
+1:	movl $_TIF_WORK_MASK,%edi
 int_restore_rest:
 	RESTORE_REST
 	DISABLE_INTERRUPTS(CLBR_NONE)
@@ -647,9 +647,8 @@ retint_signal:
 	RESTORE_REST
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	movl $_TIF_NEED_RESCHED,%edi
 	GET_THREAD_INFO(%rcx)
-	jmp retint_check
+	jmp retint_with_reschedule
 
 #ifdef CONFIG_PREEMPT
 	/* Returning to kernel space. Check if we need preemption */
-- 
cgit v1.2.3


From 7ab073b6e0cde1544f4e79fadb75532528af7595 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sat, 12 Jul 2008 14:30:35 -0700
Subject: x86: max_low_pfn_mapped fix, #1

fix crash on Ingo's big box:

calling  pci_iommu_init+0x0/0x17
PCI-DMA: Disabling AGP.
PCI-DMA: aperture base @ d0000000 size 65536 KB
PCI-DMA: using GART IOMMU.
PCI-DMA: Reserving 64MB of IOMMU area in the AGP aperture
BUG: unable to handle kernel paging request at ffff88000003be88
IP: [<ffffffff8026d377>] __alloc_pages_internal+0xc3/0x3f2
PGD 202063 PUD 206063 PMD 22fc00163 PTE 3b162
Oops: 0000 [1] SMP

and e820 is:

 BIOS-e820: 0000000000000000 - 000000000009ac00 (usable)
 BIOS-e820: 000000000009ac00 - 00000000000a0000 (reserved)
 BIOS-e820: 00000000000ca000 - 0000000000100000 (reserved)
 BIOS-e820: 0000000000100000 - 000000007ff70000 (usable)
 BIOS-e820: 000000007ff70000 - 000000007ff86000 (ACPI data)
 BIOS-e820: 000000007ff86000 - 0000000080000000 (ACPI NVS)
 BIOS-e820: 0000000080000000 - 00000000cfe00000 (usable)
 BIOS-e820: 00000000cfe00000 - 00000000d0000000 (reserved)
 BIOS-e820: 00000000e0000000 - 00000000f0000000 (reserved)
 BIOS-e820: 00000000fec00000 - 00000000fec10000 (reserved)
 BIOS-e820: 00000000fee00000 - 00000000fee01000 (reserved)
 BIOS-e820: 00000000fff80000 - 0000000100000000 (reserved)
 BIOS-e820: 0000000100000000 - 0000000830000000 (usable)

system has 32 GB RAM installed.

max_low_pfn_mapped is 0xcfe00, and GART aperture is not mapped.

So try to use init_memory_mapping to map that area, because the iommu
thinks that area is ram ...

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/pci-gart_64.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index d0d18db5d2a4..a614ee10f846 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -630,6 +630,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 	struct pci_dev *dev;
 	void *gatt;
 	int i, error;
+	unsigned long start_pfn, end_pfn;
 
 	printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
 	aper_size = aper_base = info->aper_size = 0;
@@ -674,6 +675,16 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 
 	printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
 	       aper_base, aper_size>>10);
+
+	/* need to map that range */
+	end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
+	if (end_pfn > max_low_pfn_mapped) {
+		start_pfn = max_low_pfn_mapped;
+		max_low_pfn_mapped = init_memory_mapping(start_pfn<<PAGE_SHIFT,
+							 end_pfn<<PAGE_SHIFT);
+		if (max_pfn_mapped < max_low_pfn_mapped)
+			max_pfn_mapped = max_low_pfn_mapped;
+	}
 	return 0;
 
  nommu:
-- 
cgit v1.2.3


From 965194c15dc9e4f3bc44432b39c441c86af7f11d Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sat, 12 Jul 2008 14:31:28 -0700
Subject: x86: max_low_pfn_mapped fix, #2

tighten the boundary checks around max_low_pfn_mapped - dont overmap
nor undermap into holes.

also print out tseg for AMD cpus, for diagnostic purposes.
(this is an SMM area, and we split up any big mappings around that area)

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/amd_64.c | 1 +
 arch/x86/mm/pageattr.c       | 4 ++--
 arch/x86/mm/pat.c            | 4 ++--
 arch/x86/pci/i386.c          | 4 ++--
 4 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
index bd182b7616ee..7c36fb8a28d4 100644
--- a/arch/x86/kernel/cpu/amd_64.c
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -200,6 +200,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 		 * benefit in doing so.
 		 */
 		if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
+		    printk(KERN_DEBUG "tseg: %010llx\n", tseg);
 		    if ((tseg>>PMD_SHIFT) <
 				(max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
 			((tseg>>PMD_SHIFT) <
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 0389cb8f6b1a..fb6f2ab40dda 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -658,11 +658,11 @@ static int cpa_process_alias(struct cpa_data *cpa)
 	struct cpa_data alias_cpa;
 	int ret = 0;
 
-	if (cpa->pfn > max_pfn_mapped)
+	if (cpa->pfn >= max_pfn_mapped)
 		return 0;
 
 #ifdef CONFIG_X86_64
-	if (cpa->pfn > max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
+	if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
 		return 0;
 #endif
 	/*
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 749766c3c5cd..d4585077977a 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -449,8 +449,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 	if (retval < 0)
 		return 0;
 
-	if (((pfn <= max_low_pfn_mapped) ||
-	     (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn <= max_pfn_mapped)) &&
+	if (((pfn < max_low_pfn_mapped) ||
+	     (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) &&
 	    ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
 		free_memtype(offset, offset + size);
 		printk(KERN_INFO
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 5281e343dd9f..2aafb67dc5f1 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -334,9 +334,9 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 		flags = new_flags;
 	}
 
-	if (((vma->vm_pgoff <= max_low_pfn_mapped) ||
+	if (((vma->vm_pgoff < max_low_pfn_mapped) ||
 	     (vma->vm_pgoff >= (1UL<<(32 - PAGE_SHIFT)) &&
-	      vma->vm_pgoff <= max_pfn_mapped)) &&
+	      vma->vm_pgoff < max_pfn_mapped)) &&
 	    ioremap_change_attr((unsigned long)__va(addr), len, flags)) {
 		free_memtype(addr, addr + len);
 		return -EINVAL;
-- 
cgit v1.2.3


From 7b479becdb8c1fb4ff6fbb2a4076c471c737b54c Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sat, 12 Jul 2008 22:57:07 -0700
Subject: x86, e820: remove end_user_pfn

end_user_pfn used to modify the meaning of the e820 maps.

Now that all e820 operations are cleaned up, unified, tightened up,
the e820 map always get updated to reality, we don't need to keep
this secondary mechanism anymore.

If you hit this commit in bisection it means something slipped through.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index a5383ae2cbe3..28c29180b380 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1048,11 +1048,6 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
 # define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
 #endif
 
-/*
- * Last pfn which the user wants to use.
- */
-unsigned long __initdata end_user_pfn = MAX_ARCH_PFN;
-
 /*
  * Find the highest page frame number we have available
  */
@@ -1085,8 +1080,6 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
 
 	if (last_pfn > max_arch_pfn)
 		last_pfn = max_arch_pfn;
-	if (last_pfn > end_user_pfn)
-		last_pfn = end_user_pfn;
 
 	printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
 			 last_pfn, max_arch_pfn);
@@ -1131,12 +1124,6 @@ int __init e820_find_active_region(const struct e820entry *ei,
 	if (*ei_endpfn > last_pfn)
 		*ei_endpfn = last_pfn;
 
-	/* Obey end_user_pfn to save on memmap */
-	if (*ei_startpfn >= end_user_pfn)
-		return 0;
-	if (*ei_endpfn > end_user_pfn)
-		*ei_endpfn = end_user_pfn;
-
 	return 1;
 }
 
@@ -1201,7 +1188,6 @@ static int __init parse_memopt(char *p)
 
 	userdef = 1;
 	mem_size = memparse(p, &p);
-	end_user_pfn = mem_size>>PAGE_SHIFT;
 	e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
 
 	return 0;
@@ -1245,10 +1231,9 @@ static int __init parse_memmap_opt(char *p)
 	} else if (*p == '$') {
 		start_at = memparse(p+1, &p);
 		e820_add_region(start_at, mem_size, E820_RESERVED);
-	} else {
-		end_user_pfn = (mem_size >> PAGE_SHIFT);
+	} else
 		e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
-	}
+
 	return *p == '\0' ? 0 : -EINVAL;
 }
 early_param("memmap", parse_memmap_opt);
-- 
cgit v1.2.3


From 3d88cca7085cffce077f808f36551e9050eb9e3a Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sat, 12 Jul 2008 22:52:55 -0700
Subject: x86: fix numaq_tsc_disable calling

got this on a test-system:

 calling  numaq_tsc_disable+0x0/0x39
 NUMAQ: disabling TSC
 initcall numaq_tsc_disable+0x0/0x39 returned 0 after 0 msecs

that's because we should not be using arch_initcall to call numaq_tsc_disable.

need to call it in setup_arch before time_init()/tsc_init()
and call it in init_intel() to make the cpu feature bits right.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/intel.c | 4 ++++
 arch/x86/kernel/numaq_32.c  | 7 ++++---
 arch/x86/kernel/setup.c     | 8 ++++++++
 include/asm-x86/numaq.h     | 2 ++
 4 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index fe9224c51d37..70609efdf1da 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -226,6 +226,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 
 	if (cpu_has_bts)
 		ds_init_intel(c);
+
+#ifdef CONFIG_X86_NUMAQ
+	numaq_tsc_disable();
+#endif
 }
 
 static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index f0f1de1c4a1d..5b20a5e7ac28 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -93,12 +93,13 @@ int __init get_memcfg_numaq(void)
 	return 1;
 }
 
-static int __init numaq_tsc_disable(void)
+void __init numaq_tsc_disable(void)
 {
+	if (!found_numaq)
+		return -1;
+
 	if (num_online_nodes() > 1) {
 		printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
 		setup_clear_cpu_cap(X86_FEATURE_TSC);
 	}
-	return 0;
 }
-arch_initcall(numaq_tsc_disable);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 987b6fde3a99..36c540d4ac4b 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -849,6 +849,14 @@ void __init setup_arch(char **cmdline_p)
 	init_cpu_to_node();
 #endif
 
+#ifdef CONFIG_X86_NUMAQ
+	/*
+	 * need to check online nodes num, call it
+	 * here before time_init/tsc_init
+	 */
+	numaq_tsc_disable();
+#endif
+
 	init_apic_mappings();
 	ioapic_init_mappings();
 
diff --git a/include/asm-x86/numaq.h b/include/asm-x86/numaq.h
index ef068d2465d6..34b92d581fa3 100644
--- a/include/asm-x86/numaq.h
+++ b/include/asm-x86/numaq.h
@@ -157,6 +157,8 @@ struct sys_cfg_data {
 	struct		eachquadmem eq[MAX_NUMNODES];	/* indexed by quad id */
 };
 
+void numaq_tsc_disable(void);
+
 #else
 static inline int get_memcfg_numaq(void)
 {
-- 
cgit v1.2.3


From ce8b06b985ae48f9425de6e4641e77cb3613ef00 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Sun, 13 Jul 2008 03:29:42 +0100
Subject: x86: I/O APIC: remove an IRQ2-mask hack

Now that IRQ2 is never made available to the I/O APIC, there is no need
to special-case it and mask as a workaround for broken systems.  Actually,
because of the former, mask_IO_APIC_irq(2) is a no-op already.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Matthew Garrett <mjg59@srcf.ucam.org>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/boot.c  |  1 -
 arch/x86/kernel/io_apic_32.c | 10 ----------
 arch/x86/kernel/io_apic_64.c | 10 ----------
 include/asm-x86/genapic_32.h |  5 -----
 include/asm-x86/genapic_64.h |  6 ------
 5 files changed, 32 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 785700a08e9d..f489d7a9be92 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1409,7 +1409,6 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
 {
 	pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident);
 	acpi_skip_timer_override = 1;
-	force_mask_ioapic_irq_2();
 	return 0;
 }
 
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index c50adb84ea6f..603261a5885c 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -59,13 +59,6 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 static DEFINE_SPINLOCK(ioapic_lock);
 static DEFINE_SPINLOCK(vector_lock);
 
-static bool mask_ioapic_irq_2 __initdata;
-
-void __init force_mask_ioapic_irq_2(void)
-{
-	mask_ioapic_irq_2 = true;
-}
-
 int timer_through_8259 __initdata;
 
 /*
@@ -2187,9 +2180,6 @@ static inline void __init check_timer(void)
 	printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
 		vector, apic1, pin1, apic2, pin2);
 
-	if (mask_ioapic_irq_2)
-		mask_IO_APIC_irq(2);
-
 	/*
 	 * Some BIOS writers are clueless and report the ExtINTA
 	 * I/O APIC input from the cascaded 8259A as the timer
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 9e645cba11c4..b16ef029cf88 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -94,13 +94,6 @@ static int no_timer_check;
 
 static int disable_timer_pin_1 __initdata;
 
-static bool mask_ioapic_irq_2 __initdata;
-
-void __init force_mask_ioapic_irq_2(void)
-{
-	mask_ioapic_irq_2 = true;
-}
-
 int timer_through_8259 __initdata;
 
 /* Where if anywhere is the i8259 connect in external int mode */
@@ -1706,9 +1699,6 @@ static inline void __init check_timer(void)
 	apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
 		cfg->vector, apic1, pin1, apic2, pin2);
 
-	if (mask_ioapic_irq_2)
-		mask_IO_APIC_irq(2);
-
 	/*
 	 * Some BIOS writers are clueless and report the ExtINTA
 	 * I/O APIC input from the cascaded 8259A as the timer
diff --git a/include/asm-x86/genapic_32.h b/include/asm-x86/genapic_32.h
index 33a73f5ed222..b02ea6e17de8 100644
--- a/include/asm-x86/genapic_32.h
+++ b/include/asm-x86/genapic_32.h
@@ -119,10 +119,5 @@ enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
 #define is_uv_system()			0
 #define uv_wakeup_secondary(a, b)	1
 
-#ifdef CONFIG_X86_IO_APIC
-extern void force_mask_ioapic_irq_2(void);
-#else
-static inline void force_mask_ioapic_irq_2(void) { }
-#endif
 
 #endif
diff --git a/include/asm-x86/genapic_64.h b/include/asm-x86/genapic_64.h
index 647e4e5c2580..0f8504627c41 100644
--- a/include/asm-x86/genapic_64.h
+++ b/include/asm-x86/genapic_64.h
@@ -46,10 +46,4 @@ extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip);
 
 extern void setup_apic_routing(void);
 
-#ifdef CONFIG_X86_IO_APIC
-extern void force_mask_ioapic_irq_2(void);
-#else
-static inline void force_mask_ioapic_irq_2(void) { }
-#endif
-
 #endif
-- 
cgit v1.2.3


From 11369f356b66d363a615fde2c5526962f7683674 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Tue, 8 Jul 2008 14:35:21 -0700
Subject: x86: change _node_to_cpumask_ptr to return const ptr

  * Strengthen the return type for the _node_to_cpumask_ptr to be
    a const pointer.  This adds compiler checking to insure that
    node_to_cpumask_map[] is not changed inadvertently.

Signed-off-by: Mike Travis <travis@sgi.com>
Cc: "akpm@linux-foundation.org" <akpm@linux-foundation.org>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Acked-by: Vegard Nossum <vegard.nossum@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_percpu.c |  8 ++++----
 include/asm-generic/topology.h |  3 ++-
 include/asm-x86/topology.h     | 10 +++++-----
 3 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 5fc310f746fc..cac68430d31f 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -343,23 +343,23 @@ static const cpumask_t cpu_mask_none;
 /*
  * Returns a pointer to the bitmask of CPUs on Node 'node'.
  */
-cpumask_t *_node_to_cpumask_ptr(int node)
+const cpumask_t *_node_to_cpumask_ptr(int node)
 {
 	if (node_to_cpumask_map == NULL) {
 		printk(KERN_WARNING
 			"_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n",
 			node);
 		dump_stack();
-		return &cpu_online_map;
+		return (const cpumask_t *)&cpu_online_map;
 	}
 	if (node >= nr_node_ids) {
 		printk(KERN_WARNING
 			"_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n",
 			node, nr_node_ids);
 		dump_stack();
-		return (cpumask_t *)&cpu_mask_none;
+		return &cpu_mask_none;
 	}
-	return (cpumask_t *)&node_to_cpumask_map[node];
+	return &node_to_cpumask_map[node];
 }
 EXPORT_SYMBOL(_node_to_cpumask_ptr);
 
diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h
index a6aea79bca4f..54bbf6e04ee8 100644
--- a/include/asm-generic/topology.h
+++ b/include/asm-generic/topology.h
@@ -60,7 +60,8 @@
 #ifndef node_to_cpumask_ptr
 
 #define	node_to_cpumask_ptr(v, node) 					\
-		cpumask_t _##v = node_to_cpumask(node), *v = &_##v
+		cpumask_t _##v = node_to_cpumask(node);			\
+		const cpumask_t *v = &_##v
 
 #define node_to_cpumask_ptr_next(v, node)				\
 			  _##v = node_to_cpumask(node)
diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h
index 98e5f17ea856..90ac7718469a 100644
--- a/include/asm-x86/topology.h
+++ b/include/asm-x86/topology.h
@@ -82,7 +82,7 @@ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
 extern int cpu_to_node(int cpu);
 extern int early_cpu_to_node(int cpu);
-extern cpumask_t *_node_to_cpumask_ptr(int node);
+extern const cpumask_t *_node_to_cpumask_ptr(int node);
 extern cpumask_t node_to_cpumask(int node);
 
 #else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
@@ -103,7 +103,7 @@ static inline int early_cpu_to_node(int cpu)
 }
 
 /* Returns a pointer to the cpumask of CPUs on Node 'node'. */
-static inline cpumask_t *_node_to_cpumask_ptr(int node)
+static inline const cpumask_t *_node_to_cpumask_ptr(int node)
 {
 	return &node_to_cpumask_map[node];
 }
@@ -118,7 +118,7 @@ static inline cpumask_t node_to_cpumask(int node)
 
 /* Replace default node_to_cpumask_ptr with optimized version */
 #define node_to_cpumask_ptr(v, node)		\
-		cpumask_t *v = _node_to_cpumask_ptr(node)
+		const cpumask_t *v = _node_to_cpumask_ptr(node)
 
 #define node_to_cpumask_ptr_next(v, node)	\
 			   v = _node_to_cpumask_ptr(node)
@@ -186,7 +186,7 @@ extern int __node_distance(int, int);
 #define	cpu_to_node(cpu)	0
 #define	early_cpu_to_node(cpu)	0
 
-static inline cpumask_t *_node_to_cpumask_ptr(int node)
+static inline const cpumask_t *_node_to_cpumask_ptr(int node)
 {
 	return &cpu_online_map;
 }
@@ -201,7 +201,7 @@ static inline int node_to_first_cpu(int node)
 
 /* Replace default node_to_cpumask_ptr with optimized version */
 #define node_to_cpumask_ptr(v, node)		\
-		cpumask_t *v = _node_to_cpumask_ptr(node)
+		const cpumask_t *v = _node_to_cpumask_ptr(node)
 
 #define node_to_cpumask_ptr_next(v, node)	\
 			   v = _node_to_cpumask_ptr(node)
-- 
cgit v1.2.3


From 32b23e9a7331fce57eb0af52e19e8409fdef831b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 13 Jul 2008 14:29:41 -0700
Subject: x86: max_low_pfn_mapped fix #4

only add direct mapping for aperture

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/pci-gart_64.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index a614ee10f846..c3fe78406d18 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -679,11 +679,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 	/* need to map that range */
 	end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
 	if (end_pfn > max_low_pfn_mapped) {
-		start_pfn = max_low_pfn_mapped;
-		max_low_pfn_mapped = init_memory_mapping(start_pfn<<PAGE_SHIFT,
-							 end_pfn<<PAGE_SHIFT);
-		if (max_pfn_mapped < max_low_pfn_mapped)
-			max_pfn_mapped = max_low_pfn_mapped;
+		start_pfn = (aper_base>>PAGE_SHIFT);
+		init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
 	}
 	return 0;
 
-- 
cgit v1.2.3


From 87a1c441e1aeaf00f97e63dfc310ea7684ec9dda Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 13 Jul 2008 14:30:35 -0700
Subject: x86: get x86_phys_bits early

when try to make hpet_enable use io_remap instead fixmap got

ioremap: invalid physical address fed00000
------------[ cut here ]------------
WARNING: at arch/x86/mm/ioremap.c:161 __ioremap_caller+0x8c/0x2f3()
Modules linked in:
Pid: 0, comm: swapper Not tainted 2.6.26-rc9-tip-01873-ga9827e7-dirty #358

Call Trace:
 [<ffffffff8026615e>] warn_on_slowpath+0x6c/0xa7
 [<ffffffff802e2313>] ? __slab_alloc+0x20a/0x3fb
 [<ffffffff802d85c5>] ? mpol_new+0x88/0x17d
 [<ffffffff8022a4f4>] ? mcount_call+0x5/0x31
 [<ffffffff8022a4f4>] ? mcount_call+0x5/0x31
 [<ffffffff8024b0d2>] __ioremap_caller+0x8c/0x2f3
 [<ffffffff80e86dbd>] ? hpet_enable+0x39/0x241
 [<ffffffff8022a4f4>] ? mcount_call+0x5/0x31
 [<ffffffff8024b466>] ioremap_nocache+0x2a/0x40
 [<ffffffff80e86dbd>] hpet_enable+0x39/0x241
 [<ffffffff80e7a1f6>] hpet_time_init+0x21/0x4e
 [<ffffffff80e730e9>] start_kernel+0x302/0x395
 [<ffffffff80e722aa>] x86_64_start_reservations+0xb9/0xd4
 [<ffffffff80e722fe>] ? x86_64_init_pda+0x39/0x4f
 [<ffffffff80e72400>] x86_64_start_kernel+0xec/0x107

---[ end trace a7919e7f17c0a725 ]---

it seems for amd system that is set later...
try to move setting early in early_identify_cpu.
and remove same code for intel and centaur.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/centaur_64.c | 10 ----------
 arch/x86/kernel/cpu/common_64.c  | 14 ++++++++------
 arch/x86/kernel/cpu/intel_64.c   | 10 ----------
 3 files changed, 8 insertions(+), 26 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
index 2026d2119cdb..1d181c40e2e1 100644
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ b/arch/x86/kernel/cpu/centaur_64.c
@@ -16,16 +16,6 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
 
 static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
 {
-	/* Cache sizes */
-	unsigned n;
-
-	n = c->extended_cpuid_level;
-	if (n >= 0x80000008) {
-		unsigned eax = cpuid_eax(0x80000008);
-		c->x86_virt_bits = (eax >> 8) & 0xff;
-		c->x86_phys_bits = eax & 0xff;
-	}
-
 	if (c->x86 == 0x6 && c->x86_model >= 0xf) {
 		c->x86_cache_alignment = c->x86_clflush_size * 2;
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
index 36537ab9e56a..7b8cc72feb40 100644
--- a/arch/x86/kernel/cpu/common_64.c
+++ b/arch/x86/kernel/cpu/common_64.c
@@ -98,7 +98,7 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c)
 
 void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
 {
-	unsigned int n, dummy, eax, ebx, ecx, edx;
+	unsigned int n, dummy, ebx, ecx, edx;
 
 	n = c->extended_cpuid_level;
 
@@ -121,11 +121,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
 		printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
 		c->x86_cache_size, ecx & 0xFF);
 	}
-	if (n >= 0x80000008) {
-		cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
-		c->x86_virt_bits = (eax >> 8) & 0xff;
-		c->x86_phys_bits = eax & 0xff;
-	}
 }
 
 void __cpuinit detect_ht(struct cpuinfo_x86 *c)
@@ -314,6 +309,13 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
 	if (c->extended_cpuid_level >= 0x80000007)
 		c->x86_power = cpuid_edx(0x80000007);
 
+	if (c->extended_cpuid_level >= 0x80000008) {
+		u32 eax = cpuid_eax(0x80000008);
+
+		c->x86_virt_bits = (eax >> 8) & 0xff;
+		c->x86_phys_bits = eax & 0xff;
+	}
+
 	/* Assume all 64-bit CPUs support 32-bit syscall */
 	set_cpu_cap(c, X86_FEATURE_SYSCALL32);
 
diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c
index 02f773399e39..1019c58d39f0 100644
--- a/arch/x86/kernel/cpu/intel_64.c
+++ b/arch/x86/kernel/cpu/intel_64.c
@@ -54,9 +54,6 @@ static void __cpuinit srat_detect_node(void)
 
 static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 {
-	/* Cache sizes */
-	unsigned n;
-
 	init_intel_cacheinfo(c);
 	if (c->cpuid_level > 9) {
 		unsigned eax = cpuid_eax(10);
@@ -78,13 +75,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 	if (cpu_has_bts)
 		ds_init_intel(c);
 
-	n = c->extended_cpuid_level;
-	if (n >= 0x80000008) {
-		unsigned eax = cpuid_eax(0x80000008);
-		c->x86_virt_bits = (eax >> 8) & 0xff;
-		c->x86_phys_bits = eax & 0xff;
-	}
-
 	if (c->x86 == 15)
 		c->x86_cache_alignment = c->x86_clflush_size * 2;
 	if (c->x86 == 6)
-- 
cgit v1.2.3


From 2387ce57a8167490d3b34a7e1ffa9a64a1a76244 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 13 Jul 2008 14:50:56 -0700
Subject: x86: make 64bit hpet_set_mapping to use ioremap too, v2

keep the one for VSYSCALL_HPET

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/hpet.c      | 20 ++++----------------
 include/asm-x86/fixmap_64.h |  1 -
 2 files changed, 4 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ea230ec69057..0ea6a19bfdfe 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -36,26 +36,15 @@ static inline void hpet_writel(unsigned long d, unsigned long a)
 }
 
 #ifdef CONFIG_X86_64
-
 #include <asm/pgtable.h>
-
-static inline void hpet_set_mapping(void)
-{
-	set_fixmap_nocache(FIX_HPET_BASE, hpet_address);
-	__set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
-	hpet_virt_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE);
-}
-
-static inline void hpet_clear_mapping(void)
-{
-	hpet_virt_address = NULL;
-}
-
-#else
+#endif
 
 static inline void hpet_set_mapping(void)
 {
 	hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
+#ifdef CONFIG_X86_64
+	__set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
+#endif
 }
 
 static inline void hpet_clear_mapping(void)
@@ -63,7 +52,6 @@ static inline void hpet_clear_mapping(void)
 	iounmap(hpet_virt_address);
 	hpet_virt_address = NULL;
 }
-#endif
 
 /*
  * HPET command line enable / disable
diff --git a/include/asm-x86/fixmap_64.h b/include/asm-x86/fixmap_64.h
index 6a4789d57e6c..00f3d74a0524 100644
--- a/include/asm-x86/fixmap_64.h
+++ b/include/asm-x86/fixmap_64.h
@@ -40,7 +40,6 @@ enum fixed_addresses {
 	VSYSCALL_HPET,
 	FIX_DBGP_BASE,
 	FIX_EARLYCON_MEM_BASE,
-	FIX_HPET_BASE,
 	FIX_APIC_BASE,	/* local (CPU) APIC) -- required for SMP or not */
 	FIX_IO_APIC_BASE_0,
 	FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
-- 
cgit v1.2.3


From 7daf705f362e349983e92037a198b8821db198af Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 14 Jul 2008 12:12:53 -0700
Subject: Start using the new '%pS' infrastructure to print symbols

This simplifies the code significantly, and was the whole point of the
exercise.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/traps_64.c | 25 +------------------------
 mm/slub.c                  |  5 ++---
 2 files changed, 3 insertions(+), 27 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index adff76ea97c4..f1a95d105953 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -104,30 +104,7 @@ int kstack_depth_to_print = 12;
 
 void printk_address(unsigned long address, int reliable)
 {
-#ifdef CONFIG_KALLSYMS
-	unsigned long offset = 0, symsize;
-	const char *symname;
-	char *modname;
-	char *delim = ":";
-	char namebuf[KSYM_NAME_LEN];
-	char reliab[4] = "";
-
-	symname = kallsyms_lookup(address, &symsize, &offset,
-					&modname, namebuf);
-	if (!symname) {
-		printk(" [<%016lx>]\n", address);
-		return;
-	}
-	if (!reliable)
-		strcpy(reliab, "? ");
-
-	if (!modname)
-		modname = delim = "";
-	printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
-		address, reliab, delim, modname, delim, symname, offset, symsize);
-#else
-	printk(" [<%016lx>]\n", address);
-#endif
+	printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
 }
 
 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
diff --git a/mm/slub.c b/mm/slub.c
index 315c392253c7..5f6e2c4a2ba7 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -431,9 +431,8 @@ static void print_track(const char *s, struct track *t)
 	if (!t->addr)
 		return;
 
-	printk(KERN_ERR "INFO: %s in ", s);
-	__print_symbol("%s", (unsigned long)t->addr);
-	printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
+	printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
+		s, t->addr, jiffies - t->when, t->cpu, t->pid);
 }
 
 static void print_tracking(struct kmem_cache *s, void *object)
-- 
cgit v1.2.3