summaryrefslogtreecommitdiff
path: root/arch/i386/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/i386/kernel')
-rw-r--r--arch/i386/kernel/Makefile5
-rw-r--r--arch/i386/kernel/acpi/cstate.c6
-rw-r--r--arch/i386/kernel/acpi/earlyquirk.c21
-rw-r--r--arch/i386/kernel/alternative.c63
-rw-r--r--arch/i386/kernel/apic.c22
-rw-r--r--arch/i386/kernel/apm.c3
-rw-r--r--arch/i386/kernel/asm-offsets.c39
-rw-r--r--arch/i386/kernel/cpu/amd.c5
-rw-r--r--arch/i386/kernel/cpu/common.c249
-rw-r--r--arch/i386/kernel/cpu/intel.c12
-rw-r--r--arch/i386/kernel/cpu/intel_cacheinfo.c11
-rw-r--r--arch/i386/kernel/cpu/mcheck/therm_throt.c2
-rw-r--r--arch/i386/kernel/cpu/mtrr/Makefile4
-rw-r--r--arch/i386/kernel/cpu/mtrr/amd.c2
-rw-r--r--arch/i386/kernel/cpu/mtrr/centaur.c9
-rw-r--r--arch/i386/kernel/cpu/mtrr/cyrix.c25
-rw-r--r--arch/i386/kernel/cpu/mtrr/generic.c78
-rw-r--r--arch/i386/kernel/cpu/mtrr/if.c31
-rw-r--r--arch/i386/kernel/cpu/mtrr/main.c71
-rw-r--r--arch/i386/kernel/cpu/mtrr/mtrr.h25
-rw-r--r--arch/i386/kernel/cpu/proc.c3
-rw-r--r--arch/i386/kernel/cpuid.c3
-rw-r--r--arch/i386/kernel/crash.c66
-rw-r--r--arch/i386/kernel/e820.c894
-rw-r--r--arch/i386/kernel/efi.c17
-rw-r--r--arch/i386/kernel/entry.S331
-rw-r--r--arch/i386/kernel/head.S66
-rw-r--r--arch/i386/kernel/hpet.c7
-rw-r--r--arch/i386/kernel/i8259.c5
-rw-r--r--arch/i386/kernel/io_apic.c66
-rw-r--r--arch/i386/kernel/kprobes.c4
-rw-r--r--arch/i386/kernel/ldt.c4
-rw-r--r--arch/i386/kernel/mca.c13
-rw-r--r--arch/i386/kernel/microcode.c2
-rw-r--r--arch/i386/kernel/module.c11
-rw-r--r--arch/i386/kernel/mpparse.c2
-rw-r--r--arch/i386/kernel/msr.c5
-rw-r--r--arch/i386/kernel/nmi.c42
-rw-r--r--arch/i386/kernel/paravirt.c569
-rw-r--r--arch/i386/kernel/pci-dma.c6
-rw-r--r--arch/i386/kernel/process.c81
-rw-r--r--arch/i386/kernel/ptrace.c18
-rw-r--r--arch/i386/kernel/quirks.c46
-rw-r--r--arch/i386/kernel/reboot.c1
-rw-r--r--arch/i386/kernel/setup.c855
-rw-r--r--arch/i386/kernel/signal.c6
-rw-r--r--arch/i386/kernel/smp.c10
-rw-r--r--arch/i386/kernel/smpboot.c69
-rw-r--r--arch/i386/kernel/sysenter.c6
-rw-r--r--arch/i386/kernel/time.c15
-rw-r--r--arch/i386/kernel/time_hpet.c15
-rw-r--r--arch/i386/kernel/topology.c8
-rw-r--r--arch/i386/kernel/traps.c120
-rw-r--r--arch/i386/kernel/tsc.c1
-rw-r--r--arch/i386/kernel/vm86.c121
-rw-r--r--arch/i386/kernel/vmlinux.lds.S147
56 files changed, 2709 insertions, 1609 deletions
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 1a884b6e6e5c..1e8988e558c5 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -6,7 +6,7 @@ extra-y := head.o init_task.o vmlinux.lds
obj-y := process.o signal.o entry.o traps.o irq.o \
ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
- pci-dma.o i386_ksyms.o i387.o bootflag.o \
+ pci-dma.o i386_ksyms.o i387.o bootflag.o e820.o\
quirks.o i8237.o topology.o alternative.o i8253.o tsc.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -40,6 +40,9 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_K8_NB) += k8.o
+# Make sure this is linked after any other paravirt_ops structs: see head.S
+obj-$(CONFIG_PARAVIRT) += paravirt.o
+
EXTRA_AFLAGS := -traditional
obj-$(CONFIG_SCx200) += scx200.o
diff --git a/arch/i386/kernel/acpi/cstate.c b/arch/i386/kernel/acpi/cstate.c
index 4664b55f623e..12e937c1ce4b 100644
--- a/arch/i386/kernel/acpi/cstate.c
+++ b/arch/i386/kernel/acpi/cstate.c
@@ -156,10 +156,8 @@ static int __init ffh_cstate_init(void)
static void __exit ffh_cstate_exit(void)
{
- if (cpu_cstate_entry) {
- free_percpu(cpu_cstate_entry);
- cpu_cstate_entry = NULL;
- }
+ free_percpu(cpu_cstate_entry);
+ cpu_cstate_entry = NULL;
}
arch_initcall(ffh_cstate_init);
diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c
index c9841692bb7c..4b60af7f91dd 100644
--- a/arch/i386/kernel/acpi/earlyquirk.c
+++ b/arch/i386/kernel/acpi/earlyquirk.c
@@ -10,6 +10,7 @@
#include <asm/pci-direct.h>
#include <asm/acpi.h>
#include <asm/apic.h>
+#include <asm/irq.h>
#ifdef CONFIG_ACPI
@@ -49,6 +50,24 @@ static int __init check_bridge(int vendor, int device)
return 0;
}
+static void check_intel(void)
+{
+ u16 vendor, device;
+
+ vendor = read_pci_config_16(0, 0, 0, PCI_VENDOR_ID);
+
+ if (vendor != PCI_VENDOR_ID_INTEL)
+ return;
+
+ device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID);
+#ifdef CONFIG_SMP
+ if (device == PCI_DEVICE_ID_INTEL_E7320_MCH ||
+ device == PCI_DEVICE_ID_INTEL_E7520_MCH ||
+ device == PCI_DEVICE_ID_INTEL_E7525_MCH)
+ quirk_intel_irqbalance();
+#endif
+}
+
void __init check_acpi_pci(void)
{
int num, slot, func;
@@ -60,6 +79,8 @@ void __init check_acpi_pci(void)
if (!early_pci_allowed())
return;
+ check_intel();
+
/* Poor man's PCI discovery */
for (num = 0; num < 32; num++) {
for (slot = 0; slot < 32; slot++) {
diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c
index 535f9794fba1..9eca21b49f6b 100644
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -124,6 +124,20 @@ static unsigned char** find_nop_table(void)
#endif /* CONFIG_X86_64 */
+static void nop_out(void *insns, unsigned int len)
+{
+ unsigned char **noptable = find_nop_table();
+
+ while (len > 0) {
+ unsigned int noplen = len;
+ if (noplen > ASM_NOP_MAX)
+ noplen = ASM_NOP_MAX;
+ memcpy(insns, noptable[noplen], noplen);
+ insns += noplen;
+ len -= noplen;
+ }
+}
+
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[];
extern u8 *__smp_locks[], *__smp_locks_end[];
@@ -138,10 +152,9 @@ extern u8 __smp_alt_begin[], __smp_alt_end[];
void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
{
- unsigned char **noptable = find_nop_table();
struct alt_instr *a;
u8 *instr;
- int diff, i, k;
+ int diff;
DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
for (a = start; a < end; a++) {
@@ -159,13 +172,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
#endif
memcpy(instr, a->replacement, a->replacementlen);
diff = a->instrlen - a->replacementlen;
- /* Pad the rest with nops */
- for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
- k = diff;
- if (k > ASM_NOP_MAX)
- k = ASM_NOP_MAX;
- memcpy(a->instr + i, noptable[k], k);
- }
+ nop_out(instr + a->replacementlen, diff);
}
}
@@ -209,7 +216,6 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
{
- unsigned char **noptable = find_nop_table();
u8 **ptr;
for (ptr = start; ptr < end; ptr++) {
@@ -217,7 +223,7 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end
continue;
if (*ptr > text_end)
continue;
- **ptr = noptable[1][0];
+ nop_out(*ptr, 1);
};
}
@@ -343,6 +349,40 @@ void alternatives_smp_switch(int smp)
#endif
+#ifdef CONFIG_PARAVIRT
+void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
+{
+ struct paravirt_patch *p;
+
+ for (p = start; p < end; p++) {
+ unsigned int used;
+
+ used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr,
+ p->len);
+#ifdef CONFIG_DEBUG_PARAVIRT
+ {
+ int i;
+ /* Deliberately clobber regs using "not %reg" to find bugs. */
+ for (i = 0; i < 3; i++) {
+ if (p->len - used >= 2 && (p->clobbers & (1 << i))) {
+ memcpy(p->instr + used, "\xf7\xd0", 2);
+ p->instr[used+1] |= i;
+ used += 2;
+ }
+ }
+ }
+#endif
+ /* Pad the rest with nops */
+ nop_out(p->instr + used, p->len - used);
+ }
+
+ /* Sync to be conservative, in case we patched following instructions */
+ sync_core();
+}
+extern struct paravirt_patch __start_parainstructions[],
+ __stop_parainstructions[];
+#endif /* CONFIG_PARAVIRT */
+
void __init alternative_instructions(void)
{
unsigned long flags;
@@ -390,5 +430,6 @@ void __init alternative_instructions(void)
alternatives_smp_switch(0);
}
#endif
+ apply_paravirt(__start_parainstructions, __stop_parainstructions);
local_irq_restore(flags);
}
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 2fd4b7d927c2..776d9be26af9 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -647,23 +647,30 @@ static struct {
static int lapic_suspend(struct sys_device *dev, pm_message_t state)
{
unsigned long flags;
+ int maxlvt;
if (!apic_pm_state.active)
return 0;
+ maxlvt = get_maxlvt();
+
apic_pm_state.apic_id = apic_read(APIC_ID);
apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
apic_pm_state.apic_ldr = apic_read(APIC_LDR);
apic_pm_state.apic_dfr = apic_read(APIC_DFR);
apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
- apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
+ if (maxlvt >= 4)
+ apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
- apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
+#ifdef CONFIG_X86_MCE_P4THERMAL
+ if (maxlvt >= 5)
+ apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
+#endif
local_irq_save(flags);
disable_local_APIC();
@@ -675,10 +682,13 @@ static int lapic_resume(struct sys_device *dev)
{
unsigned int l, h;
unsigned long flags;
+ int maxlvt;
if (!apic_pm_state.active)
return 0;
+ maxlvt = get_maxlvt();
+
local_irq_save(flags);
/*
@@ -700,8 +710,12 @@ static int lapic_resume(struct sys_device *dev)
apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
- apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
- apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
+#ifdef CONFIG_X86_MCE_P4THERMAL
+ if (maxlvt >= 5)
+ apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
+#endif
+ if (maxlvt >= 4)
+ apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index a60358fe9a49..a97847da9ed5 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -231,6 +231,7 @@
#include <asm/uaccess.h>
#include <asm/desc.h>
#include <asm/i8253.h>
+#include <asm/paravirt.h>
#include "io_ports.h"
@@ -2235,7 +2236,7 @@ static int __init apm_init(void)
dmi_check_system(apm_dmi_table);
- if (apm_info.bios.version == 0) {
+ if (apm_info.bios.version == 0 || paravirt_enabled()) {
printk(KERN_INFO "apm: BIOS not found.\n");
return -ENODEV;
}
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index c80271f8f084..1b2f3cd33270 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -15,6 +15,7 @@
#include <asm/processor.h>
#include <asm/thread_info.h>
#include <asm/elf.h>
+#include <asm/pda.h>
#define DEFINE(sym, val) \
asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -51,13 +52,35 @@ void foo(void)
OFFSET(TI_exec_domain, thread_info, exec_domain);
OFFSET(TI_flags, thread_info, flags);
OFFSET(TI_status, thread_info, status);
- OFFSET(TI_cpu, thread_info, cpu);
OFFSET(TI_preempt_count, thread_info, preempt_count);
OFFSET(TI_addr_limit, thread_info, addr_limit);
OFFSET(TI_restart_block, thread_info, restart_block);
OFFSET(TI_sysenter_return, thread_info, sysenter_return);
BLANK();
+ OFFSET(GDS_size, Xgt_desc_struct, size);
+ OFFSET(GDS_address, Xgt_desc_struct, address);
+ OFFSET(GDS_pad, Xgt_desc_struct, pad);
+ BLANK();
+
+ OFFSET(PT_EBX, pt_regs, ebx);
+ OFFSET(PT_ECX, pt_regs, ecx);
+ OFFSET(PT_EDX, pt_regs, edx);
+ OFFSET(PT_ESI, pt_regs, esi);
+ OFFSET(PT_EDI, pt_regs, edi);
+ OFFSET(PT_EBP, pt_regs, ebp);
+ OFFSET(PT_EAX, pt_regs, eax);
+ OFFSET(PT_DS, pt_regs, xds);
+ OFFSET(PT_ES, pt_regs, xes);
+ OFFSET(PT_GS, pt_regs, xgs);
+ OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
+ OFFSET(PT_EIP, pt_regs, eip);
+ OFFSET(PT_CS, pt_regs, xcs);
+ OFFSET(PT_EFLAGS, pt_regs, eflags);
+ OFFSET(PT_OLDESP, pt_regs, esp);
+ OFFSET(PT_OLDSS, pt_regs, xss);
+ BLANK();
+
OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
BLANK();
@@ -74,4 +97,18 @@ void foo(void)
DEFINE(VDSO_PRELINK, VDSO_PRELINK);
OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
+
+ BLANK();
+ OFFSET(PDA_cpu, i386_pda, cpu_number);
+ OFFSET(PDA_pcurrent, i386_pda, pcurrent);
+
+#ifdef CONFIG_PARAVIRT
+ BLANK();
+ OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled);
+ OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
+ OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
+ OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
+ OFFSET(PARAVIRT_iret, paravirt_ops, iret);
+ OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
+#endif
}
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index e4758095d87a..41cfea57232b 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -104,10 +104,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
f_vide();
rdtscl(d2);
d = d2-d;
-
- /* Knock these two lines out if it debugs out ok */
- printk(KERN_INFO "AMD K6 stepping B detected - ");
- /* -- cut here -- */
+
if (d > 20*K6_BUG_LOOP)
printk("system stability may be impaired when more than 32 MB are used.\n");
else
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index d9f3e3c31f05..1b34c56f8123 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -18,14 +18,15 @@
#include <asm/apic.h>
#include <mach_apic.h>
#endif
+#include <asm/pda.h>
#include "cpu.h"
DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
-DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
-EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
+struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(_cpu_pda);
static int cachesize_override __cpuinitdata = -1;
static int disable_x86_fxsr __cpuinitdata;
@@ -235,29 +236,14 @@ static int __cpuinit have_cpuid_p(void)
return flag_is_changeable_p(X86_EFLAGS_ID);
}
-/* Do minimum CPU detection early.
- Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
- The others are not touched to avoid unwanted side effects.
-
- WARNING: this function is only called on the BP. Don't add code here
- that is supposed to run on all CPUs. */
-static void __init early_cpu_detect(void)
+void __init cpu_detect(struct cpuinfo_x86 *c)
{
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
- c->x86_cache_alignment = 32;
-
- if (!have_cpuid_p())
- return;
-
/* Get vendor name */
cpuid(0x00000000, &c->cpuid_level,
(int *)&c->x86_vendor_id[0],
(int *)&c->x86_vendor_id[8],
(int *)&c->x86_vendor_id[4]);
- get_cpu_vendor(c, 1);
-
c->x86 = 4;
if (c->cpuid_level >= 0x00000001) {
u32 junk, tfms, cap0, misc;
@@ -274,6 +260,26 @@ static void __init early_cpu_detect(void)
}
}
+/* Do minimum CPU detection early.
+ Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
+ The others are not touched to avoid unwanted side effects.
+
+ WARNING: this function is only called on the BP. Don't add code here
+ that is supposed to run on all CPUs. */
+static void __init early_cpu_detect(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ c->x86_cache_alignment = 32;
+
+ if (!have_cpuid_p())
+ return;
+
+ cpu_detect(c);
+
+ get_cpu_vendor(c, 1);
+}
+
static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
{
u32 tfms, xlvl;
@@ -308,6 +314,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
#else
c->apicid = (ebx >> 24) & 0xFF;
#endif
+ if (c->x86_capability[0] & (1<<19))
+ c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
} else {
/* Have CPUID level 0 only - unheard of */
c->x86 = 4;
@@ -372,6 +380,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
c->x86_vendor_id[0] = '\0'; /* Unset */
c->x86_model_id[0] = '\0'; /* Unset */
c->x86_max_cores = 1;
+ c->x86_clflush_size = 32;
memset(&c->x86_capability, 0, sizeof c->x86_capability);
if (!have_cpuid_p()) {
@@ -591,42 +600,24 @@ void __init early_cpu_init(void)
disable_pse = 1;
#endif
}
-/*
- * cpu_init() initializes state that is per-CPU. Some data is already
- * initialized (naturally) in the bootstrap process, such as the GDT
- * and IDT. We reload them nevertheless, this function acts as a
- * 'CPU state barrier', nothing should get across.
- */
-void __cpuinit cpu_init(void)
+
+/* Make sure %gs is initialized properly in idle threads */
+struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
{
- int cpu = smp_processor_id();
- struct tss_struct * t = &per_cpu(init_tss, cpu);
- struct thread_struct *thread = &current->thread;
- struct desc_struct *gdt;
- __u32 stk16_off = (__u32)&per_cpu(cpu_16bit_stack, cpu);
- struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
+ memset(regs, 0, sizeof(struct pt_regs));
+ regs->xgs = __KERNEL_PDA;
+ return regs;
+}
- if (cpu_test_and_set(cpu, cpu_initialized)) {
- printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
- for (;;) local_irq_enable();
- }
- printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+static __cpuinit int alloc_gdt(int cpu)
+{
+ struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
+ struct desc_struct *gdt;
+ struct i386_pda *pda;
- if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
- if (tsc_disable && cpu_has_tsc) {
- printk(KERN_NOTICE "Disabling TSC...\n");
- /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
- clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
- set_in_cr4(X86_CR4_TSD);
- }
+ gdt = (struct desc_struct *)cpu_gdt_descr->address;
+ pda = cpu_pda(cpu);
- /* The CPU hotplug case */
- if (cpu_gdt_descr->address) {
- gdt = (struct desc_struct *)cpu_gdt_descr->address;
- memset(gdt, 0, PAGE_SIZE);
- goto old_gdt;
- }
/*
* This is a horrible hack to allocate the GDT. The problem
* is that cpu_init() is called really early for the boot CPU
@@ -634,43 +625,130 @@ void __cpuinit cpu_init(void)
* CPUs, when bootmem will have gone away
*/
if (NODE_DATA(0)->bdata->node_bootmem_map) {
- gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
- /* alloc_bootmem_pages panics on failure, so no check */
+ BUG_ON(gdt != NULL || pda != NULL);
+
+ gdt = alloc_bootmem_pages(PAGE_SIZE);
+ pda = alloc_bootmem(sizeof(*pda));
+ /* alloc_bootmem(_pages) panics on failure, so no check */
+
memset(gdt, 0, PAGE_SIZE);
+ memset(pda, 0, sizeof(*pda));
} else {
- gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
- if (unlikely(!gdt)) {
- printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
- for (;;)
- local_irq_enable();
+ /* GDT and PDA might already have been allocated if
+ this is a CPU hotplug re-insertion. */
+ if (gdt == NULL)
+ gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
+
+ if (pda == NULL)
+ pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu));
+
+ if (unlikely(!gdt || !pda)) {
+ free_pages((unsigned long)gdt, 0);
+ kfree(pda);
+ return 0;
}
}
-old_gdt:
+
+ cpu_gdt_descr->address = (unsigned long)gdt;
+ cpu_pda(cpu) = pda;
+
+ return 1;
+}
+
+/* Initial PDA used by boot CPU */
+struct i386_pda boot_pda = {
+ ._pda = &boot_pda,
+ .cpu_number = 0,
+ .pcurrent = &init_task,
+};
+
+static inline void set_kernel_gs(void)
+{
+ /* Set %gs for this CPU's PDA. Memory clobber is to create a
+ barrier with respect to any PDA operations, so the compiler
+ doesn't move any before here. */
+ asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
+}
+
+/* Initialize the CPU's GDT and PDA. The boot CPU does this for
+ itself, but secondaries find this done for them. */
+__cpuinit int init_gdt(int cpu, struct task_struct *idle)
+{
+ struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
+ struct desc_struct *gdt;
+ struct i386_pda *pda;
+
+ /* For non-boot CPUs, the GDT and PDA should already have been
+ allocated. */
+ if (!alloc_gdt(cpu)) {
+ printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu);
+ return 0;
+ }
+
+ gdt = (struct desc_struct *)cpu_gdt_descr->address;
+ pda = cpu_pda(cpu);
+
+ BUG_ON(gdt == NULL || pda == NULL);
+
/*
* Initialize the per-CPU GDT with the boot GDT,
* and set up the GDT descriptor:
*/
memcpy(gdt, cpu_gdt_table, GDT_SIZE);
+ cpu_gdt_descr->size = GDT_SIZE - 1;
- /* Set up GDT entry for 16bit stack */
- *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |=
- ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
- ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
- (CPU_16BIT_STACK_SIZE - 1);
+ pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
+ (u32 *)&gdt[GDT_ENTRY_PDA].b,
+ (unsigned long)pda, sizeof(*pda) - 1,
+ 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
- cpu_gdt_descr->size = GDT_SIZE - 1;
- cpu_gdt_descr->address = (unsigned long)gdt;
+ memset(pda, 0, sizeof(*pda));
+ pda->_pda = pda;
+ pda->cpu_number = cpu;
+ pda->pcurrent = idle;
+
+ return 1;
+}
+
+/* Common CPU init for both boot and secondary CPUs */
+static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
+{
+ struct tss_struct * t = &per_cpu(init_tss, cpu);
+ struct thread_struct *thread = &curr->thread;
+ struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
+ /* Reinit these anyway, even if they've already been done (on
+ the boot CPU, this will transition from the boot gdt+pda to
+ the real ones). */
load_gdt(cpu_gdt_descr);
+ set_kernel_gs();
+
+ if (cpu_test_and_set(cpu, cpu_initialized)) {
+ printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
+ for (;;) local_irq_enable();
+ }
+
+ printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+
+ if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
+ clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ if (tsc_disable && cpu_has_tsc) {
+ printk(KERN_NOTICE "Disabling TSC...\n");
+ /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
+ clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
+ set_in_cr4(X86_CR4_TSD);
+ }
+
load_idt(&idt_descr);
/*
* Set up and load the per-CPU TSS and LDT
*/
atomic_inc(&init_mm.mm_count);
- current->active_mm = &init_mm;
- BUG_ON(current->mm);
- enter_lazy_tlb(&init_mm, current);
+ curr->active_mm = &init_mm;
+ if (curr->mm)
+ BUG();
+ enter_lazy_tlb(&init_mm, curr);
load_esp0(t, thread);
set_tss_desc(cpu,t);
@@ -682,8 +760,8 @@ old_gdt:
__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
#endif
- /* Clear %fs and %gs. */
- asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0));
+ /* Clear %fs. */
+ asm volatile ("mov %0, %%fs" : : "r" (0));
/* Clear all 6 debug registers: */
set_debugreg(0, 0);
@@ -701,6 +779,37 @@ old_gdt:
mxcsr_feature_mask_init();
}
+/* Entrypoint to initialize secondary CPU */
+void __cpuinit secondary_cpu_init(void)
+{
+ int cpu = smp_processor_id();
+ struct task_struct *curr = current;
+
+ _cpu_init(cpu, curr);
+}
+
+/*
+ * cpu_init() initializes state that is per-CPU. Some data is already
+ * initialized (naturally) in the bootstrap process, such as the GDT
+ * and IDT. We reload them nevertheless, this function acts as a
+ * 'CPU state barrier', nothing should get across.
+ */
+void __cpuinit cpu_init(void)
+{
+ int cpu = smp_processor_id();
+ struct task_struct *curr = current;
+
+ /* Set up the real GDT and PDA, so we can transition from the
+ boot versions. */
+ if (!init_gdt(cpu, curr)) {
+ /* failed to allocate something; not much we can do... */
+ for (;;)
+ local_irq_enable();
+ }
+
+ _cpu_init(cpu, curr);
+}
+
#ifdef CONFIG_HOTPLUG_CPU
void __cpuinit cpu_uninit(void)
{
diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index 94a95aa5227e..56fe26584957 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -107,7 +107,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
* Note that the workaround only should be initialized once...
*/
c->f00f_bug = 0;
- if ( c->x86 == 5 ) {
+ if (!paravirt_enabled() && c->x86 == 5) {
static int f00f_workaround_enabled = 0;
c->f00f_bug = 1;
@@ -195,8 +195,16 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
(c->x86 == 0x6 && c->x86_model >= 0x0e))
set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
-}
+ if (cpu_has_ds) {
+ unsigned int l1;
+ rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
+ if (!(l1 & (1<<11)))
+ set_bit(X86_FEATURE_BTS, c->x86_capability);
+ if (!(l1 & (1<<12)))
+ set_bit(X86_FEATURE_PEBS, c->x86_capability);
+ }
+}
static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size)
{
diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index 5c43be47587f..80b4c5d421b1 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -480,12 +480,10 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
if (num_cache_leaves == 0)
return -ENOENT;
- cpuid4_info[cpu] = kmalloc(
+ cpuid4_info[cpu] = kzalloc(
sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
if (unlikely(cpuid4_info[cpu] == NULL))
return -ENOMEM;
- memset(cpuid4_info[cpu], 0,
- sizeof(struct _cpuid4_info) * num_cache_leaves);
oldmask = current->cpus_allowed;
retval = set_cpus_allowed(current, cpumask_of_cpu(cpu));
@@ -658,17 +656,14 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu)
return -ENOENT;
/* Allocate all required memory */
- cache_kobject[cpu] = kmalloc(sizeof(struct kobject), GFP_KERNEL);
+ cache_kobject[cpu] = kzalloc(sizeof(struct kobject), GFP_KERNEL);
if (unlikely(cache_kobject[cpu] == NULL))
goto err_out;
- memset(cache_kobject[cpu], 0, sizeof(struct kobject));
- index_kobject[cpu] = kmalloc(
+ index_kobject[cpu] = kzalloc(
sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL);
if (unlikely(index_kobject[cpu] == NULL))
goto err_out;
- memset(index_kobject[cpu], 0,
- sizeof(struct _index_kobject) * num_cache_leaves);
return 0;
diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c
index bad8b4420709..065005c3f168 100644
--- a/arch/i386/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c
@@ -116,7 +116,6 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group);
}
-#ifdef CONFIG_HOTPLUG_CPU
static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
{
return sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
@@ -153,7 +152,6 @@ static struct notifier_block thermal_throttle_cpu_notifier =
{
.notifier_call = thermal_throttle_cpu_callback,
};
-#endif /* CONFIG_HOTPLUG_CPU */
static __init int thermal_throttle_init_device(void)
{
diff --git a/arch/i386/kernel/cpu/mtrr/Makefile b/arch/i386/kernel/cpu/mtrr/Makefile
index a25b701ab84e..191fc0533649 100644
--- a/arch/i386/kernel/cpu/mtrr/Makefile
+++ b/arch/i386/kernel/cpu/mtrr/Makefile
@@ -1,5 +1,3 @@
obj-y := main.o if.o generic.o state.o
-obj-y += amd.o
-obj-y += cyrix.o
-obj-y += centaur.o
+obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
diff --git a/arch/i386/kernel/cpu/mtrr/amd.c b/arch/i386/kernel/cpu/mtrr/amd.c
index 1a1e04b6fd00..0949cdbf848a 100644
--- a/arch/i386/kernel/cpu/mtrr/amd.c
+++ b/arch/i386/kernel/cpu/mtrr/amd.c
@@ -7,7 +7,7 @@
static void
amd_get_mtrr(unsigned int reg, unsigned long *base,
- unsigned int *size, mtrr_type * type)
+ unsigned long *size, mtrr_type * type)
{
unsigned long low, high;
diff --git a/arch/i386/kernel/cpu/mtrr/centaur.c b/arch/i386/kernel/cpu/mtrr/centaur.c
index 33f00ac314ef..cb9aa3a7a7ab 100644
--- a/arch/i386/kernel/cpu/mtrr/centaur.c
+++ b/arch/i386/kernel/cpu/mtrr/centaur.c
@@ -17,7 +17,7 @@ static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */
*/
static int
-centaur_get_free_region(unsigned long base, unsigned long size)
+centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg)
/* [SUMMARY] Get a free MTRR.
<base> The starting (base) address of the region.
<size> The size (in bytes) of the region.
@@ -26,10 +26,11 @@ centaur_get_free_region(unsigned long base, unsigned long size)
{
int i, max;
mtrr_type ltype;
- unsigned long lbase;
- unsigned int lsize;
+ unsigned long lbase, lsize;
max = num_var_ranges;
+ if (replace_reg >= 0 && replace_reg < max)
+ return replace_reg;
for (i = 0; i < max; ++i) {
if (centaur_mcr_reserved & (1 << i))
continue;
@@ -49,7 +50,7 @@ mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
static void
centaur_get_mcr(unsigned int reg, unsigned long *base,
- unsigned int *size, mtrr_type * type)
+ unsigned long *size, mtrr_type * type)
{
*base = centaur_mcr[reg].high >> PAGE_SHIFT;
*size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT;
diff --git a/arch/i386/kernel/cpu/mtrr/cyrix.c b/arch/i386/kernel/cpu/mtrr/cyrix.c
index 9027a987006b..0737a596db43 100644
--- a/arch/i386/kernel/cpu/mtrr/cyrix.c
+++ b/arch/i386/kernel/cpu/mtrr/cyrix.c
@@ -9,7 +9,7 @@ int arr3_protected;
static void
cyrix_get_arr(unsigned int reg, unsigned long *base,
- unsigned int *size, mtrr_type * type)
+ unsigned long *size, mtrr_type * type)
{
unsigned long flags;
unsigned char arr, ccr3, rcr, shift;
@@ -77,7 +77,7 @@ cyrix_get_arr(unsigned int reg, unsigned long *base,
}
static int
-cyrix_get_free_region(unsigned long base, unsigned long size)
+cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
/* [SUMMARY] Get a free ARR.
<base> The starting (base) address of the region.
<size> The size (in bytes) of the region.
@@ -86,9 +86,24 @@ cyrix_get_free_region(unsigned long base, unsigned long size)
{
int i;
mtrr_type ltype;
- unsigned long lbase;
- unsigned int lsize;
+ unsigned long lbase, lsize;
+ switch (replace_reg) {
+ case 7:
+ if (size < 0x40)
+ break;
+ case 6:
+ case 5:
+ case 4:
+ return replace_reg;
+ case 3:
+ if (arr3_protected)
+ break;
+ case 2:
+ case 1:
+ case 0:
+ return replace_reg;
+ }
/* If we are to set up a region >32M then look at ARR7 immediately */
if (size > 0x2000) {
cyrix_get_arr(7, &lbase, &lsize, &ltype);
@@ -214,7 +229,7 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base,
typedef struct {
unsigned long base;
- unsigned int size;
+ unsigned long size;
mtrr_type type;
} arr_state_t;
diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c
index 0b61eed8bbd8..f77fc53db654 100644
--- a/arch/i386/kernel/cpu/mtrr/generic.c
+++ b/arch/i386/kernel/cpu/mtrr/generic.c
@@ -3,6 +3,7 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/mm.h>
+#include <linux/module.h>
#include <asm/io.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
@@ -15,12 +16,19 @@ struct mtrr_state {
struct mtrr_var_range *var_ranges;
mtrr_type fixed_ranges[NUM_FIXED_RANGES];
unsigned char enabled;
+ unsigned char have_fixed;
mtrr_type def_type;
};
static unsigned long smp_changes_mask;
static struct mtrr_state mtrr_state = {};
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "mtrr."
+
+static __initdata int mtrr_show;
+module_param_named(show, mtrr_show, bool, 0);
+
/* Get the MSR pair relating to a var range */
static void __init
get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
@@ -43,6 +51,14 @@ get_fixed_ranges(mtrr_type * frs)
rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
}
+static void __init print_fixed(unsigned base, unsigned step, const mtrr_type*types)
+{
+ unsigned i;
+
+ for (i = 0; i < 8; ++i, ++types, base += step)
+ printk(KERN_INFO "MTRR %05X-%05X %s\n", base, base + step - 1, mtrr_attrib_to_str(*types));
+}
+
/* Grab all of the MTRR state for this CPU into *state */
void __init get_mtrr_state(void)
{
@@ -58,13 +74,49 @@ void __init get_mtrr_state(void)
}
vrs = mtrr_state.var_ranges;
+ rdmsr(MTRRcap_MSR, lo, dummy);
+ mtrr_state.have_fixed = (lo >> 8) & 1;
+
for (i = 0; i < num_var_ranges; i++)
get_mtrr_var_range(i, &vrs[i]);
- get_fixed_ranges(mtrr_state.fixed_ranges);
+ if (mtrr_state.have_fixed)
+ get_fixed_ranges(mtrr_state.fixed_ranges);
rdmsr(MTRRdefType_MSR, lo, dummy);
mtrr_state.def_type = (lo & 0xff);
mtrr_state.enabled = (lo & 0xc00) >> 10;
+
+ if (mtrr_show) {
+ int high_width;
+
+ printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type));
+ if (mtrr_state.have_fixed) {
+ printk(KERN_INFO "MTRR fixed ranges %sabled:\n",
+ mtrr_state.enabled & 1 ? "en" : "dis");
+ print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
+ for (i = 0; i < 2; ++i)
+ print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
+ for (i = 0; i < 8; ++i)
+ print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
+ }
+ printk(KERN_INFO "MTRR variable ranges %sabled:\n",
+ mtrr_state.enabled & 2 ? "en" : "dis");
+ high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
+ for (i = 0; i < num_var_ranges; ++i) {
+ if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
+ printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n",
+ i,
+ high_width,
+ mtrr_state.var_ranges[i].base_hi,
+ mtrr_state.var_ranges[i].base_lo >> 12,
+ high_width,
+ mtrr_state.var_ranges[i].mask_hi,
+ mtrr_state.var_ranges[i].mask_lo >> 12,
+ mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
+ else
+ printk(KERN_INFO "MTRR %u disabled\n", i);
+ }
+ }
}
/* Some BIOS's are fucked and don't set all MTRRs the same! */
@@ -95,7 +147,7 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
smp_processor_id(), msr, a, b);
}
-int generic_get_free_region(unsigned long base, unsigned long size)
+int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
/* [SUMMARY] Get a free MTRR.
<base> The starting (base) address of the region.
<size> The size (in bytes) of the region.
@@ -104,10 +156,11 @@ int generic_get_free_region(unsigned long base, unsigned long size)
{
int i, max;
mtrr_type ltype;
- unsigned long lbase;
- unsigned lsize;
+ unsigned long lbase, lsize;
max = num_var_ranges;
+ if (replace_reg >= 0 && replace_reg < max)
+ return replace_reg;
for (i = 0; i < max; ++i) {
mtrr_if->get(i, &lbase, &lsize, &ltype);
if (lsize == 0)
@@ -117,7 +170,7 @@ int generic_get_free_region(unsigned long base, unsigned long size)
}
static void generic_get_mtrr(unsigned int reg, unsigned long *base,
- unsigned int *size, mtrr_type * type)
+ unsigned long *size, mtrr_type *type)
{
unsigned int mask_lo, mask_hi, base_lo, base_hi;
@@ -202,7 +255,9 @@ static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
return changed;
}
-static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi)
+static u32 deftype_lo, deftype_hi;
+
+static unsigned long set_mtrr_state(void)
/* [SUMMARY] Set the MTRR state for this CPU.
<state> The MTRR state information to read.
<ctxt> Some relevant CPU context.
@@ -217,14 +272,14 @@ static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi)
if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i]))
change_mask |= MTRR_CHANGE_MASK_VARIABLE;
- if (set_fixed_ranges(mtrr_state.fixed_ranges))
+ if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges))
change_mask |= MTRR_CHANGE_MASK_FIXED;
/* Set_mtrr_restore restores the old value of MTRRdefType,
so to set it we fiddle with the saved value */
if ((deftype_lo & 0xff) != mtrr_state.def_type
|| ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) {
- deftype_lo |= (mtrr_state.def_type | mtrr_state.enabled << 10);
+ deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | (mtrr_state.enabled << 10);
change_mask |= MTRR_CHANGE_MASK_DEFTYPE;
}
@@ -233,7 +288,6 @@ static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi)
static unsigned long cr4 = 0;
-static u32 deftype_lo, deftype_hi;
static DEFINE_SPINLOCK(set_atomicity_lock);
/*
@@ -271,7 +325,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
/* Disable MTRRs, and set the default type to uncached */
- mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & 0xf300UL, deftype_hi);
+ mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi);
}
static void post_set(void) __releases(set_atomicity_lock)
@@ -300,7 +354,7 @@ static void generic_set_all(void)
prepare_set();
/* Actually set the state */
- mask = set_mtrr_state(deftype_lo,deftype_hi);
+ mask = set_mtrr_state();
post_set();
local_irq_restore(flags);
@@ -366,7 +420,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
return -EINVAL;
}
- if (!(base + size < 0x70000000 || base > 0x7003FFFF) &&
+ if (!(base + size < 0x70000 || base > 0x7003F) &&
(type == MTRR_TYPE_WRCOMB
|| type == MTRR_TYPE_WRBACK)) {
printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c
index 5ac051bb9d55..5ae1705eafa6 100644
--- a/arch/i386/kernel/cpu/mtrr/if.c
+++ b/arch/i386/kernel/cpu/mtrr/if.c
@@ -17,7 +17,7 @@ extern unsigned int *usage_table;
#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
-static char *mtrr_strings[MTRR_NUM_TYPES] =
+static const char *const mtrr_strings[MTRR_NUM_TYPES] =
{
"uncachable", /* 0 */
"write-combining", /* 1 */
@@ -28,7 +28,7 @@ static char *mtrr_strings[MTRR_NUM_TYPES] =
"write-back", /* 6 */
};
-char *mtrr_attrib_to_str(int x)
+const char *mtrr_attrib_to_str(int x)
{
return (x <= 6) ? mtrr_strings[x] : "?";
}
@@ -44,10 +44,9 @@ mtrr_file_add(unsigned long base, unsigned long size,
max = num_var_ranges;
if (fcount == NULL) {
- fcount = kmalloc(max * sizeof *fcount, GFP_KERNEL);
+ fcount = kzalloc(max * sizeof *fcount, GFP_KERNEL);
if (!fcount)
return -ENOMEM;
- memset(fcount, 0, max * sizeof *fcount);
FILE_FCOUNT(file) = fcount;
}
if (!page) {
@@ -155,6 +154,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
{
int err = 0;
mtrr_type type;
+ unsigned long size;
struct mtrr_sentry sentry;
struct mtrr_gentry gentry;
void __user *arg = (void __user *) __arg;
@@ -235,15 +235,15 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
case MTRRIOC_GET_ENTRY:
if (gentry.regnum >= num_var_ranges)
return -EINVAL;
- mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type);
+ mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
/* Hide entries that go above 4GB */
- if (gentry.base + gentry.size > 0x100000
- || gentry.size == 0x100000)
+ if (gentry.base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
+ || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)))
gentry.base = gentry.size = gentry.type = 0;
else {
gentry.base <<= PAGE_SHIFT;
- gentry.size <<= PAGE_SHIFT;
+ gentry.size = size << PAGE_SHIFT;
gentry.type = type;
}
@@ -273,8 +273,14 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
case MTRRIOC_GET_PAGE_ENTRY:
if (gentry.regnum >= num_var_ranges)
return -EINVAL;
- mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type);
- gentry.type = type;
+ mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
+ /* Hide entries that would overflow */
+ if (size != (__typeof__(gentry.size))size)
+ gentry.base = gentry.size = gentry.type = 0;
+ else {
+ gentry.size = size;
+ gentry.type = type;
+ }
break;
}
@@ -353,8 +359,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
char factor;
int i, max, len;
mtrr_type type;
- unsigned long base;
- unsigned int size;
+ unsigned long base, size;
len = 0;
max = num_var_ranges;
@@ -373,7 +378,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
}
/* RED-PEN: base can be > 32bit */
len += seq_printf(seq,
- "reg%02i: base=0x%05lx000 (%4liMB), size=%4i%cB: %s, count=%d\n",
+ "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n",
i, base, base >> (20 - PAGE_SHIFT), size, factor,
mtrr_attrib_to_str(type), usage_table[i]);
}
diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
index fff90bda4733..16bb7ea87145 100644
--- a/arch/i386/kernel/cpu/mtrr/main.c
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -59,7 +59,11 @@ struct mtrr_ops * mtrr_if = NULL;
static void set_mtrr(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type);
+#ifndef CONFIG_X86_64
extern int arr3_protected;
+#else
+#define arr3_protected 0
+#endif
void set_mtrr_ops(struct mtrr_ops * ops)
{
@@ -168,6 +172,13 @@ static void ipi_handler(void *info)
#endif
+static inline int types_compatible(mtrr_type type1, mtrr_type type2) {
+ return type1 == MTRR_TYPE_UNCACHABLE ||
+ type2 == MTRR_TYPE_UNCACHABLE ||
+ (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) ||
+ (type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH);
+}
+
/**
* set_mtrr - update mtrrs on all processors
* @reg: mtrr in question
@@ -263,8 +274,8 @@ static void set_mtrr(unsigned int reg, unsigned long base,
/**
* mtrr_add_page - Add a memory type region
- * @base: Physical base address of region in pages (4 KB)
- * @size: Physical size of region in pages (4 KB)
+ * @base: Physical base address of region in pages (in units of 4 kB!)
+ * @size: Physical size of region in pages (4 kB)
* @type: Type of MTRR desired
* @increment: If this is true do usage counting on the region
*
@@ -300,11 +311,9 @@ static void set_mtrr(unsigned int reg, unsigned long base,
int mtrr_add_page(unsigned long base, unsigned long size,
unsigned int type, char increment)
{
- int i;
+ int i, replace, error;
mtrr_type ltype;
- unsigned long lbase;
- unsigned int lsize;
- int error;
+ unsigned long lbase, lsize;
if (!mtrr_if)
return -ENXIO;
@@ -324,12 +333,18 @@ int mtrr_add_page(unsigned long base, unsigned long size,
return -ENOSYS;
}
+ if (!size) {
+ printk(KERN_WARNING "mtrr: zero sized request\n");
+ return -EINVAL;
+ }
+
if (base & size_or_mask || size & size_or_mask) {
printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n");
return -EINVAL;
}
error = -EINVAL;
+ replace = -1;
/* No CPU hotplug when we change MTRR entries */
lock_cpu_hotplug();
@@ -337,21 +352,28 @@ int mtrr_add_page(unsigned long base, unsigned long size,
mutex_lock(&mtrr_mutex);
for (i = 0; i < num_var_ranges; ++i) {
mtrr_if->get(i, &lbase, &lsize, &ltype);
- if (base >= lbase + lsize)
- continue;
- if ((base < lbase) && (base + size <= lbase))
+ if (!lsize || base > lbase + lsize - 1 || base + size - 1 < lbase)
continue;
/* At this point we know there is some kind of overlap/enclosure */
- if ((base < lbase) || (base + size > lbase + lsize)) {
+ if (base < lbase || base + size - 1 > lbase + lsize - 1) {
+ if (base <= lbase && base + size - 1 >= lbase + lsize - 1) {
+ /* New region encloses an existing region */
+ if (type == ltype) {
+ replace = replace == -1 ? i : -2;
+ continue;
+ }
+ else if (types_compatible(type, ltype))
+ continue;
+ }
printk(KERN_WARNING
"mtrr: 0x%lx000,0x%lx000 overlaps existing"
- " 0x%lx000,0x%x000\n", base, size, lbase,
+ " 0x%lx000,0x%lx000\n", base, size, lbase,
lsize);
goto out;
}
/* New region is enclosed by an existing region */
if (ltype != type) {
- if (type == MTRR_TYPE_UNCACHABLE)
+ if (types_compatible(type, ltype))
continue;
printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
base, size, mtrr_attrib_to_str(ltype),
@@ -364,10 +386,18 @@ int mtrr_add_page(unsigned long base, unsigned long size,
goto out;
}
/* Search for an empty MTRR */
- i = mtrr_if->get_free_region(base, size);
+ i = mtrr_if->get_free_region(base, size, replace);
if (i >= 0) {
set_mtrr(i, base, size, type);
- usage_table[i] = 1;
+ if (likely(replace < 0))
+ usage_table[i] = 1;
+ else {
+ usage_table[i] = usage_table[replace] + !!increment;
+ if (unlikely(replace != i)) {
+ set_mtrr(replace, 0, 0, 0);
+ usage_table[replace] = 0;
+ }
+ }
} else
printk(KERN_INFO "mtrr: no more MTRRs available\n");
error = i;
@@ -455,8 +485,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
{
int i, max;
mtrr_type ltype;
- unsigned long lbase;
- unsigned int lsize;
+ unsigned long lbase, lsize;
int error = -EINVAL;
if (!mtrr_if)
@@ -544,9 +573,11 @@ extern void centaur_init_mtrr(void);
static void __init init_ifs(void)
{
+#ifndef CONFIG_X86_64
amd_init_mtrr();
cyrix_init_mtrr();
centaur_init_mtrr();
+#endif
}
/* The suspend/resume methods are only for CPU without MTRR. CPU using generic
@@ -555,7 +586,7 @@ static void __init init_ifs(void)
struct mtrr_value {
mtrr_type ltype;
unsigned long lbase;
- unsigned int lsize;
+ unsigned long lsize;
};
static struct mtrr_value * mtrr_state;
@@ -565,10 +596,8 @@ static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
int i;
int size = num_var_ranges * sizeof(struct mtrr_value);
- mtrr_state = kmalloc(size,GFP_ATOMIC);
- if (mtrr_state)
- memset(mtrr_state,0,size);
- else
+ mtrr_state = kzalloc(size,GFP_ATOMIC);
+ if (!mtrr_state)
return -ENOMEM;
for (i = 0; i < num_var_ranges; i++) {
diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h
index 99c9f2682041..d61ea9db6cfe 100644
--- a/arch/i386/kernel/cpu/mtrr/mtrr.h
+++ b/arch/i386/kernel/cpu/mtrr/mtrr.h
@@ -43,15 +43,16 @@ struct mtrr_ops {
void (*set_all)(void);
void (*get)(unsigned int reg, unsigned long *base,
- unsigned int *size, mtrr_type * type);
- int (*get_free_region) (unsigned long base, unsigned long size);
-
+ unsigned long *size, mtrr_type * type);
+ int (*get_free_region)(unsigned long base, unsigned long size,
+ int replace_reg);
int (*validate_add_page)(unsigned long base, unsigned long size,
unsigned int type);
int (*have_wrcomb)(void);
};
-extern int generic_get_free_region(unsigned long base, unsigned long size);
+extern int generic_get_free_region(unsigned long base, unsigned long size,
+ int replace_reg);
extern int generic_validate_add_page(unsigned long base, unsigned long size,
unsigned int type);
@@ -62,17 +63,17 @@ extern int positive_have_wrcomb(void);
/* library functions for processor-specific routines */
struct set_mtrr_context {
unsigned long flags;
- unsigned long deftype_lo;
- unsigned long deftype_hi;
unsigned long cr4val;
- unsigned long ccr3;
+ u32 deftype_lo;
+ u32 deftype_hi;
+ u32 ccr3;
};
struct mtrr_var_range {
- unsigned long base_lo;
- unsigned long base_hi;
- unsigned long mask_lo;
- unsigned long mask_hi;
+ u32 base_lo;
+ u32 base_hi;
+ u32 mask_lo;
+ u32 mask_hi;
};
void set_mtrr_done(struct set_mtrr_context *ctxt);
@@ -92,6 +93,6 @@ extern struct mtrr_ops * mtrr_if;
extern unsigned int num_var_ranges;
void mtrr_state_warn(void);
-char *mtrr_attrib_to_str(int x);
+const char *mtrr_attrib_to_str(int x);
void mtrr_wrmsr(unsigned, unsigned, unsigned);
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index 76aac088a323..6624d8583c42 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -152,9 +152,10 @@ static int show_cpuinfo(struct seq_file *m, void *v)
seq_printf(m, " [%d]", i);
}
- seq_printf(m, "\nbogomips\t: %lu.%02lu\n\n",
+ seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
c->loops_per_jiffy/(500000/HZ),
(c->loops_per_jiffy/(5000/HZ)) % 100);
+ seq_printf(m, "clflush size\t: %u\n\n", c->x86_clflush_size);
return 0;
}
diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
index ab0c327e79dc..db6dd20c3589 100644
--- a/arch/i386/kernel/cpuid.c
+++ b/arch/i386/kernel/cpuid.c
@@ -34,7 +34,6 @@
#include <linux/major.h>
#include <linux/fs.h>
#include <linux/smp_lock.h>
-#include <linux/fs.h>
#include <linux/device.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
@@ -167,7 +166,6 @@ static int cpuid_device_create(int i)
return err;
}
-#ifdef CONFIG_HOTPLUG_CPU
static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
@@ -187,7 +185,6 @@ static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier =
{
.notifier_call = cpuid_class_cpu_callback,
};
-#endif /* !CONFIG_HOTPLUG_CPU */
static int __init cpuid_init(void)
{
diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
index 144b43288965..a5e0e990ea95 100644
--- a/arch/i386/kernel/crash.c
+++ b/arch/i386/kernel/crash.c
@@ -31,68 +31,6 @@
/* This keeps a track of which one is crashing cpu. */
static int crashing_cpu;
-static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
- size_t data_len)
-{
- struct elf_note note;
-
- note.n_namesz = strlen(name) + 1;
- note.n_descsz = data_len;
- note.n_type = type;
- memcpy(buf, &note, sizeof(note));
- buf += (sizeof(note) +3)/4;
- memcpy(buf, name, note.n_namesz);
- buf += (note.n_namesz + 3)/4;
- memcpy(buf, data, note.n_descsz);
- buf += (note.n_descsz + 3)/4;
-
- return buf;
-}
-
-static void final_note(u32 *buf)
-{
- struct elf_note note;
-
- note.n_namesz = 0;
- note.n_descsz = 0;
- note.n_type = 0;
- memcpy(buf, &note, sizeof(note));
-}
-
-static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
-{
- struct elf_prstatus prstatus;
- u32 *buf;
-
- if ((cpu < 0) || (cpu >= NR_CPUS))
- return;
-
- /* Using ELF notes here is opportunistic.
- * I need a well defined structure format
- * for the data I pass, and I need tags
- * on the data to indicate what information I have
- * squirrelled away. ELF notes happen to provide
- * all of that, so there is no need to invent something new.
- */
- buf = (u32*)per_cpu_ptr(crash_notes, cpu);
- if (!buf)
- return;
- memset(&prstatus, 0, sizeof(prstatus));
- prstatus.pr_pid = current->pid;
- elf_core_copy_regs(&prstatus.pr_reg, regs);
- buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
- sizeof(prstatus));
- final_note(buf);
-}
-
-static void crash_save_self(struct pt_regs *regs)
-{
- int cpu;
-
- cpu = safe_smp_processor_id();
- crash_save_this_cpu(regs, cpu);
-}
-
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
static atomic_t waiting_for_crash_ipi;
@@ -121,7 +59,7 @@ static int crash_nmi_callback(struct notifier_block *self,
crash_fixup_ss_esp(&fixed_regs, regs);
regs = &fixed_regs;
}
- crash_save_this_cpu(regs, cpu);
+ crash_save_cpu(regs, cpu);
disable_local_APIC();
atomic_dec(&waiting_for_crash_ipi);
/* Assume hlt works */
@@ -195,5 +133,5 @@ void machine_crash_shutdown(struct pt_regs *regs)
#if defined(CONFIG_X86_IO_APIC)
disable_IO_APIC();
#endif
- crash_save_self(regs);
+ crash_save_cpu(regs, safe_smp_processor_id());
}
diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
new file mode 100644
index 000000000000..2f7d0a92fd7c
--- /dev/null
+++ b/arch/i386/kernel/e820.c
@@ -0,0 +1,894 @@
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
+#include <linux/string.h>
+#include <linux/kexec.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/efi.h>
+#include <linux/pfn.h>
+#include <linux/uaccess.h>
+
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/e820.h>
+
+#ifdef CONFIG_EFI
+int efi_enabled = 0;
+EXPORT_SYMBOL(efi_enabled);
+#endif
+
+struct e820map e820;
+struct change_member {
+ struct e820entry *pbios; /* pointer to original bios entry */
+ unsigned long long addr; /* address for this change point */
+};
+static struct change_member change_point_list[2*E820MAX] __initdata;
+static struct change_member *change_point[2*E820MAX] __initdata;
+static struct e820entry *overlap_list[E820MAX] __initdata;
+static struct e820entry new_bios[E820MAX] __initdata;
+/* For PCI or other memory-mapped resources */
+unsigned long pci_mem_start = 0x10000000;
+#ifdef CONFIG_PCI
+EXPORT_SYMBOL(pci_mem_start);
+#endif
+extern int user_defined_memmap;
+struct resource data_resource = {
+ .name = "Kernel data",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+struct resource code_resource = {
+ .name = "Kernel code",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource system_rom_resource = {
+ .name = "System ROM",
+ .start = 0xf0000,
+ .end = 0xfffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource extension_rom_resource = {
+ .name = "Extension ROM",
+ .start = 0xe0000,
+ .end = 0xeffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource adapter_rom_resources[] = { {
+ .name = "Adapter ROM",
+ .start = 0xc8000,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+} };
+
+static struct resource video_rom_resource = {
+ .name = "Video ROM",
+ .start = 0xc0000,
+ .end = 0xc7fff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource video_ram_resource = {
+ .name = "Video RAM area",
+ .start = 0xa0000,
+ .end = 0xbffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource standard_io_resources[] = { {
+ .name = "dma1",
+ .start = 0x0000,
+ .end = 0x001f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "pic1",
+ .start = 0x0020,
+ .end = 0x0021,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "timer0",
+ .start = 0x0040,
+ .end = 0x0043,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "timer1",
+ .start = 0x0050,
+ .end = 0x0053,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "keyboard",
+ .start = 0x0060,
+ .end = 0x006f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "dma page reg",
+ .start = 0x0080,
+ .end = 0x008f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "pic2",
+ .start = 0x00a0,
+ .end = 0x00a1,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "dma2",
+ .start = 0x00c0,
+ .end = 0x00df,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "fpu",
+ .start = 0x00f0,
+ .end = 0x00ff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+} };
+
+static int romsignature(const unsigned char *x)
+{
+ unsigned short sig;
+ int ret = 0;
+ if (probe_kernel_address((const unsigned short *)x, sig) == 0)
+ ret = (sig == 0xaa55);
+ return ret;
+}
+
+static int __init romchecksum(unsigned char *rom, unsigned long length)
+{
+ unsigned char *p, sum = 0;
+
+ for (p = rom; p < rom + length; p++)
+ sum += *p;
+ return sum == 0;
+}
+
+static void __init probe_roms(void)
+{
+ unsigned long start, length, upper;
+ unsigned char *rom;
+ int i;
+
+ /* video rom */
+ upper = adapter_rom_resources[0].start;
+ for (start = video_rom_resource.start; start < upper; start += 2048) {
+ rom = isa_bus_to_virt(start);
+ if (!romsignature(rom))
+ continue;
+
+ video_rom_resource.start = start;
+
+ /* 0 < length <= 0x7f * 512, historically */
+ length = rom[2] * 512;
+
+ /* if checksum okay, trust length byte */
+ if (length && romchecksum(rom, length))
+ video_rom_resource.end = start + length - 1;
+
+ request_resource(&iomem_resource, &video_rom_resource);
+ break;
+ }
+
+ start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
+ if (start < upper)
+ start = upper;
+
+ /* system rom */
+ request_resource(&iomem_resource, &system_rom_resource);
+ upper = system_rom_resource.start;
+
+ /* check for extension rom (ignore length byte!) */
+ rom = isa_bus_to_virt(extension_rom_resource.start);
+ if (romsignature(rom)) {
+ length = extension_rom_resource.end - extension_rom_resource.start + 1;
+ if (romchecksum(rom, length)) {
+ request_resource(&iomem_resource, &extension_rom_resource);
+ upper = extension_rom_resource.start;
+ }
+ }
+
+ /* check for adapter roms on 2k boundaries */
+ for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
+ rom = isa_bus_to_virt(start);
+ if (!romsignature(rom))
+ continue;
+
+ /* 0 < length <= 0x7f * 512, historically */
+ length = rom[2] * 512;
+
+ /* but accept any length that fits if checksum okay */
+ if (!length || start + length > upper || !romchecksum(rom, length))
+ continue;
+
+ adapter_rom_resources[i].start = start;
+ adapter_rom_resources[i].end = start + length - 1;
+ request_resource(&iomem_resource, &adapter_rom_resources[i]);
+
+ start = adapter_rom_resources[i++].end & ~2047UL;
+ }
+}
+
+/*
+ * Request address space for all standard RAM and ROM resources
+ * and also for regions reported as reserved by the e820.
+ */
+static void __init
+legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
+{
+ int i;
+
+ probe_roms();
+ for (i = 0; i < e820.nr_map; i++) {
+ struct resource *res;
+#ifndef CONFIG_RESOURCES_64BIT
+ if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
+ continue;
+#endif
+ res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
+ switch (e820.map[i].type) {
+ case E820_RAM: res->name = "System RAM"; break;
+ case E820_ACPI: res->name = "ACPI Tables"; break;
+ case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
+ default: res->name = "reserved";
+ }
+ res->start = e820.map[i].addr;
+ res->end = res->start + e820.map[i].size - 1;
+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ if (request_resource(&iomem_resource, res)) {
+ kfree(res);
+ continue;
+ }
+ if (e820.map[i].type == E820_RAM) {
+ /*
+ * We don't know which RAM region contains kernel data,
+ * so we try it repeatedly and let the resource manager
+ * test it.
+ */
+ request_resource(res, code_resource);
+ request_resource(res, data_resource);
+#ifdef CONFIG_KEXEC
+ request_resource(res, &crashk_res);
+#endif
+ }
+ }
+}
+
+/*
+ * Request address space for all standard resources
+ *
+ * This is called just before pcibios_init(), which is also a
+ * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
+ */
+static int __init request_standard_resources(void)
+{
+ int i;
+
+ printk("Setting up standard PCI resources\n");
+ if (efi_enabled)
+ efi_initialize_iomem_resources(&code_resource, &data_resource);
+ else
+ legacy_init_iomem_resources(&code_resource, &data_resource);
+
+ /* EFI systems may still have VGA */
+ request_resource(&iomem_resource, &video_ram_resource);
+
+ /* request I/O space for devices used on all i[345]86 PCs */
+ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+ request_resource(&ioport_resource, &standard_io_resources[i]);
+ return 0;
+}
+
+subsys_initcall(request_standard_resources);
+
+void __init add_memory_region(unsigned long long start,
+ unsigned long long size, int type)
+{
+ int x;
+
+ if (!efi_enabled) {
+ x = e820.nr_map;
+
+ if (x == E820MAX) {
+ printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+ return;
+ }
+
+ e820.map[x].addr = start;
+ e820.map[x].size = size;
+ e820.map[x].type = type;
+ e820.nr_map++;
+ }
+} /* add_memory_region */
+
+/*
+ * Sanitize the BIOS e820 map.
+ *
+ * Some e820 responses include overlapping entries. The following
+ * replaces the original e820 map with a new one, removing overlaps.
+ *
+ */
+int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
+{
+ struct change_member *change_tmp;
+ unsigned long current_type, last_type;
+ unsigned long long last_addr;
+ int chgidx, still_changing;
+ int overlap_entries;
+ int new_bios_entry;
+ int old_nr, new_nr, chg_nr;
+ int i;
+
+ /*
+ Visually we're performing the following (1,2,3,4 = memory types)...
+
+ Sample memory map (w/overlaps):
+ ____22__________________
+ ______________________4_
+ ____1111________________
+ _44_____________________
+ 11111111________________
+ ____________________33__
+ ___________44___________
+ __________33333_________
+ ______________22________
+ ___________________2222_
+ _________111111111______
+ _____________________11_
+ _________________4______
+
+ Sanitized equivalent (no overlap):
+ 1_______________________
+ _44_____________________
+ ___1____________________
+ ____22__________________
+ ______11________________
+ _________1______________
+ __________3_____________
+ ___________44___________
+ _____________33_________
+ _______________2________
+ ________________1_______
+ _________________4______
+ ___________________2____
+ ____________________33__
+ ______________________4_
+ */
+ printk("sanitize start\n");
+ /* if there's only one memory region, don't bother */
+ if (*pnr_map < 2) {
+ printk("sanitize bail 0\n");
+ return -1;
+ }
+
+ old_nr = *pnr_map;
+
+ /* bail out if we find any unreasonable addresses in bios map */
+ for (i=0; i<old_nr; i++)
+ if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
+ printk("sanitize bail 1\n");
+ return -1;
+ }
+
+ /* create pointers for initial change-point information (for sorting) */
+ for (i=0; i < 2*old_nr; i++)
+ change_point[i] = &change_point_list[i];
+
+ /* record all known change-points (starting and ending addresses),
+ omitting those that are for empty memory regions */
+ chgidx = 0;
+ for (i=0; i < old_nr; i++) {
+ if (biosmap[i].size != 0) {
+ change_point[chgidx]->addr = biosmap[i].addr;
+ change_point[chgidx++]->pbios = &biosmap[i];
+ change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
+ change_point[chgidx++]->pbios = &biosmap[i];
+ }
+ }
+ chg_nr = chgidx; /* true number of change-points */
+
+ /* sort change-point list by memory addresses (low -> high) */
+ still_changing = 1;
+ while (still_changing) {
+ still_changing = 0;
+ for (i=1; i < chg_nr; i++) {
+ /* if <current_addr> > <last_addr>, swap */
+ /* or, if current=<start_addr> & last=<end_addr>, swap */
+ if ((change_point[i]->addr < change_point[i-1]->addr) ||
+ ((change_point[i]->addr == change_point[i-1]->addr) &&
+ (change_point[i]->addr == change_point[i]->pbios->addr) &&
+ (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
+ )
+ {
+ change_tmp = change_point[i];
+ change_point[i] = change_point[i-1];
+ change_point[i-1] = change_tmp;
+ still_changing=1;
+ }
+ }
+ }
+
+ /* create a new bios memory map, removing overlaps */
+ overlap_entries=0; /* number of entries in the overlap table */
+ new_bios_entry=0; /* index for creating new bios map entries */
+ last_type = 0; /* start with undefined memory type */
+ last_addr = 0; /* start with 0 as last starting address */
+ /* loop through change-points, determining affect on the new bios map */
+ for (chgidx=0; chgidx < chg_nr; chgidx++)
+ {
+ /* keep track of all overlapping bios entries */
+ if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
+ {
+ /* add map entry to overlap list (> 1 entry implies an overlap) */
+ overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
+ }
+ else
+ {
+ /* remove entry from list (order independent, so swap with last) */
+ for (i=0; i<overlap_entries; i++)
+ {
+ if (overlap_list[i] == change_point[chgidx]->pbios)
+ overlap_list[i] = overlap_list[overlap_entries-1];
+ }
+ overlap_entries--;
+ }
+ /* if there are overlapping entries, decide which "type" to use */
+ /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
+ current_type = 0;
+ for (i=0; i<overlap_entries; i++)
+ if (overlap_list[i]->type > current_type)
+ current_type = overlap_list[i]->type;
+ /* continue building up new bios map based on this information */
+ if (current_type != last_type) {
+ if (last_type != 0) {
+ new_bios[new_bios_entry].size =
+ change_point[chgidx]->addr - last_addr;
+ /* move forward only if the new size was non-zero */
+ if (new_bios[new_bios_entry].size != 0)
+ if (++new_bios_entry >= E820MAX)
+ break; /* no more space left for new bios entries */
+ }
+ if (current_type != 0) {
+ new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
+ new_bios[new_bios_entry].type = current_type;
+ last_addr=change_point[chgidx]->addr;
+ }
+ last_type = current_type;
+ }
+ }
+ new_nr = new_bios_entry; /* retain count for new bios entries */
+
+ /* copy new bios mapping into original location */
+ memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
+ *pnr_map = new_nr;
+
+ printk("sanitize end\n");
+ return 0;
+}
+
+/*
+ * Copy the BIOS e820 map into a safe place.
+ *
+ * Sanity-check it while we're at it..
+ *
+ * If we're lucky and live on a modern system, the setup code
+ * will have given us a memory map that we can use to properly
+ * set up memory. If we aren't, we'll fake a memory map.
+ *
+ * We check to see that the memory map contains at least 2 elements
+ * before we'll use it, because the detection code in setup.S may
+ * not be perfect and most every PC known to man has two memory
+ * regions: one from 0 to 640k, and one from 1mb up. (The IBM
+ * thinkpad 560x, for example, does not cooperate with the memory
+ * detection code.)
+ */
+int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
+{
+ /* Only one memory region (or negative)? Ignore it */
+ if (nr_map < 2)
+ return -1;
+
+ do {
+ unsigned long long start = biosmap->addr;
+ unsigned long long size = biosmap->size;
+ unsigned long long end = start + size;
+ unsigned long type = biosmap->type;
+ printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type);
+
+ /* Overflow in 64 bits? Ignore the memory map. */
+ if (start > end)
+ return -1;
+
+ /*
+ * Some BIOSes claim RAM in the 640k - 1M region.
+ * Not right. Fix it up.
+ */
+ if (type == E820_RAM) {
+ printk("copy_e820_map() type is E820_RAM\n");
+ if (start < 0x100000ULL && end > 0xA0000ULL) {
+ printk("copy_e820_map() lies in range...\n");
+ if (start < 0xA0000ULL) {
+ printk("copy_e820_map() start < 0xA0000ULL\n");
+ add_memory_region(start, 0xA0000ULL-start, type);
+ }
+ if (end <= 0x100000ULL) {
+ printk("copy_e820_map() end <= 0x100000ULL\n");
+ continue;
+ }
+ start = 0x100000ULL;
+ size = end - start;
+ }
+ }
+ add_memory_region(start, size, type);
+ } while (biosmap++,--nr_map);
+ return 0;
+}
+
+/*
+ * Callback for efi_memory_walk.
+ */
+static int __init
+efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
+{
+ unsigned long *max_pfn = arg, pfn;
+
+ if (start < end) {
+ pfn = PFN_UP(end -1);
+ if (pfn > *max_pfn)
+ *max_pfn = pfn;
+ }
+ return 0;
+}
+
+static int __init
+efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
+{
+ memory_present(0, PFN_UP(start), PFN_DOWN(end));
+ return 0;
+}
+
+/*
+ * Find the highest page frame number we have available
+ */
+void __init find_max_pfn(void)
+{
+ int i;
+
+ max_pfn = 0;
+ if (efi_enabled) {
+ efi_memmap_walk(efi_find_max_pfn, &max_pfn);
+ efi_memmap_walk(efi_memory_present_wrapper, NULL);
+ return;
+ }
+
+ for (i = 0; i < e820.nr_map; i++) {
+ unsigned long start, end;
+ /* RAM? */
+ if (e820.map[i].type != E820_RAM)
+ continue;
+ start = PFN_UP(e820.map[i].addr);
+ end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
+ if (start >= end)
+ continue;
+ if (end > max_pfn)
+ max_pfn = end;
+ memory_present(0, start, end);
+ }
+}
+
+/*
+ * Free all available memory for boot time allocation. Used
+ * as a callback function by efi_memory_walk()
+ */
+
+static int __init
+free_available_memory(unsigned long start, unsigned long end, void *arg)
+{
+ /* check max_low_pfn */
+ if (start >= (max_low_pfn << PAGE_SHIFT))
+ return 0;
+ if (end >= (max_low_pfn << PAGE_SHIFT))
+ end = max_low_pfn << PAGE_SHIFT;
+ if (start < end)
+ free_bootmem(start, end - start);
+
+ return 0;
+}
+/*
+ * Register fully available low RAM pages with the bootmem allocator.
+ */
+void __init register_bootmem_low_pages(unsigned long max_low_pfn)
+{
+ int i;
+
+ if (efi_enabled) {
+ efi_memmap_walk(free_available_memory, NULL);
+ return;
+ }
+ for (i = 0; i < e820.nr_map; i++) {
+ unsigned long curr_pfn, last_pfn, size;
+ /*
+ * Reserve usable low memory
+ */
+ if (e820.map[i].type != E820_RAM)
+ continue;
+ /*
+ * We are rounding up the start address of usable memory:
+ */
+ curr_pfn = PFN_UP(e820.map[i].addr);
+ if (curr_pfn >= max_low_pfn)
+ continue;
+ /*
+ * ... and at the end of the usable range downwards:
+ */
+ last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
+
+ if (last_pfn > max_low_pfn)
+ last_pfn = max_low_pfn;
+
+ /*
+ * .. finally, did all the rounding and playing
+ * around just make the area go away?
+ */
+ if (last_pfn <= curr_pfn)
+ continue;
+
+ size = last_pfn - curr_pfn;
+ free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
+ }
+}
+
+void __init register_memory(void)
+{
+ unsigned long gapstart, gapsize, round;
+ unsigned long long last;
+ int i;
+
+ /*
+ * Search for the bigest gap in the low 32 bits of the e820
+ * memory space.
+ */
+ last = 0x100000000ull;
+ gapstart = 0x10000000;
+ gapsize = 0x400000;
+ i = e820.nr_map;
+ while (--i >= 0) {
+ unsigned long long start = e820.map[i].addr;
+ unsigned long long end = start + e820.map[i].size;
+
+ /*
+ * Since "last" is at most 4GB, we know we'll
+ * fit in 32 bits if this condition is true
+ */
+ if (last > end) {
+ unsigned long gap = last - end;
+
+ if (gap > gapsize) {
+ gapsize = gap;
+ gapstart = end;
+ }
+ }
+ if (start < last)
+ last = start;
+ }
+
+ /*
+ * See how much we want to round up: start off with
+ * rounding to the next 1MB area.
+ */
+ round = 0x100000;
+ while ((gapsize >> 4) > round)
+ round += round;
+ /* Fun with two's complement */
+ pci_mem_start = (gapstart + round) & -round;
+
+ printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
+ pci_mem_start, gapstart, gapsize);
+}
+
+void __init print_memory_map(char *who)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ printk(" %s: %016Lx - %016Lx ", who,
+ e820.map[i].addr,
+ e820.map[i].addr + e820.map[i].size);
+ switch (e820.map[i].type) {
+ case E820_RAM: printk("(usable)\n");
+ break;
+ case E820_RESERVED:
+ printk("(reserved)\n");
+ break;
+ case E820_ACPI:
+ printk("(ACPI data)\n");
+ break;
+ case E820_NVS:
+ printk("(ACPI NVS)\n");
+ break;
+ default: printk("type %lu\n", e820.map[i].type);
+ break;
+ }
+ }
+}
+
+static __init __always_inline void efi_limit_regions(unsigned long long size)
+{
+ unsigned long long current_addr = 0;
+ efi_memory_desc_t *md, *next_md;
+ void *p, *p1;
+ int i, j;
+
+ j = 0;
+ p1 = memmap.map;
+ for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
+ md = p;
+ next_md = p1;
+ current_addr = md->phys_addr +
+ PFN_PHYS(md->num_pages);
+ if (is_available_memory(md)) {
+ if (md->phys_addr >= size) continue;
+ memcpy(next_md, md, memmap.desc_size);
+ if (current_addr >= size) {
+ next_md->num_pages -=
+ PFN_UP(current_addr-size);
+ }
+ p1 += memmap.desc_size;
+ next_md = p1;
+ j++;
+ } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
+ EFI_MEMORY_RUNTIME) {
+ /* In order to make runtime services
+ * available we have to include runtime
+ * memory regions in memory map */
+ memcpy(next_md, md, memmap.desc_size);
+ p1 += memmap.desc_size;
+ next_md = p1;
+ j++;
+ }
+ }
+ memmap.nr_map = j;
+ memmap.map_end = memmap.map +
+ (memmap.nr_map * memmap.desc_size);
+}
+
+void __init limit_regions(unsigned long long size)
+{
+ unsigned long long current_addr;
+ int i;
+
+ print_memory_map("limit_regions start");
+ if (efi_enabled) {
+ efi_limit_regions(size);
+ return;
+ }
+ for (i = 0; i < e820.nr_map; i++) {
+ current_addr = e820.map[i].addr + e820.map[i].size;
+ if (current_addr < size)
+ continue;
+
+ if (e820.map[i].type != E820_RAM)
+ continue;
+
+ if (e820.map[i].addr >= size) {
+ /*
+ * This region starts past the end of the
+ * requested size, skip it completely.
+ */
+ e820.nr_map = i;
+ } else {
+ e820.nr_map = i + 1;
+ e820.map[i].size -= current_addr - size;
+ }
+ print_memory_map("limit_regions endfor");
+ return;
+ }
+ print_memory_map("limit_regions endfunc");
+}
+
+ /*
+ * This function checks if the entire range <start,end> is mapped with type.
+ *
+ * Note: this function only works correct if the e820 table is sorted and
+ * not-overlapping, which is the case
+ */
+int __init
+e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
+{
+ u64 start = s;
+ u64 end = e;
+ int i;
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ if (type && ei->type != type)
+ continue;
+ /* is the region (part) in overlap with the current region ?*/
+ if (ei->addr >= end || ei->addr + ei->size <= start)
+ continue;
+ /* if the region is at the beginning of <start,end> we move
+ * start to the end of the region since it's ok until there
+ */
+ if (ei->addr <= start)
+ start = ei->addr + ei->size;
+ /* if start is now at or beyond end, we're done, full
+ * coverage */
+ if (start >= end)
+ return 1; /* we're done */
+ }
+ return 0;
+}
+
+static int __init parse_memmap(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (strcmp(arg, "exactmap") == 0) {
+#ifdef CONFIG_CRASH_DUMP
+ /* If we are doing a crash dump, we
+ * still need to know the real mem
+ * size before original memory map is
+ * reset.
+ */
+ find_max_pfn();
+ saved_max_pfn = max_pfn;
+#endif
+ e820.nr_map = 0;
+ user_defined_memmap = 1;
+ } else {
+ /* If the user specifies memory size, we
+ * limit the BIOS-provided memory map to
+ * that size. exactmap can be used to specify
+ * the exact map. mem=number can be used to
+ * trim the existing memory map.
+ */
+ unsigned long long start_at, mem_size;
+
+ mem_size = memparse(arg, &arg);
+ if (*arg == '@') {
+ start_at = memparse(arg+1, &arg);
+ add_memory_region(start_at, mem_size, E820_RAM);
+ } else if (*arg == '#') {
+ start_at = memparse(arg+1, &arg);
+ add_memory_region(start_at, mem_size, E820_ACPI);
+ } else if (*arg == '$') {
+ start_at = memparse(arg+1, &arg);
+ add_memory_region(start_at, mem_size, E820_RESERVED);
+ } else {
+ limit_regions(mem_size);
+ user_defined_memmap = 1;
+ }
+ }
+ return 0;
+}
+early_param("memmap", parse_memmap);
diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c
index 8b40648d0ef0..b92c7f0a358a 100644
--- a/arch/i386/kernel/efi.c
+++ b/arch/i386/kernel/efi.c
@@ -194,17 +194,24 @@ inline int efi_set_rtc_mmss(unsigned long nowtime)
return 0;
}
/*
- * This should only be used during kernel init and before runtime
- * services have been remapped, therefore, we'll need to call in physical
- * mode. Note, this call isn't used later, so mark it __init.
+ * This is used during kernel init before runtime
+ * services have been remapped and also during suspend, therefore,
+ * we'll need to call both in physical and virtual modes.
*/
-inline unsigned long __init efi_get_time(void)
+inline unsigned long efi_get_time(void)
{
efi_status_t status;
efi_time_t eft;
efi_time_cap_t cap;
- status = phys_efi_get_time(&eft, &cap);
+ if (efi.get_time) {
+ /* if we are in virtual mode use remapped function */
+ status = efi.get_time(&eft, &cap);
+ } else {
+ /* we are in physical mode */
+ status = phys_efi_get_time(&eft, &cap);
+ }
+
if (status != EFI_SUCCESS)
printk("Oops: efitime: can't read time status: 0x%lx\n",status);
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 5a63d6fdb70e..de34b7fed3c1 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -30,12 +30,13 @@
* 18(%esp) - %eax
* 1C(%esp) - %ds
* 20(%esp) - %es
- * 24(%esp) - orig_eax
- * 28(%esp) - %eip
- * 2C(%esp) - %cs
- * 30(%esp) - %eflags
- * 34(%esp) - %oldesp
- * 38(%esp) - %oldss
+ * 24(%esp) - %gs
+ * 28(%esp) - orig_eax
+ * 2C(%esp) - %eip
+ * 30(%esp) - %cs
+ * 34(%esp) - %eflags
+ * 38(%esp) - %oldesp
+ * 3C(%esp) - %oldss
*
* "current" is in register %ebx during any slow entries.
*/
@@ -48,26 +49,24 @@
#include <asm/smp.h>
#include <asm/page.h>
#include <asm/desc.h>
+#include <asm/percpu.h>
#include <asm/dwarf2.h>
#include "irq_vectors.h"
-#define nr_syscalls ((syscall_table_size)/4)
+/*
+ * We use macros for low-level operations which need to be overridden
+ * for paravirtualization. The following will never clobber any registers:
+ * INTERRUPT_RETURN (aka. "iret")
+ * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
+ * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
+ *
+ * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
+ * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
+ * Allowing a register to be clobbered can shrink the paravirt replacement
+ * enough to patch inline, increasing performance.
+ */
-EBX = 0x00
-ECX = 0x04
-EDX = 0x08
-ESI = 0x0C
-EDI = 0x10
-EBP = 0x14
-EAX = 0x18
-DS = 0x1C
-ES = 0x20
-ORIG_EAX = 0x24
-EIP = 0x28
-CS = 0x2C
-EFLAGS = 0x30
-OLDESP = 0x34
-OLDSS = 0x38
+#define nr_syscalls ((syscall_table_size)/4)
CF_MASK = 0x00000001
TF_MASK = 0x00000100
@@ -76,23 +75,16 @@ DF_MASK = 0x00000400
NT_MASK = 0x00004000
VM_MASK = 0x00020000
-/* These are replaces for paravirtualization */
-#define DISABLE_INTERRUPTS cli
-#define ENABLE_INTERRUPTS sti
-#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
-#define INTERRUPT_RETURN iret
-#define GET_CR0_INTO_EAX movl %cr0, %eax
-
#ifdef CONFIG_PREEMPT
-#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF
+#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
#else
-#define preempt_stop
+#define preempt_stop(clobbers)
#define resume_kernel restore_nocheck
#endif
.macro TRACE_IRQS_IRET
#ifdef CONFIG_TRACE_IRQFLAGS
- testl $IF_MASK,EFLAGS(%esp) # interrupts off?
+ testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off?
jz 1f
TRACE_IRQS_ON
1:
@@ -107,6 +99,9 @@ VM_MASK = 0x00020000
#define SAVE_ALL \
cld; \
+ pushl %gs; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ /*CFI_REL_OFFSET gs, 0;*/\
pushl %es; \
CFI_ADJUST_CFA_OFFSET 4;\
/*CFI_REL_OFFSET es, 0;*/\
@@ -136,7 +131,9 @@ VM_MASK = 0x00020000
CFI_REL_OFFSET ebx, 0;\
movl $(__USER_DS), %edx; \
movl %edx, %ds; \
- movl %edx, %es;
+ movl %edx, %es; \
+ movl $(__KERNEL_PDA), %edx; \
+ movl %edx, %gs
#define RESTORE_INT_REGS \
popl %ebx; \
@@ -169,17 +166,22 @@ VM_MASK = 0x00020000
2: popl %es; \
CFI_ADJUST_CFA_OFFSET -4;\
/*CFI_RESTORE es;*/\
-.section .fixup,"ax"; \
-3: movl $0,(%esp); \
- jmp 1b; \
+3: popl %gs; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ /*CFI_RESTORE gs;*/\
+.pushsection .fixup,"ax"; \
4: movl $0,(%esp); \
+ jmp 1b; \
+5: movl $0,(%esp); \
jmp 2b; \
-.previous; \
+6: movl $0,(%esp); \
+ jmp 3b; \
.section __ex_table,"a";\
.align 4; \
- .long 1b,3b; \
- .long 2b,4b; \
-.previous
+ .long 1b,4b; \
+ .long 2b,5b; \
+ .long 3b,6b; \
+.popsection
#define RING0_INT_FRAME \
CFI_STARTPROC simple;\
@@ -198,18 +200,18 @@ VM_MASK = 0x00020000
#define RING0_PTREGS_FRAME \
CFI_STARTPROC simple;\
CFI_SIGNAL_FRAME;\
- CFI_DEF_CFA esp, OLDESP-EBX;\
- /*CFI_OFFSET cs, CS-OLDESP;*/\
- CFI_OFFSET eip, EIP-OLDESP;\
- /*CFI_OFFSET es, ES-OLDESP;*/\
- /*CFI_OFFSET ds, DS-OLDESP;*/\
- CFI_OFFSET eax, EAX-OLDESP;\
- CFI_OFFSET ebp, EBP-OLDESP;\
- CFI_OFFSET edi, EDI-OLDESP;\
- CFI_OFFSET esi, ESI-OLDESP;\
- CFI_OFFSET edx, EDX-OLDESP;\
- CFI_OFFSET ecx, ECX-OLDESP;\
- CFI_OFFSET ebx, EBX-OLDESP
+ CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
+ /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
+ CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
+ /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
+ /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
+ CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
+ CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
+ CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
+ CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
+ CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
+ CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
+ CFI_OFFSET ebx, PT_EBX-PT_OLDESP
ENTRY(ret_from_fork)
CFI_STARTPROC
@@ -237,17 +239,18 @@ ENTRY(ret_from_fork)
ALIGN
RING0_PTREGS_FRAME
ret_from_exception:
- preempt_stop
+ preempt_stop(CLBR_ANY)
ret_from_intr:
GET_THREAD_INFO(%ebp)
check_userspace:
- movl EFLAGS(%esp), %eax # mix EFLAGS and CS
- movb CS(%esp), %al
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
+ movb PT_CS(%esp), %al
andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
cmpl $USER_RPL, %eax
jb resume_kernel # not returning to v8086 or userspace
+
ENTRY(resume_userspace)
- DISABLE_INTERRUPTS # make sure we don't miss an interrupt
+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
movl TI_flags(%ebp), %ecx
@@ -258,14 +261,14 @@ ENTRY(resume_userspace)
#ifdef CONFIG_PREEMPT
ENTRY(resume_kernel)
- DISABLE_INTERRUPTS
+ DISABLE_INTERRUPTS(CLBR_ANY)
cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
jnz restore_nocheck
need_resched:
movl TI_flags(%ebp), %ecx # need_resched set ?
testb $_TIF_NEED_RESCHED, %cl
jz restore_all
- testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ?
+ testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ?
jz restore_all
call preempt_schedule_irq
jmp need_resched
@@ -287,7 +290,7 @@ sysenter_past_esp:
* No need to follow this irqs on/off section: the syscall
* disabled irqs and here we enable it straight after entry:
*/
- ENABLE_INTERRUPTS
+ ENABLE_INTERRUPTS(CLBR_NONE)
pushl $(__USER_DS)
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET ss, 0*/
@@ -331,20 +334,27 @@ sysenter_past_esp:
cmpl $(nr_syscalls), %eax
jae syscall_badsys
call *sys_call_table(,%eax,4)
- movl %eax,EAX(%esp)
- DISABLE_INTERRUPTS
+ movl %eax,PT_EAX(%esp)
+ DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX)
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
testw $_TIF_ALLWORK_MASK, %cx
jne syscall_exit_work
/* if something modifies registers it must also disable sysexit */
- movl EIP(%esp), %edx
- movl OLDESP(%esp), %ecx
+ movl PT_EIP(%esp), %edx
+ movl PT_OLDESP(%esp), %ecx
xorl %ebp,%ebp
TRACE_IRQS_ON
+1: mov PT_GS(%esp), %gs
ENABLE_INTERRUPTS_SYSEXIT
CFI_ENDPROC
-
+.pushsection .fixup,"ax"
+2: movl $0,PT_GS(%esp)
+ jmp 1b
+.section __ex_table,"a"
+ .align 4
+ .long 1b,2b
+.popsection
# system call handler stub
ENTRY(system_call)
@@ -353,7 +363,7 @@ ENTRY(system_call)
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
GET_THREAD_INFO(%ebp)
- testl $TF_MASK,EFLAGS(%esp)
+ testl $TF_MASK,PT_EFLAGS(%esp)
jz no_singlestep
orl $_TIF_SINGLESTEP,TI_flags(%ebp)
no_singlestep:
@@ -365,9 +375,9 @@ no_singlestep:
jae syscall_badsys
syscall_call:
call *sys_call_table(,%eax,4)
- movl %eax,EAX(%esp) # store the return value
+ movl %eax,PT_EAX(%esp) # store the return value
syscall_exit:
- DISABLE_INTERRUPTS # make sure we don't miss an interrupt
+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
TRACE_IRQS_OFF
@@ -376,12 +386,12 @@ syscall_exit:
jne syscall_exit_work
restore_all:
- movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
- # Warning: OLDSS(%esp) contains the wrong/random values if we
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
+ # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
# are returning to the kernel.
# See comments in process.c:copy_thread() for details.
- movb OLDSS(%esp), %ah
- movb CS(%esp), %al
+ movb PT_OLDSS(%esp), %ah
+ movb PT_CS(%esp), %al
andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
CFI_REMEMBER_STATE
@@ -390,13 +400,13 @@ restore_nocheck:
TRACE_IRQS_IRET
restore_nocheck_notrace:
RESTORE_REGS
- addl $4, %esp
+ addl $4, %esp # skip orig_eax/error_code
CFI_ADJUST_CFA_OFFSET -4
1: INTERRUPT_RETURN
.section .fixup,"ax"
iret_exc:
TRACE_IRQS_ON
- ENABLE_INTERRUPTS
+ ENABLE_INTERRUPTS(CLBR_NONE)
pushl $0 # no error code
pushl $do_iret_error
jmp error_code
@@ -408,33 +418,42 @@ iret_exc:
CFI_RESTORE_STATE
ldt_ss:
- larl OLDSS(%esp), %eax
+ larl PT_OLDSS(%esp), %eax
jnz restore_nocheck
testl $0x00400000, %eax # returning to 32bit stack?
jnz restore_nocheck # allright, normal return
+
+#ifdef CONFIG_PARAVIRT
+ /*
+ * The kernel can't run on a non-flat stack if paravirt mode
+ * is active. Rather than try to fixup the high bits of
+ * ESP, bypass this code entirely. This may break DOSemu
+ * and/or Wine support in a paravirt VM, although the option
+ * is still available to implement the setting of the high
+ * 16-bits in the INTERRUPT_RETURN paravirt-op.
+ */
+ cmpl $0, paravirt_ops+PARAVIRT_enabled
+ jne restore_nocheck
+#endif
+
/* If returning to userspace with 16bit stack,
* try to fix the higher word of ESP, as the CPU
* won't restore it.
* This is an "official" bug of all the x86-compatible
* CPUs, which we can try to work around to make
* dosemu and wine happy. */
- subl $8, %esp # reserve space for switch16 pointer
- CFI_ADJUST_CFA_OFFSET 8
- DISABLE_INTERRUPTS
+ movl PT_OLDESP(%esp), %eax
+ movl %esp, %edx
+ call patch_espfix_desc
+ pushl $__ESPFIX_SS
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ DISABLE_INTERRUPTS(CLBR_EAX)
TRACE_IRQS_OFF
- movl %esp, %eax
- /* Set up the 16bit stack frame with switch32 pointer on top,
- * and a switch16 pointer on top of the current frame. */
- call setup_x86_bogus_stack
- CFI_ADJUST_CFA_OFFSET -8 # frame has moved
- TRACE_IRQS_IRET
- RESTORE_REGS
- lss 20+4(%esp), %esp # switch to 16bit stack
-1: INTERRUPT_RETURN
-.section __ex_table,"a"
- .align 4
- .long 1b,iret_exc
-.previous
+ lss (%esp), %esp
+ CFI_ADJUST_CFA_OFFSET -8
+ jmp restore_nocheck
CFI_ENDPROC
# perform work that needs to be done immediately before resumption
@@ -445,7 +464,7 @@ work_pending:
jz work_notifysig
work_resched:
call schedule
- DISABLE_INTERRUPTS # make sure we don't miss an interrupt
+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
TRACE_IRQS_OFF
@@ -458,7 +477,8 @@ work_resched:
work_notifysig: # deal with pending signals and
# notify-resume requests
- testl $VM_MASK, EFLAGS(%esp)
+#ifdef CONFIG_VM86
+ testl $VM_MASK, PT_EFLAGS(%esp)
movl %esp, %eax
jne work_notifysig_v86 # returning to kernel-space or
# vm86-space
@@ -468,29 +488,30 @@ work_notifysig: # deal with pending signals and
ALIGN
work_notifysig_v86:
-#ifdef CONFIG_VM86
pushl %ecx # save ti_flags for do_notify_resume
CFI_ADJUST_CFA_OFFSET 4
call save_v86_state # %eax contains pt_regs pointer
popl %ecx
CFI_ADJUST_CFA_OFFSET -4
movl %eax, %esp
+#else
+ movl %esp, %eax
+#endif
xorl %edx, %edx
call do_notify_resume
jmp resume_userspace_sig
-#endif
# perform syscall exit tracing
ALIGN
syscall_trace_entry:
- movl $-ENOSYS,EAX(%esp)
+ movl $-ENOSYS,PT_EAX(%esp)
movl %esp, %eax
xorl %edx,%edx
call do_syscall_trace
cmpl $0, %eax
jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
# so must skip actual syscall
- movl ORIG_EAX(%esp), %eax
+ movl PT_ORIG_EAX(%esp), %eax
cmpl $(nr_syscalls), %eax
jnae syscall_call
jmp syscall_exit
@@ -501,7 +522,7 @@ syscall_exit_work:
testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
jz work_pending
TRACE_IRQS_ON
- ENABLE_INTERRUPTS # could let do_syscall_trace() call
+ ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
# schedule() instead
movl %esp, %eax
movl $1, %edx
@@ -515,39 +536,38 @@ syscall_fault:
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
GET_THREAD_INFO(%ebp)
- movl $-EFAULT,EAX(%esp)
+ movl $-EFAULT,PT_EAX(%esp)
jmp resume_userspace
syscall_badsys:
- movl $-ENOSYS,EAX(%esp)
+ movl $-ENOSYS,PT_EAX(%esp)
jmp resume_userspace
CFI_ENDPROC
#define FIXUP_ESPFIX_STACK \
- movl %esp, %eax; \
- /* switch to 32bit stack using the pointer on top of 16bit stack */ \
- lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \
- /* copy data from 16bit stack to 32bit stack */ \
- call fixup_x86_bogus_stack; \
- /* put ESP to the proper location */ \
- movl %eax, %esp;
-#define UNWIND_ESPFIX_STACK \
+ /* since we are on a wrong stack, we cant make it a C code :( */ \
+ movl %gs:PDA_cpu, %ebx; \
+ PER_CPU(cpu_gdt_descr, %ebx); \
+ movl GDS_address(%ebx), %ebx; \
+ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
+ addl %esp, %eax; \
+ pushl $__KERNEL_DS; \
+ CFI_ADJUST_CFA_OFFSET 4; \
pushl %eax; \
CFI_ADJUST_CFA_OFFSET 4; \
+ lss (%esp), %esp; \
+ CFI_ADJUST_CFA_OFFSET -8;
+#define UNWIND_ESPFIX_STACK \
movl %ss, %eax; \
- /* see if on 16bit stack */ \
+ /* see if on espfix stack */ \
cmpw $__ESPFIX_SS, %ax; \
- je 28f; \
-27: popl %eax; \
- CFI_ADJUST_CFA_OFFSET -4; \
-.section .fixup,"ax"; \
-28: movl $__KERNEL_DS, %eax; \
+ jne 27f; \
+ movl $__KERNEL_DS, %eax; \
movl %eax, %ds; \
movl %eax, %es; \
- /* switch to 32bit stack */ \
+ /* switch to normal stack */ \
FIXUP_ESPFIX_STACK; \
- jmp 27b; \
-.previous
+27:;
/*
* Build the entry stubs and pointer table with
@@ -608,13 +628,16 @@ KPROBE_ENTRY(page_fault)
CFI_ADJUST_CFA_OFFSET 4
ALIGN
error_code:
+ /* the function address is in %gs's slot on the stack */
+ pushl %es
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET es, 0*/
pushl %ds
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET ds, 0*/
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET eax, 0
- xorl %eax, %eax
pushl %ebp
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET ebp, 0
@@ -627,7 +650,6 @@ error_code:
pushl %edx
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET edx, 0
- decl %eax # eax = -1
pushl %ecx
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET ecx, 0
@@ -635,18 +657,20 @@ error_code:
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET ebx, 0
cld
- pushl %es
+ pushl %gs
CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET es, 0*/
+ /*CFI_REL_OFFSET gs, 0*/
+ movl $(__KERNEL_PDA), %ecx
+ movl %ecx, %gs
UNWIND_ESPFIX_STACK
popl %ecx
CFI_ADJUST_CFA_OFFSET -4
/*CFI_REGISTER es, ecx*/
- movl ES(%esp), %edi # get the function address
- movl ORIG_EAX(%esp), %edx # get the error code
- movl %eax, ORIG_EAX(%esp)
- movl %ecx, ES(%esp)
- /*CFI_REL_OFFSET es, ES*/
+ movl PT_GS(%esp), %edi # get the function address
+ movl PT_ORIG_EAX(%esp), %edx # get the error code
+ movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
+ mov %ecx, PT_GS(%esp)
+ /*CFI_REL_OFFSET gs, ES*/
movl $(__USER_DS), %ecx
movl %ecx, %ds
movl %ecx, %es
@@ -682,7 +706,7 @@ ENTRY(device_not_available)
GET_CR0_INTO_EAX
testl $0x4, %eax # EM (math emulation bit)
jne device_not_available_emulate
- preempt_stop
+ preempt_stop(CLBR_ANY)
call math_state_restore
jmp ret_from_exception
device_not_available_emulate:
@@ -754,7 +778,7 @@ KPROBE_ENTRY(nmi)
cmpw $__ESPFIX_SS, %ax
popl %eax
CFI_ADJUST_CFA_OFFSET -4
- je nmi_16bit_stack
+ je nmi_espfix_stack
cmpl $sysenter_entry,(%esp)
je nmi_stack_fixup
pushl %eax
@@ -797,7 +821,7 @@ nmi_debug_stack_check:
FIX_STACK(24,nmi_stack_correct, 1)
jmp nmi_stack_correct
-nmi_16bit_stack:
+nmi_espfix_stack:
/* We have a RING0_INT_FRAME here.
*
* create the pointer to lss back
@@ -806,7 +830,6 @@ nmi_16bit_stack:
CFI_ADJUST_CFA_OFFSET 4
pushl %esp
CFI_ADJUST_CFA_OFFSET 4
- movzwl %sp, %esp
addw $4, (%esp)
/* copy the iret frame of 12 bytes */
.rept 3
@@ -817,11 +840,11 @@ nmi_16bit_stack:
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
FIXUP_ESPFIX_STACK # %eax == %esp
- CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved
xorl %edx,%edx # zero error code
call do_nmi
RESTORE_REGS
- lss 12+4(%esp), %esp # back to 16bit stack
+ lss 12+4(%esp), %esp # back to espfix stack
+ CFI_ADJUST_CFA_OFFSET -24
1: INTERRUPT_RETURN
CFI_ENDPROC
.section __ex_table,"a"
@@ -830,6 +853,19 @@ nmi_16bit_stack:
.previous
KPROBE_END(nmi)
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_iret)
+1: iret
+.section __ex_table,"a"
+ .align 4
+ .long 1b,iret_exc
+.previous
+
+ENTRY(native_irq_enable_sysexit)
+ sti
+ sysexit
+#endif
+
KPROBE_ENTRY(int3)
RING0_INT_FRAME
pushl $-1 # mark this as an int
@@ -949,26 +985,27 @@ ENTRY(arch_unwind_init_running)
movl 4(%esp), %edx
movl (%esp), %ecx
leal 4(%esp), %eax
- movl %ebx, EBX(%edx)
+ movl %ebx, PT_EBX(%edx)
xorl %ebx, %ebx
- movl %ebx, ECX(%edx)
- movl %ebx, EDX(%edx)
- movl %esi, ESI(%edx)
- movl %edi, EDI(%edx)
- movl %ebp, EBP(%edx)
- movl %ebx, EAX(%edx)
- movl $__USER_DS, DS(%edx)
- movl $__USER_DS, ES(%edx)
- movl %ebx, ORIG_EAX(%edx)
- movl %ecx, EIP(%edx)
+ movl %ebx, PT_ECX(%edx)
+ movl %ebx, PT_EDX(%edx)
+ movl %esi, PT_ESI(%edx)
+ movl %edi, PT_EDI(%edx)
+ movl %ebp, PT_EBP(%edx)
+ movl %ebx, PT_EAX(%edx)
+ movl $__USER_DS, PT_DS(%edx)
+ movl $__USER_DS, PT_ES(%edx)
+ movl $0, PT_GS(%edx)
+ movl %ebx, PT_ORIG_EAX(%edx)
+ movl %ecx, PT_EIP(%edx)
movl 12(%esp), %ecx
- movl $__KERNEL_CS, CS(%edx)
- movl %ebx, EFLAGS(%edx)
- movl %eax, OLDESP(%edx)
+ movl $__KERNEL_CS, PT_CS(%edx)
+ movl %ebx, PT_EFLAGS(%edx)
+ movl %eax, PT_OLDESP(%edx)
movl 8(%esp), %eax
movl %ecx, 8(%esp)
- movl EBX(%edx), %ebx
- movl $__KERNEL_DS, OLDSS(%edx)
+ movl PT_EBX(%edx), %ebx
+ movl $__KERNEL_DS, PT_OLDSS(%edx)
jmpl *%eax
CFI_ENDPROC
ENDPROC(arch_unwind_init_running)
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index ca31f18d277c..edef5084ce17 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -55,6 +55,12 @@
*/
ENTRY(startup_32)
+#ifdef CONFIG_PARAVIRT
+ movl %cs, %eax
+ testl $0x3, %eax
+ jnz startup_paravirt
+#endif
+
/*
* Set segments to known values.
*/
@@ -302,6 +308,7 @@ is386: movl $2,%ecx # set MP
movl %eax,%cr0
call check_x87
+ call setup_pda
lgdt cpu_gdt_descr
lidt idt_descr
ljmp $(__KERNEL_CS),$1f
@@ -312,10 +319,13 @@ is386: movl $2,%ecx # set MP
movl %eax,%ds
movl %eax,%es
- xorl %eax,%eax # Clear FS/GS and LDT
+ xorl %eax,%eax # Clear FS and LDT
movl %eax,%fs
- movl %eax,%gs
lldt %ax
+
+ movl $(__KERNEL_PDA),%eax
+ mov %eax,%gs
+
cld # gcc2 wants the direction flag cleared at all times
pushl $0 # fake return address for unwinder
#ifdef CONFIG_SMP
@@ -346,6 +356,23 @@ check_x87:
ret
/*
+ * Point the GDT at this CPU's PDA. On boot this will be
+ * cpu_gdt_table and boot_pda; for secondary CPUs, these will be
+ * that CPU's GDT and PDA.
+ */
+setup_pda:
+ /* get the PDA pointer */
+ movl start_pda, %eax
+
+ /* slot the PDA address into the GDT */
+ mov cpu_gdt_descr+2, %ecx
+ mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */
+ shr $16, %eax
+ mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */
+ mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */
+ ret
+
+/*
* setup_idt
*
* sets up a idt with 256 entries pointing to
@@ -465,6 +492,33 @@ ignore_int:
#endif
iret
+#ifdef CONFIG_PARAVIRT
+startup_paravirt:
+ cld
+ movl $(init_thread_union+THREAD_SIZE),%esp
+
+ /* We take pains to preserve all the regs. */
+ pushl %edx
+ pushl %ecx
+ pushl %eax
+
+ /* paravirt.o is last in link, and that probe fn never returns */
+ pushl $__start_paravirtprobe
+1:
+ movl 0(%esp), %eax
+ pushl (%eax)
+ movl 8(%esp), %eax
+ call *(%esp)
+ popl %eax
+
+ movl 4(%esp), %eax
+ movl 8(%esp), %ecx
+ movl 12(%esp), %edx
+
+ addl $4, (%esp)
+ jmp 1b
+#endif
+
/*
* Real beginning of normal "text" segment
*/
@@ -484,6 +538,8 @@ ENTRY(empty_zero_page)
* This starts the data section.
*/
.data
+ENTRY(start_pda)
+ .long boot_pda
ENTRY(stack_start)
.long init_thread_union+THREAD_SIZE
@@ -525,7 +581,7 @@ idt_descr:
# boot GDT descriptor (later on used by CPU#0):
.word 0 # 32 bit align gdt_desc.address
-cpu_gdt_descr:
+ENTRY(cpu_gdt_descr)
.word GDT_ENTRIES*8-1
.long cpu_gdt_table
@@ -584,8 +640,8 @@ ENTRY(cpu_gdt_table)
.quad 0x00009a000000ffff /* 0xc0 APM CS 16 code (16 bit) */
.quad 0x004092000000ffff /* 0xc8 APM DS data */
- .quad 0x0000920000000000 /* 0xd0 - ESPFIX 16-bit SS */
- .quad 0x0000000000000000 /* 0xd8 - unused */
+ .quad 0x00c0920000000000 /* 0xd0 - ESPFIX SS */
+ .quad 0x00cf92000000ffff /* 0xd8 - PDA */
.quad 0x0000000000000000 /* 0xe0 - unused */
.quad 0x0000000000000000 /* 0xe8 - unused */
.quad 0x0000000000000000 /* 0xf0 - unused */
diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c
index 17647a530b2f..45a8685bb60b 100644
--- a/arch/i386/kernel/hpet.c
+++ b/arch/i386/kernel/hpet.c
@@ -34,6 +34,7 @@ static int __init init_hpet_clocksource(void)
unsigned long hpet_period;
void __iomem* hpet_base;
u64 tmp;
+ int err;
if (!is_hpet_enabled())
return -ENODEV;
@@ -61,7 +62,11 @@ static int __init init_hpet_clocksource(void)
do_div(tmp, FSEC_PER_NSEC);
clocksource_hpet.mult = (u32)tmp;
- return clocksource_register(&clocksource_hpet);
+ err = clocksource_register(&clocksource_hpet);
+ if (err)
+ iounmap(hpet_base);
+
+ return err;
}
module_init(init_hpet_clocksource);
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
index 62996cd17084..c8d45821c788 100644
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -381,7 +381,10 @@ void __init init_ISA_irqs (void)
}
}
-void __init init_IRQ(void)
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+void __init native_init_IRQ(void)
{
int i;
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 3b7a63e0ed1a..e21dcde0790e 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -34,6 +34,7 @@
#include <linux/pci.h>
#include <linux/msi.h>
#include <linux/htirq.h>
+#include <linux/freezer.h>
#include <asm/io.h>
#include <asm/smp.h>
@@ -153,14 +154,20 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
* the interrupt, and we need to make sure the entry is fully populated
* before that happens.
*/
-static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+static void
+__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
- unsigned long flags;
union entry_union eu;
eu.entry = e;
- spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(apic, 0x11 + 2*pin, eu.w2);
io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+}
+
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __ioapic_write_entry(apic, pin, e);
spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -836,8 +843,7 @@ static int __init find_isa_irq_pin(int irq, int type)
if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
- mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
- mp_bus_id_to_type[lbus] == MP_BUS_NEC98
+ mp_bus_id_to_type[lbus] == MP_BUS_MCA
) &&
(mp_irqs[i].mpc_irqtype == type) &&
(mp_irqs[i].mpc_srcbusirq == irq))
@@ -856,8 +862,7 @@ static int __init find_isa_irq_apic(int irq, int type)
if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
- mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
- mp_bus_id_to_type[lbus] == MP_BUS_NEC98
+ mp_bus_id_to_type[lbus] == MP_BUS_MCA
) &&
(mp_irqs[i].mpc_irqtype == type) &&
(mp_irqs[i].mpc_srcbusirq == irq))
@@ -987,12 +992,6 @@ static int EISA_ELCR(unsigned int irq)
#define default_MCA_trigger(idx) (1)
#define default_MCA_polarity(idx) (0)
-/* NEC98 interrupts are always polarity zero edge triggered,
- * when listed as conforming in the MP table. */
-
-#define default_NEC98_trigger(idx) (0)
-#define default_NEC98_polarity(idx) (0)
-
static int __init MPBIOS_polarity(int idx)
{
int bus = mp_irqs[idx].mpc_srcbus;
@@ -1027,11 +1026,6 @@ static int __init MPBIOS_polarity(int idx)
polarity = default_MCA_polarity(idx);
break;
}
- case MP_BUS_NEC98: /* NEC 98 pin */
- {
- polarity = default_NEC98_polarity(idx);
- break;
- }
default:
{
printk(KERN_WARNING "broken BIOS!!\n");
@@ -1101,11 +1095,6 @@ static int MPBIOS_trigger(int idx)
trigger = default_MCA_trigger(idx);
break;
}
- case MP_BUS_NEC98: /* NEC 98 pin */
- {
- trigger = default_NEC98_trigger(idx);
- break;
- }
default:
{
printk(KERN_WARNING "broken BIOS!!\n");
@@ -1167,7 +1156,6 @@ static int pin_2_irq(int idx, int apic, int pin)
case MP_BUS_ISA: /* ISA pin */
case MP_BUS_EISA:
case MP_BUS_MCA:
- case MP_BUS_NEC98:
{
irq = mp_irqs[idx].mpc_srcbusirq;
break;
@@ -1235,7 +1223,7 @@ static inline int IO_APIC_irq_trigger(int irq)
}
/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
-u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
+static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
static int __assign_irq_vector(int irq)
{
@@ -1360,8 +1348,8 @@ static void __init setup_IO_APIC_irqs(void)
if (!apic && (irq < 16))
disable_8259A_irq(irq);
}
- ioapic_write_entry(apic, pin, entry);
spin_lock_irqsave(&ioapic_lock, flags);
+ __ioapic_write_entry(apic, pin, entry);
set_native_irq_info(irq, TARGET_CPUS);
spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -1926,6 +1914,15 @@ static void __init setup_ioapic_ids_from_mpc(void)
static void __init setup_ioapic_ids_from_mpc(void) { }
#endif
+static int no_timer_check __initdata;
+
+static int __init notimercheck(char *s)
+{
+ no_timer_check = 1;
+ return 1;
+}
+__setup("no_timer_check", notimercheck);
+
/*
* There is a nasty bug in some older SMP boards, their mptable lies
* about the timer IRQ. We do the following to work around the situation:
@@ -1934,10 +1931,13 @@ static void __init setup_ioapic_ids_from_mpc(void) { }
* - if this function detects that timer IRQs are defunct, then we fall
* back to ISA timer IRQs
*/
-static int __init timer_irq_works(void)
+int __init timer_irq_works(void)
{
unsigned long t1 = jiffies;
+ if (no_timer_check)
+ return 1;
+
local_irq_enable();
/* Let ten ticks pass... */
mdelay((10 * 1000) / HZ);
@@ -2161,9 +2161,15 @@ static inline void unlock_ExtINT_logic(void)
unsigned char save_control, save_freq_select;
pin = find_isa_irq_pin(8, mp_INT);
+ if (pin == -1) {
+ WARN_ON_ONCE(1);
+ return;
+ }
apic = find_isa_irq_apic(8, mp_INT);
- if (pin == -1)
+ if (apic == -1) {
+ WARN_ON_ONCE(1);
return;
+ }
entry0 = ioapic_read_entry(apic, pin);
clear_IO_APIC_pin(apic, pin);
@@ -2208,7 +2214,7 @@ int timer_uses_ioapic_pin_0;
* is so screwy. Thanks to Brian Perkins for testing/hacking this beast
* fanatically on his truly buggy board.
*/
-static inline void check_timer(void)
+static inline void __init check_timer(void)
{
int apic1, pin1, apic2, pin2;
int vector;
@@ -2856,8 +2862,8 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
if (!ioapic && (irq < 16))
disable_8259A_irq(irq);
- ioapic_write_entry(ioapic, pin, entry);
spin_lock_irqsave(&ioapic_lock, flags);
+ __ioapic_write_entry(ioapic, pin, entry);
set_native_irq_info(irq, TARGET_CPUS);
spin_unlock_irqrestore(&ioapic_lock, flags);
diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index fc79e1e859c4..af1d53344993 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -184,7 +184,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
void __kprobes arch_remove_kprobe(struct kprobe *p)
{
mutex_lock(&kprobe_mutex);
- free_insn_slot(p->ainsn.insn);
+ free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
mutex_unlock(&kprobe_mutex);
}
@@ -333,7 +333,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
return 1;
ss_probe:
-#ifndef CONFIG_PREEMPT
+#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
if (p->ainsn.boostable == 1 && !p->post_handler){
/* Boost up -- we can execute copied instructions directly */
reset_current_kprobe();
diff --git a/arch/i386/kernel/ldt.c b/arch/i386/kernel/ldt.c
index 445211eb2d57..b410e5fb034f 100644
--- a/arch/i386/kernel/ldt.c
+++ b/arch/i386/kernel/ldt.c
@@ -160,16 +160,14 @@ static int read_default_ldt(void __user * ptr, unsigned long bytecount)
{
int err;
unsigned long size;
- void *address;
err = 0;
- address = &default_ldt[0];
size = 5*sizeof(struct desc_struct);
if (size > bytecount)
size = bytecount;
err = size;
- if (copy_to_user(ptr, address, size))
+ if (clear_user(ptr, size))
err = -EFAULT;
return err;
diff --git a/arch/i386/kernel/mca.c b/arch/i386/kernel/mca.c
index eb57a851789d..b83672b89527 100644
--- a/arch/i386/kernel/mca.c
+++ b/arch/i386/kernel/mca.c
@@ -283,10 +283,9 @@ static int __init mca_init(void)
bus->f.mca_transform_memory = mca_dummy_transform_memory;
/* get the motherboard device */
- mca_dev = kmalloc(sizeof(struct mca_device), GFP_KERNEL);
+ mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL);
if(unlikely(!mca_dev))
goto out_nomem;
- memset(mca_dev, 0, sizeof(struct mca_device));
/*
* We do not expect many MCA interrupts during initialization,
@@ -310,11 +309,9 @@ static int __init mca_init(void)
mca_dev->slot = MCA_MOTHERBOARD;
mca_register_device(MCA_PRIMARY_BUS, mca_dev);
- mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC);
+ mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
if(unlikely(!mca_dev))
goto out_unlock_nomem;
- memset(mca_dev, 0, sizeof(struct mca_device));
-
/* Put motherboard into video setup mode, read integrated video
* POS registers, and turn motherboard setup off.
@@ -349,10 +346,9 @@ static int __init mca_init(void)
}
if(which_scsi) {
/* found a scsi card */
- mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC);
+ mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
if(unlikely(!mca_dev))
goto out_unlock_nomem;
- memset(mca_dev, 0, sizeof(struct mca_device));
for(j = 0; j < 8; j++)
mca_dev->pos[j] = pos[j];
@@ -378,10 +374,9 @@ static int __init mca_init(void)
if(!mca_read_and_store_pos(pos))
continue;
- mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC);
+ mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
if(unlikely(!mca_dev))
goto out_unlock_nomem;
- memset(mca_dev, 0, sizeof(struct mca_device));
for(j=0; j<8; j++)
mca_dev->pos[j]=pos[j];
diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c
index 23f5984d0654..972346604f9d 100644
--- a/arch/i386/kernel/microcode.c
+++ b/arch/i386/kernel/microcode.c
@@ -703,7 +703,6 @@ static struct sysdev_driver mc_sysdev_driver = {
.resume = mc_sysdev_resume,
};
-#ifdef CONFIG_HOTPLUG_CPU
static __cpuinit int
mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
{
@@ -726,7 +725,6 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
static struct notifier_block mc_cpu_notifier = {
.notifier_call = mc_cpu_callback,
};
-#endif
static int __init microcode_init (void)
{
diff --git a/arch/i386/kernel/module.c b/arch/i386/kernel/module.c
index 470cf97e7cd3..d7d9c8b23f72 100644
--- a/arch/i386/kernel/module.c
+++ b/arch/i386/kernel/module.c
@@ -108,7 +108,8 @@ int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
{
- const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL;
+ const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
+ *para = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -118,6 +119,8 @@ int module_finalize(const Elf_Ehdr *hdr,
alt = s;
if (!strcmp(".smp_locks", secstrings + s->sh_name))
locks= s;
+ if (!strcmp(".parainstructions", secstrings + s->sh_name))
+ para = s;
}
if (alt) {
@@ -132,6 +135,12 @@ int module_finalize(const Elf_Ehdr *hdr,
lseg, lseg + locks->sh_size,
tseg, tseg + text->sh_size);
}
+
+ if (para) {
+ void *pseg = (void *)para->sh_addr;
+ apply_paravirt(pseg, pseg + para->sh_size);
+ }
+
return 0;
}
diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
index 442aaf8c77eb..2ce67228dff8 100644
--- a/arch/i386/kernel/mpparse.c
+++ b/arch/i386/kernel/mpparse.c
@@ -249,8 +249,6 @@ static void __init MP_bus_info (struct mpc_config_bus *m)
mp_current_pci_id++;
} else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
- } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
} else {
printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
}
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index a773f776c9ea..1d1a56cae340 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -195,7 +195,6 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
{
const u32 __user *tmp = (const u32 __user *)buf;
u32 data[2];
- size_t rv;
u32 reg = *ppos;
int cpu = iminor(file->f_dentry->d_inode);
int err;
@@ -203,7 +202,7 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
if (count % 8)
return -EINVAL; /* Invalid chunk size */
- for (rv = 0; count; count -= 8) {
+ for (; count; count -= 8) {
if (copy_from_user(&data, tmp, 8))
return -EFAULT;
err = do_wrmsr(cpu, reg, data[0], data[1]);
@@ -250,7 +249,6 @@ static int msr_device_create(int i)
return err;
}
-#ifdef CONFIG_HOTPLUG_CPU
static int msr_class_cpu_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
@@ -271,7 +269,6 @@ static struct notifier_block __cpuinitdata msr_class_cpu_notifier =
{
.notifier_call = msr_class_cpu_callback,
};
-#endif
static int __init msr_init(void)
{
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index eaafe233a5da..f5bc7e1be801 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -22,6 +22,7 @@
#include <linux/percpu.h>
#include <linux/dmi.h>
#include <linux/kprobes.h>
+#include <linux/cpumask.h>
#include <asm/smp.h>
#include <asm/nmi.h>
@@ -42,6 +43,8 @@ int nmi_watchdog_enabled;
static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner);
static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);
+static cpumask_t backtrace_mask = CPU_MASK_NONE;
+
/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
* offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
*/
@@ -867,14 +870,16 @@ static unsigned int
void touch_nmi_watchdog (void)
{
- int i;
+ if (nmi_watchdog > 0) {
+ unsigned cpu;
- /*
- * Just reset the alert counters, (other CPUs might be
- * spinning on locks we hold):
- */
- for_each_possible_cpu(i)
- alert_counter[i] = 0;
+ /*
+ * Just reset the alert counters, (other CPUs might be
+ * spinning on locks we hold):
+ */
+ for_each_present_cpu (cpu)
+ alert_counter[cpu] = 0;
+ }
/*
* Tickle the softlockup detector too:
@@ -907,6 +912,16 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
touched = 1;
}
+ if (cpu_isset(cpu, backtrace_mask)) {
+ static DEFINE_SPINLOCK(lock); /* Serialise the printks */
+
+ spin_lock(&lock);
+ printk("NMI backtrace for cpu %d\n", cpu);
+ dump_stack();
+ spin_unlock(&lock);
+ cpu_clear(cpu, backtrace_mask);
+ }
+
sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
/* if the apic timer isn't firing, this cpu isn't doing much */
@@ -1033,6 +1048,19 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
#endif
+void __trigger_all_cpu_backtrace(void)
+{
+ int i;
+
+ backtrace_mask = cpu_online_map;
+ /* Wait for up to 10 seconds for all CPUs to do the backtrace */
+ for (i = 0; i < 10 * 1000; i++) {
+ if (cpus_empty(backtrace_mask))
+ break;
+ mdelay(1);
+ }
+}
+
EXPORT_SYMBOL(nmi_active);
EXPORT_SYMBOL(nmi_watchdog);
EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
new file mode 100644
index 000000000000..3dceab5828f1
--- /dev/null
+++ b/arch/i386/kernel/paravirt.c
@@ -0,0 +1,569 @@
+/* Paravirtualization interfaces
+ Copyright (C) 2006 Rusty Russell IBM Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/efi.h>
+#include <linux/bcd.h>
+#include <linux/start_kernel.h>
+
+#include <asm/bug.h>
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+#include <asm/setup.h>
+#include <asm/arch_hooks.h>
+#include <asm/time.h>
+#include <asm/irq.h>
+#include <asm/delay.h>
+#include <asm/fixmap.h>
+#include <asm/apic.h>
+#include <asm/tlbflush.h>
+
+/* nop stub */
+static void native_nop(void)
+{
+}
+
+static void __init default_banner(void)
+{
+ printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
+ paravirt_ops.name);
+}
+
+char *memory_setup(void)
+{
+ return paravirt_ops.memory_setup();
+}
+
+/* Simple instruction patching code. */
+#define DEF_NATIVE(name, code) \
+ extern const char start_##name[], end_##name[]; \
+ asm("start_" #name ": " code "; end_" #name ":")
+DEF_NATIVE(cli, "cli");
+DEF_NATIVE(sti, "sti");
+DEF_NATIVE(popf, "push %eax; popf");
+DEF_NATIVE(pushf, "pushf; pop %eax");
+DEF_NATIVE(pushf_cli, "pushf; pop %eax; cli");
+DEF_NATIVE(iret, "iret");
+DEF_NATIVE(sti_sysexit, "sti; sysexit");
+
+static const struct native_insns
+{
+ const char *start, *end;
+} native_insns[] = {
+ [PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli },
+ [PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti },
+ [PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf },
+ [PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf },
+ [PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli },
+ [PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret },
+ [PARAVIRT_STI_SYSEXIT] = { start_sti_sysexit, end_sti_sysexit },
+};
+
+static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len)
+{
+ unsigned int insn_len;
+
+ /* Don't touch it if we don't have a replacement */
+ if (type >= ARRAY_SIZE(native_insns) || !native_insns[type].start)
+ return len;
+
+ insn_len = native_insns[type].end - native_insns[type].start;
+
+ /* Similarly if we can't fit replacement. */
+ if (len < insn_len)
+ return len;
+
+ memcpy(insns, native_insns[type].start, insn_len);
+ return insn_len;
+}
+
+static fastcall unsigned long native_get_debugreg(int regno)
+{
+ unsigned long val = 0; /* Damn you, gcc! */
+
+ switch (regno) {
+ case 0:
+ asm("movl %%db0, %0" :"=r" (val)); break;
+ case 1:
+ asm("movl %%db1, %0" :"=r" (val)); break;
+ case 2:
+ asm("movl %%db2, %0" :"=r" (val)); break;
+ case 3:
+ asm("movl %%db3, %0" :"=r" (val)); break;
+ case 6:
+ asm("movl %%db6, %0" :"=r" (val)); break;
+ case 7:
+ asm("movl %%db7, %0" :"=r" (val)); break;
+ default:
+ BUG();
+ }
+ return val;
+}
+
+static fastcall void native_set_debugreg(int regno, unsigned long value)
+{
+ switch (regno) {
+ case 0:
+ asm("movl %0,%%db0" : /* no output */ :"r" (value));
+ break;
+ case 1:
+ asm("movl %0,%%db1" : /* no output */ :"r" (value));
+ break;
+ case 2:
+ asm("movl %0,%%db2" : /* no output */ :"r" (value));
+ break;
+ case 3:
+ asm("movl %0,%%db3" : /* no output */ :"r" (value));
+ break;
+ case 6:
+ asm("movl %0,%%db6" : /* no output */ :"r" (value));
+ break;
+ case 7:
+ asm("movl %0,%%db7" : /* no output */ :"r" (value));
+ break;
+ default:
+ BUG();
+ }
+}
+
+void init_IRQ(void)
+{
+ paravirt_ops.init_IRQ();
+}
+
+static fastcall void native_clts(void)
+{
+ asm volatile ("clts");
+}
+
+static fastcall unsigned long native_read_cr0(void)
+{
+ unsigned long val;
+ asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall void native_write_cr0(unsigned long val)
+{
+ asm volatile("movl %0,%%cr0": :"r" (val));
+}
+
+static fastcall unsigned long native_read_cr2(void)
+{
+ unsigned long val;
+ asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall void native_write_cr2(unsigned long val)
+{
+ asm volatile("movl %0,%%cr2": :"r" (val));
+}
+
+static fastcall unsigned long native_read_cr3(void)
+{
+ unsigned long val;
+ asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall void native_write_cr3(unsigned long val)
+{
+ asm volatile("movl %0,%%cr3": :"r" (val));
+}
+
+static fastcall unsigned long native_read_cr4(void)
+{
+ unsigned long val;
+ asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall unsigned long native_read_cr4_safe(void)
+{
+ unsigned long val;
+ /* This could fault if %cr4 does not exist */
+ asm("1: movl %%cr4, %0 \n"
+ "2: \n"
+ ".section __ex_table,\"a\" \n"
+ ".long 1b,2b \n"
+ ".previous \n"
+ : "=r" (val): "0" (0));
+ return val;
+}
+
+static fastcall void native_write_cr4(unsigned long val)
+{
+ asm volatile("movl %0,%%cr4": :"r" (val));
+}
+
+static fastcall unsigned long native_save_fl(void)
+{
+ unsigned long f;
+ asm volatile("pushfl ; popl %0":"=g" (f): /* no input */);
+ return f;
+}
+
+static fastcall void native_restore_fl(unsigned long f)
+{
+ asm volatile("pushl %0 ; popfl": /* no output */
+ :"g" (f)
+ :"memory", "cc");
+}
+
+static fastcall void native_irq_disable(void)
+{
+ asm volatile("cli": : :"memory");
+}
+
+static fastcall void native_irq_enable(void)
+{
+ asm volatile("sti": : :"memory");
+}
+
+static fastcall void native_safe_halt(void)
+{
+ asm volatile("sti; hlt": : :"memory");
+}
+
+static fastcall void native_halt(void)
+{
+ asm volatile("hlt": : :"memory");
+}
+
+static fastcall void native_wbinvd(void)
+{
+ asm volatile("wbinvd": : :"memory");
+}
+
+static fastcall unsigned long long native_read_msr(unsigned int msr, int *err)
+{
+ unsigned long long val;
+
+ asm volatile("2: rdmsr ; xorl %0,%0\n"
+ "1:\n\t"
+ ".section .fixup,\"ax\"\n\t"
+ "3: movl %3,%0 ; jmp 1b\n\t"
+ ".previous\n\t"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n\t"
+ " .long 2b,3b\n\t"
+ ".previous"
+ : "=r" (*err), "=A" (val)
+ : "c" (msr), "i" (-EFAULT));
+
+ return val;
+}
+
+static fastcall int native_write_msr(unsigned int msr, unsigned long long val)
+{
+ int err;
+ asm volatile("2: wrmsr ; xorl %0,%0\n"
+ "1:\n\t"
+ ".section .fixup,\"ax\"\n\t"
+ "3: movl %4,%0 ; jmp 1b\n\t"
+ ".previous\n\t"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n\t"
+ " .long 2b,3b\n\t"
+ ".previous"
+ : "=a" (err)
+ : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)),
+ "i" (-EFAULT));
+ return err;
+}
+
+static fastcall unsigned long long native_read_tsc(void)
+{
+ unsigned long long val;
+ asm volatile("rdtsc" : "=A" (val));
+ return val;
+}
+
+static fastcall unsigned long long native_read_pmc(void)
+{
+ unsigned long long val;
+ asm volatile("rdpmc" : "=A" (val));
+ return val;
+}
+
+static fastcall void native_load_tr_desc(void)
+{
+ asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+}
+
+static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr)
+{
+ asm volatile("lgdt %0"::"m" (*dtr));
+}
+
+static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr)
+{
+ asm volatile("lidt %0"::"m" (*dtr));
+}
+
+static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr)
+{
+ asm ("sgdt %0":"=m" (*dtr));
+}
+
+static fastcall void native_store_idt(struct Xgt_desc_struct *dtr)
+{
+ asm ("sidt %0":"=m" (*dtr));
+}
+
+static fastcall unsigned long native_store_tr(void)
+{
+ unsigned long tr;
+ asm ("str %0":"=r" (tr));
+ return tr;
+}
+
+static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
+ C(0); C(1); C(2);
+#undef C
+}
+
+static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high)
+{
+ u32 *lp = (u32 *)((char *)dt + entry*8);
+ lp[0] = entry_low;
+ lp[1] = entry_high;
+}
+
+static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
+{
+ native_write_dt_entry(dt, entrynum, low, high);
+}
+
+static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high)
+{
+ native_write_dt_entry(dt, entrynum, low, high);
+}
+
+static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
+{
+ native_write_dt_entry(dt, entrynum, low, high);
+}
+
+static fastcall void native_load_esp0(struct tss_struct *tss,
+ struct thread_struct *thread)
+{
+ tss->esp0 = thread->esp0;
+
+ /* This can only happen when SEP is enabled, no need to test "SEP"arately */
+ if (unlikely(tss->ss1 != thread->sysenter_cs)) {
+ tss->ss1 = thread->sysenter_cs;
+ wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+ }
+}
+
+static fastcall void native_io_delay(void)
+{
+ asm volatile("outb %al,$0x80");
+}
+
+static fastcall void native_flush_tlb(void)
+{
+ __native_flush_tlb();
+}
+
+/*
+ * Global pages have to be flushed a bit differently. Not a real
+ * performance problem because this does not happen often.
+ */
+static fastcall void native_flush_tlb_global(void)
+{
+ __native_flush_tlb_global();
+}
+
+static fastcall void native_flush_tlb_single(u32 addr)
+{
+ __native_flush_tlb_single(addr);
+}
+
+#ifndef CONFIG_X86_PAE
+static fastcall void native_set_pte(pte_t *ptep, pte_t pteval)
+{
+ *ptep = pteval;
+}
+
+static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
+{
+ *ptep = pteval;
+}
+
+static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+ *pmdp = pmdval;
+}
+
+#else /* CONFIG_X86_PAE */
+
+static fastcall void native_set_pte(pte_t *ptep, pte_t pte)
+{
+ ptep->pte_high = pte.pte_high;
+ smp_wmb();
+ ptep->pte_low = pte.pte_low;
+}
+
+static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
+{
+ ptep->pte_high = pte.pte_high;
+ smp_wmb();
+ ptep->pte_low = pte.pte_low;
+}
+
+static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+{
+ ptep->pte_low = 0;
+ smp_wmb();
+ ptep->pte_high = pte.pte_high;
+ smp_wmb();
+ ptep->pte_low = pte.pte_low;
+}
+
+static fastcall void native_set_pte_atomic(pte_t *ptep, pte_t pteval)
+{
+ set_64bit((unsigned long long *)ptep,pte_val(pteval));
+}
+
+static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+ set_64bit((unsigned long long *)pmdp,pmd_val(pmdval));
+}
+
+static fastcall void native_set_pud(pud_t *pudp, pud_t pudval)
+{
+ *pudp = pudval;
+}
+
+static fastcall void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+ ptep->pte_low = 0;
+ smp_wmb();
+ ptep->pte_high = 0;
+}
+
+static fastcall void native_pmd_clear(pmd_t *pmd)
+{
+ u32 *tmp = (u32 *)pmd;
+ *tmp = 0;
+ smp_wmb();
+ *(tmp + 1) = 0;
+}
+#endif /* CONFIG_X86_PAE */
+
+/* These are in entry.S */
+extern fastcall void native_iret(void);
+extern fastcall void native_irq_enable_sysexit(void);
+
+static int __init print_banner(void)
+{
+ paravirt_ops.banner();
+ return 0;
+}
+core_initcall(print_banner);
+
+/* We simply declare start_kernel to be the paravirt probe of last resort. */
+paravirt_probe(start_kernel);
+
+struct paravirt_ops paravirt_ops = {
+ .name = "bare hardware",
+ .paravirt_enabled = 0,
+ .kernel_rpl = 0,
+
+ .patch = native_patch,
+ .banner = default_banner,
+ .arch_setup = native_nop,
+ .memory_setup = machine_specific_memory_setup,
+ .get_wallclock = native_get_wallclock,
+ .set_wallclock = native_set_wallclock,
+ .time_init = time_init_hook,
+ .init_IRQ = native_init_IRQ,
+
+ .cpuid = native_cpuid,
+ .get_debugreg = native_get_debugreg,
+ .set_debugreg = native_set_debugreg,
+ .clts = native_clts,
+ .read_cr0 = native_read_cr0,
+ .write_cr0 = native_write_cr0,
+ .read_cr2 = native_read_cr2,
+ .write_cr2 = native_write_cr2,
+ .read_cr3 = native_read_cr3,
+ .write_cr3 = native_write_cr3,
+ .read_cr4 = native_read_cr4,
+ .read_cr4_safe = native_read_cr4_safe,
+ .write_cr4 = native_write_cr4,
+ .save_fl = native_save_fl,
+ .restore_fl = native_restore_fl,
+ .irq_disable = native_irq_disable,
+ .irq_enable = native_irq_enable,
+ .safe_halt = native_safe_halt,
+ .halt = native_halt,
+ .wbinvd = native_wbinvd,
+ .read_msr = native_read_msr,
+ .write_msr = native_write_msr,
+ .read_tsc = native_read_tsc,
+ .read_pmc = native_read_pmc,
+ .load_tr_desc = native_load_tr_desc,
+ .set_ldt = native_set_ldt,
+ .load_gdt = native_load_gdt,
+ .load_idt = native_load_idt,
+ .store_gdt = native_store_gdt,
+ .store_idt = native_store_idt,
+ .store_tr = native_store_tr,
+ .load_tls = native_load_tls,
+ .write_ldt_entry = native_write_ldt_entry,
+ .write_gdt_entry = native_write_gdt_entry,
+ .write_idt_entry = native_write_idt_entry,
+ .load_esp0 = native_load_esp0,
+
+ .set_iopl_mask = native_set_iopl_mask,
+ .io_delay = native_io_delay,
+ .const_udelay = __const_udelay,
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ .apic_write = native_apic_write,
+ .apic_write_atomic = native_apic_write_atomic,
+ .apic_read = native_apic_read,
+#endif
+
+ .flush_tlb_user = native_flush_tlb,
+ .flush_tlb_kernel = native_flush_tlb_global,
+ .flush_tlb_single = native_flush_tlb_single,
+
+ .set_pte = native_set_pte,
+ .set_pte_at = native_set_pte_at,
+ .set_pmd = native_set_pmd,
+ .pte_update = (void *)native_nop,
+ .pte_update_defer = (void *)native_nop,
+#ifdef CONFIG_X86_PAE
+ .set_pte_atomic = native_set_pte_atomic,
+ .set_pte_present = native_set_pte_present,
+ .set_pud = native_set_pud,
+ .pte_clear = native_pte_clear,
+ .pmd_clear = native_pmd_clear,
+#endif
+
+ .irq_enable_sysexit = native_irq_enable_sysexit,
+ .iret = native_iret,
+};
+EXPORT_SYMBOL(paravirt_ops);
diff --git a/arch/i386/kernel/pci-dma.c b/arch/i386/kernel/pci-dma.c
index 5c8c6ef1fc5e..41af692c1584 100644
--- a/arch/i386/kernel/pci-dma.c
+++ b/arch/i386/kernel/pci-dma.c
@@ -92,14 +92,12 @@ int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
if (!mem_base)
goto out;
- dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
+ dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
if (!dev->dma_mem)
goto out;
- memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem));
- dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL);
+ dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
if (!dev->dma_mem->bitmap)
goto free1_out;
- memset(dev->dma_mem->bitmap, 0, bitmap_size);
dev->dma_mem->virt_base = mem_base;
dev->dma_mem->device_base = device_addr;
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index dd53c58f64f1..99308510a17c 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -56,6 +56,7 @@
#include <asm/tlbflush.h>
#include <asm/cpu.h>
+#include <asm/pda.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
@@ -99,22 +100,18 @@ EXPORT_SYMBOL(enable_hlt);
*/
void default_idle(void)
{
- local_irq_enable();
-
if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
current_thread_info()->status &= ~TS_POLLING;
smp_mb__after_clear_bit();
- while (!need_resched()) {
- local_irq_disable();
- if (!need_resched())
- safe_halt();
- else
- local_irq_enable();
- }
+ local_irq_disable();
+ if (!need_resched())
+ safe_halt(); /* enables interrupts racelessly */
+ else
+ local_irq_enable();
current_thread_info()->status |= TS_POLLING;
} else {
- while (!need_resched())
- cpu_relax();
+ /* loop is done by the caller */
+ cpu_relax();
}
}
#ifdef CONFIG_APM_MODULE
@@ -128,14 +125,7 @@ EXPORT_SYMBOL(default_idle);
*/
static void poll_idle (void)
{
- local_irq_enable();
-
- asm volatile(
- "2:"
- "testl %0, %1;"
- "rep; nop;"
- "je 2b;"
- : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
+ cpu_relax();
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -256,8 +246,7 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
static void mwait_idle(void)
{
local_irq_enable();
- while (!need_resched())
- mwait_idle_with_hints(0, 0);
+ mwait_idle_with_hints(0, 0);
}
void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
@@ -314,8 +303,8 @@ void show_regs(struct pt_regs * regs)
regs->eax,regs->ebx,regs->ecx,regs->edx);
printk("ESI: %08lx EDI: %08lx EBP: %08lx",
regs->esi, regs->edi, regs->ebp);
- printk(" DS: %04x ES: %04x\n",
- 0xffff & regs->xds,0xffff & regs->xes);
+ printk(" DS: %04x ES: %04x GS: %04x\n",
+ 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs);
cr0 = read_cr0();
cr2 = read_cr2();
@@ -346,6 +335,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
regs.xds = __USER_DS;
regs.xes = __USER_DS;
+ regs.xgs = __KERNEL_PDA;
regs.orig_eax = -1;
regs.eip = (unsigned long) kernel_thread_helper;
regs.xcs = __KERNEL_CS | get_kernel_rpl();
@@ -431,7 +421,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
p->thread.eip = (unsigned long) ret_from_fork;
savesegment(fs,p->thread.fs);
- savesegment(gs,p->thread.gs);
tsk = current;
if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -508,7 +497,7 @@ void dump_thread(struct pt_regs * regs, struct user * dump)
dump->regs.ds = regs->xds;
dump->regs.es = regs->xes;
savesegment(fs,dump->regs.fs);
- savesegment(gs,dump->regs.gs);
+ dump->regs.gs = regs->xgs;
dump->regs.orig_eax = regs->orig_eax;
dump->regs.eip = regs->eip;
dump->regs.cs = regs->xcs;
@@ -648,22 +637,27 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
__unlazy_fpu(prev_p);
+
+ /* we're going to use this soon, after a few expensive things */
+ if (next_p->fpu_counter > 5)
+ prefetch(&next->i387.fxsave);
+
/*
* Reload esp0.
*/
load_esp0(tss, next);
/*
- * Save away %fs and %gs. No need to save %es and %ds, as
- * those are always kernel segments while inside the kernel.
- * Doing this before setting the new TLS descriptors avoids
- * the situation where we temporarily have non-reloadable
- * segments in %fs and %gs. This could be an issue if the
- * NMI handler ever used %fs or %gs (it does not today), or
- * if the kernel is running inside of a hypervisor layer.
+ * Save away %fs. No need to save %gs, as it was saved on the
+ * stack on entry. No need to save %es and %ds, as those are
+ * always kernel segments while inside the kernel. Doing this
+ * before setting the new TLS descriptors avoids the situation
+ * where we temporarily have non-reloadable segments in %fs
+ * and %gs. This could be an issue if the NMI handler ever
+ * used %fs or %gs (it does not today), or if the kernel is
+ * running inside of a hypervisor layer.
*/
savesegment(fs, prev->fs);
- savesegment(gs, prev->gs);
/*
* Load the per-thread Thread-Local Storage descriptor.
@@ -671,22 +665,14 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
load_TLS(next, cpu);
/*
- * Restore %fs and %gs if needed.
+ * Restore %fs if needed.
*
- * Glibc normally makes %fs be zero, and %gs is one of
- * the TLS segments.
+ * Glibc normally makes %fs be zero.
*/
if (unlikely(prev->fs | next->fs))
loadsegment(fs, next->fs);
- if (prev->gs | next->gs)
- loadsegment(gs, next->gs);
-
- /*
- * Restore IOPL if needed.
- */
- if (unlikely(prev->iopl != next->iopl))
- set_iopl_mask(next->iopl);
+ write_pda(pcurrent, next_p);
/*
* Now maybe handle debug registers and/or IO bitmaps
@@ -697,6 +683,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
disable_tsc(prev_p, next_p);
+ /* If the task has used fpu the last 5 timeslices, just do a full
+ * restore of the math state immediately to avoid the trap; the
+ * chances of needing FPU soon are obviously high now
+ */
+ if (next_p->fpu_counter > 5)
+ math_state_restore();
+
return prev_p;
}
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index 775f50e9395b..f3f94ac5736a 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -94,13 +94,9 @@ static int putreg(struct task_struct *child,
return -EIO;
child->thread.fs = value;
return 0;
- case GS:
- if (value && (value & 3) != 3)
- return -EIO;
- child->thread.gs = value;
- return 0;
case DS:
case ES:
+ case GS:
if (value && (value & 3) != 3)
return -EIO;
value &= 0xffff;
@@ -116,8 +112,8 @@ static int putreg(struct task_struct *child,
value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
break;
}
- if (regno > GS*4)
- regno -= 2*4;
+ if (regno > ES*4)
+ regno -= 1*4;
put_stack_long(child, regno - sizeof(struct pt_regs), value);
return 0;
}
@@ -131,18 +127,16 @@ static unsigned long getreg(struct task_struct *child,
case FS:
retval = child->thread.fs;
break;
- case GS:
- retval = child->thread.gs;
- break;
case DS:
case ES:
+ case GS:
case SS:
case CS:
retval = 0xffff;
/* fall through */
default:
- if (regno > GS*4)
- regno -= 2*4;
+ if (regno > ES*4)
+ regno -= 1*4;
regno = regno - sizeof(struct pt_regs);
retval &= get_stack_long(child, regno);
}
diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c
index 9f6ab1789bb0..a01320a7b636 100644
--- a/arch/i386/kernel/quirks.c
+++ b/arch/i386/kernel/quirks.c
@@ -3,10 +3,23 @@
*/
#include <linux/pci.h>
#include <linux/irq.h>
+#include <asm/pci-direct.h>
+#include <asm/genapic.h>
+#include <asm/cpu.h>
#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
+static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
+{
+#ifdef CONFIG_X86_64
+ if (genapic != &apic_flat)
+ panic("APIC mode must be flat on this system\n");
+#elif defined(CONFIG_X86_GENERICARCH)
+ if (genapic != &apic_default)
+ panic("APIC mode must be default(flat) on this system. Use apic=default\n");
+#endif
+}
-static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
+void __init quirk_intel_irqbalance(void)
{
u8 config, rev;
u32 word;
@@ -16,18 +29,18 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
* based platforms.
* Disable SW irqbalance/affinity on those platforms.
*/
- pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
+ rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION);
if (rev > 0x9)
return;
printk(KERN_INFO "Intel E7520/7320/7525 detected.");
- /* enable access to config space*/
- pci_read_config_byte(dev, 0xf4, &config);
- pci_write_config_byte(dev, 0xf4, config|0x2);
+ /* enable access to config space */
+ config = read_pci_config_byte(0, 0, 0, 0xf4);
+ write_pci_config_byte(0, 0, 0, 0xf4, config|0x2);
/* read xTPR register */
- raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
+ word = read_pci_config_16(0, 0, 0x40, 0x4c);
if (!(word & (1 << 13))) {
printk(KERN_INFO "Disabling irq balancing and affinity\n");
@@ -38,13 +51,24 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
#ifdef CONFIG_PROC_FS
no_irq_affinity = 1;
#endif
+#ifdef CONFIG_HOTPLUG_CPU
+ printk(KERN_INFO "Disabling cpu hotplug control\n");
+ enable_cpu_hotplug = 0;
+#endif
+#ifdef CONFIG_X86_64
+ /* force the genapic selection to flat mode so that
+ * interrupts can be redirected to more than one CPU.
+ */
+ genapic_force = &apic_flat;
+#endif
}
- /* put back the original value for config space*/
+ /* put back the original value for config space */
if (!(config & 0x2))
- pci_write_config_byte(dev, 0xf4, config);
+ write_pci_config_byte(0, 0, 0, 0xf4, config);
}
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance);
+
#endif
diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c
index 84278e0093a2..3514b4153f7f 100644
--- a/arch/i386/kernel/reboot.c
+++ b/arch/i386/kernel/reboot.c
@@ -12,6 +12,7 @@
#include <linux/dmi.h>
#include <linux/ctype.h>
#include <linux/pm.h>
+#include <linux/reboot.h>
#include <asm/uaccess.h>
#include <asm/apic.h>
#include <asm/desc.h>
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 141041dde74d..79df6e612dbd 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -63,9 +63,6 @@
#include <setup_arch.h>
#include <bios_ebda.h>
-/* Forward Declaration. */
-void __init find_max_pfn(void);
-
/* This value is set up by the early boot code to point to the value
immediately after the boot time page tables. It contains a *physical*
address, and must not be in the .bss segment! */
@@ -76,11 +73,8 @@ int disable_pse __devinitdata = 0;
/*
* Machine setup..
*/
-
-#ifdef CONFIG_EFI
-int efi_enabled = 0;
-EXPORT_SYMBOL(efi_enabled);
-#endif
+extern struct resource code_resource;
+extern struct resource data_resource;
/* cpu data as detected by the assembly code in head.S */
struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
@@ -99,12 +93,6 @@ unsigned int machine_submodel_id;
unsigned int BIOS_revision;
unsigned int mca_pentium_flag;
-/* For PCI or other memory-mapped resources */
-unsigned long pci_mem_start = 0x10000000;
-#ifdef CONFIG_PCI
-EXPORT_SYMBOL(pci_mem_start);
-#endif
-
/* Boot loader ID as an integer, for the benefit of proc_dointvec */
int bootloader_type;
@@ -134,7 +122,6 @@ struct ist_info ist_info;
defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
EXPORT_SYMBOL(ist_info);
#endif
-struct e820map e820;
extern void early_cpu_init(void);
extern int root_mountflags;
@@ -149,516 +136,6 @@ static char command_line[COMMAND_LINE_SIZE];
unsigned char __initdata boot_params[PARAM_SIZE];
-static struct resource data_resource = {
- .name = "Kernel data",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource code_resource = {
- .name = "Kernel code",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource system_rom_resource = {
- .name = "System ROM",
- .start = 0xf0000,
- .end = 0xfffff,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource extension_rom_resource = {
- .name = "Extension ROM",
- .start = 0xe0000,
- .end = 0xeffff,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource adapter_rom_resources[] = { {
- .name = "Adapter ROM",
- .start = 0xc8000,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-} };
-
-static struct resource video_rom_resource = {
- .name = "Video ROM",
- .start = 0xc0000,
- .end = 0xc7fff,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource video_ram_resource = {
- .name = "Video RAM area",
- .start = 0xa0000,
- .end = 0xbffff,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource standard_io_resources[] = { {
- .name = "dma1",
- .start = 0x0000,
- .end = 0x001f,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "pic1",
- .start = 0x0020,
- .end = 0x0021,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "timer0",
- .start = 0x0040,
- .end = 0x0043,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "timer1",
- .start = 0x0050,
- .end = 0x0053,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "keyboard",
- .start = 0x0060,
- .end = 0x006f,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "dma page reg",
- .start = 0x0080,
- .end = 0x008f,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "pic2",
- .start = 0x00a0,
- .end = 0x00a1,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "dma2",
- .start = 0x00c0,
- .end = 0x00df,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "fpu",
- .start = 0x00f0,
- .end = 0x00ff,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-} };
-
-#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
-
-static int __init romchecksum(unsigned char *rom, unsigned long length)
-{
- unsigned char *p, sum = 0;
-
- for (p = rom; p < rom + length; p++)
- sum += *p;
- return sum == 0;
-}
-
-static void __init probe_roms(void)
-{
- unsigned long start, length, upper;
- unsigned char *rom;
- int i;
-
- /* video rom */
- upper = adapter_rom_resources[0].start;
- for (start = video_rom_resource.start; start < upper; start += 2048) {
- rom = isa_bus_to_virt(start);
- if (!romsignature(rom))
- continue;
-
- video_rom_resource.start = start;
-
- /* 0 < length <= 0x7f * 512, historically */
- length = rom[2] * 512;
-
- /* if checksum okay, trust length byte */
- if (length && romchecksum(rom, length))
- video_rom_resource.end = start + length - 1;
-
- request_resource(&iomem_resource, &video_rom_resource);
- break;
- }
-
- start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
- if (start < upper)
- start = upper;
-
- /* system rom */
- request_resource(&iomem_resource, &system_rom_resource);
- upper = system_rom_resource.start;
-
- /* check for extension rom (ignore length byte!) */
- rom = isa_bus_to_virt(extension_rom_resource.start);
- if (romsignature(rom)) {
- length = extension_rom_resource.end - extension_rom_resource.start + 1;
- if (romchecksum(rom, length)) {
- request_resource(&iomem_resource, &extension_rom_resource);
- upper = extension_rom_resource.start;
- }
- }
-
- /* check for adapter roms on 2k boundaries */
- for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
- rom = isa_bus_to_virt(start);
- if (!romsignature(rom))
- continue;
-
- /* 0 < length <= 0x7f * 512, historically */
- length = rom[2] * 512;
-
- /* but accept any length that fits if checksum okay */
- if (!length || start + length > upper || !romchecksum(rom, length))
- continue;
-
- adapter_rom_resources[i].start = start;
- adapter_rom_resources[i].end = start + length - 1;
- request_resource(&iomem_resource, &adapter_rom_resources[i]);
-
- start = adapter_rom_resources[i++].end & ~2047UL;
- }
-}
-
-static void __init limit_regions(unsigned long long size)
-{
- unsigned long long current_addr = 0;
- int i;
-
- if (efi_enabled) {
- efi_memory_desc_t *md;
- void *p;
-
- for (p = memmap.map, i = 0; p < memmap.map_end;
- p += memmap.desc_size, i++) {
- md = p;
- current_addr = md->phys_addr + (md->num_pages << 12);
- if (md->type == EFI_CONVENTIONAL_MEMORY) {
- if (current_addr >= size) {
- md->num_pages -=
- (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
- memmap.nr_map = i + 1;
- return;
- }
- }
- }
- }
- for (i = 0; i < e820.nr_map; i++) {
- current_addr = e820.map[i].addr + e820.map[i].size;
- if (current_addr < size)
- continue;
-
- if (e820.map[i].type != E820_RAM)
- continue;
-
- if (e820.map[i].addr >= size) {
- /*
- * This region starts past the end of the
- * requested size, skip it completely.
- */
- e820.nr_map = i;
- } else {
- e820.nr_map = i + 1;
- e820.map[i].size -= current_addr - size;
- }
- return;
- }
-}
-
-void __init add_memory_region(unsigned long long start,
- unsigned long long size, int type)
-{
- int x;
-
- if (!efi_enabled) {
- x = e820.nr_map;
-
- if (x == E820MAX) {
- printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
- return;
- }
-
- e820.map[x].addr = start;
- e820.map[x].size = size;
- e820.map[x].type = type;
- e820.nr_map++;
- }
-} /* add_memory_region */
-
-#define E820_DEBUG 1
-
-static void __init print_memory_map(char *who)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- printk(" %s: %016Lx - %016Lx ", who,
- e820.map[i].addr,
- e820.map[i].addr + e820.map[i].size);
- switch (e820.map[i].type) {
- case E820_RAM: printk("(usable)\n");
- break;
- case E820_RESERVED:
- printk("(reserved)\n");
- break;
- case E820_ACPI:
- printk("(ACPI data)\n");
- break;
- case E820_NVS:
- printk("(ACPI NVS)\n");
- break;
- default: printk("type %lu\n", e820.map[i].type);
- break;
- }
- }
-}
-
-/*
- * Sanitize the BIOS e820 map.
- *
- * Some e820 responses include overlapping entries. The following
- * replaces the original e820 map with a new one, removing overlaps.
- *
- */
-struct change_member {
- struct e820entry *pbios; /* pointer to original bios entry */
- unsigned long long addr; /* address for this change point */
-};
-static struct change_member change_point_list[2*E820MAX] __initdata;
-static struct change_member *change_point[2*E820MAX] __initdata;
-static struct e820entry *overlap_list[E820MAX] __initdata;
-static struct e820entry new_bios[E820MAX] __initdata;
-
-int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
-{
- struct change_member *change_tmp;
- unsigned long current_type, last_type;
- unsigned long long last_addr;
- int chgidx, still_changing;
- int overlap_entries;
- int new_bios_entry;
- int old_nr, new_nr, chg_nr;
- int i;
-
- /*
- Visually we're performing the following (1,2,3,4 = memory types)...
-
- Sample memory map (w/overlaps):
- ____22__________________
- ______________________4_
- ____1111________________
- _44_____________________
- 11111111________________
- ____________________33__
- ___________44___________
- __________33333_________
- ______________22________
- ___________________2222_
- _________111111111______
- _____________________11_
- _________________4______
-
- Sanitized equivalent (no overlap):
- 1_______________________
- _44_____________________
- ___1____________________
- ____22__________________
- ______11________________
- _________1______________
- __________3_____________
- ___________44___________
- _____________33_________
- _______________2________
- ________________1_______
- _________________4______
- ___________________2____
- ____________________33__
- ______________________4_
- */
-
- /* if there's only one memory region, don't bother */
- if (*pnr_map < 2)
- return -1;
-
- old_nr = *pnr_map;
-
- /* bail out if we find any unreasonable addresses in bios map */
- for (i=0; i<old_nr; i++)
- if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
- return -1;
-
- /* create pointers for initial change-point information (for sorting) */
- for (i=0; i < 2*old_nr; i++)
- change_point[i] = &change_point_list[i];
-
- /* record all known change-points (starting and ending addresses),
- omitting those that are for empty memory regions */
- chgidx = 0;
- for (i=0; i < old_nr; i++) {
- if (biosmap[i].size != 0) {
- change_point[chgidx]->addr = biosmap[i].addr;
- change_point[chgidx++]->pbios = &biosmap[i];
- change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
- change_point[chgidx++]->pbios = &biosmap[i];
- }
- }
- chg_nr = chgidx; /* true number of change-points */
-
- /* sort change-point list by memory addresses (low -> high) */
- still_changing = 1;
- while (still_changing) {
- still_changing = 0;
- for (i=1; i < chg_nr; i++) {
- /* if <current_addr> > <last_addr>, swap */
- /* or, if current=<start_addr> & last=<end_addr>, swap */
- if ((change_point[i]->addr < change_point[i-1]->addr) ||
- ((change_point[i]->addr == change_point[i-1]->addr) &&
- (change_point[i]->addr == change_point[i]->pbios->addr) &&
- (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
- )
- {
- change_tmp = change_point[i];
- change_point[i] = change_point[i-1];
- change_point[i-1] = change_tmp;
- still_changing=1;
- }
- }
- }
-
- /* create a new bios memory map, removing overlaps */
- overlap_entries=0; /* number of entries in the overlap table */
- new_bios_entry=0; /* index for creating new bios map entries */
- last_type = 0; /* start with undefined memory type */
- last_addr = 0; /* start with 0 as last starting address */
- /* loop through change-points, determining affect on the new bios map */
- for (chgidx=0; chgidx < chg_nr; chgidx++)
- {
- /* keep track of all overlapping bios entries */
- if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
- {
- /* add map entry to overlap list (> 1 entry implies an overlap) */
- overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
- }
- else
- {
- /* remove entry from list (order independent, so swap with last) */
- for (i=0; i<overlap_entries; i++)
- {
- if (overlap_list[i] == change_point[chgidx]->pbios)
- overlap_list[i] = overlap_list[overlap_entries-1];
- }
- overlap_entries--;
- }
- /* if there are overlapping entries, decide which "type" to use */
- /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
- current_type = 0;
- for (i=0; i<overlap_entries; i++)
- if (overlap_list[i]->type > current_type)
- current_type = overlap_list[i]->type;
- /* continue building up new bios map based on this information */
- if (current_type != last_type) {
- if (last_type != 0) {
- new_bios[new_bios_entry].size =
- change_point[chgidx]->addr - last_addr;
- /* move forward only if the new size was non-zero */
- if (new_bios[new_bios_entry].size != 0)
- if (++new_bios_entry >= E820MAX)
- break; /* no more space left for new bios entries */
- }
- if (current_type != 0) {
- new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
- new_bios[new_bios_entry].type = current_type;
- last_addr=change_point[chgidx]->addr;
- }
- last_type = current_type;
- }
- }
- new_nr = new_bios_entry; /* retain count for new bios entries */
-
- /* copy new bios mapping into original location */
- memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
- *pnr_map = new_nr;
-
- return 0;
-}
-
-/*
- * Copy the BIOS e820 map into a safe place.
- *
- * Sanity-check it while we're at it..
- *
- * If we're lucky and live on a modern system, the setup code
- * will have given us a memory map that we can use to properly
- * set up memory. If we aren't, we'll fake a memory map.
- *
- * We check to see that the memory map contains at least 2 elements
- * before we'll use it, because the detection code in setup.S may
- * not be perfect and most every PC known to man has two memory
- * regions: one from 0 to 640k, and one from 1mb up. (The IBM
- * thinkpad 560x, for example, does not cooperate with the memory
- * detection code.)
- */
-int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
-{
- /* Only one memory region (or negative)? Ignore it */
- if (nr_map < 2)
- return -1;
-
- do {
- unsigned long long start = biosmap->addr;
- unsigned long long size = biosmap->size;
- unsigned long long end = start + size;
- unsigned long type = biosmap->type;
-
- /* Overflow in 64 bits? Ignore the memory map. */
- if (start > end)
- return -1;
-
- /*
- * Some BIOSes claim RAM in the 640k - 1M region.
- * Not right. Fix it up.
- */
- if (type == E820_RAM) {
- if (start < 0x100000ULL && end > 0xA0000ULL) {
- if (start < 0xA0000ULL)
- add_memory_region(start, 0xA0000ULL-start, type);
- if (end <= 0x100000ULL)
- continue;
- start = 0x100000ULL;
- size = end - start;
- }
- }
- add_memory_region(start, size, type);
- } while (biosmap++,--nr_map);
- return 0;
-}
-
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
struct edd edd;
#ifdef CONFIG_EDD_MODULE
@@ -682,7 +159,7 @@ static inline void copy_edd(void)
}
#endif
-static int __initdata user_defined_memmap = 0;
+int __initdata user_defined_memmap = 0;
/*
* "mem=nopentium" disables the 4MB page tables.
@@ -719,51 +196,6 @@ static int __init parse_mem(char *arg)
}
early_param("mem", parse_mem);
-static int __init parse_memmap(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (strcmp(arg, "exactmap") == 0) {
-#ifdef CONFIG_CRASH_DUMP
- /* If we are doing a crash dump, we
- * still need to know the real mem
- * size before original memory map is
- * reset.
- */
- find_max_pfn();
- saved_max_pfn = max_pfn;
-#endif
- e820.nr_map = 0;
- user_defined_memmap = 1;
- } else {
- /* If the user specifies memory size, we
- * limit the BIOS-provided memory map to
- * that size. exactmap can be used to specify
- * the exact map. mem=number can be used to
- * trim the existing memory map.
- */
- unsigned long long start_at, mem_size;
-
- mem_size = memparse(arg, &arg);
- if (*arg == '@') {
- start_at = memparse(arg+1, &arg);
- add_memory_region(start_at, mem_size, E820_RAM);
- } else if (*arg == '#') {
- start_at = memparse(arg+1, &arg);
- add_memory_region(start_at, mem_size, E820_ACPI);
- } else if (*arg == '$') {
- start_at = memparse(arg+1, &arg);
- add_memory_region(start_at, mem_size, E820_RESERVED);
- } else {
- limit_regions(mem_size);
- user_defined_memmap = 1;
- }
- }
- return 0;
-}
-early_param("memmap", parse_memmap);
-
#ifdef CONFIG_PROC_VMCORE
/* elfcorehdr= specifies the location of elf core header
* stored by the crashed kernel.
@@ -828,90 +260,6 @@ static int __init parse_reservetop(char *arg)
early_param("reservetop", parse_reservetop);
/*
- * Callback for efi_memory_walk.
- */
-static int __init
-efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
-{
- unsigned long *max_pfn = arg, pfn;
-
- if (start < end) {
- pfn = PFN_UP(end -1);
- if (pfn > *max_pfn)
- *max_pfn = pfn;
- }
- return 0;
-}
-
-static int __init
-efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
-{
- memory_present(0, PFN_UP(start), PFN_DOWN(end));
- return 0;
-}
-
- /*
- * This function checks if the entire range <start,end> is mapped with type.
- *
- * Note: this function only works correct if the e820 table is sorted and
- * not-overlapping, which is the case
- */
-int __init
-e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
-{
- u64 start = s;
- u64 end = e;
- int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- if (type && ei->type != type)
- continue;
- /* is the region (part) in overlap with the current region ?*/
- if (ei->addr >= end || ei->addr + ei->size <= start)
- continue;
- /* if the region is at the beginning of <start,end> we move
- * start to the end of the region since it's ok until there
- */
- if (ei->addr <= start)
- start = ei->addr + ei->size;
- /* if start is now at or beyond end, we're done, full
- * coverage */
- if (start >= end)
- return 1; /* we're done */
- }
- return 0;
-}
-
-/*
- * Find the highest page frame number we have available
- */
-void __init find_max_pfn(void)
-{
- int i;
-
- max_pfn = 0;
- if (efi_enabled) {
- efi_memmap_walk(efi_find_max_pfn, &max_pfn);
- efi_memmap_walk(efi_memory_present_wrapper, NULL);
- return;
- }
-
- for (i = 0; i < e820.nr_map; i++) {
- unsigned long start, end;
- /* RAM? */
- if (e820.map[i].type != E820_RAM)
- continue;
- start = PFN_UP(e820.map[i].addr);
- end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
- if (start >= end)
- continue;
- if (end > max_pfn)
- max_pfn = end;
- memory_present(0, start, end);
- }
-}
-
-/*
* Determine low and high memory ranges:
*/
unsigned long __init find_max_low_pfn(void)
@@ -971,68 +319,6 @@ unsigned long __init find_max_low_pfn(void)
}
/*
- * Free all available memory for boot time allocation. Used
- * as a callback function by efi_memory_walk()
- */
-
-static int __init
-free_available_memory(unsigned long start, unsigned long end, void *arg)
-{
- /* check max_low_pfn */
- if (start >= (max_low_pfn << PAGE_SHIFT))
- return 0;
- if (end >= (max_low_pfn << PAGE_SHIFT))
- end = max_low_pfn << PAGE_SHIFT;
- if (start < end)
- free_bootmem(start, end - start);
-
- return 0;
-}
-/*
- * Register fully available low RAM pages with the bootmem allocator.
- */
-static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
-{
- int i;
-
- if (efi_enabled) {
- efi_memmap_walk(free_available_memory, NULL);
- return;
- }
- for (i = 0; i < e820.nr_map; i++) {
- unsigned long curr_pfn, last_pfn, size;
- /*
- * Reserve usable low memory
- */
- if (e820.map[i].type != E820_RAM)
- continue;
- /*
- * We are rounding up the start address of usable memory:
- */
- curr_pfn = PFN_UP(e820.map[i].addr);
- if (curr_pfn >= max_low_pfn)
- continue;
- /*
- * ... and at the end of the usable range downwards:
- */
- last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
-
- if (last_pfn > max_low_pfn)
- last_pfn = max_low_pfn;
-
- /*
- * .. finally, did all the rounding and playing
- * around just make the area go away?
- */
- if (last_pfn <= curr_pfn)
- continue;
-
- size = last_pfn - curr_pfn;
- free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
- }
-}
-
-/*
* workaround for Dell systems that neglect to reserve EBDA
*/
static void __init reserve_ebda_region(void)
@@ -1118,8 +404,8 @@ void __init setup_bootmem_allocator(void)
* the (very unlikely) case of us accidentally initializing the
* bootmem allocator with an invalid RAM area.
*/
- reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) +
- bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START));
+ reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
+ bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
/*
* reserve physical page 0 - it's a special BIOS page on many boxes,
@@ -1162,8 +448,7 @@ void __init setup_bootmem_allocator(void)
if (LOADER_TYPE && INITRD_START) {
if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
reserve_bootmem(INITRD_START, INITRD_SIZE);
- initrd_start =
- INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
+ initrd_start = INITRD_START + PAGE_OFFSET;
initrd_end = initrd_start+INITRD_SIZE;
}
else {
@@ -1200,126 +485,6 @@ void __init remapped_pgdat_init(void)
}
}
-/*
- * Request address space for all standard RAM and ROM resources
- * and also for regions reported as reserved by the e820.
- */
-static void __init
-legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
-{
- int i;
-
- probe_roms();
- for (i = 0; i < e820.nr_map; i++) {
- struct resource *res;
-#ifndef CONFIG_RESOURCES_64BIT
- if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
- continue;
-#endif
- res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
- switch (e820.map[i].type) {
- case E820_RAM: res->name = "System RAM"; break;
- case E820_ACPI: res->name = "ACPI Tables"; break;
- case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
- default: res->name = "reserved";
- }
- res->start = e820.map[i].addr;
- res->end = res->start + e820.map[i].size - 1;
- res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- if (request_resource(&iomem_resource, res)) {
- kfree(res);
- continue;
- }
- if (e820.map[i].type == E820_RAM) {
- /*
- * We don't know which RAM region contains kernel data,
- * so we try it repeatedly and let the resource manager
- * test it.
- */
- request_resource(res, code_resource);
- request_resource(res, data_resource);
-#ifdef CONFIG_KEXEC
- request_resource(res, &crashk_res);
-#endif
- }
- }
-}
-
-/*
- * Request address space for all standard resources
- *
- * This is called just before pcibios_init(), which is also a
- * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
- */
-static int __init request_standard_resources(void)
-{
- int i;
-
- printk("Setting up standard PCI resources\n");
- if (efi_enabled)
- efi_initialize_iomem_resources(&code_resource, &data_resource);
- else
- legacy_init_iomem_resources(&code_resource, &data_resource);
-
- /* EFI systems may still have VGA */
- request_resource(&iomem_resource, &video_ram_resource);
-
- /* request I/O space for devices used on all i[345]86 PCs */
- for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
- request_resource(&ioport_resource, &standard_io_resources[i]);
- return 0;
-}
-
-subsys_initcall(request_standard_resources);
-
-static void __init register_memory(void)
-{
- unsigned long gapstart, gapsize, round;
- unsigned long long last;
- int i;
-
- /*
- * Search for the bigest gap in the low 32 bits of the e820
- * memory space.
- */
- last = 0x100000000ull;
- gapstart = 0x10000000;
- gapsize = 0x400000;
- i = e820.nr_map;
- while (--i >= 0) {
- unsigned long long start = e820.map[i].addr;
- unsigned long long end = start + e820.map[i].size;
-
- /*
- * Since "last" is at most 4GB, we know we'll
- * fit in 32 bits if this condition is true
- */
- if (last > end) {
- unsigned long gap = last - end;
-
- if (gap > gapsize) {
- gapsize = gap;
- gapstart = end;
- }
- }
- if (start < last)
- last = start;
- }
-
- /*
- * See how much we want to round up: start off with
- * rounding to the next 1MB area.
- */
- round = 0x100000;
- while ((gapsize >> 4) > round)
- round += round;
- /* Fun with two's complement */
- pci_mem_start = (gapstart + round) & -round;
-
- printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
- pci_mem_start, gapstart, gapsize);
-}
-
#ifdef CONFIG_MCA
static void set_mca_bus(int x)
{
@@ -1329,6 +494,12 @@ static void set_mca_bus(int x)
static void set_mca_bus(int x) { }
#endif
+/* Overridden in paravirt.c if CONFIG_PARAVIRT */
+char * __attribute__((weak)) memory_setup(void)
+{
+ return machine_specific_memory_setup();
+}
+
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@@ -1381,7 +552,7 @@ void __init setup_arch(char **cmdline_p)
efi_init();
else {
printk(KERN_INFO "BIOS-provided physical RAM map:\n");
- print_memory_map(machine_specific_memory_setup());
+ print_memory_map(memory_setup());
}
copy_edd();
diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index 43002cfb40c4..65d7620eaa09 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -128,7 +128,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
- GET_SEG(gs);
+ COPY_SEG(gs);
GET_SEG(fs);
COPY_SEG(es);
COPY_SEG(ds);
@@ -244,9 +244,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
{
int tmp, err = 0;
- tmp = 0;
- savesegment(gs, tmp);
- err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
+ err |= __put_user(regs->xgs, (unsigned int __user *)&sc->gs);
savesegment(fs, tmp);
err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 31e5c6573aae..5285aff8367f 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -321,7 +321,6 @@ static inline void leave_mm (unsigned long cpu)
fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
{
- struct pt_regs *old_regs = set_irq_regs(regs);
unsigned long cpu;
cpu = get_cpu();
@@ -352,7 +351,6 @@ fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
smp_mb__after_clear_bit();
out:
put_cpu_no_resched();
- set_irq_regs(old_regs);
}
static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
@@ -607,14 +605,11 @@ void smp_send_stop(void)
*/
fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
{
- struct pt_regs *old_regs = set_irq_regs(regs);
ack_APIC_irq();
- set_irq_regs(old_regs);
}
fastcall void smp_call_function_interrupt(struct pt_regs *regs)
{
- struct pt_regs *old_regs = set_irq_regs(regs);
void (*func) (void *info) = call_data->func;
void *info = call_data->info;
int wait = call_data->wait;
@@ -637,7 +632,6 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs)
mb();
atomic_inc(&call_data->finished);
}
- set_irq_regs(old_regs);
}
/*
@@ -699,6 +693,10 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
put_cpu();
return -EBUSY;
}
+
+ /* Can deadlock when called with interrupts disabled */
+ WARN_ON(irqs_disabled());
+
spin_lock_bh(&call_lock);
__smp_call_function_single(cpu, func, info, nonatomic, wait);
spin_unlock_bh(&call_lock);
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 02a9b66b6ac3..4bf0e3c83b8b 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -33,6 +33,11 @@
* Dave Jones : Report invalid combinations of Athlon CPUs.
* Rusty Russell : Hacked into shape for new "hotplug" boot process. */
+
+/* SMP boot always wants to use real time delay to allow sufficient time for
+ * the APs to come online */
+#define USE_REAL_TIME_DELAY
+
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
@@ -52,6 +57,8 @@
#include <asm/desc.h>
#include <asm/arch_hooks.h>
#include <asm/nmi.h>
+#include <asm/pda.h>
+#include <asm/genapic.h>
#include <mach_apic.h>
#include <mach_wakecpu.h>
@@ -536,11 +543,11 @@ set_cpu_sibling_map(int cpu)
static void __devinit start_secondary(void *unused)
{
/*
- * Dont put anything before smp_callin(), SMP
+ * Don't put *anything* before secondary_cpu_init(), SMP
* booting is too fragile that we want to limit the
* things done here to the most necessary things.
*/
- cpu_init();
+ secondary_cpu_init();
preempt_disable();
smp_callin();
while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
@@ -599,13 +606,16 @@ void __devinit initialize_secondary(void)
"movl %0,%%esp\n\t"
"jmp *%1"
:
- :"r" (current->thread.esp),"r" (current->thread.eip));
+ :"m" (current->thread.esp),"m" (current->thread.eip));
}
+/* Static state in head.S used to set up a CPU */
extern struct {
void * esp;
unsigned short ss;
} stack_start;
+extern struct i386_pda *start_pda;
+extern struct Xgt_desc_struct cpu_gdt_descr;
#ifdef CONFIG_NUMA
@@ -936,9 +946,6 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
unsigned long start_eip;
unsigned short nmi_high = 0, nmi_low = 0;
- ++cpucount;
- alternatives_smp_switch(1);
-
/*
* We can't use kernel_thread since we must avoid to
* reschedule the child.
@@ -946,15 +953,30 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
idle = alloc_idle_task(cpu);
if (IS_ERR(idle))
panic("failed fork for CPU %d", cpu);
+
+ /* Pre-allocate and initialize the CPU's GDT and PDA so it
+ doesn't have to do any memory allocation during the
+ delicate CPU-bringup phase. */
+ if (!init_gdt(cpu, idle)) {
+ printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu);
+ return -1; /* ? */
+ }
+
idle->thread.eip = (unsigned long) start_secondary;
/* start_eip had better be page-aligned! */
start_eip = setup_trampoline();
+ ++cpucount;
+ alternatives_smp_switch(1);
+
/* So we see what's up */
printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
/* Stack for startup_32 can be just as for start_secondary onwards */
stack_start.esp = (void *) idle->thread.esp;
+ start_pda = cpu_pda(cpu);
+ cpu_gdt_descr = per_cpu(cpu_gdt_descr, cpu);
+
irq_ctx_init(cpu);
x86_cpu_to_apicid[cpu] = apicid;
@@ -1109,34 +1131,15 @@ exit:
}
#endif
-static void smp_tune_scheduling (void)
+static void smp_tune_scheduling(void)
{
unsigned long cachesize; /* kB */
- unsigned long bandwidth = 350; /* MB/s */
- /*
- * Rough estimation for SMP scheduling, this is the number of
- * cycles it takes for a fully memory-limited process to flush
- * the SMP-local cache.
- *
- * (For a P5 this pretty much means we will choose another idle
- * CPU almost always at wakeup time (this is due to the small
- * L1 cache), on PIIs it's around 50-100 usecs, depending on
- * the cache size)
- */
- if (!cpu_khz) {
- /*
- * this basically disables processor-affinity
- * scheduling on SMP without a TSC.
- */
- return;
- } else {
+ if (cpu_khz) {
cachesize = boot_cpu_data.x86_cache_size;
- if (cachesize == -1) {
- cachesize = 16; /* Pentiums, 2x8kB cache */
- bandwidth = 100;
- }
- max_cache_size = cachesize * 1024;
+
+ if (cachesize > 0)
+ max_cache_size = cachesize * 1024;
}
}
@@ -1462,6 +1465,12 @@ int __devinit __cpu_up(unsigned int cpu)
cpu_set(cpu, smp_commenced_mask);
while (!cpu_isset(cpu, cpu_online_map))
cpu_relax();
+
+#ifdef CONFIG_X86_GENERICARCH
+ if (num_online_cpus() > 8 && genapic == &apic_default)
+ panic("Default flat APIC routing can't be used with > 8 cpus\n");
+#endif
+
return 0;
}
diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
index 713ba39d32c6..7de9117b5a3a 100644
--- a/arch/i386/kernel/sysenter.c
+++ b/arch/i386/kernel/sysenter.c
@@ -27,7 +27,11 @@
* Should the kernel map a VDSO page into processes and pass its
* address down to glibc upon exec()?
*/
+#ifdef CONFIG_PARAVIRT
+unsigned int __read_mostly vdso_enabled = 0;
+#else
unsigned int __read_mostly vdso_enabled = 1;
+#endif
EXPORT_SYMBOL_GPL(vdso_enabled);
@@ -132,7 +136,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
goto up_fail;
}
- vma = kmem_cache_zalloc(vm_area_cachep, SLAB_KERNEL);
+ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
if (!vma) {
ret = -ENOMEM;
goto up_fail;
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 78af572fd17c..c505b16c0990 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -56,6 +56,7 @@
#include <asm/uaccess.h>
#include <asm/processor.h>
#include <asm/timer.h>
+#include <asm/time.h>
#include "mach_time.h"
@@ -116,10 +117,7 @@ static int set_rtc_mmss(unsigned long nowtime)
/* gets recalled with irq locally disabled */
/* XXX - does irqsave resolve this? -johnstul */
spin_lock_irqsave(&rtc_lock, flags);
- if (efi_enabled)
- retval = efi_set_rtc_mmss(nowtime);
- else
- retval = mach_set_rtc_mmss(nowtime);
+ retval = set_wallclock(nowtime);
spin_unlock_irqrestore(&rtc_lock, flags);
return retval;
@@ -223,10 +221,7 @@ unsigned long get_cmos_time(void)
spin_lock_irqsave(&rtc_lock, flags);
- if (efi_enabled)
- retval = efi_get_time();
- else
- retval = mach_get_cmos_time();
+ retval = get_wallclock();
spin_unlock_irqrestore(&rtc_lock, flags);
@@ -370,7 +365,7 @@ static void __init hpet_time_init(void)
printk("Using HPET for base-timer\n");
}
- time_init_hook();
+ do_time_init();
}
#endif
@@ -392,5 +387,5 @@ void __init time_init(void)
do_settimeofday(&ts);
- time_init_hook();
+ do_time_init();
}
diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c
index 1a2a979cf6a3..1e4702dfcd01 100644
--- a/arch/i386/kernel/time_hpet.c
+++ b/arch/i386/kernel/time_hpet.c
@@ -132,14 +132,20 @@ int __init hpet_enable(void)
* the single HPET timer for system time.
*/
#ifdef CONFIG_HPET_EMULATE_RTC
- if (!(id & HPET_ID_NUMBER))
+ if (!(id & HPET_ID_NUMBER)) {
+ iounmap(hpet_virt_address);
+ hpet_virt_address = NULL;
return -1;
+ }
#endif
hpet_period = hpet_readl(HPET_PERIOD);
- if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD))
+ if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) {
+ iounmap(hpet_virt_address);
+ hpet_virt_address = NULL;
return -1;
+ }
/*
* 64 bit math
@@ -156,8 +162,11 @@ int __init hpet_enable(void)
hpet_use_timer = id & HPET_ID_LEGSUP;
- if (hpet_timer_stop_set_go(hpet_tick))
+ if (hpet_timer_stop_set_go(hpet_tick)) {
+ iounmap(hpet_virt_address);
+ hpet_virt_address = NULL;
return -1;
+ }
use_hpet = 1;
diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c
index 07d6da36a825..79cf608e14ca 100644
--- a/arch/i386/kernel/topology.c
+++ b/arch/i386/kernel/topology.c
@@ -40,14 +40,18 @@ int arch_register_cpu(int num)
* restrictions and assumptions in kernel. This basically
* doesnt add a control file, one cannot attempt to offline
* BSP.
+ *
+ * Also certain PCI quirks require not to enable hotplug control
+ * for all CPU's.
*/
- if (!num)
- cpu_devices[num].cpu.no_control = 1;
+ if (num && enable_cpu_hotplug)
+ cpu_devices[num].cpu.hotpluggable = 1;
return register_cpu(&cpu_devices[num].cpu, num);
}
#ifdef CONFIG_HOTPLUG_CPU
+int enable_cpu_hotplug = 1;
void arch_unregister_cpu(int num) {
return unregister_cpu(&cpu_devices[num].cpu);
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index fe9c5e8e7e6f..68de48e498ca 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -29,6 +29,7 @@
#include <linux/kexec.h>
#include <linux/unwind.h>
#include <linux/uaccess.h>
+#include <linux/nmi.h>
#ifdef CONFIG_EISA
#include <linux/ioport.h>
@@ -61,9 +62,6 @@ int panic_on_unrecovered_nmi;
asmlinkage int system_call(void);
-struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
- { 0, 0 }, { 0, 0 } };
-
/* Do we ignore FPU interrupts ? */
char ignore_fpu_irq = 0;
@@ -94,7 +92,7 @@ asmlinkage void alignment_check(void);
asmlinkage void spurious_interrupt_bug(void);
asmlinkage void machine_check(void);
-static int kstack_depth_to_print = 24;
+int kstack_depth_to_print = 24;
#ifdef CONFIG_STACK_UNWIND
static int call_trace = 1;
#else
@@ -163,16 +161,25 @@ dump_trace_unwind(struct unwind_frame_info *info, void *data)
{
struct ops_and_data *oad = (struct ops_and_data *)data;
int n = 0;
+ unsigned long sp = UNW_SP(info);
+ if (arch_unw_user_mode(info))
+ return -1;
while (unwind(info) == 0 && UNW_PC(info)) {
n++;
oad->ops->address(oad->data, UNW_PC(info));
if (arch_unw_user_mode(info))
break;
+ if ((sp & ~(PAGE_SIZE - 1)) == (UNW_SP(info) & ~(PAGE_SIZE - 1))
+ && sp > UNW_SP(info))
+ break;
+ sp = UNW_SP(info);
}
return n;
}
+#define MSG(msg) ops->warning(data, msg)
+
void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack,
struct stacktrace_ops *ops, void *data)
@@ -191,29 +198,31 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
if (unwind_init_frame_info(&info, task, regs) == 0)
unw_ret = dump_trace_unwind(&info, &oad);
} else if (task == current)
- unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
+ unw_ret = unwind_init_running(&info, dump_trace_unwind,
+ &oad);
else {
if (unwind_init_blocked(&info, task) == 0)
unw_ret = dump_trace_unwind(&info, &oad);
}
if (unw_ret > 0) {
if (call_trace == 1 && !arch_unw_user_mode(&info)) {
- ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
+ ops->warning_symbol(data,
+ "DWARF2 unwinder stuck at %s",
UNW_PC(&info));
if (UNW_SP(&info) >= PAGE_OFFSET) {
- ops->warning(data, "Leftover inexact backtrace:\n");
+ MSG("Leftover inexact backtrace:");
stack = (void *)UNW_SP(&info);
if (!stack)
return;
ebp = UNW_FP(&info);
} else
- ops->warning(data, "Full inexact backtrace again:\n");
+ MSG("Full inexact backtrace again:");
} else if (call_trace >= 1)
return;
else
- ops->warning(data, "Full inexact backtrace again:\n");
+ MSG("Full inexact backtrace again:");
} else
- ops->warning(data, "Inexact backtrace:\n");
+ MSG("Inexact backtrace:");
}
if (!stack) {
unsigned long dummy;
@@ -247,6 +256,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
stack = (unsigned long*)context->previous_esp;
if (!stack)
break;
+ touch_nmi_watchdog();
}
}
EXPORT_SYMBOL(dump_trace);
@@ -379,7 +389,7 @@ void show_registers(struct pt_regs *regs)
* time of the fault..
*/
if (in_kernel) {
- u8 __user *eip;
+ u8 *eip;
int code_bytes = 64;
unsigned char c;
@@ -388,18 +398,20 @@ void show_registers(struct pt_regs *regs)
printk(KERN_EMERG "Code: ");
- eip = (u8 __user *)regs->eip - 43;
- if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
+ eip = (u8 *)regs->eip - 43;
+ if (eip < (u8 *)PAGE_OFFSET ||
+ probe_kernel_address(eip, c)) {
/* try starting at EIP */
- eip = (u8 __user *)regs->eip;
+ eip = (u8 *)regs->eip;
code_bytes = 32;
}
for (i = 0; i < code_bytes; i++, eip++) {
- if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
+ if (eip < (u8 *)PAGE_OFFSET ||
+ probe_kernel_address(eip, c)) {
printk(" Bad EIP value.");
break;
}
- if (eip == (u8 __user *)regs->eip)
+ if (eip == (u8 *)regs->eip)
printk("<%02x> ", c);
else
printk("%02x ", c);
@@ -415,7 +427,7 @@ static void handle_BUG(struct pt_regs *regs)
if (eip < PAGE_OFFSET)
return;
- if (probe_kernel_address((unsigned short __user *)eip, ud2))
+ if (probe_kernel_address((unsigned short *)eip, ud2))
return;
if (ud2 != 0x0b0f)
return;
@@ -428,11 +440,11 @@ static void handle_BUG(struct pt_regs *regs)
char *file;
char c;
- if (probe_kernel_address((unsigned short __user *)(eip + 2),
- line))
+ if (probe_kernel_address((unsigned short *)(eip + 2), line))
break;
- if (__get_user(file, (char * __user *)(eip + 4)) ||
- (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
+ if (probe_kernel_address((char **)(eip + 4), file) ||
+ (unsigned long)file < PAGE_OFFSET ||
+ probe_kernel_address(file, c))
file = "<bad filename>";
printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line);
@@ -452,7 +464,7 @@ void die(const char * str, struct pt_regs * regs, long err)
u32 lock_owner;
int lock_owner_depth;
} die = {
- .lock = SPIN_LOCK_UNLOCKED,
+ .lock = __SPIN_LOCK_UNLOCKED(die.lock),
.lock_owner = -1,
.lock_owner_depth = 0
};
@@ -707,8 +719,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
{
printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
"CPU %d.\n", reason, smp_processor_id());
- printk(KERN_EMERG "You probably have a hardware problem with your RAM "
- "chips\n");
+ printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
if (panic_on_unrecovered_nmi)
panic("NMI: Not continuing");
@@ -773,7 +784,6 @@ void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
printk(" on CPU%d, eip %08lx, registers:\n",
smp_processor_id(), regs->eip);
show_registers(regs);
- printk(KERN_EMERG "console shuts up ...\n");
console_silent();
spin_unlock(&nmi_print_lock);
bust_spinlocks(0);
@@ -1088,49 +1098,24 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
#endif
}
-fastcall void setup_x86_bogus_stack(unsigned char * stk)
-{
- unsigned long *switch16_ptr, *switch32_ptr;
- struct pt_regs *regs;
- unsigned long stack_top, stack_bot;
- unsigned short iret_frame16_off;
- int cpu = smp_processor_id();
- /* reserve the space on 32bit stack for the magic switch16 pointer */
- memmove(stk, stk + 8, sizeof(struct pt_regs));
- switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
- regs = (struct pt_regs *)stk;
- /* now the switch32 on 16bit stack */
- stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
- stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
- switch32_ptr = (unsigned long *)(stack_top - 8);
- iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
- /* copy iret frame on 16bit stack */
- memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
- /* fill in the switch pointers */
- switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
- switch16_ptr[1] = __ESPFIX_SS;
- switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
- 8 - CPU_16BIT_STACK_SIZE;
- switch32_ptr[1] = __KERNEL_DS;
-}
-
-fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
+fastcall unsigned long patch_espfix_desc(unsigned long uesp,
+ unsigned long kesp)
{
- unsigned long *switch32_ptr;
- unsigned char *stack16, *stack32;
- unsigned long stack_top, stack_bot;
- int len;
int cpu = smp_processor_id();
- stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
- stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
- switch32_ptr = (unsigned long *)(stack_top - 8);
- /* copy the data from 16bit stack to 32bit stack */
- len = CPU_16BIT_STACK_SIZE - 8 - sp;
- stack16 = (unsigned char *)(stack_bot + sp);
- stack32 = (unsigned char *)
- (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
- memcpy(stack32, stack16, len);
- return stack32;
+ struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
+ struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address;
+ unsigned long base = (kesp - uesp) & -THREAD_SIZE;
+ unsigned long new_kesp = kesp - base;
+ unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
+ __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
+ /* Set up base for espfix segment */
+ desc &= 0x00f0ff0000000000ULL;
+ desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
+ ((((__u64)base) << 32) & 0xff00000000000000ULL) |
+ ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
+ (lim_pages & 0xffff);
+ *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
+ return new_kesp;
}
/*
@@ -1143,7 +1128,7 @@ fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
* Must be called with kernel preemption disabled (in this case,
* local interrupts are disabled at the call-site in entry.S).
*/
-asmlinkage void math_state_restore(struct pt_regs regs)
+asmlinkage void math_state_restore(void)
{
struct thread_info *thread = current_thread_info();
struct task_struct *tsk = thread->task;
@@ -1153,6 +1138,7 @@ asmlinkage void math_state_restore(struct pt_regs regs)
init_fpu(tsk);
restore_fpu(tsk);
thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
+ tsk->fpu_counter++;
}
#ifndef CONFIG_MATH_EMULATION
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 9810c8c90750..1bbe45dca7a0 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -13,7 +13,6 @@
#include <asm/delay.h>
#include <asm/tsc.h>
-#include <asm/delay.h>
#include <asm/io.h>
#include "mach_timer.h"
diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c
index cbcd61d6120b..be2f96e67f78 100644
--- a/arch/i386/kernel/vm86.c
+++ b/arch/i386/kernel/vm86.c
@@ -43,6 +43,7 @@
#include <linux/highmem.h>
#include <linux/ptrace.h>
#include <linux/audit.h>
+#include <linux/stddef.h>
#include <asm/uaccess.h>
#include <asm/io.h>
@@ -72,10 +73,10 @@
/*
* 8- and 16-bit register defines..
*/
-#define AL(regs) (((unsigned char *)&((regs)->eax))[0])
-#define AH(regs) (((unsigned char *)&((regs)->eax))[1])
-#define IP(regs) (*(unsigned short *)&((regs)->eip))
-#define SP(regs) (*(unsigned short *)&((regs)->esp))
+#define AL(regs) (((unsigned char *)&((regs)->pt.eax))[0])
+#define AH(regs) (((unsigned char *)&((regs)->pt.eax))[1])
+#define IP(regs) (*(unsigned short *)&((regs)->pt.eip))
+#define SP(regs) (*(unsigned short *)&((regs)->pt.esp))
/*
* virtual flags (16 and 32-bit versions)
@@ -89,10 +90,37 @@
#define SAFE_MASK (0xDD5)
#define RETURN_MASK (0xDFF)
-#define VM86_REGS_PART2 orig_eax
-#define VM86_REGS_SIZE1 \
- ( (unsigned)( & (((struct kernel_vm86_regs *)0)->VM86_REGS_PART2) ) )
-#define VM86_REGS_SIZE2 (sizeof(struct kernel_vm86_regs) - VM86_REGS_SIZE1)
+/* convert kernel_vm86_regs to vm86_regs */
+static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
+ const struct kernel_vm86_regs *regs)
+{
+ int ret = 0;
+
+ /* kernel_vm86_regs is missing xfs, so copy everything up to
+ (but not including) xgs, and then rest after xgs. */
+ ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.xgs));
+ ret += copy_to_user(&user->__null_gs, &regs->pt.xgs,
+ sizeof(struct kernel_vm86_regs) -
+ offsetof(struct kernel_vm86_regs, pt.xgs));
+
+ return ret;
+}
+
+/* convert vm86_regs to kernel_vm86_regs */
+static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
+ const struct vm86_regs __user *user,
+ unsigned extra)
+{
+ int ret = 0;
+
+ ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.xgs));
+ ret += copy_from_user(&regs->pt.xgs, &user->__null_gs,
+ sizeof(struct kernel_vm86_regs) -
+ offsetof(struct kernel_vm86_regs, pt.xgs) +
+ extra);
+
+ return ret;
+}
struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs));
struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
@@ -112,10 +140,8 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
printk("no vm86_info: BAD\n");
do_exit(SIGSEGV);
}
- set_flags(regs->eflags, VEFLAGS, VIF_MASK | current->thread.v86mask);
- tmp = copy_to_user(&current->thread.vm86_info->regs,regs, VM86_REGS_SIZE1);
- tmp += copy_to_user(&current->thread.vm86_info->regs.VM86_REGS_PART2,
- &regs->VM86_REGS_PART2, VM86_REGS_SIZE2);
+ set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask);
+ tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs,regs);
tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap);
if (tmp) {
printk("vm86: could not access userspace vm86_info\n");
@@ -129,9 +155,11 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
current->thread.saved_esp0 = 0;
put_cpu();
- loadsegment(fs, current->thread.saved_fs);
- loadsegment(gs, current->thread.saved_gs);
ret = KVM86->regs32;
+
+ loadsegment(fs, current->thread.saved_fs);
+ ret->xgs = current->thread.saved_gs;
+
return ret;
}
@@ -183,9 +211,9 @@ asmlinkage int sys_vm86old(struct pt_regs regs)
tsk = current;
if (tsk->thread.saved_esp0)
goto out;
- tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1);
- tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2,
- (long)&info.vm86plus - (long)&info.regs.VM86_REGS_PART2);
+ tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
+ offsetof(struct kernel_vm86_struct, vm86plus) -
+ sizeof(info.regs));
ret = -EFAULT;
if (tmp)
goto out;
@@ -233,9 +261,9 @@ asmlinkage int sys_vm86(struct pt_regs regs)
if (tsk->thread.saved_esp0)
goto out;
v86 = (struct vm86plus_struct __user *)regs.ecx;
- tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1);
- tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2,
- (long)&info.regs32 - (long)&info.regs.VM86_REGS_PART2);
+ tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
+ offsetof(struct kernel_vm86_struct, regs32) -
+ sizeof(info.regs));
ret = -EFAULT;
if (tmp)
goto out;
@@ -252,15 +280,15 @@ out:
static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
{
struct tss_struct *tss;
- long eax;
/*
* make sure the vm86() system call doesn't try to do anything silly
*/
- info->regs.__null_ds = 0;
- info->regs.__null_es = 0;
+ info->regs.pt.xds = 0;
+ info->regs.pt.xes = 0;
+ info->regs.pt.xgs = 0;
-/* we are clearing fs,gs later just before "jmp resume_userspace",
- * because starting with Linux 2.1.x they aren't no longer saved/restored
+/* we are clearing fs later just before "jmp resume_userspace",
+ * because it is not saved/restored.
*/
/*
@@ -268,10 +296,10 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
* has set it up safely, so this makes sure interrupt etc flags are
* inherited from protected mode.
*/
- VEFLAGS = info->regs.eflags;
- info->regs.eflags &= SAFE_MASK;
- info->regs.eflags |= info->regs32->eflags & ~SAFE_MASK;
- info->regs.eflags |= VM_MASK;
+ VEFLAGS = info->regs.pt.eflags;
+ info->regs.pt.eflags &= SAFE_MASK;
+ info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK;
+ info->regs.pt.eflags |= VM_MASK;
switch (info->cpu_type) {
case CPU_286:
@@ -294,7 +322,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
info->regs32->eax = 0;
tsk->thread.saved_esp0 = tsk->thread.esp0;
savesegment(fs, tsk->thread.saved_fs);
- savesegment(gs, tsk->thread.saved_gs);
+ tsk->thread.saved_gs = info->regs32->xgs;
tss = &per_cpu(init_tss, get_cpu());
tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
@@ -306,19 +334,18 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
tsk->thread.screen_bitmap = info->screen_bitmap;
if (info->flags & VM86_SCREEN_BITMAP)
mark_screen_rdonly(tsk->mm);
- __asm__ __volatile__("xorl %eax,%eax; movl %eax,%fs; movl %eax,%gs\n\t");
- __asm__ __volatile__("movl %%eax, %0\n" :"=r"(eax));
/*call audit_syscall_exit since we do not exit via the normal paths */
if (unlikely(current->audit_context))
- audit_syscall_exit(AUDITSC_RESULT(eax), eax);
+ audit_syscall_exit(AUDITSC_RESULT(0), 0);
__asm__ __volatile__(
"movl %0,%%esp\n\t"
"movl %1,%%ebp\n\t"
+ "mov %2, %%fs\n\t"
"jmp resume_userspace"
: /* no outputs */
- :"r" (&info->regs), "r" (task_thread_info(tsk)));
+ :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
/* we never return here */
}
@@ -348,12 +375,12 @@ static inline void clear_IF(struct kernel_vm86_regs * regs)
static inline void clear_TF(struct kernel_vm86_regs * regs)
{
- regs->eflags &= ~TF_MASK;
+ regs->pt.eflags &= ~TF_MASK;
}
static inline void clear_AC(struct kernel_vm86_regs * regs)
{
- regs->eflags &= ~AC_MASK;
+ regs->pt.eflags &= ~AC_MASK;
}
/* It is correct to call set_IF(regs) from the set_vflags_*
@@ -370,7 +397,7 @@ static inline void clear_AC(struct kernel_vm86_regs * regs)
static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs)
{
set_flags(VEFLAGS, eflags, current->thread.v86mask);
- set_flags(regs->eflags, eflags, SAFE_MASK);
+ set_flags(regs->pt.eflags, eflags, SAFE_MASK);
if (eflags & IF_MASK)
set_IF(regs);
else
@@ -380,7 +407,7 @@ static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs
static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs)
{
set_flags(VFLAGS, flags, current->thread.v86mask);
- set_flags(regs->eflags, flags, SAFE_MASK);
+ set_flags(regs->pt.eflags, flags, SAFE_MASK);
if (flags & IF_MASK)
set_IF(regs);
else
@@ -389,7 +416,7 @@ static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_reg
static inline unsigned long get_vflags(struct kernel_vm86_regs * regs)
{
- unsigned long flags = regs->eflags & RETURN_MASK;
+ unsigned long flags = regs->pt.eflags & RETURN_MASK;
if (VEFLAGS & VIF_MASK)
flags |= IF_MASK;
@@ -493,7 +520,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
unsigned long __user *intr_ptr;
unsigned long segoffs;
- if (regs->cs == BIOSSEG)
+ if (regs->pt.xcs == BIOSSEG)
goto cannot_handle;
if (is_revectored(i, &KVM86->int_revectored))
goto cannot_handle;
@@ -505,9 +532,9 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
if ((segoffs >> 16) == BIOSSEG)
goto cannot_handle;
pushw(ssp, sp, get_vflags(regs), cannot_handle);
- pushw(ssp, sp, regs->cs, cannot_handle);
+ pushw(ssp, sp, regs->pt.xcs, cannot_handle);
pushw(ssp, sp, IP(regs), cannot_handle);
- regs->cs = segoffs >> 16;
+ regs->pt.xcs = segoffs >> 16;
SP(regs) -= 6;
IP(regs) = segoffs & 0xffff;
clear_TF(regs);
@@ -524,7 +551,7 @@ int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno
if (VMPI.is_vm86pus) {
if ( (trapno==3) || (trapno==1) )
return_to_32bit(regs, VM86_TRAP + (trapno << 8));
- do_int(regs, trapno, (unsigned char __user *) (regs->ss << 4), SP(regs));
+ do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs));
return 0;
}
if (trapno !=1)
@@ -560,10 +587,10 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
handle_vm86_trap(regs, 0, 1); \
return; } while (0)
- orig_flags = *(unsigned short *)&regs->eflags;
+ orig_flags = *(unsigned short *)&regs->pt.eflags;
- csp = (unsigned char __user *) (regs->cs << 4);
- ssp = (unsigned char __user *) (regs->ss << 4);
+ csp = (unsigned char __user *) (regs->pt.xcs << 4);
+ ssp = (unsigned char __user *) (regs->pt.xss << 4);
sp = SP(regs);
ip = IP(regs);
@@ -650,7 +677,7 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
SP(regs) += 6;
}
IP(regs) = newip;
- regs->cs = newcs;
+ regs->pt.xcs = newcs;
CHECK_IF_IN_TRAP;
if (data32) {
set_vflags_long(newflags, regs);
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index c6f84a0322ba..56e6ad5cb045 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -1,13 +1,26 @@
/* ld script to make i386 Linux kernel
* Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
+ *
+ * Don't define absolute symbols until and unless you know that symbol
+ * value is should remain constant even if kernel image is relocated
+ * at run time. Absolute symbols are not relocated. If symbol value should
+ * change if kernel is relocated, make the symbol section relative and
+ * put it inside the section definition.
*/
+/* Don't define absolute symbols until and unless you know that symbol
+ * value is should remain constant even if kernel image is relocated
+ * at run time. Absolute symbols are not relocated. If symbol value should
+ * change if kernel is relocated, make the symbol section relative and
+ * put it inside the section definition.
+ */
#define LOAD_OFFSET __PAGE_OFFSET
#include <asm-generic/vmlinux.lds.h>
#include <asm/thread_info.h>
#include <asm/page.h>
#include <asm/cache.h>
+#include <asm/boot.h>
OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
OUTPUT_ARCH(i386)
@@ -21,34 +34,35 @@ PHDRS {
}
SECTIONS
{
- . = __KERNEL_START;
+ . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
phys_startup_32 = startup_32 - LOAD_OFFSET;
/* read-only */
- _text = .; /* Text and read-only data */
.text : AT(ADDR(.text) - LOAD_OFFSET) {
+ _text = .; /* Text and read-only data */
*(.text)
SCHED_TEXT
LOCK_TEXT
KPROBES_TEXT
*(.fixup)
*(.gnu.warning)
- } :text = 0x9090
-
- _etext = .; /* End of text section */
+ _etext = .; /* End of text section */
+ } :text = 0x9090
. = ALIGN(16); /* Exception table */
- __start___ex_table = .;
- __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
- __stop___ex_table = .;
+ __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
+ __start___ex_table = .;
+ *(__ex_table)
+ __stop___ex_table = .;
+ }
RODATA
. = ALIGN(4);
- __tracedata_start = .;
.tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
+ __tracedata_start = .;
*(.tracedata)
+ __tracedata_end = .;
}
- __tracedata_end = .;
/* writeable */
. = ALIGN(4096);
@@ -57,11 +71,19 @@ SECTIONS
CONSTRUCTORS
} :data
+ .paravirtprobe : AT(ADDR(.paravirtprobe) - LOAD_OFFSET) {
+ __start_paravirtprobe = .;
+ *(.paravirtprobe)
+ __stop_paravirtprobe = .;
+ }
+
. = ALIGN(4096);
- __nosave_begin = .;
- .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
- . = ALIGN(4096);
- __nosave_end = .;
+ .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+ __nosave_begin = .;
+ *(.data.nosave)
+ . = ALIGN(4096);
+ __nosave_end = .;
+ }
. = ALIGN(4096);
.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
@@ -75,17 +97,10 @@ SECTIONS
/* rarely changed data like cpu maps */
. = ALIGN(32);
- .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) }
- _edata = .; /* End of data section */
-
-#ifdef CONFIG_STACK_UNWIND
- . = ALIGN(4);
- .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) {
- __start_unwind = .;
- *(.eh_frame)
- __end_unwind = .;
+ .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
+ *(.data.read_mostly)
+ _edata = .; /* End of data section */
}
-#endif
. = ALIGN(THREAD_SIZE); /* init_task */
.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
@@ -94,88 +109,102 @@ SECTIONS
/* might get freed after init */
. = ALIGN(4096);
- __smp_alt_begin = .;
- __smp_alt_instructions = .;
.smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) {
+ __smp_alt_begin = .;
+ __smp_alt_instructions = .;
*(.smp_altinstructions)
+ __smp_alt_instructions_end = .;
}
- __smp_alt_instructions_end = .;
. = ALIGN(4);
- __smp_locks = .;
.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+ __smp_locks = .;
*(.smp_locks)
+ __smp_locks_end = .;
}
- __smp_locks_end = .;
.smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) {
*(.smp_altinstr_replacement)
+ __smp_alt_end = .;
}
+ /* will be freed after init
+ * Following ALIGN() is required to make sure no other data falls on the
+ * same page where __smp_alt_end is pointing as that page might be freed
+ * after boot. Always make sure that ALIGN() directive is present after
+ * the section which contains __smp_alt_end.
+ */
. = ALIGN(4096);
- __smp_alt_end = .;
/* will be freed after init */
. = ALIGN(4096); /* Init code and data */
- __init_begin = .;
.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
+ __init_begin = .;
_sinittext = .;
*(.init.text)
_einittext = .;
}
.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
. = ALIGN(16);
- __setup_start = .;
- .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
- __setup_end = .;
- __initcall_start = .;
+ .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+ __setup_start = .;
+ *(.init.setup)
+ __setup_end = .;
+ }
.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+ __initcall_start = .;
INITCALLS
+ __initcall_end = .;
}
- __initcall_end = .;
- __con_initcall_start = .;
.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+ __con_initcall_start = .;
*(.con_initcall.init)
+ __con_initcall_end = .;
}
- __con_initcall_end = .;
SECURITY_INIT
. = ALIGN(4);
- __alt_instructions = .;
.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+ __alt_instructions = .;
*(.altinstructions)
+ __alt_instructions_end = .;
}
- __alt_instructions_end = .;
.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
*(.altinstr_replacement)
}
+ . = ALIGN(4);
+ .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+ __start_parainstructions = .;
+ *(.parainstructions)
+ __stop_parainstructions = .;
+ }
/* .exit.text is discard at runtime, not link time, to deal with references
from .altinstructions and .eh_frame */
.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
. = ALIGN(4096);
- __initramfs_start = .;
- .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
- __initramfs_end = .;
+ .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+ __initramfs_start = .;
+ *(.init.ramfs)
+ __initramfs_end = .;
+ }
. = ALIGN(L1_CACHE_BYTES);
- __per_cpu_start = .;
- .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
- __per_cpu_end = .;
+ .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
+ __per_cpu_start = .;
+ *(.data.percpu)
+ __per_cpu_end = .;
+ }
. = ALIGN(4096);
- __init_end = .;
/* freed after init ends here */
- __bss_start = .; /* BSS */
- .bss.page_aligned : AT(ADDR(.bss.page_aligned) - LOAD_OFFSET) {
- *(.bss.page_aligned)
- }
.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+ __init_end = .;
+ __bss_start = .; /* BSS */
+ *(.bss.page_aligned)
*(.bss)
+ . = ALIGN(4);
+ __bss_stop = .;
+ _end = . ;
+ /* This is where the kernel creates the early boot page tables */
+ . = ALIGN(4096);
+ pg0 = . ;
}
- . = ALIGN(4);
- __bss_stop = .;
-
- _end = . ;
-
- /* This is where the kernel creates the early boot page tables */
- . = ALIGN(4096);
- pg0 = .;
/* Sections to be discarded */
/DISCARD/ : {