diff options
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/Kconfig | 2 | ||||
-rw-r--r-- | arch/x86/boot/Makefile | 23 | ||||
-rw-r--r-- | arch/x86/boot/header.S | 29 | ||||
-rw-r--r-- | arch/x86/boot/pm.c | 44 | ||||
-rw-r--r-- | arch/x86/boot/tools/build.c | 9 | ||||
-rw-r--r-- | arch/x86/include/asm/boot.h | 4 | ||||
-rw-r--r-- | arch/x86/include/asm/highmem.h | 1 | ||||
-rw-r--r-- | arch/x86/include/asm/kexec.h | 13 | ||||
-rw-r--r-- | arch/x86/include/asm/linkage.h | 13 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 1 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce_64.c | 4 | ||||
-rw-r--r-- | arch/x86/kernel/machine_kexec_32.c | 17 | ||||
-rw-r--r-- | arch/x86/kernel/machine_kexec_64.c | 99 | ||||
-rw-r--r-- | arch/x86/kernel/relocate_kernel_32.S | 24 | ||||
-rw-r--r-- | arch/x86/kernel/relocate_kernel_64.S | 189 | ||||
-rw-r--r-- | arch/x86/kernel/visws_quirks.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/vmlinux_64.lds.S | 7 | ||||
-rw-r--r-- | arch/x86/lguest/boot.c | 21 | ||||
-rw-r--r-- | arch/x86/mm/highmem_32.c | 19 | ||||
-rw-r--r-- | arch/x86/mm/iomap_32.c | 13 | ||||
-rw-r--r-- | arch/x86/mm/kmmio.c | 2 | ||||
-rw-r--r-- | arch/x86/mm/pageattr.c | 11 |
22 files changed, 351 insertions, 196 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 03f0a3aa43b1..7fcf85182681 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1437,7 +1437,7 @@ config CRASH_DUMP config KEXEC_JUMP bool "kexec jump (EXPERIMENTAL)" depends on EXPERIMENTAL - depends on KEXEC && HIBERNATION && X86_32 + depends on KEXEC && HIBERNATION ---help--- Jump between original kernel and kexeced kernel and invoke code in physical address mode via KEXEC diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index c70eff69a1fb..57a29fecf6bb 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -6,26 +6,23 @@ # for more details. # # Copyright (C) 1994 by Linus Torvalds +# Changed by many, many contributors over the years. # # ROOT_DEV specifies the default root-device when making the image. # This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case # the default of FLOPPY is used by 'build'. -ROOT_DEV := CURRENT +ROOT_DEV := CURRENT # If you want to preset the SVGA mode, uncomment the next line and # set SVGA_MODE to whatever number you want. # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. # The number is the same as you would ordinarily press at bootup. -SVGA_MODE := -DSVGA_MODE=NORMAL_VGA +SVGA_MODE := -DSVGA_MODE=NORMAL_VGA -# If you want the RAM disk device, define this to be the size in blocks. - -#RAMDISK := -DRAMDISK=512 - -targets := vmlinux.bin setup.bin setup.elf zImage bzImage +targets := vmlinux.bin setup.bin setup.elf bzImage subdir- := compressed setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o @@ -71,17 +68,13 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ KBUILD_CFLAGS += $(call cc-option,-m32) KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ -$(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__ -$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ -$(obj)/bzImage: BUILDFLAGS := -b +$(obj)/bzImage: asflags-y := $(SVGA_MODE) quiet_cmd_image = BUILD $@ -cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/setup.bin \ - $(obj)/vmlinux.bin $(ROOT_DEV) > $@ +cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \ + $(ROOT_DEV) > $@ -$(obj)/zImage $(obj)/bzImage: $(obj)/setup.bin \ - $(obj)/vmlinux.bin $(obj)/tools/build FORCE +$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE $(call if_changed,image) @echo 'Kernel: $@ is ready' ' (#'`cat .version`')' diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 7ccff4884a23..5d84d1c74e4c 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -24,12 +24,8 @@ #include "boot.h" #include "offsets.h" -SETUPSECTS = 4 /* default nr of setup-sectors */ BOOTSEG = 0x07C0 /* original address of boot-sector */ -SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */ -SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */ - /* to be loaded */ -ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */ +SYSSEG = 0x1000 /* historical load address >> 4 */ #ifndef SVGA_MODE #define SVGA_MODE ASK_VGA @@ -97,12 +93,12 @@ bugger_off_msg: .section ".header", "a" .globl hdr hdr: -setup_sects: .byte SETUPSECTS +setup_sects: .byte 0 /* Filled in by build.c */ root_flags: .word ROOT_RDONLY -syssize: .long SYSSIZE -ram_size: .word RAMDISK +syssize: .long 0 /* Filled in by build.c */ +ram_size: .word 0 /* Obsolete */ vid_mode: .word SVGA_MODE -root_dev: .word ROOT_DEV +root_dev: .word 0 /* Filled in by build.c */ boot_flag: .word 0xAA55 # offset 512, entry point @@ -123,14 +119,15 @@ _start: # or else old loadlin-1.5 will fail) .globl realmode_swtch realmode_swtch: .word 0, 0 # default_switch, SETUPSEG -start_sys_seg: .word SYSSEG +start_sys_seg: .word SYSSEG # obsolete and meaningless, but just + # in case something decided to "use" it .word kernel_version-512 # pointing to kernel version string # above section of header is compatible # with loadlin-1.5 (header v1.5). Don't # change it. -type_of_loader: .byte 0 # = 0, old one (LILO, Loadlin, - # Bootlin, SYSLX, bootsect...) +type_of_loader: .byte 0 # 0 means ancient bootloader, newer + # bootloaders know to change this. # See Documentation/i386/boot.txt for # assigned ids @@ -142,11 +139,7 @@ CAN_USE_HEAP = 0x80 # If set, the loader also has set # space behind setup.S can be used for # heap purposes. # Only the loader knows what is free -#ifndef __BIG_KERNEL__ - .byte 0 -#else .byte LOADED_HIGH -#endif setup_move_size: .word 0x8000 # size to move, when setup is not # loaded at 0x90000. We will move setup @@ -157,11 +150,7 @@ setup_move_size: .word 0x8000 # size to move, when setup is not code32_start: # here loaders can put a different # start address for 32-bit code. -#ifndef __BIG_KERNEL__ - .long 0x1000 # 0x1000 = default for zImage -#else .long 0x100000 # 0x100000 = default for big kernel -#endif ramdisk_image: .long 0 # address of loaded ramdisk image # Here the loader puts the 32-bit diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c index 85a1cd8a8ff8..8062f8915250 100644 --- a/arch/x86/boot/pm.c +++ b/arch/x86/boot/pm.c @@ -33,47 +33,6 @@ static void realmode_switch_hook(void) } /* - * A zImage kernel is loaded at 0x10000 but wants to run at 0x1000. - * A bzImage kernel is loaded and runs at 0x100000. - */ -static void move_kernel_around(void) -{ - /* Note: rely on the compile-time option here rather than - the LOADED_HIGH flag. The Qemu kernel loader unconditionally - sets the loadflags to zero. */ -#ifndef __BIG_KERNEL__ - u16 dst_seg, src_seg; - u32 syssize; - - dst_seg = 0x1000 >> 4; - src_seg = 0x10000 >> 4; - syssize = boot_params.hdr.syssize; /* Size in 16-byte paragraphs */ - - while (syssize) { - int paras = (syssize >= 0x1000) ? 0x1000 : syssize; - int dwords = paras << 2; - - asm volatile("pushw %%es ; " - "pushw %%ds ; " - "movw %1,%%es ; " - "movw %2,%%ds ; " - "xorw %%di,%%di ; " - "xorw %%si,%%si ; " - "rep;movsl ; " - "popw %%ds ; " - "popw %%es" - : "+c" (dwords) - : "r" (dst_seg), "r" (src_seg) - : "esi", "edi"); - - syssize -= paras; - dst_seg += paras; - src_seg += paras; - } -#endif -} - -/* * Disable all interrupts at the legacy PIC. */ static void mask_all_interrupts(void) @@ -147,9 +106,6 @@ void go_to_protected_mode(void) /* Hook before leaving real mode, also disables interrupts */ realmode_switch_hook(); - /* Move the kernel/setup to their final resting places */ - move_kernel_around(); - /* Enable the A20 gate */ if (enable_a20()) { puts("A20 gate not responding, unable to boot...\n"); diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index 44dc1923c0e3..ee3a4ea923ac 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -130,7 +130,7 @@ static void die(const char * str, ...) static void usage(void) { - die("Usage: build [-b] setup system [rootdev] [> image]"); + die("Usage: build setup system [rootdev] [> image]"); } int main(int argc, char ** argv) @@ -145,11 +145,6 @@ int main(int argc, char ** argv) void *kernel; u32 crc = 0xffffffffUL; - if (argc > 2 && !strcmp(argv[1], "-b")) - { - is_big_kernel = 1; - argc--, argv++; - } if ((argc < 3) || (argc > 4)) usage(); if (argc > 3) { @@ -216,8 +211,6 @@ int main(int argc, char ** argv) die("Unable to mmap '%s': %m", argv[2]); /* Number of 16-byte paragraphs, including space for a 4-byte CRC */ sys_size = (sz + 15 + 4) / 16; - if (!is_big_kernel && sys_size > DEF_SYSSIZE) - die("System is too big. Try using bzImage or modules."); /* Patch the setup code with the appropriate size parameters */ buf[0x1f1] = setup_sectors-1; diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 6526cf08b0e4..6ba23dd9fc92 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -1,10 +1,6 @@ #ifndef _ASM_X86_BOOT_H #define _ASM_X86_BOOT_H -/* Don't touch these, unless you really know what you're doing. */ -#define DEF_SYSSEG 0x1000 -#define DEF_SYSSIZE 0x7F00 - /* Internal svga startup constants */ #define NORMAL_VGA 0xffff /* 80x25 mode */ #define EXTENDED_VGA 0xfffe /* 80x50 mode */ diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index bf9276bea660..014c2b85ae45 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -63,6 +63,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); void *kmap_atomic(struct page *page, enum km_type type); void kunmap_atomic(void *kvaddr, enum km_type type); void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); +void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); struct page *kmap_atomic_to_page(void *ptr); #ifndef CONFIG_PARAVIRT diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 0ceb6d19ed30..317ff1703d0b 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -9,13 +9,13 @@ # define PAGES_NR 4 #else # define PA_CONTROL_PAGE 0 -# define PA_TABLE_PAGE 1 -# define PAGES_NR 2 +# define VA_CONTROL_PAGE 1 +# define PA_TABLE_PAGE 2 +# define PA_SWAP_PAGE 3 +# define PAGES_NR 4 #endif -#ifdef CONFIG_X86_32 # define KEXEC_CONTROL_CODE_MAX_SIZE 2048 -#endif #ifndef __ASSEMBLY__ @@ -136,10 +136,11 @@ relocate_kernel(unsigned long indirection_page, unsigned int has_pae, unsigned int preserve_context); #else -NORET_TYPE void +unsigned long relocate_kernel(unsigned long indirection_page, unsigned long page_list, - unsigned long start_address) ATTRIB_NORET; + unsigned long start_address, + unsigned int preserve_context); #endif #define ARCH_HAS_KIMAGE_ARCH diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index a0d70b46c27c..12d55e773eb6 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_LINKAGE_H #define _ASM_X86_LINKAGE_H +#include <linux/stringify.h> + #undef notrace #define notrace __attribute__((no_instrument_function)) @@ -53,14 +55,9 @@ .globl name; \ name: -#ifdef CONFIG_X86_64 -#define __ALIGN .p2align 4,,15 -#define __ALIGN_STR ".p2align 4,,15" -#endif - -#ifdef CONFIG_X86_ALIGNMENT_16 -#define __ALIGN .align 16,0x90 -#define __ALIGN_STR ".align 16,0x90" +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_ALIGNMENT_16) +#define __ALIGN .p2align 4, 0x90 +#define __ALIGN_STR __stringify(__ALIGN) #endif #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index b585e04cbc9e..3178c3acd97e 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -277,7 +277,6 @@ static struct cpufreq_driver p4clockmod_driver = { .name = "p4-clockmod", .owner = THIS_MODULE, .attr = p4clockmod_attr, - .hide_interface = 1, }; diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index bfbd5323a635..ca14604611ec 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -639,7 +639,7 @@ static void mce_init_timer(void) if (!next_interval) return; setup_timer(t, mcheck_timer, smp_processor_id()); - t->expires = round_jiffies_relative(jiffies + next_interval); + t->expires = round_jiffies(jiffies + next_interval); add_timer(t); } @@ -1110,7 +1110,7 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - t->expires = round_jiffies_relative(jiffies + next_interval); + t->expires = round_jiffies(jiffies + next_interval); add_timer_on(t, cpu); smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); break; diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index f5fc8c781a62..e7368c1da01d 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -14,12 +14,12 @@ #include <linux/ftrace.h> #include <linux/suspend.h> #include <linux/gfp.h> +#include <linux/io.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> -#include <asm/io.h> #include <asm/apic.h> #include <asm/cpufeature.h> #include <asm/desc.h> @@ -63,7 +63,7 @@ static void load_segments(void) "\tmovl %%eax,%%fs\n" "\tmovl %%eax,%%gs\n" "\tmovl %%eax,%%ss\n" - ::: "eax", "memory"); + : : : "eax", "memory"); #undef STR #undef __STR } @@ -205,7 +205,8 @@ void machine_kexec(struct kimage *image) if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC - /* We need to put APICs in legacy mode so that we can + /* + * We need to put APICs in legacy mode so that we can * get timer interrupts in second kernel. kexec/kdump * paths already have calls to disable_IO_APIC() in * one form or other. kexec jump path also need @@ -227,7 +228,8 @@ void machine_kexec(struct kimage *image) page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT); - /* The segment registers are funny things, they have both a + /* + * The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is * set to a specific selector, the invisible part is loaded * with from a table in memory. At no other time is the @@ -237,11 +239,12 @@ void machine_kexec(struct kimage *image) * segments, before I zap the gdt with an invalid value. */ load_segments(); - /* The gdt & idt are now invalid. + /* + * The gdt & idt are now invalid. * If you want to load them you must set up your own idt & gdt. */ - set_gdt(phys_to_virt(0),0); - set_idt(phys_to_virt(0),0); + set_gdt(phys_to_virt(0), 0); + set_idt(phys_to_virt(0), 0); /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 6993d51b7fd8..89cea4d44679 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -12,11 +12,47 @@ #include <linux/reboot.h> #include <linux/numa.h> #include <linux/ftrace.h> +#include <linux/io.h> +#include <linux/suspend.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> -#include <asm/io.h> + +static int init_one_level2_page(struct kimage *image, pgd_t *pgd, + unsigned long addr) +{ + pud_t *pud; + pmd_t *pmd; + struct page *page; + int result = -ENOMEM; + + addr &= PMD_MASK; + pgd += pgd_index(addr); + if (!pgd_present(*pgd)) { + page = kimage_alloc_control_pages(image, 0); + if (!page) + goto out; + pud = (pud_t *)page_address(page); + memset(pud, 0, PAGE_SIZE); + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) { + page = kimage_alloc_control_pages(image, 0); + if (!page) + goto out; + pmd = (pmd_t *)page_address(page); + memset(pmd, 0, PAGE_SIZE); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + } + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); + result = 0; +out: + return result; +} static void init_level2_page(pmd_t *level2p, unsigned long addr) { @@ -83,9 +119,8 @@ static int init_level4_page(struct kimage *image, pgd_t *level4p, } level3p = (pud_t *)page_address(page); result = init_level3_page(image, level3p, addr, last_addr); - if (result) { + if (result) goto out; - } set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); addr += PGDIR_SIZE; } @@ -156,6 +191,13 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); if (result) return result; + /* + * image->start may be outside 0 ~ max_pfn, for example when + * jump back to original kernel from kexeced kernel + */ + result = init_one_level2_page(image, level4p, image->start); + if (result) + return result; return init_transition_pgtable(image, level4p); } @@ -229,20 +271,45 @@ void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; void *control_page; + int save_ftrace_enabled; - tracer_disable(); +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) + save_processor_state(); +#endif + + save_ftrace_enabled = __ftrace_enabled_save(); /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + if (image->preserve_context) { +#ifdef CONFIG_X86_IO_APIC + /* + * We need to put APICs in legacy mode so that we can + * get timer interrupts in second kernel. kexec/kdump + * paths already have calls to disable_IO_APIC() in + * one form or other. kexec jump path also need + * one. + */ + disable_IO_APIC(); +#endif + } + control_page = page_address(image->control_code_page) + PAGE_SIZE; - memcpy(control_page, relocate_kernel, PAGE_SIZE); + memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); + page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; page_list[PA_TABLE_PAGE] = (unsigned long)__pa(page_address(image->control_code_page)); - /* The segment registers are funny things, they have both a + if (image->type == KEXEC_TYPE_DEFAULT) + page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) + << PAGE_SHIFT); + + /* + * The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is * set to a specific selector, the invisible part is loaded * with from a table in memory. At no other time is the @@ -252,15 +319,25 @@ void machine_kexec(struct kimage *image) * segments, before I zap the gdt with an invalid value. */ load_segments(); - /* The gdt & idt are now invalid. + /* + * The gdt & idt are now invalid. * If you want to load them you must set up your own idt & gdt. */ - set_gdt(phys_to_virt(0),0); - set_idt(phys_to_virt(0),0); + set_gdt(phys_to_virt(0), 0); + set_idt(phys_to_virt(0), 0); /* now call it */ - relocate_kernel((unsigned long)image->head, (unsigned long)page_list, - image->start); + image->start = relocate_kernel((unsigned long)image->head, + (unsigned long)page_list, + image->start, + image->preserve_context); + +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) + restore_processor_state(); +#endif + + __ftrace_enabled_restore(save_ftrace_enabled); } void arch_crash_save_vmcoreinfo(void) diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index 2064d0aa8d28..41235531b11c 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -17,7 +17,8 @@ #define PTR(x) (x << 2) -/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE +/* + * control_page + KEXEC_CONTROL_CODE_MAX_SIZE * ~ control_page + PAGE_SIZE are used as data storage and stack for * jumping back */ @@ -76,8 +77,10 @@ relocate_kernel: movl %eax, CP_PA_SWAP_PAGE(%edi) movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi) - /* get physical address of control page now */ - /* this is impossible after page table switch */ + /* + * get physical address of control page now + * this is impossible after page table switch + */ movl PTR(PA_CONTROL_PAGE)(%ebp), %edi /* switch to new set of page tables */ @@ -97,7 +100,8 @@ identity_mapped: /* store the start address on the stack */ pushl %edx - /* Set cr0 to a known state: + /* + * Set cr0 to a known state: * - Paging disabled * - Alignment check disabled * - Write protect disabled @@ -113,7 +117,8 @@ identity_mapped: /* clear cr4 if applicable */ testl %ecx, %ecx jz 1f - /* Set cr4 to a known state: + /* + * Set cr4 to a known state: * Setting everything to zero seems safe. */ xorl %eax, %eax @@ -132,15 +137,18 @@ identity_mapped: call swap_pages addl $8, %esp - /* To be certain of avoiding problems with self-modifying code + /* + * To be certain of avoiding problems with self-modifying code * I need to execute a serializing instruction here. * So I flush the TLB, it's handy, and not processor dependent. */ xorl %eax, %eax movl %eax, %cr3 - /* set all of the registers to known values */ - /* leave %esp alone */ + /* + * set all of the registers to known values + * leave %esp alone + */ testl %esi, %esi jnz 1f diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index d32cfb27a479..4de8f5b3d476 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -19,29 +19,77 @@ #define PTR(x) (x << 3) #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +/* + * control_page + KEXEC_CONTROL_CODE_MAX_SIZE + * ~ control_page + PAGE_SIZE are used as data storage and stack for + * jumping back + */ +#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset)) + +/* Minimal CPU state */ +#define RSP DATA(0x0) +#define CR0 DATA(0x8) +#define CR3 DATA(0x10) +#define CR4 DATA(0x18) + +/* other data */ +#define CP_PA_TABLE_PAGE DATA(0x20) +#define CP_PA_SWAP_PAGE DATA(0x28) +#define CP_PA_BACKUP_PAGES_MAP DATA(0x30) + .text .align PAGE_SIZE .code64 .globl relocate_kernel relocate_kernel: - /* %rdi indirection_page + /* + * %rdi indirection_page * %rsi page_list * %rdx start address + * %rcx preserve_context */ + /* Save the CPU context, used for jumping back */ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushf + + movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 + movq %rsp, RSP(%r11) + movq %cr0, %rax + movq %rax, CR0(%r11) + movq %cr3, %rax + movq %rax, CR3(%r11) + movq %cr4, %rax + movq %rax, CR4(%r11) + /* zero out flags, and disable interrupts */ pushq $0 popfq - /* get physical address of control page now */ - /* this is impossible after page table switch */ + /* + * get physical address of control page now + * this is impossible after page table switch + */ movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 /* get physical address of page table now too */ - movq PTR(PA_TABLE_PAGE)(%rsi), %rcx + movq PTR(PA_TABLE_PAGE)(%rsi), %r9 + + /* get physical address of swap page now */ + movq PTR(PA_SWAP_PAGE)(%rsi), %r10 + + /* save some information for jumping back */ + movq %r9, CP_PA_TABLE_PAGE(%r11) + movq %r10, CP_PA_SWAP_PAGE(%r11) + movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11) /* Switch to the identity mapped page tables */ - movq %rcx, %cr3 + movq %r9, %cr3 /* setup a new stack at the end of the physical control page */ lea PAGE_SIZE(%r8), %rsp @@ -55,7 +103,8 @@ identity_mapped: /* store the start address on the stack */ pushq %rdx - /* Set cr0 to a known state: + /* + * Set cr0 to a known state: * - Paging enabled * - Alignment check disabled * - Write protect disabled @@ -68,7 +117,8 @@ identity_mapped: orl $(X86_CR0_PG | X86_CR0_PE), %eax movq %rax, %cr0 - /* Set cr4 to a known state: + /* + * Set cr4 to a known state: * - physical address extension enabled */ movq $X86_CR4_PAE, %rax @@ -78,9 +128,87 @@ identity_mapped: 1: /* Flush the TLB (needed?) */ - movq %rcx, %cr3 + movq %r9, %cr3 + + movq %rcx, %r11 + call swap_pages + + /* + * To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB by reloading %cr3 here, it's handy, + * and not processor dependent. + */ + movq %cr3, %rax + movq %rax, %cr3 + + /* + * set all of the registers to known values + * leave %rsp alone + */ + + testq %r11, %r11 + jnz 1f + xorq %rax, %rax + xorq %rbx, %rbx + xorq %rcx, %rcx + xorq %rdx, %rdx + xorq %rsi, %rsi + xorq %rdi, %rdi + xorq %rbp, %rbp + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r9 + xorq %r11, %r11 + xorq %r12, %r12 + xorq %r13, %r13 + xorq %r14, %r14 + xorq %r15, %r15 + + ret + +1: + popq %rdx + leaq PAGE_SIZE(%r10), %rsp + call *%rdx + + /* get the re-entry point of the peer system */ + movq 0(%rsp), %rbp + call 1f +1: + popq %r8 + subq $(1b - relocate_kernel), %r8 + movq CP_PA_SWAP_PAGE(%r8), %r10 + movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi + movq CP_PA_TABLE_PAGE(%r8), %rax + movq %rax, %cr3 + lea PAGE_SIZE(%r8), %rsp + call swap_pages + movq $virtual_mapped, %rax + pushq %rax + ret + +virtual_mapped: + movq RSP(%r8), %rsp + movq CR4(%r8), %rax + movq %rax, %cr4 + movq CR3(%r8), %rax + movq CR0(%r8), %r8 + movq %rax, %cr3 + movq %r8, %cr0 + movq %rbp, %rax + + popf + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret /* Do the copies */ +swap_pages: movq %rdi, %rcx /* Put the page_list in %rcx */ xorq %rdi, %rdi xorq %rsi, %rsi @@ -112,36 +240,27 @@ identity_mapped: movq %rcx, %rsi /* For ever source page do a copy */ andq $0xfffffffffffff000, %rsi + movq %rdi, %rdx + movq %rsi, %rax + + movq %r10, %rdi movq $512, %rcx rep ; movsq - jmp 0b -3: - - /* To be certain of avoiding problems with self-modifying code - * I need to execute a serializing instruction here. - * So I flush the TLB by reloading %cr3 here, it's handy, - * and not processor dependent. - */ - movq %cr3, %rax - movq %rax, %cr3 - /* set all of the registers to known values */ - /* leave %rsp alone */ + movq %rax, %rdi + movq %rdx, %rsi + movq $512, %rcx + rep ; movsq - xorq %rax, %rax - xorq %rbx, %rbx - xorq %rcx, %rcx - xorq %rdx, %rdx - xorq %rsi, %rsi - xorq %rdi, %rdi - xorq %rbp, %rbp - xorq %r8, %r8 - xorq %r9, %r9 - xorq %r10, %r9 - xorq %r11, %r11 - xorq %r12, %r12 - xorq %r13, %r13 - xorq %r14, %r14 - xorq %r15, %r15 + movq %rdx, %rdi + movq %r10, %rsi + movq $512, %rcx + rep ; movsq + lea PAGE_SIZE(%rax), %rsi + jmp 0b +3: ret + + .globl kexec_control_code_size +.set kexec_control_code_size, . - relocate_kernel diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 191a876e9e87..31ffc24eec4d 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -578,7 +578,7 @@ static struct irq_chip piix4_virtual_irq_type = { static irqreturn_t piix4_master_intr(int irq, void *dev_id) { int realirq; - irq_desc_t *desc; + struct irq_desc *desc; unsigned long flags; spin_lock_irqsave(&i8259A_lock, flags); diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index fbfced6f6800..5bf54e40c6ef 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -275,3 +275,10 @@ ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), ASSERT((per_cpu__irq_stack_union == 0), "irq_stack_union is not at start of per-cpu area"); #endif + +#ifdef CONFIG_KEXEC +#include <asm/kexec.h> + +ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big") +#endif diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index f3a5305b8adf..9fe4ddaa8f6f 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -348,6 +348,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, * flush_tlb_user() for both user and kernel mappings unless * the Page Global Enable (PGE) feature bit is set. */ *dx |= 0x00002000; + /* We also lie, and say we're family id 5. 6 or greater + * leads to a rdmsr in early_init_intel which we can't handle. + * Family ID is returned as bits 8-12 in ax. */ + *ax &= 0xFFFFF0FF; + *ax |= 0x00000500; break; case 0x80000000: /* Futureproof this a little: if they ask how much extended @@ -594,19 +599,21 @@ static void __init lguest_init_IRQ(void) /* Some systems map "vectors" to interrupts weirdly. Lguest has * a straightforward 1 to 1 mapping, so force that here. */ __get_cpu_var(vector_irq)[vector] = i; - if (vector != SYSCALL_VECTOR) { - set_intr_gate(vector, - interrupt[vector-FIRST_EXTERNAL_VECTOR]); - set_irq_chip_and_handler_name(i, &lguest_irq_controller, - handle_level_irq, - "level"); - } + if (vector != SYSCALL_VECTOR) + set_intr_gate(vector, interrupt[i]); } /* This call is required to set up for 4k stacks, where we have * separate stacks for hard and soft interrupts. */ irq_ctx_init(smp_processor_id()); } +void lguest_setup_irq(unsigned int irq) +{ + irq_to_desc_alloc_cpu(irq, 0); + set_irq_chip_and_handler_name(irq, &lguest_irq_controller, + handle_level_irq, "level"); +} + /* * Time. * diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index d11745334a67..f256e73542d7 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -121,23 +121,30 @@ void kunmap_atomic(void *kvaddr, enum km_type type) pagefault_enable(); } -/* This is the same as kmap_atomic() but can map memory that doesn't - * have a struct page associated with it. - */ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; pagefault_disable(); - idx = type + KM_TYPE_NR*smp_processor_id(); + debug_kmap_atomic_prot(type); + + idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot)); + set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); arch_flush_lazy_mmu_mode(); return (void*) vaddr; } + +/* This is the same as kmap_atomic() but can map memory that doesn't + * have a struct page associated with it. + */ +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +{ + return kmap_atomic_prot_pfn(pfn, type, kmap_prot); +} EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ struct page *kmap_atomic_to_page(void *ptr) diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 04102d42ff42..592984e5496b 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -18,6 +18,7 @@ #include <asm/iomap.h> #include <asm/pat.h> +#include <asm/highmem.h> #include <linux/module.h> int is_io_mapping_possible(resource_size_t base, unsigned long size) @@ -36,11 +37,6 @@ EXPORT_SYMBOL_GPL(is_io_mapping_possible); void * iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) { - enum fixed_addresses idx; - unsigned long vaddr; - - pagefault_disable(); - /* * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the @@ -50,12 +46,7 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) prot = PAGE_KERNEL_UC_MINUS; - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte(kmap_pte-idx, pfn_pte(pfn, prot)); - arch_flush_lazy_mmu_mode(); - - return (void*) vaddr; + return kmap_atomic_prot_pfn(pfn, type, prot); } EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 6a518dd08a36..4f115e00486b 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -310,7 +310,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); if (!ctx->active) { - pr_warning("kmmio: spurious debug trap on CPU %d.\n", + pr_debug("kmmio: spurious debug trap on CPU %d.\n", smp_processor_id()); goto out; } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 8253bc97587e..9c4294986af7 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -522,6 +522,17 @@ static int split_large_page(pte_t *kpte, unsigned long address) * primary protection behavior: */ __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); + + /* + * Intel Atom errata AAH41 workaround. + * + * The real fix should be in hw or in a microcode update, but + * we also probabilistically try to reduce the window of having + * a large TLB mixed with 4K TLBs while instruction fetches are + * going on. + */ + __flush_tlb_all(); + base = NULL; out_unlock: |