diff options
-rw-r--r-- | arch/x86/Kconfig.debug | 36 | ||||
-rw-r--r-- | arch/x86/entry/vdso/vdso32/vclock_gettime.c | 2 | ||||
-rw-r--r-- | arch/x86/include/asm/page_64_types.h | 3 | ||||
-rw-r--r-- | arch/x86/include/asm/page_types.h | 3 | ||||
-rw-r--r-- | arch/x86/include/asm/pgtable.h | 25 | ||||
-rw-r--r-- | arch/x86/include/asm/pgtable_types.h | 40 | ||||
-rw-r--r-- | arch/x86/mm/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/mm/dump_pagetables.c | 81 | ||||
-rw-r--r-- | arch/x86/mm/gup.c | 18 | ||||
-rw-r--r-- | arch/x86/mm/init_32.c | 2 | ||||
-rw-r--r-- | arch/x86/mm/init_64.c | 2 | ||||
-rw-r--r-- | arch/x86/mm/pageattr.c | 79 |
12 files changed, 219 insertions, 74 deletions
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index d8c0d3266173..3e0baf726eef 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -65,10 +65,14 @@ config EARLY_PRINTK_EFI This is useful for kernel debugging when your machine crashes very early before the console code is initialized. +config X86_PTDUMP_CORE + def_bool n + config X86_PTDUMP bool "Export kernel pagetable layout to userspace via debugfs" depends on DEBUG_KERNEL select DEBUG_FS + select X86_PTDUMP_CORE ---help--- Say Y here if you want to show the kernel pagetable layout in a debugfs file. This information is only useful for kernel developers @@ -79,7 +83,8 @@ config X86_PTDUMP config EFI_PGT_DUMP bool "Dump the EFI pagetable" - depends on EFI && X86_PTDUMP + depends on EFI + select X86_PTDUMP_CORE ---help--- Enable this if you want to dump the EFI page table before enabling virtual mode. This can be used to debug miscellaneous @@ -105,6 +110,35 @@ config DEBUG_RODATA_TEST feature as well as for the change_page_attr() infrastructure. If in doubt, say "N" +config DEBUG_WX + bool "Warn on W+X mappings at boot" + depends on DEBUG_RODATA + default y + select X86_PTDUMP_CORE + ---help--- + Generate a warning if any W+X mappings are found at boot. + + This is useful for discovering cases where the kernel is leaving + W+X mappings after applying NX, as such mappings are a security risk. + + Look for a message in dmesg output like this: + + x86/mm: Checked W+X mappings: passed, no W+X pages found. + + or like this, if the check failed: + + x86/mm: Checked W+X mappings: FAILED, <N> W+X pages found. + + Note that even if the check fails, your kernel is possibly + still fine, as W+X mappings are not a security hole in + themselves, what they do is that they make the exploitation + of other unfixed kernel bugs easier. + + There is no runtime or memory usage effect of this option + once the kernel has booted up - it's a one time check. + + If in doubt, say "Y". + config DEBUG_SET_MODULE_RONX bool "Set loadable kernel module data as NX and text as RO" depends on MODULES diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c index 175cc72c0f68..87a86e017f0e 100644 --- a/arch/x86/entry/vdso/vdso32/vclock_gettime.c +++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c @@ -14,11 +14,13 @@ */ #undef CONFIG_64BIT #undef CONFIG_X86_64 +#undef CONFIG_PGTABLE_LEVELS #undef CONFIG_ILLEGAL_POINTER_VALUE #undef CONFIG_SPARSEMEM_VMEMMAP #undef CONFIG_NR_CPUS #define CONFIG_X86_32 1 +#define CONFIG_PGTABLE_LEVELS 2 #define CONFIG_PAGE_OFFSET 0 #define CONFIG_ILLEGAL_POINTER_VALUE 0 #define CONFIG_NR_CPUS 1 diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 4edd53b79a81..4928cf0d5af0 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -26,9 +26,6 @@ #define MCE_STACK 4 #define N_EXCEPTION_STACKS 4 /* hw limit: 7 */ -#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) -#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) - /* * Set __PAGE_OFFSET to the most negative possible address + * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index c7c712f2648b..c5b7fb2774d0 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -20,6 +20,9 @@ #define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) #define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) +#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) +#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) + #define HPAGE_SHIFT PMD_SHIFT #define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 867da5bbb4a3..c0b41f111a9a 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -19,6 +19,13 @@ #include <asm/x86_init.h> void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); +void ptdump_walk_pgd_level_checkwx(void); + +#ifdef CONFIG_DEBUG_WX +#define debug_checkwx() ptdump_walk_pgd_level_checkwx() +#else +#define debug_checkwx() do { } while (0) +#endif /* * ZERO_PAGE is a global shared page that is always zero: used @@ -142,12 +149,12 @@ static inline unsigned long pte_pfn(pte_t pte) static inline unsigned long pmd_pfn(pmd_t pmd) { - return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; + return (pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; } static inline unsigned long pud_pfn(pud_t pud) { - return (pud_val(pud) & PTE_PFN_MASK) >> PAGE_SHIFT; + return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT; } #define pte_page(pte) pfn_to_page(pte_pfn(pte)) @@ -379,7 +386,9 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) return __pgprot(preservebits | addbits); } -#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK) +#define pte_pgprot(x) __pgprot(pte_flags(x)) +#define pmd_pgprot(x) __pgprot(pmd_flags(x)) +#define pud_pgprot(x) __pgprot(pud_flags(x)) #define canon_pgprot(p) __pgprot(massage_pgprot(p)) @@ -502,14 +511,15 @@ static inline int pmd_none(pmd_t pmd) static inline unsigned long pmd_page_vaddr(pmd_t pmd) { - return (unsigned long)__va(pmd_val(pmd) & PTE_PFN_MASK); + return (unsigned long)__va(pmd_val(pmd) & pmd_pfn_mask(pmd)); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT) +#define pmd_page(pmd) \ + pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT) /* * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] @@ -570,14 +580,15 @@ static inline int pud_present(pud_t pud) static inline unsigned long pud_page_vaddr(pud_t pud) { - return (unsigned long)__va((unsigned long)pud_val(pud) & PTE_PFN_MASK); + return (unsigned long)__va(pud_val(pud) & pud_pfn_mask(pud)); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT) +#define pud_page(pud) \ + pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT) /* Find an entry in the second-level page table.. */ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 13f310bfc09a..dd5b0aa9dd2f 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -209,10 +209,10 @@ enum page_cache_mode { #include <linux/types.h> -/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */ +/* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */ #define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) -/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */ +/* Extracts the flags from a (pte|pmd|pud|pgd)val_t of a 4KB page */ #define PTE_FLAGS_MASK (~PTE_PFN_MASK) typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; @@ -276,14 +276,46 @@ static inline pmdval_t native_pmd_val(pmd_t pmd) } #endif +static inline pudval_t pud_pfn_mask(pud_t pud) +{ + if (native_pud_val(pud) & _PAGE_PSE) + return PUD_PAGE_MASK & PHYSICAL_PAGE_MASK; + else + return PTE_PFN_MASK; +} + +static inline pudval_t pud_flags_mask(pud_t pud) +{ + if (native_pud_val(pud) & _PAGE_PSE) + return ~(PUD_PAGE_MASK & (pudval_t)PHYSICAL_PAGE_MASK); + else + return ~PTE_PFN_MASK; +} + static inline pudval_t pud_flags(pud_t pud) { - return native_pud_val(pud) & PTE_FLAGS_MASK; + return native_pud_val(pud) & pud_flags_mask(pud); +} + +static inline pmdval_t pmd_pfn_mask(pmd_t pmd) +{ + if (native_pmd_val(pmd) & _PAGE_PSE) + return PMD_PAGE_MASK & PHYSICAL_PAGE_MASK; + else + return PTE_PFN_MASK; +} + +static inline pmdval_t pmd_flags_mask(pmd_t pmd) +{ + if (native_pmd_val(pmd) & _PAGE_PSE) + return ~(PMD_PAGE_MASK & (pmdval_t)PHYSICAL_PAGE_MASK); + else + return ~PTE_PFN_MASK; } static inline pmdval_t pmd_flags(pmd_t pmd) { - return native_pmd_val(pmd) & PTE_FLAGS_MASK; + return native_pmd_val(pmd) & pmd_flags_mask(pmd); } static inline pte_t native_make_pte(pteval_t val) diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index a482d105172b..65c47fda26fc 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_SMP) += tlb.o obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o +obj-$(CONFIG_X86_PTDUMP_CORE) += dump_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index f0cedf3395af..1bf417e9cc13 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -32,6 +32,8 @@ struct pg_state { const struct addr_marker *marker; unsigned long lines; bool to_dmesg; + bool check_wx; + unsigned long wx_pages; }; struct addr_marker { @@ -155,7 +157,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) pt_dump_cont_printf(m, dmsg, " "); if ((level == 4 && pr & _PAGE_PAT) || ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE)) - pt_dump_cont_printf(m, dmsg, "pat "); + pt_dump_cont_printf(m, dmsg, "PAT "); else pt_dump_cont_printf(m, dmsg, " "); if (pr & _PAGE_GLOBAL) @@ -198,8 +200,8 @@ static void note_page(struct seq_file *m, struct pg_state *st, * we have now. "break" is either changing perms, levels or * address space marker. */ - prot = pgprot_val(new_prot) & PTE_FLAGS_MASK; - cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK; + prot = pgprot_val(new_prot); + cur = pgprot_val(st->current_prot); if (!st->level) { /* First entry */ @@ -214,6 +216,16 @@ static void note_page(struct seq_file *m, struct pg_state *st, const char *unit = units; unsigned long delta; int width = sizeof(unsigned long) * 2; + pgprotval_t pr = pgprot_val(st->current_prot); + + if (st->check_wx && (pr & _PAGE_RW) && !(pr & _PAGE_NX)) { + WARN_ONCE(1, + "x86/mm: Found insecure W+X mapping at address %p/%pS\n", + (void *)st->start_address, + (void *)st->start_address); + st->wx_pages += (st->current_address - + st->start_address) / PAGE_SIZE; + } /* * Now print the actual finished series @@ -269,13 +281,13 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, { int i; pte_t *start; + pgprotval_t prot; start = (pte_t *) pmd_page_vaddr(addr); for (i = 0; i < PTRS_PER_PTE; i++) { - pgprot_t prot = pte_pgprot(*start); - + prot = pte_flags(*start); st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); - note_page(m, st, prot, 4); + note_page(m, st, __pgprot(prot), 4); start++; } } @@ -287,18 +299,19 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, { int i; pmd_t *start; + pgprotval_t prot; start = (pmd_t *) pud_page_vaddr(addr); for (i = 0; i < PTRS_PER_PMD; i++) { st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); if (!pmd_none(*start)) { - pgprotval_t prot = pmd_val(*start) & PTE_FLAGS_MASK; - - if (pmd_large(*start) || !pmd_present(*start)) + if (pmd_large(*start) || !pmd_present(*start)) { + prot = pmd_flags(*start); note_page(m, st, __pgprot(prot), 3); - else + } else { walk_pte_level(m, st, *start, P + i * PMD_LEVEL_MULT); + } } else note_page(m, st, __pgprot(0), 3); start++; @@ -318,19 +331,20 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, { int i; pud_t *start; + pgprotval_t prot; start = (pud_t *) pgd_page_vaddr(addr); for (i = 0; i < PTRS_PER_PUD; i++) { st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); if (!pud_none(*start)) { - pgprotval_t prot = pud_val(*start) & PTE_FLAGS_MASK; - - if (pud_large(*start) || !pud_present(*start)) + if (pud_large(*start) || !pud_present(*start)) { + prot = pud_flags(*start); note_page(m, st, __pgprot(prot), 2); - else + } else { walk_pmd_level(m, st, *start, P + i * PUD_LEVEL_MULT); + } } else note_page(m, st, __pgprot(0), 2); @@ -344,13 +358,15 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, #define pgd_none(a) pud_none(__pud(pgd_val(a))) #endif -void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) +static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, + bool checkwx) { #ifdef CONFIG_X86_64 pgd_t *start = (pgd_t *) &init_level4_pgt; #else pgd_t *start = swapper_pg_dir; #endif + pgprotval_t prot; int i; struct pg_state st = {}; @@ -359,16 +375,20 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) st.to_dmesg = true; } + st.check_wx = checkwx; + if (checkwx) + st.wx_pages = 0; + for (i = 0; i < PTRS_PER_PGD; i++) { st.current_address = normalize_addr(i * PGD_LEVEL_MULT); if (!pgd_none(*start)) { - pgprotval_t prot = pgd_val(*start) & PTE_FLAGS_MASK; - - if (pgd_large(*start) || !pgd_present(*start)) + if (pgd_large(*start) || !pgd_present(*start)) { + prot = pgd_flags(*start); note_page(m, &st, __pgprot(prot), 1); - else + } else { walk_pud_level(m, &st, *start, i * PGD_LEVEL_MULT); + } } else note_page(m, &st, __pgprot(0), 1); @@ -378,8 +398,26 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) /* Flush out the last page */ st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT); note_page(m, &st, __pgprot(0), 0); + if (!checkwx) + return; + if (st.wx_pages) + pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n", + st.wx_pages); + else + pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n"); } +void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) +{ + ptdump_walk_pgd_level_core(m, pgd, false); +} + +void ptdump_walk_pgd_level_checkwx(void) +{ + ptdump_walk_pgd_level_core(NULL, NULL, true); +} + +#ifdef CONFIG_X86_PTDUMP static int ptdump_show(struct seq_file *m, void *v) { ptdump_walk_pgd_level(m, NULL); @@ -397,10 +435,13 @@ static const struct file_operations ptdump_fops = { .llseek = seq_lseek, .release = single_release, }; +#endif static int pt_dump_init(void) { +#ifdef CONFIG_X86_PTDUMP struct dentry *pe; +#endif #ifdef CONFIG_X86_32 /* Not a compile-time constant on x86-32 */ @@ -412,10 +453,12 @@ static int pt_dump_init(void) address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; #endif +#ifdef CONFIG_X86_PTDUMP pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, &ptdump_fops); if (!pe) return -ENOMEM; +#endif return 0; } diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 81bf3d2af3eb..ae9a37bf1371 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -118,21 +118,20 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long mask; - pte_t pte = *(pte_t *)&pmd; struct page *head, *page; int refs; mask = _PAGE_PRESENT|_PAGE_USER; if (write) mask |= _PAGE_RW; - if ((pte_flags(pte) & mask) != mask) + if ((pmd_flags(pmd) & mask) != mask) return 0; /* hugepages are never "special" */ - VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL); - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL); + VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); refs = 0; - head = pte_page(pte); + head = pmd_page(pmd); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); do { VM_BUG_ON_PAGE(compound_head(page) != head, page); @@ -195,21 +194,20 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long mask; - pte_t pte = *(pte_t *)&pud; struct page *head, *page; int refs; mask = _PAGE_PRESENT|_PAGE_USER; if (write) mask |= _PAGE_RW; - if ((pte_flags(pte) & mask) != mask) + if ((pud_flags(pud) & mask) != mask) return 0; /* hugepages are never "special" */ - VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL); - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL); + VM_BUG_ON(!pfn_valid(pud_pfn(pud))); refs = 0; - head = pte_page(pte); + head = pud_page(pud); page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); do { VM_BUG_ON_PAGE(compound_head(page) != head, page); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 7562f42914b4..cb4ef3de61f9 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -957,6 +957,8 @@ void mark_rodata_ro(void) set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); #endif mark_nxdata_nx(); + if (__supported_pte_mask & _PAGE_NX) + debug_checkwx(); } #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index df48430c279b..5ed62eff31bd 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1150,6 +1150,8 @@ void mark_rodata_ro(void) free_init_pages("unused kernel", (unsigned long) __va(__pa_symbol(rodata_end)), (unsigned long) __va(__pa_symbol(_sdata))); + + debug_checkwx(); } #endif diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 050a092b8d9a..a3137a4feed1 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -414,18 +414,28 @@ pmd_t *lookup_pmd_address(unsigned long address) phys_addr_t slow_virt_to_phys(void *__virt_addr) { unsigned long virt_addr = (unsigned long)__virt_addr; - phys_addr_t phys_addr; - unsigned long offset; + unsigned long phys_addr, offset; enum pg_level level; - unsigned long pmask; pte_t *pte; pte = lookup_address(virt_addr, &level); BUG_ON(!pte); - pmask = page_level_mask(level); - offset = virt_addr & ~pmask; - phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; - return (phys_addr | offset); + + switch (level) { + case PG_LEVEL_1G: + phys_addr = pud_pfn(*(pud_t *)pte) << PAGE_SHIFT; + offset = virt_addr & ~PUD_PAGE_MASK; + break; + case PG_LEVEL_2M: + phys_addr = pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT; + offset = virt_addr & ~PMD_PAGE_MASK; + break; + default: + phys_addr = pte_pfn(*pte) << PAGE_SHIFT; + offset = virt_addr & ~PAGE_MASK; + } + + return (phys_addr_t)(phys_addr | offset); } EXPORT_SYMBOL_GPL(slow_virt_to_phys); @@ -458,7 +468,7 @@ static int try_preserve_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { - unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn; + unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn; pte_t new_pte, old_pte, *tmp; pgprot_t old_prot, new_prot, req_prot; int i, do_split = 1; @@ -478,17 +488,21 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, switch (level) { case PG_LEVEL_2M: -#ifdef CONFIG_X86_64 + old_prot = pmd_pgprot(*(pmd_t *)kpte); + old_pfn = pmd_pfn(*(pmd_t *)kpte); + break; case PG_LEVEL_1G: -#endif - psize = page_level_size(level); - pmask = page_level_mask(level); + old_prot = pud_pgprot(*(pud_t *)kpte); + old_pfn = pud_pfn(*(pud_t *)kpte); break; default: do_split = -EINVAL; goto out_unlock; } + psize = page_level_size(level); + pmask = page_level_mask(level); + /* * Calculate the number of pages, which fit into this large * page starting at address: @@ -504,7 +518,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * up accordingly. */ old_pte = *kpte; - old_prot = req_prot = pgprot_large_2_4k(pte_pgprot(old_pte)); + req_prot = pgprot_large_2_4k(old_prot); pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); @@ -530,10 +544,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, req_prot = canon_pgprot(req_prot); /* - * old_pte points to the large page base address. So we need + * old_pfn points to the large page base pfn. So we need * to add the offset of the virtual address: */ - pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); + pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT); cpa->pfn = pfn; new_prot = static_protections(req_prot, address, pfn); @@ -544,7 +558,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * the pages in the range we try to preserve: */ addr = address & pmask; - pfn = pte_pfn(old_pte); + pfn = old_pfn; for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { pgprot_t chk_prot = static_protections(req_prot, addr, pfn); @@ -574,7 +588,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * The address is aligned and the number of pages * covers the full page. */ - new_pte = pfn_pte(pte_pfn(old_pte), new_prot); + new_pte = pfn_pte(old_pfn, new_prot); __set_pmd_pte(kpte, address, new_pte); cpa->flags |= CPA_FLUSHTLB; do_split = 0; @@ -591,7 +605,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, struct page *base) { pte_t *pbase = (pte_t *)page_address(base); - unsigned long pfn, pfninc = 1; + unsigned long ref_pfn, pfn, pfninc = 1; unsigned int i, level; pte_t *tmp; pgprot_t ref_prot; @@ -608,26 +622,33 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, } paravirt_alloc_pte(&init_mm, page_to_pfn(base)); - ref_prot = pte_pgprot(pte_clrhuge(*kpte)); - /* promote PAT bit to correct position */ - if (level == PG_LEVEL_2M) + switch (level) { + case PG_LEVEL_2M: + ref_prot = pmd_pgprot(*(pmd_t *)kpte); + /* clear PSE and promote PAT bit to correct position */ ref_prot = pgprot_large_2_4k(ref_prot); + ref_pfn = pmd_pfn(*(pmd_t *)kpte); + break; -#ifdef CONFIG_X86_64 - if (level == PG_LEVEL_1G) { + case PG_LEVEL_1G: + ref_prot = pud_pgprot(*(pud_t *)kpte); + ref_pfn = pud_pfn(*(pud_t *)kpte); pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; + /* - * Set the PSE flags only if the PRESENT flag is set + * Clear the PSE flags if the PRESENT flag is not set * otherwise pmd_present/pmd_huge will return true * even on a non present pmd. */ - if (pgprot_val(ref_prot) & _PAGE_PRESENT) - pgprot_val(ref_prot) |= _PAGE_PSE; - else + if (!(pgprot_val(ref_prot) & _PAGE_PRESENT)) pgprot_val(ref_prot) &= ~_PAGE_PSE; + break; + + default: + spin_unlock(&pgd_lock); + return 1; } -#endif /* * Set the GLOBAL flags only if the PRESENT flag is set @@ -643,7 +664,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, /* * Get the target pfn from the original entry: */ - pfn = pte_pfn(*kpte); + pfn = ref_pfn; for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot))); |