diff options
Diffstat (limited to 'arch/x86')
91 files changed, 4894 insertions, 1960 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bea3a0159496..d69f1cd87fd9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,6 +24,7 @@ config X86 select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAS_FAST_MULTIPLIER + select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select HAVE_AOUT if X86_32 diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c index aafe8ce0d65d..e26984f7ab8d 100644 --- a/arch/x86/crypto/aes_glue.c +++ b/arch/x86/crypto/aes_glue.c @@ -66,5 +66,5 @@ module_exit(aes_fini); MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, asm optimized"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("aes"); -MODULE_ALIAS("aes-asm"); +MODULE_ALIAS_CRYPTO("aes"); +MODULE_ALIAS_CRYPTO("aes-asm"); diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 888950f29fd9..ae855f4f64b7 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -43,10 +43,6 @@ #include <asm/crypto/glue_helper.h> #endif -#if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE) -#define HAS_PCBC -#endif - /* This data is stored at the end of the crypto_tfm struct. * It's a type of per "session" data storage location. * This needs to be 16 byte aligned. @@ -547,7 +543,7 @@ static int ablk_ctr_init(struct crypto_tfm *tfm) #endif -#ifdef HAS_PCBC +#if IS_ENABLED(CONFIG_CRYPTO_PCBC) static int ablk_pcbc_init(struct crypto_tfm *tfm) { return ablk_init_common(tfm, "fpu(pcbc(__driver-aes-aesni))"); @@ -1377,7 +1373,7 @@ static struct crypto_alg aesni_algs[] = { { }, }, #endif -#ifdef HAS_PCBC +#if IS_ENABLED(CONFIG_CRYPTO_PCBC) }, { .cra_name = "pcbc(aes)", .cra_driver_name = "pcbc-aes-aesni", @@ -1550,4 +1546,4 @@ module_exit(aesni_exit); MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, Intel AES-NI instructions optimized"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("aes"); +MODULE_ALIAS_CRYPTO("aes"); diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 8af519ed73d1..17c05531dfd1 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -478,5 +478,5 @@ module_exit(fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Blowfish Cipher Algorithm, asm optimized"); -MODULE_ALIAS("blowfish"); -MODULE_ALIAS("blowfish-asm"); +MODULE_ALIAS_CRYPTO("blowfish"); +MODULE_ALIAS_CRYPTO("blowfish-asm"); diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index 4209a76fcdaa..9a07fafe3831 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -582,5 +582,5 @@ module_exit(camellia_aesni_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX2 optimized"); -MODULE_ALIAS("camellia"); -MODULE_ALIAS("camellia-asm"); +MODULE_ALIAS_CRYPTO("camellia"); +MODULE_ALIAS_CRYPTO("camellia-asm"); diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 87a041a10f4a..ed38d959add6 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -574,5 +574,5 @@ module_exit(camellia_aesni_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX optimized"); -MODULE_ALIAS("camellia"); -MODULE_ALIAS("camellia-asm"); +MODULE_ALIAS_CRYPTO("camellia"); +MODULE_ALIAS_CRYPTO("camellia-asm"); diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index c171dcbf192d..5c8b6266a394 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -1725,5 +1725,5 @@ module_exit(fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Camellia Cipher Algorithm, asm optimized"); -MODULE_ALIAS("camellia"); -MODULE_ALIAS("camellia-asm"); +MODULE_ALIAS_CRYPTO("camellia"); +MODULE_ALIAS_CRYPTO("camellia-asm"); diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index e57e20ab5e0b..60ada677a928 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -491,4 +491,4 @@ module_exit(cast5_exit); MODULE_DESCRIPTION("Cast5 Cipher Algorithm, AVX optimized"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("cast5"); +MODULE_ALIAS_CRYPTO("cast5"); diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 09f3677393e4..0160f68a57ff 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -611,4 +611,4 @@ module_exit(cast6_exit); MODULE_DESCRIPTION("Cast6 Cipher Algorithm, AVX optimized"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("cast6"); +MODULE_ALIAS_CRYPTO("cast6"); diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c index 9d014a74ef96..1937fc1d8763 100644 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -197,5 +197,5 @@ module_exit(crc32_pclmul_mod_fini); MODULE_AUTHOR("Alexander Boyko <alexander_boyko@xyratex.com>"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("crc32"); -MODULE_ALIAS("crc32-pclmul"); +MODULE_ALIAS_CRYPTO("crc32"); +MODULE_ALIAS_CRYPTO("crc32-pclmul"); diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 6812ad98355c..28640c3d6af7 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -280,5 +280,5 @@ MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.c MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware."); MODULE_LICENSE("GPL"); -MODULE_ALIAS("crc32c"); -MODULE_ALIAS("crc32c-intel"); +MODULE_ALIAS_CRYPTO("crc32c"); +MODULE_ALIAS_CRYPTO("crc32c-intel"); diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c index 7845d7fd54c0..b6c67bf30fdf 100644 --- a/arch/x86/crypto/crct10dif-pclmul_glue.c +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -147,5 +147,5 @@ MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>"); MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ."); MODULE_LICENSE("GPL"); -MODULE_ALIAS("crct10dif"); -MODULE_ALIAS("crct10dif-pclmul"); +MODULE_ALIAS_CRYPTO("crct10dif"); +MODULE_ALIAS_CRYPTO("crct10dif-pclmul"); diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c index 0e9c0668fe4e..38a14f818ef1 100644 --- a/arch/x86/crypto/des3_ede_glue.c +++ b/arch/x86/crypto/des3_ede_glue.c @@ -502,8 +502,8 @@ module_exit(des3_ede_x86_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Triple DES EDE Cipher Algorithm, asm optimized"); -MODULE_ALIAS("des3_ede"); -MODULE_ALIAS("des3_ede-asm"); -MODULE_ALIAS("des"); -MODULE_ALIAS("des-asm"); +MODULE_ALIAS_CRYPTO("des3_ede"); +MODULE_ALIAS_CRYPTO("des3_ede-asm"); +MODULE_ALIAS_CRYPTO("des"); +MODULE_ALIAS_CRYPTO("des-asm"); MODULE_AUTHOR("Jussi Kivilinna <jussi.kivilinna@iki.fi>"); diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c index 98d7a188f46b..f368ba261739 100644 --- a/arch/x86/crypto/fpu.c +++ b/arch/x86/crypto/fpu.c @@ -17,6 +17,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/crypto.h> #include <asm/i387.h> struct crypto_fpu_ctx { @@ -159,3 +160,5 @@ void __exit crypto_fpu_exit(void) { crypto_unregister_template(&crypto_fpu_tmpl); } + +MODULE_ALIAS_CRYPTO("fpu"); diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 88bb7ba8b175..8253d85aa165 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -341,4 +341,4 @@ module_exit(ghash_pclmulqdqni_mod_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("GHASH Message Digest Algorithm, " "acclerated by PCLMULQDQ-NI"); -MODULE_ALIAS("ghash"); +MODULE_ALIAS_CRYPTO("ghash"); diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c index 5e8e67739bb5..399a29d067d6 100644 --- a/arch/x86/crypto/salsa20_glue.c +++ b/arch/x86/crypto/salsa20_glue.c @@ -119,5 +119,5 @@ module_exit(fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm (optimized assembly version)"); -MODULE_ALIAS("salsa20"); -MODULE_ALIAS("salsa20-asm"); +MODULE_ALIAS_CRYPTO("salsa20"); +MODULE_ALIAS_CRYPTO("salsa20-asm"); diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index 2fae489b1524..437e47a4d302 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -558,5 +558,5 @@ module_exit(fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized"); -MODULE_ALIAS("serpent"); -MODULE_ALIAS("serpent-asm"); +MODULE_ALIAS_CRYPTO("serpent"); +MODULE_ALIAS_CRYPTO("serpent-asm"); diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index ff4870870972..7e217398b4eb 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -617,4 +617,4 @@ module_exit(serpent_exit); MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX optimized"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("serpent"); +MODULE_ALIAS_CRYPTO("serpent"); diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 8c95f8637306..bf025adaea01 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -618,4 +618,4 @@ module_exit(serpent_sse2_exit); MODULE_DESCRIPTION("Serpent Cipher Algorithm, SSE2 optimized"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("serpent"); +MODULE_ALIAS_CRYPTO("serpent"); diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c index 99eefd812958..a225a5ca1037 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/arch/x86/crypto/sha-mb/sha1_mb.c @@ -204,8 +204,7 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, str continue; } - if (ctx) - ctx->status = HASH_CTX_STS_IDLE; + ctx->status = HASH_CTX_STS_IDLE; return ctx; } diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 74d16ef707c7..6c20fe04a738 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -278,4 +278,4 @@ module_exit(sha1_ssse3_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated"); -MODULE_ALIAS("sha1"); +MODULE_ALIAS_CRYPTO("sha1"); diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index f248546da1ca..8fad72f4dfd2 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -211,7 +211,7 @@ static int sha224_ssse3_final(struct shash_desc *desc, u8 *hash) sha256_ssse3_final(desc, D); memcpy(hash, D, SHA224_DIGEST_SIZE); - memset(D, 0, SHA256_DIGEST_SIZE); + memzero_explicit(D, SHA256_DIGEST_SIZE); return 0; } @@ -318,5 +318,5 @@ module_exit(sha256_ssse3_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated"); -MODULE_ALIAS("sha256"); -MODULE_ALIAS("sha224"); +MODULE_ALIAS_CRYPTO("sha256"); +MODULE_ALIAS_CRYPTO("sha224"); diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 8626b03e83b7..0b6af26832bf 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -219,7 +219,7 @@ static int sha384_ssse3_final(struct shash_desc *desc, u8 *hash) sha512_ssse3_final(desc, D); memcpy(hash, D, SHA384_DIGEST_SIZE); - memset(D, 0, SHA512_DIGEST_SIZE); + memzero_explicit(D, SHA512_DIGEST_SIZE); return 0; } @@ -326,5 +326,5 @@ module_exit(sha512_ssse3_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated"); -MODULE_ALIAS("sha512"); -MODULE_ALIAS("sha384"); +MODULE_ALIAS_CRYPTO("sha512"); +MODULE_ALIAS_CRYPTO("sha384"); diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 4e3c665be129..1ac531ea9bcc 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -579,4 +579,4 @@ module_exit(twofish_exit); MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX optimized"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("twofish"); +MODULE_ALIAS_CRYPTO("twofish"); diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c index 0a5202303501..77e06c2da83d 100644 --- a/arch/x86/crypto/twofish_glue.c +++ b/arch/x86/crypto/twofish_glue.c @@ -96,5 +96,5 @@ module_exit(fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION ("Twofish Cipher Algorithm, asm optimized"); -MODULE_ALIAS("twofish"); -MODULE_ALIAS("twofish-asm"); +MODULE_ALIAS_CRYPTO("twofish"); +MODULE_ALIAS_CRYPTO("twofish-asm"); diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 13e63b3e1dfb..56d8a08ee479 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -495,5 +495,5 @@ module_exit(fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Twofish Cipher Algorithm, 3-way parallel asm optimized"); -MODULE_ALIAS("twofish"); -MODULE_ALIAS("twofish-asm"); +MODULE_ALIAS_CRYPTO("twofish"); +MODULE_ALIAS_CRYPTO("twofish-asm"); diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c index 5d7b381da692..2eccc8932ae6 100644 --- a/arch/x86/ia32/audit.c +++ b/arch/x86/ia32/audit.c @@ -35,6 +35,7 @@ int ia32_classify_syscall(unsigned syscall) case __NR_socketcall: return 4; case __NR_execve: + case __NR_execveat: return 5; default: return 1; diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index ffe71228fc10..82e8a1d44658 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -480,6 +480,7 @@ GLOBAL(\label) PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn PTREGSCALL stub32_sigreturn, sys32_sigreturn PTREGSCALL stub32_execve, compat_sys_execve + PTREGSCALL stub32_execveat, compat_sys_execveat PTREGSCALL stub32_fork, sys_fork PTREGSCALL stub32_vfork, sys_vfork diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 0f4460b5636d..2ab1eb33106e 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -24,78 +24,28 @@ #define wmb() asm volatile("sfence" ::: "memory") #endif -/** - * read_barrier_depends - Flush all pending reads that subsequents reads - * depend on. - * - * No data-dependent reads from memory-like regions are ever reordered - * over this barrier. All reads preceding this primitive are guaranteed - * to access memory (but not necessarily other CPUs' caches) before any - * reads following this primitive that depend on the data return by - * any of the preceding reads. This primitive is much lighter weight than - * rmb() on most CPUs, and is never heavier weight than is - * rmb(). - * - * These ordering constraints are respected by both the local CPU - * and the compiler. - * - * Ordering is not guaranteed by anything other than these primitives, - * not even by data dependencies. See the documentation for - * memory_barrier() for examples and URLs to more information. - * - * For example, the following code would force ordering (the initial - * value of "a" is zero, "b" is one, and "p" is "&a"): - * - * <programlisting> - * CPU 0 CPU 1 - * - * b = 2; - * memory_barrier(); - * p = &b; q = p; - * read_barrier_depends(); - * d = *q; - * </programlisting> - * - * because the read of "*q" depends on the read of "p" and these - * two reads are separated by a read_barrier_depends(). However, - * the following code, with the same initial values for "a" and "b": - * - * <programlisting> - * CPU 0 CPU 1 - * - * a = 2; - * memory_barrier(); - * b = 3; y = b; - * read_barrier_depends(); - * x = a; - * </programlisting> - * - * does not enforce ordering, since there is no data dependency between - * the read of "a" and the read of "b". Therefore, on some CPUs, such - * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() - * in cases like this where there are no data dependencies. - **/ - -#define read_barrier_depends() do { } while (0) - -#ifdef CONFIG_SMP -#define smp_mb() mb() #ifdef CONFIG_X86_PPRO_FENCE -# define smp_rmb() rmb() +#define dma_rmb() rmb() #else -# define smp_rmb() barrier() +#define dma_rmb() barrier() #endif +#define dma_wmb() barrier() + +#ifdef CONFIG_SMP +#define smp_mb() mb() +#define smp_rmb() dma_rmb() #define smp_wmb() barrier() -#define smp_read_barrier_depends() read_barrier_depends() #define set_mb(var, value) do { (void)xchg(&var, value); } while (0) #else /* !SMP */ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() -#define smp_read_barrier_depends() do { } while (0) #define set_mb(var, value) do { var = value; barrier(); } while (0) #endif /* SMP */ +#define read_barrier_depends() do { } while (0) +#define smp_read_barrier_depends() do { } while (0) + #if defined(CONFIG_X86_PPRO_FENCE) /* diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index 0bdb0c54d9a1..fe884e18fa6e 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -70,7 +70,7 @@ #define MAX_DMA_CHANNELS 8 /* 16MB ISA DMA zone */ -#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT) +#define MAX_DMA_PFN ((16UL * 1024 * 1024) >> PAGE_SHIFT) /* 4GB broken PCI/AGP hardware bus master zone */ #define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) diff --git a/arch/x86/include/asm/hash.h b/arch/x86/include/asm/hash.h deleted file mode 100644 index e8c58f88b1d4..000000000000 --- a/arch/x86/include/asm/hash.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef _ASM_X86_HASH_H -#define _ASM_X86_HASH_H - -struct fast_hash_ops; -extern void setup_arch_fast_hash(struct fast_hash_ops *ops); - -#endif /* _ASM_X86_HASH_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6ed0c30d6a0c..d89c6b828c96 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -33,7 +33,7 @@ #define KVM_MAX_VCPUS 255 #define KVM_SOFT_MAX_VCPUS 160 -#define KVM_USER_MEM_SLOTS 125 +#define KVM_USER_MEM_SLOTS 509 /* memory slots that are not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 3 #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) @@ -51,6 +51,7 @@ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) #define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL +#define CR3_PCID_INVD (1UL << 63) #define CR4_RESERVED_BITS \ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ @@ -361,6 +362,7 @@ struct kvm_vcpu_arch { int mp_state; u64 ia32_misc_enable_msr; bool tpr_access_reporting; + u64 ia32_xss; /* * Paging state of the vcpu @@ -542,7 +544,7 @@ struct kvm_apic_map { struct rcu_head rcu; u8 ldr_bits; /* fields bellow are used to decode ldr values in different modes */ - u32 cid_shift, cid_mask, lid_mask; + u32 cid_shift, cid_mask, lid_mask, broadcast; struct kvm_lapic *phys_map[256]; /* first index is cluster id second is cpu id in a cluster */ struct kvm_lapic *logical_map[16][16]; @@ -602,6 +604,9 @@ struct kvm_arch { struct kvm_xen_hvm_config xen_hvm_config; + /* reads protected by irq_srcu, writes by irq_lock */ + struct hlist_head mask_notifier_list; + /* fields used by HYPER-V emulation */ u64 hv_guest_os_id; u64 hv_hypercall; @@ -659,6 +664,16 @@ struct msr_data { u64 data; }; +struct kvm_lapic_irq { + u32 vector; + u32 delivery_mode; + u32 dest_mode; + u32 level; + u32 trig_mode; + u32 shorthand; + u32 dest_id; +}; + struct kvm_x86_ops { int (*cpu_has_kvm_support)(void); /* __init */ int (*disabled_by_bios)(void); /* __init */ @@ -767,6 +782,7 @@ struct kvm_x86_ops { enum x86_intercept_stage stage); void (*handle_external_intr)(struct kvm_vcpu *vcpu); bool (*mpx_supported)(void); + bool (*xsaves_supported)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); @@ -818,6 +834,19 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); +struct kvm_irq_mask_notifier { + void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked); + int irq; + struct hlist_node link; +}; + +void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn); +void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn); +void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, + bool mask); + extern bool tdp_enabled; u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); @@ -863,7 +892,7 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); -void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector); +void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); @@ -895,6 +924,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gfn_t gfn, void *data, int offset, int len, u32 access); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); +bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr); static inline int __kvm_irq_line_state(unsigned long *irq_state, int irq_source_id, int level) @@ -1066,6 +1096,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); +unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index af447f95e3be..25bcd4a89517 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -452,6 +452,7 @@ static inline void update_page_count(int level, unsigned long pages) { } extern pte_t *lookup_address(unsigned long address, unsigned int *level); extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, unsigned int *level); +extern pmd_t *lookup_pmd_address(unsigned long address); extern phys_addr_t slow_virt_to_phys(void *__address); extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, unsigned numpages, unsigned long page_flags); diff --git a/arch/x86/include/asm/platform_sst_audio.h b/arch/x86/include/asm/platform_sst_audio.h index 0a4e140315b6..7249e6d0902d 100644 --- a/arch/x86/include/asm/platform_sst_audio.h +++ b/arch/x86/include/asm/platform_sst_audio.h @@ -16,6 +16,9 @@ #include <linux/sfi.h> +#define MAX_NUM_STREAMS_MRFLD 25 +#define MAX_NUM_STREAMS MAX_NUM_STREAMS_MRFLD + enum sst_audio_task_id_mrfld { SST_TASK_ID_NONE = 0, SST_TASK_ID_SBA = 1, @@ -73,6 +76,65 @@ struct sst_platform_data { unsigned int strm_map_size; }; +struct sst_info { + u32 iram_start; + u32 iram_end; + bool iram_use; + u32 dram_start; + u32 dram_end; + bool dram_use; + u32 imr_start; + u32 imr_end; + bool imr_use; + u32 mailbox_start; + bool use_elf; + bool lpe_viewpt_rqd; + unsigned int max_streams; + u32 dma_max_len; + u8 num_probes; +}; + +struct sst_lib_dnld_info { + unsigned int mod_base; + unsigned int mod_end; + unsigned int mod_table_offset; + unsigned int mod_table_size; + bool mod_ddr_dnld; +}; + +struct sst_res_info { + unsigned int shim_offset; + unsigned int shim_size; + unsigned int shim_phy_addr; + unsigned int ssp0_offset; + unsigned int ssp0_size; + unsigned int dma0_offset; + unsigned int dma0_size; + unsigned int dma1_offset; + unsigned int dma1_size; + unsigned int iram_offset; + unsigned int iram_size; + unsigned int dram_offset; + unsigned int dram_size; + unsigned int mbox_offset; + unsigned int mbox_size; + unsigned int acpi_lpe_res_index; + unsigned int acpi_ddr_index; + unsigned int acpi_ipc_irq_index; +}; + +struct sst_ipc_info { + int ipc_offset; + unsigned int mbox_recv_off; +}; + +struct sst_platform_info { + const struct sst_info *probe_data; + const struct sst_ipc_info *ipc_info; + const struct sst_res_info *res_info; + const struct sst_lib_dnld_info *lib_info; + const char *platform; +}; int add_sst_platform_device(void); #endif diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 6f1c3a8a33ab..db257a58571f 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -23,6 +23,15 @@ #define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2) #define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8) +#define SEGMENT_RPL_MASK 0x3 /* + * Bottom two bits of selector give the ring + * privilege level + */ +#define SEGMENT_TI_MASK 0x4 /* Bit 2 is table indicator (LDT/GDT) */ +#define USER_RPL 0x3 /* User mode is privilege level 3 */ +#define SEGMENT_LDT 0x4 /* LDT segment has TI set... */ +#define SEGMENT_GDT 0x0 /* ... GDT has it cleared */ + #ifdef CONFIG_X86_32 /* * The layout of the per-CPU GDT under Linux: @@ -125,16 +134,6 @@ #define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */ #define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */ -/* Bottom two bits of selector give the ring privilege level */ -#define SEGMENT_RPL_MASK 0x3 -/* Bit 2 is table indicator (LDT/GDT) */ -#define SEGMENT_TI_MASK 0x4 - -/* User mode is privilege level 3 */ -#define USER_RPL 0x3 -/* LDT segment has TI set, GDT has it cleared */ -#define SEGMENT_LDT 0x4 -#define SEGMENT_GDT 0x0 /* * Matching rules for certain types of segments. @@ -192,17 +191,6 @@ #define get_kernel_rpl() 0 #endif -/* User mode is privilege level 3 */ -#define USER_RPL 0x3 -/* LDT segment has TI set, GDT has it cleared */ -#define SEGMENT_LDT 0x4 -#define SEGMENT_GDT 0x0 - -/* Bottom two bits of selector give the ring privilege level */ -#define SEGMENT_RPL_MASK 0x3 -/* Bit 2 is table indicator (LDT/GDT) */ -#define SEGMENT_TI_MASK 0x4 - #define IDT_ENTRIES 256 #define NUM_EXCEPTION_VECTORS 32 /* Bitmask of exception vectors which push an error code on the stack */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index bcbfade26d8d..45afaee9555c 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -69,6 +69,7 @@ #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 +#define SECONDARY_EXEC_XSAVES 0x00100000 #define PIN_BASED_EXT_INTR_MASK 0x00000001 @@ -159,6 +160,8 @@ enum vmcs_field { EOI_EXIT_BITMAP3_HIGH = 0x00002023, VMREAD_BITMAP = 0x00002026, VMWRITE_BITMAP = 0x00002028, + XSS_EXIT_BITMAP = 0x0000202C, + XSS_EXIT_BITMAP_HIGH = 0x0000202D, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, diff --git a/arch/x86/include/asm/xen/cpuid.h b/arch/x86/include/asm/xen/cpuid.h new file mode 100644 index 000000000000..0d809e9fc975 --- /dev/null +++ b/arch/x86/include/asm/xen/cpuid.h @@ -0,0 +1,91 @@ +/****************************************************************************** + * arch-x86/cpuid.h + * + * CPUID interface to Xen. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2007 Citrix Systems, Inc. + * + * Authors: + * Keir Fraser <keir@xen.org> + */ + +#ifndef __XEN_PUBLIC_ARCH_X86_CPUID_H__ +#define __XEN_PUBLIC_ARCH_X86_CPUID_H__ + +/* + * For compatibility with other hypervisor interfaces, the Xen cpuid leaves + * can be found at the first otherwise unused 0x100 aligned boundary starting + * from 0x40000000. + * + * e.g If viridian extensions are enabled for an HVM domain, the Xen cpuid + * leaves will start at 0x40000100 + */ + +#define XEN_CPUID_FIRST_LEAF 0x40000000 +#define XEN_CPUID_LEAF(i) (XEN_CPUID_FIRST_LEAF + (i)) + +/* + * Leaf 1 (0x40000x00) + * EAX: Largest Xen-information leaf. All leaves up to an including @EAX + * are supported by the Xen host. + * EBX-EDX: "XenVMMXenVMM" signature, allowing positive identification + * of a Xen host. + */ +#define XEN_CPUID_SIGNATURE_EBX 0x566e6558 /* "XenV" */ +#define XEN_CPUID_SIGNATURE_ECX 0x65584d4d /* "MMXe" */ +#define XEN_CPUID_SIGNATURE_EDX 0x4d4d566e /* "nVMM" */ + +/* + * Leaf 2 (0x40000x01) + * EAX[31:16]: Xen major version. + * EAX[15: 0]: Xen minor version. + * EBX-EDX: Reserved (currently all zeroes). + */ + +/* + * Leaf 3 (0x40000x02) + * EAX: Number of hypercall transfer pages. This register is always guaranteed + * to specify one hypercall page. + * EBX: Base address of Xen-specific MSRs. + * ECX: Features 1. Unused bits are set to zero. + * EDX: Features 2. Unused bits are set to zero. + */ + +/* Does the host support MMU_PT_UPDATE_PRESERVE_AD for this guest? */ +#define _XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD 0 +#define XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD (1u<<0) + +/* + * Leaf 5 (0x40000x04) + * HVM-specific features + */ + +/* EAX Features */ +/* Virtualized APIC registers */ +#define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0) +/* Virtualized x2APIC accesses */ +#define XEN_HVM_CPUID_X2APIC_VIRT (1u << 1) +/* Memory mapped from other domains has valid IOMMU entries */ +#define XEN_HVM_CPUID_IOMMU_MAPPINGS (1u << 2) + +#define XEN_CPUID_MAX_NUM_LEAVES 4 + +#endif /* __XEN_PUBLIC_ARCH_X86_CPUID_H__ */ diff --git a/arch/x86/include/asm/xen/page-coherent.h b/arch/x86/include/asm/xen/page-coherent.h index 7f02fe4e2c7b..acd844c017d3 100644 --- a/arch/x86/include/asm/xen/page-coherent.h +++ b/arch/x86/include/asm/xen/page-coherent.h @@ -22,8 +22,8 @@ static inline void xen_free_coherent_pages(struct device *hwdev, size_t size, } static inline void xen_dma_map_page(struct device *hwdev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) { } + dma_addr_t dev_addr, unsigned long offset, size_t size, + enum dma_data_direction dir, struct dma_attrs *attrs) { } static inline void xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle, size_t size, enum dma_data_direction dir, diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index c949923a5668..5eea09915a15 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -41,10 +41,12 @@ typedef struct xpaddr { extern unsigned long *machine_to_phys_mapping; extern unsigned long machine_to_phys_nr; +extern unsigned long *xen_p2m_addr; +extern unsigned long xen_p2m_size; +extern unsigned long xen_max_p2m_pfn; extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); -extern bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern unsigned long set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e); @@ -52,17 +54,52 @@ extern unsigned long set_phys_range_identity(unsigned long pfn_s, extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count); -extern int m2p_add_override(unsigned long mfn, struct page *page, - struct gnttab_map_grant_ref *kmap_op); extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count); -extern int m2p_remove_override(struct page *page, - struct gnttab_map_grant_ref *kmap_op, - unsigned long mfn); -extern struct page *m2p_find_override(unsigned long mfn); extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); +/* + * Helper functions to write or read unsigned long values to/from + * memory, when the access may fault. + */ +static inline int xen_safe_write_ulong(unsigned long *addr, unsigned long val) +{ + return __put_user(val, (unsigned long __user *)addr); +} + +static inline int xen_safe_read_ulong(unsigned long *addr, unsigned long *val) +{ + return __get_user(*val, (unsigned long __user *)addr); +} + +/* + * When to use pfn_to_mfn(), __pfn_to_mfn() or get_phys_to_machine(): + * - pfn_to_mfn() returns either INVALID_P2M_ENTRY or the mfn. No indicator + * bits (identity or foreign) are set. + * - __pfn_to_mfn() returns the found entry of the p2m table. A possibly set + * identity or foreign indicator will be still set. __pfn_to_mfn() is + * encapsulating get_phys_to_machine() which is called in special cases only. + * - get_phys_to_machine() is to be called by __pfn_to_mfn() only in special + * cases needing an extended handling. + */ +static inline unsigned long __pfn_to_mfn(unsigned long pfn) +{ + unsigned long mfn; + + if (pfn < xen_p2m_size) + mfn = xen_p2m_addr[pfn]; + else if (unlikely(pfn < xen_max_p2m_pfn)) + return get_phys_to_machine(pfn); + else + return IDENTITY_FRAME(pfn); + + if (unlikely(mfn == INVALID_P2M_ENTRY)) + return get_phys_to_machine(pfn); + + return mfn; +} + static inline unsigned long pfn_to_mfn(unsigned long pfn) { unsigned long mfn; @@ -70,7 +107,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn) if (xen_feature(XENFEAT_auto_translated_physmap)) return pfn; - mfn = get_phys_to_machine(pfn); + mfn = __pfn_to_mfn(pfn); if (mfn != INVALID_P2M_ENTRY) mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); @@ -83,7 +120,7 @@ static inline int phys_to_machine_mapping_valid(unsigned long pfn) if (xen_feature(XENFEAT_auto_translated_physmap)) return 1; - return get_phys_to_machine(pfn) != INVALID_P2M_ENTRY; + return __pfn_to_mfn(pfn) != INVALID_P2M_ENTRY; } static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn) @@ -102,7 +139,7 @@ static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn) * In such cases it doesn't matter what we return (we return garbage), * but we must handle the fault without crashing! */ - ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); + ret = xen_safe_read_ulong(&machine_to_phys_mapping[mfn], &pfn); if (ret < 0) return ~0; @@ -117,7 +154,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) return mfn; pfn = mfn_to_pfn_no_overrides(mfn); - if (get_phys_to_machine(pfn) != mfn) { + if (__pfn_to_mfn(pfn) != mfn) { /* * If this appears to be a foreign mfn (because the pfn * doesn't map back to the mfn), then check the local override @@ -133,8 +170,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) * entry doesn't map back to the mfn and m2p_override doesn't have a * valid entry for it. */ - if (pfn == ~0 && - get_phys_to_machine(mfn) == IDENTITY_FRAME(mfn)) + if (pfn == ~0 && __pfn_to_mfn(mfn) == IDENTITY_FRAME(mfn)) pfn = mfn; return pfn; @@ -180,7 +216,7 @@ static inline unsigned long mfn_to_local_pfn(unsigned long mfn) return mfn; pfn = mfn_to_pfn(mfn); - if (get_phys_to_machine(pfn) != mfn) + if (__pfn_to_mfn(pfn) != mfn) return -1; /* force !pfn_valid() */ return pfn; } @@ -236,4 +272,11 @@ void make_lowmem_page_readwrite(void *vaddr); #define xen_remap(cookie, size) ioremap((cookie), (size)); #define xen_unmap(cookie) iounmap((cookie)) +static inline bool xen_arch_need_swiotlb(struct device *dev, + unsigned long pfn, + unsigned long mfn) +{ + return false; +} + #endif /* _ASM_X86_XEN_PAGE_H */ diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 7e7a79ada658..5fa9770035dc 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -16,6 +16,7 @@ #define XSTATE_Hi16_ZMM 0x80 #define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) +#define XSTATE_AVX512 (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) /* Bit 63 of XCR0 is reserved for future expansion */ #define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63))) diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 990a2fe1588d..b813bf9da1e2 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -72,6 +72,8 @@ #define EXIT_REASON_XSETBV 55 #define EXIT_REASON_APIC_WRITE 56 #define EXIT_REASON_INVPCID 58 +#define EXIT_REASON_XSAVES 63 +#define EXIT_REASON_XRSTORS 64 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -116,6 +118,8 @@ { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ { EXIT_REASON_INVD, "INVD" }, \ { EXIT_REASON_INVVPID, "INVVPID" }, \ - { EXIT_REASON_INVPCID, "INVPCID" } + { EXIT_REASON_INVPCID, "INVPCID" }, \ + { EXIT_REASON_XSAVES, "XSAVES" }, \ + { EXIT_REASON_XRSTORS, "XRSTORS" } #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index d67c4be3e8b1..3b3b9d33ac1d 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -1,3 +1,7 @@ +#ifndef __LINUX_KBUILD_H +# error "Please do not build this file directly, build asm-offsets.c instead" +#endif + #include <asm/ucontext.h> #include <linux/lguest.h> diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 4f9359f36bb7..fdcbb4d27c9f 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -1,3 +1,7 @@ +#ifndef __LINUX_KBUILD_H +# error "Please do not build this file directly, build asm-offsets.c instead" +#endif + #include <asm/ia32.h> #define __SYSCALL_64(nr, sym, compat) [nr] = 1, diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c index 06d3e5a14d9d..f3672508b249 100644 --- a/arch/x86/kernel/audit_64.c +++ b/arch/x86/kernel/audit_64.c @@ -50,6 +50,7 @@ int audit_classify_syscall(int abi, unsigned syscall) case __NR_openat: return 3; case __NR_execve: + case __NR_execveat: return 5; default: return 0; diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/kernel/cpu/perf_event_amd_iommu.c index 639d1289b1ba..97242a9242bd 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_iommu.c +++ b/arch/x86/kernel/cpu/perf_event_amd_iommu.c @@ -130,10 +130,7 @@ static ssize_t _iommu_cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) { - int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &iommu_cpumask); - buf[n++] = '\n'; - buf[n] = '\0'; - return n; + return cpumap_print_to_pagebuf(true, buf, &iommu_cpumask); } static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL); diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c index 30790d798e6b..cc6cedb8f25d 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c @@ -219,7 +219,6 @@ static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, struct device_attribute *attr, char *buf) { - int n; cpumask_t *active_mask; struct pmu *pmu = dev_get_drvdata(dev); @@ -230,10 +229,7 @@ static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, else return 0; - n = cpulist_scnprintf(buf, PAGE_SIZE - 2, active_mask); - buf[n++] = '\n'; - buf[n] = '\0'; - return n; + return cpumap_print_to_pagebuf(true, buf, active_mask); } static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL); diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index d64f275fe274..673f930c700f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -365,11 +365,7 @@ static void rapl_pmu_event_read(struct perf_event *event) static ssize_t rapl_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) { - int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask); - - buf[n++] = '\n'; - buf[n] = '\0'; - return n; + return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask); } static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 9762dbd9f3f7..08f3fed2b0f2 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -647,11 +647,7 @@ static int uncore_pmu_event_init(struct perf_event *event) static ssize_t uncore_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) { - int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &uncore_cpu_mask); - - buf[n++] = '\n'; - buf[n] = '\0'; - return n; + return cpumap_print_to_pagebuf(true, buf, &uncore_cpu_mask); } static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 49f886481615..dd2f07ae9d0c 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1114,8 +1114,8 @@ void __init memblock_find_dma_reserve(void) * at first, and assume boot_mem will not take below MAX_DMA_PFN */ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { - start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN); - end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN); + start_pfn = min(start_pfn, MAX_DMA_PFN); + end_pfn = min(end_pfn, MAX_DMA_PFN); nr_pages += end_pfn - start_pfn; } diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 2e1a6853e00c..fe9f0b79a18b 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -455,6 +455,23 @@ struct intel_stolen_funcs { u32 (*base)(int num, int slot, int func, size_t size); }; +static size_t __init gen9_stolen_size(int num, int slot, int func) +{ + u16 gmch_ctrl; + + gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); + gmch_ctrl >>= BDW_GMCH_GMS_SHIFT; + gmch_ctrl &= BDW_GMCH_GMS_MASK; + + if (gmch_ctrl < 0xf0) + return gmch_ctrl << 25; /* 32 MB units */ + else + /* 4MB increments starting at 0xf0 for 4MB */ + return (gmch_ctrl - 0xf0 + 1) << 22; +} + +typedef size_t (*stolen_size_fn)(int num, int slot, int func); + static const struct intel_stolen_funcs i830_stolen_funcs __initconst = { .base = i830_stolen_base, .size = i830_stolen_size, @@ -490,6 +507,11 @@ static const struct intel_stolen_funcs gen8_stolen_funcs __initconst = { .size = gen8_stolen_size, }; +static const struct intel_stolen_funcs gen9_stolen_funcs __initconst = { + .base = intel_stolen_base, + .size = gen9_stolen_size, +}; + static const struct intel_stolen_funcs chv_stolen_funcs __initconst = { .base = intel_stolen_base, .size = chv_stolen_size, @@ -523,6 +545,7 @@ static const struct pci_device_id intel_stolen_ids[] __initconst = { INTEL_BDW_M_IDS(&gen8_stolen_funcs), INTEL_BDW_D_IDS(&gen8_stolen_funcs), INTEL_CHV_IDS(&chv_stolen_funcs), + INTEL_SKL_IDS(&gen9_stolen_funcs), }; static void __init intel_graphics_stolen(int num, int slot, int func) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 344b63f18d14..1cf7c97ff175 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1191,10 +1191,10 @@ ENTRY(ftrace_graph_caller) pushl %eax pushl %ecx pushl %edx - movl 0xc(%esp), %edx - lea 0x4(%ebp), %eax + movl 0xc(%esp), %eax + lea 0x4(%ebp), %edx movl (%ebp), %ecx - subl $MCOUNT_INSN_SIZE, %edx + subl $MCOUNT_INSN_SIZE, %eax call prepare_ftrace_return popl %edx popl %ecx diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c0226ab54106..90878aa38dbd 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -652,6 +652,20 @@ ENTRY(stub_execve) CFI_ENDPROC END(stub_execve) +ENTRY(stub_execveat) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + call sys_execveat + RESTORE_TOP_OF_STACK %r11 + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_execveat) + /* * sigreturn is special because it needs to restore all registers on return. * This cannot be done with SYSRET, so use the IRET return path instead. @@ -697,6 +711,20 @@ ENTRY(stub_x32_execve) CFI_ENDPROC END(stub_x32_execve) +ENTRY(stub_x32_execveat) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + call compat_sys_execveat + RESTORE_TOP_OF_STACK %r11 + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_x32_execveat) + #endif /* diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index f6945bef2cd1..94f643484300 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -283,7 +283,14 @@ NOKPROBE_SYMBOL(do_async_page_fault); static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; - pv_info.paravirt_enabled = 1; + + /* + * KVM isn't paravirt in the sense of paravirt_enabled. A KVM + * guest kernel works like a bare metal kernel with additional + * features, and paravirt_enabled is about features that are + * missing. + */ + pv_info.paravirt_enabled = 0; if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) pv_cpu_ops.io_delay = kvm_io_delay; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index d9156ceecdff..42caaef897c8 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -59,13 +59,12 @@ static void kvm_get_wallclock(struct timespec *now) native_write_msr(msr_kvm_wall_clock, low, high); - preempt_disable(); - cpu = smp_processor_id(); + cpu = get_cpu(); vcpu_time = &hv_clock[cpu].pvti; pvclock_read_wallclock(&wall_clock, vcpu_time, now); - preempt_enable(); + put_cpu(); } static int kvm_set_wallclock(const struct timespec *now) @@ -107,11 +106,10 @@ static unsigned long kvm_get_tsc_khz(void) int cpu; unsigned long tsc_khz; - preempt_disable(); - cpu = smp_processor_id(); + cpu = get_cpu(); src = &hv_clock[cpu].pvti; tsc_khz = pvclock_tsc_khz(src); - preempt_enable(); + put_cpu(); return tsc_khz; } @@ -263,7 +261,6 @@ void __init kvmclock_init(void) #endif kvm_get_preset_lpj(); clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); - pv_info.paravirt_enabled = 1; pv_info.name = "KVM"; if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) @@ -284,23 +281,22 @@ int __init kvm_setup_vsyscall_timeinfo(void) size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); - preempt_disable(); - cpu = smp_processor_id(); + cpu = get_cpu(); vcpu_time = &hv_clock[cpu].pvti; flags = pvclock_read_flags(vcpu_time); if (!(flags & PVCLOCK_TSC_STABLE_BIT)) { - preempt_enable(); + put_cpu(); return 1; } if ((ret = pvclock_init_vsyscall(hv_clock, size))) { - preempt_enable(); + put_cpu(); return ret; } - preempt_enable(); + put_cpu(); kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; #endif diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3ed4a68d4013..5a2c02913af3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -283,24 +283,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) fpu = switch_fpu_prepare(prev_p, next_p, cpu); - /* - * Reload esp0, LDT and the page table pointer: - */ + /* Reload esp0 and ss1. */ load_sp0(tss, next); - /* - * Switch DS and ES. - * This won't pick up thread selector changes, but I guess that is ok. - */ - savesegment(es, prev->es); - if (unlikely(next->es | prev->es)) - loadsegment(es, next->es); - - savesegment(ds, prev->ds); - if (unlikely(next->ds | prev->ds)) - loadsegment(ds, next->ds); - - /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). * @@ -309,41 +294,101 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) savesegment(fs, fsindex); savesegment(gs, gsindex); + /* + * Load TLS before restoring any segments so that segment loads + * reference the correct GDT entries. + */ load_TLS(next, cpu); /* - * Leave lazy mode, flushing any hypercalls made here. - * This must be done before restoring TLS segments so - * the GDT and LDT are properly updated, and must be - * done before math_state_restore, so the TS bit is up - * to date. + * Leave lazy mode, flushing any hypercalls made here. This + * must be done after loading TLS entries in the GDT but before + * loading segments that might reference them, and and it must + * be done before math_state_restore, so the TS bit is up to + * date. */ arch_end_context_switch(next_p); + /* Switch DS and ES. + * + * Reading them only returns the selectors, but writing them (if + * nonzero) loads the full descriptor from the GDT or LDT. The + * LDT for next is loaded in switch_mm, and the GDT is loaded + * above. + * + * We therefore need to write new values to the segment + * registers on every context switch unless both the new and old + * values are zero. + * + * Note that we don't need to do anything for CS and SS, as + * those are saved and restored as part of pt_regs. + */ + savesegment(es, prev->es); + if (unlikely(next->es | prev->es)) + loadsegment(es, next->es); + + savesegment(ds, prev->ds); + if (unlikely(next->ds | prev->ds)) + loadsegment(ds, next->ds); + /* * Switch FS and GS. * - * Segment register != 0 always requires a reload. Also - * reload when it has changed. When prev process used 64bit - * base always reload to avoid an information leak. + * These are even more complicated than FS and GS: they have + * 64-bit bases are that controlled by arch_prctl. Those bases + * only differ from the values in the GDT or LDT if the selector + * is 0. + * + * Loading the segment register resets the hidden base part of + * the register to 0 or the value from the GDT / LDT. If the + * next base address zero, writing 0 to the segment register is + * much faster than using wrmsr to explicitly zero the base. + * + * The thread_struct.fs and thread_struct.gs values are 0 + * if the fs and gs bases respectively are not overridden + * from the values implied by fsindex and gsindex. They + * are nonzero, and store the nonzero base addresses, if + * the bases are overridden. + * + * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should + * be impossible. + * + * Therefore we need to reload the segment registers if either + * the old or new selector is nonzero, and we need to override + * the base address if next thread expects it to be overridden. + * + * This code is unnecessarily slow in the case where the old and + * new indexes are zero and the new base is nonzero -- it will + * unnecessarily write 0 to the selector before writing the new + * base address. + * + * Note: This all depends on arch_prctl being the only way that + * user code can override the segment base. Once wrfsbase and + * wrgsbase are enabled, most of this code will need to change. */ if (unlikely(fsindex | next->fsindex | prev->fs)) { loadsegment(fs, next->fsindex); + /* - * Check if the user used a selector != 0; if yes - * clear 64bit base, since overloaded base is always - * mapped to the Null selector + * If user code wrote a nonzero value to FS, then it also + * cleared the overridden base address. + * + * XXX: if user code wrote 0 to FS and cleared the base + * address itself, we won't notice and we'll incorrectly + * restore the prior base address next time we reschdule + * the process. */ if (fsindex) prev->fs = 0; } - /* when next process has a 64bit base use it */ if (next->fs) wrmsrl(MSR_FS_BASE, next->fs); prev->fsindex = fsindex; if (unlikely(gsindex | next->gsindex | prev->gs)) { load_gs_index(next->gsindex); + + /* This works (and fails) the same way as fsindex above. */ if (gsindex) prev->gs = 0; } diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index f7fec09e3e3a..3e551eee87b9 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -27,6 +27,43 @@ static int get_free_idx(void) return -ESRCH; } +static bool tls_desc_okay(const struct user_desc *info) +{ + if (LDT_empty(info)) + return true; + + /* + * espfix is required for 16-bit data segments, but espfix + * only works for LDT segments. + */ + if (!info->seg_32bit) + return false; + + /* Only allow data segments in the TLS array. */ + if (info->contents > 1) + return false; + + /* + * Non-present segments with DPL 3 present an interesting attack + * surface. The kernel should handle such segments correctly, + * but TLS is very difficult to protect in a sandbox, so prevent + * such segments from being created. + * + * If userspace needs to remove a TLS entry, it can still delete + * it outright. + */ + if (info->seg_not_present) + return false; + +#ifdef CONFIG_X86_64 + /* The L bit makes no sense for data. */ + if (info->lm) + return false; +#endif + + return true; +} + static void set_tls_desc(struct task_struct *p, int idx, const struct user_desc *info, int n) { @@ -66,6 +103,9 @@ int do_set_thread_area(struct task_struct *p, int idx, if (copy_from_user(&info, u_info, sizeof(info))) return -EFAULT; + if (!tls_desc_okay(&info)) + return -EINVAL; + if (idx == -1) idx = info.entry_number; @@ -192,6 +232,7 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset, { struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES]; const struct user_desc *info; + int i; if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || (pos % sizeof(struct user_desc)) != 0 || @@ -205,6 +246,10 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset, else info = infobuf; + for (i = 0; i < count / sizeof(struct user_desc); i++) + if (!tls_desc_okay(info + i)) + return -EINVAL; + set_tls_desc(target, GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)), info, count / sizeof(struct user_desc)); diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 4c540c4719d8..0de1fae2bdf0 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -738,3 +738,4 @@ void *get_xsave_addr(struct xsave_struct *xsave, int xstate) return (void *)xsave + xstate_comp_offsets[feature]; } +EXPORT_SYMBOL_GPL(get_xsave_addr); diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 25d22b2d6509..08f790dfadc9 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,14 +7,13 @@ CFLAGS_vmx.o := -I. KVM := ../../../virt/kvm -kvm-y += $(KVM)/kvm_main.o $(KVM)/ioapic.o \ - $(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \ +kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o -kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(KVM)/assigned-dev.o $(KVM)/iommu.o kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ - i8254.o cpuid.o pmu.o + i8254.o ioapic.o irq_comm.o cpuid.o pmu.o +kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o kvm-intel-y += vmx.o kvm-amd-y += svm.o diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c new file mode 100644 index 000000000000..6eb5c20ee373 --- /dev/null +++ b/arch/x86/kvm/assigned-dev.c @@ -0,0 +1,1052 @@ +/* + * Kernel-based Virtual Machine - device assignment support + * + * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <linux/errno.h> +#include <linux/spinlock.h> +#include <linux/pci.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/namei.h> +#include <linux/fs.h> +#include "irq.h" +#include "assigned-dev.h" + +struct kvm_assigned_dev_kernel { + struct kvm_irq_ack_notifier ack_notifier; + struct list_head list; + int assigned_dev_id; + int host_segnr; + int host_busnr; + int host_devfn; + unsigned int entries_nr; + int host_irq; + bool host_irq_disabled; + bool pci_2_3; + struct msix_entry *host_msix_entries; + int guest_irq; + struct msix_entry *guest_msix_entries; + unsigned long irq_requested_type; + int irq_source_id; + int flags; + struct pci_dev *dev; + struct kvm *kvm; + spinlock_t intx_lock; + spinlock_t intx_mask_lock; + char irq_name[32]; + struct pci_saved_state *pci_saved_state; +}; + +static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, + int assigned_dev_id) +{ + struct list_head *ptr; + struct kvm_assigned_dev_kernel *match; + + list_for_each(ptr, head) { + match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); + if (match->assigned_dev_id == assigned_dev_id) + return match; + } + return NULL; +} + +static int find_index_from_host_irq(struct kvm_assigned_dev_kernel + *assigned_dev, int irq) +{ + int i, index; + struct msix_entry *host_msix_entries; + + host_msix_entries = assigned_dev->host_msix_entries; + + index = -1; + for (i = 0; i < assigned_dev->entries_nr; i++) + if (irq == host_msix_entries[i].vector) { + index = i; + break; + } + if (index < 0) + printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); + + return index; +} + +static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + int ret; + + spin_lock(&assigned_dev->intx_lock); + if (pci_check_and_mask_intx(assigned_dev->dev)) { + assigned_dev->host_irq_disabled = true; + ret = IRQ_WAKE_THREAD; + } else + ret = IRQ_NONE; + spin_unlock(&assigned_dev->intx_lock); + + return ret; +} + +static void +kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev, + int vector) +{ + if (unlikely(assigned_dev->irq_requested_type & + KVM_DEV_IRQ_GUEST_INTX)) { + spin_lock(&assigned_dev->intx_mask_lock); + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) + kvm_set_irq(assigned_dev->kvm, + assigned_dev->irq_source_id, vector, 1, + false); + spin_unlock(&assigned_dev->intx_mask_lock); + } else + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, + vector, 1, false); +} + +static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { + spin_lock_irq(&assigned_dev->intx_lock); + disable_irq_nosync(irq); + assigned_dev->host_irq_disabled = true; + spin_unlock_irq(&assigned_dev->intx_lock); + } + + kvm_assigned_dev_raise_guest_irq(assigned_dev, + assigned_dev->guest_irq); + + return IRQ_HANDLED; +} + +#ifdef __KVM_HAVE_MSI +static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + int ret = kvm_set_irq_inatomic(assigned_dev->kvm, + assigned_dev->irq_source_id, + assigned_dev->guest_irq, 1); + return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; +} + +static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + + kvm_assigned_dev_raise_guest_irq(assigned_dev, + assigned_dev->guest_irq); + + return IRQ_HANDLED; +} +#endif + +#ifdef __KVM_HAVE_MSIX +static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + int index = find_index_from_host_irq(assigned_dev, irq); + u32 vector; + int ret = 0; + + if (index >= 0) { + vector = assigned_dev->guest_msix_entries[index].vector; + ret = kvm_set_irq_inatomic(assigned_dev->kvm, + assigned_dev->irq_source_id, + vector, 1); + } + + return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; +} + +static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + int index = find_index_from_host_irq(assigned_dev, irq); + u32 vector; + + if (index >= 0) { + vector = assigned_dev->guest_msix_entries[index].vector; + kvm_assigned_dev_raise_guest_irq(assigned_dev, vector); + } + + return IRQ_HANDLED; +} +#endif + +/* Ack the irq line for an assigned device */ +static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) +{ + struct kvm_assigned_dev_kernel *dev = + container_of(kian, struct kvm_assigned_dev_kernel, + ack_notifier); + + kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false); + + spin_lock(&dev->intx_mask_lock); + + if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) { + bool reassert = false; + + spin_lock_irq(&dev->intx_lock); + /* + * The guest IRQ may be shared so this ack can come from an + * IRQ for another guest device. + */ + if (dev->host_irq_disabled) { + if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) + enable_irq(dev->host_irq); + else if (!pci_check_and_unmask_intx(dev->dev)) + reassert = true; + dev->host_irq_disabled = reassert; + } + spin_unlock_irq(&dev->intx_lock); + + if (reassert) + kvm_set_irq(dev->kvm, dev->irq_source_id, + dev->guest_irq, 1, false); + } + + spin_unlock(&dev->intx_mask_lock); +} + +static void deassign_guest_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + if (assigned_dev->ack_notifier.gsi != -1) + kvm_unregister_irq_ack_notifier(kvm, + &assigned_dev->ack_notifier); + + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, + assigned_dev->guest_irq, 0, false); + + if (assigned_dev->irq_source_id != -1) + kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); + assigned_dev->irq_source_id = -1; + assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK); +} + +/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ +static void deassign_host_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + /* + * We disable irq here to prevent further events. + * + * Notice this maybe result in nested disable if the interrupt type is + * INTx, but it's OK for we are going to free it. + * + * If this function is a part of VM destroy, please ensure that till + * now, the kvm state is still legal for probably we also have to wait + * on a currently running IRQ handler. + */ + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { + int i; + for (i = 0; i < assigned_dev->entries_nr; i++) + disable_irq(assigned_dev->host_msix_entries[i].vector); + + for (i = 0; i < assigned_dev->entries_nr; i++) + free_irq(assigned_dev->host_msix_entries[i].vector, + assigned_dev); + + assigned_dev->entries_nr = 0; + kfree(assigned_dev->host_msix_entries); + kfree(assigned_dev->guest_msix_entries); + pci_disable_msix(assigned_dev->dev); + } else { + /* Deal with MSI and INTx */ + if ((assigned_dev->irq_requested_type & + KVM_DEV_IRQ_HOST_INTX) && + (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { + spin_lock_irq(&assigned_dev->intx_lock); + pci_intx(assigned_dev->dev, false); + spin_unlock_irq(&assigned_dev->intx_lock); + synchronize_irq(assigned_dev->host_irq); + } else + disable_irq(assigned_dev->host_irq); + + free_irq(assigned_dev->host_irq, assigned_dev); + + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) + pci_disable_msi(assigned_dev->dev); + } + + assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK); +} + +static int kvm_deassign_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev, + unsigned long irq_requested_type) +{ + unsigned long guest_irq_type, host_irq_type; + + if (!irqchip_in_kernel(kvm)) + return -EINVAL; + /* no irq assignment to deassign */ + if (!assigned_dev->irq_requested_type) + return -ENXIO; + + host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK; + guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK; + + if (host_irq_type) + deassign_host_irq(kvm, assigned_dev); + if (guest_irq_type) + deassign_guest_irq(kvm, assigned_dev); + + return 0; +} + +static void kvm_free_assigned_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); +} + +static void kvm_free_assigned_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel + *assigned_dev) +{ + kvm_free_assigned_irq(kvm, assigned_dev); + + pci_reset_function(assigned_dev->dev); + if (pci_load_and_free_saved_state(assigned_dev->dev, + &assigned_dev->pci_saved_state)) + printk(KERN_INFO "%s: Couldn't reload %s saved state\n", + __func__, dev_name(&assigned_dev->dev->dev)); + else + pci_restore_state(assigned_dev->dev); + + pci_clear_dev_assigned(assigned_dev->dev); + + pci_release_regions(assigned_dev->dev); + pci_disable_device(assigned_dev->dev); + pci_dev_put(assigned_dev->dev); + + list_del(&assigned_dev->list); + kfree(assigned_dev); +} + +void kvm_free_all_assigned_devices(struct kvm *kvm) +{ + struct list_head *ptr, *ptr2; + struct kvm_assigned_dev_kernel *assigned_dev; + + list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { + assigned_dev = list_entry(ptr, + struct kvm_assigned_dev_kernel, + list); + + kvm_free_assigned_device(kvm, assigned_dev); + } +} + +static int assigned_device_enable_host_intx(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev) +{ + irq_handler_t irq_handler; + unsigned long flags; + + dev->host_irq = dev->dev->irq; + + /* + * We can only share the IRQ line with other host devices if we are + * able to disable the IRQ source at device-level - independently of + * the guest driver. Otherwise host devices may suffer from unbounded + * IRQ latencies when the guest keeps the line asserted. + */ + if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { + irq_handler = kvm_assigned_dev_intx; + flags = IRQF_SHARED; + } else { + irq_handler = NULL; + flags = IRQF_ONESHOT; + } + if (request_threaded_irq(dev->host_irq, irq_handler, + kvm_assigned_dev_thread_intx, flags, + dev->irq_name, dev)) + return -EIO; + + if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { + spin_lock_irq(&dev->intx_lock); + pci_intx(dev->dev, true); + spin_unlock_irq(&dev->intx_lock); + } + return 0; +} + +#ifdef __KVM_HAVE_MSI +static int assigned_device_enable_host_msi(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev) +{ + int r; + + if (!dev->dev->msi_enabled) { + r = pci_enable_msi(dev->dev); + if (r) + return r; + } + + dev->host_irq = dev->dev->irq; + if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi, + kvm_assigned_dev_thread_msi, 0, + dev->irq_name, dev)) { + pci_disable_msi(dev->dev); + return -EIO; + } + + return 0; +} +#endif + +#ifdef __KVM_HAVE_MSIX +static int assigned_device_enable_host_msix(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev) +{ + int i, r = -EINVAL; + + /* host_msix_entries and guest_msix_entries should have been + * initialized */ + if (dev->entries_nr == 0) + return r; + + r = pci_enable_msix_exact(dev->dev, + dev->host_msix_entries, dev->entries_nr); + if (r) + return r; + + for (i = 0; i < dev->entries_nr; i++) { + r = request_threaded_irq(dev->host_msix_entries[i].vector, + kvm_assigned_dev_msix, + kvm_assigned_dev_thread_msix, + 0, dev->irq_name, dev); + if (r) + goto err; + } + + return 0; +err: + for (i -= 1; i >= 0; i--) + free_irq(dev->host_msix_entries[i].vector, dev); + pci_disable_msix(dev->dev); + return r; +} + +#endif + +static int assigned_device_enable_guest_intx(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq) +{ + dev->guest_irq = irq->guest_irq; + dev->ack_notifier.gsi = irq->guest_irq; + return 0; +} + +#ifdef __KVM_HAVE_MSI +static int assigned_device_enable_guest_msi(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq) +{ + dev->guest_irq = irq->guest_irq; + dev->ack_notifier.gsi = -1; + return 0; +} +#endif + +#ifdef __KVM_HAVE_MSIX +static int assigned_device_enable_guest_msix(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq) +{ + dev->guest_irq = irq->guest_irq; + dev->ack_notifier.gsi = -1; + return 0; +} +#endif + +static int assign_host_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + __u32 host_irq_type) +{ + int r = -EEXIST; + + if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) + return r; + + snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s", + pci_name(dev->dev)); + + switch (host_irq_type) { + case KVM_DEV_IRQ_HOST_INTX: + r = assigned_device_enable_host_intx(kvm, dev); + break; +#ifdef __KVM_HAVE_MSI + case KVM_DEV_IRQ_HOST_MSI: + r = assigned_device_enable_host_msi(kvm, dev); + break; +#endif +#ifdef __KVM_HAVE_MSIX + case KVM_DEV_IRQ_HOST_MSIX: + r = assigned_device_enable_host_msix(kvm, dev); + break; +#endif + default: + r = -EINVAL; + } + dev->host_irq_disabled = false; + + if (!r) + dev->irq_requested_type |= host_irq_type; + + return r; +} + +static int assign_guest_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq, + unsigned long guest_irq_type) +{ + int id; + int r = -EEXIST; + + if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK) + return r; + + id = kvm_request_irq_source_id(kvm); + if (id < 0) + return id; + + dev->irq_source_id = id; + + switch (guest_irq_type) { + case KVM_DEV_IRQ_GUEST_INTX: + r = assigned_device_enable_guest_intx(kvm, dev, irq); + break; +#ifdef __KVM_HAVE_MSI + case KVM_DEV_IRQ_GUEST_MSI: + r = assigned_device_enable_guest_msi(kvm, dev, irq); + break; +#endif +#ifdef __KVM_HAVE_MSIX + case KVM_DEV_IRQ_GUEST_MSIX: + r = assigned_device_enable_guest_msix(kvm, dev, irq); + break; +#endif + default: + r = -EINVAL; + } + + if (!r) { + dev->irq_requested_type |= guest_irq_type; + if (dev->ack_notifier.gsi != -1) + kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); + } else { + kvm_free_irq_source_id(kvm, dev->irq_source_id); + dev->irq_source_id = -1; + } + + return r; +} + +/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ +static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, + struct kvm_assigned_irq *assigned_irq) +{ + int r = -EINVAL; + struct kvm_assigned_dev_kernel *match; + unsigned long host_irq_type, guest_irq_type; + + if (!irqchip_in_kernel(kvm)) + return r; + + mutex_lock(&kvm->lock); + r = -ENODEV; + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_irq->assigned_dev_id); + if (!match) + goto out; + + host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK); + guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK); + + r = -EINVAL; + /* can only assign one type at a time */ + if (hweight_long(host_irq_type) > 1) + goto out; + if (hweight_long(guest_irq_type) > 1) + goto out; + if (host_irq_type == 0 && guest_irq_type == 0) + goto out; + + r = 0; + if (host_irq_type) + r = assign_host_irq(kvm, match, host_irq_type); + if (r) + goto out; + + if (guest_irq_type) + r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type); +out: + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, + struct kvm_assigned_irq + *assigned_irq) +{ + int r = -ENODEV; + struct kvm_assigned_dev_kernel *match; + unsigned long irq_type; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_irq->assigned_dev_id); + if (!match) + goto out; + + irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK | + KVM_DEV_IRQ_GUEST_MASK); + r = kvm_deassign_irq(kvm, match, irq_type); +out: + mutex_unlock(&kvm->lock); + return r; +} + +/* + * We want to test whether the caller has been granted permissions to + * use this device. To be able to configure and control the device, + * the user needs access to PCI configuration space and BAR resources. + * These are accessed through PCI sysfs. PCI config space is often + * passed to the process calling this ioctl via file descriptor, so we + * can't rely on access to that file. We can check for permissions + * on each of the BAR resource files, which is a pretty clear + * indicator that the user has been granted access to the device. + */ +static int probe_sysfs_permissions(struct pci_dev *dev) +{ +#ifdef CONFIG_SYSFS + int i; + bool bar_found = false; + + for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) { + char *kpath, *syspath; + struct path path; + struct inode *inode; + int r; + + if (!pci_resource_len(dev, i)) + continue; + + kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL); + if (!kpath) + return -ENOMEM; + + /* Per sysfs-rules, sysfs is always at /sys */ + syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i); + kfree(kpath); + if (!syspath) + return -ENOMEM; + + r = kern_path(syspath, LOOKUP_FOLLOW, &path); + kfree(syspath); + if (r) + return r; + + inode = path.dentry->d_inode; + + r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS); + path_put(&path); + if (r) + return r; + + bar_found = true; + } + + /* If no resources, probably something special */ + if (!bar_found) + return -EPERM; + + return 0; +#else + return -EINVAL; /* No way to control the device without sysfs */ +#endif +} + +static int kvm_vm_ioctl_assign_device(struct kvm *kvm, + struct kvm_assigned_pci_dev *assigned_dev) +{ + int r = 0, idx; + struct kvm_assigned_dev_kernel *match; + struct pci_dev *dev; + + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)) + return -EINVAL; + + mutex_lock(&kvm->lock); + idx = srcu_read_lock(&kvm->srcu); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_dev->assigned_dev_id); + if (match) { + /* device already assigned */ + r = -EEXIST; + goto out; + } + + match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); + if (match == NULL) { + printk(KERN_INFO "%s: Couldn't allocate memory\n", + __func__); + r = -ENOMEM; + goto out; + } + dev = pci_get_domain_bus_and_slot(assigned_dev->segnr, + assigned_dev->busnr, + assigned_dev->devfn); + if (!dev) { + printk(KERN_INFO "%s: host device not found\n", __func__); + r = -EINVAL; + goto out_free; + } + + /* Don't allow bridges to be assigned */ + if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) { + r = -EPERM; + goto out_put; + } + + r = probe_sysfs_permissions(dev); + if (r) + goto out_put; + + if (pci_enable_device(dev)) { + printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); + r = -EBUSY; + goto out_put; + } + r = pci_request_regions(dev, "kvm_assigned_device"); + if (r) { + printk(KERN_INFO "%s: Could not get access to device regions\n", + __func__); + goto out_disable; + } + + pci_reset_function(dev); + pci_save_state(dev); + match->pci_saved_state = pci_store_saved_state(dev); + if (!match->pci_saved_state) + printk(KERN_DEBUG "%s: Couldn't store %s saved state\n", + __func__, dev_name(&dev->dev)); + + if (!pci_intx_mask_supported(dev)) + assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3; + + match->assigned_dev_id = assigned_dev->assigned_dev_id; + match->host_segnr = assigned_dev->segnr; + match->host_busnr = assigned_dev->busnr; + match->host_devfn = assigned_dev->devfn; + match->flags = assigned_dev->flags; + match->dev = dev; + spin_lock_init(&match->intx_lock); + spin_lock_init(&match->intx_mask_lock); + match->irq_source_id = -1; + match->kvm = kvm; + match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; + + list_add(&match->list, &kvm->arch.assigned_dev_head); + + if (!kvm->arch.iommu_domain) { + r = kvm_iommu_map_guest(kvm); + if (r) + goto out_list_del; + } + r = kvm_assign_device(kvm, match->dev); + if (r) + goto out_list_del; + +out: + srcu_read_unlock(&kvm->srcu, idx); + mutex_unlock(&kvm->lock); + return r; +out_list_del: + if (pci_load_and_free_saved_state(dev, &match->pci_saved_state)) + printk(KERN_INFO "%s: Couldn't reload %s saved state\n", + __func__, dev_name(&dev->dev)); + list_del(&match->list); + pci_release_regions(dev); +out_disable: + pci_disable_device(dev); +out_put: + pci_dev_put(dev); +out_free: + kfree(match); + srcu_read_unlock(&kvm->srcu, idx); + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, + struct kvm_assigned_pci_dev *assigned_dev) +{ + int r = 0; + struct kvm_assigned_dev_kernel *match; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_dev->assigned_dev_id); + if (!match) { + printk(KERN_INFO "%s: device hasn't been assigned before, " + "so cannot be deassigned\n", __func__); + r = -EINVAL; + goto out; + } + + kvm_deassign_device(kvm, match->dev); + + kvm_free_assigned_device(kvm, match); + +out: + mutex_unlock(&kvm->lock); + return r; +} + + +#ifdef __KVM_HAVE_MSIX +static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, + struct kvm_assigned_msix_nr *entry_nr) +{ + int r = 0; + struct kvm_assigned_dev_kernel *adev; + + mutex_lock(&kvm->lock); + + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + entry_nr->assigned_dev_id); + if (!adev) { + r = -EINVAL; + goto msix_nr_out; + } + + if (adev->entries_nr == 0) { + adev->entries_nr = entry_nr->entry_nr; + if (adev->entries_nr == 0 || + adev->entries_nr > KVM_MAX_MSIX_PER_DEV) { + r = -EINVAL; + goto msix_nr_out; + } + + adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) * + entry_nr->entry_nr, + GFP_KERNEL); + if (!adev->host_msix_entries) { + r = -ENOMEM; + goto msix_nr_out; + } + adev->guest_msix_entries = + kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr, + GFP_KERNEL); + if (!adev->guest_msix_entries) { + kfree(adev->host_msix_entries); + r = -ENOMEM; + goto msix_nr_out; + } + } else /* Not allowed set MSI-X number twice */ + r = -EINVAL; +msix_nr_out: + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, + struct kvm_assigned_msix_entry *entry) +{ + int r = 0, i; + struct kvm_assigned_dev_kernel *adev; + + mutex_lock(&kvm->lock); + + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + entry->assigned_dev_id); + + if (!adev) { + r = -EINVAL; + goto msix_entry_out; + } + + for (i = 0; i < adev->entries_nr; i++) + if (adev->guest_msix_entries[i].vector == 0 || + adev->guest_msix_entries[i].entry == entry->entry) { + adev->guest_msix_entries[i].entry = entry->entry; + adev->guest_msix_entries[i].vector = entry->gsi; + adev->host_msix_entries[i].entry = entry->entry; + break; + } + if (i == adev->entries_nr) { + r = -ENOSPC; + goto msix_entry_out; + } + +msix_entry_out: + mutex_unlock(&kvm->lock); + + return r; +} +#endif + +static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, + struct kvm_assigned_pci_dev *assigned_dev) +{ + int r = 0; + struct kvm_assigned_dev_kernel *match; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_dev->assigned_dev_id); + if (!match) { + r = -ENODEV; + goto out; + } + + spin_lock(&match->intx_mask_lock); + + match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX; + match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX; + + if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { + if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) { + kvm_set_irq(match->kvm, match->irq_source_id, + match->guest_irq, 0, false); + /* + * Masking at hardware-level is performed on demand, + * i.e. when an IRQ actually arrives at the host. + */ + } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { + /* + * Unmask the IRQ line if required. Unmasking at + * device level will be performed by user space. + */ + spin_lock_irq(&match->intx_lock); + if (match->host_irq_disabled) { + enable_irq(match->host_irq); + match->host_irq_disabled = false; + } + spin_unlock_irq(&match->intx_lock); + } + } + + spin_unlock(&match->intx_mask_lock); + +out: + mutex_unlock(&kvm->lock); + return r; +} + +long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, + unsigned long arg) +{ + void __user *argp = (void __user *)arg; + int r; + + switch (ioctl) { + case KVM_ASSIGN_PCI_DEVICE: { + struct kvm_assigned_pci_dev assigned_dev; + + r = -EFAULT; + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) + goto out; + r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); + if (r) + goto out; + break; + } + case KVM_ASSIGN_IRQ: { + r = -EOPNOTSUPP; + break; + } + case KVM_ASSIGN_DEV_IRQ: { + struct kvm_assigned_irq assigned_irq; + + r = -EFAULT; + if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) + goto out; + r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); + if (r) + goto out; + break; + } + case KVM_DEASSIGN_DEV_IRQ: { + struct kvm_assigned_irq assigned_irq; + + r = -EFAULT; + if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) + goto out; + r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq); + if (r) + goto out; + break; + } + case KVM_DEASSIGN_PCI_DEVICE: { + struct kvm_assigned_pci_dev assigned_dev; + + r = -EFAULT; + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) + goto out; + r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev); + if (r) + goto out; + break; + } +#ifdef __KVM_HAVE_MSIX + case KVM_ASSIGN_SET_MSIX_NR: { + struct kvm_assigned_msix_nr entry_nr; + r = -EFAULT; + if (copy_from_user(&entry_nr, argp, sizeof entry_nr)) + goto out; + r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr); + if (r) + goto out; + break; + } + case KVM_ASSIGN_SET_MSIX_ENTRY: { + struct kvm_assigned_msix_entry entry; + r = -EFAULT; + if (copy_from_user(&entry, argp, sizeof entry)) + goto out; + r = kvm_vm_ioctl_set_msix_entry(kvm, &entry); + if (r) + goto out; + break; + } +#endif + case KVM_ASSIGN_SET_INTX_MASK: { + struct kvm_assigned_pci_dev assigned_dev; + + r = -EFAULT; + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) + goto out; + r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev); + break; + } + default: + r = -ENOTTY; + break; + } +out: + return r; +} diff --git a/arch/x86/kvm/assigned-dev.h b/arch/x86/kvm/assigned-dev.h new file mode 100644 index 000000000000..a428c1a211b2 --- /dev/null +++ b/arch/x86/kvm/assigned-dev.h @@ -0,0 +1,32 @@ +#ifndef ARCH_X86_KVM_ASSIGNED_DEV_H +#define ARCH_X86_KVM_ASSIGNED_DEV_H + +#include <linux/kvm_host.h> + +#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT +int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev); +int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev); + +int kvm_iommu_map_guest(struct kvm *kvm); +int kvm_iommu_unmap_guest(struct kvm *kvm); + +long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, + unsigned long arg); + +void kvm_free_all_assigned_devices(struct kvm *kvm); +#else +static inline int kvm_iommu_unmap_guest(struct kvm *kvm) +{ + return 0; +} + +static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, + unsigned long arg) +{ + return -ENOTTY; +} + +static inline void kvm_free_all_assigned_devices(struct kvm *kvm) {} +#endif /* CONFIG_KVM_DEVICE_ASSIGNMENT */ + +#endif /* ARCH_X86_KVM_ASSIGNED_DEV_H */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 976e3a57f9ea..8a80737ee6e6 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -23,7 +23,7 @@ #include "mmu.h" #include "trace.h" -static u32 xstate_required_size(u64 xstate_bv) +static u32 xstate_required_size(u64 xstate_bv, bool compacted) { int feature_bit = 0; u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; @@ -31,9 +31,10 @@ static u32 xstate_required_size(u64 xstate_bv) xstate_bv &= XSTATE_EXTEND_MASK; while (xstate_bv) { if (xstate_bv & 0x1) { - u32 eax, ebx, ecx, edx; + u32 eax, ebx, ecx, edx, offset; cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx); - ret = max(ret, eax + ebx); + offset = compacted ? ret : ebx; + ret = max(ret, offset + eax); } xstate_bv >>= 1; @@ -53,6 +54,8 @@ u64 kvm_supported_xcr0(void) return xcr0; } +#define F(x) bit(X86_FEATURE_##x) + int kvm_update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -64,13 +67,13 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) /* Update OSXSAVE bit */ if (cpu_has_xsave && best->function == 0x1) { - best->ecx &= ~(bit(X86_FEATURE_OSXSAVE)); + best->ecx &= ~F(OSXSAVE); if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) - best->ecx |= bit(X86_FEATURE_OSXSAVE); + best->ecx |= F(OSXSAVE); } if (apic) { - if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER)) + if (best->ecx & F(TSC_DEADLINE_TIMER)) apic->lapic_timer.timer_mode_mask = 3 << 17; else apic->lapic_timer.timer_mode_mask = 1 << 17; @@ -85,9 +88,13 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) (best->eax | ((u64)best->edx << 32)) & kvm_supported_xcr0(); vcpu->arch.guest_xstate_size = best->ebx = - xstate_required_size(vcpu->arch.xcr0); + xstate_required_size(vcpu->arch.xcr0, false); } + best = kvm_find_cpuid_entry(vcpu, 0xD, 1); + if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) + best->ebx = xstate_required_size(vcpu->arch.xcr0, true); + /* * The existing code assumes virtual address is 48-bit in the canonical * address checks; exit if it is ever changed. @@ -122,8 +129,8 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) break; } } - if (entry && (entry->edx & bit(X86_FEATURE_NX)) && !is_efer_nx()) { - entry->edx &= ~bit(X86_FEATURE_NX); + if (entry && (entry->edx & F(NX)) && !is_efer_nx()) { + entry->edx &= ~F(NX); printk(KERN_INFO "kvm: guest NX capability removed\n"); } } @@ -227,8 +234,6 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags = 0; } -#define F(x) bit(X86_FEATURE_##x) - static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, u32 func, u32 index, int *nent, int maxnent) { @@ -267,6 +272,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; unsigned f_mpx = kvm_x86_ops->mpx_supported() ? F(MPX) : 0; + unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; /* cpuid 1.edx */ const u32 kvm_supported_word0_x86_features = @@ -317,7 +323,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_supported_word9_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | - F(ADX) | F(SMAP); + F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | + F(AVX512CD); + + /* cpuid 0xD.1.eax */ + const u32 kvm_supported_word10_x86_features = + F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves; /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -453,16 +464,34 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, u64 supported = kvm_supported_xcr0(); entry->eax &= supported; + entry->ebx = xstate_required_size(supported, false); + entry->ecx = entry->ebx; entry->edx &= supported >> 32; entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + if (!supported) + break; + for (idx = 1, i = 1; idx < 64; ++idx) { u64 mask = ((u64)1 << idx); if (*nent >= maxnent) goto out; do_cpuid_1_ent(&entry[i], function, idx); - if (entry[i].eax == 0 || !(supported & mask)) - continue; + if (idx == 1) { + entry[i].eax &= kvm_supported_word10_x86_features; + entry[i].ebx = 0; + if (entry[i].eax & (F(XSAVES)|F(XSAVEC))) + entry[i].ebx = + xstate_required_size(supported, + true); + } else { + if (entry[i].eax == 0 || !(supported & mask)) + continue; + if (WARN_ON_ONCE(entry[i].ecx & 1)) + continue; + } + entry[i].ecx = 0; + entry[i].edx = 0; entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; ++*nent; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9f8a2faf5040..169b09d76ddd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -123,6 +123,7 @@ #define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */ #define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */ #define Escape (5<<15) /* Escape to coprocessor instruction */ +#define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */ #define Sse (1<<18) /* SSE Vector instruction */ /* Generic ModRM decode. */ #define ModRM (1<<19) @@ -166,6 +167,8 @@ #define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */ #define NoBigReal ((u64)1 << 50) /* No big real mode */ #define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */ +#define NearBranch ((u64)1 << 52) /* Near branches */ +#define No16 ((u64)1 << 53) /* No 16 bit operand */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -209,6 +212,7 @@ struct opcode { const struct group_dual *gdual; const struct gprefix *gprefix; const struct escape *esc; + const struct instr_dual *idual; void (*fastop)(struct fastop *fake); } u; int (*check_perm)(struct x86_emulate_ctxt *ctxt); @@ -231,6 +235,11 @@ struct escape { struct opcode high[64]; }; +struct instr_dual { + struct opcode mod012; + struct opcode mod3; +}; + /* EFLAGS bit definitions. */ #define EFLG_ID (1<<21) #define EFLG_VIP (1<<20) @@ -379,6 +388,15 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); ON64(FOP2E(op##q, rax, cl)) \ FOP_END +/* 2 operand, src and dest are reversed */ +#define FASTOP2R(op, name) \ + FOP_START(name) \ + FOP2E(op##b, dl, al) \ + FOP2E(op##w, dx, ax) \ + FOP2E(op##l, edx, eax) \ + ON64(FOP2E(op##q, rdx, rax)) \ + FOP_END + #define FOP3E(op, dst, src, src2) \ FOP_ALIGN #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET @@ -477,9 +495,9 @@ address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) } static inline unsigned long -register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg) +register_address(struct x86_emulate_ctxt *ctxt, int reg) { - return address_mask(ctxt, reg); + return address_mask(ctxt, reg_read(ctxt, reg)); } static void masked_increment(ulong *reg, ulong mask, int inc) @@ -488,7 +506,7 @@ static void masked_increment(ulong *reg, ulong mask, int inc) } static inline void -register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc) +register_address_increment(struct x86_emulate_ctxt *ctxt, int reg, int inc) { ulong mask; @@ -496,7 +514,7 @@ register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, in mask = ~0UL; else mask = ad_mask(ctxt); - masked_increment(reg, mask, inc); + masked_increment(reg_rmw(ctxt, reg), mask, inc); } static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) @@ -564,40 +582,6 @@ static int emulate_nm(struct x86_emulate_ctxt *ctxt) return emulate_exception(ctxt, NM_VECTOR, 0, false); } -static inline int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst, - int cs_l) -{ - switch (ctxt->op_bytes) { - case 2: - ctxt->_eip = (u16)dst; - break; - case 4: - ctxt->_eip = (u32)dst; - break; -#ifdef CONFIG_X86_64 - case 8: - if ((cs_l && is_noncanonical_address(dst)) || - (!cs_l && (dst >> 32) != 0)) - return emulate_gp(ctxt, 0); - ctxt->_eip = dst; - break; -#endif - default: - WARN(1, "unsupported eip assignment size\n"); - } - return X86EMUL_CONTINUE; -} - -static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst) -{ - return assign_eip_far(ctxt, dst, ctxt->mode == X86EMUL_MODE_PROT64); -} - -static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) -{ - return assign_eip_near(ctxt, ctxt->_eip + rel); -} - static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg) { u16 selector; @@ -641,25 +625,24 @@ static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size) return true; } -static int __linearize(struct x86_emulate_ctxt *ctxt, - struct segmented_address addr, - unsigned *max_size, unsigned size, - bool write, bool fetch, - ulong *linear) +static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + unsigned *max_size, unsigned size, + bool write, bool fetch, + enum x86emul_mode mode, ulong *linear) { struct desc_struct desc; bool usable; ulong la; u32 lim; u16 sel; - unsigned cpl; la = seg_base(ctxt, addr.seg) + addr.ea; *max_size = 0; - switch (ctxt->mode) { + switch (mode) { case X86EMUL_MODE_PROT64: - if (((signed long)la << 16) >> 16 != la) - return emulate_gp(ctxt, 0); + if (is_noncanonical_address(la)) + goto bad; *max_size = min_t(u64, ~0u, (1ull << 48) - la); if (size > *max_size) @@ -678,46 +661,20 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, if (!fetch && (desc.type & 8) && !(desc.type & 2)) goto bad; lim = desc_limit_scaled(&desc); - if ((ctxt->mode == X86EMUL_MODE_REAL) && !fetch && - (ctxt->d & NoBigReal)) { - /* la is between zero and 0xffff */ - if (la > 0xffff) - goto bad; - *max_size = 0x10000 - la; - } else if ((desc.type & 8) || !(desc.type & 4)) { - /* expand-up segment */ - if (addr.ea > lim) - goto bad; - *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea); - } else { + if (!(desc.type & 8) && (desc.type & 4)) { /* expand-down segment */ if (addr.ea <= lim) goto bad; lim = desc.d ? 0xffffffff : 0xffff; - if (addr.ea > lim) - goto bad; - *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea); } + if (addr.ea > lim) + goto bad; + *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea); if (size > *max_size) goto bad; - cpl = ctxt->ops->cpl(ctxt); - if (!(desc.type & 8)) { - /* data segment */ - if (cpl > desc.dpl) - goto bad; - } else if ((desc.type & 8) && !(desc.type & 4)) { - /* nonconforming code segment */ - if (cpl != desc.dpl) - goto bad; - } else if ((desc.type & 8) && (desc.type & 4)) { - /* conforming code segment */ - if (cpl < desc.dpl) - goto bad; - } + la &= (u32)-1; break; } - if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8) - la &= (u32)-1; if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0)) return emulate_gp(ctxt, 0); *linear = la; @@ -735,9 +692,55 @@ static int linearize(struct x86_emulate_ctxt *ctxt, ulong *linear) { unsigned max_size; - return __linearize(ctxt, addr, &max_size, size, write, false, linear); + return __linearize(ctxt, addr, &max_size, size, write, false, + ctxt->mode, linear); +} + +static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst, + enum x86emul_mode mode) +{ + ulong linear; + int rc; + unsigned max_size; + struct segmented_address addr = { .seg = VCPU_SREG_CS, + .ea = dst }; + + if (ctxt->op_bytes != sizeof(unsigned long)) + addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1); + rc = __linearize(ctxt, addr, &max_size, 1, false, true, mode, &linear); + if (rc == X86EMUL_CONTINUE) + ctxt->_eip = addr.ea; + return rc; +} + +static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst) +{ + return assign_eip(ctxt, dst, ctxt->mode); } +static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst, + const struct desc_struct *cs_desc) +{ + enum x86emul_mode mode = ctxt->mode; + +#ifdef CONFIG_X86_64 + if (ctxt->mode >= X86EMUL_MODE_PROT32 && cs_desc->l) { + u64 efer = 0; + + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + if (efer & EFER_LMA) + mode = X86EMUL_MODE_PROT64; + } +#endif + if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32) + mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + return assign_eip(ctxt, dst, mode); +} + +static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) +{ + return assign_eip_near(ctxt, ctxt->_eip + rel); +} static int segmented_read_std(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, @@ -776,7 +779,8 @@ static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) * boundary check itself. Instead, we use max_size to check * against op_size. */ - rc = __linearize(ctxt, addr, &max_size, 0, false, true, &linear); + rc = __linearize(ctxt, addr, &max_size, 0, false, true, ctxt->mode, + &linear); if (unlikely(rc != X86EMUL_CONTINUE)) return rc; @@ -911,6 +915,8 @@ FASTOP2W(btc); FASTOP2(xadd); +FASTOP2R(cmp, cmp_r); + static u8 test_cc(unsigned int condition, unsigned long flags) { u8 rc; @@ -1221,6 +1227,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, if (index_reg != 4) modrm_ea += reg_read(ctxt, index_reg) << scale; } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { + modrm_ea += insn_fetch(s32, ctxt); if (ctxt->mode == X86EMUL_MODE_PROT64) ctxt->rip_relative = 1; } else { @@ -1229,10 +1236,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, adjust_modrm_seg(ctxt, base_reg); } switch (ctxt->modrm_mod) { - case 0: - if (ctxt->modrm_rm == 5) - modrm_ea += insn_fetch(s32, ctxt); - break; case 1: modrm_ea += insn_fetch(s8, ctxt); break; @@ -1284,7 +1287,8 @@ static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt) else sv = (s64)ctxt->src.val & (s64)mask; - ctxt->dst.addr.mem.ea += (sv >> 3); + ctxt->dst.addr.mem.ea = address_mask(ctxt, + ctxt->dst.addr.mem.ea + (sv >> 3)); } /* only subword offset */ @@ -1610,6 +1614,9 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, sizeof(base3), &ctxt->exception); if (ret != X86EMUL_CONTINUE) return ret; + if (is_noncanonical_address(get_desc_base(&seg_desc) | + ((u64)base3 << 32))) + return emulate_gp(ctxt, 0); } load: ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg); @@ -1807,6 +1814,10 @@ static int em_push_sreg(struct x86_emulate_ctxt *ctxt) int seg = ctxt->src2.val; ctxt->src.val = get_segment_selector(ctxt, seg); + if (ctxt->op_bytes == 4) { + rsp_increment(ctxt, -2); + ctxt->op_bytes = 2; + } return em_push(ctxt); } @@ -1850,7 +1861,7 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt) static int em_pushf(struct x86_emulate_ctxt *ctxt) { - ctxt->src.val = (unsigned long)ctxt->eflags; + ctxt->src.val = (unsigned long)ctxt->eflags & ~EFLG_VM; return em_push(ctxt); } @@ -2035,7 +2046,7 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - rc = assign_eip_far(ctxt, ctxt->src.val, new_desc.l); + rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc); if (rc != X86EMUL_CONTINUE) { WARN_ON(ctxt->mode != X86EMUL_MODE_PROT64); /* assigning eip failed; restore the old cs */ @@ -2045,31 +2056,22 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) return rc; } -static int em_grp45(struct x86_emulate_ctxt *ctxt) +static int em_jmp_abs(struct x86_emulate_ctxt *ctxt) { - int rc = X86EMUL_CONTINUE; + return assign_eip_near(ctxt, ctxt->src.val); +} - switch (ctxt->modrm_reg) { - case 2: /* call near abs */ { - long int old_eip; - old_eip = ctxt->_eip; - rc = assign_eip_near(ctxt, ctxt->src.val); - if (rc != X86EMUL_CONTINUE) - break; - ctxt->src.val = old_eip; - rc = em_push(ctxt); - break; - } - case 4: /* jmp abs */ - rc = assign_eip_near(ctxt, ctxt->src.val); - break; - case 5: /* jmp far */ - rc = em_jmp_far(ctxt); - break; - case 6: /* push */ - rc = em_push(ctxt); - break; - } +static int em_call_near_abs(struct x86_emulate_ctxt *ctxt) +{ + int rc; + long int old_eip; + + old_eip = ctxt->_eip; + rc = assign_eip_near(ctxt, ctxt->src.val); + if (rc != X86EMUL_CONTINUE) + return rc; + ctxt->src.val = old_eip; + rc = em_push(ctxt); return rc; } @@ -2128,11 +2130,11 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) /* Outer-privilege level return is not implemented */ if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl) return X86EMUL_UNHANDLEABLE; - rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, 0, false, + rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, false, &new_desc); if (rc != X86EMUL_CONTINUE) return rc; - rc = assign_eip_far(ctxt, eip, new_desc.l); + rc = assign_eip_far(ctxt, eip, &new_desc); if (rc != X86EMUL_CONTINUE) { WARN_ON(ctxt->mode != X86EMUL_MODE_PROT64); ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS); @@ -2316,6 +2318,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); ctxt->eflags &= ~msr_data; + ctxt->eflags |= EFLG_RESERVED_ONE_MASK; #endif } else { /* legacy mode */ @@ -2349,11 +2352,9 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) && !vendor_intel(ctxt)) return emulate_ud(ctxt); - /* XXX sysenter/sysexit have not been tested in 64bit mode. - * Therefore, we inject an #UD. - */ + /* sysenter/sysexit have not been tested in 64bit mode. */ if (ctxt->mode == X86EMUL_MODE_PROT64) - return emulate_ud(ctxt); + return X86EMUL_UNHANDLEABLE; setup_syscalls_segments(ctxt, &cs, &ss); @@ -2425,6 +2426,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) if ((msr_data & 0xfffc) == 0x0) return emulate_gp(ctxt, 0); ss_sel = (u16)(msr_data + 24); + rcx = (u32)rcx; + rdx = (u32)rdx; break; case X86EMUL_MODE_PROT64: cs_sel = (u16)(msr_data + 32); @@ -2599,7 +2602,6 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; save_state_to_tss16(ctxt, &tss_seg); @@ -2607,13 +2609,11 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; if (old_tss_sel != 0xffff) { @@ -2624,7 +2624,6 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, sizeof tss_seg.prev_task_link, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; } @@ -2813,7 +2812,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, * * 1. jmp/call/int to task gate: Check against DPL of the task gate * 2. Exception/IRQ/iret: No check is performed - * 3. jmp/call to TSS: Check against DPL of the TSS + * 3. jmp/call to TSS/task-gate: No check is performed since the + * hardware checks it before exiting. */ if (reason == TASK_SWITCH_GATE) { if (idt_index != -1) { @@ -2830,13 +2830,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl) return emulate_gp(ctxt, (idt_index << 3) | 0x2); } - } else if (reason != TASK_SWITCH_IRET) { - int dpl = next_tss_desc.dpl; - if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl) - return emulate_gp(ctxt, tss_selector); } - desc_limit = desc_limit_scaled(&next_tss_desc); if (!next_tss_desc.p || ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || @@ -2913,8 +2908,8 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg, { int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count; - register_address_increment(ctxt, reg_rmw(ctxt, reg), df * op->bytes); - op->addr.mem.ea = register_address(ctxt, reg_read(ctxt, reg)); + register_address_increment(ctxt, reg, df * op->bytes); + op->addr.mem.ea = register_address(ctxt, reg); } static int em_das(struct x86_emulate_ctxt *ctxt) @@ -3025,7 +3020,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return X86EMUL_CONTINUE; - rc = assign_eip_far(ctxt, ctxt->src.val, new_desc.l); + rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc); if (rc != X86EMUL_CONTINUE) goto fail; @@ -3215,6 +3210,8 @@ static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt) return emulate_ud(ctxt); ctxt->dst.val = get_segment_selector(ctxt, ctxt->modrm_reg); + if (ctxt->dst.bytes == 4 && ctxt->dst.type == OP_MEM) + ctxt->dst.bytes = 2; return X86EMUL_CONTINUE; } @@ -3317,7 +3314,7 @@ static int em_sidt(struct x86_emulate_ctxt *ctxt) return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt); } -static int em_lgdt(struct x86_emulate_ctxt *ctxt) +static int em_lgdt_lidt(struct x86_emulate_ctxt *ctxt, bool lgdt) { struct desc_ptr desc_ptr; int rc; @@ -3329,12 +3326,23 @@ static int em_lgdt(struct x86_emulate_ctxt *ctxt) ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; - ctxt->ops->set_gdt(ctxt, &desc_ptr); + if (ctxt->mode == X86EMUL_MODE_PROT64 && + is_noncanonical_address(desc_ptr.address)) + return emulate_gp(ctxt, 0); + if (lgdt) + ctxt->ops->set_gdt(ctxt, &desc_ptr); + else + ctxt->ops->set_idt(ctxt, &desc_ptr); /* Disable writeback. */ ctxt->dst.type = OP_NONE; return X86EMUL_CONTINUE; } +static int em_lgdt(struct x86_emulate_ctxt *ctxt) +{ + return em_lgdt_lidt(ctxt, true); +} + static int em_vmmcall(struct x86_emulate_ctxt *ctxt) { int rc; @@ -3348,20 +3356,7 @@ static int em_vmmcall(struct x86_emulate_ctxt *ctxt) static int em_lidt(struct x86_emulate_ctxt *ctxt) { - struct desc_ptr desc_ptr; - int rc; - - if (ctxt->mode == X86EMUL_MODE_PROT64) - ctxt->op_bytes = 8; - rc = read_descriptor(ctxt, ctxt->src.addr.mem, - &desc_ptr.size, &desc_ptr.address, - ctxt->op_bytes); - if (rc != X86EMUL_CONTINUE) - return rc; - ctxt->ops->set_idt(ctxt, &desc_ptr); - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - return X86EMUL_CONTINUE; + return em_lgdt_lidt(ctxt, false); } static int em_smsw(struct x86_emulate_ctxt *ctxt) @@ -3384,7 +3379,7 @@ static int em_loop(struct x86_emulate_ctxt *ctxt) { int rc = X86EMUL_CONTINUE; - register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1); + register_address_increment(ctxt, VCPU_REGS_RCX, -1); if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) && (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags))) rc = jmp_rel(ctxt, ctxt->src.val); @@ -3554,7 +3549,7 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); if (efer & EFER_LMA) - rsvd = CR3_L_MODE_RESERVED_BITS; + rsvd = CR3_L_MODE_RESERVED_BITS & ~CR3_PCID_INVD; if (new_val & rsvd) return emulate_gp(ctxt, 0); @@ -3596,8 +3591,15 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt) if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5)) return emulate_ud(ctxt); - if (check_dr7_gd(ctxt)) + if (check_dr7_gd(ctxt)) { + ulong dr6; + + ctxt->ops->get_dr(ctxt, 6, &dr6); + dr6 &= ~15; + dr6 |= DR6_BD | DR6_RTM; + ctxt->ops->set_dr(ctxt, 6, dr6); return emulate_db(ctxt); + } return X86EMUL_CONTINUE; } @@ -3684,6 +3686,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } #define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } +#define ID(_f, _i) { .flags = ((_f) | InstrDual | ModRM), .u.idual = (_i) } #define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) } #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } #define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) } @@ -3780,11 +3783,11 @@ static const struct opcode group4[] = { static const struct opcode group5[] = { F(DstMem | SrcNone | Lock, em_inc), F(DstMem | SrcNone | Lock, em_dec), - I(SrcMem | Stack, em_grp45), + I(SrcMem | NearBranch, em_call_near_abs), I(SrcMemFAddr | ImplicitOps | Stack, em_call_far), - I(SrcMem | Stack, em_grp45), - I(SrcMemFAddr | ImplicitOps, em_grp45), - I(SrcMem | Stack, em_grp45), D(Undefined), + I(SrcMem | NearBranch, em_jmp_abs), + I(SrcMemFAddr | ImplicitOps, em_jmp_far), + I(SrcMem | Stack, em_push), D(Undefined), }; static const struct opcode group6[] = { @@ -3845,8 +3848,12 @@ static const struct gprefix pfx_0f_6f_0f_7f = { I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), }; +static const struct instr_dual instr_dual_0f_2b = { + I(0, em_mov), N +}; + static const struct gprefix pfx_0f_2b = { - I(0, em_mov), I(0, em_mov), N, N, + ID(0, &instr_dual_0f_2b), ID(0, &instr_dual_0f_2b), N, N, }; static const struct gprefix pfx_0f_28_0f_29 = { @@ -3920,6 +3927,10 @@ static const struct escape escape_dd = { { N, N, N, N, N, N, N, N, } }; +static const struct instr_dual instr_dual_0f_c3 = { + I(DstMem | SrcReg | ModRM | No16 | Mov, em_mov), N +}; + static const struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ F6ALU(Lock, em_add), @@ -3964,7 +3975,7 @@ static const struct opcode opcode_table[256] = { I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */ I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */ /* 0x70 - 0x7F */ - X16(D(SrcImmByte)), + X16(D(SrcImmByte | NearBranch)), /* 0x80 - 0x87 */ G(ByteOp | DstMem | SrcImm, group1), G(DstMem | SrcImm, group1), @@ -3991,20 +4002,20 @@ static const struct opcode opcode_table[256] = { I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), I2bv(SrcSI | DstDI | Mov | String, em_mov), - F2bv(SrcSI | DstDI | String | NoWrite, em_cmp), + F2bv(SrcSI | DstDI | String | NoWrite, em_cmp_r), /* 0xA8 - 0xAF */ F2bv(DstAcc | SrcImm | NoWrite, em_test), I2bv(SrcAcc | DstDI | Mov | String, em_mov), I2bv(SrcSI | DstAcc | Mov | String, em_mov), - F2bv(SrcAcc | DstDI | String | NoWrite, em_cmp), + F2bv(SrcAcc | DstDI | String | NoWrite, em_cmp_r), /* 0xB0 - 0xB7 */ X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), /* 0xB8 - 0xBF */ X8(I(DstReg | SrcImm64 | Mov, em_mov)), /* 0xC0 - 0xC7 */ G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2), - I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), - I(ImplicitOps | Stack, em_ret), + I(ImplicitOps | NearBranch | SrcImmU16, em_ret_near_imm), + I(ImplicitOps | NearBranch, em_ret), I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg), I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), G(ByteOp, group11), G(0, group11), @@ -4024,13 +4035,14 @@ static const struct opcode opcode_table[256] = { /* 0xD8 - 0xDF */ N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N, /* 0xE0 - 0xE7 */ - X3(I(SrcImmByte, em_loop)), - I(SrcImmByte, em_jcxz), + X3(I(SrcImmByte | NearBranch, em_loop)), + I(SrcImmByte | NearBranch, em_jcxz), I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in), I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out), /* 0xE8 - 0xEF */ - I(SrcImm | Stack, em_call), D(SrcImm | ImplicitOps), - I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps), + I(SrcImm | NearBranch, em_call), D(SrcImm | ImplicitOps | NearBranch), + I(SrcImmFAddr | No64, em_jmp_far), + D(SrcImmByte | ImplicitOps | NearBranch), I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in), I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out), /* 0xF0 - 0xF7 */ @@ -4090,7 +4102,7 @@ static const struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f), /* 0x80 - 0x8F */ - X16(D(SrcImm)), + X16(D(SrcImm | NearBranch)), /* 0x90 - 0x9F */ X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ @@ -4121,7 +4133,7 @@ static const struct opcode twobyte_table[256] = { D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xC7 */ F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd), - N, D(DstMem | SrcReg | ModRM | Mov), + N, ID(0, &instr_dual_0f_c3), N, N, N, GD(0, &group9), /* 0xC8 - 0xCF */ X8(I(DstReg, em_bswap)), @@ -4134,12 +4146,20 @@ static const struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N }; +static const struct instr_dual instr_dual_0f_38_f0 = { + I(DstReg | SrcMem | Mov, em_movbe), N +}; + +static const struct instr_dual instr_dual_0f_38_f1 = { + I(DstMem | SrcReg | Mov, em_movbe), N +}; + static const struct gprefix three_byte_0f_38_f0 = { - I(DstReg | SrcMem | Mov, em_movbe), N, N, N + ID(0, &instr_dual_0f_38_f0), N, N, N }; static const struct gprefix three_byte_0f_38_f1 = { - I(DstMem | SrcReg | Mov, em_movbe), N, N, N + ID(0, &instr_dual_0f_38_f1), N, N, N }; /* @@ -4152,8 +4172,8 @@ static const struct opcode opcode_map_0f_38[256] = { /* 0x80 - 0xef */ X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), /* 0xf0 - 0xf1 */ - GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f0), - GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f1), + GP(EmulateOnUD | ModRM, &three_byte_0f_38_f0), + GP(EmulateOnUD | ModRM, &three_byte_0f_38_f1), /* 0xf2 - 0xff */ N, N, X4(N), X8(N) }; @@ -4275,7 +4295,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->type = OP_MEM; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.mem.ea = - register_address(ctxt, reg_read(ctxt, VCPU_REGS_RDI)); + register_address(ctxt, VCPU_REGS_RDI); op->addr.mem.seg = VCPU_SREG_ES; op->val = 0; op->count = 1; @@ -4329,7 +4349,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->type = OP_MEM; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.mem.ea = - register_address(ctxt, reg_read(ctxt, VCPU_REGS_RSI)); + register_address(ctxt, VCPU_REGS_RSI); op->addr.mem.seg = ctxt->seg_override; op->val = 0; op->count = 1; @@ -4338,7 +4358,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->type = OP_MEM; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.mem.ea = - register_address(ctxt, + address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RBX) + (reg_read(ctxt, VCPU_REGS_RAX) & 0xff)); op->addr.mem.seg = ctxt->seg_override; @@ -4510,8 +4530,7 @@ done_prefixes: /* vex-prefix instructions are not implemented */ if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) && - (mode == X86EMUL_MODE_PROT64 || - (mode >= X86EMUL_MODE_PROT16 && (ctxt->modrm & 0x80)))) { + (mode == X86EMUL_MODE_PROT64 || (ctxt->modrm & 0xc0) == 0xc0)) { ctxt->d = NotImpl; } @@ -4549,6 +4568,12 @@ done_prefixes: else opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7]; break; + case InstrDual: + if ((ctxt->modrm >> 6) == 3) + opcode = opcode.u.idual->mod3; + else + opcode = opcode.u.idual->mod012; + break; default: return EMULATION_FAILED; } @@ -4567,7 +4592,8 @@ done_prefixes: return EMULATION_FAILED; if (unlikely(ctxt->d & - (NotImpl|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm))) { + (NotImpl|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm|NearBranch| + No16))) { /* * These are copied unconditionally here, and checked unconditionally * in x86_emulate_insn. @@ -4578,8 +4604,12 @@ done_prefixes: if (ctxt->d & NotImpl) return EMULATION_FAILED; - if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) - ctxt->op_bytes = 8; + if (mode == X86EMUL_MODE_PROT64) { + if (ctxt->op_bytes == 4 && (ctxt->d & Stack)) + ctxt->op_bytes = 8; + else if (ctxt->d & NearBranch) + ctxt->op_bytes = 8; + } if (ctxt->d & Op3264) { if (mode == X86EMUL_MODE_PROT64) @@ -4588,6 +4618,9 @@ done_prefixes: ctxt->op_bytes = 4; } + if ((ctxt->d & No16) && ctxt->op_bytes == 2) + ctxt->op_bytes = 4; + if (ctxt->d & Sse) ctxt->op_bytes = 16; else if (ctxt->d & Mmx) @@ -4631,7 +4664,8 @@ done_prefixes: rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask); if (ctxt->rip_relative) - ctxt->memopp->addr.mem.ea += ctxt->_eip; + ctxt->memopp->addr.mem.ea = address_mask(ctxt, + ctxt->memopp->addr.mem.ea + ctxt->_eip); done: return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; @@ -4775,6 +4809,12 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } + /* Instruction can only be executed in protected mode */ + if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) { + rc = emulate_ud(ctxt); + goto done; + } + /* Privileged instruction can be executed only in CPL=0 */ if ((ctxt->d & Priv) && ops->cpl(ctxt)) { if (ctxt->d & PrivUD) @@ -4784,12 +4824,6 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - /* Instruction can only be executed in protected mode */ - if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) { - rc = emulate_ud(ctxt); - goto done; - } - /* Do instruction specific permission checks */ if (ctxt->d & CheckPerm) { rc = ctxt->check_perm(ctxt); @@ -4974,8 +5008,7 @@ writeback: count = ctxt->src.count; else count = ctxt->dst.count; - register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), - -count); + register_address_increment(ctxt, VCPU_REGS_RCX, -count); if (!string_insn_completed(ctxt)) { /* @@ -5053,11 +5086,6 @@ twobyte_insn: ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val : (s16) ctxt->src.val; break; - case 0xc3: /* movnti */ - ctxt->dst.bytes = ctxt->op_bytes; - ctxt->dst.val = (ctxt->op_bytes == 8) ? (u64) ctxt->src.val : - (u32) ctxt->src.val; - break; default: goto cannot_emulate; } diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c new file mode 100644 index 000000000000..b1947e0f3e10 --- /dev/null +++ b/arch/x86/kvm/ioapic.c @@ -0,0 +1,675 @@ +/* + * Copyright (C) 2001 MandrakeSoft S.A. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * MandrakeSoft S.A. + * 43, rue d'Aboukir + * 75002 Paris - France + * http://www.linux-mandrake.com/ + * http://www.mandrakesoft.com/ + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Yunhong Jiang <yunhong.jiang@intel.com> + * Yaozu (Eddie) Dong <eddie.dong@intel.com> + * Based on Xen 3.1 code. + */ + +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/smp.h> +#include <linux/hrtimer.h> +#include <linux/io.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <asm/processor.h> +#include <asm/page.h> +#include <asm/current.h> +#include <trace/events/kvm.h> + +#include "ioapic.h" +#include "lapic.h" +#include "irq.h" + +#if 0 +#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) +#else +#define ioapic_debug(fmt, arg...) +#endif +static int ioapic_service(struct kvm_ioapic *vioapic, int irq, + bool line_status); + +static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, + unsigned long addr, + unsigned long length) +{ + unsigned long result = 0; + + switch (ioapic->ioregsel) { + case IOAPIC_REG_VERSION: + result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16) + | (IOAPIC_VERSION_ID & 0xff)); + break; + + case IOAPIC_REG_APIC_ID: + case IOAPIC_REG_ARB_ID: + result = ((ioapic->id & 0xf) << 24); + break; + + default: + { + u32 redir_index = (ioapic->ioregsel - 0x10) >> 1; + u64 redir_content; + + if (redir_index < IOAPIC_NUM_PINS) + redir_content = + ioapic->redirtbl[redir_index].bits; + else + redir_content = ~0ULL; + + result = (ioapic->ioregsel & 0x1) ? + (redir_content >> 32) & 0xffffffff : + redir_content & 0xffffffff; + break; + } + } + + return result; +} + +static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic) +{ + ioapic->rtc_status.pending_eoi = 0; + bitmap_zero(ioapic->rtc_status.dest_map, KVM_MAX_VCPUS); +} + +static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic); + +static void rtc_status_pending_eoi_check_valid(struct kvm_ioapic *ioapic) +{ + if (WARN_ON(ioapic->rtc_status.pending_eoi < 0)) + kvm_rtc_eoi_tracking_restore_all(ioapic); +} + +static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) +{ + bool new_val, old_val; + struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; + union kvm_ioapic_redirect_entry *e; + + e = &ioapic->redirtbl[RTC_GSI]; + if (!kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id, + e->fields.dest_mode)) + return; + + new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector); + old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); + + if (new_val == old_val) + return; + + if (new_val) { + __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); + ioapic->rtc_status.pending_eoi++; + } else { + __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); + ioapic->rtc_status.pending_eoi--; + rtc_status_pending_eoi_check_valid(ioapic); + } +} + +void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) +{ + struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; + + spin_lock(&ioapic->lock); + __rtc_irq_eoi_tracking_restore_one(vcpu); + spin_unlock(&ioapic->lock); +} + +static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic) +{ + struct kvm_vcpu *vcpu; + int i; + + if (RTC_GSI >= IOAPIC_NUM_PINS) + return; + + rtc_irq_eoi_tracking_reset(ioapic); + kvm_for_each_vcpu(i, vcpu, ioapic->kvm) + __rtc_irq_eoi_tracking_restore_one(vcpu); +} + +static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu) +{ + if (test_and_clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map)) { + --ioapic->rtc_status.pending_eoi; + rtc_status_pending_eoi_check_valid(ioapic); + } +} + +static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic) +{ + if (ioapic->rtc_status.pending_eoi > 0) + return true; /* coalesced */ + + return false; +} + +static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq, + int irq_level, bool line_status) +{ + union kvm_ioapic_redirect_entry entry; + u32 mask = 1 << irq; + u32 old_irr; + int edge, ret; + + entry = ioapic->redirtbl[irq]; + edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG); + + if (!irq_level) { + ioapic->irr &= ~mask; + ret = 1; + goto out; + } + + /* + * Return 0 for coalesced interrupts; for edge-triggered interrupts, + * this only happens if a previous edge has not been delivered due + * do masking. For level interrupts, the remote_irr field tells + * us if the interrupt is waiting for an EOI. + * + * RTC is special: it is edge-triggered, but userspace likes to know + * if it has been already ack-ed via EOI because coalesced RTC + * interrupts lead to time drift in Windows guests. So we track + * EOI manually for the RTC interrupt. + */ + if (irq == RTC_GSI && line_status && + rtc_irq_check_coalesced(ioapic)) { + ret = 0; + goto out; + } + + old_irr = ioapic->irr; + ioapic->irr |= mask; + if ((edge && old_irr == ioapic->irr) || + (!edge && entry.fields.remote_irr)) { + ret = 0; + goto out; + } + + ret = ioapic_service(ioapic, irq, line_status); + +out: + trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0); + return ret; +} + +static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr) +{ + u32 idx; + + rtc_irq_eoi_tracking_reset(ioapic); + for_each_set_bit(idx, &irr, IOAPIC_NUM_PINS) + ioapic_set_irq(ioapic, idx, 1, true); + + kvm_rtc_eoi_tracking_restore_all(ioapic); +} + + +static void update_handled_vectors(struct kvm_ioapic *ioapic) +{ + DECLARE_BITMAP(handled_vectors, 256); + int i; + + memset(handled_vectors, 0, sizeof(handled_vectors)); + for (i = 0; i < IOAPIC_NUM_PINS; ++i) + __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors); + memcpy(ioapic->handled_vectors, handled_vectors, + sizeof(handled_vectors)); + smp_wmb(); +} + +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, + u32 *tmr) +{ + struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; + union kvm_ioapic_redirect_entry *e; + int index; + + spin_lock(&ioapic->lock); + for (index = 0; index < IOAPIC_NUM_PINS; index++) { + e = &ioapic->redirtbl[index]; + if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG || + kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) || + index == RTC_GSI) { + if (kvm_apic_match_dest(vcpu, NULL, 0, + e->fields.dest_id, e->fields.dest_mode)) { + __set_bit(e->fields.vector, + (unsigned long *)eoi_exit_bitmap); + if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG) + __set_bit(e->fields.vector, + (unsigned long *)tmr); + } + } + } + spin_unlock(&ioapic->lock); +} + +void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + + if (!ioapic) + return; + kvm_make_scan_ioapic_request(kvm); +} + +static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) +{ + unsigned index; + bool mask_before, mask_after; + union kvm_ioapic_redirect_entry *e; + + switch (ioapic->ioregsel) { + case IOAPIC_REG_VERSION: + /* Writes are ignored. */ + break; + + case IOAPIC_REG_APIC_ID: + ioapic->id = (val >> 24) & 0xf; + break; + + case IOAPIC_REG_ARB_ID: + break; + + default: + index = (ioapic->ioregsel - 0x10) >> 1; + + ioapic_debug("change redir index %x val %x\n", index, val); + if (index >= IOAPIC_NUM_PINS) + return; + e = &ioapic->redirtbl[index]; + mask_before = e->fields.mask; + if (ioapic->ioregsel & 1) { + e->bits &= 0xffffffff; + e->bits |= (u64) val << 32; + } else { + e->bits &= ~0xffffffffULL; + e->bits |= (u32) val; + e->fields.remote_irr = 0; + } + update_handled_vectors(ioapic); + mask_after = e->fields.mask; + if (mask_before != mask_after) + kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); + if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG + && ioapic->irr & (1 << index)) + ioapic_service(ioapic, index, false); + kvm_vcpu_request_scan_ioapic(ioapic->kvm); + break; + } +} + +static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) +{ + union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq]; + struct kvm_lapic_irq irqe; + int ret; + + if (entry->fields.mask) + return -1; + + ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " + "vector=%x trig_mode=%x\n", + entry->fields.dest_id, entry->fields.dest_mode, + entry->fields.delivery_mode, entry->fields.vector, + entry->fields.trig_mode); + + irqe.dest_id = entry->fields.dest_id; + irqe.vector = entry->fields.vector; + irqe.dest_mode = entry->fields.dest_mode; + irqe.trig_mode = entry->fields.trig_mode; + irqe.delivery_mode = entry->fields.delivery_mode << 8; + irqe.level = 1; + irqe.shorthand = 0; + + if (irqe.trig_mode == IOAPIC_EDGE_TRIG) + ioapic->irr &= ~(1 << irq); + + if (irq == RTC_GSI && line_status) { + /* + * pending_eoi cannot ever become negative (see + * rtc_status_pending_eoi_check_valid) and the caller + * ensures that it is only called if it is >= zero, namely + * if rtc_irq_check_coalesced returns false). + */ + BUG_ON(ioapic->rtc_status.pending_eoi != 0); + ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, + ioapic->rtc_status.dest_map); + ioapic->rtc_status.pending_eoi = (ret < 0 ? 0 : ret); + } else + ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL); + + if (ret && irqe.trig_mode == IOAPIC_LEVEL_TRIG) + entry->fields.remote_irr = 1; + + return ret; +} + +int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, + int level, bool line_status) +{ + int ret, irq_level; + + BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS); + + spin_lock(&ioapic->lock); + irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq], + irq_source_id, level); + ret = ioapic_set_irq(ioapic, irq, irq_level, line_status); + + spin_unlock(&ioapic->lock); + + return ret; +} + +void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id) +{ + int i; + + spin_lock(&ioapic->lock); + for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) + __clear_bit(irq_source_id, &ioapic->irq_states[i]); + spin_unlock(&ioapic->lock); +} + +static void kvm_ioapic_eoi_inject_work(struct work_struct *work) +{ + int i; + struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic, + eoi_inject.work); + spin_lock(&ioapic->lock); + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; + + if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG) + continue; + + if (ioapic->irr & (1 << i) && !ent->fields.remote_irr) + ioapic_service(ioapic, i, false); + } + spin_unlock(&ioapic->lock); +} + +#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000 + +static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, + struct kvm_ioapic *ioapic, int vector, int trigger_mode) +{ + int i; + + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; + + if (ent->fields.vector != vector) + continue; + + if (i == RTC_GSI) + rtc_irq_eoi(ioapic, vcpu); + /* + * We are dropping lock while calling ack notifiers because ack + * notifier callbacks for assigned devices call into IOAPIC + * recursively. Since remote_irr is cleared only after call + * to notifiers if the same vector will be delivered while lock + * is dropped it will be put into irr and will be delivered + * after ack notifier returns. + */ + spin_unlock(&ioapic->lock); + kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i); + spin_lock(&ioapic->lock); + + if (trigger_mode != IOAPIC_LEVEL_TRIG) + continue; + + ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); + ent->fields.remote_irr = 0; + if (!ent->fields.mask && (ioapic->irr & (1 << i))) { + ++ioapic->irq_eoi[i]; + if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) { + /* + * Real hardware does not deliver the interrupt + * immediately during eoi broadcast, and this + * lets a buggy guest make slow progress + * even if it does not correctly handle a + * level-triggered interrupt. Emulate this + * behavior if we detect an interrupt storm. + */ + schedule_delayed_work(&ioapic->eoi_inject, HZ / 100); + ioapic->irq_eoi[i] = 0; + trace_kvm_ioapic_delayed_eoi_inj(ent->bits); + } else { + ioapic_service(ioapic, i, false); + } + } else { + ioapic->irq_eoi[i] = 0; + } + } +} + +bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + smp_rmb(); + return test_bit(vector, ioapic->handled_vectors); +} + +void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode) +{ + struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; + + spin_lock(&ioapic->lock); + __kvm_ioapic_update_eoi(vcpu, ioapic, vector, trigger_mode); + spin_unlock(&ioapic->lock); +} + +static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_ioapic, dev); +} + +static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr) +{ + return ((addr >= ioapic->base_address && + (addr < ioapic->base_address + IOAPIC_MEM_LENGTH))); +} + +static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, + void *val) +{ + struct kvm_ioapic *ioapic = to_ioapic(this); + u32 result; + if (!ioapic_in_range(ioapic, addr)) + return -EOPNOTSUPP; + + ioapic_debug("addr %lx\n", (unsigned long)addr); + ASSERT(!(addr & 0xf)); /* check alignment */ + + addr &= 0xff; + spin_lock(&ioapic->lock); + switch (addr) { + case IOAPIC_REG_SELECT: + result = ioapic->ioregsel; + break; + + case IOAPIC_REG_WINDOW: + result = ioapic_read_indirect(ioapic, addr, len); + break; + + default: + result = 0; + break; + } + spin_unlock(&ioapic->lock); + + switch (len) { + case 8: + *(u64 *) val = result; + break; + case 1: + case 2: + case 4: + memcpy(val, (char *)&result, len); + break; + default: + printk(KERN_WARNING "ioapic: wrong length %d\n", len); + } + return 0; +} + +static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, + const void *val) +{ + struct kvm_ioapic *ioapic = to_ioapic(this); + u32 data; + if (!ioapic_in_range(ioapic, addr)) + return -EOPNOTSUPP; + + ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", + (void*)addr, len, val); + ASSERT(!(addr & 0xf)); /* check alignment */ + + switch (len) { + case 8: + case 4: + data = *(u32 *) val; + break; + case 2: + data = *(u16 *) val; + break; + case 1: + data = *(u8 *) val; + break; + default: + printk(KERN_WARNING "ioapic: Unsupported size %d\n", len); + return 0; + } + + addr &= 0xff; + spin_lock(&ioapic->lock); + switch (addr) { + case IOAPIC_REG_SELECT: + ioapic->ioregsel = data & 0xFF; /* 8-bit register */ + break; + + case IOAPIC_REG_WINDOW: + ioapic_write_indirect(ioapic, data); + break; + + default: + break; + } + spin_unlock(&ioapic->lock); + return 0; +} + +static void kvm_ioapic_reset(struct kvm_ioapic *ioapic) +{ + int i; + + cancel_delayed_work_sync(&ioapic->eoi_inject); + for (i = 0; i < IOAPIC_NUM_PINS; i++) + ioapic->redirtbl[i].fields.mask = 1; + ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; + ioapic->ioregsel = 0; + ioapic->irr = 0; + ioapic->id = 0; + memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS); + rtc_irq_eoi_tracking_reset(ioapic); + update_handled_vectors(ioapic); +} + +static const struct kvm_io_device_ops ioapic_mmio_ops = { + .read = ioapic_mmio_read, + .write = ioapic_mmio_write, +}; + +int kvm_ioapic_init(struct kvm *kvm) +{ + struct kvm_ioapic *ioapic; + int ret; + + ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); + if (!ioapic) + return -ENOMEM; + spin_lock_init(&ioapic->lock); + INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work); + kvm->arch.vioapic = ioapic; + kvm_ioapic_reset(ioapic); + kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); + ioapic->kvm = kvm; + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, ioapic->base_address, + IOAPIC_MEM_LENGTH, &ioapic->dev); + mutex_unlock(&kvm->slots_lock); + if (ret < 0) { + kvm->arch.vioapic = NULL; + kfree(ioapic); + } + + return ret; +} + +void kvm_ioapic_destroy(struct kvm *kvm) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + + cancel_delayed_work_sync(&ioapic->eoi_inject); + if (ioapic) { + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); + kvm->arch.vioapic = NULL; + kfree(ioapic); + } +} + +int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) +{ + struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); + if (!ioapic) + return -EINVAL; + + spin_lock(&ioapic->lock); + memcpy(state, ioapic, sizeof(struct kvm_ioapic_state)); + spin_unlock(&ioapic->lock); + return 0; +} + +int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) +{ + struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); + if (!ioapic) + return -EINVAL; + + spin_lock(&ioapic->lock); + memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); + ioapic->irr = 0; + update_handled_vectors(ioapic); + kvm_vcpu_request_scan_ioapic(kvm); + kvm_ioapic_inject_all(ioapic, state->irr); + spin_unlock(&ioapic->lock); + return 0; +} diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h new file mode 100644 index 000000000000..3c9195535ffc --- /dev/null +++ b/arch/x86/kvm/ioapic.h @@ -0,0 +1,119 @@ +#ifndef __KVM_IO_APIC_H +#define __KVM_IO_APIC_H + +#include <linux/kvm_host.h> + +#include "iodev.h" + +struct kvm; +struct kvm_vcpu; + +#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS +#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ +#define IOAPIC_EDGE_TRIG 0 +#define IOAPIC_LEVEL_TRIG 1 + +#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 +#define IOAPIC_MEM_LENGTH 0x100 + +/* Direct registers. */ +#define IOAPIC_REG_SELECT 0x00 +#define IOAPIC_REG_WINDOW 0x10 + +/* Indirect registers. */ +#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */ +#define IOAPIC_REG_VERSION 0x01 +#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */ + +/*ioapic delivery mode*/ +#define IOAPIC_FIXED 0x0 +#define IOAPIC_LOWEST_PRIORITY 0x1 +#define IOAPIC_PMI 0x2 +#define IOAPIC_NMI 0x4 +#define IOAPIC_INIT 0x5 +#define IOAPIC_EXTINT 0x7 + +#ifdef CONFIG_X86 +#define RTC_GSI 8 +#else +#define RTC_GSI -1U +#endif + +struct rtc_status { + int pending_eoi; + DECLARE_BITMAP(dest_map, KVM_MAX_VCPUS); +}; + +union kvm_ioapic_redirect_entry { + u64 bits; + struct { + u8 vector; + u8 delivery_mode:3; + u8 dest_mode:1; + u8 delivery_status:1; + u8 polarity:1; + u8 remote_irr:1; + u8 trig_mode:1; + u8 mask:1; + u8 reserve:7; + u8 reserved[4]; + u8 dest_id; + } fields; +}; + +struct kvm_ioapic { + u64 base_address; + u32 ioregsel; + u32 id; + u32 irr; + u32 pad; + union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS]; + unsigned long irq_states[IOAPIC_NUM_PINS]; + struct kvm_io_device dev; + struct kvm *kvm; + void (*ack_notifier)(void *opaque, int irq); + spinlock_t lock; + DECLARE_BITMAP(handled_vectors, 256); + struct rtc_status rtc_status; + struct delayed_work eoi_inject; + u32 irq_eoi[IOAPIC_NUM_PINS]; +}; + +#ifdef DEBUG +#define ASSERT(x) \ +do { \ + if (!(x)) { \ + printk(KERN_EMERG "assertion failed %s: %d: %s\n", \ + __FILE__, __LINE__, #x); \ + BUG(); \ + } \ +} while (0) +#else +#define ASSERT(x) do { } while (0) +#endif + +static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) +{ + return kvm->arch.vioapic; +} + +void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); +int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, + int short_hand, unsigned int dest, int dest_mode); +int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); +void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, + int trigger_mode); +bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector); +int kvm_ioapic_init(struct kvm *kvm); +void kvm_ioapic_destroy(struct kvm *kvm); +int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, + int level, bool line_status); +void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); +int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, unsigned long *dest_map); +int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); +int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, + u32 *tmr); + +#endif diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c new file mode 100644 index 000000000000..17b73eeac8a4 --- /dev/null +++ b/arch/x86/kvm/iommu.c @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) 2006-2008 Intel Corporation + * Copyright IBM Corporation, 2008 + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * Author: Allen M. Kay <allen.m.kay@intel.com> + * Author: Weidong Han <weidong.han@intel.com> + * Author: Ben-Ami Yassour <benami@il.ibm.com> + */ + +#include <linux/list.h> +#include <linux/kvm_host.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/stat.h> +#include <linux/dmar.h> +#include <linux/iommu.h> +#include <linux/intel-iommu.h> +#include "assigned-dev.h" + +static bool allow_unsafe_assigned_interrupts; +module_param_named(allow_unsafe_assigned_interrupts, + allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(allow_unsafe_assigned_interrupts, + "Enable device assignment on platforms without interrupt remapping support."); + +static int kvm_iommu_unmap_memslots(struct kvm *kvm); +static void kvm_iommu_put_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages); + +static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, + unsigned long npages) +{ + gfn_t end_gfn; + pfn_t pfn; + + pfn = gfn_to_pfn_memslot(slot, gfn); + end_gfn = gfn + npages; + gfn += 1; + + if (is_error_noslot_pfn(pfn)) + return pfn; + + while (gfn < end_gfn) + gfn_to_pfn_memslot(slot, gfn++); + + return pfn; +} + +static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages) +{ + unsigned long i; + + for (i = 0; i < npages; ++i) + kvm_release_pfn_clean(pfn + i); +} + +int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + gfn_t gfn, end_gfn; + pfn_t pfn; + int r = 0; + struct iommu_domain *domain = kvm->arch.iommu_domain; + int flags; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + gfn = slot->base_gfn; + end_gfn = gfn + slot->npages; + + flags = IOMMU_READ; + if (!(slot->flags & KVM_MEM_READONLY)) + flags |= IOMMU_WRITE; + if (!kvm->arch.iommu_noncoherent) + flags |= IOMMU_CACHE; + + + while (gfn < end_gfn) { + unsigned long page_size; + + /* Check if already mapped */ + if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) { + gfn += 1; + continue; + } + + /* Get the page size we could use to map */ + page_size = kvm_host_page_size(kvm, gfn); + + /* Make sure the page_size does not exceed the memslot */ + while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn) + page_size >>= 1; + + /* Make sure gfn is aligned to the page size we want to map */ + while ((gfn << PAGE_SHIFT) & (page_size - 1)) + page_size >>= 1; + + /* Make sure hva is aligned to the page size we want to map */ + while (__gfn_to_hva_memslot(slot, gfn) & (page_size - 1)) + page_size >>= 1; + + /* + * Pin all pages we are about to map in memory. This is + * important because we unmap and unpin in 4kb steps later. + */ + pfn = kvm_pin_pages(slot, gfn, page_size >> PAGE_SHIFT); + if (is_error_noslot_pfn(pfn)) { + gfn += 1; + continue; + } + + /* Map into IO address space */ + r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn), + page_size, flags); + if (r) { + printk(KERN_ERR "kvm_iommu_map_address:" + "iommu failed to map pfn=%llx\n", pfn); + kvm_unpin_pages(kvm, pfn, page_size >> PAGE_SHIFT); + goto unmap_pages; + } + + gfn += page_size >> PAGE_SHIFT; + + + } + + return 0; + +unmap_pages: + kvm_iommu_put_pages(kvm, slot->base_gfn, gfn - slot->base_gfn); + return r; +} + +static int kvm_iommu_map_memslots(struct kvm *kvm) +{ + int idx, r = 0; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + + if (kvm->arch.iommu_noncoherent) + kvm_arch_register_noncoherent_dma(kvm); + + idx = srcu_read_lock(&kvm->srcu); + slots = kvm_memslots(kvm); + + kvm_for_each_memslot(memslot, slots) { + r = kvm_iommu_map_pages(kvm, memslot); + if (r) + break; + } + srcu_read_unlock(&kvm->srcu, idx); + + return r; +} + +int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev) +{ + struct iommu_domain *domain = kvm->arch.iommu_domain; + int r; + bool noncoherent; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + if (pdev == NULL) + return -ENODEV; + + r = iommu_attach_device(domain, &pdev->dev); + if (r) { + dev_err(&pdev->dev, "kvm assign device failed ret %d", r); + return r; + } + + noncoherent = !iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY); + + /* Check if need to update IOMMU page table for guest memory */ + if (noncoherent != kvm->arch.iommu_noncoherent) { + kvm_iommu_unmap_memslots(kvm); + kvm->arch.iommu_noncoherent = noncoherent; + r = kvm_iommu_map_memslots(kvm); + if (r) + goto out_unmap; + } + + pci_set_dev_assigned(pdev); + + dev_info(&pdev->dev, "kvm assign device\n"); + + return 0; +out_unmap: + kvm_iommu_unmap_memslots(kvm); + return r; +} + +int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev) +{ + struct iommu_domain *domain = kvm->arch.iommu_domain; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + if (pdev == NULL) + return -ENODEV; + + iommu_detach_device(domain, &pdev->dev); + + pci_clear_dev_assigned(pdev); + + dev_info(&pdev->dev, "kvm deassign device\n"); + + return 0; +} + +int kvm_iommu_map_guest(struct kvm *kvm) +{ + int r; + + if (!iommu_present(&pci_bus_type)) { + printk(KERN_ERR "%s: iommu not found\n", __func__); + return -ENODEV; + } + + mutex_lock(&kvm->slots_lock); + + kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type); + if (!kvm->arch.iommu_domain) { + r = -ENOMEM; + goto out_unlock; + } + + if (!allow_unsafe_assigned_interrupts && + !iommu_capable(&pci_bus_type, IOMMU_CAP_INTR_REMAP)) { + printk(KERN_WARNING "%s: No interrupt remapping support," + " disallowing device assignment." + " Re-enble with \"allow_unsafe_assigned_interrupts=1\"" + " module option.\n", __func__); + iommu_domain_free(kvm->arch.iommu_domain); + kvm->arch.iommu_domain = NULL; + r = -EPERM; + goto out_unlock; + } + + r = kvm_iommu_map_memslots(kvm); + if (r) + kvm_iommu_unmap_memslots(kvm); + +out_unlock: + mutex_unlock(&kvm->slots_lock); + return r; +} + +static void kvm_iommu_put_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages) +{ + struct iommu_domain *domain; + gfn_t end_gfn, gfn; + pfn_t pfn; + u64 phys; + + domain = kvm->arch.iommu_domain; + end_gfn = base_gfn + npages; + gfn = base_gfn; + + /* check if iommu exists and in use */ + if (!domain) + return; + + while (gfn < end_gfn) { + unsigned long unmap_pages; + size_t size; + + /* Get physical address */ + phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); + + if (!phys) { + gfn++; + continue; + } + + pfn = phys >> PAGE_SHIFT; + + /* Unmap address from IO address space */ + size = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE); + unmap_pages = 1ULL << get_order(size); + + /* Unpin all pages we just unmapped to not leak any memory */ + kvm_unpin_pages(kvm, pfn, unmap_pages); + + gfn += unmap_pages; + } +} + +void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages); +} + +static int kvm_iommu_unmap_memslots(struct kvm *kvm) +{ + int idx; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + + idx = srcu_read_lock(&kvm->srcu); + slots = kvm_memslots(kvm); + + kvm_for_each_memslot(memslot, slots) + kvm_iommu_unmap_pages(kvm, memslot); + + srcu_read_unlock(&kvm->srcu, idx); + + if (kvm->arch.iommu_noncoherent) + kvm_arch_unregister_noncoherent_dma(kvm); + + return 0; +} + +int kvm_iommu_unmap_guest(struct kvm *kvm) +{ + struct iommu_domain *domain = kvm->arch.iommu_domain; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + mutex_lock(&kvm->slots_lock); + kvm_iommu_unmap_memslots(kvm); + kvm->arch.iommu_domain = NULL; + kvm->arch.iommu_noncoherent = false; + mutex_unlock(&kvm->slots_lock); + + iommu_domain_free(domain); + return 0; +} diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c new file mode 100644 index 000000000000..72298b3ac025 --- /dev/null +++ b/arch/x86/kvm/irq_comm.c @@ -0,0 +1,332 @@ +/* + * irq_comm.c: Common API for in kernel interrupt controller + * Copyright (c) 2007, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * Authors: + * Yaozu (Eddie) Dong <Eddie.dong@intel.com> + * + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + */ + +#include <linux/kvm_host.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <trace/events/kvm.h> + +#include <asm/msidef.h> + +#include "irq.h" + +#include "ioapic.h" + +static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) +{ + struct kvm_pic *pic = pic_irqchip(kvm); + return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); +} + +static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level, + line_status); +} + +inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) +{ + return irq->delivery_mode == APIC_DM_LOWEST; +} + +int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, unsigned long *dest_map) +{ + int i, r = -1; + struct kvm_vcpu *vcpu, *lowest = NULL; + + if (irq->dest_mode == 0 && irq->dest_id == 0xff && + kvm_is_dm_lowest_prio(irq)) { + printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); + irq->delivery_mode = APIC_DM_FIXED; + } + + if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) + return r; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + + if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, + irq->dest_id, irq->dest_mode)) + continue; + + if (!kvm_is_dm_lowest_prio(irq)) { + if (r < 0) + r = 0; + r += kvm_apic_set_irq(vcpu, irq, dest_map); + } else if (kvm_lapic_enabled(vcpu)) { + if (!lowest) + lowest = vcpu; + else if (kvm_apic_compare_prio(vcpu, lowest) < 0) + lowest = vcpu; + } + } + + if (lowest) + r = kvm_apic_set_irq(lowest, irq, dest_map); + + return r; +} + +static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm_lapic_irq *irq) +{ + trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); + + irq->dest_id = (e->msi.address_lo & + MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; + irq->vector = (e->msi.data & + MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; + irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; + irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; + irq->delivery_mode = e->msi.data & 0x700; + irq->level = 1; + irq->shorthand = 0; + /* TODO Deal with RH bit of MSI message address */ +} + +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, bool line_status) +{ + struct kvm_lapic_irq irq; + + if (!level) + return -1; + + kvm_set_msi_irq(e, &irq); + + return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL); +} + + +static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm) +{ + struct kvm_lapic_irq irq; + int r; + + kvm_set_msi_irq(e, &irq); + + if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) + return r; + else + return -EWOULDBLOCK; +} + +/* + * Deliver an IRQ in an atomic context if we can, or return a failure, + * user can retry in a process context. + * Return value: + * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. + * Other values - No need to retry. + */ +int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) +{ + struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; + struct kvm_kernel_irq_routing_entry *e; + int ret = -EINVAL; + int idx; + + trace_kvm_set_irq(irq, level, irq_source_id); + + /* + * Injection into either PIC or IOAPIC might need to scan all CPUs, + * which would need to be retried from thread context; when same GSI + * is connected to both PIC and IOAPIC, we'd have to report a + * partial failure here. + * Since there's no easy way to do this, we only support injecting MSI + * which is limited to 1:1 GSI mapping. + */ + idx = srcu_read_lock(&kvm->irq_srcu); + if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { + e = &entries[0]; + if (likely(e->type == KVM_IRQ_ROUTING_MSI)) + ret = kvm_set_msi_inatomic(e, kvm); + else + ret = -EWOULDBLOCK; + } + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} + +int kvm_request_irq_source_id(struct kvm *kvm) +{ + unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; + int irq_source_id; + + mutex_lock(&kvm->irq_lock); + irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG); + + if (irq_source_id >= BITS_PER_LONG) { + printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n"); + irq_source_id = -EFAULT; + goto unlock; + } + + ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); + ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); + set_bit(irq_source_id, bitmap); +unlock: + mutex_unlock(&kvm->irq_lock); + + return irq_source_id; +} + +void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) +{ + ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); + ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); + + mutex_lock(&kvm->irq_lock); + if (irq_source_id < 0 || + irq_source_id >= BITS_PER_LONG) { + printk(KERN_ERR "kvm: IRQ source ID out of range!\n"); + goto unlock; + } + clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); + if (!irqchip_in_kernel(kvm)) + goto unlock; + + kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); + kvm_pic_clear_all(pic_irqchip(kvm), irq_source_id); +unlock: + mutex_unlock(&kvm->irq_lock); +} + +void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn) +{ + mutex_lock(&kvm->irq_lock); + kimn->irq = irq; + hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list); + mutex_unlock(&kvm->irq_lock); +} + +void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn) +{ + mutex_lock(&kvm->irq_lock); + hlist_del_rcu(&kimn->link); + mutex_unlock(&kvm->irq_lock); + synchronize_srcu(&kvm->irq_srcu); +} + +void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, + bool mask) +{ + struct kvm_irq_mask_notifier *kimn; + int idx, gsi; + + idx = srcu_read_lock(&kvm->irq_srcu); + gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); + if (gsi != -1) + hlist_for_each_entry_rcu(kimn, &kvm->arch.mask_notifier_list, link) + if (kimn->irq == gsi) + kimn->func(kimn, mask); + srcu_read_unlock(&kvm->irq_srcu, idx); +} + +int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) +{ + int r = -EINVAL; + int delta; + unsigned max_pin; + + switch (ue->type) { + case KVM_IRQ_ROUTING_IRQCHIP: + delta = 0; + switch (ue->u.irqchip.irqchip) { + case KVM_IRQCHIP_PIC_MASTER: + e->set = kvm_set_pic_irq; + max_pin = PIC_NUM_PINS; + break; + case KVM_IRQCHIP_PIC_SLAVE: + e->set = kvm_set_pic_irq; + max_pin = PIC_NUM_PINS; + delta = 8; + break; + case KVM_IRQCHIP_IOAPIC: + max_pin = KVM_IOAPIC_NUM_PINS; + e->set = kvm_set_ioapic_irq; + break; + default: + goto out; + } + e->irqchip.irqchip = ue->u.irqchip.irqchip; + e->irqchip.pin = ue->u.irqchip.pin + delta; + if (e->irqchip.pin >= max_pin) + goto out; + break; + case KVM_IRQ_ROUTING_MSI: + e->set = kvm_set_msi; + e->msi.address_lo = ue->u.msi.address_lo; + e->msi.address_hi = ue->u.msi.address_hi; + e->msi.data = ue->u.msi.data; + break; + default: + goto out; + } + + r = 0; +out: + return r; +} + +#define IOAPIC_ROUTING_ENTRY(irq) \ + { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ + .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } } +#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq) + +#define PIC_ROUTING_ENTRY(irq) \ + { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ + .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } } +#define ROUTING_ENTRY2(irq) \ + IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq) + +static const struct kvm_irq_routing_entry default_routing[] = { + ROUTING_ENTRY2(0), ROUTING_ENTRY2(1), + ROUTING_ENTRY2(2), ROUTING_ENTRY2(3), + ROUTING_ENTRY2(4), ROUTING_ENTRY2(5), + ROUTING_ENTRY2(6), ROUTING_ENTRY2(7), + ROUTING_ENTRY2(8), ROUTING_ENTRY2(9), + ROUTING_ENTRY2(10), ROUTING_ENTRY2(11), + ROUTING_ENTRY2(12), ROUTING_ENTRY2(13), + ROUTING_ENTRY2(14), ROUTING_ENTRY2(15), + ROUTING_ENTRY1(16), ROUTING_ENTRY1(17), + ROUTING_ENTRY1(18), ROUTING_ENTRY1(19), + ROUTING_ENTRY1(20), ROUTING_ENTRY1(21), + ROUTING_ENTRY1(22), ROUTING_ENTRY1(23), +}; + +int kvm_setup_default_irq_routing(struct kvm *kvm) +{ + return kvm_set_irq_routing(kvm, default_routing, + ARRAY_SIZE(default_routing), 0); +} diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index b8345dd41b25..4f0c0b954686 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -68,6 +68,9 @@ #define MAX_APIC_VECTOR 256 #define APIC_VECTORS_PER_REG 32 +#define APIC_BROADCAST 0xFF +#define X2APIC_BROADCAST 0xFFFFFFFFul + #define VEC_POS(v) ((v) & (32 - 1)) #define REG_POS(v) (((v) >> 5) << 4) @@ -129,8 +132,6 @@ static inline int kvm_apic_id(struct kvm_lapic *apic) return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; } -#define KVM_X2APIC_CID_BITS 0 - static void recalculate_apic_map(struct kvm *kvm) { struct kvm_apic_map *new, *old = NULL; @@ -149,42 +150,56 @@ static void recalculate_apic_map(struct kvm *kvm) new->cid_shift = 8; new->cid_mask = 0; new->lid_mask = 0xff; + new->broadcast = APIC_BROADCAST; kvm_for_each_vcpu(i, vcpu, kvm) { struct kvm_lapic *apic = vcpu->arch.apic; - u16 cid, lid; - u32 ldr; if (!kvm_apic_present(vcpu)) continue; + if (apic_x2apic_mode(apic)) { + new->ldr_bits = 32; + new->cid_shift = 16; + new->cid_mask = new->lid_mask = 0xffff; + new->broadcast = X2APIC_BROADCAST; + } else if (kvm_apic_get_reg(apic, APIC_LDR)) { + if (kvm_apic_get_reg(apic, APIC_DFR) == + APIC_DFR_CLUSTER) { + new->cid_shift = 4; + new->cid_mask = 0xf; + new->lid_mask = 0xf; + } else { + new->cid_shift = 8; + new->cid_mask = 0; + new->lid_mask = 0xff; + } + } + /* * All APICs have to be configured in the same mode by an OS. * We take advatage of this while building logical id loockup - * table. After reset APICs are in xapic/flat mode, so if we - * find apic with different setting we assume this is the mode + * table. After reset APICs are in software disabled mode, so if + * we find apic with different setting we assume this is the mode * OS wants all apics to be in; build lookup table accordingly. */ - if (apic_x2apic_mode(apic)) { - new->ldr_bits = 32; - new->cid_shift = 16; - new->cid_mask = (1 << KVM_X2APIC_CID_BITS) - 1; - new->lid_mask = 0xffff; - } else if (kvm_apic_sw_enabled(apic) && - !new->cid_mask /* flat mode */ && - kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_CLUSTER) { - new->cid_shift = 4; - new->cid_mask = 0xf; - new->lid_mask = 0xf; - } + if (kvm_apic_sw_enabled(apic)) + break; + } - new->phys_map[kvm_apic_id(apic)] = apic; + kvm_for_each_vcpu(i, vcpu, kvm) { + struct kvm_lapic *apic = vcpu->arch.apic; + u16 cid, lid; + u32 ldr, aid; + aid = kvm_apic_id(apic); ldr = kvm_apic_get_reg(apic, APIC_LDR); cid = apic_cluster_id(new, ldr); lid = apic_logical_id(new, ldr); - if (lid) + if (aid < ARRAY_SIZE(new->phys_map)) + new->phys_map[aid] = apic; + if (lid && cid < ARRAY_SIZE(new->logical_map)) new->logical_map[cid][ffs(lid) - 1] = apic; } out: @@ -201,11 +216,13 @@ out: static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) { - u32 prev = kvm_apic_get_reg(apic, APIC_SPIV); + bool enabled = val & APIC_SPIV_APIC_ENABLED; apic_set_reg(apic, APIC_SPIV, val); - if ((prev ^ val) & APIC_SPIV_APIC_ENABLED) { - if (val & APIC_SPIV_APIC_ENABLED) { + + if (enabled != apic->sw_enabled) { + apic->sw_enabled = enabled; + if (enabled) { static_key_slow_dec_deferred(&apic_sw_disabled); recalculate_apic_map(apic->vcpu->kvm); } else @@ -237,21 +254,17 @@ static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) static inline int apic_lvtt_oneshot(struct kvm_lapic *apic) { - return ((kvm_apic_get_reg(apic, APIC_LVTT) & - apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT); + return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_ONESHOT; } static inline int apic_lvtt_period(struct kvm_lapic *apic) { - return ((kvm_apic_get_reg(apic, APIC_LVTT) & - apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC); + return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_PERIODIC; } static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic) { - return ((kvm_apic_get_reg(apic, APIC_LVTT) & - apic->lapic_timer.timer_mode_mask) == - APIC_LVT_TIMER_TSCDEADLINE); + return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_TSCDEADLINE; } static inline int apic_lvt_nmi_mode(u32 lvt_val) @@ -326,8 +339,12 @@ EXPORT_SYMBOL_GPL(kvm_apic_update_irr); static inline void apic_set_irr(int vec, struct kvm_lapic *apic) { - apic->irr_pending = true; apic_set_vector(vec, apic->regs + APIC_IRR); + /* + * irr_pending must be true if any interrupt is pending; set it after + * APIC_IRR to avoid race with apic_clear_irr + */ + apic->irr_pending = true; } static inline int apic_search_irr(struct kvm_lapic *apic) @@ -359,13 +376,15 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) vcpu = apic->vcpu; - apic_clear_vector(vec, apic->regs + APIC_IRR); - if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) + if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) { /* try to update RVI */ + apic_clear_vector(vec, apic->regs + APIC_IRR); kvm_make_request(KVM_REQ_EVENT, vcpu); - else { - vec = apic_search_irr(apic); - apic->irr_pending = (vec != -1); + } else { + apic->irr_pending = false; + apic_clear_vector(vec, apic->regs + APIC_IRR); + if (apic_search_irr(apic) != -1) + apic->irr_pending = true; } } @@ -558,16 +577,25 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) apic_update_ppr(apic); } -int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) +static int kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest) +{ + return dest == (apic_x2apic_mode(apic) ? + X2APIC_BROADCAST : APIC_BROADCAST); +} + +int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest) { - return dest == 0xff || kvm_apic_id(apic) == dest; + return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest); } -int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) +int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) { int result = 0; u32 logical_id; + if (kvm_apic_broadcast(apic, mda)) + return 1; + if (apic_x2apic_mode(apic)) { logical_id = kvm_apic_get_reg(apic, APIC_LDR); return logical_id & mda; @@ -595,7 +623,7 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) } int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, - int short_hand, int dest, int dest_mode) + int short_hand, unsigned int dest, int dest_mode) { int result = 0; struct kvm_lapic *target = vcpu->arch.apic; @@ -657,15 +685,24 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, if (!map) goto out; + if (irq->dest_id == map->broadcast) + goto out; + + ret = true; + if (irq->dest_mode == 0) { /* physical mode */ - if (irq->delivery_mode == APIC_DM_LOWEST || - irq->dest_id == 0xff) + if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) goto out; - dst = &map->phys_map[irq->dest_id & 0xff]; + + dst = &map->phys_map[irq->dest_id]; } else { u32 mda = irq->dest_id << (32 - map->ldr_bits); + u16 cid = apic_cluster_id(map, mda); + + if (cid >= ARRAY_SIZE(map->logical_map)) + goto out; - dst = map->logical_map[apic_cluster_id(map, mda)]; + dst = map->logical_map[cid]; bitmap = apic_logical_id(map, mda); @@ -691,8 +728,6 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, *r = 0; *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); } - - ret = true; out: rcu_read_unlock(); return ret; @@ -1034,6 +1069,26 @@ static void update_divide_count(struct kvm_lapic *apic) apic->divide_count); } +static void apic_timer_expired(struct kvm_lapic *apic) +{ + struct kvm_vcpu *vcpu = apic->vcpu; + wait_queue_head_t *q = &vcpu->wq; + + /* + * Note: KVM_REQ_PENDING_TIMER is implicitly checked in + * vcpu_enter_guest. + */ + if (atomic_read(&apic->lapic_timer.pending)) + return; + + atomic_inc(&apic->lapic_timer.pending); + /* FIXME: this code should not know anything about vcpus */ + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + + if (waitqueue_active(q)) + wake_up_interruptible(q); +} + static void start_apic_timer(struct kvm_lapic *apic) { ktime_t now; @@ -1096,9 +1151,10 @@ static void start_apic_timer(struct kvm_lapic *apic) if (likely(tscdeadline > guest_tsc)) { ns = (tscdeadline - guest_tsc) * 1000000ULL; do_div(ns, this_tsc_khz); - } - hrtimer_start(&apic->lapic_timer.timer, - ktime_add_ns(now, ns), HRTIMER_MODE_ABS); + hrtimer_start(&apic->lapic_timer.timer, + ktime_add_ns(now, ns), HRTIMER_MODE_ABS); + } else + apic_timer_expired(apic); local_irq_restore(flags); } @@ -1203,17 +1259,20 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; - case APIC_LVTT: - if ((kvm_apic_get_reg(apic, APIC_LVTT) & - apic->lapic_timer.timer_mode_mask) != - (val & apic->lapic_timer.timer_mode_mask)) + case APIC_LVTT: { + u32 timer_mode = val & apic->lapic_timer.timer_mode_mask; + + if (apic->lapic_timer.timer_mode != timer_mode) { + apic->lapic_timer.timer_mode = timer_mode; hrtimer_cancel(&apic->lapic_timer.timer); + } if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask); apic_set_reg(apic, APIC_LVTT, val); break; + } case APIC_TMICT: if (apic_lvtt_tscdeadline(apic)) @@ -1320,7 +1379,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE)) static_key_slow_dec_deferred(&apic_hw_disabled); - if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED)) + if (!apic->sw_enabled) static_key_slow_dec_deferred(&apic_sw_disabled); if (apic->regs) @@ -1355,9 +1414,6 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) return; hrtimer_cancel(&apic->lapic_timer.timer); - /* Inject here so clearing tscdeadline won't override new value */ - if (apic_has_pending_timer(vcpu)) - kvm_inject_apic_timer_irqs(vcpu); apic->lapic_timer.tscdeadline = data; start_apic_timer(apic); } @@ -1422,6 +1478,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; + if ((value & MSR_IA32_APICBASE_ENABLE) && + apic->base_address != APIC_DEFAULT_PHYS_BASE) + pr_warn_once("APIC base relocation is unsupported by KVM"); + /* with FSB delivery interrupt, we can restart APIC functionality */ apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is " "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address); @@ -1447,6 +1507,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) for (i = 0; i < APIC_LVT_NUM; i++) apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); + apic->lapic_timer.timer_mode = 0; apic_set_reg(apic, APIC_LVT0, SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); @@ -1538,23 +1599,8 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) { struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer); - struct kvm_vcpu *vcpu = apic->vcpu; - wait_queue_head_t *q = &vcpu->wq; - - /* - * There is a race window between reading and incrementing, but we do - * not care about potentially losing timer events in the !reinject - * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked - * in vcpu_enter_guest. - */ - if (!atomic_read(&ktimer->pending)) { - atomic_inc(&ktimer->pending); - /* FIXME: this code should not know anything about vcpus */ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); - } - if (waitqueue_active(q)) - wake_up_interruptible(q); + apic_timer_expired(apic); if (lapic_is_periodic(apic)) { hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); @@ -1693,6 +1739,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm) ? 1 : count_vectors(apic->regs + APIC_ISR); apic->highest_isr_cache = -1; + if (kvm_x86_ops->hwapic_irr_update) + kvm_x86_ops->hwapic_irr_update(vcpu, + apic_find_highest_irr(apic)); kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_rtc_eoi_tracking_restore_one(vcpu); @@ -1837,8 +1886,11 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) return 1; + if (reg == APIC_ICR2) + return 1; + /* if this is ICR write vector before command */ - if (msr == 0x830) + if (reg == APIC_ICR) apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); return apic_reg_write(apic, reg, (u32)data); } @@ -1851,9 +1903,15 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) return 1; + if (reg == APIC_DFR || reg == APIC_ICR2) { + apic_debug("KVM_APIC_READ: read x2apic reserved register %x\n", + reg); + return 1; + } + if (apic_reg_read(apic, reg, 4, &low)) return 1; - if (msr == 0x830) + if (reg == APIC_ICR) apic_reg_read(apic, APIC_ICR2, 4, &high); *data = (((u64)high) << 32) | low; @@ -1908,7 +1966,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) void kvm_apic_accept_events(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - unsigned int sipi_vector; + u8 sipi_vector; unsigned long pe; if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 6a11845fd8b9..c674fce53cf9 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -11,6 +11,7 @@ struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ + u32 timer_mode; u32 timer_mode_mask; u64 tscdeadline; atomic_t pending; /* accumulated triggered timers */ @@ -22,6 +23,7 @@ struct kvm_lapic { struct kvm_timer lapic_timer; u32 divide_count; struct kvm_vcpu *vcpu; + bool sw_enabled; bool irr_pending; /* Number of bits set in ISR. */ s16 isr_count; @@ -55,8 +57,8 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu); void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); -int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); -int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); +int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest); +int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, unsigned long *dest_map); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); @@ -119,11 +121,11 @@ static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic) extern struct static_key_deferred apic_sw_disabled; -static inline int kvm_apic_sw_enabled(struct kvm_lapic *apic) +static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic) { if (static_key_false(&apic_sw_disabled.key)) - return kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; - return APIC_SPIV_APIC_ENABLED; + return apic->sw_enabled; + return true; } static inline bool kvm_apic_present(struct kvm_vcpu *vcpu) @@ -152,8 +154,6 @@ static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr) ldr >>= 32 - map->ldr_bits; cid = (ldr >> map->cid_shift) & map->cid_mask; - BUG_ON(cid >= ARRAY_SIZE(map->logical_map)); - return cid; } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 978f402006ee..10fbed126b11 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -214,13 +214,12 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); #define MMIO_GEN_LOW_SHIFT 10 #define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2) #define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1) -#define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1) static u64 generation_mmio_spte_mask(unsigned int gen) { u64 mask; - WARN_ON(gen > MMIO_MAX_GEN); + WARN_ON(gen & ~MMIO_GEN_MASK); mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT; mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT; @@ -263,13 +262,13 @@ static bool is_mmio_spte(u64 spte) static gfn_t get_mmio_spte_gfn(u64 spte) { - u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask; + u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask; return (spte & ~mask) >> PAGE_SHIFT; } static unsigned get_mmio_spte_access(u64 spte) { - u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask; + u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask; return (spte & ~mask) & ~PAGE_MASK; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7527cefc5a43..41dd0387cccb 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1056,9 +1056,11 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho { struct vcpu_svm *svm = to_svm(vcpu); - WARN_ON(adjustment < 0); - if (host) - adjustment = svm_scale_tsc(vcpu, adjustment); + if (host) { + if (svm->tsc_ratio != TSC_RATIO_DEFAULT) + WARN_ON(adjustment < 0); + adjustment = svm_scale_tsc(vcpu, (u64)adjustment); + } svm->vmcb->control.tsc_offset += adjustment; if (is_guest_mode(vcpu)) @@ -2999,7 +3001,6 @@ static int dr_interception(struct vcpu_svm *svm) { int reg, dr; unsigned long val; - int err; if (svm->vcpu.guest_debug == 0) { /* @@ -3019,12 +3020,15 @@ static int dr_interception(struct vcpu_svm *svm) dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; if (dr >= 16) { /* mov to DRn */ + if (!kvm_require_dr(&svm->vcpu, dr - 16)) + return 1; val = kvm_register_read(&svm->vcpu, reg); kvm_set_dr(&svm->vcpu, dr - 16, val); } else { - err = kvm_get_dr(&svm->vcpu, dr, &val); - if (!err) - kvm_register_write(&svm->vcpu, reg, val); + if (!kvm_require_dr(&svm->vcpu, dr)) + return 1; + kvm_get_dr(&svm->vcpu, dr, &val); + kvm_register_write(&svm->vcpu, reg, val); } skip_emulated_instruction(&svm->vcpu); @@ -4123,6 +4127,11 @@ static bool svm_mpx_supported(void) return false; } +static bool svm_xsaves_supported(void) +{ + return false; +} + static bool svm_has_wbinvd_exit(void) { return true; @@ -4410,6 +4419,7 @@ static struct kvm_x86_ops svm_x86_ops = { .rdtscp_supported = svm_rdtscp_supported, .invpcid_supported = svm_invpcid_supported, .mpx_supported = svm_mpx_supported, + .xsaves_supported = svm_xsaves_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 6b06ab8748dd..c2a34bb5ad93 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -5,6 +5,7 @@ #include <asm/vmx.h> #include <asm/svm.h> #include <asm/clocksource.h> +#include <asm/pvclock-abi.h> #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm @@ -877,6 +878,42 @@ TRACE_EVENT(kvm_ple_window, #define trace_kvm_ple_window_shrink(vcpu_id, new, old) \ trace_kvm_ple_window(false, vcpu_id, new, old) +TRACE_EVENT(kvm_pvclock_update, + TP_PROTO(unsigned int vcpu_id, struct pvclock_vcpu_time_info *pvclock), + TP_ARGS(vcpu_id, pvclock), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( __u32, version ) + __field( __u64, tsc_timestamp ) + __field( __u64, system_time ) + __field( __u32, tsc_to_system_mul ) + __field( __s8, tsc_shift ) + __field( __u8, flags ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->version = pvclock->version; + __entry->tsc_timestamp = pvclock->tsc_timestamp; + __entry->system_time = pvclock->system_time; + __entry->tsc_to_system_mul = pvclock->tsc_to_system_mul; + __entry->tsc_shift = pvclock->tsc_shift; + __entry->flags = pvclock->flags; + ), + + TP_printk("vcpu_id %u, pvclock { version %u, tsc_timestamp 0x%llx, " + "system_time 0x%llx, tsc_to_system_mul 0x%x, tsc_shift %d, " + "flags 0x%x }", + __entry->vcpu_id, + __entry->version, + __entry->tsc_timestamp, + __entry->system_time, + __entry->tsc_to_system_mul, + __entry->tsc_shift, + __entry->flags) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3e556c68351b..feb852b04598 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -99,13 +99,15 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); static bool __read_mostly nested = 0; module_param(nested, bool, S_IRUGO); +static u64 __read_mostly host_xss; + #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) #define KVM_VM_CR0_ALWAYS_ON \ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_CR4_GUEST_OWNED_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT) + | X86_CR4_OSXMMEXCPT | X86_CR4_TSD) #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) @@ -214,6 +216,7 @@ struct __packed vmcs12 { u64 virtual_apic_page_addr; u64 apic_access_addr; u64 ept_pointer; + u64 xss_exit_bitmap; u64 guest_physical_address; u64 vmcs_link_pointer; u64 guest_ia32_debugctl; @@ -616,6 +619,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), FIELD64(APIC_ACCESS_ADDR, apic_access_addr), FIELD64(EPT_POINTER, ept_pointer), + FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), @@ -720,12 +724,15 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD(HOST_RSP, host_rsp), FIELD(HOST_RIP, host_rip), }; -static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table); static inline short vmcs_field_to_offset(unsigned long field) { - if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0) - return -1; + BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); + + if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) || + vmcs_field_to_offset_table[field] == 0) + return -ENOENT; + return vmcs_field_to_offset_table[field]; } @@ -758,6 +765,7 @@ static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static bool vmx_mpx_supported(void); +static bool vmx_xsaves_supported(void); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -1098,6 +1106,12 @@ static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); } +static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES) && + vmx_xsaves_supported(); +} + static inline bool is_exception(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -1659,12 +1673,20 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) vmx->guest_msrs[efer_offset].mask = ~ignore_bits; clear_atomic_switch_msr(vmx, MSR_EFER); - /* On ept, can't emulate nx, and must switch nx atomically */ - if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) { + + /* + * On EPT, we can't emulate NX, so we must switch EFER atomically. + * On CPUs that support "load IA32_EFER", always switch EFER + * atomically, since it's faster than switching it manually. + */ + if (cpu_has_load_ia32_efer || + (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { guest_efer = vmx->vcpu.arch.efer; if (!(guest_efer & EFER_LMA)) guest_efer &= ~EFER_LME; - add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer); + if (guest_efer != host_efer) + add_atomic_switch_msr(vmx, MSR_EFER, + guest_efer, host_efer); return false; } @@ -2377,12 +2399,13 @@ static __init void nested_vmx_setup_ctls_msrs(void) nested_vmx_secondary_ctls_low = 0; nested_vmx_secondary_ctls_high &= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_UNRESTRICTED_GUEST | - SECONDARY_EXEC_WBINVD_EXITING; + SECONDARY_EXEC_WBINVD_EXITING | + SECONDARY_EXEC_XSAVES; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ - nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; + nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT | + SECONDARY_EXEC_UNRESTRICTED_GUEST; nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | VMX_EPT_INVEPT_BIT; @@ -2558,6 +2581,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) if (!nested_vmx_allowed(vcpu)) return 1; return vmx_get_vmx_msr(vcpu, msr_index, pdata); + case MSR_IA32_XSS: + if (!vmx_xsaves_supported()) + return 1; + data = vcpu->arch.ia32_xss; + break; case MSR_TSC_AUX: if (!to_vmx(vcpu)->rdtscp_enabled) return 1; @@ -2649,6 +2677,22 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: return 1; /* they are read-only */ + case MSR_IA32_XSS: + if (!vmx_xsaves_supported()) + return 1; + /* + * The only supported bit as of Skylake is bit 8, but + * it is not supported on KVM. + */ + if (data != 0) + return 1; + vcpu->arch.ia32_xss = data; + if (vcpu->arch.ia32_xss != host_xss) + add_atomic_switch_msr(vmx, MSR_IA32_XSS, + vcpu->arch.ia32_xss, host_xss); + else + clear_atomic_switch_msr(vmx, MSR_IA32_XSS); + break; case MSR_TSC_AUX: if (!vmx->rdtscp_enabled) return 1; @@ -2884,7 +2928,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_SHADOW_VMCS; + SECONDARY_EXEC_SHADOW_VMCS | + SECONDARY_EXEC_XSAVES; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -3007,6 +3052,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) } } + if (cpu_has_xsaves) + rdmsrl(MSR_IA32_XSS, host_xss); + return 0; } @@ -3110,76 +3158,6 @@ static __init int alloc_kvm_area(void) return 0; } -static __init int hardware_setup(void) -{ - if (setup_vmcs_config(&vmcs_config) < 0) - return -EIO; - - if (boot_cpu_has(X86_FEATURE_NX)) - kvm_enable_efer_bits(EFER_NX); - - if (!cpu_has_vmx_vpid()) - enable_vpid = 0; - if (!cpu_has_vmx_shadow_vmcs()) - enable_shadow_vmcs = 0; - if (enable_shadow_vmcs) - init_vmcs_shadow_fields(); - - if (!cpu_has_vmx_ept() || - !cpu_has_vmx_ept_4levels()) { - enable_ept = 0; - enable_unrestricted_guest = 0; - enable_ept_ad_bits = 0; - } - - if (!cpu_has_vmx_ept_ad_bits()) - enable_ept_ad_bits = 0; - - if (!cpu_has_vmx_unrestricted_guest()) - enable_unrestricted_guest = 0; - - if (!cpu_has_vmx_flexpriority()) { - flexpriority_enabled = 0; - - /* - * set_apic_access_page_addr() is used to reload apic access - * page upon invalidation. No need to do anything if the - * processor does not have the APIC_ACCESS_ADDR VMCS field. - */ - kvm_x86_ops->set_apic_access_page_addr = NULL; - } - - if (!cpu_has_vmx_tpr_shadow()) - kvm_x86_ops->update_cr8_intercept = NULL; - - if (enable_ept && !cpu_has_vmx_ept_2m_page()) - kvm_disable_largepages(); - - if (!cpu_has_vmx_ple()) - ple_gap = 0; - - if (!cpu_has_vmx_apicv()) - enable_apicv = 0; - - if (enable_apicv) - kvm_x86_ops->update_cr8_intercept = NULL; - else { - kvm_x86_ops->hwapic_irr_update = NULL; - kvm_x86_ops->deliver_posted_interrupt = NULL; - kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; - } - - if (nested) - nested_vmx_setup_ctls_msrs(); - - return alloc_kvm_area(); -} - -static __exit void hardware_unsetup(void) -{ - free_kvm_area(); -} - static bool emulation_required(struct kvm_vcpu *vcpu) { return emulate_invalid_guest_state && !guest_state_valid(vcpu); @@ -4396,6 +4374,7 @@ static void ept_set_mmio_spte_mask(void) kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull); } +#define VMX_XSS_EXIT_BITMAP 0 /* * Sets up the vmcs for emulated real mode. */ @@ -4505,6 +4484,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); set_cr4_guest_host_mask(vmx); + if (vmx_xsaves_supported()) + vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); + return 0; } @@ -5163,13 +5145,20 @@ static int handle_cr(struct kvm_vcpu *vcpu) static int handle_dr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; - int dr, reg; + int dr, dr7, reg; + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + dr = exit_qualification & DEBUG_REG_ACCESS_NUM; + + /* First, if DR does not exist, trigger UD */ + if (!kvm_require_dr(vcpu, dr)) + return 1; /* Do not handle if the CPL > 0, will trigger GP on re-entry */ if (!kvm_require_cpl(vcpu, 0)) return 1; - dr = vmcs_readl(GUEST_DR7); - if (dr & DR7_GD) { + dr7 = vmcs_readl(GUEST_DR7); + if (dr7 & DR7_GD) { /* * As the vm-exit takes precedence over the debug trap, we * need to emulate the latter, either for the host or the @@ -5177,17 +5166,14 @@ static int handle_dr(struct kvm_vcpu *vcpu) */ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; - vcpu->run->debug.arch.dr7 = dr; - vcpu->run->debug.arch.pc = - vmcs_readl(GUEST_CS_BASE) + - vmcs_readl(GUEST_RIP); + vcpu->run->debug.arch.dr7 = dr7; + vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); vcpu->run->debug.arch.exception = DB_VECTOR; vcpu->run->exit_reason = KVM_EXIT_DEBUG; return 0; } else { - vcpu->arch.dr7 &= ~DR7_GD; + vcpu->arch.dr6 &= ~15; vcpu->arch.dr6 |= DR6_BD | DR6_RTM; - vmcs_writel(GUEST_DR7, vcpu->arch.dr7); kvm_queue_exception(vcpu, DB_VECTOR); return 1; } @@ -5209,8 +5195,6 @@ static int handle_dr(struct kvm_vcpu *vcpu) return 1; } - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - dr = exit_qualification & DEBUG_REG_ACCESS_NUM; reg = DEBUG_REG_ACCESS_REG(exit_qualification); if (exit_qualification & TYPE_MOV_FROM_DR) { unsigned long val; @@ -5391,6 +5375,20 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) return 1; } +static int handle_xsaves(struct kvm_vcpu *vcpu) +{ + skip_emulated_instruction(vcpu); + WARN(1, "this should never happen\n"); + return 1; +} + +static int handle_xrstors(struct kvm_vcpu *vcpu) +{ + skip_emulated_instruction(vcpu); + WARN(1, "this should never happen\n"); + return 1; +} + static int handle_apic_access(struct kvm_vcpu *vcpu) { if (likely(fasteoi)) { @@ -5492,7 +5490,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) } /* clear all local breakpoint enable flags */ - vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55); + vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x155); /* * TODO: What about debug traps on tss switch? @@ -5539,11 +5537,11 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) trace_kvm_page_fault(gpa, exit_qualification); /* It is a write fault? */ - error_code = exit_qualification & (1U << 1); + error_code = exit_qualification & PFERR_WRITE_MASK; /* It is a fetch fault? */ - error_code |= (exit_qualification & (1U << 2)) << 2; + error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK; /* ept page table is present? */ - error_code |= (exit_qualification >> 3) & 0x1; + error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK; vcpu->arch.exit_qualification = exit_qualification; @@ -5785,6 +5783,204 @@ static void update_ple_window_actual_max(void) ple_window_grow, INT_MIN); } +static __init int hardware_setup(void) +{ + int r = -ENOMEM, i, msr; + + rdmsrl_safe(MSR_EFER, &host_efer); + + for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) + kvm_define_shared_msr(i, vmx_msr_index[i]); + + vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_io_bitmap_a) + return r; + + vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_io_bitmap_b) + goto out; + + vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_legacy) + goto out1; + + vmx_msr_bitmap_legacy_x2apic = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_legacy_x2apic) + goto out2; + + vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_longmode) + goto out3; + + vmx_msr_bitmap_longmode_x2apic = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_longmode_x2apic) + goto out4; + vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_vmread_bitmap) + goto out5; + + vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_vmwrite_bitmap) + goto out6; + + memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); + memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); + + /* + * Allow direct access to the PC debug port (it is often used for I/O + * delays, but the vmexits simply slow things down). + */ + memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); + clear_bit(0x80, vmx_io_bitmap_a); + + memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); + + memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); + memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); + + vmx_disable_intercept_for_msr(MSR_FS_BASE, false); + vmx_disable_intercept_for_msr(MSR_GS_BASE, false); + vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); + vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); + + memcpy(vmx_msr_bitmap_legacy_x2apic, + vmx_msr_bitmap_legacy, PAGE_SIZE); + memcpy(vmx_msr_bitmap_longmode_x2apic, + vmx_msr_bitmap_longmode, PAGE_SIZE); + + if (enable_apicv) { + for (msr = 0x800; msr <= 0x8ff; msr++) + vmx_disable_intercept_msr_read_x2apic(msr); + + /* According SDM, in x2apic mode, the whole id reg is used. + * But in KVM, it only use the highest eight bits. Need to + * intercept it */ + vmx_enable_intercept_msr_read_x2apic(0x802); + /* TMCCT */ + vmx_enable_intercept_msr_read_x2apic(0x839); + /* TPR */ + vmx_disable_intercept_msr_write_x2apic(0x808); + /* EOI */ + vmx_disable_intercept_msr_write_x2apic(0x80b); + /* SELF-IPI */ + vmx_disable_intercept_msr_write_x2apic(0x83f); + } + + if (enable_ept) { + kvm_mmu_set_mask_ptes(0ull, + (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, + (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, + 0ull, VMX_EPT_EXECUTABLE_MASK); + ept_set_mmio_spte_mask(); + kvm_enable_tdp(); + } else + kvm_disable_tdp(); + + update_ple_window_actual_max(); + + if (setup_vmcs_config(&vmcs_config) < 0) { + r = -EIO; + goto out7; + } + + if (boot_cpu_has(X86_FEATURE_NX)) + kvm_enable_efer_bits(EFER_NX); + + if (!cpu_has_vmx_vpid()) + enable_vpid = 0; + if (!cpu_has_vmx_shadow_vmcs()) + enable_shadow_vmcs = 0; + if (enable_shadow_vmcs) + init_vmcs_shadow_fields(); + + if (!cpu_has_vmx_ept() || + !cpu_has_vmx_ept_4levels()) { + enable_ept = 0; + enable_unrestricted_guest = 0; + enable_ept_ad_bits = 0; + } + + if (!cpu_has_vmx_ept_ad_bits()) + enable_ept_ad_bits = 0; + + if (!cpu_has_vmx_unrestricted_guest()) + enable_unrestricted_guest = 0; + + if (!cpu_has_vmx_flexpriority()) { + flexpriority_enabled = 0; + + /* + * set_apic_access_page_addr() is used to reload apic access + * page upon invalidation. No need to do anything if the + * processor does not have the APIC_ACCESS_ADDR VMCS field. + */ + kvm_x86_ops->set_apic_access_page_addr = NULL; + } + + if (!cpu_has_vmx_tpr_shadow()) + kvm_x86_ops->update_cr8_intercept = NULL; + + if (enable_ept && !cpu_has_vmx_ept_2m_page()) + kvm_disable_largepages(); + + if (!cpu_has_vmx_ple()) + ple_gap = 0; + + if (!cpu_has_vmx_apicv()) + enable_apicv = 0; + + if (enable_apicv) + kvm_x86_ops->update_cr8_intercept = NULL; + else { + kvm_x86_ops->hwapic_irr_update = NULL; + kvm_x86_ops->deliver_posted_interrupt = NULL; + kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; + } + + if (nested) + nested_vmx_setup_ctls_msrs(); + + return alloc_kvm_area(); + +out7: + free_page((unsigned long)vmx_vmwrite_bitmap); +out6: + free_page((unsigned long)vmx_vmread_bitmap); +out5: + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); +out4: + free_page((unsigned long)vmx_msr_bitmap_longmode); +out3: + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); +out2: + free_page((unsigned long)vmx_msr_bitmap_legacy); +out1: + free_page((unsigned long)vmx_io_bitmap_b); +out: + free_page((unsigned long)vmx_io_bitmap_a); + + return r; +} + +static __exit void hardware_unsetup(void) +{ + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); + free_page((unsigned long)vmx_msr_bitmap_legacy); + free_page((unsigned long)vmx_msr_bitmap_longmode); + free_page((unsigned long)vmx_io_bitmap_b); + free_page((unsigned long)vmx_io_bitmap_a); + free_page((unsigned long)vmx_vmwrite_bitmap); + free_page((unsigned long)vmx_vmread_bitmap); + + free_kvm_area(); +} + /* * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE * exiting, so only get here on cpu with PAUSE-Loop-Exiting. @@ -6361,58 +6557,60 @@ static inline int vmcs_field_readonly(unsigned long field) * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of * 64-bit fields are to be returned). */ -static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu, - unsigned long field, u64 *ret) +static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, + unsigned long field, u64 *ret) { short offset = vmcs_field_to_offset(field); char *p; if (offset < 0) - return 0; + return offset; p = ((char *)(get_vmcs12(vcpu))) + offset; switch (vmcs_field_type(field)) { case VMCS_FIELD_TYPE_NATURAL_WIDTH: *ret = *((natural_width *)p); - return 1; + return 0; case VMCS_FIELD_TYPE_U16: *ret = *((u16 *)p); - return 1; + return 0; case VMCS_FIELD_TYPE_U32: *ret = *((u32 *)p); - return 1; + return 0; case VMCS_FIELD_TYPE_U64: *ret = *((u64 *)p); - return 1; + return 0; default: - return 0; /* can never happen. */ + WARN_ON(1); + return -ENOENT; } } -static inline bool vmcs12_write_any(struct kvm_vcpu *vcpu, - unsigned long field, u64 field_value){ +static inline int vmcs12_write_any(struct kvm_vcpu *vcpu, + unsigned long field, u64 field_value){ short offset = vmcs_field_to_offset(field); char *p = ((char *) get_vmcs12(vcpu)) + offset; if (offset < 0) - return false; + return offset; switch (vmcs_field_type(field)) { case VMCS_FIELD_TYPE_U16: *(u16 *)p = field_value; - return true; + return 0; case VMCS_FIELD_TYPE_U32: *(u32 *)p = field_value; - return true; + return 0; case VMCS_FIELD_TYPE_U64: *(u64 *)p = field_value; - return true; + return 0; case VMCS_FIELD_TYPE_NATURAL_WIDTH: *(natural_width *)p = field_value; - return true; + return 0; default: - return false; /* can never happen. */ + WARN_ON(1); + return -ENOENT; } } @@ -6445,6 +6643,9 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) case VMCS_FIELD_TYPE_NATURAL_WIDTH: field_value = vmcs_readl(field); break; + default: + WARN_ON(1); + continue; } vmcs12_write_any(&vmx->vcpu, field, field_value); } @@ -6490,6 +6691,9 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) case VMCS_FIELD_TYPE_NATURAL_WIDTH: vmcs_writel(field, (long)field_value); break; + default: + WARN_ON(1); + break; } } } @@ -6528,7 +6732,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) /* Decode instruction info and find the field to read */ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Read the field, zero-extended to a u64 field_value */ - if (!vmcs12_read_any(vcpu, field, &field_value)) { + if (vmcs12_read_any(vcpu, field, &field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); skip_emulated_instruction(vcpu); return 1; @@ -6598,7 +6802,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return 1; } - if (!vmcs12_write_any(vcpu, field, field_value)) { + if (vmcs12_write_any(vcpu, field, field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); skip_emulated_instruction(vcpu); return 1; @@ -6802,6 +7006,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, [EXIT_REASON_INVEPT] = handle_invept, [EXIT_REASON_INVVPID] = handle_invvpid, + [EXIT_REASON_XSAVES] = handle_xsaves, + [EXIT_REASON_XRSTORS] = handle_xrstors, }; static const int kvm_vmx_max_exit_handlers = @@ -7089,6 +7295,14 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); case EXIT_REASON_XSETBV: return 1; + case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: + /* + * This should never happen, since it is not possible to + * set XSS to a non-zero value---neither in L1 nor in L2. + * If if it were, XSS would have to be checked against + * the XSS exit bitmap in vmcs12. + */ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); default: return 1; } @@ -7277,6 +7491,9 @@ static void vmx_set_rvi(int vector) u16 status; u8 old; + if (vector == -1) + vector = 0; + status = vmcs_read16(GUEST_INTR_STATUS); old = (u8)status & 0xff; if ((u8)vector != old) { @@ -7288,22 +7505,23 @@ static void vmx_set_rvi(int vector) static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) { + if (!is_guest_mode(vcpu)) { + vmx_set_rvi(max_irr); + return; + } + if (max_irr == -1) return; /* - * If a vmexit is needed, vmx_check_nested_events handles it. + * In guest mode. If a vmexit is needed, vmx_check_nested_events + * handles it. */ - if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) + if (nested_exit_on_intr(vcpu)) return; - if (!is_guest_mode(vcpu)) { - vmx_set_rvi(max_irr); - return; - } - /* - * Fall back to pre-APICv interrupt injection since L2 + * Else, fall back to pre-APICv interrupt injection since L2 * is run without virtual interrupt delivery. */ if (!kvm_event_needs_reinjection(vcpu) && @@ -7400,6 +7618,12 @@ static bool vmx_mpx_supported(void) (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); } +static bool vmx_xsaves_supported(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_XSAVES; +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -8135,6 +8359,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); + if (nested_cpu_has_xsaves(vmcs12)) + vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); vmcs_write64(VMCS_LINK_POINTER, -1ull); exec_control = vmcs12->pin_based_vm_exec_control; @@ -8775,6 +9001,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); if (vmx_mpx_supported()) vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + if (nested_cpu_has_xsaves(vmcs12)) + vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP); /* update exit information fields: */ @@ -9176,6 +9404,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_intercept = vmx_check_intercept, .handle_external_intr = vmx_handle_external_intr, .mpx_supported = vmx_mpx_supported, + .xsaves_supported = vmx_xsaves_supported, .check_nested_events = vmx_check_nested_events, @@ -9184,150 +9413,21 @@ static struct kvm_x86_ops vmx_x86_ops = { static int __init vmx_init(void) { - int r, i, msr; - - rdmsrl_safe(MSR_EFER, &host_efer); - - for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) - kvm_define_shared_msr(i, vmx_msr_index[i]); - - vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_io_bitmap_a) - return -ENOMEM; - - r = -ENOMEM; - - vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_io_bitmap_b) - goto out; - - vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy) - goto out1; - - vmx_msr_bitmap_legacy_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy_x2apic) - goto out2; - - vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode) - goto out3; - - vmx_msr_bitmap_longmode_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode_x2apic) - goto out4; - vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_vmread_bitmap) - goto out5; - - vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_vmwrite_bitmap) - goto out6; - - memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); - memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); - - /* - * Allow direct access to the PC debug port (it is often used for I/O - * delays, but the vmexits simply slow things down). - */ - memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); - clear_bit(0x80, vmx_io_bitmap_a); - - memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); - - memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); - memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); - - set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ - - r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), - __alignof__(struct vcpu_vmx), THIS_MODULE); + int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), + __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) - goto out7; + return r; #ifdef CONFIG_KEXEC rcu_assign_pointer(crash_vmclear_loaded_vmcss, crash_vmclear_local_loaded_vmcss); #endif - vmx_disable_intercept_for_msr(MSR_FS_BASE, false); - vmx_disable_intercept_for_msr(MSR_GS_BASE, false); - vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); - vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); - - memcpy(vmx_msr_bitmap_legacy_x2apic, - vmx_msr_bitmap_legacy, PAGE_SIZE); - memcpy(vmx_msr_bitmap_longmode_x2apic, - vmx_msr_bitmap_longmode, PAGE_SIZE); - - if (enable_apicv) { - for (msr = 0x800; msr <= 0x8ff; msr++) - vmx_disable_intercept_msr_read_x2apic(msr); - - /* According SDM, in x2apic mode, the whole id reg is used. - * But in KVM, it only use the highest eight bits. Need to - * intercept it */ - vmx_enable_intercept_msr_read_x2apic(0x802); - /* TMCCT */ - vmx_enable_intercept_msr_read_x2apic(0x839); - /* TPR */ - vmx_disable_intercept_msr_write_x2apic(0x808); - /* EOI */ - vmx_disable_intercept_msr_write_x2apic(0x80b); - /* SELF-IPI */ - vmx_disable_intercept_msr_write_x2apic(0x83f); - } - - if (enable_ept) { - kvm_mmu_set_mask_ptes(0ull, - (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, - (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK); - ept_set_mmio_spte_mask(); - kvm_enable_tdp(); - } else - kvm_disable_tdp(); - - update_ple_window_actual_max(); - return 0; - -out7: - free_page((unsigned long)vmx_vmwrite_bitmap); -out6: - free_page((unsigned long)vmx_vmread_bitmap); -out5: - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); -out4: - free_page((unsigned long)vmx_msr_bitmap_longmode); -out3: - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); -out2: - free_page((unsigned long)vmx_msr_bitmap_legacy); -out1: - free_page((unsigned long)vmx_io_bitmap_b); -out: - free_page((unsigned long)vmx_io_bitmap_a); - return r; } static void __exit vmx_exit(void) { - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); - free_page((unsigned long)vmx_msr_bitmap_legacy); - free_page((unsigned long)vmx_msr_bitmap_longmode); - free_page((unsigned long)vmx_io_bitmap_b); - free_page((unsigned long)vmx_io_bitmap_a); - free_page((unsigned long)vmx_vmwrite_bitmap); - free_page((unsigned long)vmx_vmread_bitmap); - #ifdef CONFIG_KEXEC RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); synchronize_rcu(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0033df32a745..c259814200bd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -27,6 +27,7 @@ #include "kvm_cache_regs.h" #include "x86.h" #include "cpuid.h" +#include "assigned-dev.h" #include <linux/clocksource.h> #include <linux/interrupt.h> @@ -353,6 +354,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, if (!vcpu->arch.exception.pending) { queue: + if (has_error && !is_protmode(vcpu)) + has_error = false; vcpu->arch.exception.pending = true; vcpu->arch.exception.has_error_code = has_error; vcpu->arch.exception.nr = nr; @@ -455,6 +458,16 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) } EXPORT_SYMBOL_GPL(kvm_require_cpl); +bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) +{ + if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return true; + + kvm_queue_exception(vcpu, UD_VECTOR); + return false; +} +EXPORT_SYMBOL_GPL(kvm_require_dr); + /* * This function will be used to read from the physical memory of the currently * running guest. The difference to kvm_read_guest_page is that this function @@ -656,6 +669,12 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR))) return 1; + if (xcr0 & XSTATE_AVX512) { + if (!(xcr0 & XSTATE_YMM)) + return 1; + if ((xcr0 & XSTATE_AVX512) != XSTATE_AVX512) + return 1; + } kvm_put_guest_xcr0(vcpu); vcpu->arch.xcr0 = xcr0; @@ -732,6 +751,10 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4); int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { +#ifdef CONFIG_X86_64 + cr3 &= ~CR3_PCID_INVD; +#endif + if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { kvm_mmu_sync_roots(vcpu); kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); @@ -811,8 +834,6 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) vcpu->arch.eff_db[dr] = val; break; case 4: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) - return 1; /* #UD */ /* fall through */ case 6: if (val & 0xffffffff00000000ULL) @@ -821,8 +842,6 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) kvm_update_dr6(vcpu); break; case 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) - return 1; /* #UD */ /* fall through */ default: /* 7 */ if (val & 0xffffffff00000000ULL) @@ -837,27 +856,21 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) { - int res; - - res = __kvm_set_dr(vcpu, dr, val); - if (res > 0) - kvm_queue_exception(vcpu, UD_VECTOR); - else if (res < 0) + if (__kvm_set_dr(vcpu, dr, val)) { kvm_inject_gp(vcpu, 0); - - return res; + return 1; + } + return 0; } EXPORT_SYMBOL_GPL(kvm_set_dr); -static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) +int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) { switch (dr) { case 0 ... 3: *val = vcpu->arch.db[dr]; break; case 4: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) - return 1; /* fall through */ case 6: if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) @@ -866,23 +879,11 @@ static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) *val = kvm_x86_ops->get_dr6(vcpu); break; case 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) - return 1; /* fall through */ default: /* 7 */ *val = vcpu->arch.dr7; break; } - - return 0; -} - -int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) -{ - if (_kvm_get_dr(vcpu, dr, val)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } return 0; } EXPORT_SYMBOL_GPL(kvm_get_dr); @@ -1237,21 +1238,22 @@ void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 bool vcpus_matched; - bool do_request = false; struct kvm_arch *ka = &vcpu->kvm->arch; struct pvclock_gtod_data *gtod = &pvclock_gtod_data; vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == atomic_read(&vcpu->kvm->online_vcpus)); - if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC) - if (!ka->use_master_clock) - do_request = 1; - - if (!vcpus_matched && ka->use_master_clock) - do_request = 1; - - if (do_request) + /* + * Once the masterclock is enabled, always perform request in + * order to update it. + * + * In order to enable masterclock, the host clocksource must be TSC + * and the vcpus need to have matched TSCs. When that happens, + * perform request to enable masterclock. + */ + if (ka->use_master_clock || + (gtod->clock.vclock_mode == VCLOCK_TSC && vcpus_matched)) kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, @@ -1637,16 +1639,16 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_guest_tsc = tsc_timestamp; + if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, + &guest_hv_clock, sizeof(guest_hv_clock)))) + return 0; + /* * The interface expects us to write an even number signaling that the * update is finished. Since the guest won't see the intermediate * state, we just increase by 2 at the end. */ - vcpu->hv_clock.version += 2; - - if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, - &guest_hv_clock, sizeof(guest_hv_clock)))) - return 0; + vcpu->hv_clock.version = guest_hv_clock.version + 2; /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); @@ -1662,6 +1664,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.flags = pvclock_flags; + trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, &vcpu->hv_clock, sizeof(vcpu->hv_clock)); @@ -2140,7 +2144,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC_ADJUST: if (guest_cpuid_has_tsc_adjust(vcpu)) { if (!msr_info->host_initiated) { - u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; + s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true); } vcpu->arch.ia32_tsc_adjust_msr = data; @@ -3106,7 +3110,7 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, unsigned long val; memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); - _kvm_get_dr(vcpu, 6, &val); + kvm_get_dr(vcpu, 6, &val); dbgregs->dr6 = val; dbgregs->dr7 = vcpu->arch.dr7; dbgregs->flags = 0; @@ -3128,15 +3132,89 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, return 0; } +#define XSTATE_COMPACTION_ENABLED (1ULL << 63) + +static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) +{ + struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave; + u64 xstate_bv = xsave->xsave_hdr.xstate_bv; + u64 valid; + + /* + * Copy legacy XSAVE area, to avoid complications with CPUID + * leaves 0 and 1 in the loop below. + */ + memcpy(dest, xsave, XSAVE_HDR_OFFSET); + + /* Set XSTATE_BV */ + *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv; + + /* + * Copy each region from the possibly compacted offset to the + * non-compacted offset. + */ + valid = xstate_bv & ~XSTATE_FPSSE; + while (valid) { + u64 feature = valid & -valid; + int index = fls64(feature) - 1; + void *src = get_xsave_addr(xsave, feature); + + if (src) { + u32 size, offset, ecx, edx; + cpuid_count(XSTATE_CPUID, index, + &size, &offset, &ecx, &edx); + memcpy(dest + offset, src, size); + } + + valid -= feature; + } +} + +static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) +{ + struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave; + u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); + u64 valid; + + /* + * Copy legacy XSAVE area, to avoid complications with CPUID + * leaves 0 and 1 in the loop below. + */ + memcpy(xsave, src, XSAVE_HDR_OFFSET); + + /* Set XSTATE_BV and possibly XCOMP_BV. */ + xsave->xsave_hdr.xstate_bv = xstate_bv; + if (cpu_has_xsaves) + xsave->xsave_hdr.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; + + /* + * Copy each region from the non-compacted offset to the + * possibly compacted offset. + */ + valid = xstate_bv & ~XSTATE_FPSSE; + while (valid) { + u64 feature = valid & -valid; + int index = fls64(feature) - 1; + void *dest = get_xsave_addr(xsave, feature); + + if (dest) { + u32 size, offset, ecx, edx; + cpuid_count(XSTATE_CPUID, index, + &size, &offset, &ecx, &edx); + memcpy(dest, src + offset, size); + } else + WARN_ON_ONCE(1); + + valid -= feature; + } +} + static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { if (cpu_has_xsave) { - memcpy(guest_xsave->region, - &vcpu->arch.guest_fpu.state->xsave, - vcpu->arch.guest_xstate_size); - *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] &= - vcpu->arch.guest_supported_xcr0 | XSTATE_FPSSE; + memset(guest_xsave, 0, sizeof(struct kvm_xsave)); + fill_xsave((u8 *) guest_xsave->region, vcpu); } else { memcpy(guest_xsave->region, &vcpu->arch.guest_fpu.state->fxsave, @@ -3160,8 +3238,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, */ if (xstate_bv & ~kvm_supported_xcr0()) return -EINVAL; - memcpy(&vcpu->arch.guest_fpu.state->xsave, - guest_xsave->region, vcpu->arch.guest_xstate_size); + load_xsave(vcpu, (u8 *)guest_xsave->region); } else { if (xstate_bv & ~XSTATE_FPSSE) return -EINVAL; @@ -4004,7 +4081,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } default: - ; + r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); } out: return r; @@ -4667,7 +4744,7 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) { - return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); + return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); } int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) @@ -5211,21 +5288,17 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) { - struct kvm_run *kvm_run = vcpu->run; - unsigned long eip = vcpu->arch.emulate_ctxt.eip; - u32 dr6 = 0; - if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) && (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) { - dr6 = kvm_vcpu_check_hw_bp(eip, 0, + struct kvm_run *kvm_run = vcpu->run; + unsigned long eip = kvm_get_linear_rip(vcpu); + u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0, vcpu->arch.guest_debug_dr7, vcpu->arch.eff_db); if (dr6 != 0) { kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM; - kvm_run->debug.arch.pc = kvm_rip_read(vcpu) + - get_segment_base(vcpu, VCPU_SREG_CS); - + kvm_run->debug.arch.pc = eip; kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; *r = EMULATE_USER_EXIT; @@ -5235,7 +5308,8 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) && !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) { - dr6 = kvm_vcpu_check_hw_bp(eip, 0, + unsigned long eip = kvm_get_linear_rip(vcpu); + u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0, vcpu->arch.dr7, vcpu->arch.db); @@ -5365,7 +5439,9 @@ restart: kvm_rip_write(vcpu, ctxt->eip); if (r == EMULATE_DONE) kvm_vcpu_check_singlestep(vcpu, rflags, &r); - __kvm_set_rflags(vcpu, ctxt->eflags); + if (!ctxt->have_exception || + exception_type(ctxt->exception.vector) == EXCPT_TRAP) + __kvm_set_rflags(vcpu, ctxt->eflags); /* * For STI, interrupts are shadowed; so KVM_REQ_EVENT will @@ -5965,6 +6041,12 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) | X86_EFLAGS_RF); + if (vcpu->arch.exception.nr == DB_VECTOR && + (vcpu->arch.dr7 & DR7_GD)) { + vcpu->arch.dr7 &= ~DR7_GD; + kvm_update_dr7(vcpu); + } + kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code, @@ -6873,6 +6955,9 @@ int fx_init(struct kvm_vcpu *vcpu) return err; fpu_finit(&vcpu->arch.guest_fpu); + if (cpu_has_xsaves) + vcpu->arch.guest_fpu.state->xsave.xsave_hdr.xcomp_bv = + host_xcr0 | XSTATE_COMPACTION_ENABLED; /* * Ensure guest xcr0 is valid for loading @@ -7024,7 +7109,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu) kvm_x86_ops->vcpu_reset(vcpu); } -void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector) +void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) { struct kvm_segment cs; @@ -7256,6 +7341,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (type) return -EINVAL; + INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); @@ -7536,12 +7622,18 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) return kvm_x86_ops->interrupt_allowed(vcpu); } -bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip) +unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) { - unsigned long current_rip = kvm_rip_read(vcpu) + - get_segment_base(vcpu, VCPU_SREG_CS); + if (is_64_bit_mode(vcpu)) + return kvm_rip_read(vcpu); + return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) + + kvm_rip_read(vcpu)); +} +EXPORT_SYMBOL_GPL(kvm_get_linear_rip); - return current_rip == linear_rip; +bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip) +{ + return kvm_get_linear_rip(vcpu) == linear_rip; } EXPORT_SYMBOL_GPL(kvm_is_linear_rip); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 7cb9c45a5fe0..cc1d61af6140 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -162,7 +162,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data); #define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ - | XSTATE_BNDREGS | XSTATE_BNDCSR) + | XSTATE_BNDREGS | XSTATE_BNDCSR \ + | XSTATE_AVX512) extern u64 host_xcr0; extern u64 kvm_supported_xcr0(void); diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index db92793b7e23..1530afb07c85 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -23,7 +23,7 @@ lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o -obj-y += msr.o msr-reg.o msr-reg-export.o hash.o +obj-y += msr.o msr-reg.o msr-reg-export.o ifeq ($(CONFIG_X86_32),y) obj-y += atomic64_32.o diff --git a/arch/x86/lib/hash.c b/arch/x86/lib/hash.c deleted file mode 100644 index ff4fa51a5b1f..000000000000 --- a/arch/x86/lib/hash.c +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Some portions derived from code covered by the following notice: - * - * Copyright (c) 2010-2013 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include <linux/hash.h> -#include <linux/init.h> - -#include <asm/processor.h> -#include <asm/cpufeature.h> -#include <asm/hash.h> - -static inline u32 crc32_u32(u32 crc, u32 val) -{ -#ifdef CONFIG_AS_CRC32 - asm ("crc32l %1,%0\n" : "+r" (crc) : "rm" (val)); -#else - asm (".byte 0xf2, 0x0f, 0x38, 0xf1, 0xc1" : "+a" (crc) : "c" (val)); -#endif - return crc; -} - -static u32 intel_crc4_2_hash(const void *data, u32 len, u32 seed) -{ - const u32 *p32 = (const u32 *) data; - u32 i, tmp = 0; - - for (i = 0; i < len / 4; i++) - seed = crc32_u32(seed, *p32++); - - switch (len & 3) { - case 3: - tmp |= *((const u8 *) p32 + 2) << 16; - /* fallthrough */ - case 2: - tmp |= *((const u8 *) p32 + 1) << 8; - /* fallthrough */ - case 1: - tmp |= *((const u8 *) p32); - seed = crc32_u32(seed, tmp); - break; - } - - return seed; -} - -static u32 intel_crc4_2_hash2(const u32 *data, u32 len, u32 seed) -{ - const u32 *p32 = (const u32 *) data; - u32 i; - - for (i = 0; i < len; i++) - seed = crc32_u32(seed, *p32++); - - return seed; -} - -void __init setup_arch_fast_hash(struct fast_hash_ops *ops) -{ - if (cpu_has_xmm4_2) { - ops->hash = intel_crc4_2_hash; - ops->hash2 = intel_crc4_2_hash2; - } -} diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index d973e61e450d..38dcec403b46 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -844,11 +844,8 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, unsigned int fault) { struct task_struct *tsk = current; - struct mm_struct *mm = tsk->mm; int code = BUS_ADRERR; - up_read(&mm->mmap_sem); - /* Kernel mode? Handle exceptions or die: */ if (!(error_code & PF_USER)) { no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); @@ -879,7 +876,6 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, unsigned int fault) { if (fatal_signal_pending(current) && !(error_code & PF_USER)) { - up_read(¤t->mm->mmap_sem); no_context(regs, error_code, address, 0, 0); return; } @@ -887,14 +883,11 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, if (fault & VM_FAULT_OOM) { /* Kernel mode? Handle exceptions or die: */ if (!(error_code & PF_USER)) { - up_read(¤t->mm->mmap_sem); no_context(regs, error_code, address, SIGSEGV, SEGV_MAPERR); return; } - up_read(¤t->mm->mmap_sem); - /* * We ran out of memory, call the OOM killer, and return the * userspace (which will retry the fault, or kill us if we got @@ -1062,7 +1055,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, struct vm_area_struct *vma; struct task_struct *tsk; struct mm_struct *mm; - int fault; + int fault, major = 0; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; tsk = current; @@ -1237,47 +1230,50 @@ good_area: * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. */ fault = handle_mm_fault(mm, vma, address, flags); + major |= fault & VM_FAULT_MAJOR; /* - * If we need to retry but a fatal signal is pending, handle the - * signal first. We do not need to release the mmap_sem because it - * would already be released in __lock_page_or_retry in mm/filemap.c. + * If we need to retry the mmap_sem has already been released, + * and if there is a fatal signal pending there is no guarantee + * that we made any progress. Handle this case first. */ - if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))) + if (unlikely(fault & VM_FAULT_RETRY)) { + /* Retry at most once */ + if (flags & FAULT_FLAG_ALLOW_RETRY) { + flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; + if (!fatal_signal_pending(tsk)) + goto retry; + } + + /* User mode? Just return to handle the fatal exception */ + if (flags & FAULT_FLAG_USER) + return; + + /* Not returning to user mode? Handle exceptions or die: */ + no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); return; + } + up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); return; } /* - * Major/minor page fault accounting is only done on the - * initial attempt. If we go through a retry, it is extremely - * likely that the page will be found in page cache at that point. + * Major/minor page fault accounting. If any of the events + * returned VM_FAULT_MAJOR, we account it as a major fault. */ - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, - regs, address); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, - regs, address); - } - if (fault & VM_FAULT_RETRY) { - /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk - * of starvation. */ - flags &= ~FAULT_FLAG_ALLOW_RETRY; - flags |= FAULT_FLAG_TRIED; - goto retry; - } + if (major) { + tsk->maj_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); + } else { + tsk->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } check_v8086_mode(regs, address, tsk); - - up_read(&mm->mmap_sem); } NOKPROBE_SYMBOL(__do_page_fault); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 82b41d56bb98..a97ee0801475 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -703,10 +703,10 @@ void __init zone_sizes_init(void) memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); #ifdef CONFIG_ZONE_DMA - max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; + max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn); #endif #ifdef CONFIG_ZONE_DMA32 - max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; + max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn); #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a3a5d46605d2..536ea2fb6e33 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -384,6 +384,26 @@ static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, } /* + * Lookup the PMD entry for a virtual address. Return a pointer to the entry + * or NULL if not present. + */ +pmd_t *lookup_pmd_address(unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + + pgd = pgd_offset_k(address); + if (pgd_none(*pgd)) + return NULL; + + pud = pud_offset(pgd, address); + if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud)) + return NULL; + + return pmd_offset(pud, address); +} + +/* * This is necessary because __pa() does not work on some * kinds of memory, like vmalloc() or the alloc_remap() * areas on 32-bit NUMA systems. The percpu areas can @@ -1817,7 +1837,7 @@ static int __set_pages_np(struct page *page, int numpages) return __change_page_attr_set_clr(&cpa, 0); } -void kernel_map_pages(struct page *page, int numpages, int enable) +void __kernel_map_pages(struct page *page, int numpages, int enable) { if (PageHighMem(page)) return; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 3f627345d51c..987514396c1e 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -24,7 +24,7 @@ extern u8 sk_load_byte_positive_offset[]; extern u8 sk_load_word_negative_offset[], sk_load_half_negative_offset[]; extern u8 sk_load_byte_negative_offset[]; -static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) +static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) { if (len == 1) *ptr = bytes; @@ -52,12 +52,12 @@ static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) #define EMIT4_off32(b1, b2, b3, b4, off) \ do {EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0) -static inline bool is_imm8(int value) +static bool is_imm8(int value) { return value <= 127 && value >= -128; } -static inline bool is_simm32(s64 value) +static bool is_simm32(s64 value) { return value == (s64) (s32) value; } @@ -94,7 +94,7 @@ static int bpf_size_to_x86_bytes(int bpf_size) #define X86_JGE 0x7D #define X86_JG 0x7F -static inline void bpf_flush_icache(void *start, void *end) +static void bpf_flush_icache(void *start, void *end) { mm_segment_t old_fs = get_fs(); @@ -133,24 +133,24 @@ static const int reg2hex[] = { * which need extra byte of encoding. * rax,rcx,...,rbp have simpler encoding */ -static inline bool is_ereg(u32 reg) +static bool is_ereg(u32 reg) { - if (reg == BPF_REG_5 || reg == AUX_REG || - (reg >= BPF_REG_7 && reg <= BPF_REG_9)) - return true; - else - return false; + return (1 << reg) & (BIT(BPF_REG_5) | + BIT(AUX_REG) | + BIT(BPF_REG_7) | + BIT(BPF_REG_8) | + BIT(BPF_REG_9)); } /* add modifiers if 'reg' maps to x64 registers r8..r15 */ -static inline u8 add_1mod(u8 byte, u32 reg) +static u8 add_1mod(u8 byte, u32 reg) { if (is_ereg(reg)) byte |= 1; return byte; } -static inline u8 add_2mod(u8 byte, u32 r1, u32 r2) +static u8 add_2mod(u8 byte, u32 r1, u32 r2) { if (is_ereg(r1)) byte |= 1; @@ -160,13 +160,13 @@ static inline u8 add_2mod(u8 byte, u32 r1, u32 r2) } /* encode 'dst_reg' register into x64 opcode 'byte' */ -static inline u8 add_1reg(u8 byte, u32 dst_reg) +static u8 add_1reg(u8 byte, u32 dst_reg) { return byte + reg2hex[dst_reg]; } /* encode 'dst_reg' and 'src_reg' registers into x64 opcode 'byte' */ -static inline u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg) +static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg) { return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3); } @@ -178,7 +178,7 @@ static void jit_fill_hole(void *area, unsigned int size) } struct jit_context { - unsigned int cleanup_addr; /* epilogue code offset */ + int cleanup_addr; /* epilogue code offset */ bool seen_ld_abs; }; @@ -192,6 +192,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, struct bpf_insn *insn = bpf_prog->insnsi; int insn_cnt = bpf_prog->len; bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0); + bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; int i; int proglen = 0; @@ -854,10 +855,11 @@ common_load: goto common_load; case BPF_JMP | BPF_EXIT: - if (i != insn_cnt - 1) { + if (seen_exit) { jmp_offset = ctx->cleanup_addr - addrs[i]; goto emit_jmp; } + seen_exit = true; /* update cleanup_addr */ ctx->cleanup_addr = proglen; /* mov rbx, qword ptr [rbp-X] */ diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 1819a91bbb9f..c489ef2c1a39 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -23,6 +23,8 @@ #include <xen/features.h> #include <xen/events.h> #include <asm/xen/pci.h> +#include <asm/xen/cpuid.h> +#include <asm/apic.h> #include <asm/i8259.h> static int xen_pcifront_enable_irq(struct pci_dev *dev) @@ -423,6 +425,28 @@ int __init pci_xen_init(void) return 0; } +#ifdef CONFIG_PCI_MSI +void __init xen_msi_init(void) +{ + if (!disable_apic) { + /* + * If hardware supports (x2)APIC virtualization (as indicated + * by hypervisor's leaf 4) then we don't need to use pirqs/ + * event channels for MSI handling and instead use regular + * APIC processing + */ + uint32_t eax = cpuid_eax(xen_cpuid_base() + 4); + + if (((eax & XEN_HVM_CPUID_X2APIC_VIRT) && x2apic_mode) || + ((eax & XEN_HVM_CPUID_APIC_ACCESS_VIRT) && cpu_has_apic)) + return; + } + + x86_msi.setup_msi_irqs = xen_hvm_setup_msi_irqs; + x86_msi.teardown_msi_irq = xen_teardown_msi_irq; +} +#endif + int __init pci_xen_hvm_init(void) { if (!xen_have_vector_callback || !xen_feature(XENFEAT_hvm_pirqs)) @@ -437,8 +461,11 @@ int __init pci_xen_hvm_init(void) #endif #ifdef CONFIG_PCI_MSI - x86_msi.setup_msi_irqs = xen_hvm_setup_msi_irqs; - x86_msi.teardown_msi_irq = xen_teardown_msi_irq; + /* + * We need to wait until after x2apic is initialized + * before we can set MSI IRQ ops. + */ + x86_platform.apic_post_init = xen_msi_init; #endif return 0; } diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c index 4d171e8640ef..735ba21efe91 100644 --- a/arch/x86/platform/iris/iris.c +++ b/arch/x86/platform/iris/iris.c @@ -86,7 +86,6 @@ static int iris_remove(struct platform_device *pdev) static struct platform_driver iris_driver = { .driver = { .name = "iris", - .owner = THIS_MODULE, }, .probe = iris_probe, .remove = iris_remove, diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c index a9acde72d4ed..c5350fd27d70 100644 --- a/arch/x86/platform/olpc/olpc-xo1-pm.c +++ b/arch/x86/platform/olpc/olpc-xo1-pm.c @@ -170,7 +170,6 @@ static int xo1_pm_remove(struct platform_device *pdev) static struct platform_driver cs5535_pms_driver = { .driver = { .name = "cs5535-pms", - .owner = THIS_MODULE, }, .probe = xo1_pm_probe, .remove = xo1_pm_remove, @@ -179,7 +178,6 @@ static struct platform_driver cs5535_pms_driver = { static struct platform_driver cs5535_acpi_driver = { .driver = { .name = "olpc-xo1-pm-acpi", - .owner = THIS_MODULE, }, .probe = xo1_pm_probe, .remove = xo1_pm_remove, diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index 9fe1b5d002f0..b3560ece1c9f 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -364,3 +364,4 @@ 355 i386 getrandom sys_getrandom 356 i386 memfd_create sys_memfd_create 357 i386 bpf sys_bpf +358 i386 execveat sys_execveat stub32_execveat diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 281150b539a2..8d656fbb57aa 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -328,6 +328,7 @@ 319 common memfd_create sys_memfd_create 320 common kexec_file_load sys_kexec_file_load 321 common bpf sys_bpf +322 64 execveat stub_execveat # # x32-specific system call numbers start at 512 to avoid cache impact @@ -366,3 +367,4 @@ 542 x32 getsockopt compat_sys_getsockopt 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit +545 x32 execveat stub_x32_execveat diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index cc04e67bfd05..2d7d9a1f5b53 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -29,20 +29,18 @@ #endif /* CONFIG_X86_32 */ -#define read_barrier_depends() do { } while (0) - -#ifdef CONFIG_SMP - -#define smp_mb() mb() #ifdef CONFIG_X86_PPRO_FENCE -#define smp_rmb() rmb() +#define dma_rmb() rmb() #else /* CONFIG_X86_PPRO_FENCE */ -#define smp_rmb() barrier() +#define dma_rmb() barrier() #endif /* CONFIG_X86_PPRO_FENCE */ +#define dma_wmb() barrier() -#define smp_wmb() barrier() +#ifdef CONFIG_SMP -#define smp_read_barrier_depends() read_barrier_depends() +#define smp_mb() mb() +#define smp_rmb() dma_rmb() +#define smp_wmb() barrier() #define set_mb(var, value) do { (void)xchg(&var, value); } while (0) #else /* CONFIG_SMP */ @@ -50,11 +48,13 @@ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() -#define smp_read_barrier_depends() do { } while (0) #define set_mb(var, value) do { var = value; barrier(); } while (0) #endif /* CONFIG_SMP */ +#define read_barrier_depends() do { } while (0) +#define smp_read_barrier_depends() do { } while (0) + /* * Stop RDTSC speculation. This is needed when you need to use RDTSC * (or get_cycles or vread that possibly accesses the TSC) in a defined diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index f2f0723070ca..20c3649d0691 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -31,6 +31,7 @@ #define stub_fork sys_fork #define stub_vfork sys_vfork #define stub_execve sys_execve +#define stub_execveat sys_execveat #define stub_rt_sigreturn sys_rt_sigreturn #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 8c8298d78185..5c1f9ace7ae7 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -387,7 +387,7 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) unsigned long mfn; if (!xen_feature(XENFEAT_auto_translated_physmap)) - mfn = get_phys_to_machine(pfn); + mfn = __pfn_to_mfn(pfn); else mfn = pfn; /* @@ -1113,20 +1113,16 @@ static void __init xen_cleanhighmap(unsigned long vaddr, * instead of somewhere later and be confusing. */ xen_mc_flush(); } -static void __init xen_pagetable_p2m_copy(void) + +static void __init xen_pagetable_p2m_free(void) { unsigned long size; unsigned long addr; - unsigned long new_mfn_list; - - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - new_mfn_list = xen_revector_p2m_tree(); /* No memory or already called. */ - if (!new_mfn_list || new_mfn_list == xen_start_info->mfn_list) + if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list) return; /* using __ka address and sticking INVALID_P2M_ENTRY! */ @@ -1144,8 +1140,6 @@ static void __init xen_pagetable_p2m_copy(void) size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); memblock_free(__pa(xen_start_info->mfn_list), size); - /* And revector! Bye bye old array */ - xen_start_info->mfn_list = new_mfn_list; /* At this stage, cleanup_highmap has already cleaned __ka space * from _brk_limit way up to the max_pfn_mapped (which is the end of @@ -1169,17 +1163,35 @@ static void __init xen_pagetable_p2m_copy(void) } #endif -static void __init xen_pagetable_init(void) +static void __init xen_pagetable_p2m_setup(void) { - paging_init(); + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + xen_vmalloc_p2m_tree(); + #ifdef CONFIG_X86_64 - xen_pagetable_p2m_copy(); + xen_pagetable_p2m_free(); #endif + /* And revector! Bye bye old array */ + xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; +} + +static void __init xen_pagetable_init(void) +{ + paging_init(); + xen_post_allocator_init(); + + xen_pagetable_p2m_setup(); + /* Allocate and initialize top and mid mfn levels for p2m structure */ xen_build_mfn_list_list(); + /* Remap memory freed due to conflicts with E820 map */ + if (!xen_feature(XENFEAT_auto_translated_physmap)) + xen_remap_memory(); + xen_setup_shared_info(); - xen_post_allocator_init(); } static void xen_write_cr2(unsigned long cr2) { diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index b456b048eca9..edbc7a63fd73 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -3,21 +3,22 @@ * guests themselves, but it must also access and update the p2m array * during suspend/resume when all the pages are reallocated. * - * The p2m table is logically a flat array, but we implement it as a - * three-level tree to allow the address space to be sparse. + * The logical flat p2m table is mapped to a linear kernel memory area. + * For accesses by Xen a three-level tree linked via mfns only is set up to + * allow the address space to be sparse. * - * Xen - * | - * p2m_top p2m_top_mfn - * / \ / \ - * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn - * / \ / \ / / - * p2m p2m p2m p2m p2m p2m p2m ... + * Xen + * | + * p2m_top_mfn + * / \ + * p2m_mid_mfn p2m_mid_mfn + * / / + * p2m p2m p2m ... * * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. * - * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the - * maximum representable pseudo-physical address space is: + * The p2m_top_mfn level is limited to 1 page, so the maximum representable + * pseudo-physical address space is: * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages * * P2M_PER_PAGE depends on the architecture, as a mfn is always @@ -30,6 +31,9 @@ * leaf entries, or for the top root, or middle one, for which there is a void * entry, we assume it is "missing". So (for example) * pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY. + * We have a dedicated page p2m_missing with all entries being + * INVALID_P2M_ENTRY. This page may be referenced multiple times in the p2m + * list/tree in case there are multiple areas with P2M_PER_PAGE invalid pfns. * * We also have the possibility of setting 1-1 mappings on certain regions, so * that: @@ -39,122 +43,20 @@ * PCI BARs, or ACPI spaces), we can create mappings easily because we * get the PFN value to match the MFN. * - * For this to work efficiently we have one new page p2m_identity and - * allocate (via reserved_brk) any other pages we need to cover the sides - * (1GB or 4MB boundary violations). All entries in p2m_identity are set to - * INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs, - * no other fancy value). + * For this to work efficiently we have one new page p2m_identity. All entries + * in p2m_identity are set to INVALID_P2M_ENTRY type (Xen toolstack only + * recognizes that and MFNs, no other fancy value). * * On lookup we spot that the entry points to p2m_identity and return the * identity value instead of dereferencing and returning INVALID_P2M_ENTRY. * If the entry points to an allocated page, we just proceed as before and - * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in + * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in * appropriate functions (pfn_to_mfn). * * The reason for having the IDENTITY_FRAME_BIT instead of just returning the * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a * non-identity pfn. To protect ourselves against we elect to set (and get) the * IDENTITY_FRAME_BIT on all identity mapped PFNs. - * - * This simplistic diagram is used to explain the more subtle piece of code. - * There is also a digram of the P2M at the end that can help. - * Imagine your E820 looking as so: - * - * 1GB 2GB 4GB - * /-------------------+---------\/----\ /----------\ /---+-----\ - * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM | - * \-------------------+---------/\----/ \----------/ \---+-----/ - * ^- 1029MB ^- 2001MB - * - * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100), - * 2048MB = 524288 (0x80000)] - * - * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB - * is actually not present (would have to kick the balloon driver to put it in). - * - * When we are told to set the PFNs for identity mapping (see patch: "xen/setup: - * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start - * of the PFN and the end PFN (263424 and 512256 respectively). The first step - * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page - * covers 512^2 of page estate (1GB) and in case the start or end PFN is not - * aligned on 512^2*PAGE_SIZE (1GB) we reserve_brk new middle and leaf pages as - * required to split any existing p2m_mid_missing middle pages. - * - * With the E820 example above, 263424 is not 1GB aligned so we allocate a - * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000. - * Each entry in the allocate page is "missing" (points to p2m_missing). - * - * Next stage is to determine if we need to do a more granular boundary check - * on the 4MB (or 2MB depending on architecture) off the start and end pfn's. - * We check if the start pfn and end pfn violate that boundary check, and if - * so reserve_brk a (p2m[x][y]) leaf page. This way we have a much finer - * granularity of setting which PFNs are missing and which ones are identity. - * In our example 263424 and 512256 both fail the check so we reserve_brk two - * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing" - * values) and assign them to p2m[1][2] and p2m[1][488] respectively. - * - * At this point we would at minimum reserve_brk one page, but could be up to - * three. Each call to set_phys_range_identity has at maximum a three page - * cost. If we were to query the P2M at this stage, all those entries from - * start PFN through end PFN (so 1029MB -> 2001MB) would return - * INVALID_P2M_ENTRY ("missing"). - * - * The next step is to walk from the start pfn to the end pfn setting - * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity. - * If we find that the middle entry is pointing to p2m_missing we can swap it - * over to p2m_identity - this way covering 4MB (or 2MB) PFN space (and - * similarly swapping p2m_mid_missing for p2m_mid_identity for larger regions). - * At this point we do not need to worry about boundary aligment (so no need to - * reserve_brk a middle page, figure out which PFNs are "missing" and which - * ones are identity), as that has been done earlier. If we find that the - * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference - * that page (which covers 512 PFNs) and set the appropriate PFN with - * IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we - * set from p2m[1][2][256->511] and p2m[1][488][0->256] with - * IDENTITY_FRAME_BIT set. - * - * All other regions that are void (or not filled) either point to p2m_missing - * (considered missing) or have the default value of INVALID_P2M_ENTRY (also - * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511] - * contain the INVALID_P2M_ENTRY value and are considered "missing." - * - * Finally, the region beyond the end of of the E820 (4 GB in this example) - * is set to be identity (in case there are MMIO regions placed here). - * - * This is what the p2m ends up looking (for the E820 above) with this - * fabulous drawing: - * - * p2m /--------------\ - * /-----\ | &mfn_list[0],| /-----------------\ - * | 0 |------>| &mfn_list[1],| /---------------\ | ~0, ~0, .. | - * |-----| | ..., ~0, ~0 | | ~0, ~0, [x]---+----->| IDENTITY [@256] | - * | 1 |---\ \--------------/ | [p2m_identity]+\ | IDENTITY [@257] | - * |-----| \ | [p2m_identity]+\\ | .... | - * | 2 |--\ \-------------------->| ... | \\ \----------------/ - * |-----| \ \---------------/ \\ - * | 3 |-\ \ \\ p2m_identity [1] - * |-----| \ \-------------------->/---------------\ /-----------------\ - * | .. |\ | | [p2m_identity]+-->| ~0, ~0, ~0, ... | - * \-----/ | | | [p2m_identity]+-->| ..., ~0 | - * | | | .... | \-----------------/ - * | | +-[x], ~0, ~0.. +\ - * | | \---------------/ \ - * | | \-> /---------------\ - * | V p2m_mid_missing p2m_missing | IDENTITY[@0] | - * | /-----------------\ /------------\ | IDENTITY[@256]| - * | | [p2m_missing] +---->| ~0, ~0, ...| | ~0, ~0, .... | - * | | [p2m_missing] +---->| ..., ~0 | \---------------/ - * | | ... | \------------/ - * | \-----------------/ - * | - * | p2m_mid_identity - * | /-----------------\ - * \-->| [p2m_identity] +---->[1] - * | [p2m_identity] +---->[1] - * | ... | - * \-----------------/ - * - * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT) */ #include <linux/init.h> @@ -164,9 +66,11 @@ #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/bootmem.h> +#include <linux/slab.h> #include <asm/cache.h> #include <asm/setup.h> +#include <asm/uaccess.h> #include <asm/xen/page.h> #include <asm/xen/hypercall.h> @@ -178,31 +82,26 @@ #include "multicalls.h" #include "xen-ops.h" +#define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE) + static void __init m2p_override_init(void); +unsigned long *xen_p2m_addr __read_mostly; +EXPORT_SYMBOL_GPL(xen_p2m_addr); +unsigned long xen_p2m_size __read_mostly; +EXPORT_SYMBOL_GPL(xen_p2m_size); unsigned long xen_max_p2m_pfn __read_mostly; +EXPORT_SYMBOL_GPL(xen_max_p2m_pfn); + +static DEFINE_SPINLOCK(p2m_update_lock); static unsigned long *p2m_mid_missing_mfn; static unsigned long *p2m_top_mfn; static unsigned long **p2m_top_mfn_p; - -/* Placeholders for holes in the address space */ -static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); - -static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); - -static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_identity, P2M_MID_PER_PAGE); - -RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); - -/* For each I/O range remapped we may lose up to two leaf pages for the boundary - * violations and three mid pages to cover up to 3GB. With - * early_can_reuse_p2m_middle() most of the leaf pages will be reused by the - * remapped region. - */ -RESERVE_BRK(p2m_identity_remap, PAGE_SIZE * 2 * 3 * MAX_REMAP_RANGES); +static unsigned long *p2m_missing; +static unsigned long *p2m_identity; +static pte_t *p2m_missing_pte; +static pte_t *p2m_identity_pte; static inline unsigned p2m_top_index(unsigned long pfn) { @@ -220,14 +119,6 @@ static inline unsigned p2m_index(unsigned long pfn) return pfn % P2M_PER_PAGE; } -static void p2m_top_init(unsigned long ***top) -{ - unsigned i; - - for (i = 0; i < P2M_TOP_PER_PAGE; i++) - top[i] = p2m_mid_missing; -} - static void p2m_top_mfn_init(unsigned long *top) { unsigned i; @@ -244,28 +135,43 @@ static void p2m_top_mfn_p_init(unsigned long **top) top[i] = p2m_mid_missing_mfn; } -static void p2m_mid_init(unsigned long **mid, unsigned long *leaf) +static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf) { unsigned i; for (i = 0; i < P2M_MID_PER_PAGE; i++) - mid[i] = leaf; + mid[i] = virt_to_mfn(leaf); } -static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf) +static void p2m_init(unsigned long *p2m) { unsigned i; - for (i = 0; i < P2M_MID_PER_PAGE; i++) - mid[i] = virt_to_mfn(leaf); + for (i = 0; i < P2M_PER_PAGE; i++) + p2m[i] = INVALID_P2M_ENTRY; } -static void p2m_init(unsigned long *p2m) +static void p2m_init_identity(unsigned long *p2m, unsigned long pfn) { unsigned i; - for (i = 0; i < P2M_MID_PER_PAGE; i++) - p2m[i] = INVALID_P2M_ENTRY; + for (i = 0; i < P2M_PER_PAGE; i++) + p2m[i] = IDENTITY_FRAME(pfn + i); +} + +static void * __ref alloc_p2m_page(void) +{ + if (unlikely(!slab_is_available())) + return alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); + + return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); +} + +/* Only to be called in case of a race for a page just allocated! */ +static void free_p2m_page(void *p) +{ + BUG_ON(!slab_is_available()); + free_page((unsigned long)p); } /* @@ -280,40 +186,46 @@ static void p2m_init(unsigned long *p2m) */ void __ref xen_build_mfn_list_list(void) { - unsigned long pfn; + unsigned long pfn, mfn; + pte_t *ptep; + unsigned int level, topidx, mididx; + unsigned long *mid_mfn_p; if (xen_feature(XENFEAT_auto_translated_physmap)) return; /* Pre-initialize p2m_top_mfn to be completely missing */ if (p2m_top_mfn == NULL) { - p2m_mid_missing_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); + p2m_mid_missing_mfn = alloc_p2m_page(); p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); - p2m_top_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_p = alloc_p2m_page(); p2m_top_mfn_p_init(p2m_top_mfn_p); - p2m_top_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn = alloc_p2m_page(); p2m_top_mfn_init(p2m_top_mfn); } else { /* Reinitialise, mfn's all change after migration */ p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); } - for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); - unsigned mididx = p2m_mid_index(pfn); - unsigned long **mid; - unsigned long *mid_mfn_p; + for (pfn = 0; pfn < xen_max_p2m_pfn && pfn < MAX_P2M_PFN; + pfn += P2M_PER_PAGE) { + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); - mid = p2m_top[topidx]; mid_mfn_p = p2m_top_mfn_p[topidx]; + ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn), + &level); + BUG_ON(!ptep || level != PG_LEVEL_4K); + mfn = pte_mfn(*ptep); + ptep = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1)); /* Don't bother allocating any mfn mid levels if * they're just missing, just update the stored mfn, * since all could have changed over a migrate. */ - if (mid == p2m_mid_missing) { + if (ptep == p2m_missing_pte || ptep == p2m_identity_pte) { BUG_ON(mididx); BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); @@ -322,19 +234,14 @@ void __ref xen_build_mfn_list_list(void) } if (mid_mfn_p == p2m_mid_missing_mfn) { - /* - * XXX boot-time only! We should never find - * missing parts of the mfn tree after - * runtime. - */ - mid_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); + mid_mfn_p = alloc_p2m_page(); p2m_mid_mfn_init(mid_mfn_p, p2m_missing); p2m_top_mfn_p[topidx] = mid_mfn_p; } p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); - mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); + mid_mfn_p[mididx] = mfn; } } @@ -353,171 +260,235 @@ void xen_setup_mfn_list_list(void) /* Set up p2m_top to point to the domain-builder provided p2m pages */ void __init xen_build_dynamic_phys_to_machine(void) { - unsigned long *mfn_list; - unsigned long max_pfn; unsigned long pfn; if (xen_feature(XENFEAT_auto_translated_physmap)) return; - mfn_list = (unsigned long *)xen_start_info->mfn_list; - max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); - xen_max_p2m_pfn = max_pfn; + xen_p2m_addr = (unsigned long *)xen_start_info->mfn_list; + xen_p2m_size = ALIGN(xen_start_info->nr_pages, P2M_PER_PAGE); - p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_init(p2m_missing); - p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_init(p2m_identity); + for (pfn = xen_start_info->nr_pages; pfn < xen_p2m_size; pfn++) + xen_p2m_addr[pfn] = INVALID_P2M_ENTRY; - p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_init(p2m_mid_missing, p2m_missing); - p2m_mid_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_init(p2m_mid_identity, p2m_identity); + xen_max_p2m_pfn = xen_p2m_size; +} - p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_top_init(p2m_top); +#define P2M_TYPE_IDENTITY 0 +#define P2M_TYPE_MISSING 1 +#define P2M_TYPE_PFN 2 +#define P2M_TYPE_UNKNOWN 3 - /* - * The domain builder gives us a pre-constructed p2m array in - * mfn_list for all the pages initially given to us, so we just - * need to graft that into our tree structure. - */ - for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); - unsigned mididx = p2m_mid_index(pfn); +static int xen_p2m_elem_type(unsigned long pfn) +{ + unsigned long mfn; - if (p2m_top[topidx] == p2m_mid_missing) { - unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_init(mid, p2m_missing); + if (pfn >= xen_p2m_size) + return P2M_TYPE_IDENTITY; - p2m_top[topidx] = mid; - } + mfn = xen_p2m_addr[pfn]; - /* - * As long as the mfn_list has enough entries to completely - * fill a p2m page, pointing into the array is ok. But if - * not the entries beyond the last pfn will be undefined. - */ - if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) { - unsigned long p2midx; + if (mfn == INVALID_P2M_ENTRY) + return P2M_TYPE_MISSING; - p2midx = max_pfn % P2M_PER_PAGE; - for ( ; p2midx < P2M_PER_PAGE; p2midx++) - mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY; - } - p2m_top[topidx][mididx] = &mfn_list[pfn]; - } + if (mfn & IDENTITY_FRAME_BIT) + return P2M_TYPE_IDENTITY; - m2p_override_init(); + return P2M_TYPE_PFN; } -#ifdef CONFIG_X86_64 -unsigned long __init xen_revector_p2m_tree(void) + +static void __init xen_rebuild_p2m_list(unsigned long *p2m) { - unsigned long va_start; - unsigned long va_end; + unsigned int i, chunk; unsigned long pfn; - unsigned long pfn_free = 0; - unsigned long *mfn_list = NULL; - unsigned long size; - - va_start = xen_start_info->mfn_list; - /*We copy in increments of P2M_PER_PAGE * sizeof(unsigned long), - * so make sure it is rounded up to that */ - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - va_end = va_start + size; - - /* If we were revectored already, don't do it again. */ - if (va_start <= __START_KERNEL_map && va_start >= __PAGE_OFFSET) - return 0; + unsigned long *mfns; + pte_t *ptep; + pmd_t *pmdp; + int type; - mfn_list = alloc_bootmem_align(size, PAGE_SIZE); - if (!mfn_list) { - pr_warn("Could not allocate space for a new P2M tree!\n"); - return xen_start_info->mfn_list; - } - /* Fill it out with INVALID_P2M_ENTRY value */ - memset(mfn_list, 0xFF, size); + p2m_missing = alloc_p2m_page(); + p2m_init(p2m_missing); + p2m_identity = alloc_p2m_page(); + p2m_init(p2m_identity); - for (pfn = 0; pfn < ALIGN(MAX_DOMAIN_PAGES, P2M_PER_PAGE); pfn += P2M_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); - unsigned mididx; - unsigned long *mid_p; + p2m_missing_pte = alloc_p2m_page(); + paravirt_alloc_pte(&init_mm, __pa(p2m_missing_pte) >> PAGE_SHIFT); + p2m_identity_pte = alloc_p2m_page(); + paravirt_alloc_pte(&init_mm, __pa(p2m_identity_pte) >> PAGE_SHIFT); + for (i = 0; i < PTRS_PER_PTE; i++) { + set_pte(p2m_missing_pte + i, + pfn_pte(PFN_DOWN(__pa(p2m_missing)), PAGE_KERNEL_RO)); + set_pte(p2m_identity_pte + i, + pfn_pte(PFN_DOWN(__pa(p2m_identity)), PAGE_KERNEL_RO)); + } - if (!p2m_top[topidx]) + for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += chunk) { + /* + * Try to map missing/identity PMDs or p2m-pages if possible. + * We have to respect the structure of the mfn_list_list + * which will be built just afterwards. + * Chunk size to test is one p2m page if we are in the middle + * of a mfn_list_list mid page and the complete mid page area + * if we are at index 0 of the mid page. Please note that a + * mid page might cover more than one PMD, e.g. on 32 bit PAE + * kernels. + */ + chunk = (pfn & (P2M_PER_PAGE * P2M_MID_PER_PAGE - 1)) ? + P2M_PER_PAGE : P2M_PER_PAGE * P2M_MID_PER_PAGE; + + type = xen_p2m_elem_type(pfn); + i = 0; + if (type != P2M_TYPE_PFN) + for (i = 1; i < chunk; i++) + if (xen_p2m_elem_type(pfn + i) != type) + break; + if (i < chunk) + /* Reset to minimal chunk size. */ + chunk = P2M_PER_PAGE; + + if (type == P2M_TYPE_PFN || i < chunk) { + /* Use initial p2m page contents. */ +#ifdef CONFIG_X86_64 + mfns = alloc_p2m_page(); + copy_page(mfns, xen_p2m_addr + pfn); +#else + mfns = xen_p2m_addr + pfn; +#endif + ptep = populate_extra_pte((unsigned long)(p2m + pfn)); + set_pte(ptep, + pfn_pte(PFN_DOWN(__pa(mfns)), PAGE_KERNEL)); continue; + } - if (p2m_top[topidx] == p2m_mid_missing) + if (chunk == P2M_PER_PAGE) { + /* Map complete missing or identity p2m-page. */ + mfns = (type == P2M_TYPE_MISSING) ? + p2m_missing : p2m_identity; + ptep = populate_extra_pte((unsigned long)(p2m + pfn)); + set_pte(ptep, + pfn_pte(PFN_DOWN(__pa(mfns)), PAGE_KERNEL_RO)); continue; + } - mididx = p2m_mid_index(pfn); - mid_p = p2m_top[topidx][mididx]; - if (!mid_p) - continue; - if ((mid_p == p2m_missing) || (mid_p == p2m_identity)) - continue; + /* Complete missing or identity PMD(s) can be mapped. */ + ptep = (type == P2M_TYPE_MISSING) ? + p2m_missing_pte : p2m_identity_pte; + for (i = 0; i < PMDS_PER_MID_PAGE; i++) { + pmdp = populate_extra_pmd( + (unsigned long)(p2m + pfn + i * PTRS_PER_PTE)); + set_pmd(pmdp, __pmd(__pa(ptep) | _KERNPG_TABLE)); + } + } +} - if ((unsigned long)mid_p == INVALID_P2M_ENTRY) - continue; +void __init xen_vmalloc_p2m_tree(void) +{ + static struct vm_struct vm; - /* The old va. Rebase it on mfn_list */ - if (mid_p >= (unsigned long *)va_start && mid_p <= (unsigned long *)va_end) { - unsigned long *new; + vm.flags = VM_ALLOC; + vm.size = ALIGN(sizeof(unsigned long) * xen_max_p2m_pfn, + PMD_SIZE * PMDS_PER_MID_PAGE); + vm_area_register_early(&vm, PMD_SIZE * PMDS_PER_MID_PAGE); + pr_notice("p2m virtual area at %p, size is %lx\n", vm.addr, vm.size); - if (pfn_free > (size / sizeof(unsigned long))) { - WARN(1, "Only allocated for %ld pages, but we want %ld!\n", - size / sizeof(unsigned long), pfn_free); - return 0; - } - new = &mfn_list[pfn_free]; + xen_max_p2m_pfn = vm.size / sizeof(unsigned long); - copy_page(new, mid_p); - p2m_top[topidx][mididx] = &mfn_list[pfn_free]; + xen_rebuild_p2m_list(vm.addr); - pfn_free += P2M_PER_PAGE; + xen_p2m_addr = vm.addr; + xen_p2m_size = xen_max_p2m_pfn; - } - /* This should be the leafs allocated for identity from _brk. */ - } - return (unsigned long)mfn_list; + xen_inv_extra_mem(); + m2p_override_init(); } -#else -unsigned long __init xen_revector_p2m_tree(void) -{ - return 0; -} -#endif + unsigned long get_phys_to_machine(unsigned long pfn) { - unsigned topidx, mididx, idx; + pte_t *ptep; + unsigned int level; + + if (unlikely(pfn >= xen_p2m_size)) { + if (pfn < xen_max_p2m_pfn) + return xen_chk_extra_mem(pfn); - if (unlikely(pfn >= MAX_P2M_PFN)) return IDENTITY_FRAME(pfn); + } - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); - idx = p2m_index(pfn); + ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn), &level); + BUG_ON(!ptep || level != PG_LEVEL_4K); /* * The INVALID_P2M_ENTRY is filled in both p2m_*identity * and in p2m_*missing, so returning the INVALID_P2M_ENTRY * would be wrong. */ - if (p2m_top[topidx][mididx] == p2m_identity) + if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_identity))) return IDENTITY_FRAME(pfn); - return p2m_top[topidx][mididx][idx]; + return xen_p2m_addr[pfn]; } EXPORT_SYMBOL_GPL(get_phys_to_machine); -static void *alloc_p2m_page(void) +/* + * Allocate new pmd(s). It is checked whether the old pmd is still in place. + * If not, nothing is changed. This is okay as the only reason for allocating + * a new pmd is to replace p2m_missing_pte or p2m_identity_pte by a individual + * pmd. In case of PAE/x86-32 there are multiple pmds to allocate! + */ +static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *ptep, pte_t *pte_pg) { - return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); -} + pte_t *ptechk; + pte_t *pteret = ptep; + pte_t *pte_newpg[PMDS_PER_MID_PAGE]; + pmd_t *pmdp; + unsigned int level; + unsigned long flags; + unsigned long vaddr; + int i; -static void free_p2m_page(void *p) -{ - free_page((unsigned long)p); + /* Do all allocations first to bail out in error case. */ + for (i = 0; i < PMDS_PER_MID_PAGE; i++) { + pte_newpg[i] = alloc_p2m_page(); + if (!pte_newpg[i]) { + for (i--; i >= 0; i--) + free_p2m_page(pte_newpg[i]); + + return NULL; + } + } + + vaddr = addr & ~(PMD_SIZE * PMDS_PER_MID_PAGE - 1); + + for (i = 0; i < PMDS_PER_MID_PAGE; i++) { + copy_page(pte_newpg[i], pte_pg); + paravirt_alloc_pte(&init_mm, __pa(pte_newpg[i]) >> PAGE_SHIFT); + + pmdp = lookup_pmd_address(vaddr); + BUG_ON(!pmdp); + + spin_lock_irqsave(&p2m_update_lock, flags); + + ptechk = lookup_address(vaddr, &level); + if (ptechk == pte_pg) { + set_pmd(pmdp, + __pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE)); + if (vaddr == (addr & ~(PMD_SIZE - 1))) + pteret = pte_offset_kernel(pmdp, addr); + pte_newpg[i] = NULL; + } + + spin_unlock_irqrestore(&p2m_update_lock, flags); + + if (pte_newpg[i]) { + paravirt_release_pte(__pa(pte_newpg[i]) >> PAGE_SHIFT); + free_p2m_page(pte_newpg[i]); + } + + vaddr += PMD_SIZE; + } + + return pteret; } /* @@ -530,58 +501,62 @@ static void free_p2m_page(void *p) static bool alloc_p2m(unsigned long pfn) { unsigned topidx, mididx; - unsigned long ***top_p, **mid; unsigned long *top_mfn_p, *mid_mfn; - unsigned long *p2m_orig; + pte_t *ptep, *pte_pg; + unsigned int level; + unsigned long flags; + unsigned long addr = (unsigned long)(xen_p2m_addr + pfn); + unsigned long p2m_pfn; topidx = p2m_top_index(pfn); mididx = p2m_mid_index(pfn); - top_p = &p2m_top[topidx]; - mid = ACCESS_ONCE(*top_p); + ptep = lookup_address(addr, &level); + BUG_ON(!ptep || level != PG_LEVEL_4K); + pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1)); - if (mid == p2m_mid_missing) { - /* Mid level is missing, allocate a new one */ - mid = alloc_p2m_page(); - if (!mid) + if (pte_pg == p2m_missing_pte || pte_pg == p2m_identity_pte) { + /* PMD level is missing, allocate a new one */ + ptep = alloc_p2m_pmd(addr, ptep, pte_pg); + if (!ptep) return false; - - p2m_mid_init(mid, p2m_missing); - - if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) - free_p2m_page(mid); } - top_mfn_p = &p2m_top_mfn[topidx]; - mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]); + if (p2m_top_mfn) { + top_mfn_p = &p2m_top_mfn[topidx]; + mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]); - BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); + BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); - if (mid_mfn == p2m_mid_missing_mfn) { - /* Separately check the mid mfn level */ - unsigned long missing_mfn; - unsigned long mid_mfn_mfn; - unsigned long old_mfn; + if (mid_mfn == p2m_mid_missing_mfn) { + /* Separately check the mid mfn level */ + unsigned long missing_mfn; + unsigned long mid_mfn_mfn; + unsigned long old_mfn; - mid_mfn = alloc_p2m_page(); - if (!mid_mfn) - return false; + mid_mfn = alloc_p2m_page(); + if (!mid_mfn) + return false; - p2m_mid_mfn_init(mid_mfn, p2m_missing); + p2m_mid_mfn_init(mid_mfn, p2m_missing); - missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); - mid_mfn_mfn = virt_to_mfn(mid_mfn); - old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn); - if (old_mfn != missing_mfn) { - free_p2m_page(mid_mfn); - mid_mfn = mfn_to_virt(old_mfn); - } else { - p2m_top_mfn_p[topidx] = mid_mfn; + missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); + mid_mfn_mfn = virt_to_mfn(mid_mfn); + old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn); + if (old_mfn != missing_mfn) { + free_p2m_page(mid_mfn); + mid_mfn = mfn_to_virt(old_mfn); + } else { + p2m_top_mfn_p[topidx] = mid_mfn; + } } + } else { + mid_mfn = NULL; } - p2m_orig = ACCESS_ONCE(p2m_top[topidx][mididx]); - if (p2m_orig == p2m_identity || p2m_orig == p2m_missing) { + p2m_pfn = pte_pfn(ACCESS_ONCE(*ptep)); + if (p2m_pfn == PFN_DOWN(__pa(p2m_identity)) || + p2m_pfn == PFN_DOWN(__pa(p2m_missing))) { /* p2m leaf page is missing */ unsigned long *p2m; @@ -589,183 +564,36 @@ static bool alloc_p2m(unsigned long pfn) if (!p2m) return false; - p2m_init(p2m); - - if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig) - free_p2m_page(p2m); + if (p2m_pfn == PFN_DOWN(__pa(p2m_missing))) + p2m_init(p2m); else - mid_mfn[mididx] = virt_to_mfn(p2m); - } - - return true; -} - -static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary) -{ - unsigned topidx, mididx, idx; - unsigned long *p2m; - - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); - idx = p2m_index(pfn); - - /* Pfff.. No boundary cross-over, lets get out. */ - if (!idx && check_boundary) - return false; - - WARN(p2m_top[topidx][mididx] == p2m_identity, - "P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n", - topidx, mididx); - - /* - * Could be done by xen_build_dynamic_phys_to_machine.. - */ - if (p2m_top[topidx][mididx] != p2m_missing) - return false; - - /* Boundary cross-over for the edges: */ - p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); - - p2m_init(p2m); + p2m_init_identity(p2m, pfn); - p2m_top[topidx][mididx] = p2m; + spin_lock_irqsave(&p2m_update_lock, flags); - return true; -} - -static bool __init early_alloc_p2m_middle(unsigned long pfn) -{ - unsigned topidx = p2m_top_index(pfn); - unsigned long **mid; - - mid = p2m_top[topidx]; - if (mid == p2m_mid_missing) { - mid = extend_brk(PAGE_SIZE, PAGE_SIZE); - - p2m_mid_init(mid, p2m_missing); - - p2m_top[topidx] = mid; - } - return true; -} - -/* - * Skim over the P2M tree looking at pages that are either filled with - * INVALID_P2M_ENTRY or with 1:1 PFNs. If found, re-use that page and - * replace the P2M leaf with a p2m_missing or p2m_identity. - * Stick the old page in the new P2M tree location. - */ -static bool __init early_can_reuse_p2m_middle(unsigned long set_pfn) -{ - unsigned topidx; - unsigned mididx; - unsigned ident_pfns; - unsigned inv_pfns; - unsigned long *p2m; - unsigned idx; - unsigned long pfn; - - /* We only look when this entails a P2M middle layer */ - if (p2m_index(set_pfn)) - return false; - - for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) { - topidx = p2m_top_index(pfn); - - if (!p2m_top[topidx]) - continue; - - if (p2m_top[topidx] == p2m_mid_missing) - continue; - - mididx = p2m_mid_index(pfn); - p2m = p2m_top[topidx][mididx]; - if (!p2m) - continue; - - if ((p2m == p2m_missing) || (p2m == p2m_identity)) - continue; - - if ((unsigned long)p2m == INVALID_P2M_ENTRY) - continue; - - ident_pfns = 0; - inv_pfns = 0; - for (idx = 0; idx < P2M_PER_PAGE; idx++) { - /* IDENTITY_PFNs are 1:1 */ - if (p2m[idx] == IDENTITY_FRAME(pfn + idx)) - ident_pfns++; - else if (p2m[idx] == INVALID_P2M_ENTRY) - inv_pfns++; - else - break; + if (pte_pfn(*ptep) == p2m_pfn) { + set_pte(ptep, + pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL)); + if (mid_mfn) + mid_mfn[mididx] = virt_to_mfn(p2m); + p2m = NULL; } - if ((ident_pfns == P2M_PER_PAGE) || (inv_pfns == P2M_PER_PAGE)) - goto found; - } - return false; -found: - /* Found one, replace old with p2m_identity or p2m_missing */ - p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing); - - /* Reset where we want to stick the old page in. */ - topidx = p2m_top_index(set_pfn); - mididx = p2m_mid_index(set_pfn); - - /* This shouldn't happen */ - if (WARN_ON(p2m_top[topidx] == p2m_mid_missing)) - early_alloc_p2m_middle(set_pfn); - - if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing)) - return false; - - p2m_init(p2m); - p2m_top[topidx][mididx] = p2m; - return true; -} -bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) -{ - if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - if (!early_alloc_p2m_middle(pfn)) - return false; - - if (early_can_reuse_p2m_middle(pfn)) - return __set_phys_to_machine(pfn, mfn); - - if (!early_alloc_p2m(pfn, false /* boundary crossover OK!*/)) - return false; + spin_unlock_irqrestore(&p2m_update_lock, flags); - if (!__set_phys_to_machine(pfn, mfn)) - return false; + if (p2m) + free_p2m_page(p2m); } return true; } -static void __init early_split_p2m(unsigned long pfn) -{ - unsigned long mididx, idx; - - mididx = p2m_mid_index(pfn); - idx = p2m_index(pfn); - - /* - * Allocate new middle and leaf pages if this pfn lies in the - * middle of one. - */ - if (mididx || idx) - early_alloc_p2m_middle(pfn); - if (idx) - early_alloc_p2m(pfn, false); -} - unsigned long __init set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e) { unsigned long pfn; - if (unlikely(pfn_s >= MAX_P2M_PFN)) + if (unlikely(pfn_s >= xen_p2m_size)) return 0; if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) @@ -774,101 +602,51 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s, if (pfn_s > pfn_e) return 0; - if (pfn_e > MAX_P2M_PFN) - pfn_e = MAX_P2M_PFN; - - early_split_p2m(pfn_s); - early_split_p2m(pfn_e); - - for (pfn = pfn_s; pfn < pfn_e;) { - unsigned topidx = p2m_top_index(pfn); - unsigned mididx = p2m_mid_index(pfn); - - if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn))) - break; - pfn++; - - /* - * If the PFN was set to a middle or leaf identity - * page the remainder must also be identity, so skip - * ahead to the next middle or leaf entry. - */ - if (p2m_top[topidx] == p2m_mid_identity) - pfn = ALIGN(pfn, P2M_MID_PER_PAGE * P2M_PER_PAGE); - else if (p2m_top[topidx][mididx] == p2m_identity) - pfn = ALIGN(pfn, P2M_PER_PAGE); - } + if (pfn_e > xen_p2m_size) + pfn_e = xen_p2m_size; - WARN((pfn - pfn_s) != (pfn_e - pfn_s), - "Identity mapping failed. We are %ld short of 1-1 mappings!\n", - (pfn_e - pfn_s) - (pfn - pfn_s)); + for (pfn = pfn_s; pfn < pfn_e; pfn++) + xen_p2m_addr[pfn] = IDENTITY_FRAME(pfn); return pfn - pfn_s; } -/* Try to install p2m mapping; fail if intermediate bits missing */ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) { - unsigned topidx, mididx, idx; + pte_t *ptep; + unsigned int level; /* don't track P2M changes in autotranslate guests */ if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) return true; - if (unlikely(pfn >= MAX_P2M_PFN)) { + if (unlikely(pfn >= xen_p2m_size)) { BUG_ON(mfn != INVALID_P2M_ENTRY); return true; } - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); - idx = p2m_index(pfn); - - /* For sparse holes were the p2m leaf has real PFN along with - * PCI holes, stick in the PFN as the MFN value. - * - * set_phys_range_identity() will have allocated new middle - * and leaf pages as required so an existing p2m_mid_missing - * or p2m_missing mean that whole range will be identity so - * these can be switched to p2m_mid_identity or p2m_identity. - */ - if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) { - if (p2m_top[topidx] == p2m_mid_identity) - return true; - - if (p2m_top[topidx] == p2m_mid_missing) { - WARN_ON(cmpxchg(&p2m_top[topidx], p2m_mid_missing, - p2m_mid_identity) != p2m_mid_missing); - return true; - } - - if (p2m_top[topidx][mididx] == p2m_identity) - return true; + if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn))) + return true; - /* Swap over from MISSING to IDENTITY if needed. */ - if (p2m_top[topidx][mididx] == p2m_missing) { - WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing, - p2m_identity) != p2m_missing); - return true; - } - } + ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn), &level); + BUG_ON(!ptep || level != PG_LEVEL_4K); - if (p2m_top[topidx][mididx] == p2m_missing) + if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_missing))) return mfn == INVALID_P2M_ENTRY; - p2m_top[topidx][mididx][idx] = mfn; + if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_identity))) + return mfn == IDENTITY_FRAME(pfn); - return true; + return false; } bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) { - if (unlikely(!__set_phys_to_machine(pfn, mfn))) { + if (unlikely(!__set_phys_to_machine(pfn, mfn))) { if (!alloc_p2m(pfn)) return false; - if (!__set_phys_to_machine(pfn, mfn)) - return false; + return __set_phys_to_machine(pfn, mfn); } return true; @@ -877,15 +655,16 @@ bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) #define M2P_OVERRIDE_HASH_SHIFT 10 #define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT) -static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH); +static struct list_head *m2p_overrides; static DEFINE_SPINLOCK(m2p_override_lock); static void __init m2p_override_init(void) { unsigned i; - m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH, - sizeof(unsigned long)); + m2p_overrides = alloc_bootmem_align( + sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH, + sizeof(unsigned long)); for (i = 0; i < M2P_OVERRIDE_HASH; i++) INIT_LIST_HEAD(&m2p_overrides[i]); @@ -896,68 +675,9 @@ static unsigned long mfn_hash(unsigned long mfn) return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT); } -int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, - struct gnttab_map_grant_ref *kmap_ops, - struct page **pages, unsigned int count) -{ - int i, ret = 0; - bool lazy = false; - pte_t *pte; - - if (xen_feature(XENFEAT_auto_translated_physmap)) - return 0; - - if (kmap_ops && - !in_interrupt() && - paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { - arch_enter_lazy_mmu_mode(); - lazy = true; - } - - for (i = 0; i < count; i++) { - unsigned long mfn, pfn; - - /* Do not add to override if the map failed. */ - if (map_ops[i].status) - continue; - - if (map_ops[i].flags & GNTMAP_contains_pte) { - pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + - (map_ops[i].host_addr & ~PAGE_MASK)); - mfn = pte_mfn(*pte); - } else { - mfn = PFN_DOWN(map_ops[i].dev_bus_addr); - } - pfn = page_to_pfn(pages[i]); - - WARN_ON(PagePrivate(pages[i])); - SetPagePrivate(pages[i]); - set_page_private(pages[i], mfn); - pages[i]->index = pfn_to_mfn(pfn); - - if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) { - ret = -ENOMEM; - goto out; - } - - if (kmap_ops) { - ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]); - if (ret) - goto out; - } - } - -out: - if (lazy) - arch_leave_lazy_mmu_mode(); - - return ret; -} -EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping); - /* Add an MFN override for a particular page */ -int m2p_add_override(unsigned long mfn, struct page *page, - struct gnttab_map_grant_ref *kmap_op) +static int m2p_add_override(unsigned long mfn, struct page *page, + struct gnttab_map_grant_ref *kmap_op) { unsigned long flags; unsigned long pfn; @@ -970,7 +690,7 @@ int m2p_add_override(unsigned long mfn, struct page *page, address = (unsigned long)__va(pfn << PAGE_SHIFT); ptep = lookup_address(address, &level); if (WARN(ptep == NULL || level != PG_LEVEL_4K, - "m2p_add_override: pfn %lx not mapped", pfn)) + "m2p_add_override: pfn %lx not mapped", pfn)) return -EINVAL; } @@ -1004,19 +724,19 @@ int m2p_add_override(unsigned long mfn, struct page *page, * because mfn_to_pfn (that ends up being called by GUPF) will * return the backend pfn rather than the frontend pfn. */ pfn = mfn_to_pfn_no_overrides(mfn); - if (get_phys_to_machine(pfn) == mfn) + if (__pfn_to_mfn(pfn) == mfn) set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); return 0; } -EXPORT_SYMBOL_GPL(m2p_add_override); -int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, - struct gnttab_map_grant_ref *kmap_ops, - struct page **pages, unsigned int count) +int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, + struct gnttab_map_grant_ref *kmap_ops, + struct page **pages, unsigned int count) { int i, ret = 0; bool lazy = false; + pte_t *pte; if (xen_feature(XENFEAT_auto_translated_physmap)) return 0; @@ -1029,35 +749,75 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, } for (i = 0; i < count; i++) { - unsigned long mfn = get_phys_to_machine(page_to_pfn(pages[i])); - unsigned long pfn = page_to_pfn(pages[i]); + unsigned long mfn, pfn; - if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) { - ret = -EINVAL; - goto out; + /* Do not add to override if the map failed. */ + if (map_ops[i].status) + continue; + + if (map_ops[i].flags & GNTMAP_contains_pte) { + pte = (pte_t *)(mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + + (map_ops[i].host_addr & ~PAGE_MASK)); + mfn = pte_mfn(*pte); + } else { + mfn = PFN_DOWN(map_ops[i].dev_bus_addr); } + pfn = page_to_pfn(pages[i]); - set_page_private(pages[i], INVALID_P2M_ENTRY); - WARN_ON(!PagePrivate(pages[i])); - ClearPagePrivate(pages[i]); - set_phys_to_machine(pfn, pages[i]->index); + WARN_ON(PagePrivate(pages[i])); + SetPagePrivate(pages[i]); + set_page_private(pages[i], mfn); + pages[i]->index = pfn_to_mfn(pfn); - if (kmap_ops) - ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn); - if (ret) + if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) { + ret = -ENOMEM; goto out; + } + + if (kmap_ops) { + ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]); + if (ret) + goto out; + } } out: if (lazy) arch_leave_lazy_mmu_mode(); + return ret; } -EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping); +EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping); -int m2p_remove_override(struct page *page, - struct gnttab_map_grant_ref *kmap_op, - unsigned long mfn) +static struct page *m2p_find_override(unsigned long mfn) +{ + unsigned long flags; + struct list_head *bucket; + struct page *p, *ret; + + if (unlikely(!m2p_overrides)) + return NULL; + + ret = NULL; + bucket = &m2p_overrides[mfn_hash(mfn)]; + + spin_lock_irqsave(&m2p_override_lock, flags); + + list_for_each_entry(p, bucket, lru) { + if (page_private(p) == mfn) { + ret = p; + break; + } + } + + spin_unlock_irqrestore(&m2p_override_lock, flags); + + return ret; +} + +static int m2p_remove_override(struct page *page, + struct gnttab_map_grant_ref *kmap_op, + unsigned long mfn) { unsigned long flags; unsigned long pfn; @@ -1072,7 +832,7 @@ int m2p_remove_override(struct page *page, ptep = lookup_address(address, &level); if (WARN(ptep == NULL || level != PG_LEVEL_4K, - "m2p_remove_override: pfn %lx not mapped", pfn)) + "m2p_remove_override: pfn %lx not mapped", pfn)) return -EINVAL; } @@ -1102,9 +862,8 @@ int m2p_remove_override(struct page *page, * hypercall actually returned an error. */ if (kmap_op->handle == GNTST_general_error) { - printk(KERN_WARNING "m2p_remove_override: " - "pfn %lx mfn %lx, failed to modify kernel mappings", - pfn, mfn); + pr_warn("m2p_remove_override: pfn %lx mfn %lx, failed to modify kernel mappings", + pfn, mfn); put_balloon_scratch_page(); return -1; } @@ -1112,14 +871,14 @@ int m2p_remove_override(struct page *page, xen_mc_batch(); mcs = __xen_mc_entry( - sizeof(struct gnttab_unmap_and_replace)); + sizeof(struct gnttab_unmap_and_replace)); unmap_op = mcs.args; unmap_op->host_addr = kmap_op->host_addr; unmap_op->new_addr = scratch_page_address; unmap_op->handle = kmap_op->handle; MULTI_grant_table_op(mcs.mc, - GNTTABOP_unmap_and_replace, unmap_op, 1); + GNTTABOP_unmap_and_replace, unmap_op, 1); mcs = __xen_mc_entry(0); MULTI_update_va_mapping(mcs.mc, scratch_page_address, @@ -1145,35 +904,56 @@ int m2p_remove_override(struct page *page, * pfn again. */ mfn &= ~FOREIGN_FRAME_BIT; pfn = mfn_to_pfn_no_overrides(mfn); - if (get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) && + if (__pfn_to_mfn(pfn) == FOREIGN_FRAME(mfn) && m2p_find_override(mfn) == NULL) set_phys_to_machine(pfn, mfn); return 0; } -EXPORT_SYMBOL_GPL(m2p_remove_override); -struct page *m2p_find_override(unsigned long mfn) +int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, + struct gnttab_map_grant_ref *kmap_ops, + struct page **pages, unsigned int count) { - unsigned long flags; - struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)]; - struct page *p, *ret; + int i, ret = 0; + bool lazy = false; - ret = NULL; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; - spin_lock_irqsave(&m2p_override_lock, flags); + if (kmap_ops && + !in_interrupt() && + paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { + arch_enter_lazy_mmu_mode(); + lazy = true; + } - list_for_each_entry(p, bucket, lru) { - if (page_private(p) == mfn) { - ret = p; - break; + for (i = 0; i < count; i++) { + unsigned long mfn = __pfn_to_mfn(page_to_pfn(pages[i])); + unsigned long pfn = page_to_pfn(pages[i]); + + if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) { + ret = -EINVAL; + goto out; } - } - spin_unlock_irqrestore(&m2p_override_lock, flags); + set_page_private(pages[i], INVALID_P2M_ENTRY); + WARN_ON(!PagePrivate(pages[i])); + ClearPagePrivate(pages[i]); + set_phys_to_machine(pfn, pages[i]->index); + + if (kmap_ops) + ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn); + if (ret) + goto out; + } +out: + if (lazy) + arch_leave_lazy_mmu_mode(); return ret; } +EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping); unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn) { @@ -1192,79 +972,29 @@ EXPORT_SYMBOL_GPL(m2p_find_override_pfn); #include "debugfs.h" static int p2m_dump_show(struct seq_file *m, void *v) { - static const char * const level_name[] = { "top", "middle", - "entry", "abnormal", "error"}; -#define TYPE_IDENTITY 0 -#define TYPE_MISSING 1 -#define TYPE_PFN 2 -#define TYPE_UNKNOWN 3 static const char * const type_name[] = { - [TYPE_IDENTITY] = "identity", - [TYPE_MISSING] = "missing", - [TYPE_PFN] = "pfn", - [TYPE_UNKNOWN] = "abnormal"}; - unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0; - unsigned int uninitialized_var(prev_level); - unsigned int uninitialized_var(prev_type); - - if (!p2m_top) - return 0; - - for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) { - unsigned topidx = p2m_top_index(pfn); - unsigned mididx = p2m_mid_index(pfn); - unsigned idx = p2m_index(pfn); - unsigned lvl, type; - - lvl = 4; - type = TYPE_UNKNOWN; - if (p2m_top[topidx] == p2m_mid_missing) { - lvl = 0; type = TYPE_MISSING; - } else if (p2m_top[topidx] == NULL) { - lvl = 0; type = TYPE_UNKNOWN; - } else if (p2m_top[topidx][mididx] == NULL) { - lvl = 1; type = TYPE_UNKNOWN; - } else if (p2m_top[topidx][mididx] == p2m_identity) { - lvl = 1; type = TYPE_IDENTITY; - } else if (p2m_top[topidx][mididx] == p2m_missing) { - lvl = 1; type = TYPE_MISSING; - } else if (p2m_top[topidx][mididx][idx] == 0) { - lvl = 2; type = TYPE_UNKNOWN; - } else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) { - lvl = 2; type = TYPE_IDENTITY; - } else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) { - lvl = 2; type = TYPE_MISSING; - } else if (p2m_top[topidx][mididx][idx] == pfn) { - lvl = 2; type = TYPE_PFN; - } else if (p2m_top[topidx][mididx][idx] != pfn) { - lvl = 2; type = TYPE_PFN; - } - if (pfn == 0) { - prev_level = lvl; - prev_type = type; - } - if (pfn == MAX_DOMAIN_PAGES-1) { - lvl = 3; - type = TYPE_UNKNOWN; - } - if (prev_type != type) { - seq_printf(m, " [0x%lx->0x%lx] %s\n", - prev_pfn_type, pfn, type_name[prev_type]); - prev_pfn_type = pfn; + [P2M_TYPE_IDENTITY] = "identity", + [P2M_TYPE_MISSING] = "missing", + [P2M_TYPE_PFN] = "pfn", + [P2M_TYPE_UNKNOWN] = "abnormal"}; + unsigned long pfn, first_pfn; + int type, prev_type; + + prev_type = xen_p2m_elem_type(0); + first_pfn = 0; + + for (pfn = 0; pfn < xen_p2m_size; pfn++) { + type = xen_p2m_elem_type(pfn); + if (type != prev_type) { + seq_printf(m, " [0x%lx->0x%lx] %s\n", first_pfn, pfn, + type_name[prev_type]); prev_type = type; - } - if (prev_level != lvl) { - seq_printf(m, " [0x%lx->0x%lx] level %s\n", - prev_pfn_level, pfn, level_name[prev_level]); - prev_pfn_level = pfn; - prev_level = lvl; + first_pfn = pfn; } } + seq_printf(m, " [0x%lx->0x%lx] %s\n", first_pfn, pfn, + type_name[prev_type]); return 0; -#undef TYPE_IDENTITY -#undef TYPE_MISSING -#undef TYPE_PFN -#undef TYPE_UNKNOWN } static int p2m_dump_open(struct inode *inode, struct file *filp) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 29834b3fd87f..dfd77dec8e2b 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -30,6 +30,7 @@ #include "xen-ops.h" #include "vdso.h" #include "p2m.h" +#include "mmu.h" /* These are code, but not functions. Defined in entry.S */ extern const char xen_hypervisor_callback[]; @@ -47,8 +48,19 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; /* Number of pages released from the initial allocation. */ unsigned long xen_released_pages; -/* Buffer used to remap identity mapped pages */ -unsigned long xen_remap_buf[P2M_PER_PAGE] __initdata; +/* + * Buffer used to remap identity mapped pages. We only need the virtual space. + * The physical page behind this address is remapped as needed to different + * buffer pages. + */ +#define REMAP_SIZE (P2M_PER_PAGE - 3) +static struct { + unsigned long next_area_mfn; + unsigned long target_pfn; + unsigned long size; + unsigned long mfns[REMAP_SIZE]; +} xen_remap_buf __initdata __aligned(PAGE_SIZE); +static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY; /* * The maximum amount of extra memory compared to the base size. The @@ -64,7 +76,6 @@ unsigned long xen_remap_buf[P2M_PER_PAGE] __initdata; static void __init xen_add_extra_mem(u64 start, u64 size) { - unsigned long pfn; int i; for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { @@ -84,75 +95,76 @@ static void __init xen_add_extra_mem(u64 start, u64 size) printk(KERN_WARNING "Warning: not enough extra memory regions\n"); memblock_reserve(start, size); +} - xen_max_p2m_pfn = PFN_DOWN(start + size); - for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) { - unsigned long mfn = pfn_to_mfn(pfn); - - if (WARN_ONCE(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn)) - continue; - WARN_ONCE(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n", - pfn, mfn); +static void __init xen_del_extra_mem(u64 start, u64 size) +{ + int i; + u64 start_r, size_r; - __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { + start_r = xen_extra_mem[i].start; + size_r = xen_extra_mem[i].size; + + /* Start of region. */ + if (start_r == start) { + BUG_ON(size > size_r); + xen_extra_mem[i].start += size; + xen_extra_mem[i].size -= size; + break; + } + /* End of region. */ + if (start_r + size_r == start + size) { + BUG_ON(size > size_r); + xen_extra_mem[i].size -= size; + break; + } + /* Mid of region. */ + if (start > start_r && start < start_r + size_r) { + BUG_ON(start + size > start_r + size_r); + xen_extra_mem[i].size = start - start_r; + /* Calling memblock_reserve() again is okay. */ + xen_add_extra_mem(start + size, start_r + size_r - + (start + size)); + break; + } } + memblock_free(start, size); } -static unsigned long __init xen_do_chunk(unsigned long start, - unsigned long end, bool release) +/* + * Called during boot before the p2m list can take entries beyond the + * hypervisor supplied p2m list. Entries in extra mem are to be regarded as + * invalid. + */ +unsigned long __ref xen_chk_extra_mem(unsigned long pfn) { - struct xen_memory_reservation reservation = { - .address_bits = 0, - .extent_order = 0, - .domid = DOMID_SELF - }; - unsigned long len = 0; - unsigned long pfn; - int ret; + int i; + unsigned long addr = PFN_PHYS(pfn); - for (pfn = start; pfn < end; pfn++) { - unsigned long frame; - unsigned long mfn = pfn_to_mfn(pfn); + for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { + if (addr >= xen_extra_mem[i].start && + addr < xen_extra_mem[i].start + xen_extra_mem[i].size) + return INVALID_P2M_ENTRY; + } - if (release) { - /* Make sure pfn exists to start with */ - if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) - continue; - frame = mfn; - } else { - if (mfn != INVALID_P2M_ENTRY) - continue; - frame = pfn; - } - set_xen_guest_handle(reservation.extent_start, &frame); - reservation.nr_extents = 1; + return IDENTITY_FRAME(pfn); +} - ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap, - &reservation); - WARN(ret != 1, "Failed to %s pfn %lx err=%d\n", - release ? "release" : "populate", pfn, ret); +/* + * Mark all pfns of extra mem as invalid in p2m list. + */ +void __init xen_inv_extra_mem(void) +{ + unsigned long pfn, pfn_s, pfn_e; + int i; - if (ret == 1) { - if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) { - if (release) - break; - set_xen_guest_handle(reservation.extent_start, &frame); - reservation.nr_extents = 1; - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, - &reservation); - break; - } - len++; - } else - break; + for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { + pfn_s = PFN_DOWN(xen_extra_mem[i].start); + pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size); + for (pfn = pfn_s; pfn < pfn_e; pfn++) + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } - if (len) - printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n", - release ? "Freeing" : "Populating", - start, end, len, - release ? "freed" : "added"); - - return len; } /* @@ -198,26 +210,62 @@ static unsigned long __init xen_find_pfn_range( return done; } +static int __init xen_free_mfn(unsigned long mfn) +{ + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + + set_xen_guest_handle(reservation.extent_start, &mfn); + reservation.nr_extents = 1; + + return HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); +} + /* - * This releases a chunk of memory and then does the identity map. It's used as + * This releases a chunk of memory and then does the identity map. It's used * as a fallback if the remapping fails. */ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, unsigned long *identity, unsigned long *released) { + unsigned long len = 0; + unsigned long pfn, end; + int ret; + WARN_ON(start_pfn > end_pfn); + end = min(end_pfn, nr_pages); + for (pfn = start_pfn; pfn < end; pfn++) { + unsigned long mfn = pfn_to_mfn(pfn); + + /* Make sure pfn exists to start with */ + if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) + continue; + + ret = xen_free_mfn(mfn); + WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret); + + if (ret == 1) { + if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY)) + break; + len++; + } else + break; + } + /* Need to release pages first */ - *released += xen_do_chunk(start_pfn, min(end_pfn, nr_pages), true); + *released += len; *identity += set_phys_range_identity(start_pfn, end_pfn); } /* - * Helper function to update both the p2m and m2p tables. + * Helper function to update the p2m and m2p tables and kernel mapping. */ -static unsigned long __init xen_update_mem_tables(unsigned long pfn, - unsigned long mfn) +static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn) { struct mmu_update update = { .ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, @@ -225,161 +273,88 @@ static unsigned long __init xen_update_mem_tables(unsigned long pfn, }; /* Update p2m */ - if (!early_set_phys_to_machine(pfn, mfn)) { + if (!set_phys_to_machine(pfn, mfn)) { WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n", pfn, mfn); - return false; + BUG(); } /* Update m2p */ if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) { WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n", mfn, pfn); - return false; + BUG(); } - return true; + /* Update kernel mapping, but not for highmem. */ + if ((pfn << PAGE_SHIFT) >= __pa(high_memory)) + return; + + if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT), + mfn_pte(mfn, PAGE_KERNEL), 0)) { + WARN(1, "Failed to update kernel mapping for mfn=%ld pfn=%ld\n", + mfn, pfn); + BUG(); + } } /* * This function updates the p2m and m2p tables with an identity map from - * start_pfn to start_pfn+size and remaps the underlying RAM of the original - * allocation at remap_pfn. It must do so carefully in P2M_PER_PAGE sized blocks - * to not exhaust the reserved brk space. Doing it in properly aligned blocks - * ensures we only allocate the minimum required leaf pages in the p2m table. It - * copies the existing mfns from the p2m table under the 1:1 map, overwrites - * them with the identity map and then updates the p2m and m2p tables with the - * remapped memory. + * start_pfn to start_pfn+size and prepares remapping the underlying RAM of the + * original allocation at remap_pfn. The information needed for remapping is + * saved in the memory itself to avoid the need for allocating buffers. The + * complete remap information is contained in a list of MFNs each containing + * up to REMAP_SIZE MFNs and the start target PFN for doing the remap. + * This enables us to preserve the original mfn sequence while doing the + * remapping at a time when the memory management is capable of allocating + * virtual and physical memory in arbitrary amounts, see 'xen_remap_memory' and + * its callers. */ -static unsigned long __init xen_do_set_identity_and_remap_chunk( +static void __init xen_do_set_identity_and_remap_chunk( unsigned long start_pfn, unsigned long size, unsigned long remap_pfn) { + unsigned long buf = (unsigned long)&xen_remap_buf; + unsigned long mfn_save, mfn; unsigned long ident_pfn_iter, remap_pfn_iter; - unsigned long ident_start_pfn_align, remap_start_pfn_align; - unsigned long ident_end_pfn_align, remap_end_pfn_align; - unsigned long ident_boundary_pfn, remap_boundary_pfn; - unsigned long ident_cnt = 0; - unsigned long remap_cnt = 0; + unsigned long ident_end_pfn = start_pfn + size; unsigned long left = size; - unsigned long mod; - int i; + unsigned long ident_cnt = 0; + unsigned int i, chunk; WARN_ON(size == 0); BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); - /* - * Determine the proper alignment to remap memory in P2M_PER_PAGE sized - * blocks. We need to keep track of both the existing pfn mapping and - * the new pfn remapping. - */ - mod = start_pfn % P2M_PER_PAGE; - ident_start_pfn_align = - mod ? (start_pfn - mod + P2M_PER_PAGE) : start_pfn; - mod = remap_pfn % P2M_PER_PAGE; - remap_start_pfn_align = - mod ? (remap_pfn - mod + P2M_PER_PAGE) : remap_pfn; - mod = (start_pfn + size) % P2M_PER_PAGE; - ident_end_pfn_align = start_pfn + size - mod; - mod = (remap_pfn + size) % P2M_PER_PAGE; - remap_end_pfn_align = remap_pfn + size - mod; - - /* Iterate over each p2m leaf node in each range */ - for (ident_pfn_iter = ident_start_pfn_align, remap_pfn_iter = remap_start_pfn_align; - ident_pfn_iter < ident_end_pfn_align && remap_pfn_iter < remap_end_pfn_align; - ident_pfn_iter += P2M_PER_PAGE, remap_pfn_iter += P2M_PER_PAGE) { - /* Check we aren't past the end */ - BUG_ON(ident_pfn_iter + P2M_PER_PAGE > start_pfn + size); - BUG_ON(remap_pfn_iter + P2M_PER_PAGE > remap_pfn + size); - - /* Save p2m mappings */ - for (i = 0; i < P2M_PER_PAGE; i++) - xen_remap_buf[i] = pfn_to_mfn(ident_pfn_iter + i); - - /* Set identity map which will free a p2m leaf */ - ident_cnt += set_phys_range_identity(ident_pfn_iter, - ident_pfn_iter + P2M_PER_PAGE); + mfn_save = virt_to_mfn(buf); -#ifdef DEBUG - /* Helps verify a p2m leaf has been freed */ - for (i = 0; i < P2M_PER_PAGE; i++) { - unsigned int pfn = ident_pfn_iter + i; - BUG_ON(pfn_to_mfn(pfn) != pfn); - } -#endif - /* Now remap memory */ - for (i = 0; i < P2M_PER_PAGE; i++) { - unsigned long mfn = xen_remap_buf[i]; - - /* This will use the p2m leaf freed above */ - if (!xen_update_mem_tables(remap_pfn_iter + i, mfn)) { - WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n", - remap_pfn_iter + i, mfn); - return 0; - } - - remap_cnt++; - } - - left -= P2M_PER_PAGE; - } - - /* Max boundary space possible */ - BUG_ON(left > (P2M_PER_PAGE - 1) * 2); + for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn; + ident_pfn_iter < ident_end_pfn; + ident_pfn_iter += REMAP_SIZE, remap_pfn_iter += REMAP_SIZE) { + chunk = (left < REMAP_SIZE) ? left : REMAP_SIZE; - /* Now handle the boundary conditions */ - ident_boundary_pfn = start_pfn; - remap_boundary_pfn = remap_pfn; - for (i = 0; i < left; i++) { - unsigned long mfn; + /* Map first pfn to xen_remap_buf */ + mfn = pfn_to_mfn(ident_pfn_iter); + set_pte_mfn(buf, mfn, PAGE_KERNEL); - /* These two checks move from the start to end boundaries */ - if (ident_boundary_pfn == ident_start_pfn_align) - ident_boundary_pfn = ident_pfn_iter; - if (remap_boundary_pfn == remap_start_pfn_align) - remap_boundary_pfn = remap_pfn_iter; + /* Save mapping information in page */ + xen_remap_buf.next_area_mfn = xen_remap_mfn; + xen_remap_buf.target_pfn = remap_pfn_iter; + xen_remap_buf.size = chunk; + for (i = 0; i < chunk; i++) + xen_remap_buf.mfns[i] = pfn_to_mfn(ident_pfn_iter + i); - /* Check we aren't past the end */ - BUG_ON(ident_boundary_pfn >= start_pfn + size); - BUG_ON(remap_boundary_pfn >= remap_pfn + size); - - mfn = pfn_to_mfn(ident_boundary_pfn); - - if (!xen_update_mem_tables(remap_boundary_pfn, mfn)) { - WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n", - remap_pfn_iter + i, mfn); - return 0; - } - remap_cnt++; + /* Put remap buf into list. */ + xen_remap_mfn = mfn; - ident_boundary_pfn++; - remap_boundary_pfn++; - } + /* Set identity map */ + ident_cnt += set_phys_range_identity(ident_pfn_iter, + ident_pfn_iter + chunk); - /* Finish up the identity map */ - if (ident_start_pfn_align >= ident_end_pfn_align) { - /* - * In this case we have an identity range which does not span an - * aligned block so everything needs to be identity mapped here. - * If we didn't check this we might remap too many pages since - * the align boundaries are not meaningful in this case. - */ - ident_cnt += set_phys_range_identity(start_pfn, - start_pfn + size); - } else { - /* Remapped above so check each end of the chunk */ - if (start_pfn < ident_start_pfn_align) - ident_cnt += set_phys_range_identity(start_pfn, - ident_start_pfn_align); - if (start_pfn + size > ident_pfn_iter) - ident_cnt += set_phys_range_identity(ident_pfn_iter, - start_pfn + size); + left -= chunk; } - BUG_ON(ident_cnt != size); - BUG_ON(remap_cnt != size); - - return size; + /* Restore old xen_remap_buf mapping */ + set_pte_mfn(buf, mfn_save, PAGE_KERNEL); } /* @@ -396,8 +371,7 @@ static unsigned long __init xen_do_set_identity_and_remap_chunk( static unsigned long __init xen_set_identity_and_remap_chunk( const struct e820entry *list, size_t map_size, unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn, - unsigned long *identity, unsigned long *remapped, - unsigned long *released) + unsigned long *identity, unsigned long *released) { unsigned long pfn; unsigned long i = 0; @@ -431,19 +405,12 @@ static unsigned long __init xen_set_identity_and_remap_chunk( if (size > remap_range_size) size = remap_range_size; - if (!xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn)) { - WARN(1, "Failed to remap 1:1 memory cur_pfn=%ld size=%ld remap_pfn=%ld\n", - cur_pfn, size, remap_pfn); - xen_set_identity_and_release_chunk(cur_pfn, - cur_pfn + left, nr_pages, identity, released); - break; - } + xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn); /* Update variables to reflect new mappings. */ i += size; remap_pfn += size; *identity += size; - *remapped += size; } /* @@ -458,13 +425,12 @@ static unsigned long __init xen_set_identity_and_remap_chunk( return remap_pfn; } -static unsigned long __init xen_set_identity_and_remap( +static void __init xen_set_identity_and_remap( const struct e820entry *list, size_t map_size, unsigned long nr_pages, unsigned long *released) { phys_addr_t start = 0; unsigned long identity = 0; - unsigned long remapped = 0; unsigned long last_pfn = nr_pages; const struct e820entry *entry; unsigned long num_released = 0; @@ -494,8 +460,7 @@ static unsigned long __init xen_set_identity_and_remap( last_pfn = xen_set_identity_and_remap_chunk( list, map_size, start_pfn, end_pfn, nr_pages, last_pfn, - &identity, &remapped, - &num_released); + &identity, &num_released); start = end; } } @@ -503,12 +468,63 @@ static unsigned long __init xen_set_identity_and_remap( *released = num_released; pr_info("Set %ld page(s) to 1-1 mapping\n", identity); - pr_info("Remapped %ld page(s), last_pfn=%ld\n", remapped, - last_pfn); pr_info("Released %ld page(s)\n", num_released); +} + +/* + * Remap the memory prepared in xen_do_set_identity_and_remap_chunk(). + * The remap information (which mfn remap to which pfn) is contained in the + * to be remapped memory itself in a linked list anchored at xen_remap_mfn. + * This scheme allows to remap the different chunks in arbitrary order while + * the resulting mapping will be independant from the order. + */ +void __init xen_remap_memory(void) +{ + unsigned long buf = (unsigned long)&xen_remap_buf; + unsigned long mfn_save, mfn, pfn; + unsigned long remapped = 0; + unsigned int i; + unsigned long pfn_s = ~0UL; + unsigned long len = 0; + + mfn_save = virt_to_mfn(buf); + + while (xen_remap_mfn != INVALID_P2M_ENTRY) { + /* Map the remap information */ + set_pte_mfn(buf, xen_remap_mfn, PAGE_KERNEL); - return last_pfn; + BUG_ON(xen_remap_mfn != xen_remap_buf.mfns[0]); + + pfn = xen_remap_buf.target_pfn; + for (i = 0; i < xen_remap_buf.size; i++) { + mfn = xen_remap_buf.mfns[i]; + xen_update_mem_tables(pfn, mfn); + remapped++; + pfn++; + } + if (pfn_s == ~0UL || pfn == pfn_s) { + pfn_s = xen_remap_buf.target_pfn; + len += xen_remap_buf.size; + } else if (pfn_s + len == xen_remap_buf.target_pfn) { + len += xen_remap_buf.size; + } else { + xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); + pfn_s = xen_remap_buf.target_pfn; + len = xen_remap_buf.size; + } + + mfn = xen_remap_mfn; + xen_remap_mfn = xen_remap_buf.next_area_mfn; + } + + if (pfn_s != ~0UL && len) + xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); + + set_pte_mfn(buf, mfn_save, PAGE_KERNEL); + + pr_info("Remapped %ld page(s)\n", remapped); } + static unsigned long __init xen_get_max_pages(void) { unsigned long max_pages = MAX_DOMAIN_PAGES; @@ -569,7 +585,6 @@ char * __init xen_memory_setup(void) int rc; struct xen_memory_map memmap; unsigned long max_pages; - unsigned long last_pfn = 0; unsigned long extra_pages = 0; int i; int op; @@ -616,17 +631,14 @@ char * __init xen_memory_setup(void) extra_pages += max_pages - max_pfn; /* - * Set identity map on non-RAM pages and remap the underlying RAM. + * Set identity map on non-RAM pages and prepare remapping the + * underlying RAM. */ - last_pfn = xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn, - &xen_released_pages); + xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn, + &xen_released_pages); extra_pages += xen_released_pages; - if (last_pfn > max_pfn) { - max_pfn = min(MAX_DOMAIN_PAGES, last_pfn); - mem_end = PFN_PHYS(max_pfn); - } /* * Clamp the amount of extra memory to a EXTRA_MEM_RATIO * factor the base size. On non-highmem systems, the base @@ -653,6 +665,7 @@ char * __init xen_memory_setup(void) size = min(size, (u64)extra_pages * PAGE_SIZE); extra_pages -= size / PAGE_SIZE; xen_add_extra_mem(addr, size); + xen_max_p2m_pfn = PFN_DOWN(addr + size); } else type = E820_UNUSABLE; } diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 4ab9298c5e17..5686bd9d58cc 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -29,11 +29,13 @@ void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_reserve_top(void); -extern unsigned long xen_max_p2m_pfn; void xen_mm_pin_all(void); void xen_mm_unpin_all(void); +unsigned long __ref xen_chk_extra_mem(unsigned long pfn); +void __init xen_inv_extra_mem(void); +void __init xen_remap_memory(void); char * __init xen_memory_setup(void); char * xen_auto_xlated_memory_setup(void); void __init xen_arch_setup(void); @@ -46,7 +48,7 @@ void xen_hvm_init_shared_info(void); void xen_unplug_emulated_devices(void); void __init xen_build_dynamic_phys_to_machine(void); -unsigned long __init xen_revector_p2m_tree(void); +void __init xen_vmalloc_p2m_tree(void); void xen_init_irq_ops(void); void xen_setup_timer(int cpu); |