summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/crypto/aes_glue.c4
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c10
-rw-r--r--arch/x86/crypto/blowfish_glue.c4
-rw-r--r--arch/x86/crypto/camellia_aesni_avx2_glue.c4
-rw-r--r--arch/x86/crypto/camellia_aesni_avx_glue.c4
-rw-r--r--arch/x86/crypto/camellia_glue.c4
-rw-r--r--arch/x86/crypto/cast5_avx_glue.c2
-rw-r--r--arch/x86/crypto/cast6_avx_glue.c2
-rw-r--r--arch/x86/crypto/crc32-pclmul_glue.c4
-rw-r--r--arch/x86/crypto/crc32c-intel_glue.c4
-rw-r--r--arch/x86/crypto/crct10dif-pclmul_glue.c4
-rw-r--r--arch/x86/crypto/des3_ede_glue.c8
-rw-r--r--arch/x86/crypto/fpu.c3
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c2
-rw-r--r--arch/x86/crypto/salsa20_glue.c4
-rw-r--r--arch/x86/crypto/serpent_avx2_glue.c4
-rw-r--r--arch/x86/crypto/serpent_avx_glue.c2
-rw-r--r--arch/x86/crypto/serpent_sse2_glue.c2
-rw-r--r--arch/x86/crypto/sha-mb/sha1_mb.c3
-rw-r--r--arch/x86/crypto/sha1_ssse3_glue.c2
-rw-r--r--arch/x86/crypto/sha256_ssse3_glue.c6
-rw-r--r--arch/x86/crypto/sha512_ssse3_glue.c6
-rw-r--r--arch/x86/crypto/twofish_avx_glue.c2
-rw-r--r--arch/x86/crypto/twofish_glue.c4
-rw-r--r--arch/x86/crypto/twofish_glue_3way.c4
-rw-r--r--arch/x86/ia32/audit.c1
-rw-r--r--arch/x86/ia32/ia32entry.S1
-rw-r--r--arch/x86/include/asm/barrier.h70
-rw-r--r--arch/x86/include/asm/dma.h2
-rw-r--r--arch/x86/include/asm/hash.h7
-rw-r--r--arch/x86/include/asm/kvm_host.h37
-rw-r--r--arch/x86/include/asm/pgtable_types.h1
-rw-r--r--arch/x86/include/asm/platform_sst_audio.h62
-rw-r--r--arch/x86/include/asm/segment.h30
-rw-r--r--arch/x86/include/asm/vmx.h3
-rw-r--r--arch/x86/include/asm/xen/cpuid.h91
-rw-r--r--arch/x86/include/asm/xen/page-coherent.h4
-rw-r--r--arch/x86/include/asm/xen/page.h71
-rw-r--r--arch/x86/include/asm/xsave.h1
-rw-r--r--arch/x86/include/uapi/asm/vmx.h6
-rw-r--r--arch/x86/kernel/asm-offsets_32.c4
-rw-r--r--arch/x86/kernel/asm-offsets_64.c4
-rw-r--r--arch/x86/kernel/audit_64.c1
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_iommu.c5
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_uncore.c6
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_rapl.c6
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c6
-rw-r--r--arch/x86/kernel/e820.c4
-rw-r--r--arch/x86/kernel/early-quirks.c23
-rw-r--r--arch/x86/kernel/entry_32.S6
-rw-r--r--arch/x86/kernel/entry_64.S28
-rw-r--r--arch/x86/kernel/kvm.c9
-rw-r--r--arch/x86/kernel/kvmclock.c20
-rw-r--r--arch/x86/kernel/process_64.c101
-rw-r--r--arch/x86/kernel/tls.c45
-rw-r--r--arch/x86/kernel/xsave.c1
-rw-r--r--arch/x86/kvm/Makefile7
-rw-r--r--arch/x86/kvm/assigned-dev.c1052
-rw-r--r--arch/x86/kvm/assigned-dev.h32
-rw-r--r--arch/x86/kvm/cpuid.c57
-rw-r--r--arch/x86/kvm/emulate.c408
-rw-r--r--arch/x86/kvm/ioapic.c675
-rw-r--r--arch/x86/kvm/ioapic.h119
-rw-r--r--arch/x86/kvm/iommu.c353
-rw-r--r--arch/x86/kvm/irq_comm.c332
-rw-r--r--arch/x86/kvm/lapic.c210
-rw-r--r--arch/x86/kvm/lapic.h14
-rw-r--r--arch/x86/kvm/mmu.c7
-rw-r--r--arch/x86/kvm/svm.c24
-rw-r--r--arch/x86/kvm/trace.h37
-rw-r--r--arch/x86/kvm/vmx.c608
-rw-r--r--arch/x86/kvm/x86.c226
-rw-r--r--arch/x86/kvm/x86.h3
-rw-r--r--arch/x86/lib/Makefile2
-rw-r--r--arch/x86/lib/hash.c92
-rw-r--r--arch/x86/mm/fault.c64
-rw-r--r--arch/x86/mm/init.c4
-rw-r--r--arch/x86/mm/pageattr.c22
-rw-r--r--arch/x86/net/bpf_jit_comp.c34
-rw-r--r--arch/x86/pci/xen.c31
-rw-r--r--arch/x86/platform/iris/iris.c1
-rw-r--r--arch/x86/platform/olpc/olpc-xo1-pm.c2
-rw-r--r--arch/x86/syscalls/syscall_32.tbl1
-rw-r--r--arch/x86/syscalls/syscall_64.tbl2
-rw-r--r--arch/x86/um/asm/barrier.h20
-rw-r--r--arch/x86/um/sys_call_table_64.c1
-rw-r--r--arch/x86/xen/mmu.c40
-rw-r--r--arch/x86/xen/p2m.c1172
-rw-r--r--arch/x86/xen/setup.c441
-rw-r--r--arch/x86/xen/xen-ops.h6
91 files changed, 4894 insertions, 1960 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bea3a0159496..d69f1cd87fd9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -24,6 +24,7 @@ config X86
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
select ARCH_HAS_FAST_MULTIPLIER
+ select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select HAVE_AOUT if X86_32
diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c
index aafe8ce0d65d..e26984f7ab8d 100644
--- a/arch/x86/crypto/aes_glue.c
+++ b/arch/x86/crypto/aes_glue.c
@@ -66,5 +66,5 @@ module_exit(aes_fini);
MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, asm optimized");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("aes");
-MODULE_ALIAS("aes-asm");
+MODULE_ALIAS_CRYPTO("aes");
+MODULE_ALIAS_CRYPTO("aes-asm");
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 888950f29fd9..ae855f4f64b7 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -43,10 +43,6 @@
#include <asm/crypto/glue_helper.h>
#endif
-#if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE)
-#define HAS_PCBC
-#endif
-
/* This data is stored at the end of the crypto_tfm struct.
* It's a type of per "session" data storage location.
* This needs to be 16 byte aligned.
@@ -547,7 +543,7 @@ static int ablk_ctr_init(struct crypto_tfm *tfm)
#endif
-#ifdef HAS_PCBC
+#if IS_ENABLED(CONFIG_CRYPTO_PCBC)
static int ablk_pcbc_init(struct crypto_tfm *tfm)
{
return ablk_init_common(tfm, "fpu(pcbc(__driver-aes-aesni))");
@@ -1377,7 +1373,7 @@ static struct crypto_alg aesni_algs[] = { {
},
},
#endif
-#ifdef HAS_PCBC
+#if IS_ENABLED(CONFIG_CRYPTO_PCBC)
}, {
.cra_name = "pcbc(aes)",
.cra_driver_name = "pcbc-aes-aesni",
@@ -1550,4 +1546,4 @@ module_exit(aesni_exit);
MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, Intel AES-NI instructions optimized");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("aes");
+MODULE_ALIAS_CRYPTO("aes");
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index 8af519ed73d1..17c05531dfd1 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -478,5 +478,5 @@ module_exit(fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Blowfish Cipher Algorithm, asm optimized");
-MODULE_ALIAS("blowfish");
-MODULE_ALIAS("blowfish-asm");
+MODULE_ALIAS_CRYPTO("blowfish");
+MODULE_ALIAS_CRYPTO("blowfish-asm");
diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
index 4209a76fcdaa..9a07fafe3831 100644
--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -582,5 +582,5 @@ module_exit(camellia_aesni_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX2 optimized");
-MODULE_ALIAS("camellia");
-MODULE_ALIAS("camellia-asm");
+MODULE_ALIAS_CRYPTO("camellia");
+MODULE_ALIAS_CRYPTO("camellia-asm");
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index 87a041a10f4a..ed38d959add6 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -574,5 +574,5 @@ module_exit(camellia_aesni_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX optimized");
-MODULE_ALIAS("camellia");
-MODULE_ALIAS("camellia-asm");
+MODULE_ALIAS_CRYPTO("camellia");
+MODULE_ALIAS_CRYPTO("camellia-asm");
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index c171dcbf192d..5c8b6266a394 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -1725,5 +1725,5 @@ module_exit(fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Camellia Cipher Algorithm, asm optimized");
-MODULE_ALIAS("camellia");
-MODULE_ALIAS("camellia-asm");
+MODULE_ALIAS_CRYPTO("camellia");
+MODULE_ALIAS_CRYPTO("camellia-asm");
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index e57e20ab5e0b..60ada677a928 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -491,4 +491,4 @@ module_exit(cast5_exit);
MODULE_DESCRIPTION("Cast5 Cipher Algorithm, AVX optimized");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("cast5");
+MODULE_ALIAS_CRYPTO("cast5");
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 09f3677393e4..0160f68a57ff 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -611,4 +611,4 @@ module_exit(cast6_exit);
MODULE_DESCRIPTION("Cast6 Cipher Algorithm, AVX optimized");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("cast6");
+MODULE_ALIAS_CRYPTO("cast6");
diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c
index 9d014a74ef96..1937fc1d8763 100644
--- a/arch/x86/crypto/crc32-pclmul_glue.c
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -197,5 +197,5 @@ module_exit(crc32_pclmul_mod_fini);
MODULE_AUTHOR("Alexander Boyko <alexander_boyko@xyratex.com>");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("crc32");
-MODULE_ALIAS("crc32-pclmul");
+MODULE_ALIAS_CRYPTO("crc32");
+MODULE_ALIAS_CRYPTO("crc32-pclmul");
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index 6812ad98355c..28640c3d6af7 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -280,5 +280,5 @@ MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.c
MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware.");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("crc32c");
-MODULE_ALIAS("crc32c-intel");
+MODULE_ALIAS_CRYPTO("crc32c");
+MODULE_ALIAS_CRYPTO("crc32c-intel");
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c
index 7845d7fd54c0..b6c67bf30fdf 100644
--- a/arch/x86/crypto/crct10dif-pclmul_glue.c
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -147,5 +147,5 @@ MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>");
MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ.");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("crct10dif");
-MODULE_ALIAS("crct10dif-pclmul");
+MODULE_ALIAS_CRYPTO("crct10dif");
+MODULE_ALIAS_CRYPTO("crct10dif-pclmul");
diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c
index 0e9c0668fe4e..38a14f818ef1 100644
--- a/arch/x86/crypto/des3_ede_glue.c
+++ b/arch/x86/crypto/des3_ede_glue.c
@@ -502,8 +502,8 @@ module_exit(des3_ede_x86_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Triple DES EDE Cipher Algorithm, asm optimized");
-MODULE_ALIAS("des3_ede");
-MODULE_ALIAS("des3_ede-asm");
-MODULE_ALIAS("des");
-MODULE_ALIAS("des-asm");
+MODULE_ALIAS_CRYPTO("des3_ede");
+MODULE_ALIAS_CRYPTO("des3_ede-asm");
+MODULE_ALIAS_CRYPTO("des");
+MODULE_ALIAS_CRYPTO("des-asm");
MODULE_AUTHOR("Jussi Kivilinna <jussi.kivilinna@iki.fi>");
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
index 98d7a188f46b..f368ba261739 100644
--- a/arch/x86/crypto/fpu.c
+++ b/arch/x86/crypto/fpu.c
@@ -17,6 +17,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/crypto.h>
#include <asm/i387.h>
struct crypto_fpu_ctx {
@@ -159,3 +160,5 @@ void __exit crypto_fpu_exit(void)
{
crypto_unregister_template(&crypto_fpu_tmpl);
}
+
+MODULE_ALIAS_CRYPTO("fpu");
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 88bb7ba8b175..8253d85aa165 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -341,4 +341,4 @@ module_exit(ghash_pclmulqdqni_mod_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("GHASH Message Digest Algorithm, "
"acclerated by PCLMULQDQ-NI");
-MODULE_ALIAS("ghash");
+MODULE_ALIAS_CRYPTO("ghash");
diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c
index 5e8e67739bb5..399a29d067d6 100644
--- a/arch/x86/crypto/salsa20_glue.c
+++ b/arch/x86/crypto/salsa20_glue.c
@@ -119,5 +119,5 @@ module_exit(fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm (optimized assembly version)");
-MODULE_ALIAS("salsa20");
-MODULE_ALIAS("salsa20-asm");
+MODULE_ALIAS_CRYPTO("salsa20");
+MODULE_ALIAS_CRYPTO("salsa20-asm");
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
index 2fae489b1524..437e47a4d302 100644
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -558,5 +558,5 @@ module_exit(fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized");
-MODULE_ALIAS("serpent");
-MODULE_ALIAS("serpent-asm");
+MODULE_ALIAS_CRYPTO("serpent");
+MODULE_ALIAS_CRYPTO("serpent-asm");
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index ff4870870972..7e217398b4eb 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -617,4 +617,4 @@ module_exit(serpent_exit);
MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX optimized");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("serpent");
+MODULE_ALIAS_CRYPTO("serpent");
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 8c95f8637306..bf025adaea01 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -618,4 +618,4 @@ module_exit(serpent_sse2_exit);
MODULE_DESCRIPTION("Serpent Cipher Algorithm, SSE2 optimized");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("serpent");
+MODULE_ALIAS_CRYPTO("serpent");
diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c
index 99eefd812958..a225a5ca1037 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb.c
@@ -204,8 +204,7 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, str
continue;
}
- if (ctx)
- ctx->status = HASH_CTX_STS_IDLE;
+ ctx->status = HASH_CTX_STS_IDLE;
return ctx;
}
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 74d16ef707c7..6c20fe04a738 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -278,4 +278,4 @@ module_exit(sha1_ssse3_mod_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated");
-MODULE_ALIAS("sha1");
+MODULE_ALIAS_CRYPTO("sha1");
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index f248546da1ca..8fad72f4dfd2 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -211,7 +211,7 @@ static int sha224_ssse3_final(struct shash_desc *desc, u8 *hash)
sha256_ssse3_final(desc, D);
memcpy(hash, D, SHA224_DIGEST_SIZE);
- memset(D, 0, SHA256_DIGEST_SIZE);
+ memzero_explicit(D, SHA256_DIGEST_SIZE);
return 0;
}
@@ -318,5 +318,5 @@ module_exit(sha256_ssse3_mod_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated");
-MODULE_ALIAS("sha256");
-MODULE_ALIAS("sha224");
+MODULE_ALIAS_CRYPTO("sha256");
+MODULE_ALIAS_CRYPTO("sha224");
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index 8626b03e83b7..0b6af26832bf 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -219,7 +219,7 @@ static int sha384_ssse3_final(struct shash_desc *desc, u8 *hash)
sha512_ssse3_final(desc, D);
memcpy(hash, D, SHA384_DIGEST_SIZE);
- memset(D, 0, SHA512_DIGEST_SIZE);
+ memzero_explicit(D, SHA512_DIGEST_SIZE);
return 0;
}
@@ -326,5 +326,5 @@ module_exit(sha512_ssse3_mod_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated");
-MODULE_ALIAS("sha512");
-MODULE_ALIAS("sha384");
+MODULE_ALIAS_CRYPTO("sha512");
+MODULE_ALIAS_CRYPTO("sha384");
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index 4e3c665be129..1ac531ea9bcc 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -579,4 +579,4 @@ module_exit(twofish_exit);
MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX optimized");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("twofish");
+MODULE_ALIAS_CRYPTO("twofish");
diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c
index 0a5202303501..77e06c2da83d 100644
--- a/arch/x86/crypto/twofish_glue.c
+++ b/arch/x86/crypto/twofish_glue.c
@@ -96,5 +96,5 @@ module_exit(fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION ("Twofish Cipher Algorithm, asm optimized");
-MODULE_ALIAS("twofish");
-MODULE_ALIAS("twofish-asm");
+MODULE_ALIAS_CRYPTO("twofish");
+MODULE_ALIAS_CRYPTO("twofish-asm");
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 13e63b3e1dfb..56d8a08ee479 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -495,5 +495,5 @@ module_exit(fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Twofish Cipher Algorithm, 3-way parallel asm optimized");
-MODULE_ALIAS("twofish");
-MODULE_ALIAS("twofish-asm");
+MODULE_ALIAS_CRYPTO("twofish");
+MODULE_ALIAS_CRYPTO("twofish-asm");
diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c
index 5d7b381da692..2eccc8932ae6 100644
--- a/arch/x86/ia32/audit.c
+++ b/arch/x86/ia32/audit.c
@@ -35,6 +35,7 @@ int ia32_classify_syscall(unsigned syscall)
case __NR_socketcall:
return 4;
case __NR_execve:
+ case __NR_execveat:
return 5;
default:
return 1;
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index ffe71228fc10..82e8a1d44658 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -480,6 +480,7 @@ GLOBAL(\label)
PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
PTREGSCALL stub32_sigreturn, sys32_sigreturn
PTREGSCALL stub32_execve, compat_sys_execve
+ PTREGSCALL stub32_execveat, compat_sys_execveat
PTREGSCALL stub32_fork, sys_fork
PTREGSCALL stub32_vfork, sys_vfork
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 0f4460b5636d..2ab1eb33106e 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -24,78 +24,28 @@
#define wmb() asm volatile("sfence" ::: "memory")
#endif
-/**
- * read_barrier_depends - Flush all pending reads that subsequents reads
- * depend on.
- *
- * No data-dependent reads from memory-like regions are ever reordered
- * over this barrier. All reads preceding this primitive are guaranteed
- * to access memory (but not necessarily other CPUs' caches) before any
- * reads following this primitive that depend on the data return by
- * any of the preceding reads. This primitive is much lighter weight than
- * rmb() on most CPUs, and is never heavier weight than is
- * rmb().
- *
- * These ordering constraints are respected by both the local CPU
- * and the compiler.
- *
- * Ordering is not guaranteed by anything other than these primitives,
- * not even by data dependencies. See the documentation for
- * memory_barrier() for examples and URLs to more information.
- *
- * For example, the following code would force ordering (the initial
- * value of "a" is zero, "b" is one, and "p" is "&a"):
- *
- * <programlisting>
- * CPU 0 CPU 1
- *
- * b = 2;
- * memory_barrier();
- * p = &b; q = p;
- * read_barrier_depends();
- * d = *q;
- * </programlisting>
- *
- * because the read of "*q" depends on the read of "p" and these
- * two reads are separated by a read_barrier_depends(). However,
- * the following code, with the same initial values for "a" and "b":
- *
- * <programlisting>
- * CPU 0 CPU 1
- *
- * a = 2;
- * memory_barrier();
- * b = 3; y = b;
- * read_barrier_depends();
- * x = a;
- * </programlisting>
- *
- * does not enforce ordering, since there is no data dependency between
- * the read of "a" and the read of "b". Therefore, on some CPUs, such
- * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
- * in cases like this where there are no data dependencies.
- **/
-
-#define read_barrier_depends() do { } while (0)
-
-#ifdef CONFIG_SMP
-#define smp_mb() mb()
#ifdef CONFIG_X86_PPRO_FENCE
-# define smp_rmb() rmb()
+#define dma_rmb() rmb()
#else
-# define smp_rmb() barrier()
+#define dma_rmb() barrier()
#endif
+#define dma_wmb() barrier()
+
+#ifdef CONFIG_SMP
+#define smp_mb() mb()
+#define smp_rmb() dma_rmb()
#define smp_wmb() barrier()
-#define smp_read_barrier_depends() read_barrier_depends()
#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
#else /* !SMP */
#define smp_mb() barrier()
#define smp_rmb() barrier()
#define smp_wmb() barrier()
-#define smp_read_barrier_depends() do { } while (0)
#define set_mb(var, value) do { var = value; barrier(); } while (0)
#endif /* SMP */
+#define read_barrier_depends() do { } while (0)
+#define smp_read_barrier_depends() do { } while (0)
+
#if defined(CONFIG_X86_PPRO_FENCE)
/*
diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h
index 0bdb0c54d9a1..fe884e18fa6e 100644
--- a/arch/x86/include/asm/dma.h
+++ b/arch/x86/include/asm/dma.h
@@ -70,7 +70,7 @@
#define MAX_DMA_CHANNELS 8
/* 16MB ISA DMA zone */
-#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT)
+#define MAX_DMA_PFN ((16UL * 1024 * 1024) >> PAGE_SHIFT)
/* 4GB broken PCI/AGP hardware bus master zone */
#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)
diff --git a/arch/x86/include/asm/hash.h b/arch/x86/include/asm/hash.h
deleted file mode 100644
index e8c58f88b1d4..000000000000
--- a/arch/x86/include/asm/hash.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _ASM_X86_HASH_H
-#define _ASM_X86_HASH_H
-
-struct fast_hash_ops;
-extern void setup_arch_fast_hash(struct fast_hash_ops *ops);
-
-#endif /* _ASM_X86_HASH_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6ed0c30d6a0c..d89c6b828c96 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -33,7 +33,7 @@
#define KVM_MAX_VCPUS 255
#define KVM_SOFT_MAX_VCPUS 160
-#define KVM_USER_MEM_SLOTS 125
+#define KVM_USER_MEM_SLOTS 509
/* memory slots that are not exposed to userspace */
#define KVM_PRIVATE_MEM_SLOTS 3
#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
@@ -51,6 +51,7 @@
| X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
#define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL
+#define CR3_PCID_INVD (1UL << 63)
#define CR4_RESERVED_BITS \
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
| X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
@@ -361,6 +362,7 @@ struct kvm_vcpu_arch {
int mp_state;
u64 ia32_misc_enable_msr;
bool tpr_access_reporting;
+ u64 ia32_xss;
/*
* Paging state of the vcpu
@@ -542,7 +544,7 @@ struct kvm_apic_map {
struct rcu_head rcu;
u8 ldr_bits;
/* fields bellow are used to decode ldr values in different modes */
- u32 cid_shift, cid_mask, lid_mask;
+ u32 cid_shift, cid_mask, lid_mask, broadcast;
struct kvm_lapic *phys_map[256];
/* first index is cluster id second is cpu id in a cluster */
struct kvm_lapic *logical_map[16][16];
@@ -602,6 +604,9 @@ struct kvm_arch {
struct kvm_xen_hvm_config xen_hvm_config;
+ /* reads protected by irq_srcu, writes by irq_lock */
+ struct hlist_head mask_notifier_list;
+
/* fields used by HYPER-V emulation */
u64 hv_guest_os_id;
u64 hv_hypercall;
@@ -659,6 +664,16 @@ struct msr_data {
u64 data;
};
+struct kvm_lapic_irq {
+ u32 vector;
+ u32 delivery_mode;
+ u32 dest_mode;
+ u32 level;
+ u32 trig_mode;
+ u32 shorthand;
+ u32 dest_id;
+};
+
struct kvm_x86_ops {
int (*cpu_has_kvm_support)(void); /* __init */
int (*disabled_by_bios)(void); /* __init */
@@ -767,6 +782,7 @@ struct kvm_x86_ops {
enum x86_intercept_stage stage);
void (*handle_external_intr)(struct kvm_vcpu *vcpu);
bool (*mpx_supported)(void);
+ bool (*xsaves_supported)(void);
int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
@@ -818,6 +834,19 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes);
u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
+struct kvm_irq_mask_notifier {
+ void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
+ int irq;
+ struct hlist_node link;
+};
+
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask);
+
extern bool tdp_enabled;
u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
@@ -863,7 +892,7 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
-void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector);
+void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
int reason, bool has_error_code, u32 error_code);
@@ -895,6 +924,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gfn_t gfn, void *data, int offset, int len,
u32 access);
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
+bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr);
static inline int __kvm_irq_line_state(unsigned long *irq_state,
int irq_source_id, int level)
@@ -1066,6 +1096,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
void kvm_define_shared_msr(unsigned index, u32 msr);
int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
+unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu);
bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index af447f95e3be..25bcd4a89517 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -452,6 +452,7 @@ static inline void update_page_count(int level, unsigned long pages) { }
extern pte_t *lookup_address(unsigned long address, unsigned int *level);
extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
unsigned int *level);
+extern pmd_t *lookup_pmd_address(unsigned long address);
extern phys_addr_t slow_virt_to_phys(void *__address);
extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
unsigned numpages, unsigned long page_flags);
diff --git a/arch/x86/include/asm/platform_sst_audio.h b/arch/x86/include/asm/platform_sst_audio.h
index 0a4e140315b6..7249e6d0902d 100644
--- a/arch/x86/include/asm/platform_sst_audio.h
+++ b/arch/x86/include/asm/platform_sst_audio.h
@@ -16,6 +16,9 @@
#include <linux/sfi.h>
+#define MAX_NUM_STREAMS_MRFLD 25
+#define MAX_NUM_STREAMS MAX_NUM_STREAMS_MRFLD
+
enum sst_audio_task_id_mrfld {
SST_TASK_ID_NONE = 0,
SST_TASK_ID_SBA = 1,
@@ -73,6 +76,65 @@ struct sst_platform_data {
unsigned int strm_map_size;
};
+struct sst_info {
+ u32 iram_start;
+ u32 iram_end;
+ bool iram_use;
+ u32 dram_start;
+ u32 dram_end;
+ bool dram_use;
+ u32 imr_start;
+ u32 imr_end;
+ bool imr_use;
+ u32 mailbox_start;
+ bool use_elf;
+ bool lpe_viewpt_rqd;
+ unsigned int max_streams;
+ u32 dma_max_len;
+ u8 num_probes;
+};
+
+struct sst_lib_dnld_info {
+ unsigned int mod_base;
+ unsigned int mod_end;
+ unsigned int mod_table_offset;
+ unsigned int mod_table_size;
+ bool mod_ddr_dnld;
+};
+
+struct sst_res_info {
+ unsigned int shim_offset;
+ unsigned int shim_size;
+ unsigned int shim_phy_addr;
+ unsigned int ssp0_offset;
+ unsigned int ssp0_size;
+ unsigned int dma0_offset;
+ unsigned int dma0_size;
+ unsigned int dma1_offset;
+ unsigned int dma1_size;
+ unsigned int iram_offset;
+ unsigned int iram_size;
+ unsigned int dram_offset;
+ unsigned int dram_size;
+ unsigned int mbox_offset;
+ unsigned int mbox_size;
+ unsigned int acpi_lpe_res_index;
+ unsigned int acpi_ddr_index;
+ unsigned int acpi_ipc_irq_index;
+};
+
+struct sst_ipc_info {
+ int ipc_offset;
+ unsigned int mbox_recv_off;
+};
+
+struct sst_platform_info {
+ const struct sst_info *probe_data;
+ const struct sst_ipc_info *ipc_info;
+ const struct sst_res_info *res_info;
+ const struct sst_lib_dnld_info *lib_info;
+ const char *platform;
+};
int add_sst_platform_device(void);
#endif
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 6f1c3a8a33ab..db257a58571f 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -23,6 +23,15 @@
#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2)
#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8)
+#define SEGMENT_RPL_MASK 0x3 /*
+ * Bottom two bits of selector give the ring
+ * privilege level
+ */
+#define SEGMENT_TI_MASK 0x4 /* Bit 2 is table indicator (LDT/GDT) */
+#define USER_RPL 0x3 /* User mode is privilege level 3 */
+#define SEGMENT_LDT 0x4 /* LDT segment has TI set... */
+#define SEGMENT_GDT 0x0 /* ... GDT has it cleared */
+
#ifdef CONFIG_X86_32
/*
* The layout of the per-CPU GDT under Linux:
@@ -125,16 +134,6 @@
#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
-/* Bottom two bits of selector give the ring privilege level */
-#define SEGMENT_RPL_MASK 0x3
-/* Bit 2 is table indicator (LDT/GDT) */
-#define SEGMENT_TI_MASK 0x4
-
-/* User mode is privilege level 3 */
-#define USER_RPL 0x3
-/* LDT segment has TI set, GDT has it cleared */
-#define SEGMENT_LDT 0x4
-#define SEGMENT_GDT 0x0
/*
* Matching rules for certain types of segments.
@@ -192,17 +191,6 @@
#define get_kernel_rpl() 0
#endif
-/* User mode is privilege level 3 */
-#define USER_RPL 0x3
-/* LDT segment has TI set, GDT has it cleared */
-#define SEGMENT_LDT 0x4
-#define SEGMENT_GDT 0x0
-
-/* Bottom two bits of selector give the ring privilege level */
-#define SEGMENT_RPL_MASK 0x3
-/* Bit 2 is table indicator (LDT/GDT) */
-#define SEGMENT_TI_MASK 0x4
-
#define IDT_ENTRIES 256
#define NUM_EXCEPTION_VECTORS 32
/* Bitmask of exception vectors which push an error code on the stack */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index bcbfade26d8d..45afaee9555c 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -69,6 +69,7 @@
#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
+#define SECONDARY_EXEC_XSAVES 0x00100000
#define PIN_BASED_EXT_INTR_MASK 0x00000001
@@ -159,6 +160,8 @@ enum vmcs_field {
EOI_EXIT_BITMAP3_HIGH = 0x00002023,
VMREAD_BITMAP = 0x00002026,
VMWRITE_BITMAP = 0x00002028,
+ XSS_EXIT_BITMAP = 0x0000202C,
+ XSS_EXIT_BITMAP_HIGH = 0x0000202D,
GUEST_PHYSICAL_ADDRESS = 0x00002400,
GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
VMCS_LINK_POINTER = 0x00002800,
diff --git a/arch/x86/include/asm/xen/cpuid.h b/arch/x86/include/asm/xen/cpuid.h
new file mode 100644
index 000000000000..0d809e9fc975
--- /dev/null
+++ b/arch/x86/include/asm/xen/cpuid.h
@@ -0,0 +1,91 @@
+/******************************************************************************
+ * arch-x86/cpuid.h
+ *
+ * CPUID interface to Xen.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2007 Citrix Systems, Inc.
+ *
+ * Authors:
+ * Keir Fraser <keir@xen.org>
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_CPUID_H__
+#define __XEN_PUBLIC_ARCH_X86_CPUID_H__
+
+/*
+ * For compatibility with other hypervisor interfaces, the Xen cpuid leaves
+ * can be found at the first otherwise unused 0x100 aligned boundary starting
+ * from 0x40000000.
+ *
+ * e.g If viridian extensions are enabled for an HVM domain, the Xen cpuid
+ * leaves will start at 0x40000100
+ */
+
+#define XEN_CPUID_FIRST_LEAF 0x40000000
+#define XEN_CPUID_LEAF(i) (XEN_CPUID_FIRST_LEAF + (i))
+
+/*
+ * Leaf 1 (0x40000x00)
+ * EAX: Largest Xen-information leaf. All leaves up to an including @EAX
+ * are supported by the Xen host.
+ * EBX-EDX: "XenVMMXenVMM" signature, allowing positive identification
+ * of a Xen host.
+ */
+#define XEN_CPUID_SIGNATURE_EBX 0x566e6558 /* "XenV" */
+#define XEN_CPUID_SIGNATURE_ECX 0x65584d4d /* "MMXe" */
+#define XEN_CPUID_SIGNATURE_EDX 0x4d4d566e /* "nVMM" */
+
+/*
+ * Leaf 2 (0x40000x01)
+ * EAX[31:16]: Xen major version.
+ * EAX[15: 0]: Xen minor version.
+ * EBX-EDX: Reserved (currently all zeroes).
+ */
+
+/*
+ * Leaf 3 (0x40000x02)
+ * EAX: Number of hypercall transfer pages. This register is always guaranteed
+ * to specify one hypercall page.
+ * EBX: Base address of Xen-specific MSRs.
+ * ECX: Features 1. Unused bits are set to zero.
+ * EDX: Features 2. Unused bits are set to zero.
+ */
+
+/* Does the host support MMU_PT_UPDATE_PRESERVE_AD for this guest? */
+#define _XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD 0
+#define XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD (1u<<0)
+
+/*
+ * Leaf 5 (0x40000x04)
+ * HVM-specific features
+ */
+
+/* EAX Features */
+/* Virtualized APIC registers */
+#define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0)
+/* Virtualized x2APIC accesses */
+#define XEN_HVM_CPUID_X2APIC_VIRT (1u << 1)
+/* Memory mapped from other domains has valid IOMMU entries */
+#define XEN_HVM_CPUID_IOMMU_MAPPINGS (1u << 2)
+
+#define XEN_CPUID_MAX_NUM_LEAVES 4
+
+#endif /* __XEN_PUBLIC_ARCH_X86_CPUID_H__ */
diff --git a/arch/x86/include/asm/xen/page-coherent.h b/arch/x86/include/asm/xen/page-coherent.h
index 7f02fe4e2c7b..acd844c017d3 100644
--- a/arch/x86/include/asm/xen/page-coherent.h
+++ b/arch/x86/include/asm/xen/page-coherent.h
@@ -22,8 +22,8 @@ static inline void xen_free_coherent_pages(struct device *hwdev, size_t size,
}
static inline void xen_dma_map_page(struct device *hwdev, struct page *page,
- unsigned long offset, size_t size, enum dma_data_direction dir,
- struct dma_attrs *attrs) { }
+ dma_addr_t dev_addr, unsigned long offset, size_t size,
+ enum dma_data_direction dir, struct dma_attrs *attrs) { }
static inline void xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle,
size_t size, enum dma_data_direction dir,
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index c949923a5668..5eea09915a15 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -41,10 +41,12 @@ typedef struct xpaddr {
extern unsigned long *machine_to_phys_mapping;
extern unsigned long machine_to_phys_nr;
+extern unsigned long *xen_p2m_addr;
+extern unsigned long xen_p2m_size;
+extern unsigned long xen_max_p2m_pfn;
extern unsigned long get_phys_to_machine(unsigned long pfn);
extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
-extern bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn);
extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
extern unsigned long set_phys_range_identity(unsigned long pfn_s,
unsigned long pfn_e);
@@ -52,17 +54,52 @@ extern unsigned long set_phys_range_identity(unsigned long pfn_s,
extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
struct gnttab_map_grant_ref *kmap_ops,
struct page **pages, unsigned int count);
-extern int m2p_add_override(unsigned long mfn, struct page *page,
- struct gnttab_map_grant_ref *kmap_op);
extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
struct gnttab_map_grant_ref *kmap_ops,
struct page **pages, unsigned int count);
-extern int m2p_remove_override(struct page *page,
- struct gnttab_map_grant_ref *kmap_op,
- unsigned long mfn);
-extern struct page *m2p_find_override(unsigned long mfn);
extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
+/*
+ * Helper functions to write or read unsigned long values to/from
+ * memory, when the access may fault.
+ */
+static inline int xen_safe_write_ulong(unsigned long *addr, unsigned long val)
+{
+ return __put_user(val, (unsigned long __user *)addr);
+}
+
+static inline int xen_safe_read_ulong(unsigned long *addr, unsigned long *val)
+{
+ return __get_user(*val, (unsigned long __user *)addr);
+}
+
+/*
+ * When to use pfn_to_mfn(), __pfn_to_mfn() or get_phys_to_machine():
+ * - pfn_to_mfn() returns either INVALID_P2M_ENTRY or the mfn. No indicator
+ * bits (identity or foreign) are set.
+ * - __pfn_to_mfn() returns the found entry of the p2m table. A possibly set
+ * identity or foreign indicator will be still set. __pfn_to_mfn() is
+ * encapsulating get_phys_to_machine() which is called in special cases only.
+ * - get_phys_to_machine() is to be called by __pfn_to_mfn() only in special
+ * cases needing an extended handling.
+ */
+static inline unsigned long __pfn_to_mfn(unsigned long pfn)
+{
+ unsigned long mfn;
+
+ if (pfn < xen_p2m_size)
+ mfn = xen_p2m_addr[pfn];
+ else if (unlikely(pfn < xen_max_p2m_pfn))
+ return get_phys_to_machine(pfn);
+ else
+ return IDENTITY_FRAME(pfn);
+
+ if (unlikely(mfn == INVALID_P2M_ENTRY))
+ return get_phys_to_machine(pfn);
+
+ return mfn;
+}
+
static inline unsigned long pfn_to_mfn(unsigned long pfn)
{
unsigned long mfn;
@@ -70,7 +107,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn)
if (xen_feature(XENFEAT_auto_translated_physmap))
return pfn;
- mfn = get_phys_to_machine(pfn);
+ mfn = __pfn_to_mfn(pfn);
if (mfn != INVALID_P2M_ENTRY)
mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
@@ -83,7 +120,7 @@ static inline int phys_to_machine_mapping_valid(unsigned long pfn)
if (xen_feature(XENFEAT_auto_translated_physmap))
return 1;
- return get_phys_to_machine(pfn) != INVALID_P2M_ENTRY;
+ return __pfn_to_mfn(pfn) != INVALID_P2M_ENTRY;
}
static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn)
@@ -102,7 +139,7 @@ static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn)
* In such cases it doesn't matter what we return (we return garbage),
* but we must handle the fault without crashing!
*/
- ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
+ ret = xen_safe_read_ulong(&machine_to_phys_mapping[mfn], &pfn);
if (ret < 0)
return ~0;
@@ -117,7 +154,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
return mfn;
pfn = mfn_to_pfn_no_overrides(mfn);
- if (get_phys_to_machine(pfn) != mfn) {
+ if (__pfn_to_mfn(pfn) != mfn) {
/*
* If this appears to be a foreign mfn (because the pfn
* doesn't map back to the mfn), then check the local override
@@ -133,8 +170,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
* entry doesn't map back to the mfn and m2p_override doesn't have a
* valid entry for it.
*/
- if (pfn == ~0 &&
- get_phys_to_machine(mfn) == IDENTITY_FRAME(mfn))
+ if (pfn == ~0 && __pfn_to_mfn(mfn) == IDENTITY_FRAME(mfn))
pfn = mfn;
return pfn;
@@ -180,7 +216,7 @@ static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
return mfn;
pfn = mfn_to_pfn(mfn);
- if (get_phys_to_machine(pfn) != mfn)
+ if (__pfn_to_mfn(pfn) != mfn)
return -1; /* force !pfn_valid() */
return pfn;
}
@@ -236,4 +272,11 @@ void make_lowmem_page_readwrite(void *vaddr);
#define xen_remap(cookie, size) ioremap((cookie), (size));
#define xen_unmap(cookie) iounmap((cookie))
+static inline bool xen_arch_need_swiotlb(struct device *dev,
+ unsigned long pfn,
+ unsigned long mfn)
+{
+ return false;
+}
+
#endif /* _ASM_X86_XEN_PAGE_H */
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 7e7a79ada658..5fa9770035dc 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -16,6 +16,7 @@
#define XSTATE_Hi16_ZMM 0x80
#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE)
+#define XSTATE_AVX512 (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
/* Bit 63 of XCR0 is reserved for future expansion */
#define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63)))
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 990a2fe1588d..b813bf9da1e2 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -72,6 +72,8 @@
#define EXIT_REASON_XSETBV 55
#define EXIT_REASON_APIC_WRITE 56
#define EXIT_REASON_INVPCID 58
+#define EXIT_REASON_XSAVES 63
+#define EXIT_REASON_XRSTORS 64
#define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -116,6 +118,8 @@
{ EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
{ EXIT_REASON_INVD, "INVD" }, \
{ EXIT_REASON_INVVPID, "INVVPID" }, \
- { EXIT_REASON_INVPCID, "INVPCID" }
+ { EXIT_REASON_INVPCID, "INVPCID" }, \
+ { EXIT_REASON_XSAVES, "XSAVES" }, \
+ { EXIT_REASON_XRSTORS, "XRSTORS" }
#endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index d67c4be3e8b1..3b3b9d33ac1d 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -1,3 +1,7 @@
+#ifndef __LINUX_KBUILD_H
+# error "Please do not build this file directly, build asm-offsets.c instead"
+#endif
+
#include <asm/ucontext.h>
#include <linux/lguest.h>
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 4f9359f36bb7..fdcbb4d27c9f 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -1,3 +1,7 @@
+#ifndef __LINUX_KBUILD_H
+# error "Please do not build this file directly, build asm-offsets.c instead"
+#endif
+
#include <asm/ia32.h>
#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c
index 06d3e5a14d9d..f3672508b249 100644
--- a/arch/x86/kernel/audit_64.c
+++ b/arch/x86/kernel/audit_64.c
@@ -50,6 +50,7 @@ int audit_classify_syscall(int abi, unsigned syscall)
case __NR_openat:
return 3;
case __NR_execve:
+ case __NR_execveat:
return 5;
default:
return 0;
diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/kernel/cpu/perf_event_amd_iommu.c
index 639d1289b1ba..97242a9242bd 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_iommu.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_iommu.c
@@ -130,10 +130,7 @@ static ssize_t _iommu_cpumask_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &iommu_cpumask);
- buf[n++] = '\n';
- buf[n] = '\0';
- return n;
+ return cpumap_print_to_pagebuf(true, buf, &iommu_cpumask);
}
static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
index 30790d798e6b..cc6cedb8f25d 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
@@ -219,7 +219,6 @@ static ssize_t amd_uncore_attr_show_cpumask(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- int n;
cpumask_t *active_mask;
struct pmu *pmu = dev_get_drvdata(dev);
@@ -230,10 +229,7 @@ static ssize_t amd_uncore_attr_show_cpumask(struct device *dev,
else
return 0;
- n = cpulist_scnprintf(buf, PAGE_SIZE - 2, active_mask);
- buf[n++] = '\n';
- buf[n] = '\0';
- return n;
+ return cpumap_print_to_pagebuf(true, buf, active_mask);
}
static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index d64f275fe274..673f930c700f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -365,11 +365,7 @@ static void rapl_pmu_event_read(struct perf_event *event)
static ssize_t rapl_get_attr_cpumask(struct device *dev,
struct device_attribute *attr, char *buf)
{
- int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask);
-
- buf[n++] = '\n';
- buf[n] = '\0';
- return n;
+ return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
}
static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 9762dbd9f3f7..08f3fed2b0f2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -647,11 +647,7 @@ static int uncore_pmu_event_init(struct perf_event *event)
static ssize_t uncore_get_attr_cpumask(struct device *dev,
struct device_attribute *attr, char *buf)
{
- int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &uncore_cpu_mask);
-
- buf[n++] = '\n';
- buf[n] = '\0';
- return n;
+ return cpumap_print_to_pagebuf(true, buf, &uncore_cpu_mask);
}
static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 49f886481615..dd2f07ae9d0c 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1114,8 +1114,8 @@ void __init memblock_find_dma_reserve(void)
* at first, and assume boot_mem will not take below MAX_DMA_PFN
*/
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
- start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN);
- end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN);
+ start_pfn = min(start_pfn, MAX_DMA_PFN);
+ end_pfn = min(end_pfn, MAX_DMA_PFN);
nr_pages += end_pfn - start_pfn;
}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 2e1a6853e00c..fe9f0b79a18b 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -455,6 +455,23 @@ struct intel_stolen_funcs {
u32 (*base)(int num, int slot, int func, size_t size);
};
+static size_t __init gen9_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+
+ gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+ gmch_ctrl >>= BDW_GMCH_GMS_SHIFT;
+ gmch_ctrl &= BDW_GMCH_GMS_MASK;
+
+ if (gmch_ctrl < 0xf0)
+ return gmch_ctrl << 25; /* 32 MB units */
+ else
+ /* 4MB increments starting at 0xf0 for 4MB */
+ return (gmch_ctrl - 0xf0 + 1) << 22;
+}
+
+typedef size_t (*stolen_size_fn)(int num, int slot, int func);
+
static const struct intel_stolen_funcs i830_stolen_funcs __initconst = {
.base = i830_stolen_base,
.size = i830_stolen_size,
@@ -490,6 +507,11 @@ static const struct intel_stolen_funcs gen8_stolen_funcs __initconst = {
.size = gen8_stolen_size,
};
+static const struct intel_stolen_funcs gen9_stolen_funcs __initconst = {
+ .base = intel_stolen_base,
+ .size = gen9_stolen_size,
+};
+
static const struct intel_stolen_funcs chv_stolen_funcs __initconst = {
.base = intel_stolen_base,
.size = chv_stolen_size,
@@ -523,6 +545,7 @@ static const struct pci_device_id intel_stolen_ids[] __initconst = {
INTEL_BDW_M_IDS(&gen8_stolen_funcs),
INTEL_BDW_D_IDS(&gen8_stolen_funcs),
INTEL_CHV_IDS(&chv_stolen_funcs),
+ INTEL_SKL_IDS(&gen9_stolen_funcs),
};
static void __init intel_graphics_stolen(int num, int slot, int func)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 344b63f18d14..1cf7c97ff175 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1191,10 +1191,10 @@ ENTRY(ftrace_graph_caller)
pushl %eax
pushl %ecx
pushl %edx
- movl 0xc(%esp), %edx
- lea 0x4(%ebp), %eax
+ movl 0xc(%esp), %eax
+ lea 0x4(%ebp), %edx
movl (%ebp), %ecx
- subl $MCOUNT_INSN_SIZE, %edx
+ subl $MCOUNT_INSN_SIZE, %eax
call prepare_ftrace_return
popl %edx
popl %ecx
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c0226ab54106..90878aa38dbd 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -652,6 +652,20 @@ ENTRY(stub_execve)
CFI_ENDPROC
END(stub_execve)
+ENTRY(stub_execveat)
+ CFI_STARTPROC
+ addq $8, %rsp
+ PARTIAL_FRAME 0
+ SAVE_REST
+ FIXUP_TOP_OF_STACK %r11
+ call sys_execveat
+ RESTORE_TOP_OF_STACK %r11
+ movq %rax,RAX(%rsp)
+ RESTORE_REST
+ jmp int_ret_from_sys_call
+ CFI_ENDPROC
+END(stub_execveat)
+
/*
* sigreturn is special because it needs to restore all registers on return.
* This cannot be done with SYSRET, so use the IRET return path instead.
@@ -697,6 +711,20 @@ ENTRY(stub_x32_execve)
CFI_ENDPROC
END(stub_x32_execve)
+ENTRY(stub_x32_execveat)
+ CFI_STARTPROC
+ addq $8, %rsp
+ PARTIAL_FRAME 0
+ SAVE_REST
+ FIXUP_TOP_OF_STACK %r11
+ call compat_sys_execveat
+ RESTORE_TOP_OF_STACK %r11
+ movq %rax,RAX(%rsp)
+ RESTORE_REST
+ jmp int_ret_from_sys_call
+ CFI_ENDPROC
+END(stub_x32_execveat)
+
#endif
/*
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index f6945bef2cd1..94f643484300 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -283,7 +283,14 @@ NOKPROBE_SYMBOL(do_async_page_fault);
static void __init paravirt_ops_setup(void)
{
pv_info.name = "KVM";
- pv_info.paravirt_enabled = 1;
+
+ /*
+ * KVM isn't paravirt in the sense of paravirt_enabled. A KVM
+ * guest kernel works like a bare metal kernel with additional
+ * features, and paravirt_enabled is about features that are
+ * missing.
+ */
+ pv_info.paravirt_enabled = 0;
if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
pv_cpu_ops.io_delay = kvm_io_delay;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index d9156ceecdff..42caaef897c8 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -59,13 +59,12 @@ static void kvm_get_wallclock(struct timespec *now)
native_write_msr(msr_kvm_wall_clock, low, high);
- preempt_disable();
- cpu = smp_processor_id();
+ cpu = get_cpu();
vcpu_time = &hv_clock[cpu].pvti;
pvclock_read_wallclock(&wall_clock, vcpu_time, now);
- preempt_enable();
+ put_cpu();
}
static int kvm_set_wallclock(const struct timespec *now)
@@ -107,11 +106,10 @@ static unsigned long kvm_get_tsc_khz(void)
int cpu;
unsigned long tsc_khz;
- preempt_disable();
- cpu = smp_processor_id();
+ cpu = get_cpu();
src = &hv_clock[cpu].pvti;
tsc_khz = pvclock_tsc_khz(src);
- preempt_enable();
+ put_cpu();
return tsc_khz;
}
@@ -263,7 +261,6 @@ void __init kvmclock_init(void)
#endif
kvm_get_preset_lpj();
clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
- pv_info.paravirt_enabled = 1;
pv_info.name = "KVM";
if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
@@ -284,23 +281,22 @@ int __init kvm_setup_vsyscall_timeinfo(void)
size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
- preempt_disable();
- cpu = smp_processor_id();
+ cpu = get_cpu();
vcpu_time = &hv_clock[cpu].pvti;
flags = pvclock_read_flags(vcpu_time);
if (!(flags & PVCLOCK_TSC_STABLE_BIT)) {
- preempt_enable();
+ put_cpu();
return 1;
}
if ((ret = pvclock_init_vsyscall(hv_clock, size))) {
- preempt_enable();
+ put_cpu();
return ret;
}
- preempt_enable();
+ put_cpu();
kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
#endif
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3ed4a68d4013..5a2c02913af3 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -283,24 +283,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
fpu = switch_fpu_prepare(prev_p, next_p, cpu);
- /*
- * Reload esp0, LDT and the page table pointer:
- */
+ /* Reload esp0 and ss1. */
load_sp0(tss, next);
- /*
- * Switch DS and ES.
- * This won't pick up thread selector changes, but I guess that is ok.
- */
- savesegment(es, prev->es);
- if (unlikely(next->es | prev->es))
- loadsegment(es, next->es);
-
- savesegment(ds, prev->ds);
- if (unlikely(next->ds | prev->ds))
- loadsegment(ds, next->ds);
-
-
/* We must save %fs and %gs before load_TLS() because
* %fs and %gs may be cleared by load_TLS().
*
@@ -309,41 +294,101 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
savesegment(fs, fsindex);
savesegment(gs, gsindex);
+ /*
+ * Load TLS before restoring any segments so that segment loads
+ * reference the correct GDT entries.
+ */
load_TLS(next, cpu);
/*
- * Leave lazy mode, flushing any hypercalls made here.
- * This must be done before restoring TLS segments so
- * the GDT and LDT are properly updated, and must be
- * done before math_state_restore, so the TS bit is up
- * to date.
+ * Leave lazy mode, flushing any hypercalls made here. This
+ * must be done after loading TLS entries in the GDT but before
+ * loading segments that might reference them, and and it must
+ * be done before math_state_restore, so the TS bit is up to
+ * date.
*/
arch_end_context_switch(next_p);
+ /* Switch DS and ES.
+ *
+ * Reading them only returns the selectors, but writing them (if
+ * nonzero) loads the full descriptor from the GDT or LDT. The
+ * LDT for next is loaded in switch_mm, and the GDT is loaded
+ * above.
+ *
+ * We therefore need to write new values to the segment
+ * registers on every context switch unless both the new and old
+ * values are zero.
+ *
+ * Note that we don't need to do anything for CS and SS, as
+ * those are saved and restored as part of pt_regs.
+ */
+ savesegment(es, prev->es);
+ if (unlikely(next->es | prev->es))
+ loadsegment(es, next->es);
+
+ savesegment(ds, prev->ds);
+ if (unlikely(next->ds | prev->ds))
+ loadsegment(ds, next->ds);
+
/*
* Switch FS and GS.
*
- * Segment register != 0 always requires a reload. Also
- * reload when it has changed. When prev process used 64bit
- * base always reload to avoid an information leak.
+ * These are even more complicated than FS and GS: they have
+ * 64-bit bases are that controlled by arch_prctl. Those bases
+ * only differ from the values in the GDT or LDT if the selector
+ * is 0.
+ *
+ * Loading the segment register resets the hidden base part of
+ * the register to 0 or the value from the GDT / LDT. If the
+ * next base address zero, writing 0 to the segment register is
+ * much faster than using wrmsr to explicitly zero the base.
+ *
+ * The thread_struct.fs and thread_struct.gs values are 0
+ * if the fs and gs bases respectively are not overridden
+ * from the values implied by fsindex and gsindex. They
+ * are nonzero, and store the nonzero base addresses, if
+ * the bases are overridden.
+ *
+ * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should
+ * be impossible.
+ *
+ * Therefore we need to reload the segment registers if either
+ * the old or new selector is nonzero, and we need to override
+ * the base address if next thread expects it to be overridden.
+ *
+ * This code is unnecessarily slow in the case where the old and
+ * new indexes are zero and the new base is nonzero -- it will
+ * unnecessarily write 0 to the selector before writing the new
+ * base address.
+ *
+ * Note: This all depends on arch_prctl being the only way that
+ * user code can override the segment base. Once wrfsbase and
+ * wrgsbase are enabled, most of this code will need to change.
*/
if (unlikely(fsindex | next->fsindex | prev->fs)) {
loadsegment(fs, next->fsindex);
+
/*
- * Check if the user used a selector != 0; if yes
- * clear 64bit base, since overloaded base is always
- * mapped to the Null selector
+ * If user code wrote a nonzero value to FS, then it also
+ * cleared the overridden base address.
+ *
+ * XXX: if user code wrote 0 to FS and cleared the base
+ * address itself, we won't notice and we'll incorrectly
+ * restore the prior base address next time we reschdule
+ * the process.
*/
if (fsindex)
prev->fs = 0;
}
- /* when next process has a 64bit base use it */
if (next->fs)
wrmsrl(MSR_FS_BASE, next->fs);
prev->fsindex = fsindex;
if (unlikely(gsindex | next->gsindex | prev->gs)) {
load_gs_index(next->gsindex);
+
+ /* This works (and fails) the same way as fsindex above. */
if (gsindex)
prev->gs = 0;
}
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index f7fec09e3e3a..3e551eee87b9 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -27,6 +27,43 @@ static int get_free_idx(void)
return -ESRCH;
}
+static bool tls_desc_okay(const struct user_desc *info)
+{
+ if (LDT_empty(info))
+ return true;
+
+ /*
+ * espfix is required for 16-bit data segments, but espfix
+ * only works for LDT segments.
+ */
+ if (!info->seg_32bit)
+ return false;
+
+ /* Only allow data segments in the TLS array. */
+ if (info->contents > 1)
+ return false;
+
+ /*
+ * Non-present segments with DPL 3 present an interesting attack
+ * surface. The kernel should handle such segments correctly,
+ * but TLS is very difficult to protect in a sandbox, so prevent
+ * such segments from being created.
+ *
+ * If userspace needs to remove a TLS entry, it can still delete
+ * it outright.
+ */
+ if (info->seg_not_present)
+ return false;
+
+#ifdef CONFIG_X86_64
+ /* The L bit makes no sense for data. */
+ if (info->lm)
+ return false;
+#endif
+
+ return true;
+}
+
static void set_tls_desc(struct task_struct *p, int idx,
const struct user_desc *info, int n)
{
@@ -66,6 +103,9 @@ int do_set_thread_area(struct task_struct *p, int idx,
if (copy_from_user(&info, u_info, sizeof(info)))
return -EFAULT;
+ if (!tls_desc_okay(&info))
+ return -EINVAL;
+
if (idx == -1)
idx = info.entry_number;
@@ -192,6 +232,7 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
{
struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES];
const struct user_desc *info;
+ int i;
if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
(pos % sizeof(struct user_desc)) != 0 ||
@@ -205,6 +246,10 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
else
info = infobuf;
+ for (i = 0; i < count / sizeof(struct user_desc); i++)
+ if (!tls_desc_okay(info + i))
+ return -EINVAL;
+
set_tls_desc(target,
GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)),
info, count / sizeof(struct user_desc));
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 4c540c4719d8..0de1fae2bdf0 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -738,3 +738,4 @@ void *get_xsave_addr(struct xsave_struct *xsave, int xstate)
return (void *)xsave + xstate_comp_offsets[feature];
}
+EXPORT_SYMBOL_GPL(get_xsave_addr);
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 25d22b2d6509..08f790dfadc9 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,14 +7,13 @@ CFLAGS_vmx.o := -I.
KVM := ../../../virt/kvm
-kvm-y += $(KVM)/kvm_main.o $(KVM)/ioapic.o \
- $(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \
+kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
-kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(KVM)/assigned-dev.o $(KVM)/iommu.o
kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
- i8254.o cpuid.o pmu.o
+ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o
+kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o
kvm-intel-y += vmx.o
kvm-amd-y += svm.o
diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c
new file mode 100644
index 000000000000..6eb5c20ee373
--- /dev/null
+++ b/arch/x86/kvm/assigned-dev.c
@@ -0,0 +1,1052 @@
+/*
+ * Kernel-based Virtual Machine - device assignment support
+ *
+ * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/namei.h>
+#include <linux/fs.h>
+#include "irq.h"
+#include "assigned-dev.h"
+
+struct kvm_assigned_dev_kernel {
+ struct kvm_irq_ack_notifier ack_notifier;
+ struct list_head list;
+ int assigned_dev_id;
+ int host_segnr;
+ int host_busnr;
+ int host_devfn;
+ unsigned int entries_nr;
+ int host_irq;
+ bool host_irq_disabled;
+ bool pci_2_3;
+ struct msix_entry *host_msix_entries;
+ int guest_irq;
+ struct msix_entry *guest_msix_entries;
+ unsigned long irq_requested_type;
+ int irq_source_id;
+ int flags;
+ struct pci_dev *dev;
+ struct kvm *kvm;
+ spinlock_t intx_lock;
+ spinlock_t intx_mask_lock;
+ char irq_name[32];
+ struct pci_saved_state *pci_saved_state;
+};
+
+static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
+ int assigned_dev_id)
+{
+ struct list_head *ptr;
+ struct kvm_assigned_dev_kernel *match;
+
+ list_for_each(ptr, head) {
+ match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
+ if (match->assigned_dev_id == assigned_dev_id)
+ return match;
+ }
+ return NULL;
+}
+
+static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
+ *assigned_dev, int irq)
+{
+ int i, index;
+ struct msix_entry *host_msix_entries;
+
+ host_msix_entries = assigned_dev->host_msix_entries;
+
+ index = -1;
+ for (i = 0; i < assigned_dev->entries_nr; i++)
+ if (irq == host_msix_entries[i].vector) {
+ index = i;
+ break;
+ }
+ if (index < 0)
+ printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
+
+ return index;
+}
+
+static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+ int ret;
+
+ spin_lock(&assigned_dev->intx_lock);
+ if (pci_check_and_mask_intx(assigned_dev->dev)) {
+ assigned_dev->host_irq_disabled = true;
+ ret = IRQ_WAKE_THREAD;
+ } else
+ ret = IRQ_NONE;
+ spin_unlock(&assigned_dev->intx_lock);
+
+ return ret;
+}
+
+static void
+kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
+ int vector)
+{
+ if (unlikely(assigned_dev->irq_requested_type &
+ KVM_DEV_IRQ_GUEST_INTX)) {
+ spin_lock(&assigned_dev->intx_mask_lock);
+ if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
+ kvm_set_irq(assigned_dev->kvm,
+ assigned_dev->irq_source_id, vector, 1,
+ false);
+ spin_unlock(&assigned_dev->intx_mask_lock);
+ } else
+ kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+ vector, 1, false);
+}
+
+static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+
+ if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
+ spin_lock_irq(&assigned_dev->intx_lock);
+ disable_irq_nosync(irq);
+ assigned_dev->host_irq_disabled = true;
+ spin_unlock_irq(&assigned_dev->intx_lock);
+ }
+
+ kvm_assigned_dev_raise_guest_irq(assigned_dev,
+ assigned_dev->guest_irq);
+
+ return IRQ_HANDLED;
+}
+
+#ifdef __KVM_HAVE_MSI
+static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+ int ret = kvm_set_irq_inatomic(assigned_dev->kvm,
+ assigned_dev->irq_source_id,
+ assigned_dev->guest_irq, 1);
+ return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
+}
+
+static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+
+ kvm_assigned_dev_raise_guest_irq(assigned_dev,
+ assigned_dev->guest_irq);
+
+ return IRQ_HANDLED;
+}
+#endif
+
+#ifdef __KVM_HAVE_MSIX
+static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+ int index = find_index_from_host_irq(assigned_dev, irq);
+ u32 vector;
+ int ret = 0;
+
+ if (index >= 0) {
+ vector = assigned_dev->guest_msix_entries[index].vector;
+ ret = kvm_set_irq_inatomic(assigned_dev->kvm,
+ assigned_dev->irq_source_id,
+ vector, 1);
+ }
+
+ return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
+}
+
+static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+ int index = find_index_from_host_irq(assigned_dev, irq);
+ u32 vector;
+
+ if (index >= 0) {
+ vector = assigned_dev->guest_msix_entries[index].vector;
+ kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
+ }
+
+ return IRQ_HANDLED;
+}
+#endif
+
+/* Ack the irq line for an assigned device */
+static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+ struct kvm_assigned_dev_kernel *dev =
+ container_of(kian, struct kvm_assigned_dev_kernel,
+ ack_notifier);
+
+ kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false);
+
+ spin_lock(&dev->intx_mask_lock);
+
+ if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
+ bool reassert = false;
+
+ spin_lock_irq(&dev->intx_lock);
+ /*
+ * The guest IRQ may be shared so this ack can come from an
+ * IRQ for another guest device.
+ */
+ if (dev->host_irq_disabled) {
+ if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
+ enable_irq(dev->host_irq);
+ else if (!pci_check_and_unmask_intx(dev->dev))
+ reassert = true;
+ dev->host_irq_disabled = reassert;
+ }
+ spin_unlock_irq(&dev->intx_lock);
+
+ if (reassert)
+ kvm_set_irq(dev->kvm, dev->irq_source_id,
+ dev->guest_irq, 1, false);
+ }
+
+ spin_unlock(&dev->intx_mask_lock);
+}
+
+static void deassign_guest_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev)
+{
+ if (assigned_dev->ack_notifier.gsi != -1)
+ kvm_unregister_irq_ack_notifier(kvm,
+ &assigned_dev->ack_notifier);
+
+ kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+ assigned_dev->guest_irq, 0, false);
+
+ if (assigned_dev->irq_source_id != -1)
+ kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
+ assigned_dev->irq_source_id = -1;
+ assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
+}
+
+/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */
+static void deassign_host_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev)
+{
+ /*
+ * We disable irq here to prevent further events.
+ *
+ * Notice this maybe result in nested disable if the interrupt type is
+ * INTx, but it's OK for we are going to free it.
+ *
+ * If this function is a part of VM destroy, please ensure that till
+ * now, the kvm state is still legal for probably we also have to wait
+ * on a currently running IRQ handler.
+ */
+ if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
+ int i;
+ for (i = 0; i < assigned_dev->entries_nr; i++)
+ disable_irq(assigned_dev->host_msix_entries[i].vector);
+
+ for (i = 0; i < assigned_dev->entries_nr; i++)
+ free_irq(assigned_dev->host_msix_entries[i].vector,
+ assigned_dev);
+
+ assigned_dev->entries_nr = 0;
+ kfree(assigned_dev->host_msix_entries);
+ kfree(assigned_dev->guest_msix_entries);
+ pci_disable_msix(assigned_dev->dev);
+ } else {
+ /* Deal with MSI and INTx */
+ if ((assigned_dev->irq_requested_type &
+ KVM_DEV_IRQ_HOST_INTX) &&
+ (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
+ spin_lock_irq(&assigned_dev->intx_lock);
+ pci_intx(assigned_dev->dev, false);
+ spin_unlock_irq(&assigned_dev->intx_lock);
+ synchronize_irq(assigned_dev->host_irq);
+ } else
+ disable_irq(assigned_dev->host_irq);
+
+ free_irq(assigned_dev->host_irq, assigned_dev);
+
+ if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
+ pci_disable_msi(assigned_dev->dev);
+ }
+
+ assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
+}
+
+static int kvm_deassign_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev,
+ unsigned long irq_requested_type)
+{
+ unsigned long guest_irq_type, host_irq_type;
+
+ if (!irqchip_in_kernel(kvm))
+ return -EINVAL;
+ /* no irq assignment to deassign */
+ if (!assigned_dev->irq_requested_type)
+ return -ENXIO;
+
+ host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
+ guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;
+
+ if (host_irq_type)
+ deassign_host_irq(kvm, assigned_dev);
+ if (guest_irq_type)
+ deassign_guest_irq(kvm, assigned_dev);
+
+ return 0;
+}
+
+static void kvm_free_assigned_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev)
+{
+ kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
+}
+
+static void kvm_free_assigned_device(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel
+ *assigned_dev)
+{
+ kvm_free_assigned_irq(kvm, assigned_dev);
+
+ pci_reset_function(assigned_dev->dev);
+ if (pci_load_and_free_saved_state(assigned_dev->dev,
+ &assigned_dev->pci_saved_state))
+ printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
+ __func__, dev_name(&assigned_dev->dev->dev));
+ else
+ pci_restore_state(assigned_dev->dev);
+
+ pci_clear_dev_assigned(assigned_dev->dev);
+
+ pci_release_regions(assigned_dev->dev);
+ pci_disable_device(assigned_dev->dev);
+ pci_dev_put(assigned_dev->dev);
+
+ list_del(&assigned_dev->list);
+ kfree(assigned_dev);
+}
+
+void kvm_free_all_assigned_devices(struct kvm *kvm)
+{
+ struct list_head *ptr, *ptr2;
+ struct kvm_assigned_dev_kernel *assigned_dev;
+
+ list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
+ assigned_dev = list_entry(ptr,
+ struct kvm_assigned_dev_kernel,
+ list);
+
+ kvm_free_assigned_device(kvm, assigned_dev);
+ }
+}
+
+static int assigned_device_enable_host_intx(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev)
+{
+ irq_handler_t irq_handler;
+ unsigned long flags;
+
+ dev->host_irq = dev->dev->irq;
+
+ /*
+ * We can only share the IRQ line with other host devices if we are
+ * able to disable the IRQ source at device-level - independently of
+ * the guest driver. Otherwise host devices may suffer from unbounded
+ * IRQ latencies when the guest keeps the line asserted.
+ */
+ if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
+ irq_handler = kvm_assigned_dev_intx;
+ flags = IRQF_SHARED;
+ } else {
+ irq_handler = NULL;
+ flags = IRQF_ONESHOT;
+ }
+ if (request_threaded_irq(dev->host_irq, irq_handler,
+ kvm_assigned_dev_thread_intx, flags,
+ dev->irq_name, dev))
+ return -EIO;
+
+ if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
+ spin_lock_irq(&dev->intx_lock);
+ pci_intx(dev->dev, true);
+ spin_unlock_irq(&dev->intx_lock);
+ }
+ return 0;
+}
+
+#ifdef __KVM_HAVE_MSI
+static int assigned_device_enable_host_msi(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev)
+{
+ int r;
+
+ if (!dev->dev->msi_enabled) {
+ r = pci_enable_msi(dev->dev);
+ if (r)
+ return r;
+ }
+
+ dev->host_irq = dev->dev->irq;
+ if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi,
+ kvm_assigned_dev_thread_msi, 0,
+ dev->irq_name, dev)) {
+ pci_disable_msi(dev->dev);
+ return -EIO;
+ }
+
+ return 0;
+}
+#endif
+
+#ifdef __KVM_HAVE_MSIX
+static int assigned_device_enable_host_msix(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev)
+{
+ int i, r = -EINVAL;
+
+ /* host_msix_entries and guest_msix_entries should have been
+ * initialized */
+ if (dev->entries_nr == 0)
+ return r;
+
+ r = pci_enable_msix_exact(dev->dev,
+ dev->host_msix_entries, dev->entries_nr);
+ if (r)
+ return r;
+
+ for (i = 0; i < dev->entries_nr; i++) {
+ r = request_threaded_irq(dev->host_msix_entries[i].vector,
+ kvm_assigned_dev_msix,
+ kvm_assigned_dev_thread_msix,
+ 0, dev->irq_name, dev);
+ if (r)
+ goto err;
+ }
+
+ return 0;
+err:
+ for (i -= 1; i >= 0; i--)
+ free_irq(dev->host_msix_entries[i].vector, dev);
+ pci_disable_msix(dev->dev);
+ return r;
+}
+
+#endif
+
+static int assigned_device_enable_guest_intx(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev,
+ struct kvm_assigned_irq *irq)
+{
+ dev->guest_irq = irq->guest_irq;
+ dev->ack_notifier.gsi = irq->guest_irq;
+ return 0;
+}
+
+#ifdef __KVM_HAVE_MSI
+static int assigned_device_enable_guest_msi(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev,
+ struct kvm_assigned_irq *irq)
+{
+ dev->guest_irq = irq->guest_irq;
+ dev->ack_notifier.gsi = -1;
+ return 0;
+}
+#endif
+
+#ifdef __KVM_HAVE_MSIX
+static int assigned_device_enable_guest_msix(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev,
+ struct kvm_assigned_irq *irq)
+{
+ dev->guest_irq = irq->guest_irq;
+ dev->ack_notifier.gsi = -1;
+ return 0;
+}
+#endif
+
+static int assign_host_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev,
+ __u32 host_irq_type)
+{
+ int r = -EEXIST;
+
+ if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
+ return r;
+
+ snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s",
+ pci_name(dev->dev));
+
+ switch (host_irq_type) {
+ case KVM_DEV_IRQ_HOST_INTX:
+ r = assigned_device_enable_host_intx(kvm, dev);
+ break;
+#ifdef __KVM_HAVE_MSI
+ case KVM_DEV_IRQ_HOST_MSI:
+ r = assigned_device_enable_host_msi(kvm, dev);
+ break;
+#endif
+#ifdef __KVM_HAVE_MSIX
+ case KVM_DEV_IRQ_HOST_MSIX:
+ r = assigned_device_enable_host_msix(kvm, dev);
+ break;
+#endif
+ default:
+ r = -EINVAL;
+ }
+ dev->host_irq_disabled = false;
+
+ if (!r)
+ dev->irq_requested_type |= host_irq_type;
+
+ return r;
+}
+
+static int assign_guest_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev,
+ struct kvm_assigned_irq *irq,
+ unsigned long guest_irq_type)
+{
+ int id;
+ int r = -EEXIST;
+
+ if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
+ return r;
+
+ id = kvm_request_irq_source_id(kvm);
+ if (id < 0)
+ return id;
+
+ dev->irq_source_id = id;
+
+ switch (guest_irq_type) {
+ case KVM_DEV_IRQ_GUEST_INTX:
+ r = assigned_device_enable_guest_intx(kvm, dev, irq);
+ break;
+#ifdef __KVM_HAVE_MSI
+ case KVM_DEV_IRQ_GUEST_MSI:
+ r = assigned_device_enable_guest_msi(kvm, dev, irq);
+ break;
+#endif
+#ifdef __KVM_HAVE_MSIX
+ case KVM_DEV_IRQ_GUEST_MSIX:
+ r = assigned_device_enable_guest_msix(kvm, dev, irq);
+ break;
+#endif
+ default:
+ r = -EINVAL;
+ }
+
+ if (!r) {
+ dev->irq_requested_type |= guest_irq_type;
+ if (dev->ack_notifier.gsi != -1)
+ kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
+ } else {
+ kvm_free_irq_source_id(kvm, dev->irq_source_id);
+ dev->irq_source_id = -1;
+ }
+
+ return r;
+}
+
+/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
+static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
+ struct kvm_assigned_irq *assigned_irq)
+{
+ int r = -EINVAL;
+ struct kvm_assigned_dev_kernel *match;
+ unsigned long host_irq_type, guest_irq_type;
+
+ if (!irqchip_in_kernel(kvm))
+ return r;
+
+ mutex_lock(&kvm->lock);
+ r = -ENODEV;
+ match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ assigned_irq->assigned_dev_id);
+ if (!match)
+ goto out;
+
+ host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
+ guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);
+
+ r = -EINVAL;
+ /* can only assign one type at a time */
+ if (hweight_long(host_irq_type) > 1)
+ goto out;
+ if (hweight_long(guest_irq_type) > 1)
+ goto out;
+ if (host_irq_type == 0 && guest_irq_type == 0)
+ goto out;
+
+ r = 0;
+ if (host_irq_type)
+ r = assign_host_irq(kvm, match, host_irq_type);
+ if (r)
+ goto out;
+
+ if (guest_irq_type)
+ r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
+ struct kvm_assigned_irq
+ *assigned_irq)
+{
+ int r = -ENODEV;
+ struct kvm_assigned_dev_kernel *match;
+ unsigned long irq_type;
+
+ mutex_lock(&kvm->lock);
+
+ match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ assigned_irq->assigned_dev_id);
+ if (!match)
+ goto out;
+
+ irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
+ KVM_DEV_IRQ_GUEST_MASK);
+ r = kvm_deassign_irq(kvm, match, irq_type);
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+/*
+ * We want to test whether the caller has been granted permissions to
+ * use this device. To be able to configure and control the device,
+ * the user needs access to PCI configuration space and BAR resources.
+ * These are accessed through PCI sysfs. PCI config space is often
+ * passed to the process calling this ioctl via file descriptor, so we
+ * can't rely on access to that file. We can check for permissions
+ * on each of the BAR resource files, which is a pretty clear
+ * indicator that the user has been granted access to the device.
+ */
+static int probe_sysfs_permissions(struct pci_dev *dev)
+{
+#ifdef CONFIG_SYSFS
+ int i;
+ bool bar_found = false;
+
+ for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) {
+ char *kpath, *syspath;
+ struct path path;
+ struct inode *inode;
+ int r;
+
+ if (!pci_resource_len(dev, i))
+ continue;
+
+ kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL);
+ if (!kpath)
+ return -ENOMEM;
+
+ /* Per sysfs-rules, sysfs is always at /sys */
+ syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i);
+ kfree(kpath);
+ if (!syspath)
+ return -ENOMEM;
+
+ r = kern_path(syspath, LOOKUP_FOLLOW, &path);
+ kfree(syspath);
+ if (r)
+ return r;
+
+ inode = path.dentry->d_inode;
+
+ r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS);
+ path_put(&path);
+ if (r)
+ return r;
+
+ bar_found = true;
+ }
+
+ /* If no resources, probably something special */
+ if (!bar_found)
+ return -EPERM;
+
+ return 0;
+#else
+ return -EINVAL; /* No way to control the device without sysfs */
+#endif
+}
+
+static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
+ struct kvm_assigned_pci_dev *assigned_dev)
+{
+ int r = 0, idx;
+ struct kvm_assigned_dev_kernel *match;
+ struct pci_dev *dev;
+
+ if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU))
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+ idx = srcu_read_lock(&kvm->srcu);
+
+ match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ assigned_dev->assigned_dev_id);
+ if (match) {
+ /* device already assigned */
+ r = -EEXIST;
+ goto out;
+ }
+
+ match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
+ if (match == NULL) {
+ printk(KERN_INFO "%s: Couldn't allocate memory\n",
+ __func__);
+ r = -ENOMEM;
+ goto out;
+ }
+ dev = pci_get_domain_bus_and_slot(assigned_dev->segnr,
+ assigned_dev->busnr,
+ assigned_dev->devfn);
+ if (!dev) {
+ printk(KERN_INFO "%s: host device not found\n", __func__);
+ r = -EINVAL;
+ goto out_free;
+ }
+
+ /* Don't allow bridges to be assigned */
+ if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) {
+ r = -EPERM;
+ goto out_put;
+ }
+
+ r = probe_sysfs_permissions(dev);
+ if (r)
+ goto out_put;
+
+ if (pci_enable_device(dev)) {
+ printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
+ r = -EBUSY;
+ goto out_put;
+ }
+ r = pci_request_regions(dev, "kvm_assigned_device");
+ if (r) {
+ printk(KERN_INFO "%s: Could not get access to device regions\n",
+ __func__);
+ goto out_disable;
+ }
+
+ pci_reset_function(dev);
+ pci_save_state(dev);
+ match->pci_saved_state = pci_store_saved_state(dev);
+ if (!match->pci_saved_state)
+ printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
+ __func__, dev_name(&dev->dev));
+
+ if (!pci_intx_mask_supported(dev))
+ assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
+
+ match->assigned_dev_id = assigned_dev->assigned_dev_id;
+ match->host_segnr = assigned_dev->segnr;
+ match->host_busnr = assigned_dev->busnr;
+ match->host_devfn = assigned_dev->devfn;
+ match->flags = assigned_dev->flags;
+ match->dev = dev;
+ spin_lock_init(&match->intx_lock);
+ spin_lock_init(&match->intx_mask_lock);
+ match->irq_source_id = -1;
+ match->kvm = kvm;
+ match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
+
+ list_add(&match->list, &kvm->arch.assigned_dev_head);
+
+ if (!kvm->arch.iommu_domain) {
+ r = kvm_iommu_map_guest(kvm);
+ if (r)
+ goto out_list_del;
+ }
+ r = kvm_assign_device(kvm, match->dev);
+ if (r)
+ goto out_list_del;
+
+out:
+ srcu_read_unlock(&kvm->srcu, idx);
+ mutex_unlock(&kvm->lock);
+ return r;
+out_list_del:
+ if (pci_load_and_free_saved_state(dev, &match->pci_saved_state))
+ printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
+ __func__, dev_name(&dev->dev));
+ list_del(&match->list);
+ pci_release_regions(dev);
+out_disable:
+ pci_disable_device(dev);
+out_put:
+ pci_dev_put(dev);
+out_free:
+ kfree(match);
+ srcu_read_unlock(&kvm->srcu, idx);
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
+ struct kvm_assigned_pci_dev *assigned_dev)
+{
+ int r = 0;
+ struct kvm_assigned_dev_kernel *match;
+
+ mutex_lock(&kvm->lock);
+
+ match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ assigned_dev->assigned_dev_id);
+ if (!match) {
+ printk(KERN_INFO "%s: device hasn't been assigned before, "
+ "so cannot be deassigned\n", __func__);
+ r = -EINVAL;
+ goto out;
+ }
+
+ kvm_deassign_device(kvm, match->dev);
+
+ kvm_free_assigned_device(kvm, match);
+
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+
+#ifdef __KVM_HAVE_MSIX
+static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
+ struct kvm_assigned_msix_nr *entry_nr)
+{
+ int r = 0;
+ struct kvm_assigned_dev_kernel *adev;
+
+ mutex_lock(&kvm->lock);
+
+ adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ entry_nr->assigned_dev_id);
+ if (!adev) {
+ r = -EINVAL;
+ goto msix_nr_out;
+ }
+
+ if (adev->entries_nr == 0) {
+ adev->entries_nr = entry_nr->entry_nr;
+ if (adev->entries_nr == 0 ||
+ adev->entries_nr > KVM_MAX_MSIX_PER_DEV) {
+ r = -EINVAL;
+ goto msix_nr_out;
+ }
+
+ adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
+ entry_nr->entry_nr,
+ GFP_KERNEL);
+ if (!adev->host_msix_entries) {
+ r = -ENOMEM;
+ goto msix_nr_out;
+ }
+ adev->guest_msix_entries =
+ kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr,
+ GFP_KERNEL);
+ if (!adev->guest_msix_entries) {
+ kfree(adev->host_msix_entries);
+ r = -ENOMEM;
+ goto msix_nr_out;
+ }
+ } else /* Not allowed set MSI-X number twice */
+ r = -EINVAL;
+msix_nr_out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
+ struct kvm_assigned_msix_entry *entry)
+{
+ int r = 0, i;
+ struct kvm_assigned_dev_kernel *adev;
+
+ mutex_lock(&kvm->lock);
+
+ adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ entry->assigned_dev_id);
+
+ if (!adev) {
+ r = -EINVAL;
+ goto msix_entry_out;
+ }
+
+ for (i = 0; i < adev->entries_nr; i++)
+ if (adev->guest_msix_entries[i].vector == 0 ||
+ adev->guest_msix_entries[i].entry == entry->entry) {
+ adev->guest_msix_entries[i].entry = entry->entry;
+ adev->guest_msix_entries[i].vector = entry->gsi;
+ adev->host_msix_entries[i].entry = entry->entry;
+ break;
+ }
+ if (i == adev->entries_nr) {
+ r = -ENOSPC;
+ goto msix_entry_out;
+ }
+
+msix_entry_out:
+ mutex_unlock(&kvm->lock);
+
+ return r;
+}
+#endif
+
+static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
+ struct kvm_assigned_pci_dev *assigned_dev)
+{
+ int r = 0;
+ struct kvm_assigned_dev_kernel *match;
+
+ mutex_lock(&kvm->lock);
+
+ match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ assigned_dev->assigned_dev_id);
+ if (!match) {
+ r = -ENODEV;
+ goto out;
+ }
+
+ spin_lock(&match->intx_mask_lock);
+
+ match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
+ match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
+
+ if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
+ if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
+ kvm_set_irq(match->kvm, match->irq_source_id,
+ match->guest_irq, 0, false);
+ /*
+ * Masking at hardware-level is performed on demand,
+ * i.e. when an IRQ actually arrives at the host.
+ */
+ } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
+ /*
+ * Unmask the IRQ line if required. Unmasking at
+ * device level will be performed by user space.
+ */
+ spin_lock_irq(&match->intx_lock);
+ if (match->host_irq_disabled) {
+ enable_irq(match->host_irq);
+ match->host_irq_disabled = false;
+ }
+ spin_unlock_irq(&match->intx_lock);
+ }
+ }
+
+ spin_unlock(&match->intx_mask_lock);
+
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
+ unsigned long arg)
+{
+ void __user *argp = (void __user *)arg;
+ int r;
+
+ switch (ioctl) {
+ case KVM_ASSIGN_PCI_DEVICE: {
+ struct kvm_assigned_pci_dev assigned_dev;
+
+ r = -EFAULT;
+ if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+ goto out;
+ r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_ASSIGN_IRQ: {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ case KVM_ASSIGN_DEV_IRQ: {
+ struct kvm_assigned_irq assigned_irq;
+
+ r = -EFAULT;
+ if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
+ goto out;
+ r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_DEASSIGN_DEV_IRQ: {
+ struct kvm_assigned_irq assigned_irq;
+
+ r = -EFAULT;
+ if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
+ goto out;
+ r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_DEASSIGN_PCI_DEVICE: {
+ struct kvm_assigned_pci_dev assigned_dev;
+
+ r = -EFAULT;
+ if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+ goto out;
+ r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
+ if (r)
+ goto out;
+ break;
+ }
+#ifdef __KVM_HAVE_MSIX
+ case KVM_ASSIGN_SET_MSIX_NR: {
+ struct kvm_assigned_msix_nr entry_nr;
+ r = -EFAULT;
+ if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
+ goto out;
+ r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_ASSIGN_SET_MSIX_ENTRY: {
+ struct kvm_assigned_msix_entry entry;
+ r = -EFAULT;
+ if (copy_from_user(&entry, argp, sizeof entry))
+ goto out;
+ r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
+ if (r)
+ goto out;
+ break;
+ }
+#endif
+ case KVM_ASSIGN_SET_INTX_MASK: {
+ struct kvm_assigned_pci_dev assigned_dev;
+
+ r = -EFAULT;
+ if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+ goto out;
+ r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
+ break;
+ }
+ default:
+ r = -ENOTTY;
+ break;
+ }
+out:
+ return r;
+}
diff --git a/arch/x86/kvm/assigned-dev.h b/arch/x86/kvm/assigned-dev.h
new file mode 100644
index 000000000000..a428c1a211b2
--- /dev/null
+++ b/arch/x86/kvm/assigned-dev.h
@@ -0,0 +1,32 @@
+#ifndef ARCH_X86_KVM_ASSIGNED_DEV_H
+#define ARCH_X86_KVM_ASSIGNED_DEV_H
+
+#include <linux/kvm_host.h>
+
+#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
+int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev);
+int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev);
+
+int kvm_iommu_map_guest(struct kvm *kvm);
+int kvm_iommu_unmap_guest(struct kvm *kvm);
+
+long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
+ unsigned long arg);
+
+void kvm_free_all_assigned_devices(struct kvm *kvm);
+#else
+static inline int kvm_iommu_unmap_guest(struct kvm *kvm)
+{
+ return 0;
+}
+
+static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
+ unsigned long arg)
+{
+ return -ENOTTY;
+}
+
+static inline void kvm_free_all_assigned_devices(struct kvm *kvm) {}
+#endif /* CONFIG_KVM_DEVICE_ASSIGNMENT */
+
+#endif /* ARCH_X86_KVM_ASSIGNED_DEV_H */
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 976e3a57f9ea..8a80737ee6e6 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -23,7 +23,7 @@
#include "mmu.h"
#include "trace.h"
-static u32 xstate_required_size(u64 xstate_bv)
+static u32 xstate_required_size(u64 xstate_bv, bool compacted)
{
int feature_bit = 0;
u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
@@ -31,9 +31,10 @@ static u32 xstate_required_size(u64 xstate_bv)
xstate_bv &= XSTATE_EXTEND_MASK;
while (xstate_bv) {
if (xstate_bv & 0x1) {
- u32 eax, ebx, ecx, edx;
+ u32 eax, ebx, ecx, edx, offset;
cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx);
- ret = max(ret, eax + ebx);
+ offset = compacted ? ret : ebx;
+ ret = max(ret, offset + eax);
}
xstate_bv >>= 1;
@@ -53,6 +54,8 @@ u64 kvm_supported_xcr0(void)
return xcr0;
}
+#define F(x) bit(X86_FEATURE_##x)
+
int kvm_update_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
@@ -64,13 +67,13 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
/* Update OSXSAVE bit */
if (cpu_has_xsave && best->function == 0x1) {
- best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
+ best->ecx &= ~F(OSXSAVE);
if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
- best->ecx |= bit(X86_FEATURE_OSXSAVE);
+ best->ecx |= F(OSXSAVE);
}
if (apic) {
- if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER))
+ if (best->ecx & F(TSC_DEADLINE_TIMER))
apic->lapic_timer.timer_mode_mask = 3 << 17;
else
apic->lapic_timer.timer_mode_mask = 1 << 17;
@@ -85,9 +88,13 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
(best->eax | ((u64)best->edx << 32)) &
kvm_supported_xcr0();
vcpu->arch.guest_xstate_size = best->ebx =
- xstate_required_size(vcpu->arch.xcr0);
+ xstate_required_size(vcpu->arch.xcr0, false);
}
+ best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
+ if (best && (best->eax & (F(XSAVES) | F(XSAVEC))))
+ best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
+
/*
* The existing code assumes virtual address is 48-bit in the canonical
* address checks; exit if it is ever changed.
@@ -122,8 +129,8 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
break;
}
}
- if (entry && (entry->edx & bit(X86_FEATURE_NX)) && !is_efer_nx()) {
- entry->edx &= ~bit(X86_FEATURE_NX);
+ if (entry && (entry->edx & F(NX)) && !is_efer_nx()) {
+ entry->edx &= ~F(NX);
printk(KERN_INFO "kvm: guest NX capability removed\n");
}
}
@@ -227,8 +234,6 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->flags = 0;
}
-#define F(x) bit(X86_FEATURE_##x)
-
static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
u32 func, u32 index, int *nent, int maxnent)
{
@@ -267,6 +272,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
unsigned f_mpx = kvm_x86_ops->mpx_supported() ? F(MPX) : 0;
+ unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
/* cpuid 1.edx */
const u32 kvm_supported_word0_x86_features =
@@ -317,7 +323,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
const u32 kvm_supported_word9_x86_features =
F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
- F(ADX) | F(SMAP);
+ F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
+ F(AVX512CD);
+
+ /* cpuid 0xD.1.eax */
+ const u32 kvm_supported_word10_x86_features =
+ F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves;
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
@@ -453,16 +464,34 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
u64 supported = kvm_supported_xcr0();
entry->eax &= supported;
+ entry->ebx = xstate_required_size(supported, false);
+ entry->ecx = entry->ebx;
entry->edx &= supported >> 32;
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ if (!supported)
+ break;
+
for (idx = 1, i = 1; idx < 64; ++idx) {
u64 mask = ((u64)1 << idx);
if (*nent >= maxnent)
goto out;
do_cpuid_1_ent(&entry[i], function, idx);
- if (entry[i].eax == 0 || !(supported & mask))
- continue;
+ if (idx == 1) {
+ entry[i].eax &= kvm_supported_word10_x86_features;
+ entry[i].ebx = 0;
+ if (entry[i].eax & (F(XSAVES)|F(XSAVEC)))
+ entry[i].ebx =
+ xstate_required_size(supported,
+ true);
+ } else {
+ if (entry[i].eax == 0 || !(supported & mask))
+ continue;
+ if (WARN_ON_ONCE(entry[i].ecx & 1))
+ continue;
+ }
+ entry[i].ecx = 0;
+ entry[i].edx = 0;
entry[i].flags |=
KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
++*nent;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 9f8a2faf5040..169b09d76ddd 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -123,6 +123,7 @@
#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */
#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
#define Escape (5<<15) /* Escape to coprocessor instruction */
+#define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */
#define Sse (1<<18) /* SSE Vector instruction */
/* Generic ModRM decode. */
#define ModRM (1<<19)
@@ -166,6 +167,8 @@
#define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */
#define NoBigReal ((u64)1 << 50) /* No big real mode */
#define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */
+#define NearBranch ((u64)1 << 52) /* Near branches */
+#define No16 ((u64)1 << 53) /* No 16 bit operand */
#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
@@ -209,6 +212,7 @@ struct opcode {
const struct group_dual *gdual;
const struct gprefix *gprefix;
const struct escape *esc;
+ const struct instr_dual *idual;
void (*fastop)(struct fastop *fake);
} u;
int (*check_perm)(struct x86_emulate_ctxt *ctxt);
@@ -231,6 +235,11 @@ struct escape {
struct opcode high[64];
};
+struct instr_dual {
+ struct opcode mod012;
+ struct opcode mod3;
+};
+
/* EFLAGS bit definitions. */
#define EFLG_ID (1<<21)
#define EFLG_VIP (1<<20)
@@ -379,6 +388,15 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
ON64(FOP2E(op##q, rax, cl)) \
FOP_END
+/* 2 operand, src and dest are reversed */
+#define FASTOP2R(op, name) \
+ FOP_START(name) \
+ FOP2E(op##b, dl, al) \
+ FOP2E(op##w, dx, ax) \
+ FOP2E(op##l, edx, eax) \
+ ON64(FOP2E(op##q, rdx, rax)) \
+ FOP_END
+
#define FOP3E(op, dst, src, src2) \
FOP_ALIGN #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET
@@ -477,9 +495,9 @@ address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
}
static inline unsigned long
-register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg)
+register_address(struct x86_emulate_ctxt *ctxt, int reg)
{
- return address_mask(ctxt, reg);
+ return address_mask(ctxt, reg_read(ctxt, reg));
}
static void masked_increment(ulong *reg, ulong mask, int inc)
@@ -488,7 +506,7 @@ static void masked_increment(ulong *reg, ulong mask, int inc)
}
static inline void
-register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc)
+register_address_increment(struct x86_emulate_ctxt *ctxt, int reg, int inc)
{
ulong mask;
@@ -496,7 +514,7 @@ register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, in
mask = ~0UL;
else
mask = ad_mask(ctxt);
- masked_increment(reg, mask, inc);
+ masked_increment(reg_rmw(ctxt, reg), mask, inc);
}
static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
@@ -564,40 +582,6 @@ static int emulate_nm(struct x86_emulate_ctxt *ctxt)
return emulate_exception(ctxt, NM_VECTOR, 0, false);
}
-static inline int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst,
- int cs_l)
-{
- switch (ctxt->op_bytes) {
- case 2:
- ctxt->_eip = (u16)dst;
- break;
- case 4:
- ctxt->_eip = (u32)dst;
- break;
-#ifdef CONFIG_X86_64
- case 8:
- if ((cs_l && is_noncanonical_address(dst)) ||
- (!cs_l && (dst >> 32) != 0))
- return emulate_gp(ctxt, 0);
- ctxt->_eip = dst;
- break;
-#endif
- default:
- WARN(1, "unsupported eip assignment size\n");
- }
- return X86EMUL_CONTINUE;
-}
-
-static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst)
-{
- return assign_eip_far(ctxt, dst, ctxt->mode == X86EMUL_MODE_PROT64);
-}
-
-static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
-{
- return assign_eip_near(ctxt, ctxt->_eip + rel);
-}
-
static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg)
{
u16 selector;
@@ -641,25 +625,24 @@ static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size)
return true;
}
-static int __linearize(struct x86_emulate_ctxt *ctxt,
- struct segmented_address addr,
- unsigned *max_size, unsigned size,
- bool write, bool fetch,
- ulong *linear)
+static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ unsigned *max_size, unsigned size,
+ bool write, bool fetch,
+ enum x86emul_mode mode, ulong *linear)
{
struct desc_struct desc;
bool usable;
ulong la;
u32 lim;
u16 sel;
- unsigned cpl;
la = seg_base(ctxt, addr.seg) + addr.ea;
*max_size = 0;
- switch (ctxt->mode) {
+ switch (mode) {
case X86EMUL_MODE_PROT64:
- if (((signed long)la << 16) >> 16 != la)
- return emulate_gp(ctxt, 0);
+ if (is_noncanonical_address(la))
+ goto bad;
*max_size = min_t(u64, ~0u, (1ull << 48) - la);
if (size > *max_size)
@@ -678,46 +661,20 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
if (!fetch && (desc.type & 8) && !(desc.type & 2))
goto bad;
lim = desc_limit_scaled(&desc);
- if ((ctxt->mode == X86EMUL_MODE_REAL) && !fetch &&
- (ctxt->d & NoBigReal)) {
- /* la is between zero and 0xffff */
- if (la > 0xffff)
- goto bad;
- *max_size = 0x10000 - la;
- } else if ((desc.type & 8) || !(desc.type & 4)) {
- /* expand-up segment */
- if (addr.ea > lim)
- goto bad;
- *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea);
- } else {
+ if (!(desc.type & 8) && (desc.type & 4)) {
/* expand-down segment */
if (addr.ea <= lim)
goto bad;
lim = desc.d ? 0xffffffff : 0xffff;
- if (addr.ea > lim)
- goto bad;
- *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea);
}
+ if (addr.ea > lim)
+ goto bad;
+ *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea);
if (size > *max_size)
goto bad;
- cpl = ctxt->ops->cpl(ctxt);
- if (!(desc.type & 8)) {
- /* data segment */
- if (cpl > desc.dpl)
- goto bad;
- } else if ((desc.type & 8) && !(desc.type & 4)) {
- /* nonconforming code segment */
- if (cpl != desc.dpl)
- goto bad;
- } else if ((desc.type & 8) && (desc.type & 4)) {
- /* conforming code segment */
- if (cpl < desc.dpl)
- goto bad;
- }
+ la &= (u32)-1;
break;
}
- if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8)
- la &= (u32)-1;
if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0))
return emulate_gp(ctxt, 0);
*linear = la;
@@ -735,9 +692,55 @@ static int linearize(struct x86_emulate_ctxt *ctxt,
ulong *linear)
{
unsigned max_size;
- return __linearize(ctxt, addr, &max_size, size, write, false, linear);
+ return __linearize(ctxt, addr, &max_size, size, write, false,
+ ctxt->mode, linear);
+}
+
+static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst,
+ enum x86emul_mode mode)
+{
+ ulong linear;
+ int rc;
+ unsigned max_size;
+ struct segmented_address addr = { .seg = VCPU_SREG_CS,
+ .ea = dst };
+
+ if (ctxt->op_bytes != sizeof(unsigned long))
+ addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1);
+ rc = __linearize(ctxt, addr, &max_size, 1, false, true, mode, &linear);
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->_eip = addr.ea;
+ return rc;
+}
+
+static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst)
+{
+ return assign_eip(ctxt, dst, ctxt->mode);
}
+static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst,
+ const struct desc_struct *cs_desc)
+{
+ enum x86emul_mode mode = ctxt->mode;
+
+#ifdef CONFIG_X86_64
+ if (ctxt->mode >= X86EMUL_MODE_PROT32 && cs_desc->l) {
+ u64 efer = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (efer & EFER_LMA)
+ mode = X86EMUL_MODE_PROT64;
+ }
+#endif
+ if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32)
+ mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+ return assign_eip(ctxt, dst, mode);
+}
+
+static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
+{
+ return assign_eip_near(ctxt, ctxt->_eip + rel);
+}
static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
struct segmented_address addr,
@@ -776,7 +779,8 @@ static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size)
* boundary check itself. Instead, we use max_size to check
* against op_size.
*/
- rc = __linearize(ctxt, addr, &max_size, 0, false, true, &linear);
+ rc = __linearize(ctxt, addr, &max_size, 0, false, true, ctxt->mode,
+ &linear);
if (unlikely(rc != X86EMUL_CONTINUE))
return rc;
@@ -911,6 +915,8 @@ FASTOP2W(btc);
FASTOP2(xadd);
+FASTOP2R(cmp, cmp_r);
+
static u8 test_cc(unsigned int condition, unsigned long flags)
{
u8 rc;
@@ -1221,6 +1227,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
if (index_reg != 4)
modrm_ea += reg_read(ctxt, index_reg) << scale;
} else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
+ modrm_ea += insn_fetch(s32, ctxt);
if (ctxt->mode == X86EMUL_MODE_PROT64)
ctxt->rip_relative = 1;
} else {
@@ -1229,10 +1236,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
adjust_modrm_seg(ctxt, base_reg);
}
switch (ctxt->modrm_mod) {
- case 0:
- if (ctxt->modrm_rm == 5)
- modrm_ea += insn_fetch(s32, ctxt);
- break;
case 1:
modrm_ea += insn_fetch(s8, ctxt);
break;
@@ -1284,7 +1287,8 @@ static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt)
else
sv = (s64)ctxt->src.val & (s64)mask;
- ctxt->dst.addr.mem.ea += (sv >> 3);
+ ctxt->dst.addr.mem.ea = address_mask(ctxt,
+ ctxt->dst.addr.mem.ea + (sv >> 3));
}
/* only subword offset */
@@ -1610,6 +1614,9 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
sizeof(base3), &ctxt->exception);
if (ret != X86EMUL_CONTINUE)
return ret;
+ if (is_noncanonical_address(get_desc_base(&seg_desc) |
+ ((u64)base3 << 32)))
+ return emulate_gp(ctxt, 0);
}
load:
ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg);
@@ -1807,6 +1814,10 @@ static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
int seg = ctxt->src2.val;
ctxt->src.val = get_segment_selector(ctxt, seg);
+ if (ctxt->op_bytes == 4) {
+ rsp_increment(ctxt, -2);
+ ctxt->op_bytes = 2;
+ }
return em_push(ctxt);
}
@@ -1850,7 +1861,7 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt)
static int em_pushf(struct x86_emulate_ctxt *ctxt)
{
- ctxt->src.val = (unsigned long)ctxt->eflags;
+ ctxt->src.val = (unsigned long)ctxt->eflags & ~EFLG_VM;
return em_push(ctxt);
}
@@ -2035,7 +2046,7 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = assign_eip_far(ctxt, ctxt->src.val, new_desc.l);
+ rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
if (rc != X86EMUL_CONTINUE) {
WARN_ON(ctxt->mode != X86EMUL_MODE_PROT64);
/* assigning eip failed; restore the old cs */
@@ -2045,31 +2056,22 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
return rc;
}
-static int em_grp45(struct x86_emulate_ctxt *ctxt)
+static int em_jmp_abs(struct x86_emulate_ctxt *ctxt)
{
- int rc = X86EMUL_CONTINUE;
+ return assign_eip_near(ctxt, ctxt->src.val);
+}
- switch (ctxt->modrm_reg) {
- case 2: /* call near abs */ {
- long int old_eip;
- old_eip = ctxt->_eip;
- rc = assign_eip_near(ctxt, ctxt->src.val);
- if (rc != X86EMUL_CONTINUE)
- break;
- ctxt->src.val = old_eip;
- rc = em_push(ctxt);
- break;
- }
- case 4: /* jmp abs */
- rc = assign_eip_near(ctxt, ctxt->src.val);
- break;
- case 5: /* jmp far */
- rc = em_jmp_far(ctxt);
- break;
- case 6: /* push */
- rc = em_push(ctxt);
- break;
- }
+static int em_call_near_abs(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ long int old_eip;
+
+ old_eip = ctxt->_eip;
+ rc = assign_eip_near(ctxt, ctxt->src.val);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ ctxt->src.val = old_eip;
+ rc = em_push(ctxt);
return rc;
}
@@ -2128,11 +2130,11 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
/* Outer-privilege level return is not implemented */
if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl)
return X86EMUL_UNHANDLEABLE;
- rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, 0, false,
+ rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, false,
&new_desc);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = assign_eip_far(ctxt, eip, new_desc.l);
+ rc = assign_eip_far(ctxt, eip, &new_desc);
if (rc != X86EMUL_CONTINUE) {
WARN_ON(ctxt->mode != X86EMUL_MODE_PROT64);
ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS);
@@ -2316,6 +2318,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
ctxt->eflags &= ~msr_data;
+ ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
#endif
} else {
/* legacy mode */
@@ -2349,11 +2352,9 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
&& !vendor_intel(ctxt))
return emulate_ud(ctxt);
- /* XXX sysenter/sysexit have not been tested in 64bit mode.
- * Therefore, we inject an #UD.
- */
+ /* sysenter/sysexit have not been tested in 64bit mode. */
if (ctxt->mode == X86EMUL_MODE_PROT64)
- return emulate_ud(ctxt);
+ return X86EMUL_UNHANDLEABLE;
setup_syscalls_segments(ctxt, &cs, &ss);
@@ -2425,6 +2426,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
if ((msr_data & 0xfffc) == 0x0)
return emulate_gp(ctxt, 0);
ss_sel = (u16)(msr_data + 24);
+ rcx = (u32)rcx;
+ rdx = (u32)rdx;
break;
case X86EMUL_MODE_PROT64:
cs_sel = (u16)(msr_data + 32);
@@ -2599,7 +2602,6 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
return ret;
save_state_to_tss16(ctxt, &tss_seg);
@@ -2607,13 +2609,11 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
return ret;
ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
return ret;
if (old_tss_sel != 0xffff) {
@@ -2624,7 +2624,6 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
sizeof tss_seg.prev_task_link,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
return ret;
}
@@ -2813,7 +2812,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
*
* 1. jmp/call/int to task gate: Check against DPL of the task gate
* 2. Exception/IRQ/iret: No check is performed
- * 3. jmp/call to TSS: Check against DPL of the TSS
+ * 3. jmp/call to TSS/task-gate: No check is performed since the
+ * hardware checks it before exiting.
*/
if (reason == TASK_SWITCH_GATE) {
if (idt_index != -1) {
@@ -2830,13 +2830,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
return emulate_gp(ctxt, (idt_index << 3) | 0x2);
}
- } else if (reason != TASK_SWITCH_IRET) {
- int dpl = next_tss_desc.dpl;
- if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
- return emulate_gp(ctxt, tss_selector);
}
-
desc_limit = desc_limit_scaled(&next_tss_desc);
if (!next_tss_desc.p ||
((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
@@ -2913,8 +2908,8 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
{
int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count;
- register_address_increment(ctxt, reg_rmw(ctxt, reg), df * op->bytes);
- op->addr.mem.ea = register_address(ctxt, reg_read(ctxt, reg));
+ register_address_increment(ctxt, reg, df * op->bytes);
+ op->addr.mem.ea = register_address(ctxt, reg);
}
static int em_das(struct x86_emulate_ctxt *ctxt)
@@ -3025,7 +3020,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return X86EMUL_CONTINUE;
- rc = assign_eip_far(ctxt, ctxt->src.val, new_desc.l);
+ rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
if (rc != X86EMUL_CONTINUE)
goto fail;
@@ -3215,6 +3210,8 @@ static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
return emulate_ud(ctxt);
ctxt->dst.val = get_segment_selector(ctxt, ctxt->modrm_reg);
+ if (ctxt->dst.bytes == 4 && ctxt->dst.type == OP_MEM)
+ ctxt->dst.bytes = 2;
return X86EMUL_CONTINUE;
}
@@ -3317,7 +3314,7 @@ static int em_sidt(struct x86_emulate_ctxt *ctxt)
return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt);
}
-static int em_lgdt(struct x86_emulate_ctxt *ctxt)
+static int em_lgdt_lidt(struct x86_emulate_ctxt *ctxt, bool lgdt)
{
struct desc_ptr desc_ptr;
int rc;
@@ -3329,12 +3326,23 @@ static int em_lgdt(struct x86_emulate_ctxt *ctxt)
ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
- ctxt->ops->set_gdt(ctxt, &desc_ptr);
+ if (ctxt->mode == X86EMUL_MODE_PROT64 &&
+ is_noncanonical_address(desc_ptr.address))
+ return emulate_gp(ctxt, 0);
+ if (lgdt)
+ ctxt->ops->set_gdt(ctxt, &desc_ptr);
+ else
+ ctxt->ops->set_idt(ctxt, &desc_ptr);
/* Disable writeback. */
ctxt->dst.type = OP_NONE;
return X86EMUL_CONTINUE;
}
+static int em_lgdt(struct x86_emulate_ctxt *ctxt)
+{
+ return em_lgdt_lidt(ctxt, true);
+}
+
static int em_vmmcall(struct x86_emulate_ctxt *ctxt)
{
int rc;
@@ -3348,20 +3356,7 @@ static int em_vmmcall(struct x86_emulate_ctxt *ctxt)
static int em_lidt(struct x86_emulate_ctxt *ctxt)
{
- struct desc_ptr desc_ptr;
- int rc;
-
- if (ctxt->mode == X86EMUL_MODE_PROT64)
- ctxt->op_bytes = 8;
- rc = read_descriptor(ctxt, ctxt->src.addr.mem,
- &desc_ptr.size, &desc_ptr.address,
- ctxt->op_bytes);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- ctxt->ops->set_idt(ctxt, &desc_ptr);
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- return X86EMUL_CONTINUE;
+ return em_lgdt_lidt(ctxt, false);
}
static int em_smsw(struct x86_emulate_ctxt *ctxt)
@@ -3384,7 +3379,7 @@ static int em_loop(struct x86_emulate_ctxt *ctxt)
{
int rc = X86EMUL_CONTINUE;
- register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1);
+ register_address_increment(ctxt, VCPU_REGS_RCX, -1);
if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) &&
(ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags)))
rc = jmp_rel(ctxt, ctxt->src.val);
@@ -3554,7 +3549,7 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
if (efer & EFER_LMA)
- rsvd = CR3_L_MODE_RESERVED_BITS;
+ rsvd = CR3_L_MODE_RESERVED_BITS & ~CR3_PCID_INVD;
if (new_val & rsvd)
return emulate_gp(ctxt, 0);
@@ -3596,8 +3591,15 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt)
if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5))
return emulate_ud(ctxt);
- if (check_dr7_gd(ctxt))
+ if (check_dr7_gd(ctxt)) {
+ ulong dr6;
+
+ ctxt->ops->get_dr(ctxt, 6, &dr6);
+ dr6 &= ~15;
+ dr6 |= DR6_BD | DR6_RTM;
+ ctxt->ops->set_dr(ctxt, 6, dr6);
return emulate_db(ctxt);
+ }
return X86EMUL_CONTINUE;
}
@@ -3684,6 +3686,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
+#define ID(_f, _i) { .flags = ((_f) | InstrDual | ModRM), .u.idual = (_i) }
#define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) }
#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
#define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) }
@@ -3780,11 +3783,11 @@ static const struct opcode group4[] = {
static const struct opcode group5[] = {
F(DstMem | SrcNone | Lock, em_inc),
F(DstMem | SrcNone | Lock, em_dec),
- I(SrcMem | Stack, em_grp45),
+ I(SrcMem | NearBranch, em_call_near_abs),
I(SrcMemFAddr | ImplicitOps | Stack, em_call_far),
- I(SrcMem | Stack, em_grp45),
- I(SrcMemFAddr | ImplicitOps, em_grp45),
- I(SrcMem | Stack, em_grp45), D(Undefined),
+ I(SrcMem | NearBranch, em_jmp_abs),
+ I(SrcMemFAddr | ImplicitOps, em_jmp_far),
+ I(SrcMem | Stack, em_push), D(Undefined),
};
static const struct opcode group6[] = {
@@ -3845,8 +3848,12 @@ static const struct gprefix pfx_0f_6f_0f_7f = {
I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
};
+static const struct instr_dual instr_dual_0f_2b = {
+ I(0, em_mov), N
+};
+
static const struct gprefix pfx_0f_2b = {
- I(0, em_mov), I(0, em_mov), N, N,
+ ID(0, &instr_dual_0f_2b), ID(0, &instr_dual_0f_2b), N, N,
};
static const struct gprefix pfx_0f_28_0f_29 = {
@@ -3920,6 +3927,10 @@ static const struct escape escape_dd = { {
N, N, N, N, N, N, N, N,
} };
+static const struct instr_dual instr_dual_0f_c3 = {
+ I(DstMem | SrcReg | ModRM | No16 | Mov, em_mov), N
+};
+
static const struct opcode opcode_table[256] = {
/* 0x00 - 0x07 */
F6ALU(Lock, em_add),
@@ -3964,7 +3975,7 @@ static const struct opcode opcode_table[256] = {
I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */
I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
/* 0x70 - 0x7F */
- X16(D(SrcImmByte)),
+ X16(D(SrcImmByte | NearBranch)),
/* 0x80 - 0x87 */
G(ByteOp | DstMem | SrcImm, group1),
G(DstMem | SrcImm, group1),
@@ -3991,20 +4002,20 @@ static const struct opcode opcode_table[256] = {
I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
I2bv(SrcSI | DstDI | Mov | String, em_mov),
- F2bv(SrcSI | DstDI | String | NoWrite, em_cmp),
+ F2bv(SrcSI | DstDI | String | NoWrite, em_cmp_r),
/* 0xA8 - 0xAF */
F2bv(DstAcc | SrcImm | NoWrite, em_test),
I2bv(SrcAcc | DstDI | Mov | String, em_mov),
I2bv(SrcSI | DstAcc | Mov | String, em_mov),
- F2bv(SrcAcc | DstDI | String | NoWrite, em_cmp),
+ F2bv(SrcAcc | DstDI | String | NoWrite, em_cmp_r),
/* 0xB0 - 0xB7 */
X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
/* 0xB8 - 0xBF */
X8(I(DstReg | SrcImm64 | Mov, em_mov)),
/* 0xC0 - 0xC7 */
G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2),
- I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
- I(ImplicitOps | Stack, em_ret),
+ I(ImplicitOps | NearBranch | SrcImmU16, em_ret_near_imm),
+ I(ImplicitOps | NearBranch, em_ret),
I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg),
I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),
G(ByteOp, group11), G(0, group11),
@@ -4024,13 +4035,14 @@ static const struct opcode opcode_table[256] = {
/* 0xD8 - 0xDF */
N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N,
/* 0xE0 - 0xE7 */
- X3(I(SrcImmByte, em_loop)),
- I(SrcImmByte, em_jcxz),
+ X3(I(SrcImmByte | NearBranch, em_loop)),
+ I(SrcImmByte | NearBranch, em_jcxz),
I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in),
I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out),
/* 0xE8 - 0xEF */
- I(SrcImm | Stack, em_call), D(SrcImm | ImplicitOps),
- I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps),
+ I(SrcImm | NearBranch, em_call), D(SrcImm | ImplicitOps | NearBranch),
+ I(SrcImmFAddr | No64, em_jmp_far),
+ D(SrcImmByte | ImplicitOps | NearBranch),
I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in),
I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out),
/* 0xF0 - 0xF7 */
@@ -4090,7 +4102,7 @@ static const struct opcode twobyte_table[256] = {
N, N, N, N,
N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f),
/* 0x80 - 0x8F */
- X16(D(SrcImm)),
+ X16(D(SrcImm | NearBranch)),
/* 0x90 - 0x9F */
X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
/* 0xA0 - 0xA7 */
@@ -4121,7 +4133,7 @@ static const struct opcode twobyte_table[256] = {
D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
/* 0xC0 - 0xC7 */
F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd),
- N, D(DstMem | SrcReg | ModRM | Mov),
+ N, ID(0, &instr_dual_0f_c3),
N, N, N, GD(0, &group9),
/* 0xC8 - 0xCF */
X8(I(DstReg, em_bswap)),
@@ -4134,12 +4146,20 @@ static const struct opcode twobyte_table[256] = {
N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
};
+static const struct instr_dual instr_dual_0f_38_f0 = {
+ I(DstReg | SrcMem | Mov, em_movbe), N
+};
+
+static const struct instr_dual instr_dual_0f_38_f1 = {
+ I(DstMem | SrcReg | Mov, em_movbe), N
+};
+
static const struct gprefix three_byte_0f_38_f0 = {
- I(DstReg | SrcMem | Mov, em_movbe), N, N, N
+ ID(0, &instr_dual_0f_38_f0), N, N, N
};
static const struct gprefix three_byte_0f_38_f1 = {
- I(DstMem | SrcReg | Mov, em_movbe), N, N, N
+ ID(0, &instr_dual_0f_38_f1), N, N, N
};
/*
@@ -4152,8 +4172,8 @@ static const struct opcode opcode_map_0f_38[256] = {
/* 0x80 - 0xef */
X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
/* 0xf0 - 0xf1 */
- GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f0),
- GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f1),
+ GP(EmulateOnUD | ModRM, &three_byte_0f_38_f0),
+ GP(EmulateOnUD | ModRM, &three_byte_0f_38_f1),
/* 0xf2 - 0xff */
N, N, X4(N), X8(N)
};
@@ -4275,7 +4295,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
op->type = OP_MEM;
op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
op->addr.mem.ea =
- register_address(ctxt, reg_read(ctxt, VCPU_REGS_RDI));
+ register_address(ctxt, VCPU_REGS_RDI);
op->addr.mem.seg = VCPU_SREG_ES;
op->val = 0;
op->count = 1;
@@ -4329,7 +4349,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
op->type = OP_MEM;
op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
op->addr.mem.ea =
- register_address(ctxt, reg_read(ctxt, VCPU_REGS_RSI));
+ register_address(ctxt, VCPU_REGS_RSI);
op->addr.mem.seg = ctxt->seg_override;
op->val = 0;
op->count = 1;
@@ -4338,7 +4358,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
op->type = OP_MEM;
op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
op->addr.mem.ea =
- register_address(ctxt,
+ address_mask(ctxt,
reg_read(ctxt, VCPU_REGS_RBX) +
(reg_read(ctxt, VCPU_REGS_RAX) & 0xff));
op->addr.mem.seg = ctxt->seg_override;
@@ -4510,8 +4530,7 @@ done_prefixes:
/* vex-prefix instructions are not implemented */
if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) &&
- (mode == X86EMUL_MODE_PROT64 ||
- (mode >= X86EMUL_MODE_PROT16 && (ctxt->modrm & 0x80)))) {
+ (mode == X86EMUL_MODE_PROT64 || (ctxt->modrm & 0xc0) == 0xc0)) {
ctxt->d = NotImpl;
}
@@ -4549,6 +4568,12 @@ done_prefixes:
else
opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7];
break;
+ case InstrDual:
+ if ((ctxt->modrm >> 6) == 3)
+ opcode = opcode.u.idual->mod3;
+ else
+ opcode = opcode.u.idual->mod012;
+ break;
default:
return EMULATION_FAILED;
}
@@ -4567,7 +4592,8 @@ done_prefixes:
return EMULATION_FAILED;
if (unlikely(ctxt->d &
- (NotImpl|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm))) {
+ (NotImpl|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm|NearBranch|
+ No16))) {
/*
* These are copied unconditionally here, and checked unconditionally
* in x86_emulate_insn.
@@ -4578,8 +4604,12 @@ done_prefixes:
if (ctxt->d & NotImpl)
return EMULATION_FAILED;
- if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
- ctxt->op_bytes = 8;
+ if (mode == X86EMUL_MODE_PROT64) {
+ if (ctxt->op_bytes == 4 && (ctxt->d & Stack))
+ ctxt->op_bytes = 8;
+ else if (ctxt->d & NearBranch)
+ ctxt->op_bytes = 8;
+ }
if (ctxt->d & Op3264) {
if (mode == X86EMUL_MODE_PROT64)
@@ -4588,6 +4618,9 @@ done_prefixes:
ctxt->op_bytes = 4;
}
+ if ((ctxt->d & No16) && ctxt->op_bytes == 2)
+ ctxt->op_bytes = 4;
+
if (ctxt->d & Sse)
ctxt->op_bytes = 16;
else if (ctxt->d & Mmx)
@@ -4631,7 +4664,8 @@ done_prefixes:
rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask);
if (ctxt->rip_relative)
- ctxt->memopp->addr.mem.ea += ctxt->_eip;
+ ctxt->memopp->addr.mem.ea = address_mask(ctxt,
+ ctxt->memopp->addr.mem.ea + ctxt->_eip);
done:
return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
@@ -4775,6 +4809,12 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
+ /* Instruction can only be executed in protected mode */
+ if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
/* Privileged instruction can be executed only in CPL=0 */
if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
if (ctxt->d & PrivUD)
@@ -4784,12 +4824,6 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
- /* Instruction can only be executed in protected mode */
- if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
- rc = emulate_ud(ctxt);
- goto done;
- }
-
/* Do instruction specific permission checks */
if (ctxt->d & CheckPerm) {
rc = ctxt->check_perm(ctxt);
@@ -4974,8 +5008,7 @@ writeback:
count = ctxt->src.count;
else
count = ctxt->dst.count;
- register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX),
- -count);
+ register_address_increment(ctxt, VCPU_REGS_RCX, -count);
if (!string_insn_completed(ctxt)) {
/*
@@ -5053,11 +5086,6 @@ twobyte_insn:
ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val :
(s16) ctxt->src.val;
break;
- case 0xc3: /* movnti */
- ctxt->dst.bytes = ctxt->op_bytes;
- ctxt->dst.val = (ctxt->op_bytes == 8) ? (u64) ctxt->src.val :
- (u32) ctxt->src.val;
- break;
default:
goto cannot_emulate;
}
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
new file mode 100644
index 000000000000..b1947e0f3e10
--- /dev/null
+++ b/arch/x86/kvm/ioapic.c
@@ -0,0 +1,675 @@
+/*
+ * Copyright (C) 2001 MandrakeSoft S.A.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * MandrakeSoft S.A.
+ * 43, rue d'Aboukir
+ * 75002 Paris - France
+ * http://www.linux-mandrake.com/
+ * http://www.mandrakesoft.com/
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Yunhong Jiang <yunhong.jiang@intel.com>
+ * Yaozu (Eddie) Dong <eddie.dong@intel.com>
+ * Based on Xen 3.1 code.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+#include <linux/hrtimer.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <trace/events/kvm.h>
+
+#include "ioapic.h"
+#include "lapic.h"
+#include "irq.h"
+
+#if 0
+#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
+#else
+#define ioapic_debug(fmt, arg...)
+#endif
+static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
+ bool line_status);
+
+static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
+ unsigned long addr,
+ unsigned long length)
+{
+ unsigned long result = 0;
+
+ switch (ioapic->ioregsel) {
+ case IOAPIC_REG_VERSION:
+ result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16)
+ | (IOAPIC_VERSION_ID & 0xff));
+ break;
+
+ case IOAPIC_REG_APIC_ID:
+ case IOAPIC_REG_ARB_ID:
+ result = ((ioapic->id & 0xf) << 24);
+ break;
+
+ default:
+ {
+ u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
+ u64 redir_content;
+
+ if (redir_index < IOAPIC_NUM_PINS)
+ redir_content =
+ ioapic->redirtbl[redir_index].bits;
+ else
+ redir_content = ~0ULL;
+
+ result = (ioapic->ioregsel & 0x1) ?
+ (redir_content >> 32) & 0xffffffff :
+ redir_content & 0xffffffff;
+ break;
+ }
+ }
+
+ return result;
+}
+
+static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic)
+{
+ ioapic->rtc_status.pending_eoi = 0;
+ bitmap_zero(ioapic->rtc_status.dest_map, KVM_MAX_VCPUS);
+}
+
+static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic);
+
+static void rtc_status_pending_eoi_check_valid(struct kvm_ioapic *ioapic)
+{
+ if (WARN_ON(ioapic->rtc_status.pending_eoi < 0))
+ kvm_rtc_eoi_tracking_restore_all(ioapic);
+}
+
+static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
+{
+ bool new_val, old_val;
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+ union kvm_ioapic_redirect_entry *e;
+
+ e = &ioapic->redirtbl[RTC_GSI];
+ if (!kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id,
+ e->fields.dest_mode))
+ return;
+
+ new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector);
+ old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
+
+ if (new_val == old_val)
+ return;
+
+ if (new_val) {
+ __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
+ ioapic->rtc_status.pending_eoi++;
+ } else {
+ __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
+ ioapic->rtc_status.pending_eoi--;
+ rtc_status_pending_eoi_check_valid(ioapic);
+ }
+}
+
+void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
+{
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ __rtc_irq_eoi_tracking_restore_one(vcpu);
+ spin_unlock(&ioapic->lock);
+}
+
+static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ if (RTC_GSI >= IOAPIC_NUM_PINS)
+ return;
+
+ rtc_irq_eoi_tracking_reset(ioapic);
+ kvm_for_each_vcpu(i, vcpu, ioapic->kvm)
+ __rtc_irq_eoi_tracking_restore_one(vcpu);
+}
+
+static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu)
+{
+ if (test_and_clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map)) {
+ --ioapic->rtc_status.pending_eoi;
+ rtc_status_pending_eoi_check_valid(ioapic);
+ }
+}
+
+static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic)
+{
+ if (ioapic->rtc_status.pending_eoi > 0)
+ return true; /* coalesced */
+
+ return false;
+}
+
+static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
+ int irq_level, bool line_status)
+{
+ union kvm_ioapic_redirect_entry entry;
+ u32 mask = 1 << irq;
+ u32 old_irr;
+ int edge, ret;
+
+ entry = ioapic->redirtbl[irq];
+ edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
+
+ if (!irq_level) {
+ ioapic->irr &= ~mask;
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * Return 0 for coalesced interrupts; for edge-triggered interrupts,
+ * this only happens if a previous edge has not been delivered due
+ * do masking. For level interrupts, the remote_irr field tells
+ * us if the interrupt is waiting for an EOI.
+ *
+ * RTC is special: it is edge-triggered, but userspace likes to know
+ * if it has been already ack-ed via EOI because coalesced RTC
+ * interrupts lead to time drift in Windows guests. So we track
+ * EOI manually for the RTC interrupt.
+ */
+ if (irq == RTC_GSI && line_status &&
+ rtc_irq_check_coalesced(ioapic)) {
+ ret = 0;
+ goto out;
+ }
+
+ old_irr = ioapic->irr;
+ ioapic->irr |= mask;
+ if ((edge && old_irr == ioapic->irr) ||
+ (!edge && entry.fields.remote_irr)) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = ioapic_service(ioapic, irq, line_status);
+
+out:
+ trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
+ return ret;
+}
+
+static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
+{
+ u32 idx;
+
+ rtc_irq_eoi_tracking_reset(ioapic);
+ for_each_set_bit(idx, &irr, IOAPIC_NUM_PINS)
+ ioapic_set_irq(ioapic, idx, 1, true);
+
+ kvm_rtc_eoi_tracking_restore_all(ioapic);
+}
+
+
+static void update_handled_vectors(struct kvm_ioapic *ioapic)
+{
+ DECLARE_BITMAP(handled_vectors, 256);
+ int i;
+
+ memset(handled_vectors, 0, sizeof(handled_vectors));
+ for (i = 0; i < IOAPIC_NUM_PINS; ++i)
+ __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors);
+ memcpy(ioapic->handled_vectors, handled_vectors,
+ sizeof(handled_vectors));
+ smp_wmb();
+}
+
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
+ u32 *tmr)
+{
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+ union kvm_ioapic_redirect_entry *e;
+ int index;
+
+ spin_lock(&ioapic->lock);
+ for (index = 0; index < IOAPIC_NUM_PINS; index++) {
+ e = &ioapic->redirtbl[index];
+ if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
+ kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) ||
+ index == RTC_GSI) {
+ if (kvm_apic_match_dest(vcpu, NULL, 0,
+ e->fields.dest_id, e->fields.dest_mode)) {
+ __set_bit(e->fields.vector,
+ (unsigned long *)eoi_exit_bitmap);
+ if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG)
+ __set_bit(e->fields.vector,
+ (unsigned long *)tmr);
+ }
+ }
+ }
+ spin_unlock(&ioapic->lock);
+}
+
+void kvm_vcpu_request_scan_ioapic(struct kvm *kvm)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ if (!ioapic)
+ return;
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
+{
+ unsigned index;
+ bool mask_before, mask_after;
+ union kvm_ioapic_redirect_entry *e;
+
+ switch (ioapic->ioregsel) {
+ case IOAPIC_REG_VERSION:
+ /* Writes are ignored. */
+ break;
+
+ case IOAPIC_REG_APIC_ID:
+ ioapic->id = (val >> 24) & 0xf;
+ break;
+
+ case IOAPIC_REG_ARB_ID:
+ break;
+
+ default:
+ index = (ioapic->ioregsel - 0x10) >> 1;
+
+ ioapic_debug("change redir index %x val %x\n", index, val);
+ if (index >= IOAPIC_NUM_PINS)
+ return;
+ e = &ioapic->redirtbl[index];
+ mask_before = e->fields.mask;
+ if (ioapic->ioregsel & 1) {
+ e->bits &= 0xffffffff;
+ e->bits |= (u64) val << 32;
+ } else {
+ e->bits &= ~0xffffffffULL;
+ e->bits |= (u32) val;
+ e->fields.remote_irr = 0;
+ }
+ update_handled_vectors(ioapic);
+ mask_after = e->fields.mask;
+ if (mask_before != mask_after)
+ kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
+ if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
+ && ioapic->irr & (1 << index))
+ ioapic_service(ioapic, index, false);
+ kvm_vcpu_request_scan_ioapic(ioapic->kvm);
+ break;
+ }
+}
+
+static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
+{
+ union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
+ struct kvm_lapic_irq irqe;
+ int ret;
+
+ if (entry->fields.mask)
+ return -1;
+
+ ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
+ "vector=%x trig_mode=%x\n",
+ entry->fields.dest_id, entry->fields.dest_mode,
+ entry->fields.delivery_mode, entry->fields.vector,
+ entry->fields.trig_mode);
+
+ irqe.dest_id = entry->fields.dest_id;
+ irqe.vector = entry->fields.vector;
+ irqe.dest_mode = entry->fields.dest_mode;
+ irqe.trig_mode = entry->fields.trig_mode;
+ irqe.delivery_mode = entry->fields.delivery_mode << 8;
+ irqe.level = 1;
+ irqe.shorthand = 0;
+
+ if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
+ ioapic->irr &= ~(1 << irq);
+
+ if (irq == RTC_GSI && line_status) {
+ /*
+ * pending_eoi cannot ever become negative (see
+ * rtc_status_pending_eoi_check_valid) and the caller
+ * ensures that it is only called if it is >= zero, namely
+ * if rtc_irq_check_coalesced returns false).
+ */
+ BUG_ON(ioapic->rtc_status.pending_eoi != 0);
+ ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe,
+ ioapic->rtc_status.dest_map);
+ ioapic->rtc_status.pending_eoi = (ret < 0 ? 0 : ret);
+ } else
+ ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL);
+
+ if (ret && irqe.trig_mode == IOAPIC_LEVEL_TRIG)
+ entry->fields.remote_irr = 1;
+
+ return ret;
+}
+
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
+ int level, bool line_status)
+{
+ int ret, irq_level;
+
+ BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
+
+ spin_lock(&ioapic->lock);
+ irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
+ irq_source_id, level);
+ ret = ioapic_set_irq(ioapic, irq, irq_level, line_status);
+
+ spin_unlock(&ioapic->lock);
+
+ return ret;
+}
+
+void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
+{
+ int i;
+
+ spin_lock(&ioapic->lock);
+ for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
+ __clear_bit(irq_source_id, &ioapic->irq_states[i]);
+ spin_unlock(&ioapic->lock);
+}
+
+static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
+{
+ int i;
+ struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic,
+ eoi_inject.work);
+ spin_lock(&ioapic->lock);
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+ if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG)
+ continue;
+
+ if (ioapic->irr & (1 << i) && !ent->fields.remote_irr)
+ ioapic_service(ioapic, i, false);
+ }
+ spin_unlock(&ioapic->lock);
+}
+
+#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000
+
+static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
+ struct kvm_ioapic *ioapic, int vector, int trigger_mode)
+{
+ int i;
+
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+ if (ent->fields.vector != vector)
+ continue;
+
+ if (i == RTC_GSI)
+ rtc_irq_eoi(ioapic, vcpu);
+ /*
+ * We are dropping lock while calling ack notifiers because ack
+ * notifier callbacks for assigned devices call into IOAPIC
+ * recursively. Since remote_irr is cleared only after call
+ * to notifiers if the same vector will be delivered while lock
+ * is dropped it will be put into irr and will be delivered
+ * after ack notifier returns.
+ */
+ spin_unlock(&ioapic->lock);
+ kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i);
+ spin_lock(&ioapic->lock);
+
+ if (trigger_mode != IOAPIC_LEVEL_TRIG)
+ continue;
+
+ ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
+ ent->fields.remote_irr = 0;
+ if (!ent->fields.mask && (ioapic->irr & (1 << i))) {
+ ++ioapic->irq_eoi[i];
+ if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {
+ /*
+ * Real hardware does not deliver the interrupt
+ * immediately during eoi broadcast, and this
+ * lets a buggy guest make slow progress
+ * even if it does not correctly handle a
+ * level-triggered interrupt. Emulate this
+ * behavior if we detect an interrupt storm.
+ */
+ schedule_delayed_work(&ioapic->eoi_inject, HZ / 100);
+ ioapic->irq_eoi[i] = 0;
+ trace_kvm_ioapic_delayed_eoi_inj(ent->bits);
+ } else {
+ ioapic_service(ioapic, i, false);
+ }
+ } else {
+ ioapic->irq_eoi[i] = 0;
+ }
+ }
+}
+
+bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ smp_rmb();
+ return test_bit(vector, ioapic->handled_vectors);
+}
+
+void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode)
+{
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ __kvm_ioapic_update_eoi(vcpu, ioapic, vector, trigger_mode);
+ spin_unlock(&ioapic->lock);
+}
+
+static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_ioapic, dev);
+}
+
+static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr)
+{
+ return ((addr >= ioapic->base_address &&
+ (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
+}
+
+static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
+ void *val)
+{
+ struct kvm_ioapic *ioapic = to_ioapic(this);
+ u32 result;
+ if (!ioapic_in_range(ioapic, addr))
+ return -EOPNOTSUPP;
+
+ ioapic_debug("addr %lx\n", (unsigned long)addr);
+ ASSERT(!(addr & 0xf)); /* check alignment */
+
+ addr &= 0xff;
+ spin_lock(&ioapic->lock);
+ switch (addr) {
+ case IOAPIC_REG_SELECT:
+ result = ioapic->ioregsel;
+ break;
+
+ case IOAPIC_REG_WINDOW:
+ result = ioapic_read_indirect(ioapic, addr, len);
+ break;
+
+ default:
+ result = 0;
+ break;
+ }
+ spin_unlock(&ioapic->lock);
+
+ switch (len) {
+ case 8:
+ *(u64 *) val = result;
+ break;
+ case 1:
+ case 2:
+ case 4:
+ memcpy(val, (char *)&result, len);
+ break;
+ default:
+ printk(KERN_WARNING "ioapic: wrong length %d\n", len);
+ }
+ return 0;
+}
+
+static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
+ const void *val)
+{
+ struct kvm_ioapic *ioapic = to_ioapic(this);
+ u32 data;
+ if (!ioapic_in_range(ioapic, addr))
+ return -EOPNOTSUPP;
+
+ ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n",
+ (void*)addr, len, val);
+ ASSERT(!(addr & 0xf)); /* check alignment */
+
+ switch (len) {
+ case 8:
+ case 4:
+ data = *(u32 *) val;
+ break;
+ case 2:
+ data = *(u16 *) val;
+ break;
+ case 1:
+ data = *(u8 *) val;
+ break;
+ default:
+ printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
+ return 0;
+ }
+
+ addr &= 0xff;
+ spin_lock(&ioapic->lock);
+ switch (addr) {
+ case IOAPIC_REG_SELECT:
+ ioapic->ioregsel = data & 0xFF; /* 8-bit register */
+ break;
+
+ case IOAPIC_REG_WINDOW:
+ ioapic_write_indirect(ioapic, data);
+ break;
+
+ default:
+ break;
+ }
+ spin_unlock(&ioapic->lock);
+ return 0;
+}
+
+static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
+{
+ int i;
+
+ cancel_delayed_work_sync(&ioapic->eoi_inject);
+ for (i = 0; i < IOAPIC_NUM_PINS; i++)
+ ioapic->redirtbl[i].fields.mask = 1;
+ ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
+ ioapic->ioregsel = 0;
+ ioapic->irr = 0;
+ ioapic->id = 0;
+ memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
+ rtc_irq_eoi_tracking_reset(ioapic);
+ update_handled_vectors(ioapic);
+}
+
+static const struct kvm_io_device_ops ioapic_mmio_ops = {
+ .read = ioapic_mmio_read,
+ .write = ioapic_mmio_write,
+};
+
+int kvm_ioapic_init(struct kvm *kvm)
+{
+ struct kvm_ioapic *ioapic;
+ int ret;
+
+ ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
+ if (!ioapic)
+ return -ENOMEM;
+ spin_lock_init(&ioapic->lock);
+ INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
+ kvm->arch.vioapic = ioapic;
+ kvm_ioapic_reset(ioapic);
+ kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
+ ioapic->kvm = kvm;
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, ioapic->base_address,
+ IOAPIC_MEM_LENGTH, &ioapic->dev);
+ mutex_unlock(&kvm->slots_lock);
+ if (ret < 0) {
+ kvm->arch.vioapic = NULL;
+ kfree(ioapic);
+ }
+
+ return ret;
+}
+
+void kvm_ioapic_destroy(struct kvm *kvm)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ cancel_delayed_work_sync(&ioapic->eoi_inject);
+ if (ioapic) {
+ kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+ kvm->arch.vioapic = NULL;
+ kfree(ioapic);
+ }
+}
+
+int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+{
+ struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
+ if (!ioapic)
+ return -EINVAL;
+
+ spin_lock(&ioapic->lock);
+ memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
+ spin_unlock(&ioapic->lock);
+ return 0;
+}
+
+int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+{
+ struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
+ if (!ioapic)
+ return -EINVAL;
+
+ spin_lock(&ioapic->lock);
+ memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
+ ioapic->irr = 0;
+ update_handled_vectors(ioapic);
+ kvm_vcpu_request_scan_ioapic(kvm);
+ kvm_ioapic_inject_all(ioapic, state->irr);
+ spin_unlock(&ioapic->lock);
+ return 0;
+}
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
new file mode 100644
index 000000000000..3c9195535ffc
--- /dev/null
+++ b/arch/x86/kvm/ioapic.h
@@ -0,0 +1,119 @@
+#ifndef __KVM_IO_APIC_H
+#define __KVM_IO_APIC_H
+
+#include <linux/kvm_host.h>
+
+#include "iodev.h"
+
+struct kvm;
+struct kvm_vcpu;
+
+#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
+#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
+#define IOAPIC_EDGE_TRIG 0
+#define IOAPIC_LEVEL_TRIG 1
+
+#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
+#define IOAPIC_MEM_LENGTH 0x100
+
+/* Direct registers. */
+#define IOAPIC_REG_SELECT 0x00
+#define IOAPIC_REG_WINDOW 0x10
+
+/* Indirect registers. */
+#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
+#define IOAPIC_REG_VERSION 0x01
+#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
+
+/*ioapic delivery mode*/
+#define IOAPIC_FIXED 0x0
+#define IOAPIC_LOWEST_PRIORITY 0x1
+#define IOAPIC_PMI 0x2
+#define IOAPIC_NMI 0x4
+#define IOAPIC_INIT 0x5
+#define IOAPIC_EXTINT 0x7
+
+#ifdef CONFIG_X86
+#define RTC_GSI 8
+#else
+#define RTC_GSI -1U
+#endif
+
+struct rtc_status {
+ int pending_eoi;
+ DECLARE_BITMAP(dest_map, KVM_MAX_VCPUS);
+};
+
+union kvm_ioapic_redirect_entry {
+ u64 bits;
+ struct {
+ u8 vector;
+ u8 delivery_mode:3;
+ u8 dest_mode:1;
+ u8 delivery_status:1;
+ u8 polarity:1;
+ u8 remote_irr:1;
+ u8 trig_mode:1;
+ u8 mask:1;
+ u8 reserve:7;
+ u8 reserved[4];
+ u8 dest_id;
+ } fields;
+};
+
+struct kvm_ioapic {
+ u64 base_address;
+ u32 ioregsel;
+ u32 id;
+ u32 irr;
+ u32 pad;
+ union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS];
+ unsigned long irq_states[IOAPIC_NUM_PINS];
+ struct kvm_io_device dev;
+ struct kvm *kvm;
+ void (*ack_notifier)(void *opaque, int irq);
+ spinlock_t lock;
+ DECLARE_BITMAP(handled_vectors, 256);
+ struct rtc_status rtc_status;
+ struct delayed_work eoi_inject;
+ u32 irq_eoi[IOAPIC_NUM_PINS];
+};
+
+#ifdef DEBUG
+#define ASSERT(x) \
+do { \
+ if (!(x)) { \
+ printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
+ __FILE__, __LINE__, #x); \
+ BUG(); \
+ } \
+} while (0)
+#else
+#define ASSERT(x) do { } while (0)
+#endif
+
+static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
+{
+ return kvm->arch.vioapic;
+}
+
+void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
+int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+ int short_hand, unsigned int dest, int dest_mode);
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
+void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
+ int trigger_mode);
+bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector);
+int kvm_ioapic_init(struct kvm *kvm);
+void kvm_ioapic_destroy(struct kvm *kvm);
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
+ int level, bool line_status);
+void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, unsigned long *dest_map);
+int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
+ u32 *tmr);
+
+#endif
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
new file mode 100644
index 000000000000..17b73eeac8a4
--- /dev/null
+++ b/arch/x86/kvm/iommu.c
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) 2006-2008 Intel Corporation
+ * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Author: Allen M. Kay <allen.m.kay@intel.com>
+ * Author: Weidong Han <weidong.han@intel.com>
+ * Author: Ben-Ami Yassour <benami@il.ibm.com>
+ */
+
+#include <linux/list.h>
+#include <linux/kvm_host.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/stat.h>
+#include <linux/dmar.h>
+#include <linux/iommu.h>
+#include <linux/intel-iommu.h>
+#include "assigned-dev.h"
+
+static bool allow_unsafe_assigned_interrupts;
+module_param_named(allow_unsafe_assigned_interrupts,
+ allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(allow_unsafe_assigned_interrupts,
+ "Enable device assignment on platforms without interrupt remapping support.");
+
+static int kvm_iommu_unmap_memslots(struct kvm *kvm);
+static void kvm_iommu_put_pages(struct kvm *kvm,
+ gfn_t base_gfn, unsigned long npages);
+
+static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
+ unsigned long npages)
+{
+ gfn_t end_gfn;
+ pfn_t pfn;
+
+ pfn = gfn_to_pfn_memslot(slot, gfn);
+ end_gfn = gfn + npages;
+ gfn += 1;
+
+ if (is_error_noslot_pfn(pfn))
+ return pfn;
+
+ while (gfn < end_gfn)
+ gfn_to_pfn_memslot(slot, gfn++);
+
+ return pfn;
+}
+
+static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages)
+{
+ unsigned long i;
+
+ for (i = 0; i < npages; ++i)
+ kvm_release_pfn_clean(pfn + i);
+}
+
+int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+ gfn_t gfn, end_gfn;
+ pfn_t pfn;
+ int r = 0;
+ struct iommu_domain *domain = kvm->arch.iommu_domain;
+ int flags;
+
+ /* check if iommu exists and in use */
+ if (!domain)
+ return 0;
+
+ gfn = slot->base_gfn;
+ end_gfn = gfn + slot->npages;
+
+ flags = IOMMU_READ;
+ if (!(slot->flags & KVM_MEM_READONLY))
+ flags |= IOMMU_WRITE;
+ if (!kvm->arch.iommu_noncoherent)
+ flags |= IOMMU_CACHE;
+
+
+ while (gfn < end_gfn) {
+ unsigned long page_size;
+
+ /* Check if already mapped */
+ if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) {
+ gfn += 1;
+ continue;
+ }
+
+ /* Get the page size we could use to map */
+ page_size = kvm_host_page_size(kvm, gfn);
+
+ /* Make sure the page_size does not exceed the memslot */
+ while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn)
+ page_size >>= 1;
+
+ /* Make sure gfn is aligned to the page size we want to map */
+ while ((gfn << PAGE_SHIFT) & (page_size - 1))
+ page_size >>= 1;
+
+ /* Make sure hva is aligned to the page size we want to map */
+ while (__gfn_to_hva_memslot(slot, gfn) & (page_size - 1))
+ page_size >>= 1;
+
+ /*
+ * Pin all pages we are about to map in memory. This is
+ * important because we unmap and unpin in 4kb steps later.
+ */
+ pfn = kvm_pin_pages(slot, gfn, page_size >> PAGE_SHIFT);
+ if (is_error_noslot_pfn(pfn)) {
+ gfn += 1;
+ continue;
+ }
+
+ /* Map into IO address space */
+ r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn),
+ page_size, flags);
+ if (r) {
+ printk(KERN_ERR "kvm_iommu_map_address:"
+ "iommu failed to map pfn=%llx\n", pfn);
+ kvm_unpin_pages(kvm, pfn, page_size >> PAGE_SHIFT);
+ goto unmap_pages;
+ }
+
+ gfn += page_size >> PAGE_SHIFT;
+
+
+ }
+
+ return 0;
+
+unmap_pages:
+ kvm_iommu_put_pages(kvm, slot->base_gfn, gfn - slot->base_gfn);
+ return r;
+}
+
+static int kvm_iommu_map_memslots(struct kvm *kvm)
+{
+ int idx, r = 0;
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+
+ if (kvm->arch.iommu_noncoherent)
+ kvm_arch_register_noncoherent_dma(kvm);
+
+ idx = srcu_read_lock(&kvm->srcu);
+ slots = kvm_memslots(kvm);
+
+ kvm_for_each_memslot(memslot, slots) {
+ r = kvm_iommu_map_pages(kvm, memslot);
+ if (r)
+ break;
+ }
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return r;
+}
+
+int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev)
+{
+ struct iommu_domain *domain = kvm->arch.iommu_domain;
+ int r;
+ bool noncoherent;
+
+ /* check if iommu exists and in use */
+ if (!domain)
+ return 0;
+
+ if (pdev == NULL)
+ return -ENODEV;
+
+ r = iommu_attach_device(domain, &pdev->dev);
+ if (r) {
+ dev_err(&pdev->dev, "kvm assign device failed ret %d", r);
+ return r;
+ }
+
+ noncoherent = !iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY);
+
+ /* Check if need to update IOMMU page table for guest memory */
+ if (noncoherent != kvm->arch.iommu_noncoherent) {
+ kvm_iommu_unmap_memslots(kvm);
+ kvm->arch.iommu_noncoherent = noncoherent;
+ r = kvm_iommu_map_memslots(kvm);
+ if (r)
+ goto out_unmap;
+ }
+
+ pci_set_dev_assigned(pdev);
+
+ dev_info(&pdev->dev, "kvm assign device\n");
+
+ return 0;
+out_unmap:
+ kvm_iommu_unmap_memslots(kvm);
+ return r;
+}
+
+int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev)
+{
+ struct iommu_domain *domain = kvm->arch.iommu_domain;
+
+ /* check if iommu exists and in use */
+ if (!domain)
+ return 0;
+
+ if (pdev == NULL)
+ return -ENODEV;
+
+ iommu_detach_device(domain, &pdev->dev);
+
+ pci_clear_dev_assigned(pdev);
+
+ dev_info(&pdev->dev, "kvm deassign device\n");
+
+ return 0;
+}
+
+int kvm_iommu_map_guest(struct kvm *kvm)
+{
+ int r;
+
+ if (!iommu_present(&pci_bus_type)) {
+ printk(KERN_ERR "%s: iommu not found\n", __func__);
+ return -ENODEV;
+ }
+
+ mutex_lock(&kvm->slots_lock);
+
+ kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type);
+ if (!kvm->arch.iommu_domain) {
+ r = -ENOMEM;
+ goto out_unlock;
+ }
+
+ if (!allow_unsafe_assigned_interrupts &&
+ !iommu_capable(&pci_bus_type, IOMMU_CAP_INTR_REMAP)) {
+ printk(KERN_WARNING "%s: No interrupt remapping support,"
+ " disallowing device assignment."
+ " Re-enble with \"allow_unsafe_assigned_interrupts=1\""
+ " module option.\n", __func__);
+ iommu_domain_free(kvm->arch.iommu_domain);
+ kvm->arch.iommu_domain = NULL;
+ r = -EPERM;
+ goto out_unlock;
+ }
+
+ r = kvm_iommu_map_memslots(kvm);
+ if (r)
+ kvm_iommu_unmap_memslots(kvm);
+
+out_unlock:
+ mutex_unlock(&kvm->slots_lock);
+ return r;
+}
+
+static void kvm_iommu_put_pages(struct kvm *kvm,
+ gfn_t base_gfn, unsigned long npages)
+{
+ struct iommu_domain *domain;
+ gfn_t end_gfn, gfn;
+ pfn_t pfn;
+ u64 phys;
+
+ domain = kvm->arch.iommu_domain;
+ end_gfn = base_gfn + npages;
+ gfn = base_gfn;
+
+ /* check if iommu exists and in use */
+ if (!domain)
+ return;
+
+ while (gfn < end_gfn) {
+ unsigned long unmap_pages;
+ size_t size;
+
+ /* Get physical address */
+ phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
+
+ if (!phys) {
+ gfn++;
+ continue;
+ }
+
+ pfn = phys >> PAGE_SHIFT;
+
+ /* Unmap address from IO address space */
+ size = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE);
+ unmap_pages = 1ULL << get_order(size);
+
+ /* Unpin all pages we just unmapped to not leak any memory */
+ kvm_unpin_pages(kvm, pfn, unmap_pages);
+
+ gfn += unmap_pages;
+ }
+}
+
+void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+ kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages);
+}
+
+static int kvm_iommu_unmap_memslots(struct kvm *kvm)
+{
+ int idx;
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ slots = kvm_memslots(kvm);
+
+ kvm_for_each_memslot(memslot, slots)
+ kvm_iommu_unmap_pages(kvm, memslot);
+
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ if (kvm->arch.iommu_noncoherent)
+ kvm_arch_unregister_noncoherent_dma(kvm);
+
+ return 0;
+}
+
+int kvm_iommu_unmap_guest(struct kvm *kvm)
+{
+ struct iommu_domain *domain = kvm->arch.iommu_domain;
+
+ /* check if iommu exists and in use */
+ if (!domain)
+ return 0;
+
+ mutex_lock(&kvm->slots_lock);
+ kvm_iommu_unmap_memslots(kvm);
+ kvm->arch.iommu_domain = NULL;
+ kvm->arch.iommu_noncoherent = false;
+ mutex_unlock(&kvm->slots_lock);
+
+ iommu_domain_free(domain);
+ return 0;
+}
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
new file mode 100644
index 000000000000..72298b3ac025
--- /dev/null
+++ b/arch/x86/kvm/irq_comm.c
@@ -0,0 +1,332 @@
+/*
+ * irq_comm.c: Common API for in kernel interrupt controller
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <trace/events/kvm.h>
+
+#include <asm/msidef.h>
+
+#include "irq.h"
+
+#include "ioapic.h"
+
+static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ struct kvm_pic *pic = pic_irqchip(kvm);
+ return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
+}
+
+static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level,
+ line_status);
+}
+
+inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
+{
+ return irq->delivery_mode == APIC_DM_LOWEST;
+}
+
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, unsigned long *dest_map)
+{
+ int i, r = -1;
+ struct kvm_vcpu *vcpu, *lowest = NULL;
+
+ if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
+ kvm_is_dm_lowest_prio(irq)) {
+ printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
+ irq->delivery_mode = APIC_DM_FIXED;
+ }
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
+ return r;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (!kvm_is_dm_lowest_prio(irq)) {
+ if (r < 0)
+ r = 0;
+ r += kvm_apic_set_irq(vcpu, irq, dest_map);
+ } else if (kvm_lapic_enabled(vcpu)) {
+ if (!lowest)
+ lowest = vcpu;
+ else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
+ lowest = vcpu;
+ }
+ }
+
+ if (lowest)
+ r = kvm_apic_set_irq(lowest, irq, dest_map);
+
+ return r;
+}
+
+static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm_lapic_irq *irq)
+{
+ trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
+
+ irq->dest_id = (e->msi.address_lo &
+ MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
+ irq->vector = (e->msi.data &
+ MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
+ irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
+ irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
+ irq->delivery_mode = e->msi.data & 0x700;
+ irq->level = 1;
+ irq->shorthand = 0;
+ /* TODO Deal with RH bit of MSI message address */
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level, bool line_status)
+{
+ struct kvm_lapic_irq irq;
+
+ if (!level)
+ return -1;
+
+ kvm_set_msi_irq(e, &irq);
+
+ return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
+}
+
+
+static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm)
+{
+ struct kvm_lapic_irq irq;
+ int r;
+
+ kvm_set_msi_irq(e, &irq);
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
+ return r;
+ else
+ return -EWOULDBLOCK;
+}
+
+/*
+ * Deliver an IRQ in an atomic context if we can, or return a failure,
+ * user can retry in a process context.
+ * Return value:
+ * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
+ * Other values - No need to retry.
+ */
+int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
+{
+ struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
+ struct kvm_kernel_irq_routing_entry *e;
+ int ret = -EINVAL;
+ int idx;
+
+ trace_kvm_set_irq(irq, level, irq_source_id);
+
+ /*
+ * Injection into either PIC or IOAPIC might need to scan all CPUs,
+ * which would need to be retried from thread context; when same GSI
+ * is connected to both PIC and IOAPIC, we'd have to report a
+ * partial failure here.
+ * Since there's no easy way to do this, we only support injecting MSI
+ * which is limited to 1:1 GSI mapping.
+ */
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ if (kvm_irq_map_gsi(kvm, entries, irq) > 0) {
+ e = &entries[0];
+ if (likely(e->type == KVM_IRQ_ROUTING_MSI))
+ ret = kvm_set_msi_inatomic(e, kvm);
+ else
+ ret = -EWOULDBLOCK;
+ }
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+ return ret;
+}
+
+int kvm_request_irq_source_id(struct kvm *kvm)
+{
+ unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
+ int irq_source_id;
+
+ mutex_lock(&kvm->irq_lock);
+ irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
+
+ if (irq_source_id >= BITS_PER_LONG) {
+ printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n");
+ irq_source_id = -EFAULT;
+ goto unlock;
+ }
+
+ ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+ ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+ set_bit(irq_source_id, bitmap);
+unlock:
+ mutex_unlock(&kvm->irq_lock);
+
+ return irq_source_id;
+}
+
+void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
+{
+ ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+ ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+
+ mutex_lock(&kvm->irq_lock);
+ if (irq_source_id < 0 ||
+ irq_source_id >= BITS_PER_LONG) {
+ printk(KERN_ERR "kvm: IRQ source ID out of range!\n");
+ goto unlock;
+ }
+ clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
+ if (!irqchip_in_kernel(kvm))
+ goto unlock;
+
+ kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
+ kvm_pic_clear_all(pic_irqchip(kvm), irq_source_id);
+unlock:
+ mutex_unlock(&kvm->irq_lock);
+}
+
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ mutex_lock(&kvm->irq_lock);
+ kimn->irq = irq;
+ hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list);
+ mutex_unlock(&kvm->irq_lock);
+}
+
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ mutex_lock(&kvm->irq_lock);
+ hlist_del_rcu(&kimn->link);
+ mutex_unlock(&kvm->irq_lock);
+ synchronize_srcu(&kvm->irq_srcu);
+}
+
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask)
+{
+ struct kvm_irq_mask_notifier *kimn;
+ int idx, gsi;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
+ if (gsi != -1)
+ hlist_for_each_entry_rcu(kimn, &kvm->arch.mask_notifier_list, link)
+ if (kimn->irq == gsi)
+ kimn->func(kimn, mask);
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+{
+ int r = -EINVAL;
+ int delta;
+ unsigned max_pin;
+
+ switch (ue->type) {
+ case KVM_IRQ_ROUTING_IRQCHIP:
+ delta = 0;
+ switch (ue->u.irqchip.irqchip) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ e->set = kvm_set_pic_irq;
+ max_pin = PIC_NUM_PINS;
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ e->set = kvm_set_pic_irq;
+ max_pin = PIC_NUM_PINS;
+ delta = 8;
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ max_pin = KVM_IOAPIC_NUM_PINS;
+ e->set = kvm_set_ioapic_irq;
+ break;
+ default:
+ goto out;
+ }
+ e->irqchip.irqchip = ue->u.irqchip.irqchip;
+ e->irqchip.pin = ue->u.irqchip.pin + delta;
+ if (e->irqchip.pin >= max_pin)
+ goto out;
+ break;
+ case KVM_IRQ_ROUTING_MSI:
+ e->set = kvm_set_msi;
+ e->msi.address_lo = ue->u.msi.address_lo;
+ e->msi.address_hi = ue->u.msi.address_hi;
+ e->msi.data = ue->u.msi.data;
+ break;
+ default:
+ goto out;
+ }
+
+ r = 0;
+out:
+ return r;
+}
+
+#define IOAPIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
+#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
+
+#define PIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
+#define ROUTING_ENTRY2(irq) \
+ IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
+
+static const struct kvm_irq_routing_entry default_routing[] = {
+ ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
+ ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
+ ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
+ ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
+ ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
+ ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
+ ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
+ ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
+ ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
+ ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
+ ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
+ ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
+};
+
+int kvm_setup_default_irq_routing(struct kvm *kvm)
+{
+ return kvm_set_irq_routing(kvm, default_routing,
+ ARRAY_SIZE(default_routing), 0);
+}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index b8345dd41b25..4f0c0b954686 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -68,6 +68,9 @@
#define MAX_APIC_VECTOR 256
#define APIC_VECTORS_PER_REG 32
+#define APIC_BROADCAST 0xFF
+#define X2APIC_BROADCAST 0xFFFFFFFFul
+
#define VEC_POS(v) ((v) & (32 - 1))
#define REG_POS(v) (((v) >> 5) << 4)
@@ -129,8 +132,6 @@ static inline int kvm_apic_id(struct kvm_lapic *apic)
return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
}
-#define KVM_X2APIC_CID_BITS 0
-
static void recalculate_apic_map(struct kvm *kvm)
{
struct kvm_apic_map *new, *old = NULL;
@@ -149,42 +150,56 @@ static void recalculate_apic_map(struct kvm *kvm)
new->cid_shift = 8;
new->cid_mask = 0;
new->lid_mask = 0xff;
+ new->broadcast = APIC_BROADCAST;
kvm_for_each_vcpu(i, vcpu, kvm) {
struct kvm_lapic *apic = vcpu->arch.apic;
- u16 cid, lid;
- u32 ldr;
if (!kvm_apic_present(vcpu))
continue;
+ if (apic_x2apic_mode(apic)) {
+ new->ldr_bits = 32;
+ new->cid_shift = 16;
+ new->cid_mask = new->lid_mask = 0xffff;
+ new->broadcast = X2APIC_BROADCAST;
+ } else if (kvm_apic_get_reg(apic, APIC_LDR)) {
+ if (kvm_apic_get_reg(apic, APIC_DFR) ==
+ APIC_DFR_CLUSTER) {
+ new->cid_shift = 4;
+ new->cid_mask = 0xf;
+ new->lid_mask = 0xf;
+ } else {
+ new->cid_shift = 8;
+ new->cid_mask = 0;
+ new->lid_mask = 0xff;
+ }
+ }
+
/*
* All APICs have to be configured in the same mode by an OS.
* We take advatage of this while building logical id loockup
- * table. After reset APICs are in xapic/flat mode, so if we
- * find apic with different setting we assume this is the mode
+ * table. After reset APICs are in software disabled mode, so if
+ * we find apic with different setting we assume this is the mode
* OS wants all apics to be in; build lookup table accordingly.
*/
- if (apic_x2apic_mode(apic)) {
- new->ldr_bits = 32;
- new->cid_shift = 16;
- new->cid_mask = (1 << KVM_X2APIC_CID_BITS) - 1;
- new->lid_mask = 0xffff;
- } else if (kvm_apic_sw_enabled(apic) &&
- !new->cid_mask /* flat mode */ &&
- kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_CLUSTER) {
- new->cid_shift = 4;
- new->cid_mask = 0xf;
- new->lid_mask = 0xf;
- }
+ if (kvm_apic_sw_enabled(apic))
+ break;
+ }
- new->phys_map[kvm_apic_id(apic)] = apic;
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u16 cid, lid;
+ u32 ldr, aid;
+ aid = kvm_apic_id(apic);
ldr = kvm_apic_get_reg(apic, APIC_LDR);
cid = apic_cluster_id(new, ldr);
lid = apic_logical_id(new, ldr);
- if (lid)
+ if (aid < ARRAY_SIZE(new->phys_map))
+ new->phys_map[aid] = apic;
+ if (lid && cid < ARRAY_SIZE(new->logical_map))
new->logical_map[cid][ffs(lid) - 1] = apic;
}
out:
@@ -201,11 +216,13 @@ out:
static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
{
- u32 prev = kvm_apic_get_reg(apic, APIC_SPIV);
+ bool enabled = val & APIC_SPIV_APIC_ENABLED;
apic_set_reg(apic, APIC_SPIV, val);
- if ((prev ^ val) & APIC_SPIV_APIC_ENABLED) {
- if (val & APIC_SPIV_APIC_ENABLED) {
+
+ if (enabled != apic->sw_enabled) {
+ apic->sw_enabled = enabled;
+ if (enabled) {
static_key_slow_dec_deferred(&apic_sw_disabled);
recalculate_apic_map(apic->vcpu->kvm);
} else
@@ -237,21 +254,17 @@ static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
{
- return ((kvm_apic_get_reg(apic, APIC_LVTT) &
- apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT);
+ return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_ONESHOT;
}
static inline int apic_lvtt_period(struct kvm_lapic *apic)
{
- return ((kvm_apic_get_reg(apic, APIC_LVTT) &
- apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC);
+ return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_PERIODIC;
}
static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
{
- return ((kvm_apic_get_reg(apic, APIC_LVTT) &
- apic->lapic_timer.timer_mode_mask) ==
- APIC_LVT_TIMER_TSCDEADLINE);
+ return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_TSCDEADLINE;
}
static inline int apic_lvt_nmi_mode(u32 lvt_val)
@@ -326,8 +339,12 @@ EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
static inline void apic_set_irr(int vec, struct kvm_lapic *apic)
{
- apic->irr_pending = true;
apic_set_vector(vec, apic->regs + APIC_IRR);
+ /*
+ * irr_pending must be true if any interrupt is pending; set it after
+ * APIC_IRR to avoid race with apic_clear_irr
+ */
+ apic->irr_pending = true;
}
static inline int apic_search_irr(struct kvm_lapic *apic)
@@ -359,13 +376,15 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
vcpu = apic->vcpu;
- apic_clear_vector(vec, apic->regs + APIC_IRR);
- if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+ if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) {
/* try to update RVI */
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
kvm_make_request(KVM_REQ_EVENT, vcpu);
- else {
- vec = apic_search_irr(apic);
- apic->irr_pending = (vec != -1);
+ } else {
+ apic->irr_pending = false;
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
+ if (apic_search_irr(apic) != -1)
+ apic->irr_pending = true;
}
}
@@ -558,16 +577,25 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
apic_update_ppr(apic);
}
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
+static int kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest)
+{
+ return dest == (apic_x2apic_mode(apic) ?
+ X2APIC_BROADCAST : APIC_BROADCAST);
+}
+
+int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest)
{
- return dest == 0xff || kvm_apic_id(apic) == dest;
+ return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest);
}
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
+int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
{
int result = 0;
u32 logical_id;
+ if (kvm_apic_broadcast(apic, mda))
+ return 1;
+
if (apic_x2apic_mode(apic)) {
logical_id = kvm_apic_get_reg(apic, APIC_LDR);
return logical_id & mda;
@@ -595,7 +623,7 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
}
int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
- int short_hand, int dest, int dest_mode)
+ int short_hand, unsigned int dest, int dest_mode)
{
int result = 0;
struct kvm_lapic *target = vcpu->arch.apic;
@@ -657,15 +685,24 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
if (!map)
goto out;
+ if (irq->dest_id == map->broadcast)
+ goto out;
+
+ ret = true;
+
if (irq->dest_mode == 0) { /* physical mode */
- if (irq->delivery_mode == APIC_DM_LOWEST ||
- irq->dest_id == 0xff)
+ if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
goto out;
- dst = &map->phys_map[irq->dest_id & 0xff];
+
+ dst = &map->phys_map[irq->dest_id];
} else {
u32 mda = irq->dest_id << (32 - map->ldr_bits);
+ u16 cid = apic_cluster_id(map, mda);
+
+ if (cid >= ARRAY_SIZE(map->logical_map))
+ goto out;
- dst = map->logical_map[apic_cluster_id(map, mda)];
+ dst = map->logical_map[cid];
bitmap = apic_logical_id(map, mda);
@@ -691,8 +728,6 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
*r = 0;
*r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
}
-
- ret = true;
out:
rcu_read_unlock();
return ret;
@@ -1034,6 +1069,26 @@ static void update_divide_count(struct kvm_lapic *apic)
apic->divide_count);
}
+static void apic_timer_expired(struct kvm_lapic *apic)
+{
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ wait_queue_head_t *q = &vcpu->wq;
+
+ /*
+ * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
+ * vcpu_enter_guest.
+ */
+ if (atomic_read(&apic->lapic_timer.pending))
+ return;
+
+ atomic_inc(&apic->lapic_timer.pending);
+ /* FIXME: this code should not know anything about vcpus */
+ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+
+ if (waitqueue_active(q))
+ wake_up_interruptible(q);
+}
+
static void start_apic_timer(struct kvm_lapic *apic)
{
ktime_t now;
@@ -1096,9 +1151,10 @@ static void start_apic_timer(struct kvm_lapic *apic)
if (likely(tscdeadline > guest_tsc)) {
ns = (tscdeadline - guest_tsc) * 1000000ULL;
do_div(ns, this_tsc_khz);
- }
- hrtimer_start(&apic->lapic_timer.timer,
- ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
+ hrtimer_start(&apic->lapic_timer.timer,
+ ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
+ } else
+ apic_timer_expired(apic);
local_irq_restore(flags);
}
@@ -1203,17 +1259,20 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
break;
- case APIC_LVTT:
- if ((kvm_apic_get_reg(apic, APIC_LVTT) &
- apic->lapic_timer.timer_mode_mask) !=
- (val & apic->lapic_timer.timer_mode_mask))
+ case APIC_LVTT: {
+ u32 timer_mode = val & apic->lapic_timer.timer_mode_mask;
+
+ if (apic->lapic_timer.timer_mode != timer_mode) {
+ apic->lapic_timer.timer_mode = timer_mode;
hrtimer_cancel(&apic->lapic_timer.timer);
+ }
if (!kvm_apic_sw_enabled(apic))
val |= APIC_LVT_MASKED;
val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
apic_set_reg(apic, APIC_LVTT, val);
break;
+ }
case APIC_TMICT:
if (apic_lvtt_tscdeadline(apic))
@@ -1320,7 +1379,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE))
static_key_slow_dec_deferred(&apic_hw_disabled);
- if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED))
+ if (!apic->sw_enabled)
static_key_slow_dec_deferred(&apic_sw_disabled);
if (apic->regs)
@@ -1355,9 +1414,6 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
return;
hrtimer_cancel(&apic->lapic_timer.timer);
- /* Inject here so clearing tscdeadline won't override new value */
- if (apic_has_pending_timer(vcpu))
- kvm_inject_apic_timer_irqs(vcpu);
apic->lapic_timer.tscdeadline = data;
start_apic_timer(apic);
}
@@ -1422,6 +1478,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
apic->base_address = apic->vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
+ if ((value & MSR_IA32_APICBASE_ENABLE) &&
+ apic->base_address != APIC_DEFAULT_PHYS_BASE)
+ pr_warn_once("APIC base relocation is unsupported by KVM");
+
/* with FSB delivery interrupt, we can restart APIC functionality */
apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
"0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
@@ -1447,6 +1507,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
for (i = 0; i < APIC_LVT_NUM; i++)
apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
+ apic->lapic_timer.timer_mode = 0;
apic_set_reg(apic, APIC_LVT0,
SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
@@ -1538,23 +1599,8 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
{
struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer);
- struct kvm_vcpu *vcpu = apic->vcpu;
- wait_queue_head_t *q = &vcpu->wq;
-
- /*
- * There is a race window between reading and incrementing, but we do
- * not care about potentially losing timer events in the !reinject
- * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
- * in vcpu_enter_guest.
- */
- if (!atomic_read(&ktimer->pending)) {
- atomic_inc(&ktimer->pending);
- /* FIXME: this code should not know anything about vcpus */
- kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
- }
- if (waitqueue_active(q))
- wake_up_interruptible(q);
+ apic_timer_expired(apic);
if (lapic_is_periodic(apic)) {
hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
@@ -1693,6 +1739,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm) ?
1 : count_vectors(apic->regs + APIC_ISR);
apic->highest_isr_cache = -1;
+ if (kvm_x86_ops->hwapic_irr_update)
+ kvm_x86_ops->hwapic_irr_update(vcpu,
+ apic_find_highest_irr(apic));
kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic));
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_rtc_eoi_tracking_restore_one(vcpu);
@@ -1837,8 +1886,11 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
return 1;
+ if (reg == APIC_ICR2)
+ return 1;
+
/* if this is ICR write vector before command */
- if (msr == 0x830)
+ if (reg == APIC_ICR)
apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
return apic_reg_write(apic, reg, (u32)data);
}
@@ -1851,9 +1903,15 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
return 1;
+ if (reg == APIC_DFR || reg == APIC_ICR2) {
+ apic_debug("KVM_APIC_READ: read x2apic reserved register %x\n",
+ reg);
+ return 1;
+ }
+
if (apic_reg_read(apic, reg, 4, &low))
return 1;
- if (msr == 0x830)
+ if (reg == APIC_ICR)
apic_reg_read(apic, APIC_ICR2, 4, &high);
*data = (((u64)high) << 32) | low;
@@ -1908,7 +1966,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- unsigned int sipi_vector;
+ u8 sipi_vector;
unsigned long pe;
if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events)
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 6a11845fd8b9..c674fce53cf9 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -11,6 +11,7 @@
struct kvm_timer {
struct hrtimer timer;
s64 period; /* unit: ns */
+ u32 timer_mode;
u32 timer_mode_mask;
u64 tscdeadline;
atomic_t pending; /* accumulated triggered timers */
@@ -22,6 +23,7 @@ struct kvm_lapic {
struct kvm_timer lapic_timer;
u32 divide_count;
struct kvm_vcpu *vcpu;
+ bool sw_enabled;
bool irr_pending;
/* Number of bits set in ISR. */
s16 isr_count;
@@ -55,8 +57,8 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu);
void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
+int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest);
+int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
unsigned long *dest_map);
int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
@@ -119,11 +121,11 @@ static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic)
extern struct static_key_deferred apic_sw_disabled;
-static inline int kvm_apic_sw_enabled(struct kvm_lapic *apic)
+static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic)
{
if (static_key_false(&apic_sw_disabled.key))
- return kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
- return APIC_SPIV_APIC_ENABLED;
+ return apic->sw_enabled;
+ return true;
}
static inline bool kvm_apic_present(struct kvm_vcpu *vcpu)
@@ -152,8 +154,6 @@ static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
ldr >>= 32 - map->ldr_bits;
cid = (ldr >> map->cid_shift) & map->cid_mask;
- BUG_ON(cid >= ARRAY_SIZE(map->logical_map));
-
return cid;
}
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 978f402006ee..10fbed126b11 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -214,13 +214,12 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
#define MMIO_GEN_LOW_SHIFT 10
#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2)
#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1)
-#define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1)
static u64 generation_mmio_spte_mask(unsigned int gen)
{
u64 mask;
- WARN_ON(gen > MMIO_MAX_GEN);
+ WARN_ON(gen & ~MMIO_GEN_MASK);
mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
@@ -263,13 +262,13 @@ static bool is_mmio_spte(u64 spte)
static gfn_t get_mmio_spte_gfn(u64 spte)
{
- u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
+ u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask;
return (spte & ~mask) >> PAGE_SHIFT;
}
static unsigned get_mmio_spte_access(u64 spte)
{
- u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
+ u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask;
return (spte & ~mask) & ~PAGE_MASK;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7527cefc5a43..41dd0387cccb 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1056,9 +1056,11 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho
{
struct vcpu_svm *svm = to_svm(vcpu);
- WARN_ON(adjustment < 0);
- if (host)
- adjustment = svm_scale_tsc(vcpu, adjustment);
+ if (host) {
+ if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
+ WARN_ON(adjustment < 0);
+ adjustment = svm_scale_tsc(vcpu, (u64)adjustment);
+ }
svm->vmcb->control.tsc_offset += adjustment;
if (is_guest_mode(vcpu))
@@ -2999,7 +3001,6 @@ static int dr_interception(struct vcpu_svm *svm)
{
int reg, dr;
unsigned long val;
- int err;
if (svm->vcpu.guest_debug == 0) {
/*
@@ -3019,12 +3020,15 @@ static int dr_interception(struct vcpu_svm *svm)
dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
if (dr >= 16) { /* mov to DRn */
+ if (!kvm_require_dr(&svm->vcpu, dr - 16))
+ return 1;
val = kvm_register_read(&svm->vcpu, reg);
kvm_set_dr(&svm->vcpu, dr - 16, val);
} else {
- err = kvm_get_dr(&svm->vcpu, dr, &val);
- if (!err)
- kvm_register_write(&svm->vcpu, reg, val);
+ if (!kvm_require_dr(&svm->vcpu, dr))
+ return 1;
+ kvm_get_dr(&svm->vcpu, dr, &val);
+ kvm_register_write(&svm->vcpu, reg, val);
}
skip_emulated_instruction(&svm->vcpu);
@@ -4123,6 +4127,11 @@ static bool svm_mpx_supported(void)
return false;
}
+static bool svm_xsaves_supported(void)
+{
+ return false;
+}
+
static bool svm_has_wbinvd_exit(void)
{
return true;
@@ -4410,6 +4419,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.rdtscp_supported = svm_rdtscp_supported,
.invpcid_supported = svm_invpcid_supported,
.mpx_supported = svm_mpx_supported,
+ .xsaves_supported = svm_xsaves_supported,
.set_supported_cpuid = svm_set_supported_cpuid,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 6b06ab8748dd..c2a34bb5ad93 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -5,6 +5,7 @@
#include <asm/vmx.h>
#include <asm/svm.h>
#include <asm/clocksource.h>
+#include <asm/pvclock-abi.h>
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm
@@ -877,6 +878,42 @@ TRACE_EVENT(kvm_ple_window,
#define trace_kvm_ple_window_shrink(vcpu_id, new, old) \
trace_kvm_ple_window(false, vcpu_id, new, old)
+TRACE_EVENT(kvm_pvclock_update,
+ TP_PROTO(unsigned int vcpu_id, struct pvclock_vcpu_time_info *pvclock),
+ TP_ARGS(vcpu_id, pvclock),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( __u32, version )
+ __field( __u64, tsc_timestamp )
+ __field( __u64, system_time )
+ __field( __u32, tsc_to_system_mul )
+ __field( __s8, tsc_shift )
+ __field( __u8, flags )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->version = pvclock->version;
+ __entry->tsc_timestamp = pvclock->tsc_timestamp;
+ __entry->system_time = pvclock->system_time;
+ __entry->tsc_to_system_mul = pvclock->tsc_to_system_mul;
+ __entry->tsc_shift = pvclock->tsc_shift;
+ __entry->flags = pvclock->flags;
+ ),
+
+ TP_printk("vcpu_id %u, pvclock { version %u, tsc_timestamp 0x%llx, "
+ "system_time 0x%llx, tsc_to_system_mul 0x%x, tsc_shift %d, "
+ "flags 0x%x }",
+ __entry->vcpu_id,
+ __entry->version,
+ __entry->tsc_timestamp,
+ __entry->system_time,
+ __entry->tsc_to_system_mul,
+ __entry->tsc_shift,
+ __entry->flags)
+);
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3e556c68351b..feb852b04598 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -99,13 +99,15 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
static bool __read_mostly nested = 0;
module_param(nested, bool, S_IRUGO);
+static u64 __read_mostly host_xss;
+
#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
#define KVM_VM_CR0_ALWAYS_ON \
(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
#define KVM_CR4_GUEST_OWNED_BITS \
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXMMEXCPT)
+ | X86_CR4_OSXMMEXCPT | X86_CR4_TSD)
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
@@ -214,6 +216,7 @@ struct __packed vmcs12 {
u64 virtual_apic_page_addr;
u64 apic_access_addr;
u64 ept_pointer;
+ u64 xss_exit_bitmap;
u64 guest_physical_address;
u64 vmcs_link_pointer;
u64 guest_ia32_debugctl;
@@ -616,6 +619,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
FIELD64(EPT_POINTER, ept_pointer),
+ FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
@@ -720,12 +724,15 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD(HOST_RSP, host_rsp),
FIELD(HOST_RIP, host_rip),
};
-static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table);
static inline short vmcs_field_to_offset(unsigned long field)
{
- if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0)
- return -1;
+ BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
+
+ if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) ||
+ vmcs_field_to_offset_table[field] == 0)
+ return -ENOENT;
+
return vmcs_field_to_offset_table[field];
}
@@ -758,6 +765,7 @@ static u64 construct_eptp(unsigned long root_hpa);
static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
static bool vmx_mpx_supported(void);
+static bool vmx_xsaves_supported(void);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -1098,6 +1106,12 @@ static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
}
+static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES) &&
+ vmx_xsaves_supported();
+}
+
static inline bool is_exception(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -1659,12 +1673,20 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
clear_atomic_switch_msr(vmx, MSR_EFER);
- /* On ept, can't emulate nx, and must switch nx atomically */
- if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) {
+
+ /*
+ * On EPT, we can't emulate NX, so we must switch EFER atomically.
+ * On CPUs that support "load IA32_EFER", always switch EFER
+ * atomically, since it's faster than switching it manually.
+ */
+ if (cpu_has_load_ia32_efer ||
+ (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
guest_efer = vmx->vcpu.arch.efer;
if (!(guest_efer & EFER_LMA))
guest_efer &= ~EFER_LME;
- add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
+ if (guest_efer != host_efer)
+ add_atomic_switch_msr(vmx, MSR_EFER,
+ guest_efer, host_efer);
return false;
}
@@ -2377,12 +2399,13 @@ static __init void nested_vmx_setup_ctls_msrs(void)
nested_vmx_secondary_ctls_low = 0;
nested_vmx_secondary_ctls_high &=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
- SECONDARY_EXEC_UNRESTRICTED_GUEST |
- SECONDARY_EXEC_WBINVD_EXITING;
+ SECONDARY_EXEC_WBINVD_EXITING |
+ SECONDARY_EXEC_XSAVES;
if (enable_ept) {
/* nested EPT: emulate EPT also to L1 */
- nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
+ nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT |
+ SECONDARY_EXEC_UNRESTRICTED_GUEST;
nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
VMX_EPT_INVEPT_BIT;
@@ -2558,6 +2581,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
if (!nested_vmx_allowed(vcpu))
return 1;
return vmx_get_vmx_msr(vcpu, msr_index, pdata);
+ case MSR_IA32_XSS:
+ if (!vmx_xsaves_supported())
+ return 1;
+ data = vcpu->arch.ia32_xss;
+ break;
case MSR_TSC_AUX:
if (!to_vmx(vcpu)->rdtscp_enabled)
return 1;
@@ -2649,6 +2677,22 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
return 1; /* they are read-only */
+ case MSR_IA32_XSS:
+ if (!vmx_xsaves_supported())
+ return 1;
+ /*
+ * The only supported bit as of Skylake is bit 8, but
+ * it is not supported on KVM.
+ */
+ if (data != 0)
+ return 1;
+ vcpu->arch.ia32_xss = data;
+ if (vcpu->arch.ia32_xss != host_xss)
+ add_atomic_switch_msr(vmx, MSR_IA32_XSS,
+ vcpu->arch.ia32_xss, host_xss);
+ else
+ clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
+ break;
case MSR_TSC_AUX:
if (!vmx->rdtscp_enabled)
return 1;
@@ -2884,7 +2928,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
- SECONDARY_EXEC_SHADOW_VMCS;
+ SECONDARY_EXEC_SHADOW_VMCS |
+ SECONDARY_EXEC_XSAVES;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -3007,6 +3052,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
}
}
+ if (cpu_has_xsaves)
+ rdmsrl(MSR_IA32_XSS, host_xss);
+
return 0;
}
@@ -3110,76 +3158,6 @@ static __init int alloc_kvm_area(void)
return 0;
}
-static __init int hardware_setup(void)
-{
- if (setup_vmcs_config(&vmcs_config) < 0)
- return -EIO;
-
- if (boot_cpu_has(X86_FEATURE_NX))
- kvm_enable_efer_bits(EFER_NX);
-
- if (!cpu_has_vmx_vpid())
- enable_vpid = 0;
- if (!cpu_has_vmx_shadow_vmcs())
- enable_shadow_vmcs = 0;
- if (enable_shadow_vmcs)
- init_vmcs_shadow_fields();
-
- if (!cpu_has_vmx_ept() ||
- !cpu_has_vmx_ept_4levels()) {
- enable_ept = 0;
- enable_unrestricted_guest = 0;
- enable_ept_ad_bits = 0;
- }
-
- if (!cpu_has_vmx_ept_ad_bits())
- enable_ept_ad_bits = 0;
-
- if (!cpu_has_vmx_unrestricted_guest())
- enable_unrestricted_guest = 0;
-
- if (!cpu_has_vmx_flexpriority()) {
- flexpriority_enabled = 0;
-
- /*
- * set_apic_access_page_addr() is used to reload apic access
- * page upon invalidation. No need to do anything if the
- * processor does not have the APIC_ACCESS_ADDR VMCS field.
- */
- kvm_x86_ops->set_apic_access_page_addr = NULL;
- }
-
- if (!cpu_has_vmx_tpr_shadow())
- kvm_x86_ops->update_cr8_intercept = NULL;
-
- if (enable_ept && !cpu_has_vmx_ept_2m_page())
- kvm_disable_largepages();
-
- if (!cpu_has_vmx_ple())
- ple_gap = 0;
-
- if (!cpu_has_vmx_apicv())
- enable_apicv = 0;
-
- if (enable_apicv)
- kvm_x86_ops->update_cr8_intercept = NULL;
- else {
- kvm_x86_ops->hwapic_irr_update = NULL;
- kvm_x86_ops->deliver_posted_interrupt = NULL;
- kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
- }
-
- if (nested)
- nested_vmx_setup_ctls_msrs();
-
- return alloc_kvm_area();
-}
-
-static __exit void hardware_unsetup(void)
-{
- free_kvm_area();
-}
-
static bool emulation_required(struct kvm_vcpu *vcpu)
{
return emulate_invalid_guest_state && !guest_state_valid(vcpu);
@@ -4396,6 +4374,7 @@ static void ept_set_mmio_spte_mask(void)
kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
}
+#define VMX_XSS_EXIT_BITMAP 0
/*
* Sets up the vmcs for emulated real mode.
*/
@@ -4505,6 +4484,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
set_cr4_guest_host_mask(vmx);
+ if (vmx_xsaves_supported())
+ vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
+
return 0;
}
@@ -5163,13 +5145,20 @@ static int handle_cr(struct kvm_vcpu *vcpu)
static int handle_dr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
- int dr, reg;
+ int dr, dr7, reg;
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
+
+ /* First, if DR does not exist, trigger UD */
+ if (!kvm_require_dr(vcpu, dr))
+ return 1;
/* Do not handle if the CPL > 0, will trigger GP on re-entry */
if (!kvm_require_cpl(vcpu, 0))
return 1;
- dr = vmcs_readl(GUEST_DR7);
- if (dr & DR7_GD) {
+ dr7 = vmcs_readl(GUEST_DR7);
+ if (dr7 & DR7_GD) {
/*
* As the vm-exit takes precedence over the debug trap, we
* need to emulate the latter, either for the host or the
@@ -5177,17 +5166,14 @@ static int handle_dr(struct kvm_vcpu *vcpu)
*/
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
- vcpu->run->debug.arch.dr7 = dr;
- vcpu->run->debug.arch.pc =
- vmcs_readl(GUEST_CS_BASE) +
- vmcs_readl(GUEST_RIP);
+ vcpu->run->debug.arch.dr7 = dr7;
+ vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
vcpu->run->debug.arch.exception = DB_VECTOR;
vcpu->run->exit_reason = KVM_EXIT_DEBUG;
return 0;
} else {
- vcpu->arch.dr7 &= ~DR7_GD;
+ vcpu->arch.dr6 &= ~15;
vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
- vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
kvm_queue_exception(vcpu, DB_VECTOR);
return 1;
}
@@ -5209,8 +5195,6 @@ static int handle_dr(struct kvm_vcpu *vcpu)
return 1;
}
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
reg = DEBUG_REG_ACCESS_REG(exit_qualification);
if (exit_qualification & TYPE_MOV_FROM_DR) {
unsigned long val;
@@ -5391,6 +5375,20 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_xsaves(struct kvm_vcpu *vcpu)
+{
+ skip_emulated_instruction(vcpu);
+ WARN(1, "this should never happen\n");
+ return 1;
+}
+
+static int handle_xrstors(struct kvm_vcpu *vcpu)
+{
+ skip_emulated_instruction(vcpu);
+ WARN(1, "this should never happen\n");
+ return 1;
+}
+
static int handle_apic_access(struct kvm_vcpu *vcpu)
{
if (likely(fasteoi)) {
@@ -5492,7 +5490,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
}
/* clear all local breakpoint enable flags */
- vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55);
+ vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x155);
/*
* TODO: What about debug traps on tss switch?
@@ -5539,11 +5537,11 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
trace_kvm_page_fault(gpa, exit_qualification);
/* It is a write fault? */
- error_code = exit_qualification & (1U << 1);
+ error_code = exit_qualification & PFERR_WRITE_MASK;
/* It is a fetch fault? */
- error_code |= (exit_qualification & (1U << 2)) << 2;
+ error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK;
/* ept page table is present? */
- error_code |= (exit_qualification >> 3) & 0x1;
+ error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK;
vcpu->arch.exit_qualification = exit_qualification;
@@ -5785,6 +5783,204 @@ static void update_ple_window_actual_max(void)
ple_window_grow, INT_MIN);
}
+static __init int hardware_setup(void)
+{
+ int r = -ENOMEM, i, msr;
+
+ rdmsrl_safe(MSR_EFER, &host_efer);
+
+ for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
+ kvm_define_shared_msr(i, vmx_msr_index[i]);
+
+ vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_io_bitmap_a)
+ return r;
+
+ vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_io_bitmap_b)
+ goto out;
+
+ vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_legacy)
+ goto out1;
+
+ vmx_msr_bitmap_legacy_x2apic =
+ (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_legacy_x2apic)
+ goto out2;
+
+ vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_longmode)
+ goto out3;
+
+ vmx_msr_bitmap_longmode_x2apic =
+ (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_longmode_x2apic)
+ goto out4;
+ vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_vmread_bitmap)
+ goto out5;
+
+ vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_vmwrite_bitmap)
+ goto out6;
+
+ memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
+ memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
+
+ /*
+ * Allow direct access to the PC debug port (it is often used for I/O
+ * delays, but the vmexits simply slow things down).
+ */
+ memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
+ clear_bit(0x80, vmx_io_bitmap_a);
+
+ memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
+
+ memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
+ memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
+
+ vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
+ vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
+ vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
+ vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
+ vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
+ vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
+ vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
+
+ memcpy(vmx_msr_bitmap_legacy_x2apic,
+ vmx_msr_bitmap_legacy, PAGE_SIZE);
+ memcpy(vmx_msr_bitmap_longmode_x2apic,
+ vmx_msr_bitmap_longmode, PAGE_SIZE);
+
+ if (enable_apicv) {
+ for (msr = 0x800; msr <= 0x8ff; msr++)
+ vmx_disable_intercept_msr_read_x2apic(msr);
+
+ /* According SDM, in x2apic mode, the whole id reg is used.
+ * But in KVM, it only use the highest eight bits. Need to
+ * intercept it */
+ vmx_enable_intercept_msr_read_x2apic(0x802);
+ /* TMCCT */
+ vmx_enable_intercept_msr_read_x2apic(0x839);
+ /* TPR */
+ vmx_disable_intercept_msr_write_x2apic(0x808);
+ /* EOI */
+ vmx_disable_intercept_msr_write_x2apic(0x80b);
+ /* SELF-IPI */
+ vmx_disable_intercept_msr_write_x2apic(0x83f);
+ }
+
+ if (enable_ept) {
+ kvm_mmu_set_mask_ptes(0ull,
+ (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
+ (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
+ 0ull, VMX_EPT_EXECUTABLE_MASK);
+ ept_set_mmio_spte_mask();
+ kvm_enable_tdp();
+ } else
+ kvm_disable_tdp();
+
+ update_ple_window_actual_max();
+
+ if (setup_vmcs_config(&vmcs_config) < 0) {
+ r = -EIO;
+ goto out7;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_NX))
+ kvm_enable_efer_bits(EFER_NX);
+
+ if (!cpu_has_vmx_vpid())
+ enable_vpid = 0;
+ if (!cpu_has_vmx_shadow_vmcs())
+ enable_shadow_vmcs = 0;
+ if (enable_shadow_vmcs)
+ init_vmcs_shadow_fields();
+
+ if (!cpu_has_vmx_ept() ||
+ !cpu_has_vmx_ept_4levels()) {
+ enable_ept = 0;
+ enable_unrestricted_guest = 0;
+ enable_ept_ad_bits = 0;
+ }
+
+ if (!cpu_has_vmx_ept_ad_bits())
+ enable_ept_ad_bits = 0;
+
+ if (!cpu_has_vmx_unrestricted_guest())
+ enable_unrestricted_guest = 0;
+
+ if (!cpu_has_vmx_flexpriority()) {
+ flexpriority_enabled = 0;
+
+ /*
+ * set_apic_access_page_addr() is used to reload apic access
+ * page upon invalidation. No need to do anything if the
+ * processor does not have the APIC_ACCESS_ADDR VMCS field.
+ */
+ kvm_x86_ops->set_apic_access_page_addr = NULL;
+ }
+
+ if (!cpu_has_vmx_tpr_shadow())
+ kvm_x86_ops->update_cr8_intercept = NULL;
+
+ if (enable_ept && !cpu_has_vmx_ept_2m_page())
+ kvm_disable_largepages();
+
+ if (!cpu_has_vmx_ple())
+ ple_gap = 0;
+
+ if (!cpu_has_vmx_apicv())
+ enable_apicv = 0;
+
+ if (enable_apicv)
+ kvm_x86_ops->update_cr8_intercept = NULL;
+ else {
+ kvm_x86_ops->hwapic_irr_update = NULL;
+ kvm_x86_ops->deliver_posted_interrupt = NULL;
+ kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
+ }
+
+ if (nested)
+ nested_vmx_setup_ctls_msrs();
+
+ return alloc_kvm_area();
+
+out7:
+ free_page((unsigned long)vmx_vmwrite_bitmap);
+out6:
+ free_page((unsigned long)vmx_vmread_bitmap);
+out5:
+ free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
+out4:
+ free_page((unsigned long)vmx_msr_bitmap_longmode);
+out3:
+ free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
+out2:
+ free_page((unsigned long)vmx_msr_bitmap_legacy);
+out1:
+ free_page((unsigned long)vmx_io_bitmap_b);
+out:
+ free_page((unsigned long)vmx_io_bitmap_a);
+
+ return r;
+}
+
+static __exit void hardware_unsetup(void)
+{
+ free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
+ free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
+ free_page((unsigned long)vmx_msr_bitmap_legacy);
+ free_page((unsigned long)vmx_msr_bitmap_longmode);
+ free_page((unsigned long)vmx_io_bitmap_b);
+ free_page((unsigned long)vmx_io_bitmap_a);
+ free_page((unsigned long)vmx_vmwrite_bitmap);
+ free_page((unsigned long)vmx_vmread_bitmap);
+
+ free_kvm_area();
+}
+
/*
* Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
* exiting, so only get here on cpu with PAUSE-Loop-Exiting.
@@ -6361,58 +6557,60 @@ static inline int vmcs_field_readonly(unsigned long field)
* some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
* 64-bit fields are to be returned).
*/
-static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
- unsigned long field, u64 *ret)
+static inline int vmcs12_read_any(struct kvm_vcpu *vcpu,
+ unsigned long field, u64 *ret)
{
short offset = vmcs_field_to_offset(field);
char *p;
if (offset < 0)
- return 0;
+ return offset;
p = ((char *)(get_vmcs12(vcpu))) + offset;
switch (vmcs_field_type(field)) {
case VMCS_FIELD_TYPE_NATURAL_WIDTH:
*ret = *((natural_width *)p);
- return 1;
+ return 0;
case VMCS_FIELD_TYPE_U16:
*ret = *((u16 *)p);
- return 1;
+ return 0;
case VMCS_FIELD_TYPE_U32:
*ret = *((u32 *)p);
- return 1;
+ return 0;
case VMCS_FIELD_TYPE_U64:
*ret = *((u64 *)p);
- return 1;
+ return 0;
default:
- return 0; /* can never happen. */
+ WARN_ON(1);
+ return -ENOENT;
}
}
-static inline bool vmcs12_write_any(struct kvm_vcpu *vcpu,
- unsigned long field, u64 field_value){
+static inline int vmcs12_write_any(struct kvm_vcpu *vcpu,
+ unsigned long field, u64 field_value){
short offset = vmcs_field_to_offset(field);
char *p = ((char *) get_vmcs12(vcpu)) + offset;
if (offset < 0)
- return false;
+ return offset;
switch (vmcs_field_type(field)) {
case VMCS_FIELD_TYPE_U16:
*(u16 *)p = field_value;
- return true;
+ return 0;
case VMCS_FIELD_TYPE_U32:
*(u32 *)p = field_value;
- return true;
+ return 0;
case VMCS_FIELD_TYPE_U64:
*(u64 *)p = field_value;
- return true;
+ return 0;
case VMCS_FIELD_TYPE_NATURAL_WIDTH:
*(natural_width *)p = field_value;
- return true;
+ return 0;
default:
- return false; /* can never happen. */
+ WARN_ON(1);
+ return -ENOENT;
}
}
@@ -6445,6 +6643,9 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
case VMCS_FIELD_TYPE_NATURAL_WIDTH:
field_value = vmcs_readl(field);
break;
+ default:
+ WARN_ON(1);
+ continue;
}
vmcs12_write_any(&vmx->vcpu, field, field_value);
}
@@ -6490,6 +6691,9 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
case VMCS_FIELD_TYPE_NATURAL_WIDTH:
vmcs_writel(field, (long)field_value);
break;
+ default:
+ WARN_ON(1);
+ break;
}
}
}
@@ -6528,7 +6732,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
/* Decode instruction info and find the field to read */
field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
/* Read the field, zero-extended to a u64 field_value */
- if (!vmcs12_read_any(vcpu, field, &field_value)) {
+ if (vmcs12_read_any(vcpu, field, &field_value) < 0) {
nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
skip_emulated_instruction(vcpu);
return 1;
@@ -6598,7 +6802,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
return 1;
}
- if (!vmcs12_write_any(vcpu, field, field_value)) {
+ if (vmcs12_write_any(vcpu, field, field_value) < 0) {
nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
skip_emulated_instruction(vcpu);
return 1;
@@ -6802,6 +7006,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor,
[EXIT_REASON_INVEPT] = handle_invept,
[EXIT_REASON_INVVPID] = handle_invvpid,
+ [EXIT_REASON_XSAVES] = handle_xsaves,
+ [EXIT_REASON_XRSTORS] = handle_xrstors,
};
static const int kvm_vmx_max_exit_handlers =
@@ -7089,6 +7295,14 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
case EXIT_REASON_XSETBV:
return 1;
+ case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
+ /*
+ * This should never happen, since it is not possible to
+ * set XSS to a non-zero value---neither in L1 nor in L2.
+ * If if it were, XSS would have to be checked against
+ * the XSS exit bitmap in vmcs12.
+ */
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
default:
return 1;
}
@@ -7277,6 +7491,9 @@ static void vmx_set_rvi(int vector)
u16 status;
u8 old;
+ if (vector == -1)
+ vector = 0;
+
status = vmcs_read16(GUEST_INTR_STATUS);
old = (u8)status & 0xff;
if ((u8)vector != old) {
@@ -7288,22 +7505,23 @@ static void vmx_set_rvi(int vector)
static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
{
+ if (!is_guest_mode(vcpu)) {
+ vmx_set_rvi(max_irr);
+ return;
+ }
+
if (max_irr == -1)
return;
/*
- * If a vmexit is needed, vmx_check_nested_events handles it.
+ * In guest mode. If a vmexit is needed, vmx_check_nested_events
+ * handles it.
*/
- if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+ if (nested_exit_on_intr(vcpu))
return;
- if (!is_guest_mode(vcpu)) {
- vmx_set_rvi(max_irr);
- return;
- }
-
/*
- * Fall back to pre-APICv interrupt injection since L2
+ * Else, fall back to pre-APICv interrupt injection since L2
* is run without virtual interrupt delivery.
*/
if (!kvm_event_needs_reinjection(vcpu) &&
@@ -7400,6 +7618,12 @@ static bool vmx_mpx_supported(void)
(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
}
+static bool vmx_xsaves_supported(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_XSAVES;
+}
+
static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
{
u32 exit_intr_info;
@@ -8135,6 +8359,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
+ if (nested_cpu_has_xsaves(vmcs12))
+ vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
vmcs_write64(VMCS_LINK_POINTER, -1ull);
exec_control = vmcs12->pin_based_vm_exec_control;
@@ -8775,6 +9001,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
if (vmx_mpx_supported())
vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+ if (nested_cpu_has_xsaves(vmcs12))
+ vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
/* update exit information fields: */
@@ -9176,6 +9404,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.check_intercept = vmx_check_intercept,
.handle_external_intr = vmx_handle_external_intr,
.mpx_supported = vmx_mpx_supported,
+ .xsaves_supported = vmx_xsaves_supported,
.check_nested_events = vmx_check_nested_events,
@@ -9184,150 +9413,21 @@ static struct kvm_x86_ops vmx_x86_ops = {
static int __init vmx_init(void)
{
- int r, i, msr;
-
- rdmsrl_safe(MSR_EFER, &host_efer);
-
- for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
- kvm_define_shared_msr(i, vmx_msr_index[i]);
-
- vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_io_bitmap_a)
- return -ENOMEM;
-
- r = -ENOMEM;
-
- vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_io_bitmap_b)
- goto out;
-
- vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_legacy)
- goto out1;
-
- vmx_msr_bitmap_legacy_x2apic =
- (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_legacy_x2apic)
- goto out2;
-
- vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_longmode)
- goto out3;
-
- vmx_msr_bitmap_longmode_x2apic =
- (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_longmode_x2apic)
- goto out4;
- vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_vmread_bitmap)
- goto out5;
-
- vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_vmwrite_bitmap)
- goto out6;
-
- memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
- memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
-
- /*
- * Allow direct access to the PC debug port (it is often used for I/O
- * delays, but the vmexits simply slow things down).
- */
- memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
- clear_bit(0x80, vmx_io_bitmap_a);
-
- memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
-
- memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
- memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
-
- set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
-
- r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
- __alignof__(struct vcpu_vmx), THIS_MODULE);
+ int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+ __alignof__(struct vcpu_vmx), THIS_MODULE);
if (r)
- goto out7;
+ return r;
#ifdef CONFIG_KEXEC
rcu_assign_pointer(crash_vmclear_loaded_vmcss,
crash_vmclear_local_loaded_vmcss);
#endif
- vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
- vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
- vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
- vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
-
- memcpy(vmx_msr_bitmap_legacy_x2apic,
- vmx_msr_bitmap_legacy, PAGE_SIZE);
- memcpy(vmx_msr_bitmap_longmode_x2apic,
- vmx_msr_bitmap_longmode, PAGE_SIZE);
-
- if (enable_apicv) {
- for (msr = 0x800; msr <= 0x8ff; msr++)
- vmx_disable_intercept_msr_read_x2apic(msr);
-
- /* According SDM, in x2apic mode, the whole id reg is used.
- * But in KVM, it only use the highest eight bits. Need to
- * intercept it */
- vmx_enable_intercept_msr_read_x2apic(0x802);
- /* TMCCT */
- vmx_enable_intercept_msr_read_x2apic(0x839);
- /* TPR */
- vmx_disable_intercept_msr_write_x2apic(0x808);
- /* EOI */
- vmx_disable_intercept_msr_write_x2apic(0x80b);
- /* SELF-IPI */
- vmx_disable_intercept_msr_write_x2apic(0x83f);
- }
-
- if (enable_ept) {
- kvm_mmu_set_mask_ptes(0ull,
- (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
- (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
- 0ull, VMX_EPT_EXECUTABLE_MASK);
- ept_set_mmio_spte_mask();
- kvm_enable_tdp();
- } else
- kvm_disable_tdp();
-
- update_ple_window_actual_max();
-
return 0;
-
-out7:
- free_page((unsigned long)vmx_vmwrite_bitmap);
-out6:
- free_page((unsigned long)vmx_vmread_bitmap);
-out5:
- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
-out4:
- free_page((unsigned long)vmx_msr_bitmap_longmode);
-out3:
- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
-out2:
- free_page((unsigned long)vmx_msr_bitmap_legacy);
-out1:
- free_page((unsigned long)vmx_io_bitmap_b);
-out:
- free_page((unsigned long)vmx_io_bitmap_a);
- return r;
}
static void __exit vmx_exit(void)
{
- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
- free_page((unsigned long)vmx_msr_bitmap_legacy);
- free_page((unsigned long)vmx_msr_bitmap_longmode);
- free_page((unsigned long)vmx_io_bitmap_b);
- free_page((unsigned long)vmx_io_bitmap_a);
- free_page((unsigned long)vmx_vmwrite_bitmap);
- free_page((unsigned long)vmx_vmread_bitmap);
-
#ifdef CONFIG_KEXEC
RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
synchronize_rcu();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0033df32a745..c259814200bd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -27,6 +27,7 @@
#include "kvm_cache_regs.h"
#include "x86.h"
#include "cpuid.h"
+#include "assigned-dev.h"
#include <linux/clocksource.h>
#include <linux/interrupt.h>
@@ -353,6 +354,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
if (!vcpu->arch.exception.pending) {
queue:
+ if (has_error && !is_protmode(vcpu))
+ has_error = false;
vcpu->arch.exception.pending = true;
vcpu->arch.exception.has_error_code = has_error;
vcpu->arch.exception.nr = nr;
@@ -455,6 +458,16 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
}
EXPORT_SYMBOL_GPL(kvm_require_cpl);
+bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
+{
+ if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return true;
+
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return false;
+}
+EXPORT_SYMBOL_GPL(kvm_require_dr);
+
/*
* This function will be used to read from the physical memory of the currently
* running guest. The difference to kvm_read_guest_page is that this function
@@ -656,6 +669,12 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR)))
return 1;
+ if (xcr0 & XSTATE_AVX512) {
+ if (!(xcr0 & XSTATE_YMM))
+ return 1;
+ if ((xcr0 & XSTATE_AVX512) != XSTATE_AVX512)
+ return 1;
+ }
kvm_put_guest_xcr0(vcpu);
vcpu->arch.xcr0 = xcr0;
@@ -732,6 +751,10 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
+#ifdef CONFIG_X86_64
+ cr3 &= ~CR3_PCID_INVD;
+#endif
+
if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
kvm_mmu_sync_roots(vcpu);
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
@@ -811,8 +834,6 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
vcpu->arch.eff_db[dr] = val;
break;
case 4:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return 1; /* #UD */
/* fall through */
case 6:
if (val & 0xffffffff00000000ULL)
@@ -821,8 +842,6 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
kvm_update_dr6(vcpu);
break;
case 5:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return 1; /* #UD */
/* fall through */
default: /* 7 */
if (val & 0xffffffff00000000ULL)
@@ -837,27 +856,21 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
- int res;
-
- res = __kvm_set_dr(vcpu, dr, val);
- if (res > 0)
- kvm_queue_exception(vcpu, UD_VECTOR);
- else if (res < 0)
+ if (__kvm_set_dr(vcpu, dr, val)) {
kvm_inject_gp(vcpu, 0);
-
- return res;
+ return 1;
+ }
+ return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_dr);
-static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
switch (dr) {
case 0 ... 3:
*val = vcpu->arch.db[dr];
break;
case 4:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return 1;
/* fall through */
case 6:
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
@@ -866,23 +879,11 @@ static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
*val = kvm_x86_ops->get_dr6(vcpu);
break;
case 5:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return 1;
/* fall through */
default: /* 7 */
*val = vcpu->arch.dr7;
break;
}
-
- return 0;
-}
-
-int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
-{
- if (_kvm_get_dr(vcpu, dr, val)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
- }
return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_dr);
@@ -1237,21 +1238,22 @@ void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
bool vcpus_matched;
- bool do_request = false;
struct kvm_arch *ka = &vcpu->kvm->arch;
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
atomic_read(&vcpu->kvm->online_vcpus));
- if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC)
- if (!ka->use_master_clock)
- do_request = 1;
-
- if (!vcpus_matched && ka->use_master_clock)
- do_request = 1;
-
- if (do_request)
+ /*
+ * Once the masterclock is enabled, always perform request in
+ * order to update it.
+ *
+ * In order to enable masterclock, the host clocksource must be TSC
+ * and the vcpus need to have matched TSCs. When that happens,
+ * perform request to enable masterclock.
+ */
+ if (ka->use_master_clock ||
+ (gtod->clock.vclock_mode == VCLOCK_TSC && vcpus_matched))
kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
@@ -1637,16 +1639,16 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_guest_tsc = tsc_timestamp;
+ if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+ &guest_hv_clock, sizeof(guest_hv_clock))))
+ return 0;
+
/*
* The interface expects us to write an even number signaling that the
* update is finished. Since the guest won't see the intermediate
* state, we just increase by 2 at the end.
*/
- vcpu->hv_clock.version += 2;
-
- if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
- &guest_hv_clock, sizeof(guest_hv_clock))))
- return 0;
+ vcpu->hv_clock.version = guest_hv_clock.version + 2;
/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
@@ -1662,6 +1664,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hv_clock.flags = pvclock_flags;
+ trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+
kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
&vcpu->hv_clock,
sizeof(vcpu->hv_clock));
@@ -2140,7 +2144,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_TSC_ADJUST:
if (guest_cpuid_has_tsc_adjust(vcpu)) {
if (!msr_info->host_initiated) {
- u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
+ s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true);
}
vcpu->arch.ia32_tsc_adjust_msr = data;
@@ -3106,7 +3110,7 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
unsigned long val;
memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
- _kvm_get_dr(vcpu, 6, &val);
+ kvm_get_dr(vcpu, 6, &val);
dbgregs->dr6 = val;
dbgregs->dr7 = vcpu->arch.dr7;
dbgregs->flags = 0;
@@ -3128,15 +3132,89 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
return 0;
}
+#define XSTATE_COMPACTION_ENABLED (1ULL << 63)
+
+static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
+{
+ struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave;
+ u64 xstate_bv = xsave->xsave_hdr.xstate_bv;
+ u64 valid;
+
+ /*
+ * Copy legacy XSAVE area, to avoid complications with CPUID
+ * leaves 0 and 1 in the loop below.
+ */
+ memcpy(dest, xsave, XSAVE_HDR_OFFSET);
+
+ /* Set XSTATE_BV */
+ *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv;
+
+ /*
+ * Copy each region from the possibly compacted offset to the
+ * non-compacted offset.
+ */
+ valid = xstate_bv & ~XSTATE_FPSSE;
+ while (valid) {
+ u64 feature = valid & -valid;
+ int index = fls64(feature) - 1;
+ void *src = get_xsave_addr(xsave, feature);
+
+ if (src) {
+ u32 size, offset, ecx, edx;
+ cpuid_count(XSTATE_CPUID, index,
+ &size, &offset, &ecx, &edx);
+ memcpy(dest + offset, src, size);
+ }
+
+ valid -= feature;
+ }
+}
+
+static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
+{
+ struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave;
+ u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
+ u64 valid;
+
+ /*
+ * Copy legacy XSAVE area, to avoid complications with CPUID
+ * leaves 0 and 1 in the loop below.
+ */
+ memcpy(xsave, src, XSAVE_HDR_OFFSET);
+
+ /* Set XSTATE_BV and possibly XCOMP_BV. */
+ xsave->xsave_hdr.xstate_bv = xstate_bv;
+ if (cpu_has_xsaves)
+ xsave->xsave_hdr.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
+
+ /*
+ * Copy each region from the non-compacted offset to the
+ * possibly compacted offset.
+ */
+ valid = xstate_bv & ~XSTATE_FPSSE;
+ while (valid) {
+ u64 feature = valid & -valid;
+ int index = fls64(feature) - 1;
+ void *dest = get_xsave_addr(xsave, feature);
+
+ if (dest) {
+ u32 size, offset, ecx, edx;
+ cpuid_count(XSTATE_CPUID, index,
+ &size, &offset, &ecx, &edx);
+ memcpy(dest, src + offset, size);
+ } else
+ WARN_ON_ONCE(1);
+
+ valid -= feature;
+ }
+}
+
static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
if (cpu_has_xsave) {
- memcpy(guest_xsave->region,
- &vcpu->arch.guest_fpu.state->xsave,
- vcpu->arch.guest_xstate_size);
- *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] &=
- vcpu->arch.guest_supported_xcr0 | XSTATE_FPSSE;
+ memset(guest_xsave, 0, sizeof(struct kvm_xsave));
+ fill_xsave((u8 *) guest_xsave->region, vcpu);
} else {
memcpy(guest_xsave->region,
&vcpu->arch.guest_fpu.state->fxsave,
@@ -3160,8 +3238,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
*/
if (xstate_bv & ~kvm_supported_xcr0())
return -EINVAL;
- memcpy(&vcpu->arch.guest_fpu.state->xsave,
- guest_xsave->region, vcpu->arch.guest_xstate_size);
+ load_xsave(vcpu, (u8 *)guest_xsave->region);
} else {
if (xstate_bv & ~XSTATE_FPSSE)
return -EINVAL;
@@ -4004,7 +4081,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
default:
- ;
+ r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
}
out:
return r;
@@ -4667,7 +4744,7 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
{
- return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
+ return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
}
int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
@@ -5211,21 +5288,17 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag
static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
{
- struct kvm_run *kvm_run = vcpu->run;
- unsigned long eip = vcpu->arch.emulate_ctxt.eip;
- u32 dr6 = 0;
-
if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
(vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
- dr6 = kvm_vcpu_check_hw_bp(eip, 0,
+ struct kvm_run *kvm_run = vcpu->run;
+ unsigned long eip = kvm_get_linear_rip(vcpu);
+ u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
vcpu->arch.guest_debug_dr7,
vcpu->arch.eff_db);
if (dr6 != 0) {
kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
- kvm_run->debug.arch.pc = kvm_rip_read(vcpu) +
- get_segment_base(vcpu, VCPU_SREG_CS);
-
+ kvm_run->debug.arch.pc = eip;
kvm_run->debug.arch.exception = DB_VECTOR;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
*r = EMULATE_USER_EXIT;
@@ -5235,7 +5308,8 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
!(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) {
- dr6 = kvm_vcpu_check_hw_bp(eip, 0,
+ unsigned long eip = kvm_get_linear_rip(vcpu);
+ u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
vcpu->arch.dr7,
vcpu->arch.db);
@@ -5365,7 +5439,9 @@ restart:
kvm_rip_write(vcpu, ctxt->eip);
if (r == EMULATE_DONE)
kvm_vcpu_check_singlestep(vcpu, rflags, &r);
- __kvm_set_rflags(vcpu, ctxt->eflags);
+ if (!ctxt->have_exception ||
+ exception_type(ctxt->exception.vector) == EXCPT_TRAP)
+ __kvm_set_rflags(vcpu, ctxt->eflags);
/*
* For STI, interrupts are shadowed; so KVM_REQ_EVENT will
@@ -5965,6 +6041,12 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
X86_EFLAGS_RF);
+ if (vcpu->arch.exception.nr == DB_VECTOR &&
+ (vcpu->arch.dr7 & DR7_GD)) {
+ vcpu->arch.dr7 &= ~DR7_GD;
+ kvm_update_dr7(vcpu);
+ }
+
kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code,
@@ -6873,6 +6955,9 @@ int fx_init(struct kvm_vcpu *vcpu)
return err;
fpu_finit(&vcpu->arch.guest_fpu);
+ if (cpu_has_xsaves)
+ vcpu->arch.guest_fpu.state->xsave.xsave_hdr.xcomp_bv =
+ host_xcr0 | XSTATE_COMPACTION_ENABLED;
/*
* Ensure guest xcr0 is valid for loading
@@ -7024,7 +7109,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
kvm_x86_ops->vcpu_reset(vcpu);
}
-void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector)
+void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
{
struct kvm_segment cs;
@@ -7256,6 +7341,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (type)
return -EINVAL;
+ INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
@@ -7536,12 +7622,18 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
return kvm_x86_ops->interrupt_allowed(vcpu);
}
-bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
+unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
{
- unsigned long current_rip = kvm_rip_read(vcpu) +
- get_segment_base(vcpu, VCPU_SREG_CS);
+ if (is_64_bit_mode(vcpu))
+ return kvm_rip_read(vcpu);
+ return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
+ kvm_rip_read(vcpu));
+}
+EXPORT_SYMBOL_GPL(kvm_get_linear_rip);
- return current_rip == linear_rip;
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
+{
+ return kvm_get_linear_rip(vcpu) == linear_rip;
}
EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 7cb9c45a5fe0..cc1d61af6140 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -162,7 +162,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);
#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \
- | XSTATE_BNDREGS | XSTATE_BNDCSR)
+ | XSTATE_BNDREGS | XSTATE_BNDCSR \
+ | XSTATE_AVX512)
extern u64 host_xcr0;
extern u64 kvm_supported_xcr0(void);
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index db92793b7e23..1530afb07c85 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -23,7 +23,7 @@ lib-y += memcpy_$(BITS).o
lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
-obj-y += msr.o msr-reg.o msr-reg-export.o hash.o
+obj-y += msr.o msr-reg.o msr-reg-export.o
ifeq ($(CONFIG_X86_32),y)
obj-y += atomic64_32.o
diff --git a/arch/x86/lib/hash.c b/arch/x86/lib/hash.c
deleted file mode 100644
index ff4fa51a5b1f..000000000000
--- a/arch/x86/lib/hash.c
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Some portions derived from code covered by the following notice:
- *
- * Copyright (c) 2010-2013 Intel Corporation. All rights reserved.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/hash.h>
-#include <linux/init.h>
-
-#include <asm/processor.h>
-#include <asm/cpufeature.h>
-#include <asm/hash.h>
-
-static inline u32 crc32_u32(u32 crc, u32 val)
-{
-#ifdef CONFIG_AS_CRC32
- asm ("crc32l %1,%0\n" : "+r" (crc) : "rm" (val));
-#else
- asm (".byte 0xf2, 0x0f, 0x38, 0xf1, 0xc1" : "+a" (crc) : "c" (val));
-#endif
- return crc;
-}
-
-static u32 intel_crc4_2_hash(const void *data, u32 len, u32 seed)
-{
- const u32 *p32 = (const u32 *) data;
- u32 i, tmp = 0;
-
- for (i = 0; i < len / 4; i++)
- seed = crc32_u32(seed, *p32++);
-
- switch (len & 3) {
- case 3:
- tmp |= *((const u8 *) p32 + 2) << 16;
- /* fallthrough */
- case 2:
- tmp |= *((const u8 *) p32 + 1) << 8;
- /* fallthrough */
- case 1:
- tmp |= *((const u8 *) p32);
- seed = crc32_u32(seed, tmp);
- break;
- }
-
- return seed;
-}
-
-static u32 intel_crc4_2_hash2(const u32 *data, u32 len, u32 seed)
-{
- const u32 *p32 = (const u32 *) data;
- u32 i;
-
- for (i = 0; i < len; i++)
- seed = crc32_u32(seed, *p32++);
-
- return seed;
-}
-
-void __init setup_arch_fast_hash(struct fast_hash_ops *ops)
-{
- if (cpu_has_xmm4_2) {
- ops->hash = intel_crc4_2_hash;
- ops->hash2 = intel_crc4_2_hash2;
- }
-}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index d973e61e450d..38dcec403b46 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -844,11 +844,8 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
unsigned int fault)
{
struct task_struct *tsk = current;
- struct mm_struct *mm = tsk->mm;
int code = BUS_ADRERR;
- up_read(&mm->mmap_sem);
-
/* Kernel mode? Handle exceptions or die: */
if (!(error_code & PF_USER)) {
no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
@@ -879,7 +876,6 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
unsigned long address, unsigned int fault)
{
if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
- up_read(&current->mm->mmap_sem);
no_context(regs, error_code, address, 0, 0);
return;
}
@@ -887,14 +883,11 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
if (fault & VM_FAULT_OOM) {
/* Kernel mode? Handle exceptions or die: */
if (!(error_code & PF_USER)) {
- up_read(&current->mm->mmap_sem);
no_context(regs, error_code, address,
SIGSEGV, SEGV_MAPERR);
return;
}
- up_read(&current->mm->mmap_sem);
-
/*
* We ran out of memory, call the OOM killer, and return the
* userspace (which will retry the fault, or kill us if we got
@@ -1062,7 +1055,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
- int fault;
+ int fault, major = 0;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
tsk = current;
@@ -1237,47 +1230,50 @@ good_area:
* we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
*/
fault = handle_mm_fault(mm, vma, address, flags);
+ major |= fault & VM_FAULT_MAJOR;
/*
- * If we need to retry but a fatal signal is pending, handle the
- * signal first. We do not need to release the mmap_sem because it
- * would already be released in __lock_page_or_retry in mm/filemap.c.
+ * If we need to retry the mmap_sem has already been released,
+ * and if there is a fatal signal pending there is no guarantee
+ * that we made any progress. Handle this case first.
*/
- if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)))
+ if (unlikely(fault & VM_FAULT_RETRY)) {
+ /* Retry at most once */
+ if (flags & FAULT_FLAG_ALLOW_RETRY) {
+ flags &= ~FAULT_FLAG_ALLOW_RETRY;
+ flags |= FAULT_FLAG_TRIED;
+ if (!fatal_signal_pending(tsk))
+ goto retry;
+ }
+
+ /* User mode? Just return to handle the fatal exception */
+ if (flags & FAULT_FLAG_USER)
+ return;
+
+ /* Not returning to user mode? Handle exceptions or die: */
+ no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
return;
+ }
+ up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, fault);
return;
}
/*
- * Major/minor page fault accounting is only done on the
- * initial attempt. If we go through a retry, it is extremely
- * likely that the page will be found in page cache at that point.
+ * Major/minor page fault accounting. If any of the events
+ * returned VM_FAULT_MAJOR, we account it as a major fault.
*/
- if (flags & FAULT_FLAG_ALLOW_RETRY) {
- if (fault & VM_FAULT_MAJOR) {
- tsk->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
- regs, address);
- } else {
- tsk->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
- regs, address);
- }
- if (fault & VM_FAULT_RETRY) {
- /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
- * of starvation. */
- flags &= ~FAULT_FLAG_ALLOW_RETRY;
- flags |= FAULT_FLAG_TRIED;
- goto retry;
- }
+ if (major) {
+ tsk->maj_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
+ } else {
+ tsk->min_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
}
check_v8086_mode(regs, address, tsk);
-
- up_read(&mm->mmap_sem);
}
NOKPROBE_SYMBOL(__do_page_fault);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 82b41d56bb98..a97ee0801475 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -703,10 +703,10 @@ void __init zone_sizes_init(void)
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
#ifdef CONFIG_ZONE_DMA
- max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
+ max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn);
#endif
#ifdef CONFIG_ZONE_DMA32
- max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
+ max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn);
#endif
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a3a5d46605d2..536ea2fb6e33 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -384,6 +384,26 @@ static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
}
/*
+ * Lookup the PMD entry for a virtual address. Return a pointer to the entry
+ * or NULL if not present.
+ */
+pmd_t *lookup_pmd_address(unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+
+ pgd = pgd_offset_k(address);
+ if (pgd_none(*pgd))
+ return NULL;
+
+ pud = pud_offset(pgd, address);
+ if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
+ return NULL;
+
+ return pmd_offset(pud, address);
+}
+
+/*
* This is necessary because __pa() does not work on some
* kinds of memory, like vmalloc() or the alloc_remap()
* areas on 32-bit NUMA systems. The percpu areas can
@@ -1817,7 +1837,7 @@ static int __set_pages_np(struct page *page, int numpages)
return __change_page_attr_set_clr(&cpa, 0);
}
-void kernel_map_pages(struct page *page, int numpages, int enable)
+void __kernel_map_pages(struct page *page, int numpages, int enable)
{
if (PageHighMem(page))
return;
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 3f627345d51c..987514396c1e 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -24,7 +24,7 @@ extern u8 sk_load_byte_positive_offset[];
extern u8 sk_load_word_negative_offset[], sk_load_half_negative_offset[];
extern u8 sk_load_byte_negative_offset[];
-static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
+static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
{
if (len == 1)
*ptr = bytes;
@@ -52,12 +52,12 @@ static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
#define EMIT4_off32(b1, b2, b3, b4, off) \
do {EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
-static inline bool is_imm8(int value)
+static bool is_imm8(int value)
{
return value <= 127 && value >= -128;
}
-static inline bool is_simm32(s64 value)
+static bool is_simm32(s64 value)
{
return value == (s64) (s32) value;
}
@@ -94,7 +94,7 @@ static int bpf_size_to_x86_bytes(int bpf_size)
#define X86_JGE 0x7D
#define X86_JG 0x7F
-static inline void bpf_flush_icache(void *start, void *end)
+static void bpf_flush_icache(void *start, void *end)
{
mm_segment_t old_fs = get_fs();
@@ -133,24 +133,24 @@ static const int reg2hex[] = {
* which need extra byte of encoding.
* rax,rcx,...,rbp have simpler encoding
*/
-static inline bool is_ereg(u32 reg)
+static bool is_ereg(u32 reg)
{
- if (reg == BPF_REG_5 || reg == AUX_REG ||
- (reg >= BPF_REG_7 && reg <= BPF_REG_9))
- return true;
- else
- return false;
+ return (1 << reg) & (BIT(BPF_REG_5) |
+ BIT(AUX_REG) |
+ BIT(BPF_REG_7) |
+ BIT(BPF_REG_8) |
+ BIT(BPF_REG_9));
}
/* add modifiers if 'reg' maps to x64 registers r8..r15 */
-static inline u8 add_1mod(u8 byte, u32 reg)
+static u8 add_1mod(u8 byte, u32 reg)
{
if (is_ereg(reg))
byte |= 1;
return byte;
}
-static inline u8 add_2mod(u8 byte, u32 r1, u32 r2)
+static u8 add_2mod(u8 byte, u32 r1, u32 r2)
{
if (is_ereg(r1))
byte |= 1;
@@ -160,13 +160,13 @@ static inline u8 add_2mod(u8 byte, u32 r1, u32 r2)
}
/* encode 'dst_reg' register into x64 opcode 'byte' */
-static inline u8 add_1reg(u8 byte, u32 dst_reg)
+static u8 add_1reg(u8 byte, u32 dst_reg)
{
return byte + reg2hex[dst_reg];
}
/* encode 'dst_reg' and 'src_reg' registers into x64 opcode 'byte' */
-static inline u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
+static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
{
return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3);
}
@@ -178,7 +178,7 @@ static void jit_fill_hole(void *area, unsigned int size)
}
struct jit_context {
- unsigned int cleanup_addr; /* epilogue code offset */
+ int cleanup_addr; /* epilogue code offset */
bool seen_ld_abs;
};
@@ -192,6 +192,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
struct bpf_insn *insn = bpf_prog->insnsi;
int insn_cnt = bpf_prog->len;
bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0);
+ bool seen_exit = false;
u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
int i;
int proglen = 0;
@@ -854,10 +855,11 @@ common_load:
goto common_load;
case BPF_JMP | BPF_EXIT:
- if (i != insn_cnt - 1) {
+ if (seen_exit) {
jmp_offset = ctx->cleanup_addr - addrs[i];
goto emit_jmp;
}
+ seen_exit = true;
/* update cleanup_addr */
ctx->cleanup_addr = proglen;
/* mov rbx, qword ptr [rbp-X] */
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 1819a91bbb9f..c489ef2c1a39 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -23,6 +23,8 @@
#include <xen/features.h>
#include <xen/events.h>
#include <asm/xen/pci.h>
+#include <asm/xen/cpuid.h>
+#include <asm/apic.h>
#include <asm/i8259.h>
static int xen_pcifront_enable_irq(struct pci_dev *dev)
@@ -423,6 +425,28 @@ int __init pci_xen_init(void)
return 0;
}
+#ifdef CONFIG_PCI_MSI
+void __init xen_msi_init(void)
+{
+ if (!disable_apic) {
+ /*
+ * If hardware supports (x2)APIC virtualization (as indicated
+ * by hypervisor's leaf 4) then we don't need to use pirqs/
+ * event channels for MSI handling and instead use regular
+ * APIC processing
+ */
+ uint32_t eax = cpuid_eax(xen_cpuid_base() + 4);
+
+ if (((eax & XEN_HVM_CPUID_X2APIC_VIRT) && x2apic_mode) ||
+ ((eax & XEN_HVM_CPUID_APIC_ACCESS_VIRT) && cpu_has_apic))
+ return;
+ }
+
+ x86_msi.setup_msi_irqs = xen_hvm_setup_msi_irqs;
+ x86_msi.teardown_msi_irq = xen_teardown_msi_irq;
+}
+#endif
+
int __init pci_xen_hvm_init(void)
{
if (!xen_have_vector_callback || !xen_feature(XENFEAT_hvm_pirqs))
@@ -437,8 +461,11 @@ int __init pci_xen_hvm_init(void)
#endif
#ifdef CONFIG_PCI_MSI
- x86_msi.setup_msi_irqs = xen_hvm_setup_msi_irqs;
- x86_msi.teardown_msi_irq = xen_teardown_msi_irq;
+ /*
+ * We need to wait until after x2apic is initialized
+ * before we can set MSI IRQ ops.
+ */
+ x86_platform.apic_post_init = xen_msi_init;
#endif
return 0;
}
diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c
index 4d171e8640ef..735ba21efe91 100644
--- a/arch/x86/platform/iris/iris.c
+++ b/arch/x86/platform/iris/iris.c
@@ -86,7 +86,6 @@ static int iris_remove(struct platform_device *pdev)
static struct platform_driver iris_driver = {
.driver = {
.name = "iris",
- .owner = THIS_MODULE,
},
.probe = iris_probe,
.remove = iris_remove,
diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c
index a9acde72d4ed..c5350fd27d70 100644
--- a/arch/x86/platform/olpc/olpc-xo1-pm.c
+++ b/arch/x86/platform/olpc/olpc-xo1-pm.c
@@ -170,7 +170,6 @@ static int xo1_pm_remove(struct platform_device *pdev)
static struct platform_driver cs5535_pms_driver = {
.driver = {
.name = "cs5535-pms",
- .owner = THIS_MODULE,
},
.probe = xo1_pm_probe,
.remove = xo1_pm_remove,
@@ -179,7 +178,6 @@ static struct platform_driver cs5535_pms_driver = {
static struct platform_driver cs5535_acpi_driver = {
.driver = {
.name = "olpc-xo1-pm-acpi",
- .owner = THIS_MODULE,
},
.probe = xo1_pm_probe,
.remove = xo1_pm_remove,
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 9fe1b5d002f0..b3560ece1c9f 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -364,3 +364,4 @@
355 i386 getrandom sys_getrandom
356 i386 memfd_create sys_memfd_create
357 i386 bpf sys_bpf
+358 i386 execveat sys_execveat stub32_execveat
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 281150b539a2..8d656fbb57aa 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -328,6 +328,7 @@
319 common memfd_create sys_memfd_create
320 common kexec_file_load sys_kexec_file_load
321 common bpf sys_bpf
+322 64 execveat stub_execveat
#
# x32-specific system call numbers start at 512 to avoid cache impact
@@ -366,3 +367,4 @@
542 x32 getsockopt compat_sys_getsockopt
543 x32 io_setup compat_sys_io_setup
544 x32 io_submit compat_sys_io_submit
+545 x32 execveat stub_x32_execveat
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index cc04e67bfd05..2d7d9a1f5b53 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -29,20 +29,18 @@
#endif /* CONFIG_X86_32 */
-#define read_barrier_depends() do { } while (0)
-
-#ifdef CONFIG_SMP
-
-#define smp_mb() mb()
#ifdef CONFIG_X86_PPRO_FENCE
-#define smp_rmb() rmb()
+#define dma_rmb() rmb()
#else /* CONFIG_X86_PPRO_FENCE */
-#define smp_rmb() barrier()
+#define dma_rmb() barrier()
#endif /* CONFIG_X86_PPRO_FENCE */
+#define dma_wmb() barrier()
-#define smp_wmb() barrier()
+#ifdef CONFIG_SMP
-#define smp_read_barrier_depends() read_barrier_depends()
+#define smp_mb() mb()
+#define smp_rmb() dma_rmb()
+#define smp_wmb() barrier()
#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
#else /* CONFIG_SMP */
@@ -50,11 +48,13 @@
#define smp_mb() barrier()
#define smp_rmb() barrier()
#define smp_wmb() barrier()
-#define smp_read_barrier_depends() do { } while (0)
#define set_mb(var, value) do { var = value; barrier(); } while (0)
#endif /* CONFIG_SMP */
+#define read_barrier_depends() do { } while (0)
+#define smp_read_barrier_depends() do { } while (0)
+
/*
* Stop RDTSC speculation. This is needed when you need to use RDTSC
* (or get_cycles or vread that possibly accesses the TSC) in a defined
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index f2f0723070ca..20c3649d0691 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -31,6 +31,7 @@
#define stub_fork sys_fork
#define stub_vfork sys_vfork
#define stub_execve sys_execve
+#define stub_execveat sys_execveat
#define stub_rt_sigreturn sys_rt_sigreturn
#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 8c8298d78185..5c1f9ace7ae7 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -387,7 +387,7 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
unsigned long mfn;
if (!xen_feature(XENFEAT_auto_translated_physmap))
- mfn = get_phys_to_machine(pfn);
+ mfn = __pfn_to_mfn(pfn);
else
mfn = pfn;
/*
@@ -1113,20 +1113,16 @@ static void __init xen_cleanhighmap(unsigned long vaddr,
* instead of somewhere later and be confusing. */
xen_mc_flush();
}
-static void __init xen_pagetable_p2m_copy(void)
+
+static void __init xen_pagetable_p2m_free(void)
{
unsigned long size;
unsigned long addr;
- unsigned long new_mfn_list;
-
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return;
size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
- new_mfn_list = xen_revector_p2m_tree();
/* No memory or already called. */
- if (!new_mfn_list || new_mfn_list == xen_start_info->mfn_list)
+ if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list)
return;
/* using __ka address and sticking INVALID_P2M_ENTRY! */
@@ -1144,8 +1140,6 @@ static void __init xen_pagetable_p2m_copy(void)
size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
memblock_free(__pa(xen_start_info->mfn_list), size);
- /* And revector! Bye bye old array */
- xen_start_info->mfn_list = new_mfn_list;
/* At this stage, cleanup_highmap has already cleaned __ka space
* from _brk_limit way up to the max_pfn_mapped (which is the end of
@@ -1169,17 +1163,35 @@ static void __init xen_pagetable_p2m_copy(void)
}
#endif
-static void __init xen_pagetable_init(void)
+static void __init xen_pagetable_p2m_setup(void)
{
- paging_init();
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ xen_vmalloc_p2m_tree();
+
#ifdef CONFIG_X86_64
- xen_pagetable_p2m_copy();
+ xen_pagetable_p2m_free();
#endif
+ /* And revector! Bye bye old array */
+ xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
+}
+
+static void __init xen_pagetable_init(void)
+{
+ paging_init();
+ xen_post_allocator_init();
+
+ xen_pagetable_p2m_setup();
+
/* Allocate and initialize top and mid mfn levels for p2m structure */
xen_build_mfn_list_list();
+ /* Remap memory freed due to conflicts with E820 map */
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ xen_remap_memory();
+
xen_setup_shared_info();
- xen_post_allocator_init();
}
static void xen_write_cr2(unsigned long cr2)
{
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index b456b048eca9..edbc7a63fd73 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -3,21 +3,22 @@
* guests themselves, but it must also access and update the p2m array
* during suspend/resume when all the pages are reallocated.
*
- * The p2m table is logically a flat array, but we implement it as a
- * three-level tree to allow the address space to be sparse.
+ * The logical flat p2m table is mapped to a linear kernel memory area.
+ * For accesses by Xen a three-level tree linked via mfns only is set up to
+ * allow the address space to be sparse.
*
- * Xen
- * |
- * p2m_top p2m_top_mfn
- * / \ / \
- * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
- * / \ / \ / /
- * p2m p2m p2m p2m p2m p2m p2m ...
+ * Xen
+ * |
+ * p2m_top_mfn
+ * / \
+ * p2m_mid_mfn p2m_mid_mfn
+ * / /
+ * p2m p2m p2m ...
*
* The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
*
- * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
- * maximum representable pseudo-physical address space is:
+ * The p2m_top_mfn level is limited to 1 page, so the maximum representable
+ * pseudo-physical address space is:
* P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
*
* P2M_PER_PAGE depends on the architecture, as a mfn is always
@@ -30,6 +31,9 @@
* leaf entries, or for the top root, or middle one, for which there is a void
* entry, we assume it is "missing". So (for example)
* pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY.
+ * We have a dedicated page p2m_missing with all entries being
+ * INVALID_P2M_ENTRY. This page may be referenced multiple times in the p2m
+ * list/tree in case there are multiple areas with P2M_PER_PAGE invalid pfns.
*
* We also have the possibility of setting 1-1 mappings on certain regions, so
* that:
@@ -39,122 +43,20 @@
* PCI BARs, or ACPI spaces), we can create mappings easily because we
* get the PFN value to match the MFN.
*
- * For this to work efficiently we have one new page p2m_identity and
- * allocate (via reserved_brk) any other pages we need to cover the sides
- * (1GB or 4MB boundary violations). All entries in p2m_identity are set to
- * INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs,
- * no other fancy value).
+ * For this to work efficiently we have one new page p2m_identity. All entries
+ * in p2m_identity are set to INVALID_P2M_ENTRY type (Xen toolstack only
+ * recognizes that and MFNs, no other fancy value).
*
* On lookup we spot that the entry points to p2m_identity and return the
* identity value instead of dereferencing and returning INVALID_P2M_ENTRY.
* If the entry points to an allocated page, we just proceed as before and
- * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in
+ * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in
* appropriate functions (pfn_to_mfn).
*
* The reason for having the IDENTITY_FRAME_BIT instead of just returning the
* PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a
* non-identity pfn. To protect ourselves against we elect to set (and get) the
* IDENTITY_FRAME_BIT on all identity mapped PFNs.
- *
- * This simplistic diagram is used to explain the more subtle piece of code.
- * There is also a digram of the P2M at the end that can help.
- * Imagine your E820 looking as so:
- *
- * 1GB 2GB 4GB
- * /-------------------+---------\/----\ /----------\ /---+-----\
- * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM |
- * \-------------------+---------/\----/ \----------/ \---+-----/
- * ^- 1029MB ^- 2001MB
- *
- * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100),
- * 2048MB = 524288 (0x80000)]
- *
- * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB
- * is actually not present (would have to kick the balloon driver to put it in).
- *
- * When we are told to set the PFNs for identity mapping (see patch: "xen/setup:
- * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start
- * of the PFN and the end PFN (263424 and 512256 respectively). The first step
- * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page
- * covers 512^2 of page estate (1GB) and in case the start or end PFN is not
- * aligned on 512^2*PAGE_SIZE (1GB) we reserve_brk new middle and leaf pages as
- * required to split any existing p2m_mid_missing middle pages.
- *
- * With the E820 example above, 263424 is not 1GB aligned so we allocate a
- * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000.
- * Each entry in the allocate page is "missing" (points to p2m_missing).
- *
- * Next stage is to determine if we need to do a more granular boundary check
- * on the 4MB (or 2MB depending on architecture) off the start and end pfn's.
- * We check if the start pfn and end pfn violate that boundary check, and if
- * so reserve_brk a (p2m[x][y]) leaf page. This way we have a much finer
- * granularity of setting which PFNs are missing and which ones are identity.
- * In our example 263424 and 512256 both fail the check so we reserve_brk two
- * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing"
- * values) and assign them to p2m[1][2] and p2m[1][488] respectively.
- *
- * At this point we would at minimum reserve_brk one page, but could be up to
- * three. Each call to set_phys_range_identity has at maximum a three page
- * cost. If we were to query the P2M at this stage, all those entries from
- * start PFN through end PFN (so 1029MB -> 2001MB) would return
- * INVALID_P2M_ENTRY ("missing").
- *
- * The next step is to walk from the start pfn to the end pfn setting
- * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity.
- * If we find that the middle entry is pointing to p2m_missing we can swap it
- * over to p2m_identity - this way covering 4MB (or 2MB) PFN space (and
- * similarly swapping p2m_mid_missing for p2m_mid_identity for larger regions).
- * At this point we do not need to worry about boundary aligment (so no need to
- * reserve_brk a middle page, figure out which PFNs are "missing" and which
- * ones are identity), as that has been done earlier. If we find that the
- * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference
- * that page (which covers 512 PFNs) and set the appropriate PFN with
- * IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we
- * set from p2m[1][2][256->511] and p2m[1][488][0->256] with
- * IDENTITY_FRAME_BIT set.
- *
- * All other regions that are void (or not filled) either point to p2m_missing
- * (considered missing) or have the default value of INVALID_P2M_ENTRY (also
- * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511]
- * contain the INVALID_P2M_ENTRY value and are considered "missing."
- *
- * Finally, the region beyond the end of of the E820 (4 GB in this example)
- * is set to be identity (in case there are MMIO regions placed here).
- *
- * This is what the p2m ends up looking (for the E820 above) with this
- * fabulous drawing:
- *
- * p2m /--------------\
- * /-----\ | &mfn_list[0],| /-----------------\
- * | 0 |------>| &mfn_list[1],| /---------------\ | ~0, ~0, .. |
- * |-----| | ..., ~0, ~0 | | ~0, ~0, [x]---+----->| IDENTITY [@256] |
- * | 1 |---\ \--------------/ | [p2m_identity]+\ | IDENTITY [@257] |
- * |-----| \ | [p2m_identity]+\\ | .... |
- * | 2 |--\ \-------------------->| ... | \\ \----------------/
- * |-----| \ \---------------/ \\
- * | 3 |-\ \ \\ p2m_identity [1]
- * |-----| \ \-------------------->/---------------\ /-----------------\
- * | .. |\ | | [p2m_identity]+-->| ~0, ~0, ~0, ... |
- * \-----/ | | | [p2m_identity]+-->| ..., ~0 |
- * | | | .... | \-----------------/
- * | | +-[x], ~0, ~0.. +\
- * | | \---------------/ \
- * | | \-> /---------------\
- * | V p2m_mid_missing p2m_missing | IDENTITY[@0] |
- * | /-----------------\ /------------\ | IDENTITY[@256]|
- * | | [p2m_missing] +---->| ~0, ~0, ...| | ~0, ~0, .... |
- * | | [p2m_missing] +---->| ..., ~0 | \---------------/
- * | | ... | \------------/
- * | \-----------------/
- * |
- * | p2m_mid_identity
- * | /-----------------\
- * \-->| [p2m_identity] +---->[1]
- * | [p2m_identity] +---->[1]
- * | ... |
- * \-----------------/
- *
- * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT)
*/
#include <linux/init.h>
@@ -164,9 +66,11 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/bootmem.h>
+#include <linux/slab.h>
#include <asm/cache.h>
#include <asm/setup.h>
+#include <asm/uaccess.h>
#include <asm/xen/page.h>
#include <asm/xen/hypercall.h>
@@ -178,31 +82,26 @@
#include "multicalls.h"
#include "xen-ops.h"
+#define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE)
+
static void __init m2p_override_init(void);
+unsigned long *xen_p2m_addr __read_mostly;
+EXPORT_SYMBOL_GPL(xen_p2m_addr);
+unsigned long xen_p2m_size __read_mostly;
+EXPORT_SYMBOL_GPL(xen_p2m_size);
unsigned long xen_max_p2m_pfn __read_mostly;
+EXPORT_SYMBOL_GPL(xen_max_p2m_pfn);
+
+static DEFINE_SPINLOCK(p2m_update_lock);
static unsigned long *p2m_mid_missing_mfn;
static unsigned long *p2m_top_mfn;
static unsigned long **p2m_top_mfn_p;
-
-/* Placeholders for holes in the address space */
-static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
-
-static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
-
-static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_identity, P2M_MID_PER_PAGE);
-
-RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
-
-/* For each I/O range remapped we may lose up to two leaf pages for the boundary
- * violations and three mid pages to cover up to 3GB. With
- * early_can_reuse_p2m_middle() most of the leaf pages will be reused by the
- * remapped region.
- */
-RESERVE_BRK(p2m_identity_remap, PAGE_SIZE * 2 * 3 * MAX_REMAP_RANGES);
+static unsigned long *p2m_missing;
+static unsigned long *p2m_identity;
+static pte_t *p2m_missing_pte;
+static pte_t *p2m_identity_pte;
static inline unsigned p2m_top_index(unsigned long pfn)
{
@@ -220,14 +119,6 @@ static inline unsigned p2m_index(unsigned long pfn)
return pfn % P2M_PER_PAGE;
}
-static void p2m_top_init(unsigned long ***top)
-{
- unsigned i;
-
- for (i = 0; i < P2M_TOP_PER_PAGE; i++)
- top[i] = p2m_mid_missing;
-}
-
static void p2m_top_mfn_init(unsigned long *top)
{
unsigned i;
@@ -244,28 +135,43 @@ static void p2m_top_mfn_p_init(unsigned long **top)
top[i] = p2m_mid_missing_mfn;
}
-static void p2m_mid_init(unsigned long **mid, unsigned long *leaf)
+static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf)
{
unsigned i;
for (i = 0; i < P2M_MID_PER_PAGE; i++)
- mid[i] = leaf;
+ mid[i] = virt_to_mfn(leaf);
}
-static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf)
+static void p2m_init(unsigned long *p2m)
{
unsigned i;
- for (i = 0; i < P2M_MID_PER_PAGE; i++)
- mid[i] = virt_to_mfn(leaf);
+ for (i = 0; i < P2M_PER_PAGE; i++)
+ p2m[i] = INVALID_P2M_ENTRY;
}
-static void p2m_init(unsigned long *p2m)
+static void p2m_init_identity(unsigned long *p2m, unsigned long pfn)
{
unsigned i;
- for (i = 0; i < P2M_MID_PER_PAGE; i++)
- p2m[i] = INVALID_P2M_ENTRY;
+ for (i = 0; i < P2M_PER_PAGE; i++)
+ p2m[i] = IDENTITY_FRAME(pfn + i);
+}
+
+static void * __ref alloc_p2m_page(void)
+{
+ if (unlikely(!slab_is_available()))
+ return alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE);
+
+ return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
+}
+
+/* Only to be called in case of a race for a page just allocated! */
+static void free_p2m_page(void *p)
+{
+ BUG_ON(!slab_is_available());
+ free_page((unsigned long)p);
}
/*
@@ -280,40 +186,46 @@ static void p2m_init(unsigned long *p2m)
*/
void __ref xen_build_mfn_list_list(void)
{
- unsigned long pfn;
+ unsigned long pfn, mfn;
+ pte_t *ptep;
+ unsigned int level, topidx, mididx;
+ unsigned long *mid_mfn_p;
if (xen_feature(XENFEAT_auto_translated_physmap))
return;
/* Pre-initialize p2m_top_mfn to be completely missing */
if (p2m_top_mfn == NULL) {
- p2m_mid_missing_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_missing_mfn = alloc_p2m_page();
p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
- p2m_top_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_mfn_p = alloc_p2m_page();
p2m_top_mfn_p_init(p2m_top_mfn_p);
- p2m_top_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_mfn = alloc_p2m_page();
p2m_top_mfn_init(p2m_top_mfn);
} else {
/* Reinitialise, mfn's all change after migration */
p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
}
- for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
- unsigned topidx = p2m_top_index(pfn);
- unsigned mididx = p2m_mid_index(pfn);
- unsigned long **mid;
- unsigned long *mid_mfn_p;
+ for (pfn = 0; pfn < xen_max_p2m_pfn && pfn < MAX_P2M_PFN;
+ pfn += P2M_PER_PAGE) {
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
- mid = p2m_top[topidx];
mid_mfn_p = p2m_top_mfn_p[topidx];
+ ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn),
+ &level);
+ BUG_ON(!ptep || level != PG_LEVEL_4K);
+ mfn = pte_mfn(*ptep);
+ ptep = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
/* Don't bother allocating any mfn mid levels if
* they're just missing, just update the stored mfn,
* since all could have changed over a migrate.
*/
- if (mid == p2m_mid_missing) {
+ if (ptep == p2m_missing_pte || ptep == p2m_identity_pte) {
BUG_ON(mididx);
BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
@@ -322,19 +234,14 @@ void __ref xen_build_mfn_list_list(void)
}
if (mid_mfn_p == p2m_mid_missing_mfn) {
- /*
- * XXX boot-time only! We should never find
- * missing parts of the mfn tree after
- * runtime.
- */
- mid_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE);
+ mid_mfn_p = alloc_p2m_page();
p2m_mid_mfn_init(mid_mfn_p, p2m_missing);
p2m_top_mfn_p[topidx] = mid_mfn_p;
}
p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
- mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
+ mid_mfn_p[mididx] = mfn;
}
}
@@ -353,171 +260,235 @@ void xen_setup_mfn_list_list(void)
/* Set up p2m_top to point to the domain-builder provided p2m pages */
void __init xen_build_dynamic_phys_to_machine(void)
{
- unsigned long *mfn_list;
- unsigned long max_pfn;
unsigned long pfn;
if (xen_feature(XENFEAT_auto_translated_physmap))
return;
- mfn_list = (unsigned long *)xen_start_info->mfn_list;
- max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
- xen_max_p2m_pfn = max_pfn;
+ xen_p2m_addr = (unsigned long *)xen_start_info->mfn_list;
+ xen_p2m_size = ALIGN(xen_start_info->nr_pages, P2M_PER_PAGE);
- p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_init(p2m_missing);
- p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_init(p2m_identity);
+ for (pfn = xen_start_info->nr_pages; pfn < xen_p2m_size; pfn++)
+ xen_p2m_addr[pfn] = INVALID_P2M_ENTRY;
- p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_init(p2m_mid_missing, p2m_missing);
- p2m_mid_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_init(p2m_mid_identity, p2m_identity);
+ xen_max_p2m_pfn = xen_p2m_size;
+}
- p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_top_init(p2m_top);
+#define P2M_TYPE_IDENTITY 0
+#define P2M_TYPE_MISSING 1
+#define P2M_TYPE_PFN 2
+#define P2M_TYPE_UNKNOWN 3
- /*
- * The domain builder gives us a pre-constructed p2m array in
- * mfn_list for all the pages initially given to us, so we just
- * need to graft that into our tree structure.
- */
- for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
- unsigned topidx = p2m_top_index(pfn);
- unsigned mididx = p2m_mid_index(pfn);
+static int xen_p2m_elem_type(unsigned long pfn)
+{
+ unsigned long mfn;
- if (p2m_top[topidx] == p2m_mid_missing) {
- unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_init(mid, p2m_missing);
+ if (pfn >= xen_p2m_size)
+ return P2M_TYPE_IDENTITY;
- p2m_top[topidx] = mid;
- }
+ mfn = xen_p2m_addr[pfn];
- /*
- * As long as the mfn_list has enough entries to completely
- * fill a p2m page, pointing into the array is ok. But if
- * not the entries beyond the last pfn will be undefined.
- */
- if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) {
- unsigned long p2midx;
+ if (mfn == INVALID_P2M_ENTRY)
+ return P2M_TYPE_MISSING;
- p2midx = max_pfn % P2M_PER_PAGE;
- for ( ; p2midx < P2M_PER_PAGE; p2midx++)
- mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY;
- }
- p2m_top[topidx][mididx] = &mfn_list[pfn];
- }
+ if (mfn & IDENTITY_FRAME_BIT)
+ return P2M_TYPE_IDENTITY;
- m2p_override_init();
+ return P2M_TYPE_PFN;
}
-#ifdef CONFIG_X86_64
-unsigned long __init xen_revector_p2m_tree(void)
+
+static void __init xen_rebuild_p2m_list(unsigned long *p2m)
{
- unsigned long va_start;
- unsigned long va_end;
+ unsigned int i, chunk;
unsigned long pfn;
- unsigned long pfn_free = 0;
- unsigned long *mfn_list = NULL;
- unsigned long size;
-
- va_start = xen_start_info->mfn_list;
- /*We copy in increments of P2M_PER_PAGE * sizeof(unsigned long),
- * so make sure it is rounded up to that */
- size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
- va_end = va_start + size;
-
- /* If we were revectored already, don't do it again. */
- if (va_start <= __START_KERNEL_map && va_start >= __PAGE_OFFSET)
- return 0;
+ unsigned long *mfns;
+ pte_t *ptep;
+ pmd_t *pmdp;
+ int type;
- mfn_list = alloc_bootmem_align(size, PAGE_SIZE);
- if (!mfn_list) {
- pr_warn("Could not allocate space for a new P2M tree!\n");
- return xen_start_info->mfn_list;
- }
- /* Fill it out with INVALID_P2M_ENTRY value */
- memset(mfn_list, 0xFF, size);
+ p2m_missing = alloc_p2m_page();
+ p2m_init(p2m_missing);
+ p2m_identity = alloc_p2m_page();
+ p2m_init(p2m_identity);
- for (pfn = 0; pfn < ALIGN(MAX_DOMAIN_PAGES, P2M_PER_PAGE); pfn += P2M_PER_PAGE) {
- unsigned topidx = p2m_top_index(pfn);
- unsigned mididx;
- unsigned long *mid_p;
+ p2m_missing_pte = alloc_p2m_page();
+ paravirt_alloc_pte(&init_mm, __pa(p2m_missing_pte) >> PAGE_SHIFT);
+ p2m_identity_pte = alloc_p2m_page();
+ paravirt_alloc_pte(&init_mm, __pa(p2m_identity_pte) >> PAGE_SHIFT);
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ set_pte(p2m_missing_pte + i,
+ pfn_pte(PFN_DOWN(__pa(p2m_missing)), PAGE_KERNEL_RO));
+ set_pte(p2m_identity_pte + i,
+ pfn_pte(PFN_DOWN(__pa(p2m_identity)), PAGE_KERNEL_RO));
+ }
- if (!p2m_top[topidx])
+ for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += chunk) {
+ /*
+ * Try to map missing/identity PMDs or p2m-pages if possible.
+ * We have to respect the structure of the mfn_list_list
+ * which will be built just afterwards.
+ * Chunk size to test is one p2m page if we are in the middle
+ * of a mfn_list_list mid page and the complete mid page area
+ * if we are at index 0 of the mid page. Please note that a
+ * mid page might cover more than one PMD, e.g. on 32 bit PAE
+ * kernels.
+ */
+ chunk = (pfn & (P2M_PER_PAGE * P2M_MID_PER_PAGE - 1)) ?
+ P2M_PER_PAGE : P2M_PER_PAGE * P2M_MID_PER_PAGE;
+
+ type = xen_p2m_elem_type(pfn);
+ i = 0;
+ if (type != P2M_TYPE_PFN)
+ for (i = 1; i < chunk; i++)
+ if (xen_p2m_elem_type(pfn + i) != type)
+ break;
+ if (i < chunk)
+ /* Reset to minimal chunk size. */
+ chunk = P2M_PER_PAGE;
+
+ if (type == P2M_TYPE_PFN || i < chunk) {
+ /* Use initial p2m page contents. */
+#ifdef CONFIG_X86_64
+ mfns = alloc_p2m_page();
+ copy_page(mfns, xen_p2m_addr + pfn);
+#else
+ mfns = xen_p2m_addr + pfn;
+#endif
+ ptep = populate_extra_pte((unsigned long)(p2m + pfn));
+ set_pte(ptep,
+ pfn_pte(PFN_DOWN(__pa(mfns)), PAGE_KERNEL));
continue;
+ }
- if (p2m_top[topidx] == p2m_mid_missing)
+ if (chunk == P2M_PER_PAGE) {
+ /* Map complete missing or identity p2m-page. */
+ mfns = (type == P2M_TYPE_MISSING) ?
+ p2m_missing : p2m_identity;
+ ptep = populate_extra_pte((unsigned long)(p2m + pfn));
+ set_pte(ptep,
+ pfn_pte(PFN_DOWN(__pa(mfns)), PAGE_KERNEL_RO));
continue;
+ }
- mididx = p2m_mid_index(pfn);
- mid_p = p2m_top[topidx][mididx];
- if (!mid_p)
- continue;
- if ((mid_p == p2m_missing) || (mid_p == p2m_identity))
- continue;
+ /* Complete missing or identity PMD(s) can be mapped. */
+ ptep = (type == P2M_TYPE_MISSING) ?
+ p2m_missing_pte : p2m_identity_pte;
+ for (i = 0; i < PMDS_PER_MID_PAGE; i++) {
+ pmdp = populate_extra_pmd(
+ (unsigned long)(p2m + pfn + i * PTRS_PER_PTE));
+ set_pmd(pmdp, __pmd(__pa(ptep) | _KERNPG_TABLE));
+ }
+ }
+}
- if ((unsigned long)mid_p == INVALID_P2M_ENTRY)
- continue;
+void __init xen_vmalloc_p2m_tree(void)
+{
+ static struct vm_struct vm;
- /* The old va. Rebase it on mfn_list */
- if (mid_p >= (unsigned long *)va_start && mid_p <= (unsigned long *)va_end) {
- unsigned long *new;
+ vm.flags = VM_ALLOC;
+ vm.size = ALIGN(sizeof(unsigned long) * xen_max_p2m_pfn,
+ PMD_SIZE * PMDS_PER_MID_PAGE);
+ vm_area_register_early(&vm, PMD_SIZE * PMDS_PER_MID_PAGE);
+ pr_notice("p2m virtual area at %p, size is %lx\n", vm.addr, vm.size);
- if (pfn_free > (size / sizeof(unsigned long))) {
- WARN(1, "Only allocated for %ld pages, but we want %ld!\n",
- size / sizeof(unsigned long), pfn_free);
- return 0;
- }
- new = &mfn_list[pfn_free];
+ xen_max_p2m_pfn = vm.size / sizeof(unsigned long);
- copy_page(new, mid_p);
- p2m_top[topidx][mididx] = &mfn_list[pfn_free];
+ xen_rebuild_p2m_list(vm.addr);
- pfn_free += P2M_PER_PAGE;
+ xen_p2m_addr = vm.addr;
+ xen_p2m_size = xen_max_p2m_pfn;
- }
- /* This should be the leafs allocated for identity from _brk. */
- }
- return (unsigned long)mfn_list;
+ xen_inv_extra_mem();
+ m2p_override_init();
}
-#else
-unsigned long __init xen_revector_p2m_tree(void)
-{
- return 0;
-}
-#endif
+
unsigned long get_phys_to_machine(unsigned long pfn)
{
- unsigned topidx, mididx, idx;
+ pte_t *ptep;
+ unsigned int level;
+
+ if (unlikely(pfn >= xen_p2m_size)) {
+ if (pfn < xen_max_p2m_pfn)
+ return xen_chk_extra_mem(pfn);
- if (unlikely(pfn >= MAX_P2M_PFN))
return IDENTITY_FRAME(pfn);
+ }
- topidx = p2m_top_index(pfn);
- mididx = p2m_mid_index(pfn);
- idx = p2m_index(pfn);
+ ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn), &level);
+ BUG_ON(!ptep || level != PG_LEVEL_4K);
/*
* The INVALID_P2M_ENTRY is filled in both p2m_*identity
* and in p2m_*missing, so returning the INVALID_P2M_ENTRY
* would be wrong.
*/
- if (p2m_top[topidx][mididx] == p2m_identity)
+ if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_identity)))
return IDENTITY_FRAME(pfn);
- return p2m_top[topidx][mididx][idx];
+ return xen_p2m_addr[pfn];
}
EXPORT_SYMBOL_GPL(get_phys_to_machine);
-static void *alloc_p2m_page(void)
+/*
+ * Allocate new pmd(s). It is checked whether the old pmd is still in place.
+ * If not, nothing is changed. This is okay as the only reason for allocating
+ * a new pmd is to replace p2m_missing_pte or p2m_identity_pte by a individual
+ * pmd. In case of PAE/x86-32 there are multiple pmds to allocate!
+ */
+static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *ptep, pte_t *pte_pg)
{
- return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
-}
+ pte_t *ptechk;
+ pte_t *pteret = ptep;
+ pte_t *pte_newpg[PMDS_PER_MID_PAGE];
+ pmd_t *pmdp;
+ unsigned int level;
+ unsigned long flags;
+ unsigned long vaddr;
+ int i;
-static void free_p2m_page(void *p)
-{
- free_page((unsigned long)p);
+ /* Do all allocations first to bail out in error case. */
+ for (i = 0; i < PMDS_PER_MID_PAGE; i++) {
+ pte_newpg[i] = alloc_p2m_page();
+ if (!pte_newpg[i]) {
+ for (i--; i >= 0; i--)
+ free_p2m_page(pte_newpg[i]);
+
+ return NULL;
+ }
+ }
+
+ vaddr = addr & ~(PMD_SIZE * PMDS_PER_MID_PAGE - 1);
+
+ for (i = 0; i < PMDS_PER_MID_PAGE; i++) {
+ copy_page(pte_newpg[i], pte_pg);
+ paravirt_alloc_pte(&init_mm, __pa(pte_newpg[i]) >> PAGE_SHIFT);
+
+ pmdp = lookup_pmd_address(vaddr);
+ BUG_ON(!pmdp);
+
+ spin_lock_irqsave(&p2m_update_lock, flags);
+
+ ptechk = lookup_address(vaddr, &level);
+ if (ptechk == pte_pg) {
+ set_pmd(pmdp,
+ __pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE));
+ if (vaddr == (addr & ~(PMD_SIZE - 1)))
+ pteret = pte_offset_kernel(pmdp, addr);
+ pte_newpg[i] = NULL;
+ }
+
+ spin_unlock_irqrestore(&p2m_update_lock, flags);
+
+ if (pte_newpg[i]) {
+ paravirt_release_pte(__pa(pte_newpg[i]) >> PAGE_SHIFT);
+ free_p2m_page(pte_newpg[i]);
+ }
+
+ vaddr += PMD_SIZE;
+ }
+
+ return pteret;
}
/*
@@ -530,58 +501,62 @@ static void free_p2m_page(void *p)
static bool alloc_p2m(unsigned long pfn)
{
unsigned topidx, mididx;
- unsigned long ***top_p, **mid;
unsigned long *top_mfn_p, *mid_mfn;
- unsigned long *p2m_orig;
+ pte_t *ptep, *pte_pg;
+ unsigned int level;
+ unsigned long flags;
+ unsigned long addr = (unsigned long)(xen_p2m_addr + pfn);
+ unsigned long p2m_pfn;
topidx = p2m_top_index(pfn);
mididx = p2m_mid_index(pfn);
- top_p = &p2m_top[topidx];
- mid = ACCESS_ONCE(*top_p);
+ ptep = lookup_address(addr, &level);
+ BUG_ON(!ptep || level != PG_LEVEL_4K);
+ pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
- if (mid == p2m_mid_missing) {
- /* Mid level is missing, allocate a new one */
- mid = alloc_p2m_page();
- if (!mid)
+ if (pte_pg == p2m_missing_pte || pte_pg == p2m_identity_pte) {
+ /* PMD level is missing, allocate a new one */
+ ptep = alloc_p2m_pmd(addr, ptep, pte_pg);
+ if (!ptep)
return false;
-
- p2m_mid_init(mid, p2m_missing);
-
- if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
- free_p2m_page(mid);
}
- top_mfn_p = &p2m_top_mfn[topidx];
- mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]);
+ if (p2m_top_mfn) {
+ top_mfn_p = &p2m_top_mfn[topidx];
+ mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]);
- BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
+ BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
- if (mid_mfn == p2m_mid_missing_mfn) {
- /* Separately check the mid mfn level */
- unsigned long missing_mfn;
- unsigned long mid_mfn_mfn;
- unsigned long old_mfn;
+ if (mid_mfn == p2m_mid_missing_mfn) {
+ /* Separately check the mid mfn level */
+ unsigned long missing_mfn;
+ unsigned long mid_mfn_mfn;
+ unsigned long old_mfn;
- mid_mfn = alloc_p2m_page();
- if (!mid_mfn)
- return false;
+ mid_mfn = alloc_p2m_page();
+ if (!mid_mfn)
+ return false;
- p2m_mid_mfn_init(mid_mfn, p2m_missing);
+ p2m_mid_mfn_init(mid_mfn, p2m_missing);
- missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
- mid_mfn_mfn = virt_to_mfn(mid_mfn);
- old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn);
- if (old_mfn != missing_mfn) {
- free_p2m_page(mid_mfn);
- mid_mfn = mfn_to_virt(old_mfn);
- } else {
- p2m_top_mfn_p[topidx] = mid_mfn;
+ missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
+ mid_mfn_mfn = virt_to_mfn(mid_mfn);
+ old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn);
+ if (old_mfn != missing_mfn) {
+ free_p2m_page(mid_mfn);
+ mid_mfn = mfn_to_virt(old_mfn);
+ } else {
+ p2m_top_mfn_p[topidx] = mid_mfn;
+ }
}
+ } else {
+ mid_mfn = NULL;
}
- p2m_orig = ACCESS_ONCE(p2m_top[topidx][mididx]);
- if (p2m_orig == p2m_identity || p2m_orig == p2m_missing) {
+ p2m_pfn = pte_pfn(ACCESS_ONCE(*ptep));
+ if (p2m_pfn == PFN_DOWN(__pa(p2m_identity)) ||
+ p2m_pfn == PFN_DOWN(__pa(p2m_missing))) {
/* p2m leaf page is missing */
unsigned long *p2m;
@@ -589,183 +564,36 @@ static bool alloc_p2m(unsigned long pfn)
if (!p2m)
return false;
- p2m_init(p2m);
-
- if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig)
- free_p2m_page(p2m);
+ if (p2m_pfn == PFN_DOWN(__pa(p2m_missing)))
+ p2m_init(p2m);
else
- mid_mfn[mididx] = virt_to_mfn(p2m);
- }
-
- return true;
-}
-
-static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary)
-{
- unsigned topidx, mididx, idx;
- unsigned long *p2m;
-
- topidx = p2m_top_index(pfn);
- mididx = p2m_mid_index(pfn);
- idx = p2m_index(pfn);
-
- /* Pfff.. No boundary cross-over, lets get out. */
- if (!idx && check_boundary)
- return false;
-
- WARN(p2m_top[topidx][mididx] == p2m_identity,
- "P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n",
- topidx, mididx);
-
- /*
- * Could be done by xen_build_dynamic_phys_to_machine..
- */
- if (p2m_top[topidx][mididx] != p2m_missing)
- return false;
-
- /* Boundary cross-over for the edges: */
- p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
-
- p2m_init(p2m);
+ p2m_init_identity(p2m, pfn);
- p2m_top[topidx][mididx] = p2m;
+ spin_lock_irqsave(&p2m_update_lock, flags);
- return true;
-}
-
-static bool __init early_alloc_p2m_middle(unsigned long pfn)
-{
- unsigned topidx = p2m_top_index(pfn);
- unsigned long **mid;
-
- mid = p2m_top[topidx];
- if (mid == p2m_mid_missing) {
- mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
-
- p2m_mid_init(mid, p2m_missing);
-
- p2m_top[topidx] = mid;
- }
- return true;
-}
-
-/*
- * Skim over the P2M tree looking at pages that are either filled with
- * INVALID_P2M_ENTRY or with 1:1 PFNs. If found, re-use that page and
- * replace the P2M leaf with a p2m_missing or p2m_identity.
- * Stick the old page in the new P2M tree location.
- */
-static bool __init early_can_reuse_p2m_middle(unsigned long set_pfn)
-{
- unsigned topidx;
- unsigned mididx;
- unsigned ident_pfns;
- unsigned inv_pfns;
- unsigned long *p2m;
- unsigned idx;
- unsigned long pfn;
-
- /* We only look when this entails a P2M middle layer */
- if (p2m_index(set_pfn))
- return false;
-
- for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) {
- topidx = p2m_top_index(pfn);
-
- if (!p2m_top[topidx])
- continue;
-
- if (p2m_top[topidx] == p2m_mid_missing)
- continue;
-
- mididx = p2m_mid_index(pfn);
- p2m = p2m_top[topidx][mididx];
- if (!p2m)
- continue;
-
- if ((p2m == p2m_missing) || (p2m == p2m_identity))
- continue;
-
- if ((unsigned long)p2m == INVALID_P2M_ENTRY)
- continue;
-
- ident_pfns = 0;
- inv_pfns = 0;
- for (idx = 0; idx < P2M_PER_PAGE; idx++) {
- /* IDENTITY_PFNs are 1:1 */
- if (p2m[idx] == IDENTITY_FRAME(pfn + idx))
- ident_pfns++;
- else if (p2m[idx] == INVALID_P2M_ENTRY)
- inv_pfns++;
- else
- break;
+ if (pte_pfn(*ptep) == p2m_pfn) {
+ set_pte(ptep,
+ pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL));
+ if (mid_mfn)
+ mid_mfn[mididx] = virt_to_mfn(p2m);
+ p2m = NULL;
}
- if ((ident_pfns == P2M_PER_PAGE) || (inv_pfns == P2M_PER_PAGE))
- goto found;
- }
- return false;
-found:
- /* Found one, replace old with p2m_identity or p2m_missing */
- p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing);
-
- /* Reset where we want to stick the old page in. */
- topidx = p2m_top_index(set_pfn);
- mididx = p2m_mid_index(set_pfn);
-
- /* This shouldn't happen */
- if (WARN_ON(p2m_top[topidx] == p2m_mid_missing))
- early_alloc_p2m_middle(set_pfn);
-
- if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing))
- return false;
-
- p2m_init(p2m);
- p2m_top[topidx][mididx] = p2m;
- return true;
-}
-bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn)
-{
- if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
- if (!early_alloc_p2m_middle(pfn))
- return false;
-
- if (early_can_reuse_p2m_middle(pfn))
- return __set_phys_to_machine(pfn, mfn);
-
- if (!early_alloc_p2m(pfn, false /* boundary crossover OK!*/))
- return false;
+ spin_unlock_irqrestore(&p2m_update_lock, flags);
- if (!__set_phys_to_machine(pfn, mfn))
- return false;
+ if (p2m)
+ free_p2m_page(p2m);
}
return true;
}
-static void __init early_split_p2m(unsigned long pfn)
-{
- unsigned long mididx, idx;
-
- mididx = p2m_mid_index(pfn);
- idx = p2m_index(pfn);
-
- /*
- * Allocate new middle and leaf pages if this pfn lies in the
- * middle of one.
- */
- if (mididx || idx)
- early_alloc_p2m_middle(pfn);
- if (idx)
- early_alloc_p2m(pfn, false);
-}
-
unsigned long __init set_phys_range_identity(unsigned long pfn_s,
unsigned long pfn_e)
{
unsigned long pfn;
- if (unlikely(pfn_s >= MAX_P2M_PFN))
+ if (unlikely(pfn_s >= xen_p2m_size))
return 0;
if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
@@ -774,101 +602,51 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s,
if (pfn_s > pfn_e)
return 0;
- if (pfn_e > MAX_P2M_PFN)
- pfn_e = MAX_P2M_PFN;
-
- early_split_p2m(pfn_s);
- early_split_p2m(pfn_e);
-
- for (pfn = pfn_s; pfn < pfn_e;) {
- unsigned topidx = p2m_top_index(pfn);
- unsigned mididx = p2m_mid_index(pfn);
-
- if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
- break;
- pfn++;
-
- /*
- * If the PFN was set to a middle or leaf identity
- * page the remainder must also be identity, so skip
- * ahead to the next middle or leaf entry.
- */
- if (p2m_top[topidx] == p2m_mid_identity)
- pfn = ALIGN(pfn, P2M_MID_PER_PAGE * P2M_PER_PAGE);
- else if (p2m_top[topidx][mididx] == p2m_identity)
- pfn = ALIGN(pfn, P2M_PER_PAGE);
- }
+ if (pfn_e > xen_p2m_size)
+ pfn_e = xen_p2m_size;
- WARN((pfn - pfn_s) != (pfn_e - pfn_s),
- "Identity mapping failed. We are %ld short of 1-1 mappings!\n",
- (pfn_e - pfn_s) - (pfn - pfn_s));
+ for (pfn = pfn_s; pfn < pfn_e; pfn++)
+ xen_p2m_addr[pfn] = IDENTITY_FRAME(pfn);
return pfn - pfn_s;
}
-/* Try to install p2m mapping; fail if intermediate bits missing */
bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
- unsigned topidx, mididx, idx;
+ pte_t *ptep;
+ unsigned int level;
/* don't track P2M changes in autotranslate guests */
if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
return true;
- if (unlikely(pfn >= MAX_P2M_PFN)) {
+ if (unlikely(pfn >= xen_p2m_size)) {
BUG_ON(mfn != INVALID_P2M_ENTRY);
return true;
}
- topidx = p2m_top_index(pfn);
- mididx = p2m_mid_index(pfn);
- idx = p2m_index(pfn);
-
- /* For sparse holes were the p2m leaf has real PFN along with
- * PCI holes, stick in the PFN as the MFN value.
- *
- * set_phys_range_identity() will have allocated new middle
- * and leaf pages as required so an existing p2m_mid_missing
- * or p2m_missing mean that whole range will be identity so
- * these can be switched to p2m_mid_identity or p2m_identity.
- */
- if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) {
- if (p2m_top[topidx] == p2m_mid_identity)
- return true;
-
- if (p2m_top[topidx] == p2m_mid_missing) {
- WARN_ON(cmpxchg(&p2m_top[topidx], p2m_mid_missing,
- p2m_mid_identity) != p2m_mid_missing);
- return true;
- }
-
- if (p2m_top[topidx][mididx] == p2m_identity)
- return true;
+ if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn)))
+ return true;
- /* Swap over from MISSING to IDENTITY if needed. */
- if (p2m_top[topidx][mididx] == p2m_missing) {
- WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing,
- p2m_identity) != p2m_missing);
- return true;
- }
- }
+ ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn), &level);
+ BUG_ON(!ptep || level != PG_LEVEL_4K);
- if (p2m_top[topidx][mididx] == p2m_missing)
+ if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_missing)))
return mfn == INVALID_P2M_ENTRY;
- p2m_top[topidx][mididx][idx] = mfn;
+ if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_identity)))
+ return mfn == IDENTITY_FRAME(pfn);
- return true;
+ return false;
}
bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
- if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
+ if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
if (!alloc_p2m(pfn))
return false;
- if (!__set_phys_to_machine(pfn, mfn))
- return false;
+ return __set_phys_to_machine(pfn, mfn);
}
return true;
@@ -877,15 +655,16 @@ bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
#define M2P_OVERRIDE_HASH_SHIFT 10
#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT)
-static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH);
+static struct list_head *m2p_overrides;
static DEFINE_SPINLOCK(m2p_override_lock);
static void __init m2p_override_init(void)
{
unsigned i;
- m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
- sizeof(unsigned long));
+ m2p_overrides = alloc_bootmem_align(
+ sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
+ sizeof(unsigned long));
for (i = 0; i < M2P_OVERRIDE_HASH; i++)
INIT_LIST_HEAD(&m2p_overrides[i]);
@@ -896,68 +675,9 @@ static unsigned long mfn_hash(unsigned long mfn)
return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
}
-int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
- struct gnttab_map_grant_ref *kmap_ops,
- struct page **pages, unsigned int count)
-{
- int i, ret = 0;
- bool lazy = false;
- pte_t *pte;
-
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return 0;
-
- if (kmap_ops &&
- !in_interrupt() &&
- paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
- arch_enter_lazy_mmu_mode();
- lazy = true;
- }
-
- for (i = 0; i < count; i++) {
- unsigned long mfn, pfn;
-
- /* Do not add to override if the map failed. */
- if (map_ops[i].status)
- continue;
-
- if (map_ops[i].flags & GNTMAP_contains_pte) {
- pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
- (map_ops[i].host_addr & ~PAGE_MASK));
- mfn = pte_mfn(*pte);
- } else {
- mfn = PFN_DOWN(map_ops[i].dev_bus_addr);
- }
- pfn = page_to_pfn(pages[i]);
-
- WARN_ON(PagePrivate(pages[i]));
- SetPagePrivate(pages[i]);
- set_page_private(pages[i], mfn);
- pages[i]->index = pfn_to_mfn(pfn);
-
- if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) {
- ret = -ENOMEM;
- goto out;
- }
-
- if (kmap_ops) {
- ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]);
- if (ret)
- goto out;
- }
- }
-
-out:
- if (lazy)
- arch_leave_lazy_mmu_mode();
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping);
-
/* Add an MFN override for a particular page */
-int m2p_add_override(unsigned long mfn, struct page *page,
- struct gnttab_map_grant_ref *kmap_op)
+static int m2p_add_override(unsigned long mfn, struct page *page,
+ struct gnttab_map_grant_ref *kmap_op)
{
unsigned long flags;
unsigned long pfn;
@@ -970,7 +690,7 @@ int m2p_add_override(unsigned long mfn, struct page *page,
address = (unsigned long)__va(pfn << PAGE_SHIFT);
ptep = lookup_address(address, &level);
if (WARN(ptep == NULL || level != PG_LEVEL_4K,
- "m2p_add_override: pfn %lx not mapped", pfn))
+ "m2p_add_override: pfn %lx not mapped", pfn))
return -EINVAL;
}
@@ -1004,19 +724,19 @@ int m2p_add_override(unsigned long mfn, struct page *page,
* because mfn_to_pfn (that ends up being called by GUPF) will
* return the backend pfn rather than the frontend pfn. */
pfn = mfn_to_pfn_no_overrides(mfn);
- if (get_phys_to_machine(pfn) == mfn)
+ if (__pfn_to_mfn(pfn) == mfn)
set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
return 0;
}
-EXPORT_SYMBOL_GPL(m2p_add_override);
-int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
- struct gnttab_map_grant_ref *kmap_ops,
- struct page **pages, unsigned int count)
+int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
+ struct gnttab_map_grant_ref *kmap_ops,
+ struct page **pages, unsigned int count)
{
int i, ret = 0;
bool lazy = false;
+ pte_t *pte;
if (xen_feature(XENFEAT_auto_translated_physmap))
return 0;
@@ -1029,35 +749,75 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
}
for (i = 0; i < count; i++) {
- unsigned long mfn = get_phys_to_machine(page_to_pfn(pages[i]));
- unsigned long pfn = page_to_pfn(pages[i]);
+ unsigned long mfn, pfn;
- if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) {
- ret = -EINVAL;
- goto out;
+ /* Do not add to override if the map failed. */
+ if (map_ops[i].status)
+ continue;
+
+ if (map_ops[i].flags & GNTMAP_contains_pte) {
+ pte = (pte_t *)(mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
+ (map_ops[i].host_addr & ~PAGE_MASK));
+ mfn = pte_mfn(*pte);
+ } else {
+ mfn = PFN_DOWN(map_ops[i].dev_bus_addr);
}
+ pfn = page_to_pfn(pages[i]);
- set_page_private(pages[i], INVALID_P2M_ENTRY);
- WARN_ON(!PagePrivate(pages[i]));
- ClearPagePrivate(pages[i]);
- set_phys_to_machine(pfn, pages[i]->index);
+ WARN_ON(PagePrivate(pages[i]));
+ SetPagePrivate(pages[i]);
+ set_page_private(pages[i], mfn);
+ pages[i]->index = pfn_to_mfn(pfn);
- if (kmap_ops)
- ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn);
- if (ret)
+ if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) {
+ ret = -ENOMEM;
goto out;
+ }
+
+ if (kmap_ops) {
+ ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]);
+ if (ret)
+ goto out;
+ }
}
out:
if (lazy)
arch_leave_lazy_mmu_mode();
+
return ret;
}
-EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping);
+EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping);
-int m2p_remove_override(struct page *page,
- struct gnttab_map_grant_ref *kmap_op,
- unsigned long mfn)
+static struct page *m2p_find_override(unsigned long mfn)
+{
+ unsigned long flags;
+ struct list_head *bucket;
+ struct page *p, *ret;
+
+ if (unlikely(!m2p_overrides))
+ return NULL;
+
+ ret = NULL;
+ bucket = &m2p_overrides[mfn_hash(mfn)];
+
+ spin_lock_irqsave(&m2p_override_lock, flags);
+
+ list_for_each_entry(p, bucket, lru) {
+ if (page_private(p) == mfn) {
+ ret = p;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&m2p_override_lock, flags);
+
+ return ret;
+}
+
+static int m2p_remove_override(struct page *page,
+ struct gnttab_map_grant_ref *kmap_op,
+ unsigned long mfn)
{
unsigned long flags;
unsigned long pfn;
@@ -1072,7 +832,7 @@ int m2p_remove_override(struct page *page,
ptep = lookup_address(address, &level);
if (WARN(ptep == NULL || level != PG_LEVEL_4K,
- "m2p_remove_override: pfn %lx not mapped", pfn))
+ "m2p_remove_override: pfn %lx not mapped", pfn))
return -EINVAL;
}
@@ -1102,9 +862,8 @@ int m2p_remove_override(struct page *page,
* hypercall actually returned an error.
*/
if (kmap_op->handle == GNTST_general_error) {
- printk(KERN_WARNING "m2p_remove_override: "
- "pfn %lx mfn %lx, failed to modify kernel mappings",
- pfn, mfn);
+ pr_warn("m2p_remove_override: pfn %lx mfn %lx, failed to modify kernel mappings",
+ pfn, mfn);
put_balloon_scratch_page();
return -1;
}
@@ -1112,14 +871,14 @@ int m2p_remove_override(struct page *page,
xen_mc_batch();
mcs = __xen_mc_entry(
- sizeof(struct gnttab_unmap_and_replace));
+ sizeof(struct gnttab_unmap_and_replace));
unmap_op = mcs.args;
unmap_op->host_addr = kmap_op->host_addr;
unmap_op->new_addr = scratch_page_address;
unmap_op->handle = kmap_op->handle;
MULTI_grant_table_op(mcs.mc,
- GNTTABOP_unmap_and_replace, unmap_op, 1);
+ GNTTABOP_unmap_and_replace, unmap_op, 1);
mcs = __xen_mc_entry(0);
MULTI_update_va_mapping(mcs.mc, scratch_page_address,
@@ -1145,35 +904,56 @@ int m2p_remove_override(struct page *page,
* pfn again. */
mfn &= ~FOREIGN_FRAME_BIT;
pfn = mfn_to_pfn_no_overrides(mfn);
- if (get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) &&
+ if (__pfn_to_mfn(pfn) == FOREIGN_FRAME(mfn) &&
m2p_find_override(mfn) == NULL)
set_phys_to_machine(pfn, mfn);
return 0;
}
-EXPORT_SYMBOL_GPL(m2p_remove_override);
-struct page *m2p_find_override(unsigned long mfn)
+int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
+ struct gnttab_map_grant_ref *kmap_ops,
+ struct page **pages, unsigned int count)
{
- unsigned long flags;
- struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)];
- struct page *p, *ret;
+ int i, ret = 0;
+ bool lazy = false;
- ret = NULL;
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
- spin_lock_irqsave(&m2p_override_lock, flags);
+ if (kmap_ops &&
+ !in_interrupt() &&
+ paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
+ arch_enter_lazy_mmu_mode();
+ lazy = true;
+ }
- list_for_each_entry(p, bucket, lru) {
- if (page_private(p) == mfn) {
- ret = p;
- break;
+ for (i = 0; i < count; i++) {
+ unsigned long mfn = __pfn_to_mfn(page_to_pfn(pages[i]));
+ unsigned long pfn = page_to_pfn(pages[i]);
+
+ if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) {
+ ret = -EINVAL;
+ goto out;
}
- }
- spin_unlock_irqrestore(&m2p_override_lock, flags);
+ set_page_private(pages[i], INVALID_P2M_ENTRY);
+ WARN_ON(!PagePrivate(pages[i]));
+ ClearPagePrivate(pages[i]);
+ set_phys_to_machine(pfn, pages[i]->index);
+
+ if (kmap_ops)
+ ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn);
+ if (ret)
+ goto out;
+ }
+out:
+ if (lazy)
+ arch_leave_lazy_mmu_mode();
return ret;
}
+EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping);
unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
{
@@ -1192,79 +972,29 @@ EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
#include "debugfs.h"
static int p2m_dump_show(struct seq_file *m, void *v)
{
- static const char * const level_name[] = { "top", "middle",
- "entry", "abnormal", "error"};
-#define TYPE_IDENTITY 0
-#define TYPE_MISSING 1
-#define TYPE_PFN 2
-#define TYPE_UNKNOWN 3
static const char * const type_name[] = {
- [TYPE_IDENTITY] = "identity",
- [TYPE_MISSING] = "missing",
- [TYPE_PFN] = "pfn",
- [TYPE_UNKNOWN] = "abnormal"};
- unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0;
- unsigned int uninitialized_var(prev_level);
- unsigned int uninitialized_var(prev_type);
-
- if (!p2m_top)
- return 0;
-
- for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) {
- unsigned topidx = p2m_top_index(pfn);
- unsigned mididx = p2m_mid_index(pfn);
- unsigned idx = p2m_index(pfn);
- unsigned lvl, type;
-
- lvl = 4;
- type = TYPE_UNKNOWN;
- if (p2m_top[topidx] == p2m_mid_missing) {
- lvl = 0; type = TYPE_MISSING;
- } else if (p2m_top[topidx] == NULL) {
- lvl = 0; type = TYPE_UNKNOWN;
- } else if (p2m_top[topidx][mididx] == NULL) {
- lvl = 1; type = TYPE_UNKNOWN;
- } else if (p2m_top[topidx][mididx] == p2m_identity) {
- lvl = 1; type = TYPE_IDENTITY;
- } else if (p2m_top[topidx][mididx] == p2m_missing) {
- lvl = 1; type = TYPE_MISSING;
- } else if (p2m_top[topidx][mididx][idx] == 0) {
- lvl = 2; type = TYPE_UNKNOWN;
- } else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) {
- lvl = 2; type = TYPE_IDENTITY;
- } else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) {
- lvl = 2; type = TYPE_MISSING;
- } else if (p2m_top[topidx][mididx][idx] == pfn) {
- lvl = 2; type = TYPE_PFN;
- } else if (p2m_top[topidx][mididx][idx] != pfn) {
- lvl = 2; type = TYPE_PFN;
- }
- if (pfn == 0) {
- prev_level = lvl;
- prev_type = type;
- }
- if (pfn == MAX_DOMAIN_PAGES-1) {
- lvl = 3;
- type = TYPE_UNKNOWN;
- }
- if (prev_type != type) {
- seq_printf(m, " [0x%lx->0x%lx] %s\n",
- prev_pfn_type, pfn, type_name[prev_type]);
- prev_pfn_type = pfn;
+ [P2M_TYPE_IDENTITY] = "identity",
+ [P2M_TYPE_MISSING] = "missing",
+ [P2M_TYPE_PFN] = "pfn",
+ [P2M_TYPE_UNKNOWN] = "abnormal"};
+ unsigned long pfn, first_pfn;
+ int type, prev_type;
+
+ prev_type = xen_p2m_elem_type(0);
+ first_pfn = 0;
+
+ for (pfn = 0; pfn < xen_p2m_size; pfn++) {
+ type = xen_p2m_elem_type(pfn);
+ if (type != prev_type) {
+ seq_printf(m, " [0x%lx->0x%lx] %s\n", first_pfn, pfn,
+ type_name[prev_type]);
prev_type = type;
- }
- if (prev_level != lvl) {
- seq_printf(m, " [0x%lx->0x%lx] level %s\n",
- prev_pfn_level, pfn, level_name[prev_level]);
- prev_pfn_level = pfn;
- prev_level = lvl;
+ first_pfn = pfn;
}
}
+ seq_printf(m, " [0x%lx->0x%lx] %s\n", first_pfn, pfn,
+ type_name[prev_type]);
return 0;
-#undef TYPE_IDENTITY
-#undef TYPE_MISSING
-#undef TYPE_PFN
-#undef TYPE_UNKNOWN
}
static int p2m_dump_open(struct inode *inode, struct file *filp)
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 29834b3fd87f..dfd77dec8e2b 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -30,6 +30,7 @@
#include "xen-ops.h"
#include "vdso.h"
#include "p2m.h"
+#include "mmu.h"
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
@@ -47,8 +48,19 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;
-/* Buffer used to remap identity mapped pages */
-unsigned long xen_remap_buf[P2M_PER_PAGE] __initdata;
+/*
+ * Buffer used to remap identity mapped pages. We only need the virtual space.
+ * The physical page behind this address is remapped as needed to different
+ * buffer pages.
+ */
+#define REMAP_SIZE (P2M_PER_PAGE - 3)
+static struct {
+ unsigned long next_area_mfn;
+ unsigned long target_pfn;
+ unsigned long size;
+ unsigned long mfns[REMAP_SIZE];
+} xen_remap_buf __initdata __aligned(PAGE_SIZE);
+static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
/*
* The maximum amount of extra memory compared to the base size. The
@@ -64,7 +76,6 @@ unsigned long xen_remap_buf[P2M_PER_PAGE] __initdata;
static void __init xen_add_extra_mem(u64 start, u64 size)
{
- unsigned long pfn;
int i;
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
@@ -84,75 +95,76 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
printk(KERN_WARNING "Warning: not enough extra memory regions\n");
memblock_reserve(start, size);
+}
- xen_max_p2m_pfn = PFN_DOWN(start + size);
- for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
- unsigned long mfn = pfn_to_mfn(pfn);
-
- if (WARN_ONCE(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn))
- continue;
- WARN_ONCE(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n",
- pfn, mfn);
+static void __init xen_del_extra_mem(u64 start, u64 size)
+{
+ int i;
+ u64 start_r, size_r;
- __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+ start_r = xen_extra_mem[i].start;
+ size_r = xen_extra_mem[i].size;
+
+ /* Start of region. */
+ if (start_r == start) {
+ BUG_ON(size > size_r);
+ xen_extra_mem[i].start += size;
+ xen_extra_mem[i].size -= size;
+ break;
+ }
+ /* End of region. */
+ if (start_r + size_r == start + size) {
+ BUG_ON(size > size_r);
+ xen_extra_mem[i].size -= size;
+ break;
+ }
+ /* Mid of region. */
+ if (start > start_r && start < start_r + size_r) {
+ BUG_ON(start + size > start_r + size_r);
+ xen_extra_mem[i].size = start - start_r;
+ /* Calling memblock_reserve() again is okay. */
+ xen_add_extra_mem(start + size, start_r + size_r -
+ (start + size));
+ break;
+ }
}
+ memblock_free(start, size);
}
-static unsigned long __init xen_do_chunk(unsigned long start,
- unsigned long end, bool release)
+/*
+ * Called during boot before the p2m list can take entries beyond the
+ * hypervisor supplied p2m list. Entries in extra mem are to be regarded as
+ * invalid.
+ */
+unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
{
- struct xen_memory_reservation reservation = {
- .address_bits = 0,
- .extent_order = 0,
- .domid = DOMID_SELF
- };
- unsigned long len = 0;
- unsigned long pfn;
- int ret;
+ int i;
+ unsigned long addr = PFN_PHYS(pfn);
- for (pfn = start; pfn < end; pfn++) {
- unsigned long frame;
- unsigned long mfn = pfn_to_mfn(pfn);
+ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+ if (addr >= xen_extra_mem[i].start &&
+ addr < xen_extra_mem[i].start + xen_extra_mem[i].size)
+ return INVALID_P2M_ENTRY;
+ }
- if (release) {
- /* Make sure pfn exists to start with */
- if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
- continue;
- frame = mfn;
- } else {
- if (mfn != INVALID_P2M_ENTRY)
- continue;
- frame = pfn;
- }
- set_xen_guest_handle(reservation.extent_start, &frame);
- reservation.nr_extents = 1;
+ return IDENTITY_FRAME(pfn);
+}
- ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap,
- &reservation);
- WARN(ret != 1, "Failed to %s pfn %lx err=%d\n",
- release ? "release" : "populate", pfn, ret);
+/*
+ * Mark all pfns of extra mem as invalid in p2m list.
+ */
+void __init xen_inv_extra_mem(void)
+{
+ unsigned long pfn, pfn_s, pfn_e;
+ int i;
- if (ret == 1) {
- if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) {
- if (release)
- break;
- set_xen_guest_handle(reservation.extent_start, &frame);
- reservation.nr_extents = 1;
- ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
- &reservation);
- break;
- }
- len++;
- } else
- break;
+ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+ pfn_s = PFN_DOWN(xen_extra_mem[i].start);
+ pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size);
+ for (pfn = pfn_s; pfn < pfn_e; pfn++)
+ set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
}
- if (len)
- printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n",
- release ? "Freeing" : "Populating",
- start, end, len,
- release ? "freed" : "added");
-
- return len;
}
/*
@@ -198,26 +210,62 @@ static unsigned long __init xen_find_pfn_range(
return done;
}
+static int __init xen_free_mfn(unsigned long mfn)
+{
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+
+ set_xen_guest_handle(reservation.extent_start, &mfn);
+ reservation.nr_extents = 1;
+
+ return HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
+}
+
/*
- * This releases a chunk of memory and then does the identity map. It's used as
+ * This releases a chunk of memory and then does the identity map. It's used
* as a fallback if the remapping fails.
*/
static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
unsigned long end_pfn, unsigned long nr_pages, unsigned long *identity,
unsigned long *released)
{
+ unsigned long len = 0;
+ unsigned long pfn, end;
+ int ret;
+
WARN_ON(start_pfn > end_pfn);
+ end = min(end_pfn, nr_pages);
+ for (pfn = start_pfn; pfn < end; pfn++) {
+ unsigned long mfn = pfn_to_mfn(pfn);
+
+ /* Make sure pfn exists to start with */
+ if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
+ continue;
+
+ ret = xen_free_mfn(mfn);
+ WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
+
+ if (ret == 1) {
+ if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
+ break;
+ len++;
+ } else
+ break;
+ }
+
/* Need to release pages first */
- *released += xen_do_chunk(start_pfn, min(end_pfn, nr_pages), true);
+ *released += len;
*identity += set_phys_range_identity(start_pfn, end_pfn);
}
/*
- * Helper function to update both the p2m and m2p tables.
+ * Helper function to update the p2m and m2p tables and kernel mapping.
*/
-static unsigned long __init xen_update_mem_tables(unsigned long pfn,
- unsigned long mfn)
+static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn)
{
struct mmu_update update = {
.ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
@@ -225,161 +273,88 @@ static unsigned long __init xen_update_mem_tables(unsigned long pfn,
};
/* Update p2m */
- if (!early_set_phys_to_machine(pfn, mfn)) {
+ if (!set_phys_to_machine(pfn, mfn)) {
WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n",
pfn, mfn);
- return false;
+ BUG();
}
/* Update m2p */
if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) {
WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n",
mfn, pfn);
- return false;
+ BUG();
}
- return true;
+ /* Update kernel mapping, but not for highmem. */
+ if ((pfn << PAGE_SHIFT) >= __pa(high_memory))
+ return;
+
+ if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
+ mfn_pte(mfn, PAGE_KERNEL), 0)) {
+ WARN(1, "Failed to update kernel mapping for mfn=%ld pfn=%ld\n",
+ mfn, pfn);
+ BUG();
+ }
}
/*
* This function updates the p2m and m2p tables with an identity map from
- * start_pfn to start_pfn+size and remaps the underlying RAM of the original
- * allocation at remap_pfn. It must do so carefully in P2M_PER_PAGE sized blocks
- * to not exhaust the reserved brk space. Doing it in properly aligned blocks
- * ensures we only allocate the minimum required leaf pages in the p2m table. It
- * copies the existing mfns from the p2m table under the 1:1 map, overwrites
- * them with the identity map and then updates the p2m and m2p tables with the
- * remapped memory.
+ * start_pfn to start_pfn+size and prepares remapping the underlying RAM of the
+ * original allocation at remap_pfn. The information needed for remapping is
+ * saved in the memory itself to avoid the need for allocating buffers. The
+ * complete remap information is contained in a list of MFNs each containing
+ * up to REMAP_SIZE MFNs and the start target PFN for doing the remap.
+ * This enables us to preserve the original mfn sequence while doing the
+ * remapping at a time when the memory management is capable of allocating
+ * virtual and physical memory in arbitrary amounts, see 'xen_remap_memory' and
+ * its callers.
*/
-static unsigned long __init xen_do_set_identity_and_remap_chunk(
+static void __init xen_do_set_identity_and_remap_chunk(
unsigned long start_pfn, unsigned long size, unsigned long remap_pfn)
{
+ unsigned long buf = (unsigned long)&xen_remap_buf;
+ unsigned long mfn_save, mfn;
unsigned long ident_pfn_iter, remap_pfn_iter;
- unsigned long ident_start_pfn_align, remap_start_pfn_align;
- unsigned long ident_end_pfn_align, remap_end_pfn_align;
- unsigned long ident_boundary_pfn, remap_boundary_pfn;
- unsigned long ident_cnt = 0;
- unsigned long remap_cnt = 0;
+ unsigned long ident_end_pfn = start_pfn + size;
unsigned long left = size;
- unsigned long mod;
- int i;
+ unsigned long ident_cnt = 0;
+ unsigned int i, chunk;
WARN_ON(size == 0);
BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
- /*
- * Determine the proper alignment to remap memory in P2M_PER_PAGE sized
- * blocks. We need to keep track of both the existing pfn mapping and
- * the new pfn remapping.
- */
- mod = start_pfn % P2M_PER_PAGE;
- ident_start_pfn_align =
- mod ? (start_pfn - mod + P2M_PER_PAGE) : start_pfn;
- mod = remap_pfn % P2M_PER_PAGE;
- remap_start_pfn_align =
- mod ? (remap_pfn - mod + P2M_PER_PAGE) : remap_pfn;
- mod = (start_pfn + size) % P2M_PER_PAGE;
- ident_end_pfn_align = start_pfn + size - mod;
- mod = (remap_pfn + size) % P2M_PER_PAGE;
- remap_end_pfn_align = remap_pfn + size - mod;
-
- /* Iterate over each p2m leaf node in each range */
- for (ident_pfn_iter = ident_start_pfn_align, remap_pfn_iter = remap_start_pfn_align;
- ident_pfn_iter < ident_end_pfn_align && remap_pfn_iter < remap_end_pfn_align;
- ident_pfn_iter += P2M_PER_PAGE, remap_pfn_iter += P2M_PER_PAGE) {
- /* Check we aren't past the end */
- BUG_ON(ident_pfn_iter + P2M_PER_PAGE > start_pfn + size);
- BUG_ON(remap_pfn_iter + P2M_PER_PAGE > remap_pfn + size);
-
- /* Save p2m mappings */
- for (i = 0; i < P2M_PER_PAGE; i++)
- xen_remap_buf[i] = pfn_to_mfn(ident_pfn_iter + i);
-
- /* Set identity map which will free a p2m leaf */
- ident_cnt += set_phys_range_identity(ident_pfn_iter,
- ident_pfn_iter + P2M_PER_PAGE);
+ mfn_save = virt_to_mfn(buf);
-#ifdef DEBUG
- /* Helps verify a p2m leaf has been freed */
- for (i = 0; i < P2M_PER_PAGE; i++) {
- unsigned int pfn = ident_pfn_iter + i;
- BUG_ON(pfn_to_mfn(pfn) != pfn);
- }
-#endif
- /* Now remap memory */
- for (i = 0; i < P2M_PER_PAGE; i++) {
- unsigned long mfn = xen_remap_buf[i];
-
- /* This will use the p2m leaf freed above */
- if (!xen_update_mem_tables(remap_pfn_iter + i, mfn)) {
- WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n",
- remap_pfn_iter + i, mfn);
- return 0;
- }
-
- remap_cnt++;
- }
-
- left -= P2M_PER_PAGE;
- }
-
- /* Max boundary space possible */
- BUG_ON(left > (P2M_PER_PAGE - 1) * 2);
+ for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn;
+ ident_pfn_iter < ident_end_pfn;
+ ident_pfn_iter += REMAP_SIZE, remap_pfn_iter += REMAP_SIZE) {
+ chunk = (left < REMAP_SIZE) ? left : REMAP_SIZE;
- /* Now handle the boundary conditions */
- ident_boundary_pfn = start_pfn;
- remap_boundary_pfn = remap_pfn;
- for (i = 0; i < left; i++) {
- unsigned long mfn;
+ /* Map first pfn to xen_remap_buf */
+ mfn = pfn_to_mfn(ident_pfn_iter);
+ set_pte_mfn(buf, mfn, PAGE_KERNEL);
- /* These two checks move from the start to end boundaries */
- if (ident_boundary_pfn == ident_start_pfn_align)
- ident_boundary_pfn = ident_pfn_iter;
- if (remap_boundary_pfn == remap_start_pfn_align)
- remap_boundary_pfn = remap_pfn_iter;
+ /* Save mapping information in page */
+ xen_remap_buf.next_area_mfn = xen_remap_mfn;
+ xen_remap_buf.target_pfn = remap_pfn_iter;
+ xen_remap_buf.size = chunk;
+ for (i = 0; i < chunk; i++)
+ xen_remap_buf.mfns[i] = pfn_to_mfn(ident_pfn_iter + i);
- /* Check we aren't past the end */
- BUG_ON(ident_boundary_pfn >= start_pfn + size);
- BUG_ON(remap_boundary_pfn >= remap_pfn + size);
-
- mfn = pfn_to_mfn(ident_boundary_pfn);
-
- if (!xen_update_mem_tables(remap_boundary_pfn, mfn)) {
- WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n",
- remap_pfn_iter + i, mfn);
- return 0;
- }
- remap_cnt++;
+ /* Put remap buf into list. */
+ xen_remap_mfn = mfn;
- ident_boundary_pfn++;
- remap_boundary_pfn++;
- }
+ /* Set identity map */
+ ident_cnt += set_phys_range_identity(ident_pfn_iter,
+ ident_pfn_iter + chunk);
- /* Finish up the identity map */
- if (ident_start_pfn_align >= ident_end_pfn_align) {
- /*
- * In this case we have an identity range which does not span an
- * aligned block so everything needs to be identity mapped here.
- * If we didn't check this we might remap too many pages since
- * the align boundaries are not meaningful in this case.
- */
- ident_cnt += set_phys_range_identity(start_pfn,
- start_pfn + size);
- } else {
- /* Remapped above so check each end of the chunk */
- if (start_pfn < ident_start_pfn_align)
- ident_cnt += set_phys_range_identity(start_pfn,
- ident_start_pfn_align);
- if (start_pfn + size > ident_pfn_iter)
- ident_cnt += set_phys_range_identity(ident_pfn_iter,
- start_pfn + size);
+ left -= chunk;
}
- BUG_ON(ident_cnt != size);
- BUG_ON(remap_cnt != size);
-
- return size;
+ /* Restore old xen_remap_buf mapping */
+ set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
}
/*
@@ -396,8 +371,7 @@ static unsigned long __init xen_do_set_identity_and_remap_chunk(
static unsigned long __init xen_set_identity_and_remap_chunk(
const struct e820entry *list, size_t map_size, unsigned long start_pfn,
unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn,
- unsigned long *identity, unsigned long *remapped,
- unsigned long *released)
+ unsigned long *identity, unsigned long *released)
{
unsigned long pfn;
unsigned long i = 0;
@@ -431,19 +405,12 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
if (size > remap_range_size)
size = remap_range_size;
- if (!xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn)) {
- WARN(1, "Failed to remap 1:1 memory cur_pfn=%ld size=%ld remap_pfn=%ld\n",
- cur_pfn, size, remap_pfn);
- xen_set_identity_and_release_chunk(cur_pfn,
- cur_pfn + left, nr_pages, identity, released);
- break;
- }
+ xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn);
/* Update variables to reflect new mappings. */
i += size;
remap_pfn += size;
*identity += size;
- *remapped += size;
}
/*
@@ -458,13 +425,12 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
return remap_pfn;
}
-static unsigned long __init xen_set_identity_and_remap(
+static void __init xen_set_identity_and_remap(
const struct e820entry *list, size_t map_size, unsigned long nr_pages,
unsigned long *released)
{
phys_addr_t start = 0;
unsigned long identity = 0;
- unsigned long remapped = 0;
unsigned long last_pfn = nr_pages;
const struct e820entry *entry;
unsigned long num_released = 0;
@@ -494,8 +460,7 @@ static unsigned long __init xen_set_identity_and_remap(
last_pfn = xen_set_identity_and_remap_chunk(
list, map_size, start_pfn,
end_pfn, nr_pages, last_pfn,
- &identity, &remapped,
- &num_released);
+ &identity, &num_released);
start = end;
}
}
@@ -503,12 +468,63 @@ static unsigned long __init xen_set_identity_and_remap(
*released = num_released;
pr_info("Set %ld page(s) to 1-1 mapping\n", identity);
- pr_info("Remapped %ld page(s), last_pfn=%ld\n", remapped,
- last_pfn);
pr_info("Released %ld page(s)\n", num_released);
+}
+
+/*
+ * Remap the memory prepared in xen_do_set_identity_and_remap_chunk().
+ * The remap information (which mfn remap to which pfn) is contained in the
+ * to be remapped memory itself in a linked list anchored at xen_remap_mfn.
+ * This scheme allows to remap the different chunks in arbitrary order while
+ * the resulting mapping will be independant from the order.
+ */
+void __init xen_remap_memory(void)
+{
+ unsigned long buf = (unsigned long)&xen_remap_buf;
+ unsigned long mfn_save, mfn, pfn;
+ unsigned long remapped = 0;
+ unsigned int i;
+ unsigned long pfn_s = ~0UL;
+ unsigned long len = 0;
+
+ mfn_save = virt_to_mfn(buf);
+
+ while (xen_remap_mfn != INVALID_P2M_ENTRY) {
+ /* Map the remap information */
+ set_pte_mfn(buf, xen_remap_mfn, PAGE_KERNEL);
- return last_pfn;
+ BUG_ON(xen_remap_mfn != xen_remap_buf.mfns[0]);
+
+ pfn = xen_remap_buf.target_pfn;
+ for (i = 0; i < xen_remap_buf.size; i++) {
+ mfn = xen_remap_buf.mfns[i];
+ xen_update_mem_tables(pfn, mfn);
+ remapped++;
+ pfn++;
+ }
+ if (pfn_s == ~0UL || pfn == pfn_s) {
+ pfn_s = xen_remap_buf.target_pfn;
+ len += xen_remap_buf.size;
+ } else if (pfn_s + len == xen_remap_buf.target_pfn) {
+ len += xen_remap_buf.size;
+ } else {
+ xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
+ pfn_s = xen_remap_buf.target_pfn;
+ len = xen_remap_buf.size;
+ }
+
+ mfn = xen_remap_mfn;
+ xen_remap_mfn = xen_remap_buf.next_area_mfn;
+ }
+
+ if (pfn_s != ~0UL && len)
+ xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
+
+ set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
+
+ pr_info("Remapped %ld page(s)\n", remapped);
}
+
static unsigned long __init xen_get_max_pages(void)
{
unsigned long max_pages = MAX_DOMAIN_PAGES;
@@ -569,7 +585,6 @@ char * __init xen_memory_setup(void)
int rc;
struct xen_memory_map memmap;
unsigned long max_pages;
- unsigned long last_pfn = 0;
unsigned long extra_pages = 0;
int i;
int op;
@@ -616,17 +631,14 @@ char * __init xen_memory_setup(void)
extra_pages += max_pages - max_pfn;
/*
- * Set identity map on non-RAM pages and remap the underlying RAM.
+ * Set identity map on non-RAM pages and prepare remapping the
+ * underlying RAM.
*/
- last_pfn = xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
- &xen_released_pages);
+ xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
+ &xen_released_pages);
extra_pages += xen_released_pages;
- if (last_pfn > max_pfn) {
- max_pfn = min(MAX_DOMAIN_PAGES, last_pfn);
- mem_end = PFN_PHYS(max_pfn);
- }
/*
* Clamp the amount of extra memory to a EXTRA_MEM_RATIO
* factor the base size. On non-highmem systems, the base
@@ -653,6 +665,7 @@ char * __init xen_memory_setup(void)
size = min(size, (u64)extra_pages * PAGE_SIZE);
extra_pages -= size / PAGE_SIZE;
xen_add_extra_mem(addr, size);
+ xen_max_p2m_pfn = PFN_DOWN(addr + size);
} else
type = E820_UNUSABLE;
}
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 4ab9298c5e17..5686bd9d58cc 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -29,11 +29,13 @@ void xen_build_mfn_list_list(void);
void xen_setup_machphys_mapping(void);
void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
void xen_reserve_top(void);
-extern unsigned long xen_max_p2m_pfn;
void xen_mm_pin_all(void);
void xen_mm_unpin_all(void);
+unsigned long __ref xen_chk_extra_mem(unsigned long pfn);
+void __init xen_inv_extra_mem(void);
+void __init xen_remap_memory(void);
char * __init xen_memory_setup(void);
char * xen_auto_xlated_memory_setup(void);
void __init xen_arch_setup(void);
@@ -46,7 +48,7 @@ void xen_hvm_init_shared_info(void);
void xen_unplug_emulated_devices(void);
void __init xen_build_dynamic_phys_to_machine(void);
-unsigned long __init xen_revector_p2m_tree(void);
+void __init xen_vmalloc_p2m_tree(void);
void xen_init_irq_ops(void);
void xen_setup_timer(int cpu);