aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-11-30 09:22:15 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2021-11-30 09:22:15 -0800
commitf080815fdb3e3cff5a004ca83b3815ac17ef71b1 (patch)
tree5b26b29d9d3d623a0b9c52a193b7b41c26a2c411
parentd6e6a27d960f9f07aef0b979c49c6736ede28f75 (diff)
parent7cfc5c653b07782e7059527df8dc1e3143a7591e (diff)
downloadlinux-f080815fdb3e3cff5a004ca83b3815ac17ef71b1.tar.gz
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm fixes from Paolo Bonzini: "ARM64: - Fix constant sign extension affecting TCR_EL2 and preventing running on ARMv8.7 models due to spurious bits being set - Fix use of helpers using PSTATE early on exit by always sampling it as soon as the exit takes place - Move pkvm's 32bit handling into a common helper RISC-V: - Fix incorrect KVM_MAX_VCPUS value - Unmap stage2 mapping when deleting/moving a memslot x86: - Fix and downgrade BUG_ON due to uninitialized cache - Many APICv and MOVE_ENC_CONTEXT_FROM fixes - Correctly emulate TLB flushes around nested vmentry/vmexit and when the nested hypervisor uses VPID - Prevent modifications to CPUID after the VM has run - Other smaller bugfixes Generic: - Memslot handling bugfixes" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (44 commits) KVM: fix avic_set_running for preemptable kernels KVM: VMX: clear vmx_x86_ops.sync_pir_to_irr if APICv is disabled KVM: SEV: accept signals in sev_lock_two_vms KVM: SEV: do not take kvm->lock when destroying KVM: SEV: Prohibit migration of a VM that has mirrors KVM: SEV: Do COPY_ENC_CONTEXT_FROM with both VMs locked selftests: sev_migrate_tests: add tests for KVM_CAP_VM_COPY_ENC_CONTEXT_FROM KVM: SEV: move mirror status to destination of KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM KVM: SEV: initialize regions_list of a mirror VM KVM: SEV: cleanup locking for KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM KVM: SEV: do not use list_replace_init on an empty list KVM: x86: Use a stable condition around all VT-d PI paths KVM: x86: check PIR even for vCPUs with disabled APICv KVM: VMX: prepare sync_pir_to_irr for running with APICv disabled KVM: selftests: page_table_test: fix calculation of guest_test_phys_mem KVM: x86/mmu: Handle "default" period when selectively waking kthread KVM: MMU: shadow nested paging does not have PKU KVM: x86/mmu: Remove spurious TLB flushes in TDP MMU zap collapsible path KVM: x86/mmu: Use yield-safe TDP MMU root iter in MMU notifier unmapping KVM: X86: Use vcpu->arch.walk_mmu for kvm_mmu_invlpg() ...
-rw-r--r--arch/arm64/include/asm/kvm_arm.h4
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/switch.h14
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h7
-rw-r--r--arch/arm64/kvm/hyp/nvhe/switch.c8
-rw-r--r--arch/arm64/kvm/hyp/vhe/switch.c4
-rw-r--r--arch/riscv/include/asm/kvm_host.h8
-rw-r--r--arch/riscv/kvm/mmu.c6
-rw-r--r--arch/x86/kvm/ioapic.h1
-rw-r--r--arch/x86/kvm/irq.h1
-rw-r--r--arch/x86/kvm/lapic.c2
-rw-r--r--arch/x86/kvm/mmu/mmu.c97
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c38
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h5
-rw-r--r--arch/x86/kvm/svm/avic.c16
-rw-r--r--arch/x86/kvm/svm/sev.c161
-rw-r--r--arch/x86/kvm/svm/svm.c1
-rw-r--r--arch/x86/kvm/svm/svm.h1
-rw-r--r--arch/x86/kvm/vmx/nested.c49
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c20
-rw-r--r--arch/x86/kvm/vmx/vmx.c66
-rw-r--r--arch/x86/kvm/x86.c66
-rw-r--r--arch/x86/kvm/x86.h7
-rw-r--r--tools/testing/selftests/kvm/kvm_create_max_vcpus.c30
-rw-r--r--tools/testing/selftests/kvm/kvm_page_table_test.c2
-rw-r--r--tools/testing/selftests/kvm/x86_64/hyperv_features.c140
-rw-r--r--tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c165
-rw-r--r--virt/kvm/kvm_main.c56
27 files changed, 623 insertions, 352 deletions
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index a39fcf318c774d..01d47c5886dc43 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -91,7 +91,7 @@
#define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H)
/* TCR_EL2 Registers bits */
-#define TCR_EL2_RES1 ((1 << 31) | (1 << 23))
+#define TCR_EL2_RES1 ((1U << 31) | (1 << 23))
#define TCR_EL2_TBI (1 << 20)
#define TCR_EL2_PS_SHIFT 16
#define TCR_EL2_PS_MASK (7 << TCR_EL2_PS_SHIFT)
@@ -276,7 +276,7 @@
#define CPTR_EL2_TFP_SHIFT 10
/* Hyp Coprocessor Trap Register */
-#define CPTR_EL2_TCPAC (1 << 31)
+#define CPTR_EL2_TCPAC (1U << 31)
#define CPTR_EL2_TAM (1 << 30)
#define CPTR_EL2_TTA (1 << 20)
#define CPTR_EL2_TFP (1 << CPTR_EL2_TFP_SHIFT)
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 7a0af1d39303cd..96c5f3fb78389e 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -403,6 +403,8 @@ typedef bool (*exit_handler_fn)(struct kvm_vcpu *, u64 *);
static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu);
+static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code);
+
/*
* Allow the hypervisor to handle the exit with an exit handler if it has one.
*
@@ -429,6 +431,18 @@ static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
*/
static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
{
+ /*
+ * Save PSTATE early so that we can evaluate the vcpu mode
+ * early on.
+ */
+ vcpu->arch.ctxt.regs.pstate = read_sysreg_el2(SYS_SPSR);
+
+ /*
+ * Check whether we want to repaint the state one way or
+ * another.
+ */
+ early_exit_filter(vcpu, exit_code);
+
if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ)
vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR);
diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
index de7e14c862e6c9..7ecca8b078519f 100644
--- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
@@ -70,7 +70,12 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt)
{
ctxt->regs.pc = read_sysreg_el2(SYS_ELR);
- ctxt->regs.pstate = read_sysreg_el2(SYS_SPSR);
+ /*
+ * Guest PSTATE gets saved at guest fixup time in all
+ * cases. We still need to handle the nVHE host side here.
+ */
+ if (!has_vhe() && ctxt->__hyp_running_vcpu)
+ ctxt->regs.pstate = read_sysreg_el2(SYS_SPSR);
if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
ctxt_sys_reg(ctxt, DISR_EL1) = read_sysreg_s(SYS_VDISR_EL2);
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index c0e3fed26d9306..d13115a1243416 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -233,7 +233,7 @@ static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu)
* Returns false if the guest ran in AArch32 when it shouldn't have, and
* thus should exit to the host, or true if a the guest run loop can continue.
*/
-static bool handle_aarch32_guest(struct kvm_vcpu *vcpu, u64 *exit_code)
+static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code)
{
struct kvm *kvm = kern_hyp_va(vcpu->kvm);
@@ -248,10 +248,7 @@ static bool handle_aarch32_guest(struct kvm_vcpu *vcpu, u64 *exit_code)
vcpu->arch.target = -1;
*exit_code &= BIT(ARM_EXIT_WITH_SERROR_BIT);
*exit_code |= ARM_EXCEPTION_IL;
- return false;
}
-
- return true;
}
/* Switch to the guest for legacy non-VHE systems */
@@ -316,9 +313,6 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
/* Jump in the fire! */
exit_code = __guest_enter(vcpu);
- if (unlikely(!handle_aarch32_guest(vcpu, &exit_code)))
- break;
-
/* And we're baaack! */
} while (fixup_guest_exit(vcpu, &exit_code));
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index 5a2cb5d9bc4b22..fbb26b93c34773 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -112,6 +112,10 @@ static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu)
return hyp_exit_handlers;
}
+static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+}
+
/* Switch to the guest for VHE systems running in EL2 */
static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
{
diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
index 25ba21f9850412..2639b9ee48f97d 100644
--- a/arch/riscv/include/asm/kvm_host.h
+++ b/arch/riscv/include/asm/kvm_host.h
@@ -12,14 +12,12 @@
#include <linux/types.h>
#include <linux/kvm.h>
#include <linux/kvm_types.h>
+#include <asm/csr.h>
#include <asm/kvm_vcpu_fp.h>
#include <asm/kvm_vcpu_timer.h>
-#ifdef CONFIG_64BIT
-#define KVM_MAX_VCPUS (1U << 16)
-#else
-#define KVM_MAX_VCPUS (1U << 9)
-#endif
+#define KVM_MAX_VCPUS \
+ ((HGATP_VMID_MASK >> HGATP_VMID_SHIFT) + 1)
#define KVM_HALT_POLL_NS_DEFAULT 500000
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index d81bae8eb55ea0..fc058ff5f4b6f3 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -453,6 +453,12 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot)
{
+ gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
+ phys_addr_t size = slot->npages << PAGE_SHIFT;
+
+ spin_lock(&kvm->mmu_lock);
+ stage2_unmap_range(kvm, gpa, size, false);
+ spin_unlock(&kvm->mmu_lock);
}
void kvm_arch_commit_memory_region(struct kvm *kvm,
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index e66e620c3bed9e..539333ac4b3808 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -81,7 +81,6 @@ struct kvm_ioapic {
unsigned long irq_states[IOAPIC_NUM_PINS];
struct kvm_io_device dev;
struct kvm *kvm;
- void (*ack_notifier)(void *opaque, int irq);
spinlock_t lock;
struct rtc_status rtc_status;
struct delayed_work eoi_inject;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 650642b18d1510..c2d7cfe82d004b 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -56,7 +56,6 @@ struct kvm_pic {
struct kvm_io_device dev_master;
struct kvm_io_device dev_slave;
struct kvm_io_device dev_elcr;
- void (*ack_notifier)(void *opaque, int irq);
unsigned long irq_states[PIC_NUM_PINS];
};
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 759952dd122284..f206fc35deff6e 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -707,7 +707,7 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
{
int highest_irr;
- if (apic->vcpu->arch.apicv_active)
+ if (kvm_x86_ops.sync_pir_to_irr)
highest_irr = static_call(kvm_x86_sync_pir_to_irr)(apic->vcpu);
else
highest_irr = apic_find_highest_irr(apic);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 3be9beea838d13..6354297e92aee1 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1582,7 +1582,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
if (is_tdp_mmu_enabled(kvm))
- flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
+ flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
return flush;
}
@@ -2173,10 +2173,10 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
iterator->shadow_addr = root;
iterator->level = vcpu->arch.mmu->shadow_root_level;
- if (iterator->level == PT64_ROOT_4LEVEL &&
+ if (iterator->level >= PT64_ROOT_4LEVEL &&
vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
!vcpu->arch.mmu->direct_map)
- --iterator->level;
+ iterator->level = PT32E_ROOT_LEVEL;
if (iterator->level == PT32E_ROOT_LEVEL) {
/*
@@ -4855,7 +4855,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
struct kvm_mmu *context = &vcpu->arch.guest_mmu;
struct kvm_mmu_role_regs regs = {
.cr0 = cr0,
- .cr4 = cr4,
+ .cr4 = cr4 & ~X86_CR4_PKE,
.efer = efer,
};
union kvm_mmu_role new_role;
@@ -4919,7 +4919,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->direct_map = false;
update_permission_bitmask(context, true);
- update_pkru_bitmask(context);
+ context->pkru_mask = 0;
reset_rsvds_bits_mask_ept(vcpu, context, execonly);
reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
}
@@ -5025,6 +5025,14 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
/*
* Invalidate all MMU roles to force them to reinitialize as CPUID
* information is factored into reserved bit calculations.
+ *
+ * Correctly handling multiple vCPU models with respect to paging and
+ * physical address properties) in a single VM would require tracking
+ * all relevant CPUID information in kvm_mmu_page_role. That is very
+ * undesirable as it would increase the memory requirements for
+ * gfn_track (see struct kvm_mmu_page_role comments). For now that
+ * problem is swept under the rug; KVM's CPUID API is horrific and
+ * it's all but impossible to solve it without introducing a new API.
*/
vcpu->arch.root_mmu.mmu_role.ext.valid = 0;
vcpu->arch.guest_mmu.mmu_role.ext.valid = 0;
@@ -5032,24 +5040,10 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_mmu_reset_context(vcpu);
/*
- * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
- * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
- * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
- * faults due to reusing SPs/SPTEs. Alert userspace, but otherwise
- * sweep the problem under the rug.
- *
- * KVM's horrific CPUID ABI makes the problem all but impossible to
- * solve, as correctly handling multiple vCPU models (with respect to
- * paging and physical address properties) in a single VM would require
- * tracking all relevant CPUID information in kvm_mmu_page_role. That
- * is very undesirable as it would double the memory requirements for
- * gfn_track (see struct kvm_mmu_page_role comments), and in practice
- * no sane VMM mucks with the core vCPU model on the fly.
+ * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
+ * kvm_arch_vcpu_ioctl().
*/
- if (vcpu->arch.last_vmentry_cpu != -1) {
- pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} after KVM_RUN may cause guest instability\n");
- pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} will fail after KVM_RUN starting with Linux 5.16\n");
- }
+ KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm);
}
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -5369,7 +5363,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
- kvm_mmu_invalidate_gva(vcpu, vcpu->arch.mmu, gva, INVALID_PAGE);
+ kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE);
++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
@@ -5854,8 +5848,6 @@ restart:
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot)
{
- bool flush = false;
-
if (kvm_memslots_have_rmaps(kvm)) {
write_lock(&kvm->mmu_lock);
/*
@@ -5863,17 +5855,14 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
* logging at a 4k granularity and never creates collapsible
* 2m SPTEs during dirty logging.
*/
- flush = slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
- if (flush)
+ if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true))
kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
write_unlock(&kvm->mmu_lock);
}
if (is_tdp_mmu_enabled(kvm)) {
read_lock(&kvm->mmu_lock);
- flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush);
- if (flush)
- kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+ kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
read_unlock(&kvm->mmu_lock);
}
}
@@ -6182,23 +6171,46 @@ void kvm_mmu_module_exit(void)
mmu_audit_disable();
}
+/*
+ * Calculate the effective recovery period, accounting for '0' meaning "let KVM
+ * select a halving time of 1 hour". Returns true if recovery is enabled.
+ */
+static bool calc_nx_huge_pages_recovery_period(uint *period)
+{
+ /*
+ * Use READ_ONCE to get the params, this may be called outside of the
+ * param setters, e.g. by the kthread to compute its next timeout.
+ */
+ bool enabled = READ_ONCE(nx_huge_pages);
+ uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+
+ if (!enabled || !ratio)
+ return false;
+
+ *period = READ_ONCE(nx_huge_pages_recovery_period_ms);
+ if (!*period) {
+ /* Make sure the period is not less than one second. */
+ ratio = min(ratio, 3600u);
+ *period = 60 * 60 * 1000 / ratio;
+ }
+ return true;
+}
+
static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
{
bool was_recovery_enabled, is_recovery_enabled;
uint old_period, new_period;
int err;
- was_recovery_enabled = nx_huge_pages_recovery_ratio;
- old_period = nx_huge_pages_recovery_period_ms;
+ was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
err = param_set_uint(val, kp);
if (err)
return err;
- is_recovery_enabled = nx_huge_pages_recovery_ratio;
- new_period = nx_huge_pages_recovery_period_ms;
+ is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);
- if (READ_ONCE(nx_huge_pages) && is_recovery_enabled &&
+ if (is_recovery_enabled &&
(!was_recovery_enabled || old_period > new_period)) {
struct kvm *kvm;
@@ -6262,18 +6274,13 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
static long get_nx_lpage_recovery_timeout(u64 start_time)
{
- uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
- uint period = READ_ONCE(nx_huge_pages_recovery_period_ms);
+ bool enabled;
+ uint period;
- if (!period && ratio) {
- /* Make sure the period is not less than one second. */
- ratio = min(ratio, 3600u);
- period = 60 * 60 * 1000 / ratio;
- }
+ enabled = calc_nx_huge_pages_recovery_period(&period);
- return READ_ONCE(nx_huge_pages) && ratio
- ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
- : MAX_SCHEDULE_TIMEOUT;
+ return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
+ : MAX_SCHEDULE_TIMEOUT;
}
static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index a54c3491af42c9..1db8496259add5 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -317,9 +317,6 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
int level = sp->role.level;
gfn_t base_gfn = sp->gfn;
- u64 old_child_spte;
- u64 *sptep;
- gfn_t gfn;
int i;
trace_kvm_mmu_prepare_zap_page(sp);
@@ -327,8 +324,9 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
tdp_mmu_unlink_page(kvm, sp, shared);
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
- sptep = rcu_dereference(pt) + i;
- gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
+ u64 *sptep = rcu_dereference(pt) + i;
+ gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
+ u64 old_child_spte;
if (shared) {
/*
@@ -374,7 +372,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
shared);
}
- kvm_flush_remote_tlbs_with_address(kvm, gfn,
+ kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
KVM_PAGES_PER_HPAGE(level + 1));
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
@@ -1033,9 +1031,9 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
{
struct kvm_mmu_page *root;
- for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
- flush |= zap_gfn_range(kvm, root, range->start, range->end,
- range->may_block, flush, false);
+ for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
+ flush = zap_gfn_range(kvm, root, range->start, range->end,
+ range->may_block, flush, false);
return flush;
}
@@ -1364,10 +1362,9 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
* Clear leaf entries which could be replaced by large mappings, for
* GFNs within the slot.
*/
-static bool zap_collapsible_spte_range(struct kvm *kvm,
+static void zap_collapsible_spte_range(struct kvm *kvm,
struct kvm_mmu_page *root,
- const struct kvm_memory_slot *slot,
- bool flush)
+ const struct kvm_memory_slot *slot)
{
gfn_t start = slot->base_gfn;
gfn_t end = start + slot->npages;
@@ -1378,10 +1375,8 @@ static bool zap_collapsible_spte_range(struct kvm *kvm,
tdp_root_for_each_pte(iter, root, start, end) {
retry:
- if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
- flush = false;
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
- }
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
@@ -1393,6 +1388,7 @@ retry:
pfn, PG_LEVEL_NUM))
continue;
+ /* Note, a successful atomic zap also does a remote TLB flush. */
if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
/*
* The iter must explicitly re-read the SPTE because
@@ -1401,30 +1397,24 @@ retry:
iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
goto retry;
}
- flush = true;
}
rcu_read_unlock();
-
- return flush;
}
/*
* Clear non-leaf entries (and free associated page tables) which could
* be replaced by large mappings, for GFNs within the slot.
*/
-bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- bool flush)
+void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;
lockdep_assert_held_read(&kvm->mmu_lock);
for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
- flush = zap_collapsible_spte_range(kvm, root, slot, flush);
-
- return flush;
+ zap_collapsible_spte_range(kvm, root, slot);
}
/*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 476b133544dd94..3899004a5d91e7 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -64,9 +64,8 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn, unsigned long mask,
bool wrprot);
-bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- bool flush);
+void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *slot);
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn,
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index affc0ea98d3022..9d6066eb7c10ed 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -989,16 +989,18 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ int cpu = get_cpu();
+ WARN_ON(cpu != vcpu->cpu);
svm->avic_is_running = is_run;
- if (!kvm_vcpu_apicv_active(vcpu))
- return;
-
- if (is_run)
- avic_vcpu_load(vcpu, vcpu->cpu);
- else
- avic_vcpu_put(vcpu);
+ if (kvm_vcpu_apicv_active(vcpu)) {
+ if (is_run)
+ avic_vcpu_load(vcpu, cpu);
+ else
+ avic_vcpu_put(vcpu);
+ }
+ put_cpu();
}
void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 21ac0a5de4e0c8..59727a966f902b 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -1543,28 +1543,50 @@ static bool is_cmd_allowed_from_mirror(u32 cmd_id)
return false;
}
-static int sev_lock_for_migration(struct kvm *kvm)
+static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
+ struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
+ int r = -EBUSY;
+
+ if (dst_kvm == src_kvm)
+ return -EINVAL;
/*
- * Bail if this VM is already involved in a migration to avoid deadlock
- * between two VMs trying to migrate to/from each other.
+ * Bail if these VMs are already involved in a migration to avoid
+ * deadlock between two VMs trying to migrate to/from each other.
*/
- if (atomic_cmpxchg_acquire(&sev->migration_in_progress, 0, 1))
+ if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1))
return -EBUSY;
- mutex_lock(&kvm->lock);
+ if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1))
+ goto release_dst;
+ r = -EINTR;
+ if (mutex_lock_killable(&dst_kvm->lock))
+ goto release_src;
+ if (mutex_lock_killable(&src_kvm->lock))
+ goto unlock_dst;
return 0;
+
+unlock_dst:
+ mutex_unlock(&dst_kvm->lock);
+release_src:
+ atomic_set_release(&src_sev->migration_in_progress, 0);
+release_dst:
+ atomic_set_release(&dst_sev->migration_in_progress, 0);
+ return r;
}
-static void sev_unlock_after_migration(struct kvm *kvm)
+static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
+ struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
- mutex_unlock(&kvm->lock);
- atomic_set_release(&sev->migration_in_progress, 0);
+ mutex_unlock(&dst_kvm->lock);
+ mutex_unlock(&src_kvm->lock);
+ atomic_set_release(&dst_sev->migration_in_progress, 0);
+ atomic_set_release(&src_sev->migration_in_progress, 0);
}
@@ -1607,14 +1629,15 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
dst->asid = src->asid;
dst->handle = src->handle;
dst->pages_locked = src->pages_locked;
+ dst->enc_context_owner = src->enc_context_owner;
src->asid = 0;
src->active = false;
src->handle = 0;
src->pages_locked = 0;
+ src->enc_context_owner = NULL;
- INIT_LIST_HEAD(&dst->regions_list);
- list_replace_init(&src->regions_list, &dst->regions_list);
+ list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
}
static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
@@ -1666,15 +1689,6 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
bool charged = false;
int ret;
- ret = sev_lock_for_migration(kvm);
- if (ret)
- return ret;
-
- if (sev_guest(kvm)) {
- ret = -EINVAL;
- goto out_unlock;
- }
-
source_kvm_file = fget(source_fd);
if (!file_is_kvm(source_kvm_file)) {
ret = -EBADF;
@@ -1682,16 +1696,26 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
}
source_kvm = source_kvm_file->private_data;
- ret = sev_lock_for_migration(source_kvm);
+ ret = sev_lock_two_vms(kvm, source_kvm);
if (ret)
goto out_fput;
- if (!sev_guest(source_kvm)) {
+ if (sev_guest(kvm) || !sev_guest(source_kvm)) {
ret = -EINVAL;
- goto out_source;
+ goto out_unlock;
}
src_sev = &to_kvm_svm(source_kvm)->sev_info;
+
+ /*
+ * VMs mirroring src's encryption context rely on it to keep the
+ * ASID allocated, but below we are clearing src_sev->asid.
+ */
+ if (src_sev->num_mirrored_vms) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
dst_sev->misc_cg = get_current_misc_cg();
cg_cleanup_sev = dst_sev;
if (dst_sev->misc_cg != src_sev->misc_cg) {
@@ -1728,13 +1752,11 @@ out_dst_cgroup:
sev_misc_cg_uncharge(cg_cleanup_sev);
put_misc_cg(cg_cleanup_sev->misc_cg);
cg_cleanup_sev->misc_cg = NULL;
-out_source:
- sev_unlock_after_migration(source_kvm);
+out_unlock:
+ sev_unlock_two_vms(kvm, source_kvm);
out_fput:
if (source_kvm_file)
fput(source_kvm_file);
-out_unlock:
- sev_unlock_after_migration(kvm);
return ret;
}
@@ -1953,76 +1975,60 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
{
struct file *source_kvm_file;
struct kvm *source_kvm;
- struct kvm_sev_info source_sev, *mirror_sev;
+ struct kvm_sev_info *source_sev, *mirror_sev;
int ret;
source_kvm_file = fget(source_fd);
if (!file_is_kvm(source_kvm_file)) {
ret = -EBADF;
- goto e_source_put;
+ goto e_source_fput;
}
source_kvm = source_kvm_file->private_data;
- mutex_lock(&source_kvm->lock);
-
- if (!sev_guest(source_kvm)) {
- ret = -EINVAL;
- goto e_source_unlock;
- }
+ ret = sev_lock_two_vms(kvm, source_kvm);
+ if (ret)
+ goto e_source_fput;
- /* Mirrors of mirrors should work, but let's not get silly */
- if (is_mirroring_enc_context(source_kvm) || source_kvm == kvm) {
+ /*
+ * Mirrors of mirrors should work, but let's not get silly. Also
+ * disallow out-of-band SEV/SEV-ES init if the target is already an
+ * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being
+ * created after SEV/SEV-ES initialization, e.g. to init intercepts.
+ */
+ if (sev_guest(kvm) || !sev_guest(source_kvm) ||
+ is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) {
ret = -EINVAL;
- goto e_source_unlock;
+ goto e_unlock;
}
- memcpy(&source_sev, &to_kvm_svm(source_kvm)->sev_info,
- sizeof(source_sev));
-
/*
* The mirror kvm holds an enc_context_owner ref so its asid can't
* disappear until we're done with it
*/
+ source_sev = &to_kvm_svm(source_kvm)->sev_info;
kvm_get_kvm(source_kvm);
-
- fput(source_kvm_file);
- mutex_unlock(&source_kvm->lock);
- mutex_lock(&kvm->lock);
-
- /*
- * Disallow out-of-band SEV/SEV-ES init if the target is already an
- * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being
- * created after SEV/SEV-ES initialization, e.g. to init intercepts.
- */
- if (sev_guest(kvm) || kvm->created_vcpus) {
- ret = -EINVAL;
- goto e_mirror_unlock;
- }
+ source_sev->num_mirrored_vms++;
/* Set enc_context_owner and copy its encryption context over */
mirror_sev = &to_kvm_svm(kvm)->sev_info;
mirror_sev->enc_context_owner = source_kvm;
mirror_sev->active = true;
- mirror_sev->asid = source_sev.asid;
- mirror_sev->fd = source_sev.fd;
- mirror_sev->es_active = source_sev.es_active;
- mirror_sev->handle = source_sev.handle;
+ mirror_sev->asid = source_sev->asid;
+ mirror_sev->fd = source_sev->fd;
+ mirror_sev->es_active = source_sev->es_active;
+ mirror_sev->handle = source_sev->handle;
+ INIT_LIST_HEAD(&mirror_sev->regions_list);
+ ret = 0;
+
/*
* Do not copy ap_jump_table. Since the mirror does not share the same
* KVM contexts as the original, and they may have different
* memory-views.
*/
- mutex_unlock(&kvm->lock);
- return 0;
-
-e_mirror_unlock:
- mutex_unlock(&kvm->lock);
- kvm_put_kvm(source_kvm);
- return ret;
-e_source_unlock:
- mutex_unlock(&source_kvm->lock);
-e_source_put:
+e_unlock:
+ sev_unlock_two_vms(kvm, source_kvm);
+e_source_fput:
if (source_kvm_file)
fput(source_kvm_file);
return ret;
@@ -2034,17 +2040,24 @@ void sev_vm_destroy(struct kvm *kvm)
struct list_head *head = &sev->regions_list;
struct list_head *pos, *q;
+ WARN_ON(sev->num_mirrored_vms);
+
if (!sev_guest(kvm))
return;
/* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
if (is_mirroring_enc_context(kvm)) {
- kvm_put_kvm(sev->enc_context_owner);
+ struct kvm *owner_kvm = sev->enc_context_owner;
+ struct kvm_sev_info *owner_sev = &to_kvm_svm(owner_kvm)->sev_info;
+
+ mutex_lock(&owner_kvm->lock);
+ if (!WARN_ON(!owner_sev->num_mirrored_vms))
+ owner_sev->num_mirrored_vms--;
+ mutex_unlock(&owner_kvm->lock);
+ kvm_put_kvm(owner_kvm);
return;
}
- mutex_lock(&kvm->lock);
-
/*
* Ensure that all guest tagged cache entries are flushed before
* releasing the pages back to the system for use. CLFLUSH will
@@ -2064,8 +2077,6 @@ void sev_vm_destroy(struct kvm *kvm)
}
}
- mutex_unlock(&kvm->lock);
-
sev_unbind_asid(kvm, sev->handle);
sev_asid_free(sev);
}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 5630c241d5f6e0..d0f68d11ec70be 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4651,7 +4651,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.load_eoi_exitmap = svm_load_eoi_exitmap,
.hwapic_irr_update = svm_hwapic_irr_update,
.hwapic_isr_update = svm_hwapic_isr_update,
- .sync_pir_to_irr = kvm_lapic_find_highest_irr,
.apicv_post_state_restore = avic_post_state_restore,
.set_tss_addr = svm_set_tss_addr,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 5faad3dc10e27a..1c7306c370fa3c 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -79,6 +79,7 @@ struct kvm_sev_info {
struct list_head regions_list; /* List of registered regions */
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
struct kvm *enc_context_owner; /* Owner of copied encryption context */
+ unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */
struct misc_cg *misc_cg; /* For misc cgroup accounting */
atomic_t migration_in_progress;
};
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 1e2f669515665b..64f2828035c2be 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1162,29 +1162,26 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
WARN_ON(!enable_vpid);
/*
- * If VPID is enabled and used by vmc12, but L2 does not have a unique
- * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate
- * a VPID for L2, flush the current context as the effective ASID is
- * common to both L1 and L2.
- *
- * Defer the flush so that it runs after vmcs02.EPTP has been set by
- * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid
- * redundant flushes further down the nested pipeline.
- *
- * If a TLB flush isn't required due to any of the above, and vpid12 is
- * changing then the new "virtual" VPID (vpid12) will reuse the same
- * "real" VPID (vpid02), and so needs to be flushed. There's no direct
- * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for
- * all nested vCPUs. Remember, a flush on VM-Enter does not invalidate
- * guest-physical mappings, so there is no need to sync the nEPT MMU.
+ * VPID is enabled and in use by vmcs12. If vpid12 is changing, then
+ * emulate a guest TLB flush as KVM does not track vpid12 history nor
+ * is the VPID incorporated into the MMU context. I.e. KVM must assume
+ * that the new vpid12 has never been used and thus represents a new
+ * guest ASID that cannot have entries in the TLB.
*/
- if (!nested_has_guest_tlb_tag(vcpu)) {
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
- } else if (is_vmenter &&
- vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
+ if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
vmx->nested.last_vpid = vmcs12->virtual_processor_id;
- vpid_sync_context(nested_get_vpid02(vcpu));
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ return;
}
+
+ /*
+ * If VPID is enabled, used by vmc12, and vpid12 is not changing but
+ * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
+ * KVM was unable to allocate a VPID for L2, flush the current context
+ * as the effective ASID is common to both L1 and L2.
+ */
+ if (!nested_has_guest_tlb_tag(vcpu))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
@@ -3344,8 +3341,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
};
u32 failed_index;
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
- kvm_vcpu_flush_tlb_current(vcpu);
+ kvm_service_local_tlb_flush_requests(vcpu);
evaluate_pending_interrupts = exec_controls_get(vmx) &
(CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
@@ -4502,9 +4498,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
(void)nested_get_evmcs_page(vcpu);
}
- /* Service the TLB flush request for L2 before switching to L1. */
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
- kvm_vcpu_flush_tlb_current(vcpu);
+ /* Service pending TLB flush requests for L2 before switching to L1. */
+ kvm_service_local_tlb_flush_requests(vcpu);
/*
* VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
@@ -4857,6 +4852,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
if (!vmx->nested.cached_vmcs12)
goto out_cached_vmcs12;
+ vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA;
vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
if (!vmx->nested.cached_shadow_vmcs12)
goto out_cached_shadow_vmcs12;
@@ -5289,8 +5285,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache;
struct vmcs_hdr hdr;
- if (ghc->gpa != vmptr &&
- kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
/*
* Reads from an unbacked page return all 1s,
* which means that the 32 bits located at the
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 5f81ef092bd436..1c94783b5a54c5 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -5,6 +5,7 @@
#include <asm/cpu.h>
#include "lapic.h"
+#include "irq.h"
#include "posted_intr.h"
#include "trace.h"
#include "vmx.h"
@@ -77,13 +78,18 @@ after_clear_sn:
pi_set_on(pi_desc);
}
+static bool vmx_can_use_vtd_pi(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm) && enable_apicv &&
+ kvm_arch_has_assigned_device(kvm) &&
+ irq_remapping_cap(IRQ_POSTING_CAP);
+}
+
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
+ if (!vmx_can_use_vtd_pi(vcpu->kvm))
return;
/* Set SN when the vCPU is preempted */
@@ -141,9 +147,7 @@ int pi_pre_block(struct kvm_vcpu *vcpu)
struct pi_desc old, new;
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
+ if (!vmx_can_use_vtd_pi(vcpu->kvm))
return 0;
WARN_ON(irqs_disabled());
@@ -270,9 +274,7 @@ int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
struct vcpu_data vcpu_info;
int idx, ret = 0;
- if (!kvm_arch_has_assigned_device(kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(kvm->vcpus[0]))
+ if (!vmx_can_use_vtd_pi(kvm))
return 0;
idx = srcu_read_lock(&kvm->irq_srcu);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index ba66c171d951ba..f9044880969093 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2918,6 +2918,13 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
}
}
+static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu))
+ return nested_get_vpid02(vcpu);
+ return to_vmx(vcpu)->vpid;
+}
+
static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
@@ -2930,31 +2937,29 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
if (enable_ept)
ept_sync_context(construct_eptp(vcpu, root_hpa,
mmu->shadow_root_level));
- else if (!is_guest_mode(vcpu))
- vpid_sync_context(to_vmx(vcpu)->vpid);
else
- vpid_sync_context(nested_get_vpid02(vcpu));
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
}
static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{
/*
- * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in
+ * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
* vmx_flush_tlb_guest() for an explanation of why this is ok.
*/
- vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr);
+ vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
}
static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
/*
- * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0
- * or a vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit
- * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is
+ * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
+ * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are
+ * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
* disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
* i.e. no explicit INVVPID is necessary.
*/
- vpid_sync_context(to_vmx(vcpu)->vpid);
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
}
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
@@ -6262,9 +6267,9 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int max_irr;
- bool max_irr_updated;
+ bool got_posted_interrupt;
- if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm))
+ if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
return -EIO;
if (pi_test_on(&vmx->pi_desc)) {
@@ -6274,22 +6279,33 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
* But on x86 this is just a compiler barrier anyway.
*/
smp_mb__after_atomic();
- max_irr_updated =
+ got_posted_interrupt =
kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
-
- /*
- * If we are running L2 and L1 has a new pending interrupt
- * which can be injected, this may cause a vmexit or it may
- * be injected into L2. Either way, this interrupt will be
- * processed via KVM_REQ_EVENT, not RVI, because we do not use
- * virtual interrupt delivery to inject L1 interrupts into L2.
- */
- if (is_guest_mode(vcpu) && max_irr_updated)
- kvm_make_request(KVM_REQ_EVENT, vcpu);
} else {
max_irr = kvm_lapic_find_highest_irr(vcpu);
+ got_posted_interrupt = false;
}
- vmx_hwapic_irr_update(vcpu, max_irr);
+
+ /*
+ * Newly recognized interrupts are injected via either virtual interrupt
+ * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is
+ * disabled in two cases:
+ *
+ * 1) If L2 is running and the vCPU has a new pending interrupt. If L1
+ * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
+ * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected
+ * into L2, but KVM doesn't use virtual interrupt delivery to inject
+ * interrupts into L2, and so KVM_REQ_EVENT is again needed.
+ *
+ * 2) If APICv is disabled for this vCPU, assigned devices may still
+ * attempt to post interrupts. The posted interrupt vector will cause
+ * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
+ */
+ if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
+ vmx_set_rvi(max_irr);
+ else if (got_posted_interrupt)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
return max_irr;
}
@@ -7761,10 +7777,10 @@ static __init int hardware_setup(void)
ple_window_shrink = 0;
}
- if (!cpu_has_vmx_apicv()) {
+ if (!cpu_has_vmx_apicv())
enable_apicv = 0;
+ if (!enable_apicv)
vmx_x86_ops.sync_pir_to_irr = NULL;
- }
if (cpu_has_vmx_tsc_scaling()) {
kvm_has_tsc_control = true;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5a403d92833f51..0ee1a039b49099 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3258,6 +3258,29 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
static_call(kvm_x86_tlb_flush_guest)(vcpu);
}
+
+static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.tlb_flush;
+ static_call(kvm_x86_tlb_flush_current)(vcpu);
+}
+
+/*
+ * Service "local" TLB flush requests, which are specific to the current MMU
+ * context. In addition to the generic event handling in vcpu_enter_guest(),
+ * TLB flushes that are targeted at an MMU context also need to be serviced
+ * prior before nested VM-Enter/VM-Exit.
+ */
+void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu)
+{
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+ kvm_vcpu_flush_tlb_current(vcpu);
+
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
+ kvm_vcpu_flush_tlb_guest(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests);
+
static void record_steal_time(struct kvm_vcpu *vcpu)
{
struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
@@ -4133,6 +4156,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SGX_ATTRIBUTE:
#endif
case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+ case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
case KVM_CAP_SREGS2:
case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
case KVM_CAP_VCPU_ATTRIBUTES:
@@ -4448,8 +4472,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- if (vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
return kvm_apic_get_state(vcpu, s);
}
@@ -5124,6 +5147,17 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_cpuid __user *cpuid_arg = argp;
struct kvm_cpuid cpuid;
+ /*
+ * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
+ * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
+ * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
+ * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
+ * the core vCPU model on the fly, so fail.
+ */
+ r = -EINVAL;
+ if (vcpu->arch.last_vmentry_cpu != -1)
+ goto out;
+
r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
@@ -5134,6 +5168,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_cpuid2 __user *cpuid_arg = argp;
struct kvm_cpuid2 cpuid;
+ /*
+ * KVM_SET_CPUID{,2} after KVM_RUN is forbidded, see the comment in
+ * KVM_SET_CPUID case above.
+ */
+ r = -EINVAL;
+ if (vcpu->arch.last_vmentry_cpu != -1)
+ goto out;
+
r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
@@ -9528,8 +9570,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
else {
- if (vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
if (ioapic_in_kernel(vcpu->kvm))
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
}
@@ -9648,10 +9689,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
/* Flushing all ASIDs flushes the current ASID... */
kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
- kvm_vcpu_flush_tlb_current(vcpu);
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
- kvm_vcpu_flush_tlb_guest(vcpu);
+ kvm_service_local_tlb_flush_requests(vcpu);
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
@@ -9802,10 +9840,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
/*
* This handles the case where a posted interrupt was
- * notified with kvm_vcpu_kick.
+ * notified with kvm_vcpu_kick. Assigned devices can
+ * use the POSTED_INTR_VECTOR even if APICv is disabled,
+ * so do it even if APICv is disabled on this vCPU.
*/
- if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ if (kvm_lapic_enabled(vcpu))
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
if (kvm_vcpu_exit_request(vcpu)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
@@ -9849,8 +9889,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
- if (vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ if (kvm_lapic_enabled(vcpu))
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
if (unlikely(kvm_vcpu_exit_request(vcpu))) {
exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 997669ae9caa21..4abcd8d9836ddc 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -103,6 +103,7 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL
+void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu);
int kvm_check_nested_events(struct kvm_vcpu *vcpu);
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
@@ -185,12 +186,6 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
}
-static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
-{
- ++vcpu->stat.tlb_flush;
- static_call(kvm_x86_tlb_flush_current)(vcpu);
-}
-
static inline int is_pae(struct kvm_vcpu *vcpu)
{
return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c
index f968dfd4ee8892..aed9dc3ca1e9ee 100644
--- a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c
+++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c
@@ -12,6 +12,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <sys/resource.h>
#include "test_util.h"
@@ -40,11 +41,40 @@ int main(int argc, char *argv[])
{
int kvm_max_vcpu_id = kvm_check_cap(KVM_CAP_MAX_VCPU_ID);
int kvm_max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
+ /*
+ * Number of file descriptors reqired, KVM_CAP_MAX_VCPUS for vCPU fds +
+ * an arbitrary number for everything else.
+ */
+ int nr_fds_wanted = kvm_max_vcpus + 100;
+ struct rlimit rl;
pr_info("KVM_CAP_MAX_VCPU_ID: %d\n", kvm_max_vcpu_id);
pr_info("KVM_CAP_MAX_VCPUS: %d\n", kvm_max_vcpus);
/*
+ * Check that we're allowed to open nr_fds_wanted file descriptors and
+ * try raising the limits if needed.
+ */
+ TEST_ASSERT(!getrlimit(RLIMIT_NOFILE, &rl), "getrlimit() failed!");
+
+ if (rl.rlim_cur < nr_fds_wanted) {
+ rl.rlim_cur = nr_fds_wanted;
+ if (rl.rlim_max < nr_fds_wanted) {
+ int old_rlim_max = rl.rlim_max;
+ rl.rlim_max = nr_fds_wanted;
+
+ int r = setrlimit(RLIMIT_NOFILE, &rl);
+ if (r < 0) {
+ printf("RLIMIT_NOFILE hard limit is too low (%d, wanted %d)\n",
+ old_rlim_max, nr_fds_wanted);
+ exit(KSFT_SKIP);
+ }
+ } else {
+ TEST_ASSERT(!setrlimit(RLIMIT_NOFILE, &rl), "setrlimit() failed!");
+ }
+ }
+
+ /*
* Upstream KVM prior to 4.8 does not support KVM_CAP_MAX_VCPU_ID.
* Userspace is supposed to use KVM_CAP_MAX_VCPUS as the maximum ID
* in this case.
diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c
index 3836322add00ce..ba1fdc3dcf4a90 100644
--- a/tools/testing/selftests/kvm/kvm_page_table_test.c
+++ b/tools/testing/selftests/kvm/kvm_page_table_test.c
@@ -280,7 +280,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg)
#ifdef __s390x__
alignment = max(0x100000, alignment);
#endif
- guest_test_phys_mem = align_down(guest_test_virt_mem, alignment);
+ guest_test_phys_mem = align_down(guest_test_phys_mem, alignment);
/* Set up the shared data structure test_args */
test_args.vm = vm;
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c
index 91d88aaa989928..672915ce73d8f6 100644
--- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c
@@ -165,10 +165,10 @@ static void hv_set_cpuid(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid,
vcpu_set_cpuid(vm, VCPU_ID, cpuid);
}
-static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr,
- struct kvm_cpuid2 *best)
+static void guest_test_msrs_access(void)
{
struct kvm_run *run;
+ struct kvm_vm *vm;
struct ucall uc;
int stage = 0, r;
struct kvm_cpuid_entry2 feat = {
@@ -180,11 +180,34 @@ static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr,
struct kvm_cpuid_entry2 dbg = {
.function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES
};
- struct kvm_enable_cap cap = {0};
-
- run = vcpu_state(vm, VCPU_ID);
+ struct kvm_cpuid2 *best;
+ vm_vaddr_t msr_gva;
+ struct kvm_enable_cap cap = {
+ .cap = KVM_CAP_HYPERV_ENFORCE_CPUID,
+ .args = {1}
+ };
+ struct msr_data *msr;
while (true) {
+ vm = vm_create_default(VCPU_ID, 0, guest_msr);
+
+ msr_gva = vm_vaddr_alloc_page(vm);
+ memset(addr_gva2hva(vm, msr_gva), 0x0, getpagesize());
+ msr = addr_gva2hva(vm, msr_gva);
+
+ vcpu_args_set(vm, VCPU_ID, 1, msr_gva);
+ vcpu_enable_cap(vm, VCPU_ID, &cap);
+
+ vcpu_set_hv_cpuid(vm, VCPU_ID);
+
+ best = kvm_get_supported_hv_cpuid();
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vm, VCPU_ID);
+ vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler);
+
+ run = vcpu_state(vm, VCPU_ID);
+
switch (stage) {
case 0:
/*
@@ -315,6 +338,7 @@ static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr,
* capability enabled and guest visible CPUID bit unset.
*/
cap.cap = KVM_CAP_HYPERV_SYNIC2;
+ cap.args[0] = 0;
vcpu_enable_cap(vm, VCPU_ID, &cap);
break;
case 22:
@@ -461,9 +485,9 @@ static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr,
switch (get_ucall(vm, VCPU_ID, &uc)) {
case UCALL_SYNC:
- TEST_ASSERT(uc.args[1] == stage,
- "Unexpected stage: %ld (%d expected)\n",
- uc.args[1], stage);
+ TEST_ASSERT(uc.args[1] == 0,
+ "Unexpected stage: %ld (0 expected)\n",
+ uc.args[1]);
break;
case UCALL_ABORT:
TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
@@ -474,13 +498,14 @@ static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr,
}
stage++;
+ kvm_vm_free(vm);
}
}
-static void guest_test_hcalls_access(struct kvm_vm *vm, struct hcall_data *hcall,
- void *input, void *output, struct kvm_cpuid2 *best)
+static void guest_test_hcalls_access(void)
{
struct kvm_run *run;
+ struct kvm_vm *vm;
struct ucall uc;
int stage = 0, r;
struct kvm_cpuid_entry2 feat = {
@@ -493,10 +518,38 @@ static void guest_test_hcalls_access(struct kvm_vm *vm, struct hcall_data *hcall
struct kvm_cpuid_entry2 dbg = {
.function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES
};
-
- run = vcpu_state(vm, VCPU_ID);
+ struct kvm_enable_cap cap = {
+ .cap = KVM_CAP_HYPERV_ENFORCE_CPUID,
+ .args = {1}
+ };
+ vm_vaddr_t hcall_page, hcall_params;
+ struct hcall_data *hcall;
+ struct kvm_cpuid2 *best;
while (true) {
+ vm = vm_create_default(VCPU_ID, 0, guest_hcall);
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vm, VCPU_ID);
+ vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
+
+ /* Hypercall input/output */
+ hcall_page = vm_vaddr_alloc_pages(vm, 2);
+ hcall = addr_gva2hva(vm, hcall_page);
+ memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize());
+
+ hcall_params = vm_vaddr_alloc_page(vm);
+ memset(addr_gva2hva(vm, hcall_params), 0x0, getpagesize());
+
+ vcpu_args_set(vm, VCPU_ID, 2, addr_gva2gpa(vm, hcall_page), hcall_params);
+ vcpu_enable_cap(vm, VCPU_ID, &cap);
+
+ vcpu_set_hv_cpuid(vm, VCPU_ID);
+
+ best = kvm_get_supported_hv_cpuid();
+
+ run = vcpu_state(vm, VCPU_ID);
+
switch (stage) {
case 0:
hcall->control = 0xdeadbeef;
@@ -606,9 +659,9 @@ static void guest_test_hcalls_access(struct kvm_vm *vm, struct hcall_data *hcall
switch (get_ucall(vm, VCPU_ID, &uc)) {
case UCALL_SYNC:
- TEST_ASSERT(uc.args[1] == stage,
- "Unexpected stage: %ld (%d expected)\n",
- uc.args[1], stage);
+ TEST_ASSERT(uc.args[1] == 0,
+ "Unexpected stage: %ld (0 expected)\n",
+ uc.args[1]);
break;
case UCALL_ABORT:
TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
@@ -619,66 +672,15 @@ static void guest_test_hcalls_access(struct kvm_vm *vm, struct hcall_data *hcall
}
stage++;
+ kvm_vm_free(vm);
}
}
int main(void)
{
- struct kvm_cpuid2 *best;
- struct kvm_vm *vm;
- vm_vaddr_t msr_gva, hcall_page, hcall_params;
- struct kvm_enable_cap cap = {
- .cap = KVM_CAP_HYPERV_ENFORCE_CPUID,
- .args = {1}
- };
-
- /* Test MSRs */
- vm = vm_create_default(VCPU_ID, 0, guest_msr);
-
- msr_gva = vm_vaddr_alloc_page(vm);
- memset(addr_gva2hva(vm, msr_gva), 0x0, getpagesize());
- vcpu_args_set(vm, VCPU_ID, 1, msr_gva);
- vcpu_enable_cap(vm, VCPU_ID, &cap);
-
- vcpu_set_hv_cpuid(vm, VCPU_ID);
-
- best = kvm_get_supported_hv_cpuid();
-
- vm_init_descriptor_tables(vm);
- vcpu_init_descriptor_tables(vm, VCPU_ID);
- vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler);
-
pr_info("Testing access to Hyper-V specific MSRs\n");
- guest_test_msrs_access(vm, addr_gva2hva(vm, msr_gva),
- best);
- kvm_vm_free(vm);
-
- /* Test hypercalls */
- vm = vm_create_default(VCPU_ID, 0, guest_hcall);
-
- vm_init_descriptor_tables(vm);
- vcpu_init_descriptor_tables(vm, VCPU_ID);
- vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
-
- /* Hypercall input/output */
- hcall_page = vm_vaddr_alloc_pages(vm, 2);
- memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize());
-
- hcall_params = vm_vaddr_alloc_page(vm);
- memset(addr_gva2hva(vm, hcall_params), 0x0, getpagesize());
-
- vcpu_args_set(vm, VCPU_ID, 2, addr_gva2gpa(vm, hcall_page), hcall_params);
- vcpu_enable_cap(vm, VCPU_ID, &cap);
-
- vcpu_set_hv_cpuid(vm, VCPU_ID);
-
- best = kvm_get_supported_hv_cpuid();
+ guest_test_msrs_access();
pr_info("Testing access to Hyper-V hypercalls\n");
- guest_test_hcalls_access(vm, addr_gva2hva(vm, hcall_params),
- addr_gva2hva(vm, hcall_page),
- addr_gva2hva(vm, hcall_page) + getpagesize(),
- best);
-
- kvm_vm_free(vm);
+ guest_test_hcalls_access();
}
diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
index 5ba325cd64bfd8..29b18d565cf4ce 100644
--- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
+++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
@@ -54,12 +54,15 @@ static struct kvm_vm *sev_vm_create(bool es)
return vm;
}
-static struct kvm_vm *__vm_create(void)
+static struct kvm_vm *aux_vm_create(bool with_vcpus)
{
struct kvm_vm *vm;
int i;
vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR);
+ if (!with_vcpus)
+ return vm;
+
for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i)
vm_vcpu_add(vm, i);
@@ -89,11 +92,11 @@ static void test_sev_migrate_from(bool es)
{
struct kvm_vm *src_vm;
struct kvm_vm *dst_vms[NR_MIGRATE_TEST_VMS];
- int i;
+ int i, ret;
src_vm = sev_vm_create(es);
for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i)
- dst_vms[i] = __vm_create();
+ dst_vms[i] = aux_vm_create(true);
/* Initial migration from the src to the first dst. */
sev_migrate_from(dst_vms[0]->fd, src_vm->fd);
@@ -102,7 +105,10 @@ static void test_sev_migrate_from(bool es)
sev_migrate_from(dst_vms[i]->fd, dst_vms[i - 1]->fd);
/* Migrate the guest back to the original VM. */
- sev_migrate_from(src_vm->fd, dst_vms[NR_MIGRATE_TEST_VMS - 1]->fd);
+ ret = __sev_migrate_from(src_vm->fd, dst_vms[NR_MIGRATE_TEST_VMS - 1]->fd);
+ TEST_ASSERT(ret == -1 && errno == EIO,
+ "VM that was migrated from should be dead. ret %d, errno: %d\n", ret,
+ errno);
kvm_vm_free(src_vm);
for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i)
@@ -146,6 +152,8 @@ static void test_sev_migrate_locking(void)
for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i)
pthread_join(pt[i], NULL);
+ for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i)
+ kvm_vm_free(input[i].vm);
}
static void test_sev_migrate_parameters(void)
@@ -157,12 +165,11 @@ static void test_sev_migrate_parameters(void)
sev_vm = sev_vm_create(/* es= */ false);
sev_es_vm = sev_vm_create(/* es= */ true);
vm_no_vcpu = vm_create(VM_MODE_DEFAULT, 0, O_RDWR);
- vm_no_sev = __vm_create();
+ vm_no_sev = aux_vm_create(true);
sev_es_vm_no_vmsa = vm_create(VM_MODE_DEFAULT, 0, O_RDWR);
sev_ioctl(sev_es_vm_no_vmsa->fd, KVM_SEV_ES_INIT, NULL);
vm_vcpu_add(sev_es_vm_no_vmsa, 1);
-
ret = __sev_migrate_from(sev_vm->fd, sev_es_vm->fd);
TEST_ASSERT(
ret == -1 && errno == EINVAL,
@@ -191,13 +198,151 @@ static void test_sev_migrate_parameters(void)
TEST_ASSERT(ret == -1 && errno == EINVAL,
"Migrations require SEV enabled. ret %d, errno: %d\n", ret,
errno);
+
+ kvm_vm_free(sev_vm);
+ kvm_vm_free(sev_es_vm);
+ kvm_vm_free(sev_es_vm_no_vmsa);
+ kvm_vm_free(vm_no_vcpu);
+ kvm_vm_free(vm_no_sev);
+}
+
+static int __sev_mirror_create(int dst_fd, int src_fd)
+{
+ struct kvm_enable_cap cap = {
+ .cap = KVM_CAP_VM_COPY_ENC_CONTEXT_FROM,
+ .args = { src_fd }
+ };
+
+ return ioctl(dst_fd, KVM_ENABLE_CAP, &cap);
+}
+
+
+static void sev_mirror_create(int dst_fd, int src_fd)
+{
+ int ret;
+
+ ret = __sev_mirror_create(dst_fd, src_fd);
+ TEST_ASSERT(!ret, "Copying context failed, ret: %d, errno: %d\n", ret, errno);
+}
+
+static void test_sev_mirror(bool es)
+{
+ struct kvm_vm *src_vm, *dst_vm;
+ struct kvm_sev_launch_start start = {
+ .policy = es ? SEV_POLICY_ES : 0
+ };
+ int i;
+
+ src_vm = sev_vm_create(es);
+ dst_vm = aux_vm_create(false);
+
+ sev_mirror_create(dst_vm->fd, src_vm->fd);
+
+ /* Check that we can complete creation of the mirror VM. */
+ for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i)
+ vm_vcpu_add(dst_vm, i);
+ sev_ioctl(dst_vm->fd, KVM_SEV_LAUNCH_START, &start);
+ if (es)
+ sev_ioctl(dst_vm->fd, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL);
+
+ kvm_vm_free(src_vm);
+ kvm_vm_free(dst_vm);
+}
+
+static void test_sev_mirror_parameters(void)
+{
+ struct kvm_vm *sev_vm, *sev_es_vm, *vm_no_vcpu, *vm_with_vcpu;
+ int ret;
+
+ sev_vm = sev_vm_create(/* es= */ false);
+ sev_es_vm = sev_vm_create(/* es= */ true);
+ vm_with_vcpu = aux_vm_create(true);
+ vm_no_vcpu = aux_vm_create(false);
+
+ ret = __sev_mirror_create(sev_vm->fd, sev_vm->fd);
+ TEST_ASSERT(
+ ret == -1 && errno == EINVAL,
+ "Should not be able copy context to self. ret: %d, errno: %d\n",
+ ret, errno);
+
+ ret = __sev_mirror_create(sev_vm->fd, sev_es_vm->fd);
+ TEST_ASSERT(
+ ret == -1 && errno == EINVAL,
+ "Should not be able copy context to SEV enabled VM. ret: %d, errno: %d\n",
+ ret, errno);
+
+ ret = __sev_mirror_create(sev_es_vm->fd, sev_vm->fd);
+ TEST_ASSERT(
+ ret == -1 && errno == EINVAL,
+ "Should not be able copy context to SEV-ES enabled VM. ret: %d, errno: %d\n",
+ ret, errno);
+
+ ret = __sev_mirror_create(vm_no_vcpu->fd, vm_with_vcpu->fd);
+ TEST_ASSERT(ret == -1 && errno == EINVAL,
+ "Copy context requires SEV enabled. ret %d, errno: %d\n", ret,
+ errno);
+
+ ret = __sev_mirror_create(vm_with_vcpu->fd, sev_vm->fd);
+ TEST_ASSERT(
+ ret == -1 && errno == EINVAL,
+ "SEV copy context requires no vCPUS on the destination. ret: %d, errno: %d\n",
+ ret, errno);
+
+ kvm_vm_free(sev_vm);
+ kvm_vm_free(sev_es_vm);
+ kvm_vm_free(vm_with_vcpu);
+ kvm_vm_free(vm_no_vcpu);
+}
+
+static void test_sev_move_copy(void)
+{
+ struct kvm_vm *dst_vm, *sev_vm, *mirror_vm, *dst_mirror_vm;
+ int ret;
+
+ sev_vm = sev_vm_create(/* es= */ false);
+ dst_vm = aux_vm_create(true);
+ mirror_vm = aux_vm_create(false);
+ dst_mirror_vm = aux_vm_create(false);
+
+ sev_mirror_create(mirror_vm->fd, sev_vm->fd);
+ ret = __sev_migrate_from(dst_vm->fd, sev_vm->fd);
+ TEST_ASSERT(ret == -1 && errno == EBUSY,
+ "Cannot migrate VM that has mirrors. ret %d, errno: %d\n", ret,
+ errno);
+
+ /* The mirror itself can be migrated. */
+ sev_migrate_from(dst_mirror_vm->fd, mirror_vm->fd);
+ ret = __sev_migrate_from(dst_vm->fd, sev_vm->fd);
+ TEST_ASSERT(ret == -1 && errno == EBUSY,
+ "Cannot migrate VM that has mirrors. ret %d, errno: %d\n", ret,
+ errno);
+
+ /*
+ * mirror_vm is not a mirror anymore, dst_mirror_vm is. Thus,
+ * the owner can be copied as soon as dst_mirror_vm is gone.
+ */
+ kvm_vm_free(dst_mirror_vm);
+ sev_migrate_from(dst_vm->fd, sev_vm->fd);
+
+ kvm_vm_free(mirror_vm);
+ kvm_vm_free(dst_vm);
+ kvm_vm_free(sev_vm);
}
int main(int argc, char *argv[])
{
- test_sev_migrate_from(/* es= */ false);
- test_sev_migrate_from(/* es= */ true);
- test_sev_migrate_locking();
- test_sev_migrate_parameters();
+ if (kvm_check_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)) {
+ test_sev_migrate_from(/* es= */ false);
+ test_sev_migrate_from(/* es= */ true);
+ test_sev_migrate_locking();
+ test_sev_migrate_parameters();
+ if (kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM))
+ test_sev_move_copy();
+ }
+ if (kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) {
+ test_sev_mirror(/* es= */ false);
+ test_sev_mirror(/* es= */ true);
+ test_sev_mirror_parameters();
+ }
return 0;
}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9646bb9112c101..72c4e6b393896a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1531,11 +1531,10 @@ static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old,
static int kvm_set_memslot(struct kvm *kvm,
const struct kvm_userspace_memory_region *mem,
- struct kvm_memory_slot *old,
struct kvm_memory_slot *new, int as_id,
enum kvm_mr_change change)
{
- struct kvm_memory_slot *slot;
+ struct kvm_memory_slot *slot, old;
struct kvm_memslots *slots;
int r;
@@ -1566,7 +1565,7 @@ static int kvm_set_memslot(struct kvm *kvm,
* Note, the INVALID flag needs to be in the appropriate entry
* in the freshly allocated memslots, not in @old or @new.
*/
- slot = id_to_memslot(slots, old->id);
+ slot = id_to_memslot(slots, new->id);
slot->flags |= KVM_MEMSLOT_INVALID;
/*
@@ -1597,6 +1596,26 @@ static int kvm_set_memslot(struct kvm *kvm,
kvm_copy_memslots(slots, __kvm_memslots(kvm, as_id));
}
+ /*
+ * Make a full copy of the old memslot, the pointer will become stale
+ * when the memslots are re-sorted by update_memslots(), and the old
+ * memslot needs to be referenced after calling update_memslots(), e.g.
+ * to free its resources and for arch specific behavior. This needs to
+ * happen *after* (re)acquiring slots_arch_lock.
+ */
+ slot = id_to_memslot(slots, new->id);
+ if (slot) {
+ old = *slot;
+ } else {
+ WARN_ON_ONCE(change != KVM_MR_CREATE);
+ memset(&old, 0, sizeof(old));
+ old.id = new->id;
+ old.as_id = as_id;
+ }
+
+ /* Copy the arch-specific data, again after (re)acquiring slots_arch_lock. */
+ memcpy(&new->arch, &old.arch, sizeof(old.arch));
+
r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
if (r)
goto out_slots;
@@ -1604,14 +1623,18 @@ static int kvm_set_memslot(struct kvm *kvm,
update_memslots(slots, new, change);
slots = install_new_memslots(kvm, as_id, slots);
- kvm_arch_commit_memory_region(kvm, mem, old, new, change);
+ kvm_arch_commit_memory_region(kvm, mem, &old, new, change);
+
+ /* Free the old memslot's metadata. Note, this is the full copy!!! */
+ if (change == KVM_MR_DELETE)
+ kvm_free_memslot(kvm, &old);
kvfree(slots);
return 0;
out_slots:
if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
- slot = id_to_memslot(slots, old->id);
+ slot = id_to_memslot(slots, new->id);
slot->flags &= ~KVM_MEMSLOT_INVALID;
slots = install_new_memslots(kvm, as_id, slots);
} else {
@@ -1626,7 +1649,6 @@ static int kvm_delete_memslot(struct kvm *kvm,
struct kvm_memory_slot *old, int as_id)
{
struct kvm_memory_slot new;
- int r;
if (!old->npages)
return -EINVAL;
@@ -1639,12 +1661,7 @@ static int kvm_delete_memslot(struct kvm *kvm,
*/
new.as_id = as_id;
- r = kvm_set_memslot(kvm, mem, old, &new, as_id, KVM_MR_DELETE);
- if (r)
- return r;
-
- kvm_free_memslot(kvm, old);
- return 0;
+ return kvm_set_memslot(kvm, mem, &new, as_id, KVM_MR_DELETE);
}
/*
@@ -1672,7 +1689,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
id = (u16)mem->slot;
/* General sanity checks */
- if (mem->memory_size & (PAGE_SIZE - 1))
+ if ((mem->memory_size & (PAGE_SIZE - 1)) ||
+ (mem->memory_size != (unsigned long)mem->memory_size))
return -EINVAL;
if (mem->guest_phys_addr & (PAGE_SIZE - 1))
return -EINVAL;
@@ -1718,7 +1736,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
if (!old.npages) {
change = KVM_MR_CREATE;
new.dirty_bitmap = NULL;
- memset(&new.arch, 0, sizeof(new.arch));
} else { /* Modify an existing slot. */
if ((new.userspace_addr != old.userspace_addr) ||
(new.npages != old.npages) ||
@@ -1732,9 +1749,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
else /* Nothing to change. */
return 0;
- /* Copy dirty_bitmap and arch from the current memslot. */
+ /* Copy dirty_bitmap from the current memslot. */
new.dirty_bitmap = old.dirty_bitmap;
- memcpy(&new.arch, &old.arch, sizeof(new.arch));
}
if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
@@ -1760,7 +1776,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
bitmap_set(new.dirty_bitmap, 0, new.npages);
}
- r = kvm_set_memslot(kvm, mem, &old, &new, as_id, change);
+ r = kvm_set_memslot(kvm, mem, &new, as_id, change);
if (r)
goto out_bitmap;
@@ -2915,7 +2931,8 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
int r;
gpa_t gpa = ghc->gpa + offset;
- BUG_ON(len + offset > ghc->len);
+ if (WARN_ON_ONCE(len + offset > ghc->len))
+ return -EINVAL;
if (slots->generation != ghc->generation) {
if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
@@ -2952,7 +2969,8 @@ int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
int r;
gpa_t gpa = ghc->gpa + offset;
- BUG_ON(len + offset > ghc->len);
+ if (WARN_ON_ONCE(len + offset > ghc->len))
+ return -EINVAL;
if (slots->generation != ghc->generation) {
if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))