author		Linus Torvalds <torvalds@linux-foundation.org>	2023-05-21 13:58:37 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2023-05-21 13:58:37 -0700
commit		a35747c3107ebb8ef2749d4dabaf71c205e0d0fe (patch)
tree		0b4479029258272e8286cac977be214b22850b0f
parent		c47d122c5ba5f3b3371cfe051d770b5bbd591f6b (diff)
parent		b9846a698c9aff4eb2214a06ac83638ad098f33f (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm fixes from Paolo Bonzini:
 "ARM:

   - Plug a race in the stage-2 mapping code where the IPA and the PA
     would end up being out of sync

   - Make better use of the bitmap API (bitmap_zero, bitmap_zalloc...)

   - FP/SVE/SME documentation update, in the hope that this field
     becomes clearer...

   - Add workaround for Apple SEIS brokenness to a new SoC

   - Random comment fixes

  x86:

   - add MSR_IA32_TSX_CTRL into msrs_to_save

   - fixes for XCR0 handling in SGX enclaves

  Generic:

   - Fix vcpu_array[0] races

   - Fix race between starting a VM and 'reboot -f'"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: VMX: add MSR_IA32_TSX_CTRL into msrs_to_save
  KVM: x86: Don't adjust guest's CPUID.0x12.1 (allowed SGX enclave XFRM)
  KVM: VMX: Don't rely _only_ on CPUID to enforce XCR0 restrictions for ECREATE
  KVM: Fix vcpu_array[0] races
  KVM: VMX: Fix header file dependency of asm/vmx.h
  KVM: Don't enable hardware after a restart/shutdown is initiated
  KVM: Use syscore_ops instead of reboot_notifier to hook restart/shutdown
  KVM: arm64: vgic: Add Apple M2 PRO/MAX cpus to the list of broken SEIS implementations
  KVM: arm64: Clarify host SME state management
  KVM: arm64: Restructure check for SVE support in FP trap handler
  KVM: arm64: Document check for TIF_FOREIGN_FPSTATE
  KVM: arm64: Fix repeated words in comments
  KVM: arm64: Constify start/end/phys fields of the pgtable walker data
  KVM: arm64: Infer PA offset from VA in hyp map walker
  KVM: arm64: Infer the PA offset from IPA in stage-2 map walker
  KVM: arm64: Use the bitmap API to allocate bitmaps
  KVM: arm64: Slightly optimize flush_context()
-rw-r--r--	arch/arm64/include/asm/cputype.h	8
-rw-r--r--	arch/arm64/include/asm/kvm_pgtable.h	1
-rw-r--r--	arch/arm64/kvm/fpsimd.c	26
-rw-r--r--	arch/arm64/kvm/hyp/include/hyp/switch.h	12
-rw-r--r--	arch/arm64/kvm/hyp/pgtable.c	41
-rw-r--r--	arch/arm64/kvm/inject_fault.c	2
-rw-r--r--	arch/arm64/kvm/vgic/vgic-v3.c	4
-rw-r--r--	arch/arm64/kvm/vmid.c	7
-rw-r--r--	arch/x86/include/asm/vmx.h	2
-rw-r--r--	arch/x86/kvm/cpuid.c	16
-rw-r--r--	arch/x86/kvm/vmx/sgx.c	11
-rw-r--r--	arch/x86/kvm/x86.c	6
-rw-r--r--	virt/kvm/kvm_main.c	59
13 files changed, 129 insertions, 66 deletions
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index 683ca3af408485..5f6f84837a4903 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -126,6 +126,10 @@
#define APPLE_CPU_PART_M1_FIRESTORM_MAX 0x029
#define APPLE_CPU_PART_M2_BLIZZARD 0x032
#define APPLE_CPU_PART_M2_AVALANCHE 0x033
+#define APPLE_CPU_PART_M2_BLIZZARD_PRO 0x034
+#define APPLE_CPU_PART_M2_AVALANCHE_PRO 0x035
+#define APPLE_CPU_PART_M2_BLIZZARD_MAX 0x038
+#define APPLE_CPU_PART_M2_AVALANCHE_MAX 0x039
#define AMPERE_CPU_PART_AMPERE1 0xAC3
@@ -181,6 +185,10 @@
#define MIDR_APPLE_M1_FIRESTORM_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM_MAX)
#define MIDR_APPLE_M2_BLIZZARD MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_BLIZZARD)
#define MIDR_APPLE_M2_AVALANCHE MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_AVALANCHE)
+#define MIDR_APPLE_M2_BLIZZARD_PRO MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_BLIZZARD_PRO)
+#define MIDR_APPLE_M2_AVALANCHE_PRO MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_AVALANCHE_PRO)
+#define MIDR_APPLE_M2_BLIZZARD_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_BLIZZARD_MAX)
+#define MIDR_APPLE_M2_AVALANCHE_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_AVALANCHE_MAX)
#define MIDR_AMPERE1 MIDR_CPU_MODEL(ARM_CPU_IMP_AMPERE, AMPERE_CPU_PART_AMPERE1)
/* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 4cd6762bda805d..dc3c072e862f1e 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -209,6 +209,7 @@ struct kvm_pgtable_visit_ctx {
kvm_pte_t old;
void *arg;
struct kvm_pgtable_mm_ops *mm_ops;
+ u64 start;
u64 addr;
u64 end;
u32 level;
diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c
index 1279949599b5fc..4c9dcd8fc93950 100644
--- a/arch/arm64/kvm/fpsimd.c
+++ b/arch/arm64/kvm/fpsimd.c
@@ -81,26 +81,34 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu)
fpsimd_kvm_prepare();
+ /*
+ * We will check TIF_FOREIGN_FPSTATE just before entering the
+ * guest in kvm_arch_vcpu_ctxflush_fp() and override this to
+ * FP_STATE_FREE if the flag is set.
+ */
vcpu->arch.fp_state = FP_STATE_HOST_OWNED;
vcpu_clear_flag(vcpu, HOST_SVE_ENABLED);
if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN)
vcpu_set_flag(vcpu, HOST_SVE_ENABLED);
- /*
- * We don't currently support SME guests but if we leave
- * things in streaming mode then when the guest starts running
- * FPSIMD or SVE code it may generate SME traps so as a
- * special case if we are in streaming mode we force the host
- * state to be saved now and exit streaming mode so that we
- * don't have to handle any SME traps for valid guest
- * operations. Do this for ZA as well for now for simplicity.
- */
if (system_supports_sme()) {
vcpu_clear_flag(vcpu, HOST_SME_ENABLED);
if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN)
vcpu_set_flag(vcpu, HOST_SME_ENABLED);
+ /*
+ * If PSTATE.SM is enabled then save any pending FP
+ * state and disable PSTATE.SM. If we leave PSTATE.SM
+ * enabled and the guest does not enable SME via
+ * CPACR_EL1.SMEN then operations that should be valid
+ * may generate SME traps from EL1 to EL1 which we
+ * can't intercept and which would confuse the guest.
+ *
+ * Do the same for PSTATE.ZA in the case where there
+ * is state in the registers which has not already
+ * been saved; this is very unlikely to happen.
+ */
if (read_sysreg_s(SYS_SVCR) & (SVCR_SM_MASK | SVCR_ZA_MASK)) {
vcpu->arch.fp_state = FP_STATE_FREE;
fpsimd_save_and_flush_cpu_state();
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index c41166f1a1dd7d..e78a08a72a3c9f 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -177,9 +177,17 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
sve_guest = vcpu_has_sve(vcpu);
esr_ec = kvm_vcpu_trap_get_class(vcpu);
- /* Don't handle SVE traps for non-SVE vcpus here: */
- if (!sve_guest && esr_ec != ESR_ELx_EC_FP_ASIMD)
+ /* Only handle traps the vCPU can support here: */
+ switch (esr_ec) {
+ case ESR_ELx_EC_FP_ASIMD:
+ break;
+ case ESR_ELx_EC_SVE:
+ if (!sve_guest)
+ return false;
+ break;
+ default:
return false;
+ }
/* Valid trap. Switch the context: */
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 3d61bd3e591d27..5282cb9ca4cff4 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -58,8 +58,9 @@
struct kvm_pgtable_walk_data {
struct kvm_pgtable_walker *walker;
+ const u64 start;
u64 addr;
- u64 end;
+ const u64 end;
};
static bool kvm_phys_is_valid(u64 phys)
@@ -201,6 +202,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
.old = READ_ONCE(*ptep),
.arg = data->walker->arg,
.mm_ops = mm_ops,
+ .start = data->start,
.addr = data->addr,
.end = data->end,
.level = level,
@@ -293,6 +295,7 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
struct kvm_pgtable_walker *walker)
{
struct kvm_pgtable_walk_data walk_data = {
+ .start = ALIGN_DOWN(addr, PAGE_SIZE),
.addr = ALIGN_DOWN(addr, PAGE_SIZE),
.end = PAGE_ALIGN(walk_data.addr + size),
.walker = walker,
@@ -349,7 +352,7 @@ int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
}
struct hyp_map_data {
- u64 phys;
+ const u64 phys;
kvm_pte_t attr;
};
@@ -407,13 +410,12 @@ enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte)
static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
struct hyp_map_data *data)
{
+ u64 phys = data->phys + (ctx->addr - ctx->start);
kvm_pte_t new;
- u64 granule = kvm_granule_size(ctx->level), phys = data->phys;
if (!kvm_block_mapping_supported(ctx, phys))
return false;
- data->phys += granule;
new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
if (ctx->old == new)
return true;
@@ -576,7 +578,7 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
}
struct stage2_map_data {
- u64 phys;
+ const u64 phys;
kvm_pte_t attr;
u8 owner_id;
@@ -794,20 +796,43 @@ static bool stage2_pte_executable(kvm_pte_t pte)
return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
}
+static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx,
+ const struct stage2_map_data *data)
+{
+ u64 phys = data->phys;
+
+ /*
+ * Stage-2 walks to update ownership data are communicated to the map
+ * walker using an invalid PA. Avoid offsetting an already invalid PA,
+ * which could overflow and make the address valid again.
+ */
+ if (!kvm_phys_is_valid(phys))
+ return phys;
+
+ /*
+ * Otherwise, work out the correct PA based on how far the walk has
+ * gotten.
+ */
+ return phys + (ctx->addr - ctx->start);
+}
+
static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx,
struct stage2_map_data *data)
{
+ u64 phys = stage2_map_walker_phys_addr(ctx, data);
+
if (data->force_pte && (ctx->level < (KVM_PGTABLE_MAX_LEVELS - 1)))
return false;
- return kvm_block_mapping_supported(ctx, data->phys);
+ return kvm_block_mapping_supported(ctx, phys);
}
static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
struct stage2_map_data *data)
{
kvm_pte_t new;
- u64 granule = kvm_granule_size(ctx->level), phys = data->phys;
+ u64 phys = stage2_map_walker_phys_addr(ctx, data);
+ u64 granule = kvm_granule_size(ctx->level);
struct kvm_pgtable *pgt = data->mmu->pgt;
struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
@@ -841,8 +866,6 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
stage2_make_pte(ctx, new);
- if (kvm_phys_is_valid(phys))
- data->phys += granule;
return 0;
}
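
The rule both map walkers now rely on: because the mapping being installed is linear, the target PA for any visited entry can be derived from how far the walk has advanced, instead of mutating a phys cursor after each leaf. A minimal standalone sketch of that computation (illustrative names, not kernel code):

	#include <stdint.h>

	/*
	 * The walk visits addresses starting at walk_start. For a linear
	 * mapping whose base PA is base_phys, the PA backing cur_addr is
	 * the base plus the walk's progress. No per-visit state is needed,
	 * which is what allows the phys field above to become const.
	 */
	static uint64_t infer_phys(uint64_t base_phys, uint64_t walk_start,
				   uint64_t cur_addr)
	{
		return base_phys + (cur_addr - walk_start);
	}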
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index 64c3aec0d937c3..0bd93a5f21ce38 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -204,7 +204,7 @@ void kvm_inject_size_fault(struct kvm_vcpu *vcpu)
* Size Fault at level 0, as if exceeding PARange.
*
* Non-LPAE guests will only get the external abort, as there
- * is no way to to describe the ASF.
+ * is no way to describe the ASF.
*/
if (vcpu_el1_is_32bit(vcpu) &&
!(vcpu_read_sys_reg(vcpu, TCR_EL1) & TTBCR_EAE))
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 469d816f356f3f..93a47a515c1371 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -616,6 +616,10 @@ static const struct midr_range broken_seis[] = {
MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX),
MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD),
MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX),
{},
};
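
For context, a midr_range list like this is consulted via the cputype helpers. A simplified sketch of how the new entries take effect (kernel context assumed; the real check in this file also tests the SEIS bit in ICH_VTR_EL2, which is omitted here):

	#include <asm/cputype.h>

	/* True when the current CPU matches any entry in broken_seis. */
	static bool on_broken_seis_cpu(void)
	{
		return is_midr_in_range_list(read_cpuid_id(), broken_seis);
	}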
diff --git a/arch/arm64/kvm/vmid.c b/arch/arm64/kvm/vmid.c
index 08978d0672e7e9..7fe8ba1a2851c5 100644
--- a/arch/arm64/kvm/vmid.c
+++ b/arch/arm64/kvm/vmid.c
@@ -47,7 +47,7 @@ static void flush_context(void)
int cpu;
u64 vmid;
- bitmap_clear(vmid_map, 0, NUM_USER_VMIDS);
+ bitmap_zero(vmid_map, NUM_USER_VMIDS);
for_each_possible_cpu(cpu) {
vmid = atomic64_xchg_relaxed(&per_cpu(active_vmids, cpu), 0);
@@ -182,8 +182,7 @@ int __init kvm_arm_vmid_alloc_init(void)
*/
WARN_ON(NUM_USER_VMIDS - 1 <= num_possible_cpus());
atomic64_set(&vmid_generation, VMID_FIRST_VERSION);
- vmid_map = kcalloc(BITS_TO_LONGS(NUM_USER_VMIDS),
- sizeof(*vmid_map), GFP_KERNEL);
+ vmid_map = bitmap_zalloc(NUM_USER_VMIDS, GFP_KERNEL);
if (!vmid_map)
return -ENOMEM;
@@ -192,5 +191,5 @@ int __init kvm_arm_vmid_alloc_init(void)
void __init kvm_arm_vmid_alloc_free(void)
{
- kfree(vmid_map);
+ bitmap_free(vmid_map);
}
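
The conversion above follows the standard bitmap API pairing: allocate with bitmap_zalloc(), clear with bitmap_zero(), free with bitmap_free(). A hedged sketch of the pattern (kernel context assumed, identifiers illustrative):

	#include <linux/bitmap.h>
	#include <linux/errno.h>
	#include <linux/gfp.h>

	static unsigned long *map;

	static int map_init(unsigned int nbits)
	{
		/* Allocates and zeroes enough longs to hold nbits bits. */
		map = bitmap_zalloc(nbits, GFP_KERNEL);
		return map ? 0 : -ENOMEM;
	}

	static void map_reset(unsigned int nbits)
	{
		bitmap_zero(map, nbits);	/* clear every bit */
	}

	static void map_destroy(void)
	{
		bitmap_free(map);		/* pairs with bitmap_zalloc() */
	}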
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 498dc600bd5c8e..0d02c4aafa6f49 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -13,7 +13,9 @@
#include <linux/bitops.h>
+#include <linux/bug.h>
#include <linux/types.h>
+
#include <uapi/asm/vmx.h>
#include <asm/vmxfeatures.h>
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 123bf8b97a4b21..0c9660a07b233f 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -253,7 +253,6 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e
int nent)
{
struct kvm_cpuid_entry2 *best;
- u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent);
best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
if (best) {
@@ -292,21 +291,6 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e
vcpu->arch.ia32_misc_enable_msr &
MSR_IA32_MISC_ENABLE_MWAIT);
}
-
- /*
- * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
- * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
- * requested XCR0 value. The enclave's XFRM must be a subset of XCRO
- * at the time of EENTER, thus adjust the allowed XFRM by the guest's
- * supported XCR0. Similar to XCR0 handling, FP and SSE are forced to
- * '1' even on CPUs that don't support XSAVE.
- */
- best = cpuid_entry2_find(entries, nent, 0x12, 0x1);
- if (best) {
- best->ecx &= guest_supported_xcr0 & 0xffffffff;
- best->edx &= guest_supported_xcr0 >> 32;
- best->ecx |= XFEATURE_MASK_FPSSE;
- }
}
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index 0574030b071fbf..2261b684a7d447 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -170,12 +170,19 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
return 1;
}
- /* Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. */
+ /*
+ * Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. Note
+ * that the allowed XFRM (XFeature Request Mask) isn't strictly bound
+ * by the supported XCR0. FP+SSE *must* be set in XFRM, even if XSAVE
+ * is unsupported, i.e. even if XCR0 itself is completely unsupported.
+ */
if ((u32)miscselect & ~sgx_12_0->ebx ||
(u32)attributes & ~sgx_12_1->eax ||
(u32)(attributes >> 32) & ~sgx_12_1->ebx ||
(u32)xfrm & ~sgx_12_1->ecx ||
- (u32)(xfrm >> 32) & ~sgx_12_1->edx) {
+ (u32)(xfrm >> 32) & ~sgx_12_1->edx ||
+ xfrm & ~(vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE) ||
+ (xfrm & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
kvm_inject_gp(vcpu, 0);
return 1;
}
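
The two added clauses encode the XFRM invariants directly rather than trusting guest CPUID. A standalone sketch of just those checks (plain C, illustrative names):

	#include <stdbool.h>
	#include <stdint.h>

	/* XFEATURE_MASK_FPSSE: x87 (bit 0) | SSE (bit 1). */
	#define FPSSE_MASK	0x3ULL

	/*
	 * XFRM may only request features the guest's XCR0 could enable,
	 * except that FP and SSE are always legal, and in fact mandatory,
	 * even for guests without XSAVE support.
	 */
	static bool xfrm_is_valid(uint64_t xfrm, uint64_t guest_supported_xcr0)
	{
		if (xfrm & ~(guest_supported_xcr0 | FPSSE_MASK))
			return false;
		return (xfrm & FPSSE_MASK) == FPSSE_MASK;
	}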
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ceb7c5e9cf9e92..c0778ca39650f4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1446,7 +1446,7 @@ static const u32 msrs_to_save_base[] = {
#endif
MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
- MSR_IA32_SPEC_CTRL,
+ MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL,
MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
@@ -7155,6 +7155,10 @@ static void kvm_probe_msr_to_save(u32 msr_index)
if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
return;
break;
+ case MSR_IA32_TSX_CTRL:
+ if (!(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR))
+ return;
+ break;
default:
break;
}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index cb5c13eee1936e..479802a892d4f7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3962,18 +3962,19 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
}
vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
- r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT);
- BUG_ON(r == -EBUSY);
+ r = xa_reserve(&kvm->vcpu_array, vcpu->vcpu_idx, GFP_KERNEL_ACCOUNT);
if (r)
goto unlock_vcpu_destroy;
/* Now it's all set up, let userspace reach it */
kvm_get_kvm(kvm);
r = create_vcpu_fd(vcpu);
- if (r < 0) {
- xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx);
- kvm_put_kvm_no_destroy(kvm);
- goto unlock_vcpu_destroy;
+ if (r < 0)
+ goto kvm_put_xa_release;
+
+ if (KVM_BUG_ON(!!xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
+ r = -EINVAL;
+ goto kvm_put_xa_release;
}
/*
@@ -3988,6 +3989,9 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
kvm_create_vcpu_debugfs(vcpu);
return r;
+kvm_put_xa_release:
+ kvm_put_kvm_no_destroy(kvm);
+ xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
unlock_vcpu_destroy:
mutex_unlock(&kvm->lock);
kvm_dirty_ring_free(&vcpu->dirty_ring);
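
The vcpu_array change above is a reserve-then-publish pattern: the slot is reserved while failure is still easy to unwind, and the pointer is only stored once the vCPU fd exists. A hedged sketch of the generic xarray idiom (kernel context assumed; setup() is a hypothetical fallible step):

	#include <linux/xarray.h>

	static DEFINE_XARRAY(arr);

	static int publish(unsigned long idx, void *obj)
	{
		/* May allocate; fails cleanly before obj is ever visible. */
		int err = xa_reserve(&arr, idx, GFP_KERNEL);

		if (err)
			return err;

		if (setup(obj)) {		/* hypothetical fallible setup */
			xa_release(&arr, idx);	/* undo the reservation */
			return -EIO;
		}

		/*
		 * Storing into a reserved slot with a zero gfp mask performs
		 * no allocation, so the publish step itself cannot fail with
		 * -ENOMEM.
		 */
		xa_store(&arr, idx, obj, 0);
		return 0;
	}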
@@ -5184,7 +5188,20 @@ static void hardware_disable_all(void)
static int hardware_enable_all(void)
{
atomic_t failed = ATOMIC_INIT(0);
- int r = 0;
+ int r;
+
+ /*
+ * Do not enable hardware virtualization if the system is going down.
+ * If userspace initiated a forced reboot, e.g. reboot -f, then it's
+ * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
+ * after kvm_reboot() is called. Note, this relies on system_state
+ * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
+ * hook instead of registering a dedicated reboot notifier (the latter
+ * runs before system_state is updated).
+ */
+ if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
+ system_state == SYSTEM_RESTART)
+ return -EBUSY;
/*
* When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
@@ -5197,6 +5214,8 @@ static int hardware_enable_all(void)
cpus_read_lock();
mutex_lock(&kvm_lock);
+ r = 0;
+
kvm_usage_count++;
if (kvm_usage_count == 1) {
on_each_cpu(hardware_enable_nolock, &failed, 1);
@@ -5213,26 +5232,24 @@ static int hardware_enable_all(void)
return r;
}
-static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
- void *v)
+static void kvm_shutdown(void)
{
/*
- * Some (well, at least mine) BIOSes hang on reboot if
- * in vmx root mode.
- *
- * And Intel TXT required VMX off for all cpu when system shutdown.
+ * Disable hardware virtualization and set kvm_rebooting to indicate
+ * that KVM has asynchronously disabled hardware virtualization, i.e.
+ * that relevant errors and exceptions aren't entirely unexpected.
+ * Some flavors of hardware virtualization need to be disabled before
+ * transferring control to firmware (to perform shutdown/reboot), e.g.
+ * on x86, virtualization can block INIT interrupts, which are used by
+ * firmware to pull APs back under firmware control. Note, this path
+ * is used for both shutdown and reboot scenarios, i.e. neither name is
+ * 100% comprehensive.
*/
pr_info("kvm: exiting hardware virtualization\n");
kvm_rebooting = true;
on_each_cpu(hardware_disable_nolock, NULL, 1);
- return NOTIFY_OK;
}
-static struct notifier_block kvm_reboot_notifier = {
- .notifier_call = kvm_reboot,
- .priority = 0,
-};
-
static int kvm_suspend(void)
{
/*
@@ -5263,6 +5280,7 @@ static void kvm_resume(void)
static struct syscore_ops kvm_syscore_ops = {
.suspend = kvm_suspend,
.resume = kvm_resume,
+ .shutdown = kvm_shutdown,
};
#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
static int hardware_enable_all(void)
@@ -5967,7 +5985,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
if (r)
return r;
- register_reboot_notifier(&kvm_reboot_notifier);
register_syscore_ops(&kvm_syscore_ops);
#endif
@@ -6039,7 +6056,6 @@ err_cpu_kick_mask:
err_vcpu_cache:
#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
unregister_syscore_ops(&kvm_syscore_ops);
- unregister_reboot_notifier(&kvm_reboot_notifier);
cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
#endif
return r;
@@ -6065,7 +6081,6 @@ void kvm_exit(void)
kvm_async_pf_deinit();
#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
unregister_syscore_ops(&kvm_syscore_ops);
- unregister_reboot_notifier(&kvm_reboot_notifier);
cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
#endif
kvm_irqfd_exit();