diff options
author | Marc Zyngier <marc.zyngier@arm.com> | 2012-07-04 16:49:31 +0100 |
---|---|---|
committer | Marc Zyngier <marc.zyngier@arm.com> | 2012-07-04 16:49:31 +0100 |
commit | fd33ed44aa9a5ad92cf0fa25fc0921a34f298402 (patch) | |
tree | c27c4a37c30437941cd4b1eb2eee2028dd37cf7b | |
parent | 6887a4131da3adaab011613776d865f4bcfb5678 (diff) | |
parent | d5321dceeaccf756755e76b38d8b5905bd99d250 (diff) | |
download | linux-kvm-arm-kvm-a15-v10-stage-v3.5-rc5.tar.gz |
Merge branch 'kvm-a15-v10-stage' into kvm-a15-v10-stage-v3.5-rckvm-a15-v10-stage-v3.5-rc5
83 files changed, 5585 insertions, 281 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 930126698a0f5..79c10fced19d8 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -614,15 +614,19 @@ only go to the IOAPIC. On ia64, a IOSAPIC is created. 4.25 KVM_IRQ_LINE Capability: KVM_CAP_IRQCHIP -Architectures: x86, ia64 +Architectures: x86, ia64, arm Type: vm ioctl Parameters: struct kvm_irq_level Returns: 0 on success, -1 on error Sets the level of a GSI input to the interrupt controller model in the kernel. -Requires that an interrupt controller model has been previously created with -KVM_CREATE_IRQCHIP. Note that edge-triggered interrupts require the level -to be set to 1 and then back to 0. +On some architectures it is required that an interrupt controller model has +been previously created with KVM_CREATE_IRQCHIP. Note that edge-triggered +interrupts require the level to be set to 1 and then back to 0. + +ARM uses two types of interrupt lines per CPU: IRQ and FIQ. The value of the +irq field should be (vcpu_index << 1) for IRQs and ((vcpu_index << 1) | 1) for +FIQs. Level is used to raise/lower the line. struct kvm_irq_level { union { @@ -1930,6 +1934,42 @@ The "pte_enc" field provides a value that can OR'ed into the hash PTE's RPN field (ie, it needs to be shifted left by 12 to OR it into the hash PTE second double word). + +4.75 KVM_PPC_ALLOCATE_HTAB + +Capability: KVM_CAP_PPC_ALLOC_HTAB +Architectures: powerpc +Type: vm ioctl +Parameters: Pointer to u32 containing hash table order (in/out) +Returns: 0 on success, -1 on error + +This requests the host kernel to allocate an MMU hash table for a +guest using the PAPR paravirtualization interface. This only does +anything if the kernel is configured to use the Book 3S HV style of +virtualization. Otherwise the capability doesn't exist and the ioctl +returns an ENOTTY error. The rest of this description assumes Book 3S +HV. + +There must be no vcpus running when this ioctl is called; if there +are, it will do nothing and return an EBUSY error. + +The parameter is a pointer to a 32-bit unsigned integer variable +containing the order (log base 2) of the desired size of the hash +table, which must be between 18 and 46. On successful return from the +ioctl, it will have been updated with the order of the hash table that +was allocated. + +If no hash table has been allocated when any vcpu is asked to run +(with the KVM_RUN ioctl), the host kernel will allocate a +default-sized hash table (16 MB). + +If this ioctl is called when a hash table has already been allocated, +the kernel will clear out the existing hash table (zero all HPTEs) and +return the hash table order in the parameter. (If the guest is using +the virtualized real-mode area (VRMA) facility, the kernel will +re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.) + + 5. The kvm_run structure ------------------------ diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt index 96b41bd975233..730471048583b 100644 --- a/Documentation/virtual/kvm/msr.txt +++ b/Documentation/virtual/kvm/msr.txt @@ -223,3 +223,36 @@ MSR_KVM_STEAL_TIME: 0x4b564d03 steal: the amount of time in which this vCPU did not run, in nanoseconds. Time during which the vcpu is idle, will not be reported as steal time. + +MSR_KVM_EOI_EN: 0x4b564d04 + data: Bit 0 is 1 when PV end of interrupt is enabled on the vcpu; 0 + when disabled. Bit 1 is reserved and must be zero. When PV end of + interrupt is enabled (bit 0 set), bits 63-2 hold a 4-byte aligned + physical address of a 4 byte memory area which must be in guest RAM and + must be zeroed. + + The first, least significant bit of 4 byte memory location will be + written to by the hypervisor, typically at the time of interrupt + injection. Value of 1 means that guest can skip writing EOI to the apic + (using MSR or MMIO write); instead, it is sufficient to signal + EOI by clearing the bit in guest memory - this location will + later be polled by the hypervisor. + Value of 0 means that the EOI write is required. + + It is always safe for the guest to ignore the optimization and perform + the APIC EOI write anyway. + + Hypervisor is guaranteed to only modify this least + significant bit while in the current VCPU context, this means that + guest does not need to use either lock prefix or memory ordering + primitives to synchronise with the hypervisor. + + However, hypervisor can set and clear this memory bit at any time: + therefore to make sure hypervisor does not interrupt the + guest and clear the least significant bit in the memory area + in the window between guest testing it to detect + whether it can skip EOI apic write and between guest + clearing it to signal EOI to the hypervisor, + guest must both read the least significant bit in the memory area and + clear it using a single CPU instruction, such as test and clear, or + compare and exchange. diff --git a/Documentation/virtual/kvm/ppc-pv.txt b/Documentation/virtual/kvm/ppc-pv.txt index 6e7c370509304..4911cf95c67e5 100644 --- a/Documentation/virtual/kvm/ppc-pv.txt +++ b/Documentation/virtual/kvm/ppc-pv.txt @@ -109,8 +109,6 @@ The following bits are safe to be set inside the guest: MSR_EE MSR_RI - MSR_CR - MSR_ME If any other bit changes in the MSR, please still use mtmsr(d). diff --git a/MAINTAINERS b/MAINTAINERS index eb22272b2116c..d5e291b00ae8a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3990,8 +3990,8 @@ F: arch/ia64/include/asm/kvm* F: arch/ia64/kvm/ KERNEL VIRTUAL MACHINE for s390 (KVM/s390) -M: Carsten Otte <cotte@de.ibm.com> M: Christian Borntraeger <borntraeger@de.ibm.com> +M: Cornelia Huck <cornelia.huck@de.ibm.com> M: linux390@de.ibm.com L: linux-s390@vger.kernel.org W: http://www.ibm.com/developerworks/linux/linux390/ diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index a91009c618706..3fb62444882bd 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -2274,3 +2274,5 @@ source "security/Kconfig" source "crypto/Kconfig" source "lib/Kconfig" + +source "arch/arm/kvm/Kconfig" diff --git a/arch/arm/Makefile b/arch/arm/Makefile index 0298b00fe2413..64f1e167de26d 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -250,6 +250,7 @@ core-$(CONFIG_VFP) += arch/arm/vfp/ # If we have a machine-specific directory, then include it in the build. core-y += arch/arm/kernel/ arch/arm/mm/ arch/arm/common/ core-y += arch/arm/net/ +core-y += arch/arm/kvm/ core-y += $(machdirs) $(platdirs) drivers-$(CONFIG_OPROFILE) += arch/arm/oprofile/ diff --git a/arch/arm/include/asm/idmap.h b/arch/arm/include/asm/idmap.h index bf863edb517dd..a1ab8d68874cb 100644 --- a/arch/arm/include/asm/idmap.h +++ b/arch/arm/include/asm/idmap.h @@ -11,4 +11,11 @@ extern pgd_t *idmap_pgd; void setup_mm_for_reboot(void); +#ifdef CONFIG_ARM_VIRT_EXT +extern pgd_t *hyp_pgd; + +void hyp_idmap_teardown(void); +void hyp_idmap_setup(void); +#endif + #endif /* __ASM_IDMAP_H */ diff --git a/arch/arm/include/asm/kvm.h b/arch/arm/include/asm/kvm.h new file mode 100644 index 0000000000000..54f301d3c445b --- /dev/null +++ b/arch/arm/include/asm/kvm.h @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall <c.dall@virtualopensystems.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef __ARM_KVM_H__ +#define __ARM_KVM_H__ + +#include <asm/types.h> + +#define __KVM_HAVE_GUEST_DEBUG +#define __KVM_HAVE_IRQ_LINE + +/* + * KVM_IRQ_LINE macros to set/read IRQ/FIQ for specific VCPU index. + */ +enum KVM_ARM_IRQ_LINE_TYPE { + KVM_ARM_IRQ_LINE = 0, + KVM_ARM_FIQ_LINE = 1, +}; + +/* + * Modes used for short-hand mode determinition in the world-switch code and + * in emulation code. + * + * Note: These indices do NOT correspond to the value of the CPSR mode bits! + */ +enum vcpu_mode { + MODE_FIQ = 0, + MODE_IRQ, + MODE_SVC, + MODE_ABT, + MODE_UND, + MODE_USR, + MODE_SYS +}; + +struct kvm_regs { + __u32 regs0_7[8]; /* Unbanked regs. (r0 - r7) */ + __u32 fiq_regs8_12[5]; /* Banked fiq regs. (r8 - r12) */ + __u32 usr_regs8_12[5]; /* Banked usr registers (r8 - r12) */ + __u32 reg13[6]; /* Banked r13, indexed by MODE_ */ + __u32 reg14[6]; /* Banked r13, indexed by MODE_ */ + __u32 reg15; + __u32 cpsr; + __u32 spsr[5]; /* Banked SPSR, indexed by MODE_ */ + struct { + __u32 c0_midr; + __u32 c1_sys; + __u32 c2_base0; + __u32 c2_base1; + __u32 c2_control; + __u32 c3_dacr; + } cp15; + +}; + +struct kvm_sregs { +}; + +struct kvm_fpu { +}; + +struct kvm_guest_debug_arch { +}; + +struct kvm_debug_exit_arch { +}; + +struct kvm_sync_regs { +}; + +struct kvm_arch_memory_slot { +}; + +#endif /* __ARM_KVM_H__ */ diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h new file mode 100644 index 0000000000000..1efa452ae6142 --- /dev/null +++ b/arch/arm/include/asm/kvm_arm.h @@ -0,0 +1,184 @@ +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall <c.dall@virtualopensystems.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef __ARM_KVM_ARM_H__ +#define __ARM_KVM_ARM_H__ + +#include <asm/types.h> + +/* Supported Processor Types */ +#define CORTEX_A15 (0xC0F) + +/* Multiprocessor Affinity Register */ +#define MPIDR_CPUID (0x3 << 0) + +/* Hyp Configuration Register (HCR) bits */ +#define HCR_TGE (1 << 27) +#define HCR_TVM (1 << 26) +#define HCR_TTLB (1 << 25) +#define HCR_TPU (1 << 24) +#define HCR_TPC (1 << 23) +#define HCR_TSW (1 << 22) +#define HCR_TAC (1 << 21) +#define HCR_TIDCP (1 << 20) +#define HCR_TSC (1 << 19) +#define HCR_TID3 (1 << 18) +#define HCR_TID2 (1 << 17) +#define HCR_TID1 (1 << 16) +#define HCR_TID0 (1 << 15) +#define HCR_TWE (1 << 14) +#define HCR_TWI (1 << 13) +#define HCR_DC (1 << 12) +#define HCR_BSU (3 << 10) +#define HCR_BSU_IS (1 << 10) +#define HCR_FB (1 << 9) +#define HCR_VA (1 << 8) +#define HCR_VI_BIT_NR 7 +#define HCR_VF_BIT_NR 6 +#define HCR_VI (1 << HCR_VI_BIT_NR) +#define HCR_VF (1 << HCR_VF_BIT_NR) +#define HCR_AMO (1 << 5) +#define HCR_IMO (1 << 4) +#define HCR_FMO (1 << 3) +#define HCR_PTW (1 << 2) +#define HCR_SWIO (1 << 1) +#define HCR_VM 1 + +/* + * The bits we set in HCR: + * TAC: Trap ACTLR + * TSC: Trap SMC + * TSW: Trap cache operations by set/way + * TWI: Trap WFI + * BSU_IS: Upgrade barriers to the inner shareable domain + * FB: Force broadcast of all maintainance operations + * AMO: Override CPSR.A and enable signaling with VA + * IMO: Override CPSR.I and enable signaling with VI + * FMO: Override CPSR.F and enable signaling with VF + * SWIO: Turn set/way invalidates into set/way clean+invalidate + */ +#define HCR_GUEST_MASK (HCR_TSC | HCR_TSW | HCR_TWI | HCR_VM | HCR_BSU_IS | \ + HCR_FB | HCR_TAC | HCR_AMO | HCR_IMO | HCR_FMO | \ + HCR_SWIO) +#define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF) + +/* System Control Register (SCTLR) bits */ +#define SCTLR_TE (1 << 30) +#define SCTLR_EE (1 << 25) +#define SCTLR_V (1 << 13) + +/* Hyp System Control Register (HSCTLR) bits */ +#define HSCTLR_TE (1 << 30) +#define HSCTLR_EE (1 << 25) +#define HSCTLR_FI (1 << 21) +#define HSCTLR_WXN (1 << 19) +#define HSCTLR_I (1 << 12) +#define HSCTLR_C (1 << 2) +#define HSCTLR_A (1 << 1) +#define HSCTLR_M 1 +#define HSCTLR_MASK (HSCTLR_M | HSCTLR_A | HSCTLR_C | HSCTLR_I | \ + HSCTLR_WXN | HSCTLR_FI | HSCTLR_EE | HSCTLR_TE) + +/* TTBCR and HTCR Registers bits */ +#define TTBCR_EAE (1 << 31) +#define TTBCR_IMP (1 << 30) +#define TTBCR_SH1 (3 << 28) +#define TTBCR_ORGN1 (3 << 26) +#define TTBCR_IRGN1 (3 << 24) +#define TTBCR_EPD1 (1 << 23) +#define TTBCR_A1 (1 << 22) +#define TTBCR_T1SZ (3 << 16) +#define TTBCR_SH0 (3 << 12) +#define TTBCR_ORGN0 (3 << 10) +#define TTBCR_IRGN0 (3 << 8) +#define TTBCR_EPD0 (1 << 7) +#define TTBCR_T0SZ 3 +#define HTCR_MASK (TTBCR_T0SZ | TTBCR_IRGN0 | TTBCR_ORGN0 | TTBCR_SH0) + +/* Hyp System Trap Register */ +#define HSTR_T(x) (1 << x) +#define HSTR_TTEE (1 << 16) +#define HSTR_TJDBX (1 << 17) + +/* Hyp Coprocessor Trap Register */ +#define HCPTR_TCP(x) (1 << x) +#define HCPTR_TCP_MASK (0x3fff) +#define HCPTR_TASE (1 << 15) +#define HCPTR_TTA (1 << 20) +#define HCPTR_TCPAC (1 << 31) + +/* Virtualization Translation Control Register (VTCR) bits */ +#define VTCR_SH0 (3 << 12) +#define VTCR_ORGN0 (3 << 10) +#define VTCR_IRGN0 (3 << 8) +#define VTCR_SL0 (3 << 6) +#define VTCR_S (1 << 4) +#define VTCR_T0SZ 3 +#define VTCR_MASK (VTCR_SH0 | VTCR_ORGN0 | VTCR_IRGN0 | VTCR_SL0 | \ + VTCR_S | VTCR_T0SZ | VTCR_MASK) +#define VTCR_HTCR_SH (VTCR_SH0 | VTCR_ORGN0 | VTCR_IRGN0) +#define VTCR_SL_L2 0 /* Starting-level: 2 */ +#define VTCR_SL_L1 (1 << 6) /* Starting-level: 1 */ +#define VTCR_GUEST_SL VTCR_SL_L1 +#define VTCR_GUEST_T0SZ 0 +#if VTCR_GUEST_SL == 0 +#define VTTBR_X (14 - VTCR_GUEST_T0SZ) +#else +#define VTTBR_X (5 - VTCR_GUEST_T0SZ) +#endif + +/* Hyp Syndrome Register (HSR) bits */ +#define HSR_EC_SHIFT (26) +#define HSR_EC (0x3fU << HSR_EC_SHIFT) +#define HSR_IL (1U << 25) +#define HSR_ISS (HSR_IL - 1) +#define HSR_ISV_SHIFT (24) +#define HSR_ISV (1U << HSR_ISV_SHIFT) +#define HSR_SRT_SHIFT (16) +#define HSR_SRT_MASK (0xf << HSR_SRT_SHIFT) +#define HSR_FSC (0x3f) +#define HSR_FSC_TYPE (0x3c) +#define HSR_SSE (1 << 21) +#define HSR_WNR (1 << 6) + +#define FSC_FAULT (0x04) +#define FSC_PERM (0x0c) + +/* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ +#define HPFAR_MASK (~0xf) + +#define HSR_EC_UNKNOWN (0x00) +#define HSR_EC_WFI (0x01) +#define HSR_EC_CP15_32 (0x03) +#define HSR_EC_CP15_64 (0x04) +#define HSR_EC_CP14_MR (0x05) +#define HSR_EC_CP14_LS (0x06) +#define HSR_EC_CP_0_13 (0x07) +#define HSR_EC_CP10_ID (0x08) +#define HSR_EC_JAZELLE (0x09) +#define HSR_EC_BXJ (0x0A) +#define HSR_EC_CP14_64 (0x0C) +#define HSR_EC_SVC_HYP (0x11) +#define HSR_EC_HVC (0x12) +#define HSR_EC_SMC (0x13) +#define HSR_EC_IABT (0x20) +#define HSR_EC_IABT_HYP (0x21) +#define HSR_EC_DABT (0x24) +#define HSR_EC_DABT_HYP (0x25) + +#endif /* __ARM_KVM_ARM_H__ */ diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h new file mode 100644 index 0000000000000..e01dfabacdd26 --- /dev/null +++ b/arch/arm/include/asm/kvm_asm.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall <c.dall@virtualopensystems.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef __ARM_KVM_ASM_H__ +#define __ARM_KVM_ASM_H__ + +#define ARM_EXCEPTION_RESET 0 +#define ARM_EXCEPTION_UNDEFINED 1 +#define ARM_EXCEPTION_SOFTWARE 2 +#define ARM_EXCEPTION_PREF_ABORT 3 +#define ARM_EXCEPTION_DATA_ABORT 4 +#define ARM_EXCEPTION_IRQ 5 +#define ARM_EXCEPTION_FIQ 6 +#define ARM_EXCEPTION_HVC 7 + +/* + * SMC Hypervisor API call number + */ +#define SMCHYP_HVBAR_W 0xfffffff0 + +#ifndef __ASSEMBLY__ +struct kvm; +struct kvm_vcpu; + +extern char __kvm_hyp_init[]; +extern char __kvm_hyp_init_end[]; + +extern char __kvm_hyp_exit[]; +extern char __kvm_hyp_exit_end[]; + +extern char __kvm_hyp_vector[]; + +extern char __kvm_hyp_code_start[]; +extern char __kvm_hyp_code_end[]; + +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); + +extern void __kvm_flush_vm_context(void); + +extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); +#endif + +#endif /* __ARM_KVM_ASM_H__ */ diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h new file mode 100644 index 0000000000000..c41537b972e0a --- /dev/null +++ b/arch/arm/include/asm/kvm_emulate.h @@ -0,0 +1,120 @@ +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall <c.dall@virtualopensystems.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef __ARM_KVM_EMULATE_H__ +#define __ARM_KVM_EMULATE_H__ + +#include <linux/kvm_host.h> +#include <asm/kvm_asm.h> + +u32 *vcpu_reg_mode(struct kvm_vcpu *vcpu, u8 reg_num, enum vcpu_mode mode); + +static inline u8 __vcpu_mode(u32 cpsr) +{ + u8 modes_table[32] = { + 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, + 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, + MODE_USR, /* 0x0 */ + MODE_FIQ, /* 0x1 */ + MODE_IRQ, /* 0x2 */ + MODE_SVC, /* 0x3 */ + 0xf, 0xf, 0xf, + MODE_ABT, /* 0x7 */ + 0xf, 0xf, 0xf, + MODE_UND, /* 0xb */ + 0xf, 0xf, 0xf, + MODE_SYS /* 0xf */ + }; + + return modes_table[cpsr & 0x1f]; +} + +static inline enum vcpu_mode vcpu_mode(struct kvm_vcpu *vcpu) +{ + u8 mode = __vcpu_mode(vcpu->arch.regs.cpsr); + BUG_ON(mode == 0xf); + return mode; +} + +int kvm_handle_cp10_id(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_handle_cp_0_13_access(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_handle_cp14_access(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_handle_wfi(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_emulate_mmio_ls(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + unsigned long instr); +void kvm_adjust_itstate(struct kvm_vcpu *vcpu); +void kvm_inject_undefined(struct kvm_vcpu *vcpu); + +/* + * Return the SPSR for the specified mode of the virtual CPU. + */ +static inline u32 *vcpu_spsr_mode(struct kvm_vcpu *vcpu, enum vcpu_mode mode) +{ + switch (mode) { + case MODE_SVC: + return &vcpu->arch.regs.svc_regs[2]; + case MODE_ABT: + return &vcpu->arch.regs.abt_regs[2]; + case MODE_UND: + return &vcpu->arch.regs.und_regs[2]; + case MODE_IRQ: + return &vcpu->arch.regs.irq_regs[2]; + case MODE_FIQ: + return &vcpu->arch.regs.fiq_regs[7]; + default: + BUG(); + } +} + +/* Get vcpu register for current mode */ +static inline u32 *vcpu_reg(struct kvm_vcpu *vcpu, unsigned long reg_num) +{ + return vcpu_reg_mode(vcpu, reg_num, vcpu_mode(vcpu)); +} + +static inline u32 *vcpu_pc(struct kvm_vcpu *vcpu) +{ + return vcpu_reg(vcpu, 15); +} + +static inline u32 *vcpu_cpsr(struct kvm_vcpu *vcpu) +{ + return &vcpu->arch.regs.cpsr; +} + +/* Get vcpu SPSR for current mode */ +static inline u32 *vcpu_spsr(struct kvm_vcpu *vcpu) +{ + return vcpu_spsr_mode(vcpu, vcpu_mode(vcpu)); +} + +static inline bool mode_has_spsr(struct kvm_vcpu *vcpu) +{ + return (vcpu_mode(vcpu) < MODE_USR); +} + +static inline bool vcpu_mode_priv(struct kvm_vcpu *vcpu) +{ + BUG_ON(vcpu_mode(vcpu) > MODE_SYS); + return vcpu_mode(vcpu) != MODE_USR; +} + +#endif /* __ARM_KVM_EMULATE_H__ */ diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h new file mode 100644 index 0000000000000..0c7e782b2e97e --- /dev/null +++ b/arch/arm/include/asm/kvm_host.h @@ -0,0 +1,159 @@ +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall <c.dall@virtualopensystems.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef __ARM_KVM_HOST_H__ +#define __ARM_KVM_HOST_H__ + +#define KVM_MAX_VCPUS 4 +#define KVM_MEMORY_SLOTS 32 +#define KVM_PRIVATE_MEM_SLOTS 4 +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + +/* We don't currently support large pages. */ +#define KVM_HPAGE_GFN_SHIFT(x) 0 +#define KVM_NR_PAGE_SIZES 1 +#define KVM_PAGES_PER_HPAGE(x) (1UL<<31) + +struct kvm_vcpu; +u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode); +int kvm_target_cpu(void); +int kvm_reset_vcpu(struct kvm_vcpu *vcpu); + +struct kvm_arch { + /* The VMID generation used for the virt. memory system */ + u64 vmid_gen; + u32 vmid; + + /* 1-level 2nd stage table and lock */ + struct mutex pgd_mutex; + pgd_t *pgd; + + /* VTTBR value associated with above pgd and vmid */ + u64 vttbr; +}; + +#define EXCEPTION_NONE 0 +#define EXCEPTION_RESET 0x80 +#define EXCEPTION_UNDEFINED 0x40 +#define EXCEPTION_SOFTWARE 0x20 +#define EXCEPTION_PREFETCH 0x10 +#define EXCEPTION_DATA 0x08 +#define EXCEPTION_IMPRECISE 0x04 +#define EXCEPTION_IRQ 0x02 +#define EXCEPTION_FIQ 0x01 + +struct kvm_vcpu_regs { + u32 usr_regs[15]; /* R0_usr - R14_usr */ + u32 svc_regs[3]; /* SP_svc, LR_svc, SPSR_svc */ + u32 abt_regs[3]; /* SP_abt, LR_abt, SPSR_abt */ + u32 und_regs[3]; /* SP_und, LR_und, SPSR_und */ + u32 irq_regs[3]; /* SP_irq, LR_irq, SPSR_irq */ + u32 fiq_regs[8]; /* R8_fiq - R14_fiq, SPSR_fiq */ + u32 pc; /* The program counter (r15) */ + u32 cpsr; /* The guest CPSR */ +} __packed; + +enum cp15_regs { + c0_MIDR, /* Main ID Register */ + c0_MPIDR, /* MultiProcessor ID Register */ + c1_SCTLR, /* System Control Register */ + c1_ACTLR, /* Auxilliary Control Register */ + c1_CPACR, /* Coprocessor Access Control */ + c2_TTBR0, /* Translation Table Base Register 0 */ + c2_TTBR0_high, /* TTBR0 top 32 bits */ + c2_TTBR1, /* Translation Table Base Register 1 */ + c2_TTBR1_high, /* TTBR1 top 32 bits */ + c2_TTBCR, /* Translation Table Base Control R. */ + c3_DACR, /* Domain Access Control Register */ + c5_DFSR, /* Data Fault Status Register */ + c5_IFSR, /* Instruction Fault Status Register */ + c5_ADFSR, /* Auxilary Data Fault Status Register */ + c5_AIFSR, /* Auxilary Instruction Fault Status Register */ + c6_DFAR, /* Data Fault Address Register */ + c6_IFAR, /* Instruction Fault Address Register */ + c10_PRRR, /* Primary Region Remap Register */ + c10_NMRR, /* Normal Memory Remap Register */ + c12_VBAR, /* Vector Base Address Register */ + c13_CID, /* Context ID Register */ + c13_TID_URW, /* Thread ID, User R/W */ + c13_TID_URO, /* Thread ID, User R/O */ + c13_TID_PRIV, /* Thread ID, Priveleged */ + + nr_cp15_regs +}; + +struct kvm_vcpu_arch { + struct kvm_vcpu_regs regs; + + /* System control coprocessor (cp15) */ + u32 cp15[nr_cp15_regs]; + + /* Exception Information */ + u32 hsr; /* Hyp Syndrom Register */ + u32 hdfar; /* Hyp Data Fault Address Register */ + u32 hifar; /* Hyp Inst. Fault Address Register */ + u32 hpfar; /* Hyp IPA Fault Address Register */ + u64 pc_ipa; /* IPA for the current PC (VA to PA result) */ + u64 pc_ipa2; /* same as above, but for non-aligned wide thumb + instructions */ + + /* dcache set/way operation pending */ + cpumask_t require_dcache_flush; + + /* IO related fields */ + bool mmio_sign_extend; /* for byte/halfword loads */ + u32 mmio_rd; + + /* Interrupt related fields */ + u32 irq_lines; /* IRQ and FIQ levels */ + u32 wait_for_interrupts; + + /* Hyp exception information */ + u32 hyp_pc; /* PC when exception was taken from Hyp mode */ +}; + +struct kvm_vm_stat { + u32 remote_tlb_flush; +}; + +struct kvm_vcpu_stat { + u32 exits; + u32 mmio_exits; + u32 kmmio_exits; + u32 wfi_exits; + u32 irq_exits; + u32 halt_wakeup; +}; + +#define KVM_ARCH_WANT_MMU_NOTIFIER +struct kvm; +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); + +/* We do not have shadow page tables, hence the empty hooks */ +static inline int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ + return 0; +} + +static inline int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + return 0; +} + +#endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h new file mode 100644 index 0000000000000..7ccd259f85d31 --- /dev/null +++ b/arch/arm/include/asm/kvm_mmu.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall <c.dall@virtualopensystems.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef __ARM_KVM_MMU_H__ +#define __ARM_KVM_MMU_H__ + +/* + * The architecture supports 40-bit IPA as input to the 2nd stage translations + * and PTRS_PER_PGD2 could therefore be 1024. + * + * To save a bit of memory and to avoid alignment issues we assume 39-bit IPA + * for now, but remember that the level-1 table must be aligned to its size. + */ +#define PTRS_PER_PGD2 512 +#define PGD2_ORDER get_order(PTRS_PER_PGD2 * sizeof(pgd_t)) + +int create_hyp_mappings(void *from, void *to); +void free_hyp_pmds(void); + +int kvm_alloc_stage2_pgd(struct kvm *kvm); +void kvm_free_stage2_pgd(struct kvm *kvm); + +int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run); + +#endif /* __ARM_KVM_MMU_H__ */ diff --git a/arch/arm/include/asm/mach/map.h b/arch/arm/include/asm/mach/map.h index a6efcdd6fd251..3787c9fdc6e67 100644 --- a/arch/arm/include/asm/mach/map.h +++ b/arch/arm/include/asm/mach/map.h @@ -37,6 +37,7 @@ extern void iotable_init(struct map_desc *, int); struct mem_type; extern const struct mem_type *get_mem_type(unsigned int type); +extern pteval_t get_mem_type_prot_pte(unsigned int type); /* * external interface to remap single page with appropriate type */ diff --git a/arch/arm/include/asm/pgtable-3level-hwdef.h b/arch/arm/include/asm/pgtable-3level-hwdef.h index d7952824c5c4e..18f5cef82ad58 100644 --- a/arch/arm/include/asm/pgtable-3level-hwdef.h +++ b/arch/arm/include/asm/pgtable-3level-hwdef.h @@ -32,6 +32,9 @@ #define PMD_TYPE_SECT (_AT(pmdval_t, 1) << 0) #define PMD_BIT4 (_AT(pmdval_t, 0)) #define PMD_DOMAIN(x) (_AT(pmdval_t, 0)) +#define PMD_APTABLE_SHIFT (61) +#define PMD_APTABLE (_AT(pgdval_t, 3) << PGD_APTABLE_SHIFT) +#define PMD_PXNTABLE (_AT(pgdval_t, 1) << 59) /* * - section @@ -41,9 +44,11 @@ #define PMD_SECT_S (_AT(pmdval_t, 3) << 8) #define PMD_SECT_AF (_AT(pmdval_t, 1) << 10) #define PMD_SECT_nG (_AT(pmdval_t, 1) << 11) +#define PMD_SECT_PXN (_AT(pmdval_t, 1) << 53) #define PMD_SECT_XN (_AT(pmdval_t, 1) << 54) #define PMD_SECT_AP_WRITE (_AT(pmdval_t, 0)) #define PMD_SECT_AP_READ (_AT(pmdval_t, 0)) +#define PMD_SECT_AP1 (_AT(pmdval_t, 1) << 6) #define PMD_SECT_TEX(x) (_AT(pmdval_t, 0)) /* diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index b24903549d1c1..7351eee5e5473 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -102,11 +102,24 @@ */ #define L_PGD_SWAPPER (_AT(pgdval_t, 1) << 55) /* swapper_pg_dir entry */ +/* + * 2-nd stage PTE definitions for LPAE. + */ +#define L_PTE2_SHARED L_PTE_SHARED +#define L_PTE2_READ (_AT(pteval_t, 1) << 6) /* HAP[0] */ +#define L_PTE2_WRITE (_AT(pteval_t, 1) << 7) /* HAP[1] */ +#define L_PTE2_NORM_WB (_AT(pteval_t, 3) << 4) /* MemAttr[3:2] */ +#define L_PTE2_INNER_WB (_AT(pteval_t, 3) << 2) /* MemAttr[1:0] */ + #ifndef __ASSEMBLY__ #define pud_none(pud) (!pud_val(pud)) #define pud_bad(pud) (!(pud_val(pud) & 2)) #define pud_present(pud) (pud_val(pud)) +#define pmd_table(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ + PMD_TYPE_TABLE) +#define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ + PMD_TYPE_SECT) #define pud_clear(pudp) \ do { \ diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index f66626d71e7d1..a31d0e996ec45 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -70,6 +70,7 @@ extern void __pgd_error(const char *file, int line, pgd_t); extern pgprot_t pgprot_user; extern pgprot_t pgprot_kernel; +extern pgprot_t pgprot_guest; #define _MOD_PROT(p, b) __pgprot(pgprot_val(p) | (b)) @@ -82,6 +83,10 @@ extern pgprot_t pgprot_kernel; #define PAGE_READONLY_EXEC _MOD_PROT(pgprot_user, L_PTE_USER | L_PTE_RDONLY) #define PAGE_KERNEL _MOD_PROT(pgprot_kernel, L_PTE_XN) #define PAGE_KERNEL_EXEC pgprot_kernel +#define PAGE_HYP _MOD_PROT(pgprot_kernel, L_PTE_USER) +#define PAGE_KVM_GUEST _MOD_PROT(pgprot_guest, L_PTE2_READ | \ + L_PTE2_NORM_WB | L_PTE2_INNER_WB | \ + L_PTE2_SHARED) #define __PAGE_NONE __pgprot(_L_PTE_DEFAULT | L_PTE_RDONLY | L_PTE_XN) #define __PAGE_SHARED __pgprot(_L_PTE_DEFAULT | L_PTE_USER | L_PTE_XN) diff --git a/arch/arm/include/asm/unified.h b/arch/arm/include/asm/unified.h index f5989f46b4d2d..e5dc5f6793d78 100644 --- a/arch/arm/include/asm/unified.h +++ b/arch/arm/include/asm/unified.h @@ -54,6 +54,18 @@ #endif /* CONFIG_THUMB2_KERNEL */ +#ifdef CONFIG_KVM_ARM_HOST +#ifdef __ASSEMBLY__ +.arch_extension sec +.arch_extension virt +#else +__asm__( +" .arch_extension sec\n" +" .arch_extension virt\n" +); +#endif +#endif + #ifndef CONFIG_ARM_ASM_UNIFIED /* diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c index b57c75e0b01f9..38d3a122dbd8d 100644 --- a/arch/arm/kernel/armksyms.c +++ b/arch/arm/kernel/armksyms.c @@ -48,6 +48,13 @@ extern void __aeabi_ulcmp(void); extern void fpundefinstr(void); +#ifdef CONFIG_KVM_ARM_HOST +/* This is needed for KVM */ +extern void __irq_svc(void); + +EXPORT_SYMBOL_GPL(__irq_svc); +#endif + /* platform dependent support */ EXPORT_SYMBOL(__udelay); EXPORT_SYMBOL(__const_udelay); diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index 1429d8989fb90..9c76b538a14ba 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -13,6 +13,7 @@ #include <linux/sched.h> #include <linux/mm.h> #include <linux/dma-mapping.h> +#include <linux/kvm_host.h> #include <asm/cacheflush.h> #include <asm/glue-df.h> #include <asm/glue-pf.h> @@ -144,5 +145,47 @@ int main(void) DEFINE(DMA_BIDIRECTIONAL, DMA_BIDIRECTIONAL); DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE); DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE); +#ifdef CONFIG_KVM_ARM_HOST + DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm)); + DEFINE(VCPU_MIDR, offsetof(struct kvm_vcpu, arch.cp15[c0_MIDR])); + DEFINE(VCPU_MPIDR, offsetof(struct kvm_vcpu, arch.cp15[c0_MPIDR])); + DEFINE(VCPU_SCTLR, offsetof(struct kvm_vcpu, arch.cp15[c1_SCTLR])); + DEFINE(VCPU_CPACR, offsetof(struct kvm_vcpu, arch.cp15[c1_CPACR])); + DEFINE(VCPU_TTBR0, offsetof(struct kvm_vcpu, arch.cp15[c2_TTBR0])); + DEFINE(VCPU_TTBR1, offsetof(struct kvm_vcpu, arch.cp15[c2_TTBR1])); + DEFINE(VCPU_TTBCR, offsetof(struct kvm_vcpu, arch.cp15[c2_TTBCR])); + DEFINE(VCPU_DACR, offsetof(struct kvm_vcpu, arch.cp15[c3_DACR])); + DEFINE(VCPU_DFSR, offsetof(struct kvm_vcpu, arch.cp15[c5_DFSR])); + DEFINE(VCPU_IFSR, offsetof(struct kvm_vcpu, arch.cp15[c5_IFSR])); + DEFINE(VCPU_ADFSR, offsetof(struct kvm_vcpu, arch.cp15[c5_ADFSR])); + DEFINE(VCPU_AIFSR, offsetof(struct kvm_vcpu, arch.cp15[c5_AIFSR])); + DEFINE(VCPU_DFAR, offsetof(struct kvm_vcpu, arch.cp15[c6_DFAR])); + DEFINE(VCPU_IFAR, offsetof(struct kvm_vcpu, arch.cp15[c6_IFAR])); + DEFINE(VCPU_PRRR, offsetof(struct kvm_vcpu, arch.cp15[c10_PRRR])); + DEFINE(VCPU_NMRR, offsetof(struct kvm_vcpu, arch.cp15[c10_NMRR])); + DEFINE(VCPU_VBAR, offsetof(struct kvm_vcpu, arch.cp15[c12_VBAR])); + DEFINE(VCPU_CID, offsetof(struct kvm_vcpu, arch.cp15[c13_CID])); + DEFINE(VCPU_TID_URW, offsetof(struct kvm_vcpu, arch.cp15[c13_TID_URW])); + DEFINE(VCPU_TID_URO, offsetof(struct kvm_vcpu, arch.cp15[c13_TID_URO])); + DEFINE(VCPU_TID_PRIV, offsetof(struct kvm_vcpu, arch.cp15[c13_TID_PRIV])); + DEFINE(VCPU_REGS, offsetof(struct kvm_vcpu, arch.regs)); + DEFINE(VCPU_USR_REGS, offsetof(struct kvm_vcpu, arch.regs.usr_regs)); + DEFINE(VCPU_SVC_REGS, offsetof(struct kvm_vcpu, arch.regs.svc_regs)); + DEFINE(VCPU_ABT_REGS, offsetof(struct kvm_vcpu, arch.regs.abt_regs)); + DEFINE(VCPU_UND_REGS, offsetof(struct kvm_vcpu, arch.regs.und_regs)); + DEFINE(VCPU_IRQ_REGS, offsetof(struct kvm_vcpu, arch.regs.irq_regs)); + DEFINE(VCPU_FIQ_REGS, offsetof(struct kvm_vcpu, arch.regs.fiq_regs)); + DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.regs.pc)); + DEFINE(VCPU_CPSR, offsetof(struct kvm_vcpu, arch.regs.cpsr)); + DEFINE(VCPU_IRQ_LINES, offsetof(struct kvm_vcpu, arch.irq_lines)); + DEFINE(VCPU_HSR, offsetof(struct kvm_vcpu, arch.hsr)); + DEFINE(VCPU_HDFAR, offsetof(struct kvm_vcpu, arch.hdfar)); + DEFINE(VCPU_HIFAR, offsetof(struct kvm_vcpu, arch.hifar)); + DEFINE(VCPU_HPFAR, offsetof(struct kvm_vcpu, arch.hpfar)); + DEFINE(VCPU_PC_IPA, offsetof(struct kvm_vcpu, arch.pc_ipa)); + DEFINE(VCPU_PC_IPA2, offsetof(struct kvm_vcpu, arch.pc_ipa2)); + DEFINE(VCPU_HYP_PC, offsetof(struct kvm_vcpu, arch.hyp_pc)); + DEFINE(KVM_VTTBR, offsetof(struct kvm, arch.vttbr)); +#endif return 0; } diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 0d1851ca6eb99..1c4937a220b7c 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -209,6 +209,7 @@ __dabt_svc: ENDPROC(__dabt_svc) .align 5 + .globl __irq_svc __irq_svc: svc_entry irq_handler diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 43a31fb06318d..33da40aac39ab 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -19,7 +19,11 @@ ALIGN_FUNCTION(); \ VMLINUX_SYMBOL(__idmap_text_start) = .; \ *(.idmap.text) \ - VMLINUX_SYMBOL(__idmap_text_end) = .; + VMLINUX_SYMBOL(__idmap_text_end) = .; \ + ALIGN_FUNCTION(); \ + VMLINUX_SYMBOL(__hyp_idmap_text_start) = .; \ + *(.hyp.idmap.text) \ + VMLINUX_SYMBOL(__hyp_idmap_text_end) = .; #ifdef CONFIG_HOTPLUG_CPU #define ARM_CPU_DISCARD(x) diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig new file mode 100644 index 0000000000000..7fa50d3df6294 --- /dev/null +++ b/arch/arm/kvm/Kconfig @@ -0,0 +1,45 @@ +# +# KVM configuration +# + +source "virt/kvm/Kconfig" + +menuconfig VIRTUALIZATION + bool "Virtualization" + ---help--- + Say Y here to get to see options for using your Linux host to run + other operating systems inside virtual machines (guests). + This option alone does not add any kernel code. + + If you say N, all options in this submenu will be skipped and + disabled. + +if VIRTUALIZATION + +config KVM + tristate "Kernel-based Virtual Machine (KVM) support" + select PREEMPT_NOTIFIERS + select ANON_INODES + select KVM_MMIO + depends on CPU_V7 && ARM_VIRT_EXT + ---help--- + Support hosting virtualized guest machines. You will also + need to select one or more of the processor modules below. + + This module provides access to the hardware capabilities through + a character device node named /dev/kvm. + + If unsure, say N. + +config KVM_ARM_HOST + bool "KVM host support for ARM cpus." + depends on KVM + depends on MMU + depends on CPU_V7 && ARM_VIRT_EXT + select MMU_NOTIFIER + ---help--- + Provides host support for ARM processors. + +source drivers/virtio/Kconfig + +endif # VIRTUALIZATION diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile new file mode 100644 index 0000000000000..5f40c5a5a2bda --- /dev/null +++ b/arch/arm/kvm/Makefile @@ -0,0 +1,17 @@ +# +# Makefile for Kernel-based Virtual Machine module +# + +ccflags-y += -Ivirt/kvm -Iarch/arm/kvm +CFLAGS_arm.o := -I. +CFLAGS_mmu.o := -I. + +AFLAGS_interrupts.o := -I$(obj) + +obj-$(CONFIG_KVM_ARM_HOST) += init.o interrupts.o exports.o + +kvm-arm-y += $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o) + +kvm-arm-y += arm.o guest.o mmu.o emulate.o reset.o + +obj-$(CONFIG_KVM) += kvm-arm.o diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c new file mode 100644 index 0000000000000..f3b206ac1fe55 --- /dev/null +++ b/arch/arm/kvm/arm.c @@ -0,0 +1,881 @@ +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall <c.dall@virtualopensystems.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/kvm_host.h> +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/fs.h> +#include <linux/mman.h> +#include <linux/sched.h> +#include <linux/kvm.h> +#include <trace/events/kvm.h> + +#define CREATE_TRACE_POINTS +#include "trace.h" + +#include <asm/cputype.h> +#include <asm/unified.h> +#include <asm/uaccess.h> +#include <asm/ptrace.h> +#include <asm/mman.h> +#include <asm/idmap.h> +#include <asm/tlbflush.h> +#include <asm/cacheflush.h> +#include <asm/cputype.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_emulate.h> + +static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); + +/* The VMID used in the VTTBR */ +static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); +static u8 kvm_next_vmid; +DEFINE_SPINLOCK(kvm_vmid_lock); + +int kvm_arch_hardware_enable(void *garbage) +{ + return 0; +} + +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) +{ + return 1; +} + +void kvm_arch_hardware_disable(void *garbage) +{ +} + +int kvm_arch_hardware_setup(void) +{ + return 0; +} + +void kvm_arch_hardware_unsetup(void) +{ +} + +void kvm_arch_check_processor_compat(void *rtn) +{ + *(int *)rtn = 0; +} + +void kvm_arch_sync_events(struct kvm *kvm) +{ +} + +/** + * kvm_arch_init_vm - initializes a VM data structure + * @kvm: pointer to the KVM struct + */ +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) +{ + int ret = 0; + + if (type) + return -EINVAL; + + ret = kvm_alloc_stage2_pgd(kvm); + if (ret) + goto out_fail_alloc; + mutex_init(&kvm->arch.pgd_mutex); + + ret = create_hyp_mappings(kvm, kvm + 1); + if (ret) + goto out_free_stage2_pgd; + + /* Mark the initial VMID generation invalid */ + kvm->arch.vmid_gen = 0; + + return ret; +out_free_stage2_pgd: + kvm_free_stage2_pgd(kvm); +out_fail_alloc: + return ret; +} + +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ +} + +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +{ + return 0; +} + +/** + * kvm_arch_destroy_vm - destroy the VM data structure + * @kvm: pointer to the KVM struct + */ +void kvm_arch_destroy_vm(struct kvm *kvm) +{ + int i; + + kvm_free_stage2_pgd(kvm); + + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + if (kvm->vcpus[i]) { + kvm_arch_vcpu_free(kvm->vcpus[i]); + kvm->vcpus[i] = NULL; + } + } +} + +int kvm_dev_ioctl_check_extension(long ext) +{ + int r; + switch (ext) { + case KVM_CAP_USER_MEMORY: + case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: + r = 1; + break; + case KVM_CAP_COALESCED_MMIO: + r = KVM_COALESCED_MMIO_PAGE_OFFSET; + break; + default: + r = 0; + break; + } + return r; +} + +long kvm_arch_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + return -EINVAL; +} + +int kvm_arch_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + return 0; +} + +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, + int user_alloc) +{ + return 0; +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ +} + +void kvm_arch_flush_shadow(struct kvm *kvm) +{ +} + +struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) +{ + int err; + struct kvm_vcpu *vcpu; + + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + if (!vcpu) { + err = -ENOMEM; + goto out; + } + + err = kvm_vcpu_init(vcpu, kvm, id); + if (err) + goto free_vcpu; + + err = create_hyp_mappings(vcpu, vcpu + 1); + if (err) + goto vcpu_uninit; + + return vcpu; +vcpu_uninit: + kvm_vcpu_uninit(vcpu); +free_vcpu: + kmem_cache_free(kvm_vcpu_cache, vcpu); +out: + return ERR_PTR(err); +} + +void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) +{ + kmem_cache_free(kvm_vcpu_cache, vcpu); +} + +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + kvm_arch_vcpu_free(vcpu); +} + +int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) +{ + return 0; +} + +int __attribute_const__ kvm_target_cpu(void) +{ + unsigned int midr; + + midr = read_cpuid_id(); + switch ((midr >> 4) & 0xfff) { + case CORTEX_A15: + return CORTEX_A15; + default: + return -EINVAL; + } +} + +int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) +{ + int ret; + + ret = kvm_reset_vcpu(vcpu); + if (ret < 0) + return ret; + + return 0; +} + +void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) +{ +} + +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + vcpu->cpu = cpu; + + /* + * Check whether this vcpu requires the cache to be flushed on + * this physical CPU. This is a consequence of doing dcache + * operations by set/way on this vcpu. We do it here in order + * to be in a non-preemptible section. + */ + if (cpumask_test_and_clear_cpu(cpu, &vcpu->arch.require_dcache_flush)) + flush_cache_all(); /* We'd really want v7_flush_dcache_all() */ +} + +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) +{ +} + +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) +{ + return -EINVAL; +} + + +int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + return -EINVAL; +} + +int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + return -EINVAL; +} + +/** + * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled + * @v: The VCPU pointer + * + * If the guest CPU is not waiting for interrupts (or waiting and + * an interrupt is pending) then it is by definition runnable. + */ +int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) +{ + return !!v->arch.irq_lines || + !v->arch.wait_for_interrupts; +} + +int kvm_arch_vcpu_in_guest_mode(struct kvm_vcpu *v) +{ + return v->mode == IN_GUEST_MODE; +} + +static void reset_vm_context(void *info) +{ + __kvm_flush_vm_context(); +} + +/** + * check_new_vmid_gen - check that the VMID is still valid + * @kvm: The VM's VMID to checkt + * + * return true if there is a new generation of VMIDs being used + * + * The hardware supports only 256 values with the value zero reserved for the + * host, so we check if an assigned value belongs to a previous generation, + * which which requires us to assign a new value. If we're the first to use a + * VMID for the new generation, we must flush necessary caches and TLBs on all + * CPUs. + */ +static bool check_new_vmid_gen(struct kvm *kvm) +{ + return unlikely(kvm->arch.vmid_gen != atomic64_read(&kvm_vmid_gen)); +} + +/** + * update_vttbr - Update the VTTBR with a valid VMID before the guest runs + * @kvm The guest that we are about to run + * + * Called from kvm_arch_vcpu_ioctl_run before entering the guest to ensure the + * VM has a valid VMID, otherwise assigns a new one and flushes corresponding + * caches and TLBs. + */ +static void update_vttbr(struct kvm *kvm) +{ + phys_addr_t pgd_phys; + + if (!check_new_vmid_gen(kvm)) + return; + + spin_lock(&kvm_vmid_lock); + + /* First user of a new VMID generation? */ + if (unlikely(kvm_next_vmid == 0)) { + atomic64_inc(&kvm_vmid_gen); + kvm_next_vmid = 1; + + /* This does nothing on UP */ + smp_call_function(reset_vm_context, NULL, 1); + + /* + * On SMP we know no other CPUs can use this CPU's or + * each other's VMID since the kvm_vmid_lock blocks + * them from reentry to the guest. + */ + + reset_vm_context(NULL); + } + + kvm->arch.vmid_gen = atomic64_read(&kvm_vmid_gen); + kvm->arch.vmid = kvm_next_vmid; + kvm_next_vmid++; + + /* update vttbr to be used with the new vmid */ + pgd_phys = virt_to_phys(kvm->arch.pgd); + kvm->arch.vttbr = pgd_phys & ((1LLU << 40) - 1) + & ~((2 << VTTBR_X) - 1); + kvm->arch.vttbr |= (u64)(kvm->arch.vmid) << 48; + + spin_unlock(&kvm_vmid_lock); +} + +static int handle_svc_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + /* SVC called from Hyp mode should never get here */ + kvm_debug("SVC called from Hyp mode shouldn't go here\n"); + BUG(); + return -EINVAL; /* Squash warning */ +} + +static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + /* + * Guest called HVC instruction: + * Let it know we don't want that by injecting an undefined exception. + */ + kvm_debug("hvc: %x (at %08x)", vcpu->arch.hsr & ((1 << 16) - 1), + vcpu->arch.regs.pc); + kvm_debug(" HSR: %8x", vcpu->arch.hsr); + kvm_inject_undefined(vcpu); + return 0; +} + +static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + /* We don't support SMC; don't do that. */ + kvm_debug("smc: at %08x", vcpu->arch.regs.pc); + return -EINVAL; +} + +static int handle_pabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + /* The hypervisor should never cause aborts */ + kvm_err("Prefetch Abort taken from Hyp mode at %#08x (HSR: %#08x)\n", + vcpu->arch.hifar, vcpu->arch.hsr); + return -EFAULT; +} + +static int handle_dabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + /* This is either an error in the ws. code or an external abort */ + kvm_err("Data Abort taken from Hyp mode at %#08x (HSR: %#08x)\n", + vcpu->arch.hdfar, vcpu->arch.hsr); + return -EFAULT; +} + +typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *); +static exit_handle_fn arm_exit_handlers[] = { + [HSR_EC_WFI] = kvm_handle_wfi, + [HSR_EC_CP15_32] = kvm_handle_cp15_32, + [HSR_EC_CP15_64] = kvm_handle_cp15_64, + [HSR_EC_CP14_MR] = kvm_handle_cp14_access, + [HSR_EC_CP14_LS] = kvm_handle_cp14_load_store, + [HSR_EC_CP14_64] = kvm_handle_cp14_access, + [HSR_EC_CP_0_13] = kvm_handle_cp_0_13_access, + [HSR_EC_CP10_ID] = kvm_handle_cp10_id, + [HSR_EC_SVC_HYP] = handle_svc_hyp, + [HSR_EC_HVC] = handle_hvc, + [HSR_EC_SMC] = handle_smc, + [HSR_EC_IABT] = kvm_handle_guest_abort, + [HSR_EC_IABT_HYP] = handle_pabt_hyp, + [HSR_EC_DABT] = kvm_handle_guest_abort, + [HSR_EC_DABT_HYP] = handle_dabt_hyp, +}; + +/* + * Return 0 to return to guest, < 0 on error, exit_reason ( > 0) on proper + * exit to QEMU. + */ +static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, + int exception_index) +{ + unsigned long hsr_ec; + + switch (exception_index) { + case ARM_EXCEPTION_IRQ: + vcpu->stat.irq_exits++; + return 0; + case ARM_EXCEPTION_UNDEFINED: + kvm_err("Undefined exception in Hyp mode at: %#08x\n", + vcpu->arch.hyp_pc); + BUG(); + panic("KVM: Hypervisor undefined exception!\n"); + case ARM_EXCEPTION_DATA_ABORT: + case ARM_EXCEPTION_PREF_ABORT: + case ARM_EXCEPTION_HVC: + hsr_ec = (vcpu->arch.hsr & HSR_EC) >> HSR_EC_SHIFT; + + if (hsr_ec >= ARRAY_SIZE(arm_exit_handlers) + || !arm_exit_handlers[hsr_ec]) { + kvm_err("Unkown exception class: %#08lx, " + "hsr: %#08x\n", hsr_ec, + (unsigned int)vcpu->arch.hsr); + BUG(); + } + + return arm_exit_handlers[hsr_ec](vcpu, run); + default: + kvm_pr_unimpl("Unsupported exception type: %d", + exception_index); + return -EINVAL; + } +} + +/* + * Return 0 to proceed with guest entry + */ +static int vcpu_pre_guest_enter(struct kvm_vcpu *vcpu, int *exit_reason) +{ + if (signal_pending(current)) { + *exit_reason = KVM_EXIT_INTR; + return -EINTR; + } + + if (check_new_vmid_gen(vcpu->kvm)) + return 1; + + BUG_ON(__vcpu_mode(*vcpu_cpsr(vcpu)) == 0xf); + + return 0; +} + +/** + * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code + * @vcpu: The VCPU pointer + * @run: The kvm_run structure pointer used for userspace state exchange + * + * This function is called through the VCPU_RUN ioctl called from user space. It + * will execute VM code in a loop until the time slice for the process is used + * or some emulation is needed from user space in which case the function will + * return with return value 0 and with the kvm_run structure filled in with the + * required data for the requested emulation. + */ +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + int ret = 0; + int exit_reason; + sigset_t sigsaved; + + if (run->exit_reason == KVM_EXIT_MMIO) { + ret = kvm_handle_mmio_return(vcpu, vcpu->run); + if (ret) + return ret; + } + + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + + exit_reason = KVM_EXIT_UNKNOWN; + while (exit_reason == KVM_EXIT_UNKNOWN) { + /* + * Check conditions before entering the guest + */ + cond_resched(); + + if (vcpu->arch.wait_for_interrupts) + kvm_vcpu_block(vcpu); + + update_vttbr(vcpu->kvm); + + /* + * Make sure preemption is disabled while calling handle_exit + * as exit handling touches CPU-specific resources, such as + * caches, and we must stay on the same CPU. + * + * Code that might sleep must disable preemption and access + * CPU-specific resources first. + */ + preempt_disable(); + local_irq_disable(); + + /* Re-check atomic conditions */ + ret = vcpu_pre_guest_enter(vcpu, &exit_reason); + if (ret != 0) { + local_irq_enable(); + preempt_enable(); + continue; + } + + /************************************************************** + * Enter the guest + */ + trace_kvm_entry(vcpu->arch.regs.pc); + kvm_guest_enter(); + vcpu->mode = IN_GUEST_MODE; + + ret = __kvm_vcpu_run(vcpu); + + vcpu->mode = OUTSIDE_GUEST_MODE; + vcpu->stat.exits++; + kvm_guest_exit(); + trace_kvm_exit(vcpu->arch.regs.pc); + local_irq_enable(); + + /* + * Back from guest + *************************************************************/ + + ret = handle_exit(vcpu, run, ret); + preempt_enable(); + if (ret < 0) { + kvm_err("Error in handle_exit\n"); + break; + } else { + exit_reason = ret; /* 0 == KVM_EXIT_UNKNOWN */ + } + } + + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + run->exit_reason = exit_reason; + return ret; +} + +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level) +{ + unsigned int vcpu_idx; + struct kvm_vcpu *vcpu; + unsigned long *ptr; + bool set; + int bit_nr; + + vcpu_idx = irq_level->irq >> 1; + if (vcpu_idx >= KVM_MAX_VCPUS) + return -EINVAL; + + vcpu = kvm_get_vcpu(kvm, vcpu_idx); + if (!vcpu) + return -EINVAL; + + trace_kvm_set_irq(irq_level->irq, irq_level->level, 0); + + if ((irq_level->irq & 1) == KVM_ARM_IRQ_LINE) + bit_nr = HCR_VI_BIT_NR; + else /* KVM_ARM_FIQ_LINE */ + bit_nr = HCR_VF_BIT_NR; + + ptr = (unsigned long *)&vcpu->arch.irq_lines; + if (irq_level->level) + set = test_and_set_bit(bit_nr, ptr); + else + set = test_and_clear_bit(bit_nr, ptr); + + /* + * If we didn't change anything, no need to wake up or kick other CPUs + */ + if (!!set == !!irq_level->level) + return 0; + + /* + * The vcpu irq_lines field was updated, wake up sleeping VCPUs and + * trigger a world-switch round on the running physical CPU to set the + * virtual IRQ/FIQ fields in the HCR appropriately. + */ + if (irq_level->level) + vcpu->arch.wait_for_interrupts = 0; + kvm_vcpu_kick(vcpu); + + return 0; +} + +long kvm_arch_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + return -EINVAL; +} + +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +{ + return -EINVAL; +} + +long kvm_arch_vm_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + return -EINVAL; +} + +static void cpu_set_vector(void *vector) +{ + unsigned long vector_ptr; + unsigned long smc_hyp_nr; + + vector_ptr = (unsigned long)vector; + smc_hyp_nr = SMCHYP_HVBAR_W; + + /* + * Set the HVBAR + */ + asm volatile ( + "mov r0, %[vector_ptr]\n\t" + "mov r7, %[smc_hyp_nr]\n\t" + "smc #0\n\t" : : + [vector_ptr] "r" (vector_ptr), + [smc_hyp_nr] "r" (smc_hyp_nr) : + "r0", "r7"); +} + +static void cpu_init_hyp_mode(void *vector) +{ + unsigned long pgd_ptr; + unsigned long hyp_stack_ptr; + unsigned long stack_page; + + cpu_set_vector(vector); + + pgd_ptr = virt_to_phys(hyp_pgd); + stack_page = __get_cpu_var(kvm_arm_hyp_stack_page); + hyp_stack_ptr = stack_page + PAGE_SIZE; + + /* + * Call initialization code + */ + asm volatile ( + "mov r0, %[pgd_ptr]\n\t" + "mov r1, %[hyp_stack_ptr]\n\t" + "hvc #0\n\t" : : + [pgd_ptr] "r" (pgd_ptr), + [hyp_stack_ptr] "r" (hyp_stack_ptr) : + "r0", "r1"); +} + +/** + * Inits Hyp-mode on all online CPUs + */ +static int init_hyp_mode(void) +{ + phys_addr_t init_phys_addr; + int cpu; + int err = 0; + + /* + * Allocate stack pages for Hypervisor-mode + */ + for_each_possible_cpu(cpu) { + unsigned long stack_page; + + stack_page = __get_free_page(GFP_KERNEL); + if (!stack_page) { + err = -ENOMEM; + goto out_free_stack_pages; + } + + per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page; + } + + /* + * Execute the init code on each CPU. + * + * Note: The stack is not mapped yet, so don't do anything else than + * initializing the hypervisor mode on each CPU using a local stack + * space for temporary storage. + */ + init_phys_addr = virt_to_phys(__kvm_hyp_init); + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, cpu_init_hyp_mode, + (void *)(long)init_phys_addr, 1); + } + + /* + * Unmap the identity mapping + */ + hyp_idmap_teardown(); + + /* + * Map the Hyp-code called directly from the host + */ + err = create_hyp_mappings(__kvm_hyp_code_start, __kvm_hyp_code_end); + if (err) { + kvm_err("Cannot map world-switch code\n"); + goto out_free_mappings; + } + + /* + * Map the Hyp stack pages + */ + for_each_possible_cpu(cpu) { + char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu); + err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE); + + if (err) { + kvm_err("Cannot map hyp stack\n"); + goto out_free_mappings; + } + } + + /* + * Set the HVBAR to the virtual kernel address + */ + for_each_online_cpu(cpu) + smp_call_function_single(cpu, cpu_set_vector, + __kvm_hyp_vector, 1); + + return 0; +out_free_mappings: + free_hyp_pmds(); +out_free_stack_pages: + for_each_possible_cpu(cpu) + free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); + return err; +} + +/** + * Initialize Hyp-mode and memory mappings on all CPUs. + */ +int kvm_arch_init(void *opaque) +{ + int err; + + if (kvm_target_cpu() < 0) { + kvm_err("Target CPU not supported!\n"); + return -ENODEV; + } + + err = init_hyp_mode(); + if (err) + goto out_err; + + return 0; +out_err: + return err; +} + +static void cpu_exit_hyp_mode(void *vector) +{ + cpu_set_vector(vector); + + /* + * Disable Hyp-MMU for each cpu + */ + asm volatile ("hvc #0"); +} + +static int exit_hyp_mode(void) +{ + phys_addr_t exit_phys_addr; + int cpu; + + /* + * TODO: flush Hyp TLB in case idmap code overlaps. + * Note that we should do this in the monitor code when switching the + * HVBAR, but this is going away and should be rather done in the Hyp + * mode change of HVBAR. + */ + hyp_idmap_setup(); + exit_phys_addr = virt_to_phys(__kvm_hyp_exit); + BUG_ON(exit_phys_addr & 0x1f); + + /* + * Execute the exit code on each CPU. + * + * Note: The stack is not mapped yet, so don't do anything else than + * initializing the hypervisor mode on each CPU using a local stack + * space for temporary storage. + */ + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, cpu_exit_hyp_mode, + (void *)(long)exit_phys_addr, 1); + } + + return 0; +} + +void kvm_arch_exit(void) +{ + int cpu; + + exit_hyp_mode(); + + free_hyp_pmds(); + for_each_possible_cpu(cpu) + free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); +} + +static int arm_init(void) +{ + int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + return rc; +} + +static void __exit arm_exit(void) +{ + kvm_exit(); +} + +module_init(arm_init); +module_exit(arm_exit) diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c new file mode 100644 index 0000000000000..564add20f2e02 --- /dev/null +++ b/arch/arm/kvm/emulate.c @@ -0,0 +1,873 @@ +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall <c.dall@virtualopensystems.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <linux/mm.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_host.h> +#include <asm/kvm_emulate.h> +#include <trace/events/kvm.h> + +#include "trace.h" + +#define REG_OFFSET(_reg) \ + (offsetof(struct kvm_vcpu_regs, _reg) / sizeof(u32)) + +#define USR_REG_OFFSET(_num) REG_OFFSET(usr_regs[_num]) + +static const unsigned long vcpu_reg_offsets[MODE_SYS + 1][16] = { + /* FIQ Registers */ + { + USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), + USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), + USR_REG_OFFSET(6), USR_REG_OFFSET(7), + REG_OFFSET(fiq_regs[1]), /* r8 */ + REG_OFFSET(fiq_regs[1]), /* r9 */ + REG_OFFSET(fiq_regs[2]), /* r10 */ + REG_OFFSET(fiq_regs[3]), /* r11 */ + REG_OFFSET(fiq_regs[4]), /* r12 */ + REG_OFFSET(fiq_regs[5]), /* r13 */ + REG_OFFSET(fiq_regs[6]), /* r14 */ + REG_OFFSET(pc) /* r15 */ + }, + + /* IRQ Registers */ + { + USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), + USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), + USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8), + USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11), + USR_REG_OFFSET(12), + REG_OFFSET(irq_regs[0]), /* r13 */ + REG_OFFSET(irq_regs[1]), /* r14 */ + REG_OFFSET(pc) /* r15 */ + }, + + /* SVC Registers */ + { + USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), + USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), + USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8), + USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11), + USR_REG_OFFSET(12), + REG_OFFSET(svc_regs[0]), /* r13 */ + REG_OFFSET(svc_regs[1]), /* r14 */ + REG_OFFSET(pc) /* r15 */ + }, + + /* ABT Registers */ + { + USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), + USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), + USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8), + USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11), + USR_REG_OFFSET(12), + REG_OFFSET(abt_regs[0]), /* r13 */ + REG_OFFSET(abt_regs[1]), /* r14 */ + REG_OFFSET(pc) /* r15 */ + }, + + /* UND Registers */ + { + USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), + USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), + USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8), + USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11), + USR_REG_OFFSET(12), + REG_OFFSET(und_regs[0]), /* r13 */ + REG_OFFSET(und_regs[1]), /* r14 */ + REG_OFFSET(pc) /* r15 */ + }, + + /* USR Registers */ + { + USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), + USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), + USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8), + USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11), + USR_REG_OFFSET(12), + REG_OFFSET(usr_regs[13]), /* r13 */ + REG_OFFSET(usr_regs[14]), /* r14 */ + REG_OFFSET(pc) /* r15 */ + }, + + /* SYS Registers */ + { + USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), + USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), + USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8), + USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11), + USR_REG_OFFSET(12), + REG_OFFSET(usr_regs[13]), /* r13 */ + REG_OFFSET(usr_regs[14]), /* r14 */ + REG_OFFSET(pc) /* r15 */ + }, +}; + +/* + * Return a pointer to the register number valid in the specified mode of + * the virtual CPU. + */ +u32 *vcpu_reg_mode(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode) +{ + u32 *reg_array = (u32 *)&vcpu->arch.regs; + + BUG_ON(reg_num > 15); + BUG_ON(mode > MODE_SYS); + + return reg_array + vcpu_reg_offsets[mode][reg_num]; +} + +/****************************************************************************** + * Utility functions common for all emulation code + *****************************************************************************/ + +/* + * This one accepts a matrix where the first element is the + * bits as they must be, and the second element is the bitmask. + */ +#define INSTR_NONE -1 +static int kvm_instr_index(u32 instr, u32 table[][2], int table_entries) +{ + int i; + u32 mask; + + for (i = 0; i < table_entries; i++) { + mask = table[i][1]; + if ((table[i][0] & mask) == (instr & mask)) + return i; + } + return INSTR_NONE; +} + +/****************************************************************************** + * Co-processor emulation + *****************************************************************************/ + +struct coproc_params { + unsigned long CRn; + unsigned long CRm; + unsigned long Op1; + unsigned long Op2; + unsigned long Rt1; + unsigned long Rt2; + bool is_64bit; + bool is_write; +}; + +static void print_cp_instr(const struct coproc_params *p) +{ + /* Look, we even formatted it for you to paste into the table! */ + if (p->is_64bit) { + kvm_err("{ CRn(DF), CRm(%2lu), Op1(%2lu), Op2(DF), is64, %-6s" + " func, arg},\n", + p->CRm, p->Op1, p->is_write ? "WRITE," : "READ,"); + } else { + kvm_err("{ CRn(%2lu), CRm(%2lu), Op1(%2lu), Op2(%2lu), is32," + " %-6s func, arg},\n", + p->CRn, p->CRm, p->Op1, p->Op2, + p->is_write ? "WRITE," : "READ,"); + } +} + +int kvm_handle_cp10_id(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + kvm_inject_undefined(vcpu); + return 0; +} + +int kvm_handle_cp_0_13_access(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + kvm_inject_undefined(vcpu); + return 0; +} + +int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + kvm_inject_undefined(vcpu); + return 0; +} + +int kvm_handle_cp14_access(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + kvm_inject_undefined(vcpu); + return 0; +} + +static bool ignore_write(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + unsigned long arg) +{ + if (arg) + trace_kvm_emulate_cp15_imp(p->Op1, p->Rt1, p->CRn, p->CRm, + p->Op2, p->is_write); + return true; +} + +static bool read_zero(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + unsigned long arg) +{ + if (arg) + trace_kvm_emulate_cp15_imp(p->Op1, p->Rt1, p->CRn, p->CRm, + p->Op2, p->is_write); + *vcpu_reg(vcpu, p->Rt1) = 0; + return true; +} + +static bool read_l2ctlr(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + unsigned long arg) +{ + u32 l2ctlr, ncores; + + switch (kvm_target_cpu()) { + case CORTEX_A15: + asm volatile("mrc p15, 1, %0, c9, c0, 2\n" : "=r" (l2ctlr)); + l2ctlr &= ~(3 << 24); + ncores = atomic_read(&vcpu->kvm->online_vcpus) - 1; + l2ctlr |= (ncores & 3) << 24; + *vcpu_reg(vcpu, p->Rt1) = l2ctlr; + return true; + default: + return false; + } +} + +static bool write_l2ctlr(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + unsigned long arg) +{ + return false; +} + +static bool access_l2ectlr(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + unsigned long arg) +{ + switch (kvm_target_cpu()) { + case CORTEX_A15: + if (!p->is_write) + *vcpu_reg(vcpu, p->Rt1) = 0; + return true; + default: + return false; + } +} + +static bool read_actlr(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + unsigned long arg) +{ + u32 actlr; + + switch (kvm_target_cpu()) { + case CORTEX_A15: + asm volatile("mrc p15, 0, %0, c1, c0, 1\n" : "=r" (actlr)); + /* Make the SMP bit consistent with the guest configuration */ + if (atomic_read(&vcpu->kvm->online_vcpus) > 1) + actlr |= 1U << 6; + else + actlr &= ~(1U << 6); + *vcpu_reg(vcpu, p->Rt1) = actlr; + break; + default: + asm volatile("mrc p15, 0, %0, c1, c0, 1\n" : "=r" (actlr)); + *vcpu_reg(vcpu, p->Rt1) = actlr; + break; + } + + return true; +} + +static bool write_dcsw(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + unsigned long cp15_reg) +{ + u32 val; + + val = *vcpu_reg(vcpu, p->Rt1); + + switch (p->CRm) { + case 6: /* Upgrade DCISW to DCCISW, as per HCR.SWIO */ + case 14: /* DCCISW */ + asm volatile("mcr p15, 0, %0, c7, c14, 2" : : "r" (val)); + break; + + case 10: /* DCCSW */ + asm volatile("mcr p15, 0, %0, c7, c10, 2" : : "r" (val)); + break; + } + + cpumask_setall(&vcpu->arch.require_dcache_flush); + cpumask_clear_cpu(vcpu->cpu, &vcpu->arch.require_dcache_flush); + + return true; +} + +static bool access_cp15_reg(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + unsigned long cp15_reg) +{ + if (p->is_write) + vcpu->arch.cp15[cp15_reg] = *vcpu_reg(vcpu, p->Rt1); + else + *vcpu_reg(vcpu, p->Rt1) = vcpu->arch.cp15[cp15_reg]; + return true; +} + +/* Any field which is 0xFFFFFFFF == DF */ +struct coproc_emulate { + unsigned long CRn; + unsigned long CRm; + unsigned long Op1; + unsigned long Op2; + + unsigned long is_64; + unsigned long is_w; + + bool (*f)(struct kvm_vcpu *, + const struct coproc_params *, + unsigned long); + unsigned long arg; +}; + +#define DF (-1UL) /* Default: If nothing else fits, use this one */ +#define CRn(_x) .CRn = _x +#define CRm(_x) .CRm = _x +#define Op1(_x) .Op1 = _x +#define Op2(_x) .Op2 = _x +#define is64 .is_64 = true +#define is32 .is_64 = false +#define READ .is_w = false +#define WRITE .is_w = true +#define RW .is_w = DF + +static const struct coproc_emulate coproc_emulate[] = { + /* + * ACTRL access: + * + * Ignore writes, and read returns the host settings. + */ + { CRn( 1), CRm( 0), Op1( 0), Op2( 1), is32, WRITE, ignore_write}, + { CRn( 1), CRm( 0), Op1( 0), Op2( 1), is32, READ, read_actlr}, + /* + * DC{C,I,CI}SW operations: + */ + { CRn( 7), CRm( 6), Op1( 0), Op2( 2), is32, WRITE, write_dcsw}, + { CRn( 7), CRm( 6), Op1( 0), Op2( 2), is32, READ, read_zero}, + { CRn( 7), CRm(10), Op1( 0), Op2( 2), is32, WRITE, write_dcsw}, + { CRn( 7), CRm(10), Op1( 0), Op2( 2), is32, READ, read_zero}, + { CRn( 7), CRm(14), Op1( 0), Op2( 2), is32, WRITE, write_dcsw}, + { CRn( 7), CRm(14), Op1( 0), Op2( 2), is32, READ, read_zero}, + /* + * L2CTLR access (guest wants to know #CPUs). + * + * FIXME: Hack Alert: Read zero as default case. + */ + { CRn( 9), CRm( 0), Op1( 1), Op2( 2), is32, READ, read_l2ctlr}, + { CRn( 9), CRm( 0), Op1( 1), Op2( 2), is32, WRITE, write_l2ctlr}, + { CRn( 9), CRm( 0), Op1( 1), Op2( 3), is32, READ, access_l2ectlr}, + { CRn( 9), CRm(DF), Op1(DF), Op2(DF), is32, WRITE, ignore_write}, + { CRn( 9), CRm(DF), Op1(DF), Op2(DF), is32, READ, read_zero}, + + /* + * These CRn == 10 entries may not need to exist - if we can + * ignore guest attempts to tamper with TLB lockdowns then it + * should be enough to store/restore the host/guest PRRR and + * NMRR memory remap registers and allow guest direct access + * to these registers. + * + * TLB Lockdown operations - ignored + */ + { CRn(10), CRm( 0), Op1(DF), Op2(DF), is32, WRITE, ignore_write}, + { CRn(10), CRm( 2), Op1( 0), Op2( 0), is32, RW, access_cp15_reg, + c10_PRRR}, + { CRn(10), CRm( 2), Op1( 0), Op2( 1), is32, RW, access_cp15_reg, + c10_NMRR}, + + /* + * The CP15 c15 register is architecturally implementation + * defined, but some guest kernels attempt to read/write a + * diagnostics register here. We always return 0 and ignore + * writes and hope for the best. + */ + { CRn(15), CRm(DF), Op1(DF), Op2(DF), is32, WRITE, ignore_write, 1}, + { CRn(15), CRm(DF), Op1(DF), Op2(DF), is32, READ, read_zero, 1}, +}; + +#undef is64 +#undef is32 +#undef READ +#undef WRITE +#undef RW + +static inline bool match(unsigned long val, unsigned long param) +{ + return param == DF || val == param; +} + +static int emulate_cp15(struct kvm_vcpu *vcpu, + const struct coproc_params *params) +{ + unsigned long instr_len, i; + + for (i = 0; i < ARRAY_SIZE(coproc_emulate); i++) { + const struct coproc_emulate *e = &coproc_emulate[i]; + + if (!match(params->is_64bit, e->is_64)) + continue; + if (!match(params->is_write, e->is_w)) + continue; + if (!match(params->CRn, e->CRn)) + continue; + if (!match(params->CRm, e->CRm)) + continue; + if (!match(params->Op1, e->Op1)) + continue; + if (!match(params->Op2, e->Op2)) + continue; + + /* If function fails, it should complain. */ + if (!e->f(vcpu, params, e->arg)) + goto undef; + + /* Skip instruction, since it was emulated */ + instr_len = ((vcpu->arch.hsr >> 25) & 1) ? 4 : 2; + *vcpu_pc(vcpu) += instr_len; + kvm_adjust_itstate(vcpu); + return 0; + } + + kvm_err("Unsupported guest CP15 access at: %08x\n", + vcpu->arch.regs.pc); + print_cp_instr(params); +undef: + kvm_inject_undefined(vcpu); + return 0; +} + +/** + * kvm_handle_cp15_64 -- handles a mrrc/mcrr trap on a guest CP15 access + * @vcpu: The VCPU pointer + * @run: The kvm_run struct + */ +int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + struct coproc_params params; + + params.CRm = (vcpu->arch.hsr >> 1) & 0xf; + params.Rt1 = (vcpu->arch.hsr >> 5) & 0xf; + params.is_write = ((vcpu->arch.hsr & 1) == 0); + params.is_64bit = true; + + params.Op1 = (vcpu->arch.hsr >> 16) & 0xf; + params.Op2 = 0; + params.Rt2 = (vcpu->arch.hsr >> 10) & 0xf; + params.CRn = 0; + + return emulate_cp15(vcpu, ¶ms); +} + +/** + * kvm_handle_cp15_32 -- handles a mrc/mcr trap on a guest CP15 access + * @vcpu: The VCPU pointer + * @run: The kvm_run struct + */ +int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + struct coproc_params params; + + params.CRm = (vcpu->arch.hsr >> 1) & 0xf; + params.Rt1 = (vcpu->arch.hsr >> 5) & 0xf; + params.is_write = ((vcpu->arch.hsr & 1) == 0); + params.is_64bit = false; + + params.CRn = (vcpu->arch.hsr >> 10) & 0xf; + params.Op1 = (vcpu->arch.hsr >> 14) & 0x7; + params.Op2 = (vcpu->arch.hsr >> 17) & 0x7; + params.Rt2 = 0; + + return emulate_cp15(vcpu, ¶ms); +} + +/** + * kvm_handle_wfi - handle a wait-for-interrupts instruction executed by a guest + * @vcpu: the vcpu pointer + * @run: the kvm_run structure pointer + * + * Simply sets the wait_for_interrupts flag on the vcpu structure, which will + * halt execution of world-switches and schedule other host processes until + * there is an incoming IRQ or FIQ to the VM. + */ +int kvm_handle_wfi(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + trace_kvm_wfi(vcpu->arch.regs.pc); + vcpu->stat.wfi_exits++; + if (!vcpu->arch.irq_lines) + vcpu->arch.wait_for_interrupts = 1; + return 0; +} + + +/****************************************************************************** + * Load-Store instruction emulation + *****************************************************************************/ + +/* + * Must be ordered with LOADS first and WRITES afterwards + * for easy distinction when doing MMIO. + */ +#define NUM_LD_INSTR 9 +enum INSTR_LS_INDEXES { + INSTR_LS_LDRBT, INSTR_LS_LDRT, INSTR_LS_LDR, INSTR_LS_LDRB, + INSTR_LS_LDRD, INSTR_LS_LDREX, INSTR_LS_LDRH, INSTR_LS_LDRSB, + INSTR_LS_LDRSH, + INSTR_LS_STRBT, INSTR_LS_STRT, INSTR_LS_STR, INSTR_LS_STRB, + INSTR_LS_STRD, INSTR_LS_STREX, INSTR_LS_STRH, + NUM_LS_INSTR +}; + +static u32 ls_instr[NUM_LS_INSTR][2] = { + {0x04700000, 0x0d700000}, /* LDRBT */ + {0x04300000, 0x0d700000}, /* LDRT */ + {0x04100000, 0x0c500000}, /* LDR */ + {0x04500000, 0x0c500000}, /* LDRB */ + {0x000000d0, 0x0e1000f0}, /* LDRD */ + {0x01900090, 0x0ff000f0}, /* LDREX */ + {0x001000b0, 0x0e1000f0}, /* LDRH */ + {0x001000d0, 0x0e1000f0}, /* LDRSB */ + {0x001000f0, 0x0e1000f0}, /* LDRSH */ + {0x04600000, 0x0d700000}, /* STRBT */ + {0x04200000, 0x0d700000}, /* STRT */ + {0x04000000, 0x0c500000}, /* STR */ + {0x04400000, 0x0c500000}, /* STRB */ + {0x000000f0, 0x0e1000f0}, /* STRD */ + {0x01800090, 0x0ff000f0}, /* STREX */ + {0x000000b0, 0x0e1000f0} /* STRH */ +}; + +static inline int get_arm_ls_instr_index(u32 instr) +{ + return kvm_instr_index(instr, ls_instr, NUM_LS_INSTR); +} + +/* + * Load-Store instruction decoding + */ +#define INSTR_LS_TYPE_BIT 26 +#define INSTR_LS_RD_MASK 0x0000f000 +#define INSTR_LS_RD_SHIFT 12 +#define INSTR_LS_RN_MASK 0x000f0000 +#define INSTR_LS_RN_SHIFT 16 +#define INSTR_LS_RM_MASK 0x0000000f +#define INSTR_LS_OFFSET12_MASK 0x00000fff + +#define INSTR_LS_BIT_P 24 +#define INSTR_LS_BIT_U 23 +#define INSTR_LS_BIT_B 22 +#define INSTR_LS_BIT_W 21 +#define INSTR_LS_BIT_L 20 +#define INSTR_LS_BIT_S 6 +#define INSTR_LS_BIT_H 5 + +/* + * ARM addressing mode defines + */ +#define OFFSET_IMM_MASK 0x0e000000 +#define OFFSET_IMM_VALUE 0x04000000 +#define OFFSET_REG_MASK 0x0e000ff0 +#define OFFSET_REG_VALUE 0x06000000 +#define OFFSET_SCALE_MASK 0x0e000010 +#define OFFSET_SCALE_VALUE 0x06000000 + +#define SCALE_SHIFT_MASK 0x000000a0 +#define SCALE_SHIFT_SHIFT 5 +#define SCALE_SHIFT_LSL 0x0 +#define SCALE_SHIFT_LSR 0x1 +#define SCALE_SHIFT_ASR 0x2 +#define SCALE_SHIFT_ROR_RRX 0x3 +#define SCALE_SHIFT_IMM_MASK 0x00000f80 +#define SCALE_SHIFT_IMM_SHIFT 6 + +#define PSR_BIT_C 29 + +static unsigned long ls_word_calc_offset(struct kvm_vcpu *vcpu, + unsigned long instr) +{ + int offset = 0; + + if ((instr & OFFSET_IMM_MASK) == OFFSET_IMM_VALUE) { + /* Immediate offset/index */ + offset = instr & INSTR_LS_OFFSET12_MASK; + + if (!(instr & (1U << INSTR_LS_BIT_U))) + offset = -offset; + } + + if ((instr & OFFSET_REG_MASK) == OFFSET_REG_VALUE) { + /* Register offset/index */ + u8 rm = instr & INSTR_LS_RM_MASK; + offset = *vcpu_reg(vcpu, rm); + + if (!(instr & (1U << INSTR_LS_BIT_P))) + offset = 0; + } + + if ((instr & OFFSET_SCALE_MASK) == OFFSET_SCALE_VALUE) { + /* Scaled register offset */ + u8 rm = instr & INSTR_LS_RM_MASK; + u8 shift = (instr & SCALE_SHIFT_MASK) >> SCALE_SHIFT_SHIFT; + u32 shift_imm = (instr & SCALE_SHIFT_IMM_MASK) + >> SCALE_SHIFT_IMM_SHIFT; + offset = *vcpu_reg(vcpu, rm); + + switch (shift) { + case SCALE_SHIFT_LSL: + offset = offset << shift_imm; + break; + case SCALE_SHIFT_LSR: + if (shift_imm == 0) + offset = 0; + else + offset = ((u32)offset) >> shift_imm; + break; + case SCALE_SHIFT_ASR: + if (shift_imm == 0) { + if (offset & (1U << 31)) + offset = 0xffffffff; + else + offset = 0; + } else { + /* Ensure arithmetic shift */ + asm("mov %[r], %[op], ASR %[s]" : + [r] "=r" (offset) : + [op] "r" (offset), [s] "r" (shift_imm)); + } + break; + case SCALE_SHIFT_ROR_RRX: + if (shift_imm == 0) { + u32 C = (vcpu->arch.regs.cpsr & + (1U << PSR_BIT_C)); + offset = (C << 31) | offset >> 1; + } else { + /* Ensure arithmetic shift */ + asm("mov %[r], %[op], ASR %[s]" : + [r] "=r" (offset) : + [op] "r" (offset), [s] "r" (shift_imm)); + } + break; + } + + if (instr & (1U << INSTR_LS_BIT_U)) + return offset; + else + return -offset; + } + + if (instr & (1U << INSTR_LS_BIT_U)) + return offset; + else + return -offset; + + BUG(); +} + +static int kvm_ls_length(struct kvm_vcpu *vcpu, u32 instr) +{ + int index; + + index = get_arm_ls_instr_index(instr); + + if (instr & (1U << INSTR_LS_TYPE_BIT)) { + /* LS word or unsigned byte */ + if (instr & (1U << INSTR_LS_BIT_B)) + return sizeof(unsigned char); + else + return sizeof(u32); + } else { + /* LS halfword, doubleword or signed byte */ + u32 H = (instr & (1U << INSTR_LS_BIT_H)); + u32 S = (instr & (1U << INSTR_LS_BIT_S)); + u32 L = (instr & (1U << INSTR_LS_BIT_L)); + + if (!L && S) { + kvm_err("WARNING: d-word for MMIO\n"); + return 2 * sizeof(u32); + } else if (L && S && !H) + return sizeof(char); + else + return sizeof(u16); + } + + BUG(); +} + +/** + * kvm_emulate_mmio_ls - emulates load/store instructions made to I/O memory + * @vcpu: The vcpu pointer + * @fault_ipa: The IPA that caused the 2nd stage fault + * @instr: The instruction that caused the fault + * + * Handles emulation of load/store instructions which cannot be emulated through + * information found in the HSR on faults. It is necessary in this case to + * simply decode the offending instruction in software and determine the + * required operands. + */ +int kvm_emulate_mmio_ls(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + unsigned long instr) +{ + unsigned long rd, rn, offset, len, instr_len; + int index; + bool is_write, is_thumb; + + trace_kvm_mmio_emulate(vcpu->arch.regs.pc, instr, vcpu->arch.regs.cpsr); + + index = get_arm_ls_instr_index(instr); + if (index == INSTR_NONE) { + kvm_err("Unknown load/store instruction\n"); + return -EINVAL; + } + + is_write = (index < NUM_LD_INSTR) ? false : true; + rd = (instr & INSTR_LS_RD_MASK) >> INSTR_LS_RD_SHIFT; + len = kvm_ls_length(vcpu, instr); + + vcpu->run->mmio.is_write = is_write; + vcpu->run->mmio.phys_addr = fault_ipa; + vcpu->run->mmio.len = len; + vcpu->arch.mmio_sign_extend = false; + vcpu->arch.mmio_rd = rd; + + trace_kvm_mmio((is_write) ? KVM_TRACE_MMIO_WRITE : + KVM_TRACE_MMIO_READ_UNSATISFIED, + len, fault_ipa, (is_write) ? *vcpu_reg(vcpu, rd) : 0); + + /* Handle base register writeback */ + if (!(instr & (1U << INSTR_LS_BIT_P)) || + (instr & (1U << INSTR_LS_BIT_W))) { + rn = (instr & INSTR_LS_RN_MASK) >> INSTR_LS_RN_SHIFT; + offset = ls_word_calc_offset(vcpu, instr); + *vcpu_reg(vcpu, rn) += offset; + } + + /* + * The MMIO instruction is emulated and should not be re-executed + * in the guest. + */ + is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_T_BIT); + if (is_thumb && !is_wide_instruction(instr)) + instr_len = 2; + else + instr_len = 4; + + *vcpu_pc(vcpu) += instr_len; + kvm_adjust_itstate(vcpu); + return KVM_EXIT_MMIO; +} + +/** + * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block + * @vcpu: The VCPU pointer + * + * When exceptions occur while instructions are executed in Thumb IF-THEN + * blocks, the ITSTATE field of the CPSR is not advanved (updated), so we have + * to do this little bit of work manually. The fields map like this: + * + * IT[7:0] -> CPSR[26:25],CPSR[15:10] + */ +void kvm_adjust_itstate(struct kvm_vcpu *vcpu) +{ + unsigned long itbits, cond; + unsigned long cpsr = *vcpu_cpsr(vcpu); + bool is_arm = !(cpsr & PSR_T_BIT); + + BUG_ON(is_arm && (cpsr & PSR_IT_MASK)); + + if (!(cpsr & PSR_IT_MASK)) + return; + + cond = (cpsr & 0xe000) >> 13; + itbits = (cpsr & 0x1c00) >> (10 - 2); + itbits |= (cpsr & (0x3 << 25)) >> 25; + + /* Perform ITAdvance (see page A-52 in ARM DDI 0406C) */ + if ((itbits & 0x7) == 0) + itbits = cond = 0; + else + itbits = (itbits << 1) & 0x1f; + + cpsr &= ~PSR_IT_MASK; + cpsr |= cond << 13; + cpsr |= (itbits & 0x1c) << (10 - 2); + cpsr |= (itbits & 0x3) << 25; + *vcpu_cpsr(vcpu) = cpsr; +} + +/****************************************************************************** + * Inject exceptions into the guest + */ + +static u32 exc_vector_base(struct kvm_vcpu *vcpu) +{ + u32 sctlr = vcpu->arch.cp15[c1_SCTLR]; + u32 vbar = vcpu->arch.cp15[c12_VBAR]; + + if (sctlr & SCTLR_V) + return 0xffff0000; + else /* always have security exceptions */ + return vbar; +} + +/** + * kvm_inject_undefined - inject an undefined exception into the guest + * @vcpu: The VCPU to receive the undefined exception + * + * It is assumed that this code is called from the VCPU thread and that the + * VCPU therefore is not currently executing guest code. + * + * Modelled after TakeUndefInstrException() pseudocode. + */ +void kvm_inject_undefined(struct kvm_vcpu *vcpu) +{ + u32 new_lr_value; + u32 new_spsr_value; + u32 cpsr = *vcpu_cpsr(vcpu); + u32 sctlr = vcpu->arch.cp15[c1_SCTLR]; + bool is_thumb = (cpsr & PSR_T_BIT); + u32 vect_offset = 4; + u32 return_offset = (is_thumb) ? 2 : 4; + + new_spsr_value = cpsr; + new_lr_value = *vcpu_pc(vcpu) - return_offset; + + *vcpu_cpsr(vcpu) = (cpsr & ~MODE_MASK) | UND_MODE; + *vcpu_cpsr(vcpu) |= PSR_I_BIT; + *vcpu_cpsr(vcpu) &= ~(PSR_IT_MASK | PSR_J_BIT | PSR_E_BIT | PSR_T_BIT); + + if (sctlr & SCTLR_TE) + *vcpu_cpsr(vcpu) |= PSR_T_BIT; + if (sctlr & SCTLR_EE) + *vcpu_cpsr(vcpu) |= PSR_E_BIT; + + /* Note: These now point to UND banked copies */ + *vcpu_spsr(vcpu) = cpsr; + *vcpu_reg(vcpu, 14) = new_lr_value; + + /* Branch to exception vector */ + *vcpu_pc(vcpu) = exc_vector_base(vcpu) + vect_offset; +} diff --git a/arch/arm/kvm/exports.c b/arch/arm/kvm/exports.c new file mode 100644 index 0000000000000..f39f82380a981 --- /dev/null +++ b/arch/arm/kvm/exports.c @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall <c.dall@virtualopensystems.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <linux/module.h> +#include <asm/kvm_asm.h> + +EXPORT_SYMBOL_GPL(__kvm_hyp_init); +EXPORT_SYMBOL_GPL(__kvm_hyp_init_end); + +EXPORT_SYMBOL_GPL(__kvm_hyp_exit); +EXPORT_SYMBOL_GPL(__kvm_hyp_exit_end); + +EXPORT_SYMBOL_GPL(__kvm_hyp_vector); + +EXPORT_SYMBOL_GPL(__kvm_hyp_code_start); +EXPORT_SYMBOL_GPL(__kvm_hyp_code_end); + +EXPORT_SYMBOL_GPL(__kvm_vcpu_run); + +EXPORT_SYMBOL_GPL(__kvm_flush_vm_context); +EXPORT_SYMBOL_GPL(__kvm_tlb_flush_vmid); + +EXPORT_SYMBOL_GPL(smp_send_reschedule); diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c new file mode 100644 index 0000000000000..57c0e296da551 --- /dev/null +++ b/arch/arm/kvm/guest.c @@ -0,0 +1,165 @@ +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall <c.dall@virtualopensystems.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/kvm_host.h> +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/fs.h> +#include <asm/uaccess.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_emulate.h> + +#define VM_STAT(x) { #x, offsetof(struct kvm, stat.x), KVM_STAT_VM } +#define VCPU_STAT(x) { #x, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU } + +struct kvm_stats_debugfs_item debugfs_entries[] = { + /* vcpu stats */ + VCPU_STAT(exits), + VCPU_STAT(mmio_exits), + VCPU_STAT(kmmio_exits), + VCPU_STAT(wfi_exits), + VCPU_STAT(irq_exits), + VCPU_STAT(halt_wakeup), + + /* vm stats */ + VM_STAT(remote_tlb_flush), + { NULL } +}; + +int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) +{ + return 0; +} + +int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + struct kvm_vcpu_regs *vcpu_regs = &vcpu->arch.regs; + + /* + * GPRs and PSRs + */ + memcpy(regs->regs0_7, &(vcpu_regs->usr_regs[0]), sizeof(u32) * 8); + memcpy(regs->usr_regs8_12, &(vcpu_regs->usr_regs[8]), sizeof(u32) * 5); + memcpy(regs->fiq_regs8_12, &(vcpu_regs->fiq_regs[0]), sizeof(u32) * 5); + regs->reg13[MODE_FIQ] = vcpu_regs->fiq_regs[5]; + regs->reg14[MODE_FIQ] = vcpu_regs->fiq_regs[6]; + regs->reg13[MODE_IRQ] = vcpu_regs->irq_regs[0]; + regs->reg14[MODE_IRQ] = vcpu_regs->irq_regs[1]; + regs->reg13[MODE_SVC] = vcpu_regs->svc_regs[0]; + regs->reg14[MODE_SVC] = vcpu_regs->svc_regs[1]; + regs->reg13[MODE_ABT] = vcpu_regs->abt_regs[0]; + regs->reg14[MODE_ABT] = vcpu_regs->abt_regs[1]; + regs->reg13[MODE_UND] = vcpu_regs->und_regs[0]; + regs->reg14[MODE_UND] = vcpu_regs->und_regs[1]; + regs->reg13[MODE_USR] = vcpu_regs->usr_regs[0]; + regs->reg14[MODE_USR] = vcpu_regs->usr_regs[1]; + + regs->spsr[MODE_FIQ] = vcpu_regs->fiq_regs[7]; + regs->spsr[MODE_IRQ] = vcpu_regs->irq_regs[2]; + regs->spsr[MODE_SVC] = vcpu_regs->svc_regs[2]; + regs->spsr[MODE_ABT] = vcpu_regs->abt_regs[2]; + regs->spsr[MODE_UND] = vcpu_regs->und_regs[2]; + + regs->reg15 = vcpu_regs->pc; + regs->cpsr = vcpu_regs->cpsr; + + + /* + * Co-processor registers. + */ + regs->cp15.c0_midr = vcpu->arch.cp15[c0_MIDR]; + regs->cp15.c1_sys = vcpu->arch.cp15[c1_SCTLR]; + regs->cp15.c2_base0 = vcpu->arch.cp15[c2_TTBR0]; + regs->cp15.c2_base1 = vcpu->arch.cp15[c2_TTBR1]; + regs->cp15.c2_control = vcpu->arch.cp15[c2_TTBCR]; + regs->cp15.c3_dacr = vcpu->arch.cp15[c3_DACR]; + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + struct kvm_vcpu_regs *vcpu_regs = &vcpu->arch.regs; + + if (__vcpu_mode(regs->cpsr) == 0xf) + return -EINVAL; + + memcpy(&(vcpu_regs->usr_regs[0]), regs->regs0_7, sizeof(u32) * 8); + memcpy(&(vcpu_regs->usr_regs[8]), regs->usr_regs8_12, sizeof(u32) * 5); + memcpy(&(vcpu_regs->fiq_regs[0]), regs->fiq_regs8_12, sizeof(u32) * 5); + + vcpu_regs->fiq_regs[5] = regs->reg13[MODE_FIQ]; + vcpu_regs->fiq_regs[6] = regs->reg14[MODE_FIQ]; + vcpu_regs->irq_regs[0] = regs->reg13[MODE_IRQ]; + vcpu_regs->irq_regs[1] = regs->reg14[MODE_IRQ]; + vcpu_regs->svc_regs[0] = regs->reg13[MODE_SVC]; + vcpu_regs->svc_regs[1] = regs->reg14[MODE_SVC]; + vcpu_regs->abt_regs[0] = regs->reg13[MODE_ABT]; + vcpu_regs->abt_regs[1] = regs->reg14[MODE_ABT]; + vcpu_regs->und_regs[0] = regs->reg13[MODE_UND]; + vcpu_regs->und_regs[1] = regs->reg14[MODE_UND]; + vcpu_regs->usr_regs[0] = regs->reg13[MODE_USR]; + vcpu_regs->usr_regs[1] = regs->reg14[MODE_USR]; + + vcpu_regs->fiq_regs[7] = regs->spsr[MODE_FIQ]; + vcpu_regs->irq_regs[2] = regs->spsr[MODE_IRQ]; + vcpu_regs->svc_regs[2] = regs->spsr[MODE_SVC]; + vcpu_regs->abt_regs[2] = regs->spsr[MODE_ABT]; + vcpu_regs->und_regs[2] = regs->spsr[MODE_UND]; + + /* + * Co-processor registers. + */ + vcpu->arch.cp15[c0_MIDR] = regs->cp15.c0_midr; + vcpu->arch.cp15[c1_SCTLR] = regs->cp15.c1_sys; + + vcpu_regs->pc = regs->reg15; + vcpu_regs->cpsr = regs->cpsr; + + return 0; +} + +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + return -EINVAL; +} + +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + return -EINVAL; +} + +int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + return -EINVAL; +} + +int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + return -EINVAL; +} + +int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, + struct kvm_translation *tr) +{ + return -EINVAL; +} diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S new file mode 100644 index 0000000000000..4db26cb536c2d --- /dev/null +++ b/arch/arm/kvm/init.S @@ -0,0 +1,149 @@ +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall <c.dall@virtualopensystems.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <linux/linkage.h> +#include <asm/unified.h> +#include <asm/asm-offsets.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_arm.h> + +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +@ Hypervisor initialization +@ - should be called with: +@ r0 = Hypervisor pgd pointer +@ r1 = top of Hyp stack (kernel VA) +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + .text + .arm + .pushsection .hyp.idmap.text,"ax" + .align 12 +__kvm_hyp_init: + .globl __kvm_hyp_init + + @ Hyp-mode exception vector + nop + nop + nop + nop + nop + b __do_hyp_init + nop + nop + +__do_hyp_init: + @ Set the sp to end of this page and push data for later use + mov sp, pc + bic sp, sp, #0x0ff + bic sp, sp, #0xf00 + add sp, sp, #0x1000 + push {r0, r1, r2, r12} + + @ Set the HTTBR to point to the hypervisor PGD pointer passed to + @ function and set the upper bits equal to the kernel PGD. + mrrc p15, 1, r1, r2, c2 + mcrr p15, 4, r0, r2, c2 + + @ Set the HTCR and VTCR to the same shareability and cacheability + @ settings as the non-secure TTBCR and with T0SZ == 0. + mrc p15, 4, r0, c2, c0, 2 @ HTCR + ldr r12, =HTCR_MASK + bic r0, r0, r12 + mrc p15, 0, r1, c2, c0, 2 @ TTBCR + and r1, r1, #(HTCR_MASK & ~TTBCR_T0SZ) + orr r0, r0, r1 + mcr p15, 4, r0, c2, c0, 2 @ HTCR + + mrc p15, 4, r1, c2, c1, 2 @ VTCR + bic r1, r1, #(VTCR_HTCR_SH | VTCR_SL0) + bic r0, r0, #(~VTCR_HTCR_SH) + orr r1, r0, r1 + orr r1, r1, #(VTCR_SL_L1 | VTCR_GUEST_T0SZ) + mcr p15, 4, r1, c2, c1, 2 @ VTCR + + @ Use the same memory attributes for hyp. accesses as the kernel + @ (copy MAIRx ro HMAIRx). + mrc p15, 0, r0, c10, c2, 0 + mcr p15, 4, r0, c10, c2, 0 + mrc p15, 0, r0, c10, c2, 1 + mcr p15, 4, r0, c10, c2, 1 + + @ Set the HSCTLR to: + @ - ARM/THUMB exceptions: Kernel config (Thumb-2 kernel) + @ - Endianness: Kernel config + @ - Fast Interrupt Features: Kernel config + @ - Write permission implies XN: disabled + @ - Instruction cache: enabled + @ - Data/Unified cache: enabled + @ - Memory alignment checks: enabled + @ - MMU: enabled (this code must be run from an identity mapping) + mrc p15, 4, r0, c1, c0, 0 @ HSCR + ldr r12, =HSCTLR_MASK + bic r0, r0, r12 + mrc p15, 0, r1, c1, c0, 0 @ SCTLR + ldr r12, =(HSCTLR_EE | HSCTLR_FI) + and r1, r1, r12 + ARM( ldr r12, =(HSCTLR_M | HSCTLR_A | HSCTLR_I) ) + THUMB( ldr r12, =(HSCTLR_M | HSCTLR_A | HSCTLR_I | HSCTLR_TE) ) + orr r1, r1, r12 + orr r0, r0, r1 + isb + mcr p15, 4, r0, c1, c0, 0 @ HSCR + isb + + @ Set stack pointer and return to the kernel + pop {r0, r1, r2, r12} + mov sp, r1 + eret + + .ltorg + + .align 12 + + __kvm_init_sp: + .globl __kvm_hyp_init_end +__kvm_hyp_init_end: + + .align 12 +__kvm_hyp_exit: + .globl __kvm_hyp_exit + + @ Hyp-mode exception vector + nop + nop + nop + nop + nop + b __do_hyp_exit + nop + nop + +__do_hyp_exit: + @ Clear the MMU and TE bits in the HSCR + mrc p15, 4, sp, c1, c0, 0 @ HSCR + bic sp, sp, #((1 << 30) | (1 << 0)) + + isb + mcr p15, 4, sp, c1, c0, 0 @ HSCR + mcr p15, 4, r0, c8, c7, 0 @ Flush Hyp TLB, r0 ignored + isb + eret + + .globl __kvm_hyp_exit_end +__kvm_hyp_exit_end: + + .popsection diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S new file mode 100644 index 0000000000000..fd7331c6b4698 --- /dev/null +++ b/arch/arm/kvm/interrupts.S @@ -0,0 +1,698 @@ +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall <c.dall@virtualopensystems.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <linux/linkage.h> +#include <linux/const.h> +#include <asm/unified.h> +#include <asm/page.h> +#include <asm/asm-offsets.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_arm.h> + +#define VCPU_USR_REG(_reg_nr) (VCPU_USR_REGS + (_reg_nr * 4)) +#define VCPU_USR_SP (VCPU_USR_REG(13)) +#define VCPU_FIQ_REG(_reg_nr) (VCPU_FIQ_REGS + (_reg_nr * 4)) +#define VCPU_FIQ_SPSR (VCPU_FIQ_REG(7)) + + .text + .align PAGE_SHIFT + +__kvm_hyp_code_start: + .globl __kvm_hyp_code_start + +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +@ Flush per-VMID TLBs +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + +/* + * void __kvm_tlb_flush_vmid(struct kvm *kvm); + * + * We rely on the hardware to broadcast the TLB invalidation to all CPUs + * inside the inner-shareable domain (which is the case for all v7 + * implementations). If we come across a non-IS SMP implementation, we'll + * have to use an IPI based mechanism. Until then, we stick to the simple + * hardware assisted version. + */ +ENTRY(__kvm_tlb_flush_vmid) + hvc #0 @ Switch to Hyp mode + push {r2, r3} + + add r0, r0, #KVM_VTTBR + ldrd r2, r3, [r0] + mcrr p15, 6, r2, r3, c2 @ Write VTTBR + isb + mcr p15, 0, r0, c8, c3, 0 @ TLBIALLIS (rt ignored) + dsb + isb + mov r2, #0 + mov r3, #0 + mcrr p15, 6, r2, r3, c2 @ Back to VMID #0 + isb + + pop {r2, r3} + hvc #0 @ Back to SVC + bx lr +ENDPROC(__kvm_tlb_flush_vmid) + +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +@ Flush TLBs and instruction caches of current CPU for all VMIDs +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + +/* + * void __kvm_flush_vm_context(void); + */ +ENTRY(__kvm_flush_vm_context) + hvc #0 @ switch to hyp-mode + + mov r0, #0 @ rn parameter for c15 flushes is SBZ + mcr p15, 4, r0, c8, c7, 4 @ Invalidate Non-secure Non-Hyp TLB + mcr p15, 0, r0, c7, c5, 0 @ Invalidate instruction caches + dsb + isb + + hvc #0 @ switch back to svc-mode, see hyp_svc + bx lr +ENDPROC(__kvm_flush_vm_context) + +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +@ Hypervisor world-switch code +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + +/* These are simply for the macros to work - value don't have meaning */ +.equ usr, 0 +.equ svc, 1 +.equ abt, 2 +.equ und, 3 +.equ irq, 4 +.equ fiq, 5 + +.macro store_mode_state base_reg, mode + .if \mode == usr + mrs r2, SP_usr + mov r3, lr + stmdb \base_reg!, {r2, r3} + .elseif \mode != fiq + mrs r2, SP_\mode + mrs r3, LR_\mode + mrs r4, SPSR_\mode + stmdb \base_reg!, {r2, r3, r4} + .else + mrs r2, r8_fiq + mrs r3, r9_fiq + mrs r4, r10_fiq + mrs r5, r11_fiq + mrs r6, r12_fiq + mrs r7, SP_fiq + mrs r8, LR_fiq + mrs r9, SPSR_fiq + stmdb \base_reg!, {r2-r9} + .endif +.endm + +.macro load_mode_state base_reg, mode + .if \mode == usr + ldmia \base_reg!, {r2, r3} + msr SP_usr, r2 + mov lr, r3 + .elseif \mode != fiq + ldmia \base_reg!, {r2, r3, r4} + msr SP_\mode, r2 + msr LR_\mode, r3 + msr SPSR_\mode, r4 + .else + ldmia \base_reg!, {r2-r9} + msr r8_fiq, r2 + msr r9_fiq, r3 + msr r10_fiq, r4 + msr r11_fiq, r5 + msr r12_fiq, r6 + msr SP_fiq, r7 + msr LR_fiq, r8 + msr SPSR_fiq, r9 + .endif +.endm + +/* Reads cp15 registers from hardware and stores them in memory + * @vcpu: If 0, registers are written in-order to the stack, + * otherwise to the VCPU struct pointed to by vcpup + * @vcpup: Register pointing to VCPU struct + */ +.macro read_cp15_state vcpu=0, vcpup + mrc p15, 0, r2, c1, c0, 0 @ SCTLR + mrc p15, 0, r3, c1, c0, 2 @ CPACR + mrc p15, 0, r4, c2, c0, 2 @ TTBCR + mrc p15, 0, r5, c3, c0, 0 @ DACR + mrrc p15, 0, r6, r7, c2 @ TTBR 0 + mrrc p15, 1, r8, r9, c2 @ TTBR 1 + mrc p15, 0, r10, c10, c2, 0 @ PRRR + mrc p15, 0, r11, c10, c2, 1 @ NMRR + + .if \vcpu == 0 + push {r2-r11} @ Push CP15 registers + .else + str r2, [\vcpup, #VCPU_SCTLR] + str r3, [\vcpup, #VCPU_CPACR] + str r4, [\vcpup, #VCPU_TTBCR] + str r5, [\vcpup, #VCPU_DACR] + add \vcpup, \vcpup, #VCPU_TTBR0 + strd r6, r7, [\vcpup] + add \vcpup, \vcpup, #(VCPU_TTBR1 - VCPU_TTBR0) + strd r8, r9, [\vcpup] + sub \vcpup, \vcpup, #(VCPU_TTBR1) + str r10, [\vcpup, #VCPU_PRRR] + str r11, [\vcpup, #VCPU_NMRR] + .endif + + mrc p15, 0, r2, c13, c0, 1 @ CID + mrc p15, 0, r3, c13, c0, 2 @ TID_URW + mrc p15, 0, r4, c13, c0, 3 @ TID_URO + mrc p15, 0, r5, c13, c0, 4 @ TID_PRIV + mrc p15, 0, r6, c5, c0, 0 @ DFSR + mrc p15, 0, r7, c5, c0, 1 @ IFSR + mrc p15, 0, r8, c5, c1, 0 @ ADFSR + mrc p15, 0, r9, c5, c1, 1 @ AIFSR + mrc p15, 0, r10, c6, c0, 0 @ DFAR + mrc p15, 0, r11, c6, c0, 2 @ IFAR + mrc p15, 0, r12, c12, c0, 0 @ VBAR + + .if \vcpu == 0 + push {r2-r12} @ Push CP15 registers + .else + str r2, [\vcpup, #VCPU_CID] + str r3, [\vcpup, #VCPU_TID_URW] + str r4, [\vcpup, #VCPU_TID_URO] + str r5, [\vcpup, #VCPU_TID_PRIV] + str r6, [\vcpup, #VCPU_DFSR] + str r7, [\vcpup, #VCPU_IFSR] + str r8, [\vcpup, #VCPU_ADFSR] + str r9, [\vcpup, #VCPU_AIFSR] + str r10, [\vcpup, #VCPU_DFAR] + str r11, [\vcpup, #VCPU_IFAR] + str r12, [\vcpup, #VCPU_VBAR] + .endif +.endm + +/* Reads cp15 registers from memory and writes them to hardware + * @vcpu: If 0, registers are read in-order from the stack, + * otherwise from the VCPU struct pointed to by vcpup + * @vcpup: Register pointing to VCPU struct + */ +.macro write_cp15_state vcpu=0, vcpup + .if \vcpu == 0 + pop {r2-r12} + .else + ldr r2, [\vcpup, #VCPU_CID] + ldr r3, [\vcpup, #VCPU_TID_URW] + ldr r4, [\vcpup, #VCPU_TID_URO] + ldr r5, [\vcpup, #VCPU_TID_PRIV] + ldr r6, [\vcpup, #VCPU_DFSR] + ldr r7, [\vcpup, #VCPU_IFSR] + ldr r8, [\vcpup, #VCPU_ADFSR] + ldr r9, [\vcpup, #VCPU_AIFSR] + ldr r10, [\vcpup, #VCPU_DFAR] + ldr r11, [\vcpup, #VCPU_IFAR] + ldr r12, [\vcpup, #VCPU_VBAR] + .endif + + mcr p15, 0, r2, c13, c0, 1 @ CID + mcr p15, 0, r3, c13, c0, 2 @ TID_URW + mcr p15, 0, r4, c13, c0, 3 @ TID_URO + mcr p15, 0, r5, c13, c0, 4 @ TID_PRIV + mcr p15, 0, r6, c5, c0, 0 @ DFSR + mcr p15, 0, r7, c5, c0, 1 @ IFSR + mcr p15, 0, r8, c5, c1, 0 @ ADFSR + mcr p15, 0, r9, c5, c1, 1 @ AIFSR + mcr p15, 0, r10, c6, c0, 0 @ DFAR + mcr p15, 0, r11, c6, c0, 2 @ IFAR + mcr p15, 0, r12, c12, c0, 0 @ VBAR + + .if \vcpu == 0 + pop {r2-r11} + .else + ldr r2, [\vcpup, #VCPU_SCTLR] + ldr r3, [\vcpup, #VCPU_CPACR] + ldr r4, [\vcpup, #VCPU_TTBCR] + ldr r5, [\vcpup, #VCPU_DACR] + add \vcpup, \vcpup, #VCPU_TTBR0 + ldrd r6, r7, [\vcpup] + add \vcpup, \vcpup, #(VCPU_TTBR1 - VCPU_TTBR0) + ldrd r8, r9, [\vcpup] + sub \vcpup, \vcpup, #(VCPU_TTBR1) + ldr r10, [\vcpup, #VCPU_PRRR] + ldr r11, [\vcpup, #VCPU_NMRR] + .endif + + mcr p15, 0, r2, c1, c0, 0 @ SCTLR + mcr p15, 0, r3, c1, c0, 2 @ CPACR + mcr p15, 0, r4, c2, c0, 2 @ TTBCR + mcr p15, 0, r5, c3, c0, 0 @ DACR + mcrr p15, 0, r6, r7, c2 @ TTBR 0 + mcrr p15, 1, r8, r9, c2 @ TTBR 1 + mcr p15, 0, r10, c10, c2, 0 @ PRRR + mcr p15, 0, r11, c10, c2, 1 @ NMRR +.endm + +/* Configures the HSTR (Hyp System Trap Register) on entry/return + * (hardware reset value is 0) */ +.macro set_hstr entry + mrc p15, 4, r2, c1, c1, 3 + ldr r3, =(HSTR_T(9) | HSTR_T(10) | HSTR_T(11) | HSTR_T(15)) + .if \entry == 1 + orr r2, r2, r3 @ Trap CR{9,10,11,15} + .else + bic r2, r2, r3 @ Don't trap any CRx accesses + .endif + mcr p15, 4, r2, c1, c1, 3 +.endm + +/* Configures the HCPTR (Hyp Coprocessor Trap Register) on entry/return + * (hardware reset value is 0) */ +.macro set_hcptr entry + mrc p15, 4, r2, c1, c1, 2 + ldr r3, =(HCPTR_TTA) + .if \entry == 1 + orr r2, r2, r3 @ Trap some coproc-accesses + .else + bic r2, r2, r3 @ Don't trap any coproc- accesses + .endif + mcr p15, 4, r2, c1, c1, 2 +.endm + +/* Enable/Disable: stage-2 trans., trap interrupts, trap wfi, trap smc */ +.macro configure_hyp_role entry, vcpu_ptr + mrc p15, 4, r2, c1, c1, 0 @ HCR + bic r2, r2, #HCR_VIRT_EXCP_MASK + ldr r3, =HCR_GUEST_MASK + .if \entry == 1 + orr r2, r2, r3 + ldr r3, [\vcpu_ptr, #VCPU_IRQ_LINES] + orr r2, r2, r3 + .else + bic r2, r2, r3 + .endif + mcr p15, 4, r2, c1, c1, 0 +.endm + +@ Arguments: +@ r0: pointer to vcpu struct +ENTRY(__kvm_vcpu_run) + hvc #0 @ switch to hyp-mode + + @ Now we're in Hyp-mode and lr_usr, spsr_hyp are on the stack + mrs r2, sp_usr + push {r2} @ Push r13_usr + push {r4-r12} @ Push r4-r12 + + store_mode_state sp, svc + store_mode_state sp, abt + store_mode_state sp, und + store_mode_state sp, irq + store_mode_state sp, fiq + + @ Store hardware CP15 state and load guest state + read_cp15_state + write_cp15_state 1, r0 + + push {r0} @ Push the VCPU pointer + + @ Configure Hyp-role + configure_hyp_role 1, r0 + + @ Trap coprocessor CRx accesses + set_hstr 1 + set_hcptr 1 + + @ Write configured ID register into MIDR alias + ldr r1, [r0, #VCPU_MIDR] + mcr p15, 4, r1, c0, c0, 0 + + @ Write guest view of MPIDR into VMPIDR + ldr r1, [r0, #VCPU_MPIDR] + mcr p15, 4, r1, c0, c0, 5 + + @ Load guest registers + add r0, r0, #(VCPU_USR_SP) + load_mode_state r0, usr + load_mode_state r0, svc + load_mode_state r0, abt + load_mode_state r0, und + load_mode_state r0, irq + load_mode_state r0, fiq + + @ Load return state (r0 now points to vcpu->arch.regs.pc) + ldmia r0, {r2, r3} + msr ELR_hyp, r2 + msr SPSR_cxsf, r3 + + @ Set up guest memory translation + sub r1, r0, #(VCPU_PC - VCPU_KVM) @ r1 points to kvm struct + ldr r1, [r1] + add r1, r1, #KVM_VTTBR + ldrd r2, r3, [r1] + mcrr p15, 6, r2, r3, c2 @ Write VTTBR + + @ Load remaining registers and do the switch + sub r0, r0, #(VCPU_PC - VCPU_USR_REGS) + ldmia r0, {r0-r12} + eret + +__kvm_vcpu_return: + @ Set VMID == 0 + mov r2, #0 + mov r3, #0 + mcrr p15, 6, r2, r3, c2 @ Write VTTBR + + @ Store return state + mrs r2, ELR_hyp + mrs r3, spsr + str r2, [r1, #VCPU_PC] + str r3, [r1, #VCPU_CPSR] + + @ Store guest registers + add r1, r1, #(VCPU_FIQ_SPSR + 4) + store_mode_state r1, fiq + store_mode_state r1, irq + store_mode_state r1, und + store_mode_state r1, abt + store_mode_state r1, svc + store_mode_state r1, usr + sub r1, r1, #(VCPU_USR_REG(13)) + + @ Don't trap coprocessor accesses for host kernel + set_hstr 0 + set_hcptr 0 + + @ Reset Hyp-role + configure_hyp_role 0, r1 + + @ Let host read hardware MIDR + mrc p15, 0, r2, c0, c0, 0 + mcr p15, 4, r2, c0, c0, 0 + + @ Back to hardware MPIDR + mrc p15, 0, r2, c0, c0, 5 + mcr p15, 4, r2, c0, c0, 5 + + @ Store guest CP15 state and restore host state + read_cp15_state 1, r1 + write_cp15_state + + load_mode_state sp, fiq + load_mode_state sp, irq + load_mode_state sp, und + load_mode_state sp, abt + load_mode_state sp, svc + + pop {r4-r12} @ Pop r4-r12 + pop {r2} @ Pop r13_usr + msr sp_usr, r2 + + ldr r2, =(~PAGE_MASK) @ Get svc-cpsr in case we need it for + mov r1, sp @ the __irq_svc call + tst r1, r2 + subeq r2, r1, #0x1000 + bicne r2, r1, r2 + ldr r2, [r2, #4] @ r2 = svc_cpsr + + hvc #0 @ switch back to svc-mode, see hyp_svc + + cmp r0, #ARM_EXCEPTION_IRQ + bxne lr @ return to IOCTL + + /* + * It's time to launch the kernel IRQ handler for IRQ exceptions. This + * requires some manipulation though. + * + * - The easiest entry point to the host handler is __irq_svc. + * - The __irq_svc expects to be called from SVC mode, which has been + * switched to from vector_stub code in entry-armv.S. The __irq_svc + * calls svc_entry which uses values stored in memory and pointed to + * by r0 to return from handler. We allocate this memory on the + * stack, which will contain these values: + * 0x8: cpsr + * 0x4: return_address + * 0x0: r0 + */ + adr r1, irq_kernel_resume @ Where to resume + push {r0 - r2} + mov r0, sp + b __irq_svc + +irq_kernel_resume: + pop {r0} + add sp, sp, #8 + bx lr @ return to IOCTL + +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +@ Hypervisor exception vector and handlers +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + +/* + * The KVM/ARM Hypervisor ABI is defined as follows: + * + * Entry to Hyp mode from the host kernel will happen _only_ when an HVC + * instruction is issued since all traps are disabled when running the host + * kernel as per the Hyp-mode initialization at boot time. + * + * HVC instructions cause a trap to the vector page + offset 0x18 (see hyp_hvc + * below) when the HVC instruction is called from SVC mode (i.e. a guest or the + * host kernel) and they cause a trap to the vector page + offset 0xc when HVC + * instructions are called from within Hyp-mode. + * + * Hyp-ABI: Switching from host kernel to Hyp-mode: + * Switching to Hyp mode is done through a simple HVC instructions. The + * exception vector code will check that the HVC comes from VMID==0 and if + * so will store the necessary state on the Hyp stack, which will look like + * this (growing downwards, see the hyp_hvc handler): + * ... + * stack_page + 4: spsr (Host-SVC cpsr) + * stack_page : lr_usr + * --------------: stack bottom + * + * Hyp-ABI: Switching from Hyp-mode to host kernel SVC mode: + * When returning from Hyp mode to SVC mode, another HVC instruction is + * executed from Hyp mode, which is taken in the hyp_svc handler. The + * bottom of the Hyp is derived from the Hyp stack pointer (only a single + * page aligned stack is used per CPU) and the initial SVC registers are + * used to restore the host state. + * + * + * Note that the above is used to execute code in Hyp-mode from a host-kernel + * point of view, and is a different concept from performing a world-switch and + * executing guest code SVC mode (with a VMID != 0). + */ + +@ Handle undef, svc, pabt, or dabt by crashing with a user notice +.macro bad_exception exception_code, panic_str + mrrc p15, 6, r2, r3, c2 @ Read VTTBR + lsr r3, r3, #16 + ands r3, r3, #0xff + + @ COND:neq means we're probably in the guest and we can try fetching + @ the vcpu pointer and stuff off the stack and keep our fingers crossed + beq 99f + mov r0, #\exception_code + pop {r1} @ Load VCPU pointer + .if \exception_code == ARM_EXCEPTION_DATA_ABORT + mrc p15, 4, r2, c5, c2, 0 @ HSR + mrc p15, 4, r3, c6, c0, 0 @ HDFAR + str r2, [r1, #VCPU_HSR] + str r3, [r1, #VCPU_HDFAR] + .endif + .if \exception_code == ARM_EXCEPTION_PREF_ABORT + mrc p15, 4, r2, c5, c2, 0 @ HSR + mrc p15, 4, r3, c6, c0, 2 @ HIFAR + str r2, [r1, #VCPU_HSR] + str r3, [r1, #VCPU_HIFAR] + .endif + mrs r2, ELR_hyp + str r2, [r1, #VCPU_HYP_PC] + b __kvm_vcpu_return + + @ We were in the host already +99: hvc #0 @ switch to SVC mode + ldr r0, \panic_str + mrs r1, ELR_hyp + b panic + +.endm + + .text + + .align 5 +__kvm_hyp_vector: + .globl __kvm_hyp_vector + + @ Hyp-mode exception vector + W(b) hyp_reset + W(b) hyp_undef + W(b) hyp_svc + W(b) hyp_pabt + W(b) hyp_dabt + W(b) hyp_hvc + W(b) hyp_irq + W(b) hyp_fiq + + .align +hyp_reset: + b hyp_reset + + .align +hyp_undef: + bad_exception ARM_EXCEPTION_UNDEFINED, und_die_str + + .align +hyp_svc: + @ Can only get here if HVC or SVC is called from Hyp, mode which means + @ we want to change mode back to SVC mode. + push {r12} + mov r12, sp + bic r12, r12, #0x0ff + bic r12, r12, #0xf00 + ldr lr, [r12, #4] + msr SPSR_csxf, lr + ldr lr, [r12] + pop {r12} + eret + + .align +hyp_pabt: + bad_exception ARM_EXCEPTION_PREF_ABORT, pabt_die_str + + .align +hyp_dabt: + bad_exception ARM_EXCEPTION_DATA_ABORT, dabt_die_str + + .align +hyp_hvc: + @ Getting here is either becuase of a trap from a guest or from calling + @ HVC from the host kernel, which means "switch to Hyp mode". + push {r0, r1, r2} + + @ Check syndrome register + mrc p15, 4, r0, c5, c2, 0 @ HSR + lsr r1, r0, #HSR_EC_SHIFT + cmp r1, #HSR_EC_HVC + bne guest_trap @ Not HVC instr. + + @ Let's check if the HVC came from VMID 0 and allow simple + @ switch to Hyp mode + mrrc p15, 6, r1, r2, c2 + lsr r2, r2, #16 + and r2, r2, #0xff + cmp r2, #0 + bne guest_trap @ Guest called HVC + + @ Store lr_usr,spsr (svc cpsr) on bottom of stack + mov r1, sp + bic r1, r1, #0x0ff + bic r1, r1, #0xf00 + str lr, [r1] + mrs lr, spsr + str lr, [r1, #4] + + pop {r0, r1, r2} + + @ Return to caller in Hyp mode + mrs lr, ELR_hyp + mov pc, lr + +guest_trap: + ldr r1, [sp, #12] @ Load VCPU pointer + str r0, [r1, #VCPU_HSR] + add r1, r1, #VCPU_USR_REG(3) + stmia r1, {r3-r12} + sub r1, r1, #(VCPU_USR_REG(3) - VCPU_USR_REG(0)) + pop {r3, r4, r5} + add sp, sp, #4 @ We loaded the VCPU pointer above + stmia r1, {r3, r4, r5} + sub r1, r1, #VCPU_USR_REG(0) + + @ Check if we need the fault information + lsr r2, r0, #HSR_EC_SHIFT + cmp r2, #HSR_EC_IABT + beq 2f + cmpne r2, #HSR_EC_DABT + bne 1f + + @ For non-valid data aborts, get the offending instr. PA + lsr r2, r0, #HSR_ISV_SHIFT + ands r2, r2, #1 + bne 2f + mrs r3, ELR_hyp + mcr p15, 0, r3, c7, c8, 0 @ VA to PA, ATS1CPR + mrrc p15, 0, r4, r5, c7 @ PAR + add r6, r1, #VCPU_PC_IPA + strd r4, r5, [r6] + + @ Check if we might have a wide thumb instruction spill-over + ldr r5, =0xfff + bic r4, r3, r5 @ clear page mask + sub r5, r5, #1 @ last 2-byte page bounday, 0xffe + cmp r4, r5 + bne 2f + add r4, r3, #2 @ _really_ unlikely! + mcr p15, 0, r4, c7, c8, 0 @ VA to PA, ATS1CPR + mrrc p15, 0, r4, r5, c7 @ PAR + add r6, r1, #VCPU_PC_IPA2 + strd r4, r5, [r6] + +2: mrc p15, 4, r2, c6, c0, 0 @ HDFAR + mrc p15, 4, r3, c6, c0, 2 @ HIFAR + mrc p15, 4, r4, c6, c0, 4 @ HPFAR + add r5, r1, #VCPU_HDFAR + stmia r5, {r2, r3, r4} + +1: mov r0, #ARM_EXCEPTION_HVC + b __kvm_vcpu_return + + .align +hyp_irq: + push {r0} + ldr r0, [sp, #4] @ Load VCPU pointer + add r0, r0, #(VCPU_USR_REG(1)) + stmia r0, {r1-r12} + pop {r0, r1} @ r1 == vcpu pointer + str r0, [r1, #VCPU_USR_REG(0)] + + mov r0, #ARM_EXCEPTION_IRQ + b __kvm_vcpu_return + + .align +hyp_fiq: + b hyp_fiq + + .ltorg + +und_die_str: + .ascii "unexpected undefined exception in Hyp mode at: %#08x" +pabt_die_str: + .ascii "unexpected prefetch abort in Hyp mode at: %#08x" +dabt_die_str: + .ascii "unexpected data abort in Hyp mode at: %#08x" + +/* + * The below lines makes sure the HYP mode code fits in a single page (the + * assembler will bark at you if it doesn't). Please keep them together. If + * you plan to restructure the code or increase its size over a page, you'll + * have to fix the code in init_hyp_mode(). + */ +__kvm_hyp_code_end: + .globl __kvm_hyp_code_end + + .org __kvm_hyp_code_start + PAGE_SIZE diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c new file mode 100644 index 0000000000000..8c303769a863a --- /dev/null +++ b/arch/arm/kvm/mmu.c @@ -0,0 +1,644 @@ +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall <c.dall@virtualopensystems.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <linux/mman.h> +#include <linux/kvm_host.h> +#include <trace/events/kvm.h> +#include <asm/idmap.h> +#include <asm/pgalloc.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_emulate.h> + +#include "trace.h" + +static DEFINE_MUTEX(kvm_hyp_pgd_mutex); + +static void free_ptes(pmd_t *pmd, unsigned long addr) +{ + pte_t *pte; + unsigned int i; + + for (i = 0; i < PTRS_PER_PMD; i++, addr += PMD_SIZE) { + if (!pmd_none(*pmd) && pmd_table(*pmd)) { + pte = pte_offset_kernel(pmd, addr); + pte_free_kernel(NULL, pte); + } + pmd++; + } +} + +/** + * free_hyp_pmds - free a Hyp-mode level-2 tables and child level-3 tables + * + * Assumes this is a page table used strictly in Hyp-mode and therefore contains + * only mappings in the kernel memory area, which is above PAGE_OFFSET. + */ +void free_hyp_pmds(void) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + unsigned long addr; + + mutex_lock(&kvm_hyp_pgd_mutex); + for (addr = PAGE_OFFSET; addr != 0; addr += PGDIR_SIZE) { + pgd = hyp_pgd + pgd_index(addr); + pud = pud_offset(pgd, addr); + + if (pud_none(*pud)) + continue; + BUG_ON(pud_bad(*pud)); + + pmd = pmd_offset(pud, addr); + free_ptes(pmd, addr); + pmd_free(NULL, pmd); + pud_clear(pud); + } + mutex_unlock(&kvm_hyp_pgd_mutex); +} + +static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, + unsigned long end) +{ + pte_t *pte; + struct page *page; + unsigned long addr; + + for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { + pte = pte_offset_kernel(pmd, addr); + BUG_ON(!virt_addr_valid(addr)); + page = virt_to_page(addr); + + set_pte_ext(pte, mk_pte(page, PAGE_HYP), 0); + } +} + +static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, + unsigned long end) +{ + pmd_t *pmd; + pte_t *pte; + unsigned long addr, next; + + for (addr = start; addr < end; addr = next) { + pmd = pmd_offset(pud, addr); + + BUG_ON(pmd_sect(*pmd)); + + if (pmd_none(*pmd)) { + pte = pte_alloc_one_kernel(NULL, addr); + if (!pte) { + kvm_err("Cannot allocate Hyp pte\n"); + return -ENOMEM; + } + pmd_populate_kernel(NULL, pmd, pte); + } + + next = pmd_addr_end(addr, end); + create_hyp_pte_mappings(pmd, addr, next); + } + + return 0; +} + +/** + * create_hyp_mappings - map a kernel virtual address range in Hyp mode + * @from: The virtual kernel start address of the range + * @to: The virtual kernel end address of the range (exclusive) + * + * The same virtual address as the kernel virtual address is also used in + * Hyp-mode mapping to the same underlying physical pages. + * + * Note: Wrapping around zero in the "to" address is not supported. + */ +int create_hyp_mappings(void *from, void *to) +{ + unsigned long start = (unsigned long)from; + unsigned long end = (unsigned long)to; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + unsigned long addr, next; + int err = 0; + + BUG_ON(start > end); + if (start < PAGE_OFFSET) + return -EINVAL; + + mutex_lock(&kvm_hyp_pgd_mutex); + for (addr = start; addr < end; addr = next) { + pgd = hyp_pgd + pgd_index(addr); + pud = pud_offset(pgd, addr); + + if (pud_none_or_clear_bad(pud)) { + pmd = pmd_alloc_one(NULL, addr); + if (!pmd) { + kvm_err("Cannot allocate Hyp pmd\n"); + err = -ENOMEM; + goto out; + } + pud_populate(NULL, pud, pmd); + } + + next = pgd_addr_end(addr, end); + err = create_hyp_pmd_mappings(pud, addr, next); + if (err) + goto out; + } +out: + mutex_unlock(&kvm_hyp_pgd_mutex); + return err; +} + +/** + * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation. + * @kvm: The KVM struct pointer for the VM. + * + * Allocates the 1st level table only of size defined by PGD2_ORDER (can + * support either full 40-bit input addresses or limited to 32-bit input + * addresses). Clears the allocated pages. + * + * Note we don't need locking here as this is only called when the VM is + * created, which can only be done once. + */ +int kvm_alloc_stage2_pgd(struct kvm *kvm) +{ + pgd_t *pgd; + + if (kvm->arch.pgd != NULL) { + kvm_err("kvm_arch already initialized?\n"); + return -EINVAL; + } + + pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, PGD2_ORDER); + if (!pgd) + return -ENOMEM; + + memset(pgd, 0, PTRS_PER_PGD2 * sizeof(pgd_t)); + kvm->arch.pgd = pgd; + + return 0; +} + +static void free_guest_pages(pte_t *pte, unsigned long addr) +{ + unsigned int i; + struct page *page; + + for (i = 0; i < PTRS_PER_PTE; i++) { + if (pte_present(*pte)) { + page = pfn_to_page(pte_pfn(*pte)); + put_page(page); + } + pte++; + } +} + +static void free_stage2_ptes(pmd_t *pmd, unsigned long addr) +{ + unsigned int i; + pte_t *pte; + struct page *page; + + for (i = 0; i < PTRS_PER_PMD; i++, addr += PMD_SIZE) { + BUG_ON(pmd_sect(*pmd)); + if (!pmd_none(*pmd) && pmd_table(*pmd)) { + pte = pte_offset_kernel(pmd, addr); + free_guest_pages(pte, addr); + page = virt_to_page((void *)pte); + WARN_ON(atomic_read(&page->_count) != 1); + pte_free_kernel(NULL, pte); + } + pmd++; + } +} + +/** + * kvm_free_stage2_pgd - free all stage-2 tables + * @kvm: The KVM struct pointer for the VM. + * + * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all + * underlying level-2 and level-3 tables before freeing the actual level-1 table + * and setting the struct pointer to NULL. + * + * Note we don't need locking here as this is only called when the VM is + * destroyed, which can only be done once. + */ +void kvm_free_stage2_pgd(struct kvm *kvm) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + unsigned long long i, addr; + + if (kvm->arch.pgd == NULL) + return; + + /* + * We do this slightly different than other places, since we need more + * than 32 bits and for instance pgd_addr_end converts to unsigned long. + */ + addr = 0; + for (i = 0; i < PTRS_PER_PGD2; i++) { + addr = i * (unsigned long long)PGDIR_SIZE; + pgd = kvm->arch.pgd + i; + pud = pud_offset(pgd, addr); + + if (pud_none(*pud)) + continue; + + BUG_ON(pud_bad(*pud)); + + pmd = pmd_offset(pud, addr); + free_stage2_ptes(pmd, addr); + pmd_free(NULL, pmd); + } + + free_pages((unsigned long)kvm->arch.pgd, PGD2_ORDER); + kvm->arch.pgd = NULL; +} + +static const pte_t null_pte; + +static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr, + const pte_t *new_pte) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + /* Create 2nd stage page table mapping - Level 1 */ + pgd = kvm->arch.pgd + pgd_index(addr); + pud = pud_offset(pgd, addr); + if (pud_none(*pud)) { + BUG_ON(new_pte == &null_pte); + pmd = pmd_alloc_one(NULL, addr); + if (!pmd) { + kvm_err("Cannot allocate 2nd stage pmd\n"); + return -ENOMEM; + } + pud_populate(NULL, pud, pmd); + pmd += pmd_index(addr); + } else + pmd = pmd_offset(pud, addr); + + /* Create 2nd stage page table mapping - Level 2 */ + if (pmd_none(*pmd)) { + BUG_ON(new_pte == &null_pte); + pte = pte_alloc_one_kernel(NULL, addr); + if (!pte) { + kvm_err("Cannot allocate 2nd stage pte\n"); + return -ENOMEM; + } + pmd_populate_kernel(NULL, pmd, pte); + pte += pte_index(addr); + } else + pte = pte_offset_kernel(pmd, addr); + + /* Create 2nd stage page table mapping - Level 3 */ + set_pte_ext(pte, *new_pte, 0); + + return 0; +} + +static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + gfn_t gfn, struct kvm_memory_slot *memslot, + bool is_iabt) +{ + pte_t new_pte; + pfn_t pfn; + int ret; + bool write_fault, writable; + + /* TODO: Use instr. decoding for non-ISV to determine r/w fault */ + if (is_iabt) + write_fault = false; + else if ((vcpu->arch.hsr & HSR_ISV) && !(vcpu->arch.hsr & HSR_WNR)) + write_fault = false; + else + write_fault = true; + + if ((vcpu->arch.hsr & HSR_FSC_TYPE) == FSC_PERM && !write_fault) { + kvm_err("Unexpected L2 read permission error\n"); + return -EFAULT; + } + + /* preemption disabled for handle_exit, gfn_to_pfn may sleep */ + preempt_enable(); + pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable); + preempt_disable(); + + if (is_error_pfn(pfn)) { + put_page(pfn_to_page(pfn)); + kvm_err("No host mapping: gfn %u (0x%08x)\n", + (unsigned int)gfn, + (unsigned int)gfn << PAGE_SHIFT); + return -EFAULT; + } + + mutex_lock(&vcpu->kvm->arch.pgd_mutex); + new_pte = pfn_pte(pfn, PAGE_KVM_GUEST); + if (writable) + new_pte |= L_PTE2_WRITE; + ret = stage2_set_pte(vcpu->kvm, fault_ipa, &new_pte); + if (ret) + put_page(pfn_to_page(pfn)); + mutex_unlock(&vcpu->kvm->arch.pgd_mutex); + + return ret; +} + +/** + * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation + * @vcpu: The VCPU pointer + * @run: The VCPU run struct containing the mmio data + * + * This should only be called after returning from userspace for MMIO load + * emulation. + */ +int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + int *dest; + unsigned int len; + int mask; + + if (!run->mmio.is_write) { + dest = vcpu_reg(vcpu, vcpu->arch.mmio_rd); + memset(dest, 0, sizeof(int)); + + len = run->mmio.len; + if (len > 4) + return -EINVAL; + + memcpy(dest, run->mmio.data, len); + + trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, + *((u64 *)run->mmio.data)); + + if (vcpu->arch.mmio_sign_extend && len < 4) { + mask = 1U << ((len * 8) - 1); + *dest = (*dest ^ mask) - mask; + } + } + + return 0; +} + +/** + * invalid_io_mem_abort -- Handle I/O aborts ISV bit is clear + * + * @vcpu: The vcpu pointer + * @fault_ipa: The IPA that caused the 2nd stage fault + * + * Some load/store instructions cannot be emulated using the information + * presented in the HSR, for instance, register write-back instructions are not + * supported. We therefore need to fetch the instruction, decode it, and then + * emulate its behavior. + */ +static int invalid_io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) +{ + unsigned long instr; + phys_addr_t pc_ipa; + + if (vcpu->arch.pc_ipa & 1) { + kvm_err("I/O Abort from invalid instruction address? Wrong!\n"); + return -EINVAL; + } + + if (vcpu->arch.pc_ipa & (1U << 11)) { + /* LPAE PAR format */ + /* TODO: Check if this ever happens - called from Hyp mode */ + pc_ipa = vcpu->arch.pc_ipa & PAGE_MASK & ((1ULL << 32) - 1); + } else { + /* VMSAv7 PAR format */ + pc_ipa = vcpu->arch.pc_ipa & PAGE_MASK & ((1ULL << 40) - 1); + } + pc_ipa += vcpu->arch.regs.pc & ~PAGE_MASK; + + if (vcpu->arch.regs.cpsr & PSR_T_BIT) { + /* TODO: Check validity of PC IPA and IPA2!!! */ + /* Need to decode thumb instructions as well */ + kvm_err("Thumb guest support not there yet :(\n"); + return -EINVAL; + } + + if (kvm_read_guest(vcpu->kvm, pc_ipa, &instr, sizeof(instr))) { + kvm_err("Could not copy guest instruction\n"); + return -EFAULT; + } + + return kvm_emulate_mmio_ls(vcpu, fault_ipa, instr); +} + +static int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, + phys_addr_t fault_ipa, struct kvm_memory_slot *memslot) +{ + unsigned long rd, len, instr_len; + bool is_write, sign_extend; + + if (!(vcpu->arch.hsr & HSR_ISV)) + return invalid_io_mem_abort(vcpu, fault_ipa); + + if (((vcpu->arch.hsr >> 8) & 1)) { + kvm_err("Not supported, Cache operation on I/O addr.\n"); + return -EFAULT; + } + + if ((vcpu->arch.hsr >> 7) & 1) { + kvm_err("Translation table accesses I/O memory\n"); + return -EFAULT; + } + + switch ((vcpu->arch.hsr >> 22) & 0x3) { + case 0: + len = 1; + break; + case 1: + len = 2; + break; + case 2: + len = 4; + break; + default: + kvm_err("Invalid I/O abort\n"); + return -EFAULT; + } + + is_write = vcpu->arch.hsr & HSR_WNR; + sign_extend = vcpu->arch.hsr & HSR_SSE; + rd = (vcpu->arch.hsr & HSR_SRT_MASK) >> HSR_SRT_SHIFT; + BUG_ON(rd > 15); + + if (rd == 15) { + kvm_err("I/O memory trying to read/write pc\n"); + return -EFAULT; + } + + /* Get instruction length in bytes */ + instr_len = (vcpu->arch.hsr & HSR_IL) ? 4 : 2; + + /* Export MMIO operations to user space */ + run->mmio.is_write = is_write; + run->mmio.phys_addr = fault_ipa; + run->mmio.len = len; + vcpu->arch.mmio_sign_extend = sign_extend; + vcpu->arch.mmio_rd = rd; + + trace_kvm_mmio((is_write) ? KVM_TRACE_MMIO_WRITE : + KVM_TRACE_MMIO_READ_UNSATISFIED, + len, fault_ipa, (is_write) ? *vcpu_reg(vcpu, rd) : 0); + + if (is_write) + memcpy(run->mmio.data, vcpu_reg(vcpu, rd), len); + + /* + * The MMIO instruction is emulated and should not be re-executed + * in the guest. + */ + *vcpu_pc(vcpu) += instr_len; + kvm_adjust_itstate(vcpu); + vcpu->stat.mmio_exits++; + return KVM_EXIT_MMIO; +} + +/** + * kvm_handle_guest_abort - handles all 2nd stage aborts + * @vcpu: the VCPU pointer + * @run: the kvm_run structure + * + * Any abort that gets to the host is almost guaranteed to be caused by a + * missing second stage translation table entry, which can mean that either the + * guest simply needs more memory and we must allocate an appropriate page or it + * can mean that the guest tried to access I/O memory, which is emulated by user + * space. The distinction is based on the IPA causing the fault and whether this + * memory region has been registered as standard RAM by user space. + */ +int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + unsigned long hsr_ec; + unsigned long fault_status; + phys_addr_t fault_ipa; + struct kvm_memory_slot *memslot = NULL; + bool is_iabt; + gfn_t gfn; + + hsr_ec = vcpu->arch.hsr >> HSR_EC_SHIFT; + is_iabt = (hsr_ec == HSR_EC_IABT); + + /* Check that the second stage fault is a translation fault */ + fault_status = (vcpu->arch.hsr & HSR_FSC_TYPE); + if (fault_status != FSC_FAULT && fault_status != FSC_PERM) { + kvm_err("Unsupported fault status: EC=%#lx DFCS=%#lx\n", + hsr_ec, fault_status); + return -EFAULT; + } + + fault_ipa = ((phys_addr_t)vcpu->arch.hpfar & HPFAR_MASK) << 8; + + gfn = fault_ipa >> PAGE_SHIFT; + if (!kvm_is_visible_gfn(vcpu->kvm, gfn)) { + if (is_iabt) { + kvm_err("Inst. abort on I/O address %08lx\n", + (unsigned long)fault_ipa); + return -EFAULT; + } + + /* Adjust page offset */ + fault_ipa |= vcpu->arch.hdfar & ~PAGE_MASK; + return io_mem_abort(vcpu, run, fault_ipa, memslot); + } + + memslot = gfn_to_memslot(vcpu->kvm, gfn); + if (!memslot->user_alloc) { + kvm_err("non user-alloc memslots not supported\n"); + return -EINVAL; + } + + return user_mem_abort(vcpu, fault_ipa, gfn, memslot, is_iabt); +} + +static bool hva_to_gpa(struct kvm *kvm, unsigned long hva, gpa_t *gpa) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + bool found = false; + + mutex_lock(&kvm->slots_lock); + slots = kvm_memslots(kvm); + + /* we only care about the pages that the guest sees */ + kvm_for_each_memslot(memslot, slots) { + unsigned long start = memslot->userspace_addr; + unsigned long end; + + end = start + (memslot->npages << PAGE_SHIFT); + if (hva >= start && hva < end) { + gpa_t gpa_offset = hva - start; + *gpa = (memslot->base_gfn << PAGE_SHIFT) + gpa_offset; + found = true; + /* no overlapping memslots allowed: break */ + break; + } + } + + mutex_unlock(&kvm->slots_lock); + return found; +} + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + bool found; + gpa_t gpa; + + if (!kvm->arch.pgd) + return 0; + + mutex_lock(&kvm->arch.pgd_mutex); + found = hva_to_gpa(kvm, hva, &gpa); + if (found) { + stage2_set_pte(kvm, gpa, &null_pte); + __kvm_tlb_flush_vmid(kvm); + } + mutex_unlock(&kvm->arch.pgd_mutex); + return 0; +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + gpa_t gpa; + bool found; + + if (!kvm->arch.pgd) + return; + + mutex_lock(&kvm->arch.pgd_mutex); + found = hva_to_gpa(kvm, hva, &gpa); + if (found) { + stage2_set_pte(kvm, gpa, &pte); + /* + * Ignore return code from stage2_set_pte, since -ENOMEM would + * indicate this IPA is is not mapped and there is no harm + * that the PTE changed. + */ + __kvm_tlb_flush_vmid(kvm); + } + mutex_unlock(&kvm->arch.pgd_mutex); +} diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c new file mode 100644 index 0000000000000..78488be1c4a04 --- /dev/null +++ b/arch/arm/kvm/reset.c @@ -0,0 +1,133 @@ +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall <c.dall@virtualopensystems.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include <linux/compiler.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/kvm_host.h> +#include <linux/kvm.h> + +#include <asm/unified.h> +#include <asm/ptrace.h> +#include <asm/cputype.h> +#include <asm/kvm_arm.h> + +#define CT_ASSERT(expr, name) extern char name[(expr) ? 1 : -1] +#define CP15_REGS_ASSERT(_array, _name) \ + CT_ASSERT((sizeof(_array) / sizeof(_array[0])) == nr_cp15_regs, _name) +#define UNKNOWN 0xdecafbad + +/****************************************************************************** + * Cortex-A15 Register Reset Values + */ + +static const int a15_max_cpu_idx = 3; + +static struct kvm_vcpu_regs a15_regs_reset = { + .cpsr = SVC_MODE | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT, +}; + +static u32 a15_cp15_regs_reset[][2] = { + { c0_MIDR, 0x412FC0F0 }, + { c0_MPIDR, 0x00000000 }, /* see kvm_arch_vcpu_init */ + { c1_SCTLR, 0x00C50078 }, + { c1_ACTLR, 0x00000000 }, + { c1_CPACR, 0x00000000 }, + { c2_TTBR0, UNKNOWN }, + { c2_TTBR0_high, UNKNOWN }, + { c2_TTBR1, UNKNOWN }, + { c2_TTBR1_high, UNKNOWN }, + { c2_TTBCR, 0x00000000 }, + { c3_DACR, UNKNOWN }, + { c5_DFSR, UNKNOWN }, + { c5_IFSR, UNKNOWN }, + { c5_ADFSR, UNKNOWN }, + { c5_AIFSR, UNKNOWN }, + { c6_DFAR, UNKNOWN }, + { c6_IFAR, UNKNOWN }, + { c10_PRRR, 0x00098AA4 }, + { c10_NMRR, 0x44E048E0 }, + { c12_VBAR, 0x00000000 }, + { c13_CID, 0x00000000 }, + { c13_TID_URW, UNKNOWN }, + { c13_TID_URO, UNKNOWN }, + { c13_TID_PRIV, UNKNOWN }, +}; +CP15_REGS_ASSERT(a15_cp15_regs_reset, a15_cp15_regs_reset_init); + +static void a15_reset_vcpu(struct kvm_vcpu *vcpu) +{ + /* + * Compute guest MPIDR: + * (Even if we present only one VCPU to the guest on an SMP + * host we don't set the U bit in the MPIDR, or vice versa, as + * revealing the underlying hardware properties is likely to + * be the best choice). + */ + vcpu->arch.cp15[c0_MPIDR] = (read_cpuid_mpidr() & ~MPIDR_CPUID) + | (vcpu->vcpu_id & MPIDR_CPUID); +} + + +/******************************************************************************* + * Exported reset function + */ + +/** + * kvm_reset_vcpu - sets core registers and cp15 registers to reset value + * @vcpu: The VCPU pointer + * + * This function finds the right table above and sets the registers on the + * virtual CPU struct to their architectually defined reset values. + */ +int kvm_reset_vcpu(struct kvm_vcpu *vcpu) +{ + unsigned int i; + struct kvm_vcpu_regs *cpu_reset; + u32 (*cp15_reset)[2]; + void (*cpu_reset_vcpu)(struct kvm_vcpu *vcpu); + + switch (kvm_target_cpu()) { + case CORTEX_A15: + if (vcpu->vcpu_id > a15_max_cpu_idx) + return -EINVAL; + cpu_reset = &a15_regs_reset; + cp15_reset = a15_cp15_regs_reset; + cpu_reset_vcpu = a15_reset_vcpu; + break; + default: + return -ENODEV; + } + + /* Reset core registers */ + memcpy(&vcpu->arch.regs, cpu_reset, sizeof(vcpu->arch.regs)); + + /* Reset CP15 registers */ + for (i = 0; i < nr_cp15_regs; i++) { + if (cp15_reset[i][0] != i) { + kvm_err("CP15 field %d is %d, expected %d\n", + i, cp15_reset[i][0], i); + return -ENXIO; + } + vcpu->arch.cp15[i] = cp15_reset[i][1]; + } + + /* Physical CPU specific runtime reset operations */ + cpu_reset_vcpu(vcpu); + + return 0; +} diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h new file mode 100644 index 0000000000000..28ed1a18b289e --- /dev/null +++ b/arch/arm/kvm/trace.h @@ -0,0 +1,117 @@ +#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_H + +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm + +/* + * Tracepoints for entry/exit to guest + */ +TRACE_EVENT(kvm_entry, + TP_PROTO(unsigned long vcpu_pc), + TP_ARGS(vcpu_pc), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + ), + + TP_printk("PC: 0x%08lx", __entry->vcpu_pc) +); + +TRACE_EVENT(kvm_exit, + TP_PROTO(unsigned long vcpu_pc), + TP_ARGS(vcpu_pc), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + ), + + TP_printk("PC: 0x%08lx", __entry->vcpu_pc) +); + +TRACE_EVENT(kvm_mmio_emulate, + TP_PROTO(unsigned long vcpu_pc, unsigned long instr, + unsigned long cpsr), + TP_ARGS(vcpu_pc, instr, cpsr), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + __field( unsigned long, instr ) + __field( unsigned long, cpsr ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->instr = instr; + __entry->cpsr = cpsr; + ), + + TP_printk("Emulate MMIO at: 0x%08lx (instr: %08lx, cpsr: %08lx)", + __entry->vcpu_pc, __entry->instr, __entry->cpsr) +); + +/* Architecturally implementation defined CP15 register access */ +TRACE_EVENT(kvm_emulate_cp15_imp, + TP_PROTO(unsigned long Op1, unsigned long Rt1, unsigned long CRn, + unsigned long CRm, unsigned long Op2, bool is_write), + TP_ARGS(Op1, Rt1, CRn, CRm, Op2, is_write), + + TP_STRUCT__entry( + __field( unsigned int, Op1 ) + __field( unsigned int, Rt1 ) + __field( unsigned int, CRn ) + __field( unsigned int, CRm ) + __field( unsigned int, Op2 ) + __field( bool, is_write ) + ), + + TP_fast_assign( + __entry->is_write = is_write; + __entry->Op1 = Op1; + __entry->Rt1 = Rt1; + __entry->CRn = CRn; + __entry->CRm = CRm; + __entry->Op2 = Op2; + ), + + TP_printk("Implementation defined CP15: %s\tp15, %u, r%u, c%u, c%u, %u", + (__entry->is_write) ? "mcr" : "mrc", + __entry->Op1, __entry->Rt1, __entry->CRn, + __entry->CRm, __entry->Op2) +); + +TRACE_EVENT(kvm_wfi, + TP_PROTO(unsigned long vcpu_pc), + TP_ARGS(vcpu_pc), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + ), + + TP_printk("guest executed wfi at: 0x%08lx", __entry->vcpu_pc) +); + + +#endif /* _TRACE_KVM_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH arch/arm/kvm +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index 101b9681c08c2..037dc53e00905 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -597,6 +597,16 @@ config ARM_LPAE If unsure, say N. +config ARM_VIRT_EXT + bool "Support for ARM Virtualization Extensions" + depends on ARM_LPAE + help + Say Y if you have an ARMv7 processor supporting the ARM hardware + Virtualization extensions. KVM depends on this feature and will + not run without it being selected. If you say Y here, the kernel + will not boot on a machine without virtualization extensions and + will not boot as a KVM guest. + config ARCH_PHYS_ADDR_T_64BIT def_bool ARM_LPAE diff --git a/arch/arm/mm/idmap.c b/arch/arm/mm/idmap.c index ab88ed4f8e08f..7a944af10dbda 100644 --- a/arch/arm/mm/idmap.c +++ b/arch/arm/mm/idmap.c @@ -1,4 +1,6 @@ +#include <linux/module.h> #include <linux/kernel.h> +#include <linux/slab.h> #include <asm/cputype.h> #include <asm/idmap.h> @@ -59,11 +61,20 @@ static void idmap_add_pud(pgd_t *pgd, unsigned long addr, unsigned long end, } while (pud++, addr = next, addr != end); } -static void identity_mapping_add(pgd_t *pgd, unsigned long addr, unsigned long end) +static void identity_mapping_add(pgd_t *pgd, const char *text_start, + const char *text_end, unsigned long prot) { - unsigned long prot, next; + unsigned long addr, end; + unsigned long next; + + addr = virt_to_phys(text_start); + end = virt_to_phys(text_end); + + pr_info("Setting up static %sidentity map for 0x%llx - 0x%llx\n", + prot ? "HYP " : "", + (long long)addr, (long long)end); + prot |= PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AF; - prot = PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AF; if (cpu_architecture() <= CPU_ARCH_ARMv5TEJ && !cpu_is_xscale()) prot |= PMD_BIT4; @@ -78,24 +89,77 @@ extern char __idmap_text_start[], __idmap_text_end[]; static int __init init_static_idmap(void) { - phys_addr_t idmap_start, idmap_end; - idmap_pgd = pgd_alloc(&init_mm); if (!idmap_pgd) return -ENOMEM; - /* Add an identity mapping for the physical address of the section. */ - idmap_start = virt_to_phys((void *)__idmap_text_start); - idmap_end = virt_to_phys((void *)__idmap_text_end); - - pr_info("Setting up static identity map for 0x%llx - 0x%llx\n", - (long long)idmap_start, (long long)idmap_end); - identity_mapping_add(idmap_pgd, idmap_start, idmap_end); + identity_mapping_add(idmap_pgd, __idmap_text_start, + __idmap_text_end, 0); return 0; } early_initcall(init_static_idmap); +#ifdef CONFIG_ARM_VIRT_EXT +pgd_t *hyp_pgd; +EXPORT_SYMBOL_GPL(hyp_pgd); + +static void hyp_idmap_del_pmd(pgd_t *pgd, unsigned long addr) +{ + pud_t *pud; + pmd_t *pmd; + + pud = pud_offset(pgd, addr); + pmd = pmd_offset(pud, addr); + pud_clear(pud); + clean_pmd_entry(pmd); + pmd_free(NULL, (pmd_t *)((unsigned long)pmd & PAGE_MASK)); +} + +extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[]; + +/* + * This version actually frees the underlying pmds for all pgds in range and + * clear the pgds themselves afterwards. + */ +void hyp_idmap_teardown(void) +{ + unsigned long addr, end; + unsigned long next; + pgd_t *pgd = hyp_pgd; + + addr = virt_to_phys(__hyp_idmap_text_start); + end = virt_to_phys(__hyp_idmap_text_end); + + pgd += pgd_index(addr); + do { + next = pgd_addr_end(addr, end); + if (!pgd_none_or_clear_bad(pgd)) + hyp_idmap_del_pmd(pgd, addr); + } while (pgd++, addr = next, addr < end); +} +EXPORT_SYMBOL_GPL(hyp_idmap_teardown); + +void hyp_idmap_setup(void) +{ + identity_mapping_add(hyp_pgd, __hyp_idmap_text_start, + __hyp_idmap_text_end, PMD_SECT_AP1); +} +EXPORT_SYMBOL_GPL(hyp_idmap_setup); + +static int __init hyp_init_static_idmap(void) +{ + hyp_pgd = kzalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL); + if (!hyp_pgd) + return -ENOMEM; + + hyp_idmap_setup(); + + return 0; +} +early_initcall(hyp_init_static_idmap); +#endif + /* * In order to soft-boot, we need to switch to a 1:1 mapping for the * cpu_reset functions. This will then ensure that we have predictable diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index e5dad60b558b4..7dd4b5463f32d 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -56,9 +56,11 @@ static unsigned int cachepolicy __initdata = CPOLICY_WRITEBACK; static unsigned int ecc_mask __initdata = 0; pgprot_t pgprot_user; pgprot_t pgprot_kernel; +pgprot_t pgprot_guest; EXPORT_SYMBOL(pgprot_user); EXPORT_SYMBOL(pgprot_kernel); +EXPORT_SYMBOL(pgprot_guest); struct cachepolicy { const char policy[16]; @@ -301,6 +303,12 @@ const struct mem_type *get_mem_type(unsigned int type) } EXPORT_SYMBOL(get_mem_type); +pteval_t get_mem_type_prot_pte(unsigned int type) +{ + return get_mem_type(type)->prot_pte; +} +EXPORT_SYMBOL(get_mem_type_prot_pte); + /* * Adjust the PMD section entries according to the CPU in use. */ @@ -514,6 +522,7 @@ static void __init build_mem_type_table(void) pgprot_user = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | user_pgprot); pgprot_kernel = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY | kern_pgprot); + pgprot_guest = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG); mem_types[MT_LOW_VECTORS].prot_l1 |= ecc_mask; mem_types[MT_HIGH_VECTORS].prot_l1 |= ecc_mask; diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h index b9f82c84f0932..ec6c6b3012389 100644 --- a/arch/ia64/include/asm/kvm.h +++ b/arch/ia64/include/asm/kvm.h @@ -26,6 +26,7 @@ /* Select x86 specific features in <linux/kvm.h> */ #define __KVM_HAVE_IOAPIC +#define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_DEVICE_ASSIGNMENT /* Architectural interrupt line count. */ diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig index 9806e55f91bee..df5351e3eed73 100644 --- a/arch/ia64/kvm/Kconfig +++ b/arch/ia64/kvm/Kconfig @@ -19,6 +19,7 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" + depends on BROKEN depends on HAVE_KVM && MODULES && EXPERIMENTAL # for device assignment: depends on PCI diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index bd77cb507c1c7..122a4b2a714af 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -924,6 +924,16 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return 0; } +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event) +{ + if (!irqchip_in_kernel(kvm)) + return -ENXIO; + + irq_event->statusstatus = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event->irq, irq_event->level); + return 0; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -963,29 +973,6 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; } break; - case KVM_IRQ_LINE_STATUS: - case KVM_IRQ_LINE: { - struct kvm_irq_level irq_event; - - r = -EFAULT; - if (copy_from_user(&irq_event, argp, sizeof irq_event)) - goto out; - r = -ENXIO; - if (irqchip_in_kernel(kvm)) { - __s32 status; - status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, - irq_event.irq, irq_event.level); - if (ioctl == KVM_IRQ_LINE_STATUS) { - r = -EFAULT; - irq_event.status = status; - if (copy_to_user(argp, &irq_event, - sizeof irq_event)) - goto out; - } - r = 0; - } - break; - } case KVM_GET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ struct kvm_irqchip chip; diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h index 976835d8f22e2..bf2c06c338719 100644 --- a/arch/powerpc/include/asm/epapr_hcalls.h +++ b/arch/powerpc/include/asm/epapr_hcalls.h @@ -153,6 +153,8 @@ #define EV_HCALL_CLOBBERS2 EV_HCALL_CLOBBERS3, "r5" #define EV_HCALL_CLOBBERS1 EV_HCALL_CLOBBERS2, "r4" +extern bool epapr_paravirt_enabled; +extern u32 epapr_hypercall_start[]; /* * We use "uintptr_t" to define a register because it's guaranteed to be a diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index b0c08b142770d..0dd1d86d3e316 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -36,11 +36,8 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu) #define SPAPR_TCE_SHIFT 12 #ifdef CONFIG_KVM_BOOK3S_64_HV -/* For now use fixed-size 16MB page table */ -#define HPT_ORDER 24 -#define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */ -#define HPT_NPTE (HPT_NPTEG << 3) /* 8 PTEs per PTEG */ -#define HPT_HASH_MASK (HPT_NPTEG - 1) +#define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ +extern int kvm_hpt_order; /* order of preallocated HPTs */ #endif #define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index d848cdc497152..50ea12fd7bf5e 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -237,6 +237,10 @@ struct kvm_arch { unsigned long vrma_slb_v; int rma_setup_done; int using_mmu_notifiers; + u32 hpt_order; + atomic_t vcpus_running; + unsigned long hpt_npte; + unsigned long hpt_mask; spinlock_t slot_phys_lock; unsigned long *slot_phys[KVM_MEM_SLOTS_NUM]; int slot_npages[KVM_MEM_SLOTS_NUM]; @@ -414,7 +418,9 @@ struct kvm_vcpu_arch { ulong mcsrr1; ulong mcsr; u32 dec; +#ifdef CONFIG_BOOKE u32 decar; +#endif u32 tbl; u32 tbu; u32 tcr; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index f68c22fa2fceb..0124937a23b97 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -119,7 +119,8 @@ extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu); extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu); extern void kvmppc_map_magic(struct kvm_vcpu *vcpu); -extern long kvmppc_alloc_hpt(struct kvm *kvm); +extern long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp); +extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp); extern void kvmppc_free_hpt(struct kvm *kvm); extern long kvmppc_prepare_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem); diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 83afacd3ba7b3..bb282dd816121 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -128,6 +128,7 @@ ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC),) obj-y += ppc_save_regs.o endif +obj-$(CONFIG_EPAPR_PARAVIRT) += epapr_paravirt.o epapr_hcalls.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o # Disable GCOV in odd or sensitive code diff --git a/arch/powerpc/kernel/epapr_hcalls.S b/arch/powerpc/kernel/epapr_hcalls.S new file mode 100644 index 0000000000000..697b390ebfd8a --- /dev/null +++ b/arch/powerpc/kernel/epapr_hcalls.S @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2012 Freescale Semiconductor, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/threads.h> +#include <asm/reg.h> +#include <asm/page.h> +#include <asm/cputable.h> +#include <asm/thread_info.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> + +/* Hypercall entry point. Will be patched with device tree instructions. */ +.global epapr_hypercall_start +epapr_hypercall_start: + li r3, -1 + nop + nop + nop + blr diff --git a/arch/powerpc/kernel/epapr_paravirt.c b/arch/powerpc/kernel/epapr_paravirt.c new file mode 100644 index 0000000000000..028aeae370b65 --- /dev/null +++ b/arch/powerpc/kernel/epapr_paravirt.c @@ -0,0 +1,52 @@ +/* + * ePAPR para-virtualization support. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright (C) 2012 Freescale Semiconductor, Inc. + */ + +#include <linux/of.h> +#include <asm/epapr_hcalls.h> +#include <asm/cacheflush.h> +#include <asm/code-patching.h> + +bool epapr_paravirt_enabled; + +static int __init epapr_paravirt_init(void) +{ + struct device_node *hyper_node; + const u32 *insts; + int len, i; + + hyper_node = of_find_node_by_path("/hypervisor"); + if (!hyper_node) + return -ENODEV; + + insts = of_get_property(hyper_node, "hcall-instructions", &len); + if (!insts) + return -ENODEV; + + if (len % 4 || len > (4 * 4)) + return -ENODEV; + + for (i = 0; i < (len / 4); i++) + patch_instruction(epapr_hypercall_start + i, insts[i]); + + epapr_paravirt_enabled = true; + + return 0; +} + +early_initcall(epapr_paravirt_init); diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index 62bdf23896698..1c13307e03088 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -31,6 +31,7 @@ #include <asm/cacheflush.h> #include <asm/disassemble.h> #include <asm/ppc-opcode.h> +#include <asm/epapr_hcalls.h> #define KVM_MAGIC_PAGE (-4096L) #define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x) @@ -726,7 +727,7 @@ unsigned long kvm_hypercall(unsigned long *in, unsigned long register r11 asm("r11") = nr; unsigned long register r12 asm("r12"); - asm volatile("bl kvm_hypercall_start" + asm volatile("bl epapr_hypercall_start" : "=r"(r0), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6), "=r"(r7), "=r"(r8), "=r"(r9), "=r"(r10), "=r"(r11), "=r"(r12) @@ -747,29 +748,6 @@ unsigned long kvm_hypercall(unsigned long *in, } EXPORT_SYMBOL_GPL(kvm_hypercall); -static int kvm_para_setup(void) -{ - extern u32 kvm_hypercall_start; - struct device_node *hyper_node; - u32 *insts; - int len, i; - - hyper_node = of_find_node_by_path("/hypervisor"); - if (!hyper_node) - return -1; - - insts = (u32*)of_get_property(hyper_node, "hcall-instructions", &len); - if (len % 4) - return -1; - if (len > (4 * 4)) - return -1; - - for (i = 0; i < (len / 4); i++) - kvm_patch_ins(&(&kvm_hypercall_start)[i], insts[i]); - - return 0; -} - static __init void kvm_free_tmp(void) { unsigned long start, end; @@ -791,7 +769,7 @@ static int __init kvm_guest_init(void) if (!kvm_para_available()) goto free_tmp; - if (kvm_para_setup()) + if (!epapr_paravirt_enabled) goto free_tmp; if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE)) diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S index e291cf3cf954b..e100ff324a85c 100644 --- a/arch/powerpc/kernel/kvm_emul.S +++ b/arch/powerpc/kernel/kvm_emul.S @@ -24,16 +24,6 @@ #include <asm/page.h> #include <asm/asm-offsets.h> -/* Hypercall entry point. Will be patched with device tree instructions. */ - -.global kvm_hypercall_start -kvm_hypercall_start: - li r3, -1 - nop - nop - nop - blr - #define KVM_MAGIC_PAGE (-4096) #ifdef CONFIG_64BIT @@ -132,7 +122,7 @@ kvm_emulate_mtmsrd_len: .long (kvm_emulate_mtmsrd_end - kvm_emulate_mtmsrd) / 4 -#define MSR_SAFE_BITS (MSR_EE | MSR_CE | MSR_ME | MSR_RI) +#define MSR_SAFE_BITS (MSR_EE | MSR_RI) #define MSR_CRITICAL_BITS ~MSR_SAFE_BITS .global kvm_emulate_mtmsr diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 80a5775175844..d03eb6f7b0584 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -37,56 +37,121 @@ /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ #define MAX_LPID_970 63 -long kvmppc_alloc_hpt(struct kvm *kvm) +/* Power architecture requires HPT is at least 256kB */ +#define PPC_MIN_HPT_ORDER 18 + +long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) { unsigned long hpt; - long lpid; struct revmap_entry *rev; struct kvmppc_linear_info *li; + long order = kvm_hpt_order; - /* Allocate guest's hashed page table */ - li = kvm_alloc_hpt(); - if (li) { - /* using preallocated memory */ - hpt = (ulong)li->base_virt; - kvm->arch.hpt_li = li; - } else { - /* using dynamic memory */ + if (htab_orderp) { + order = *htab_orderp; + if (order < PPC_MIN_HPT_ORDER) + order = PPC_MIN_HPT_ORDER; + } + + /* + * If the user wants a different size from default, + * try first to allocate it from the kernel page allocator. + */ + hpt = 0; + if (order != kvm_hpt_order) { hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| - __GFP_NOWARN, HPT_ORDER - PAGE_SHIFT); + __GFP_NOWARN, order - PAGE_SHIFT); + if (!hpt) + --order; } + /* Next try to allocate from the preallocated pool */ if (!hpt) { - pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n"); - return -ENOMEM; + li = kvm_alloc_hpt(); + if (li) { + hpt = (ulong)li->base_virt; + kvm->arch.hpt_li = li; + order = kvm_hpt_order; + } } + + /* Lastly try successively smaller sizes from the page allocator */ + while (!hpt && order > PPC_MIN_HPT_ORDER) { + hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| + __GFP_NOWARN, order - PAGE_SHIFT); + if (!hpt) + --order; + } + + if (!hpt) + return -ENOMEM; + kvm->arch.hpt_virt = hpt; + kvm->arch.hpt_order = order; + /* HPTEs are 2**4 bytes long */ + kvm->arch.hpt_npte = 1ul << (order - 4); + /* 128 (2**7) bytes in each HPTEG */ + kvm->arch.hpt_mask = (1ul << (order - 7)) - 1; /* Allocate reverse map array */ - rev = vmalloc(sizeof(struct revmap_entry) * HPT_NPTE); + rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte); if (!rev) { pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n"); goto out_freehpt; } kvm->arch.revmap = rev; + kvm->arch.sdr1 = __pa(hpt) | (order - 18); - lpid = kvmppc_alloc_lpid(); - if (lpid < 0) - goto out_freeboth; + pr_info("KVM guest htab at %lx (order %ld), LPID %x\n", + hpt, order, kvm->arch.lpid); - kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18); - kvm->arch.lpid = lpid; - - pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid); + if (htab_orderp) + *htab_orderp = order; return 0; - out_freeboth: - vfree(rev); out_freehpt: - free_pages(hpt, HPT_ORDER - PAGE_SHIFT); + if (kvm->arch.hpt_li) + kvm_release_hpt(kvm->arch.hpt_li); + else + free_pages(hpt, order - PAGE_SHIFT); return -ENOMEM; } +long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) +{ + long err = -EBUSY; + long order; + + mutex_lock(&kvm->lock); + if (kvm->arch.rma_setup_done) { + kvm->arch.rma_setup_done = 0; + /* order rma_setup_done vs. vcpus_running */ + smp_mb(); + if (atomic_read(&kvm->arch.vcpus_running)) { + kvm->arch.rma_setup_done = 1; + goto out; + } + } + if (kvm->arch.hpt_virt) { + order = kvm->arch.hpt_order; + /* Set the entire HPT to 0, i.e. invalid HPTEs */ + memset((void *)kvm->arch.hpt_virt, 0, 1ul << order); + /* + * Set the whole last_vcpu array to an invalid vcpu number. + * This ensures that each vcpu will flush its TLB on next entry. + */ + memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu)); + *htab_orderp = order; + err = 0; + } else { + err = kvmppc_alloc_hpt(kvm, htab_orderp); + order = *htab_orderp; + } + out: + mutex_unlock(&kvm->lock); + return err; +} + void kvmppc_free_hpt(struct kvm *kvm) { kvmppc_free_lpid(kvm->arch.lpid); @@ -94,7 +159,8 @@ void kvmppc_free_hpt(struct kvm *kvm) if (kvm->arch.hpt_li) kvm_release_hpt(kvm->arch.hpt_li); else - free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT); + free_pages(kvm->arch.hpt_virt, + kvm->arch.hpt_order - PAGE_SHIFT); } /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ @@ -119,6 +185,7 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, unsigned long psize; unsigned long hp0, hp1; long ret; + struct kvm *kvm = vcpu->kvm; psize = 1ul << porder; npages = memslot->npages >> (porder - PAGE_SHIFT); @@ -127,8 +194,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, if (npages > 1ul << (40 - porder)) npages = 1ul << (40 - porder); /* Can't use more than 1 HPTE per HPTEG */ - if (npages > HPT_NPTEG) - npages = HPT_NPTEG; + if (npages > kvm->arch.hpt_mask + 1) + npages = kvm->arch.hpt_mask + 1; hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); @@ -138,7 +205,7 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, for (i = 0; i < npages; ++i) { addr = i << porder; /* can't use hpt_hash since va > 64 bits */ - hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK; + hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask; /* * We assume that the hash table is empty and no * vcpus are using it at this stage. Since we create diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 3abe1b86e5834..83e929e66f9d8 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -56,7 +56,7 @@ /* #define EXIT_DEBUG_INT */ static void kvmppc_end_cede(struct kvm_vcpu *vcpu); -static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu); +static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { @@ -1104,11 +1104,15 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) return -EINTR; } - /* On the first time here, set up VRMA or RMA */ + atomic_inc(&vcpu->kvm->arch.vcpus_running); + /* Order vcpus_running vs. rma_setup_done, see kvmppc_alloc_reset_hpt */ + smp_mb(); + + /* On the first time here, set up HTAB and VRMA or RMA */ if (!vcpu->kvm->arch.rma_setup_done) { - r = kvmppc_hv_setup_rma(vcpu); + r = kvmppc_hv_setup_htab_rma(vcpu); if (r) - return r; + goto out; } flush_fp_to_thread(current); @@ -1126,6 +1130,9 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) kvmppc_core_prepare_to_enter(vcpu); } } while (r == RESUME_GUEST); + + out: + atomic_dec(&vcpu->kvm->arch.vcpus_running); return r; } @@ -1341,7 +1348,7 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm, { } -static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu) +static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) { int err = 0; struct kvm *kvm = vcpu->kvm; @@ -1360,6 +1367,15 @@ static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu) if (kvm->arch.rma_setup_done) goto out; /* another vcpu beat us to it */ + /* Allocate hashed page table (if not done already) and reset it */ + if (!kvm->arch.hpt_virt) { + err = kvmppc_alloc_hpt(kvm, NULL); + if (err) { + pr_err("KVM: Couldn't alloc HPT\n"); + goto out; + } + } + /* Look up the memslot for guest physical address 0 */ memslot = gfn_to_memslot(kvm, 0); @@ -1471,13 +1487,14 @@ static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu) int kvmppc_core_init_vm(struct kvm *kvm) { - long r; - unsigned long lpcr; + unsigned long lpcr, lpid; - /* Allocate hashed page table */ - r = kvmppc_alloc_hpt(kvm); - if (r) - return r; + /* Allocate the guest's logical partition ID */ + + lpid = kvmppc_alloc_lpid(); + if (lpid < 0) + return -ENOMEM; + kvm->arch.lpid = lpid; INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); @@ -1487,7 +1504,6 @@ int kvmppc_core_init_vm(struct kvm *kvm) if (cpu_has_feature(CPU_FTR_ARCH_201)) { /* PPC970; HID4 is effectively the LPCR */ - unsigned long lpid = kvm->arch.lpid; kvm->arch.host_lpid = 0; kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4); lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH)); diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index e1b60f56f2a1e..fb4eac290fefc 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -25,6 +25,9 @@ static void __init kvm_linear_init_one(ulong size, int count, int type); static struct kvmppc_linear_info *kvm_alloc_linear(int type); static void kvm_release_linear(struct kvmppc_linear_info *ri); +int kvm_hpt_order = KVM_DEFAULT_HPT_ORDER; +EXPORT_SYMBOL_GPL(kvm_hpt_order); + /*************** RMA *************/ /* @@ -209,7 +212,7 @@ static void kvm_release_linear(struct kvmppc_linear_info *ri) void __init kvm_linear_init(void) { /* HPT */ - kvm_linear_init_one(1 << HPT_ORDER, kvm_hpt_count, KVM_LINEAR_HPT); + kvm_linear_init_one(1 << kvm_hpt_order, kvm_hpt_count, KVM_LINEAR_HPT); /* RMA */ /* Only do this on PPC970 in HV mode */ diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index cec4daddbf316..5c70d19494f92 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -237,7 +237,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, /* Find and lock the HPTEG slot to use */ do_insert: - if (pte_index >= HPT_NPTE) + if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; if (likely((flags & H_EXACT) == 0)) { pte_index &= ~7UL; @@ -352,7 +352,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long v, r, rb; struct revmap_entry *rev; - if (pte_index >= HPT_NPTE) + if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) @@ -419,7 +419,8 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) i = 4; break; } - if (req != 1 || flags == 3 || pte_index >= HPT_NPTE) { + if (req != 1 || flags == 3 || + pte_index >= kvm->arch.hpt_npte) { /* parameter error */ args[j] = ((0xa0 | flags) << 56) + pte_index; ret = H_PARAMETER; @@ -521,7 +522,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, struct revmap_entry *rev; unsigned long v, r, rb, mask, bits; - if (pte_index >= HPT_NPTE) + if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); @@ -583,7 +584,7 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, int i, n = 1; struct revmap_entry *rev = NULL; - if (pte_index >= HPT_NPTE) + if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; if (flags & H_READ_4) { pte_index &= ~3; @@ -678,7 +679,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, somask = (1UL << 28) - 1; vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT; } - hash = (vsid ^ ((eaddr & somask) >> pshift)) & HPT_HASH_MASK; + hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask; avpn = slb_v & ~(somask >> 16); /* also includes B */ avpn |= (eaddr & somask) >> 16; @@ -723,7 +724,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, if (val & HPTE_V_SECONDARY) break; val |= HPTE_V_SECONDARY; - hash = hash ^ HPT_HASH_MASK; + hash = hash ^ kvm->arch.hpt_mask; } return -1; } diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 72f13f4a06e0d..86681eec60b1e 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1267,6 +1267,11 @@ void kvmppc_decrementer_func(unsigned long data) { struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; + if (vcpu->arch.tcr & TCR_ARE) { + vcpu->arch.dec = vcpu->arch.decar; + kvmppc_emulate_dec(vcpu); + } + kvmppc_set_tsr_bits(vcpu, TSR_DIS); } diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index 6c76397f2af48..9eb9809eb13e2 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c @@ -129,6 +129,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) kvmppc_set_tcr(vcpu, spr_val); break; + case SPRN_DECAR: + vcpu->arch.decar = spr_val; + break; /* * Note: SPRG4-7 are user-readable. * These values are loaded into the real SPRGs when resuming the diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index 8b99e076dc818..e04b0ef55ce0e 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -269,6 +269,9 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) *spr_val = vcpu->arch.shared->mas7_3 >> 32; break; #endif + case SPRN_DECAR: + *spr_val = vcpu->arch.decar; + break; case SPRN_TLB0CFG: *spr_val = vcpu->arch.tlbcfg[0]; break; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 1493c8de947b9..87f4dc886076d 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -246,6 +246,7 @@ int kvm_dev_ioctl_check_extension(long ext) #endif #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CAP_SPAPR_TCE: + case KVM_CAP_PPC_ALLOC_HTAB: r = 1; break; #endif /* CONFIG_PPC_BOOK3S_64 */ @@ -802,6 +803,23 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; break; } + + case KVM_PPC_ALLOCATE_HTAB: { + struct kvm *kvm = filp->private_data; + u32 htab_order; + + r = -EFAULT; + if (get_user(htab_order, (u32 __user *)argp)) + break; + r = kvmppc_alloc_reset_hpt(kvm, &htab_order); + if (r) + break; + r = -EFAULT; + if (put_user(htab_order, (u32 __user *)argp)) + break; + r = 0; + break; + } #endif /* CONFIG_KVM_BOOK3S_64_HV */ #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index a35ca44ade66a..e7a896acd9827 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -25,6 +25,7 @@ source "arch/powerpc/platforms/wsp/Kconfig" config KVM_GUEST bool "KVM Guest support" default n + select EPAPR_PARAVIRT ---help--- This option enables various optimizations for running under the KVM hypervisor. Overhead for the kernel when not running inside KVM should @@ -32,6 +33,14 @@ config KVM_GUEST In case of doubt, say Y +config EPAPR_PARAVIRT + bool "ePAPR para-virtualization support" + default n + help + Enables ePAPR para-virtualization support for guests. + + In case of doubt, say Y + config PPC_NATIVE bool depends on 6xx || PPC64 diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index bf238c55740bc..173f07aaeb2b3 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -55,5 +55,7 @@ int sclp_chp_configure(struct chp_id chpid); int sclp_chp_deconfigure(struct chp_id chpid); int sclp_chp_read_info(struct sclp_chp_info *info); void sclp_get_ipl_info(struct sclp_ipl_info *info); +bool sclp_has_linemode(void); +bool sclp_has_vt220(void); #endif /* _ASM_S390_SCLP_H */ diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 489d1d8d96b06..e86bca6f975c5 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -63,6 +63,7 @@ #include <asm/kvm_virtio.h> #include <asm/diag.h> #include <asm/os_info.h> +#include <asm/sclp.h> #include "entry.h" long psw_kernel_bits = PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_ASC_PRIMARY | @@ -138,9 +139,14 @@ __setup("condev=", condev_setup); static void __init set_preferred_console(void) { - if (MACHINE_IS_KVM) - add_preferred_console("hvc", 0, NULL); - else if (CONSOLE_IS_3215 || CONSOLE_IS_SCLP) + if (MACHINE_IS_KVM) { + if (sclp_has_vt220()) + add_preferred_console("ttyS", 1, NULL); + else if (sclp_has_linemode()) + add_preferred_console("ttyS", 0, NULL); + else + add_preferred_console("hvc", 0, NULL); + } else if (CONSOLE_IS_3215 || CONSOLE_IS_SCLP) add_preferred_console("ttyS", 0, NULL); else if (CONSOLE_IS_3270) add_preferred_console("tty3270", 0, NULL); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 664766d0c83c6..ace93603d8610 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -347,6 +347,7 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) vcpu->arch.guest_fpregs.fpc = 0; asm volatile("lfpc %0" : : "Q" (vcpu->arch.guest_fpregs.fpc)); vcpu->arch.sie_block->gbea = 1; + atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); } int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index a6983b2772201..72f5009deb5a0 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -264,6 +264,13 @@ static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) * This operation is non-atomic and can be reordered. * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. + * + * Note: the operation is performed atomically with respect to + * the local CPU, but not other CPUs. Portable code should not + * rely on this behaviour. + * KVM relies on this behaviour on x86 for modifying memory that is also + * accessed from a hypervisor on the same CPU if running in a VM: don't change + * this without also updating arch/x86/kernel/kvm.c */ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) { diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index e7d1c194d2728..246617efd67f5 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -12,6 +12,7 @@ /* Select x86 specific features in <linux/kvm.h> */ #define __KVM_HAVE_PIT #define __KVM_HAVE_IOAPIC +#define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_DEVICE_ASSIGNMENT #define __KVM_HAVE_MSI #define __KVM_HAVE_USER_NMI diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index db7c1f2709a27..24b76474d9de0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -175,6 +175,13 @@ enum { /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 +/* + * The following bit is set with PV-EOI, unset on EOI. + * We detect PV-EOI changes by guest by comparing + * this bit with PV-EOI in guest memory. + * See the implementation in apic_update_pv_eoi. + */ +#define KVM_APIC_PV_EOI_PENDING 1 /* * We don't want allocation failures within the mmu code, so we preallocate @@ -484,6 +491,11 @@ struct kvm_vcpu_arch { u64 length; u64 status; } osvw; + + struct { + u64 msr_val; + struct gfn_to_hva_cache data; + } pv_eoi; }; struct kvm_lpage_info { diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 63ab1661d00eb..2f7712e08b1e8 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -22,6 +22,7 @@ #define KVM_FEATURE_CLOCKSOURCE2 3 #define KVM_FEATURE_ASYNC_PF 4 #define KVM_FEATURE_STEAL_TIME 5 +#define KVM_FEATURE_PV_EOI 6 /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. If no bits are set, all flags are ignored. @@ -37,6 +38,7 @@ #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 #define MSR_KVM_ASYNC_PF_EN 0x4b564d02 #define MSR_KVM_STEAL_TIME 0x4b564d03 +#define MSR_KVM_PV_EOI_EN 0x4b564d04 struct kvm_steal_time { __u64 steal; @@ -89,6 +91,11 @@ struct kvm_vcpu_pv_apf_data { __u32 enabled; }; +#define KVM_PV_EOI_BIT 0 +#define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT) +#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK +#define KVM_PV_EOI_DISABLED 0x0 + #ifdef __KERNEL__ #include <asm/processor.h> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 31f180c21ce91..de007c2727307 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -404,6 +404,7 @@ enum vmcs_field { #define VMX_EPTP_WB_BIT (1ull << 14) #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) #define VMX_EPT_1GB_PAGE_BIT (1ull << 17) +#define VMX_EPT_AD_BIT (1ull << 21) #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) @@ -415,11 +416,14 @@ enum vmcs_field { #define VMX_EPT_MAX_GAW 0x4 #define VMX_EPT_MT_EPTE_SHIFT 3 #define VMX_EPT_GAW_EPTP_SHIFT 3 +#define VMX_EPT_AD_ENABLE_BIT (1ull << 6) #define VMX_EPT_DEFAULT_MT 0x6ull #define VMX_EPT_READABLE_MASK 0x1ull #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull #define VMX_EPT_IPAT_BIT (1ull << 6) +#define VMX_EPT_ACCESS_BIT (1ull << 8) +#define VMX_EPT_DIRTY_BIT (1ull << 9) #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index e554e5ad2fe8b..75ab94c75c7a4 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -39,6 +39,8 @@ #include <asm/desc.h> #include <asm/tlbflush.h> #include <asm/idle.h> +#include <asm/apic.h> +#include <asm/apicdef.h> static int kvmapf = 1; @@ -283,6 +285,22 @@ static void kvm_register_steal_time(void) cpu, __pa(st)); } +static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; + +static void kvm_guest_apic_eoi_write(u32 reg, u32 val) +{ + /** + * This relies on __test_and_clear_bit to modify the memory + * in a way that is atomic with respect to the local CPU. + * The hypervisor only accesses this memory from the local CPU so + * there's no need for lock or memory barriers. + * An optimization barrier is implied in apic write. + */ + if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi))) + return; + apic->write(APIC_EOI, APIC_EOI_ACK); +} + void __cpuinit kvm_guest_cpu_init(void) { if (!kvm_para_available()) @@ -300,11 +318,20 @@ void __cpuinit kvm_guest_cpu_init(void) smp_processor_id()); } + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { + unsigned long pa; + /* Size alignment is implied but just to make it explicit. */ + BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); + __get_cpu_var(kvm_apic_eoi) = 0; + pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED; + wrmsrl(MSR_KVM_PV_EOI_EN, pa); + } + if (has_steal_clock) kvm_register_steal_time(); } -static void kvm_pv_disable_apf(void *unused) +static void kvm_pv_disable_apf(void) { if (!__get_cpu_var(apf_reason).enabled) return; @@ -316,11 +343,23 @@ static void kvm_pv_disable_apf(void *unused) smp_processor_id()); } +static void kvm_pv_guest_cpu_reboot(void *unused) +{ + /* + * We disable PV EOI before we load a new kernel by kexec, + * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory. + * New kernel can re-enable when it boots. + */ + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + wrmsrl(MSR_KVM_PV_EOI_EN, 0); + kvm_pv_disable_apf(); +} + static int kvm_pv_reboot_notify(struct notifier_block *nb, unsigned long code, void *unused) { if (code == SYS_RESTART) - on_each_cpu(kvm_pv_disable_apf, NULL, 1); + on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); return NOTIFY_DONE; } @@ -371,7 +410,9 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy) static void kvm_guest_cpu_offline(void *dummy) { kvm_disable_steal_time(); - kvm_pv_disable_apf(NULL); + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + wrmsrl(MSR_KVM_PV_EOI_EN, 0); + kvm_pv_disable_apf(); apf_task_wake_all(); } @@ -424,6 +465,16 @@ void __init kvm_guest_init(void) pv_time_ops.steal_clock = kvm_steal_clock; } + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + /* Should happen once for each apic */ + WARN_ON((*drv)->eoi_write == kvm_guest_apic_eoi_write); + (*drv)->eoi_write = kvm_guest_apic_eoi_write; + } + } + #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; register_cpu_notifier(&kvm_cpu_notifier); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7df1c6d839fb4..61ccbdf3d0ac2 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -409,6 +409,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_NOP_IO_DELAY) | (1 << KVM_FEATURE_CLOCKSOURCE2) | (1 << KVM_FEATURE_ASYNC_PF) | + (1 << KVM_FEATURE_PV_EOI) | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); if (sched_info_on()) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 93c15743f1ee1..ce878788a39fd 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -107,6 +107,16 @@ static inline void apic_clear_vector(int vec, void *bitmap) clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); } +static inline int __apic_test_and_set_vector(int vec, void *bitmap) +{ + return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + +static inline int __apic_test_and_clear_vector(int vec, void *bitmap) +{ + return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + static inline int apic_hw_enabled(struct kvm_lapic *apic) { return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; @@ -210,6 +220,16 @@ static int find_highest_vector(void *bitmap) return fls(word[word_offset << 2]) - 1 + (word_offset << 5); } +static u8 count_vectors(void *bitmap) +{ + u32 *word = bitmap; + int word_offset; + u8 count = 0; + for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset) + count += hweight32(word[word_offset << 2]); + return count; +} + static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) { apic->irr_pending = true; @@ -242,6 +262,27 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) apic->irr_pending = true; } +static inline void apic_set_isr(int vec, struct kvm_lapic *apic) +{ + if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) + ++apic->isr_count; + BUG_ON(apic->isr_count > MAX_APIC_VECTOR); + /* + * ISR (in service register) bit is set when injecting an interrupt. + * The highest vector is injected. Thus the latest bit set matches + * the highest bit in ISR. + */ + apic->highest_isr_cache = vec; +} + +static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) +{ + if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) + --apic->isr_count; + BUG_ON(apic->isr_count < 0); + apic->highest_isr_cache = -1; +} + int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -270,9 +311,61 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) irq->level, irq->trig_mode); } +static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) +{ + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, + sizeof(val)); +} + +static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) +{ + + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, + sizeof(*val)); +} + +static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; +} + +static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) +{ + u8 val; + if (pv_eoi_get_user(vcpu, &val) < 0) + apic_debug("Can't read EOI MSR value: 0x%llx\n", + (unsigned long long)vcpi->arch.pv_eoi.msr_val); + return val & 0x1; +} + +static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) +{ + if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) { + apic_debug("Can't set EOI MSR value: 0x%llx\n", + (unsigned long long)vcpi->arch.pv_eoi.msr_val); + return; + } + __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); +} + +static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) +{ + if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) { + apic_debug("Can't clear EOI MSR value: 0x%llx\n", + (unsigned long long)vcpi->arch.pv_eoi.msr_val); + return; + } + __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); +} + static inline int apic_find_highest_isr(struct kvm_lapic *apic) { int result; + if (!apic->isr_count) + return -1; + if (likely(apic->highest_isr_cache != -1)) + return apic->highest_isr_cache; result = find_highest_vector(apic->regs + APIC_ISR); ASSERT(result == -1 || result >= 16); @@ -482,17 +575,20 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; } -static void apic_set_eoi(struct kvm_lapic *apic) +static int apic_set_eoi(struct kvm_lapic *apic) { int vector = apic_find_highest_isr(apic); + + trace_kvm_eoi(apic, vector); + /* * Not every write EOI will has corresponding ISR, * one example is when Kernel check timer on setup_IO_APIC */ if (vector == -1) - return; + return vector; - apic_clear_vector(vector, apic->regs + APIC_ISR); + apic_clear_isr(vector, apic); apic_update_ppr(apic); if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && @@ -505,6 +601,7 @@ static void apic_set_eoi(struct kvm_lapic *apic) kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); } kvm_make_request(KVM_REQ_EVENT, apic->vcpu); + return vector; } static void apic_send_ipi(struct kvm_lapic *apic) @@ -1081,10 +1178,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } apic->irr_pending = false; + apic->isr_count = 0; + apic->highest_isr_cache = -1; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); if (kvm_vcpu_is_bsp(vcpu)) vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); vcpu->arch.apic_arb_prio = 0; @@ -1248,7 +1348,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) if (vector == -1) return -1; - apic_set_vector(vector, apic->regs + APIC_ISR); + apic_set_isr(vector, apic); apic_update_ppr(apic); apic_clear_irr(vector, apic); return vector; @@ -1267,6 +1367,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) update_divide_count(apic); start_apic_timer(apic); apic->irr_pending = true; + apic->isr_count = count_vectors(apic->regs + APIC_ISR); + apic->highest_isr_cache = -1; kvm_make_request(KVM_REQ_EVENT, vcpu); } @@ -1283,11 +1385,51 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } +/* + * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt + * + * Detect whether guest triggered PV EOI since the + * last entry. If yes, set EOI on guests's behalf. + * Clear PV EOI in guest memory in any case. + */ +static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu, + struct kvm_lapic *apic) +{ + bool pending; + int vector; + /* + * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host + * and KVM_PV_EOI_ENABLED in guest memory as follows: + * + * KVM_APIC_PV_EOI_PENDING is unset: + * -> host disabled PV EOI. + * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set: + * -> host enabled PV EOI, guest did not execute EOI yet. + * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset: + * -> host enabled PV EOI, guest executed EOI. + */ + BUG_ON(!pv_eoi_enabled(vcpu)); + pending = pv_eoi_get_pending(vcpu); + /* + * Clear pending bit in any case: it will be set again on vmentry. + * While this might not be ideal from performance point of view, + * this makes sure pv eoi is only enabled when we know it's safe. + */ + pv_eoi_clr_pending(vcpu); + if (pending) + return; + vector = apic_set_eoi(apic); + trace_kvm_pv_eoi(apic, vector); +} + void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) { u32 data; void *vapic; + if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention)) + apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic); + if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; @@ -1298,17 +1440,44 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) apic_set_tpr(vcpu->arch.apic, data & 0xff); } +/* + * apic_sync_pv_eoi_to_guest - called before vmentry + * + * Detect whether it's safe to enable PV EOI and + * if yes do so. + */ +static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu, + struct kvm_lapic *apic) +{ + if (!pv_eoi_enabled(vcpu) || + /* IRR set or many bits in ISR: could be nested. */ + apic->irr_pending || + /* Cache not set: could be safe but we don't bother. */ + apic->highest_isr_cache == -1 || + /* Need EOI to update ioapic. */ + kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) { + /* + * PV EOI was disabled by apic_sync_pv_eoi_from_guest + * so we need not do anything here. + */ + return; + } + + pv_eoi_set_pending(apic->vcpu); +} + void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) { u32 data, tpr; int max_irr, max_isr; - struct kvm_lapic *apic; + struct kvm_lapic *apic = vcpu->arch.apic; void *vapic; + apic_sync_pv_eoi_to_guest(vcpu, apic); + if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - apic = vcpu->arch.apic; tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; max_irr = apic_find_highest_irr(apic); if (max_irr < 0) @@ -1394,3 +1563,16 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) return 0; } + +int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) +{ + u64 addr = data & ~KVM_MSR_ENABLED; + if (!IS_ALIGNED(addr, 4)) + return 1; + + vcpu->arch.pv_eoi.msr_val = data; + if (!pv_eoi_enabled(vcpu)) + return 0; + return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, + addr); +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 6f4ce2575d095..4af5405ae1e2f 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -13,6 +13,15 @@ struct kvm_lapic { u32 divide_count; struct kvm_vcpu *vcpu; bool irr_pending; + /* Number of bits set in ISR. */ + s16 isr_count; + /* The highest vector set in ISR; if -1 - invalid, must scan ISR. */ + int highest_isr_cache; + /** + * APIC register page. The layout matches the register layout seen by + * the guest 1:1, because it is accessed by the vmx microcode. + * Note: Only one register, the TPR, is used by the microcode. + */ void *regs; gpa_t vapic_addr; struct page *vapic_page; @@ -60,4 +69,6 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) { return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; } + +int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index be3cea4407ffa..3b53d9e08bfc3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -90,7 +90,7 @@ module_param(dbg, bool, 0644); #define PTE_PREFETCH_NUM 8 -#define PT_FIRST_AVAIL_BITS_SHIFT 9 +#define PT_FIRST_AVAIL_BITS_SHIFT 10 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 #define PT64_LEVEL_BITS 9 @@ -652,8 +652,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) mmu_page_header_cache); } -static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, - size_t size) +static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) { void *p; @@ -664,8 +663,7 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) { - return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache, - sizeof(struct pte_list_desc)); + return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); } static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) @@ -1238,11 +1236,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { u64 *sptep; - struct rmap_iterator iter; + struct rmap_iterator uninitialized_var(iter); int young = 0; /* - * Emulate the accessed bit for EPT, by checking if this page has + * In case of absence of EPT Access and Dirty Bits supports, + * emulate the accessed bit for EPT, by checking if this page has * an EPT mapping, and clearing it if it does. On the next access, * a new EPT mapping will be established. * This has some overhead, but not as much as the cost of swapping @@ -1253,11 +1252,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, for (sptep = rmap_get_first(*rmapp, &iter); sptep; sptep = rmap_get_next(&iter)) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); + BUG_ON(!is_shadow_present_pte(*sptep)); - if (*sptep & PT_ACCESSED_MASK) { + if (*sptep & shadow_accessed_mask) { young = 1; - clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep); + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)sptep); } } @@ -1281,9 +1281,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, for (sptep = rmap_get_first(*rmapp, &iter); sptep; sptep = rmap_get_next(&iter)) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); + BUG_ON(!is_shadow_present_pte(*sptep)); - if (*sptep & PT_ACCESSED_MASK) { + if (*sptep & shadow_accessed_mask) { young = 1; break; } @@ -1401,12 +1401,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte, int direct) { struct kvm_mmu_page *sp; - sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, - sizeof *sp); - sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); + sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); + sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); if (!direct) - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, - PAGE_SIZE); + sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM); @@ -3942,7 +3940,6 @@ static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) { struct kvm *kvm; - struct kvm *kvm_freed = NULL; int nr_to_scan = sc->nr_to_scan; if (nr_to_scan == 0) @@ -3954,22 +3951,30 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) int idx; LIST_HEAD(invalid_list); + /* + * n_used_mmu_pages is accessed without holding kvm->mmu_lock + * here. We may skip a VM instance errorneosly, but we do not + * want to shrink a VM that only started to populate its MMU + * anyway. + */ + if (kvm->arch.n_used_mmu_pages > 0) { + if (!nr_to_scan--) + break; + continue; + } + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - if (!kvm_freed && nr_to_scan > 0 && - kvm->arch.n_used_mmu_pages > 0) { - kvm_mmu_remove_some_alloc_mmu_pages(kvm, - &invalid_list); - kvm_freed = kvm; - } - nr_to_scan--; + kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list); kvm_mmu_commit_zap_page(kvm, &invalid_list); + spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); + + list_move_tail(&kvm->vm_list, &vm_list); + break; } - if (kvm_freed) - list_move_tail(&kvm_freed->vm_list, &vm_list); raw_spin_unlock(&kvm_lock); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f75af406b268e..7a418783259d6 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3185,8 +3185,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) break; case MSR_IA32_DEBUGCTLMSR: if (!boot_cpu_has(X86_FEATURE_LBRV)) { - pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", - __func__, data); + vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", + __func__, data); break; } if (data & DEBUGCTL_RESERVED_BITS) @@ -3205,7 +3205,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) case MSR_VM_CR: return svm_set_vm_cr(vcpu, data); case MSR_VM_IGNNE: - pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); + vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; default: return kvm_set_msr_common(vcpu, ecx, data); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 911d2641f14c5..851914e207fc5 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -517,6 +517,40 @@ TRACE_EVENT(kvm_apic_accept_irq, __entry->coalesced ? " (coalesced)" : "") ); +TRACE_EVENT(kvm_eoi, + TP_PROTO(struct kvm_lapic *apic, int vector), + TP_ARGS(apic, vector), + + TP_STRUCT__entry( + __field( __u32, apicid ) + __field( int, vector ) + ), + + TP_fast_assign( + __entry->apicid = apic->vcpu->vcpu_id; + __entry->vector = vector; + ), + + TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) +); + +TRACE_EVENT(kvm_pv_eoi, + TP_PROTO(struct kvm_lapic *apic, int vector), + TP_ARGS(apic, vector), + + TP_STRUCT__entry( + __field( __u32, apicid ) + __field( int, vector ) + ), + + TP_fast_assign( + __entry->apicid = apic->vcpu->vcpu_id; + __entry->vector = vector; + ), + + TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) +); + /* * Tracepoint for nested VMRUN */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 32eb588662920..eeeb4a25aed67 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -71,6 +71,9 @@ static bool __read_mostly enable_unrestricted_guest = 1; module_param_named(unrestricted_guest, enable_unrestricted_guest, bool, S_IRUGO); +static bool __read_mostly enable_ept_ad_bits = 1; +module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); + static bool __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); @@ -615,6 +618,10 @@ static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); +static void vmx_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); +static void vmx_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -789,6 +796,11 @@ static inline bool cpu_has_vmx_ept_4levels(void) return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; } +static inline bool cpu_has_vmx_ept_ad_bits(void) +{ + return vmx_capability.ept & VMX_EPT_AD_BIT; +} + static inline bool cpu_has_vmx_invept_individual_addr(void) { return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; @@ -2645,8 +2657,12 @@ static __init int hardware_setup(void) !cpu_has_vmx_ept_4levels()) { enable_ept = 0; enable_unrestricted_guest = 0; + enable_ept_ad_bits = 0; } + if (!cpu_has_vmx_ept_ad_bits()) + enable_ept_ad_bits = 0; + if (!cpu_has_vmx_unrestricted_guest()) enable_unrestricted_guest = 0; @@ -2770,6 +2786,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) { unsigned long flags; struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_segment var; if (enable_unrestricted_guest) return; @@ -2813,20 +2830,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu) if (emulate_invalid_guest_state) goto continue_rmode; - vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); - vmcs_write32(GUEST_SS_LIMIT, 0xffff); - vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); + vmx_get_segment(vcpu, &var, VCPU_SREG_SS); + vmx_set_segment(vcpu, &var, VCPU_SREG_SS); + + vmx_get_segment(vcpu, &var, VCPU_SREG_CS); + vmx_set_segment(vcpu, &var, VCPU_SREG_CS); - vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); - vmcs_write32(GUEST_CS_LIMIT, 0xffff); - if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) - vmcs_writel(GUEST_CS_BASE, 0xf0000); - vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); + vmx_get_segment(vcpu, &var, VCPU_SREG_ES); + vmx_set_segment(vcpu, &var, VCPU_SREG_ES); - fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); - fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); - fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); - fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); + vmx_get_segment(vcpu, &var, VCPU_SREG_DS); + vmx_set_segment(vcpu, &var, VCPU_SREG_DS); + + vmx_get_segment(vcpu, &var, VCPU_SREG_GS); + vmx_set_segment(vcpu, &var, VCPU_SREG_GS); + + vmx_get_segment(vcpu, &var, VCPU_SREG_FS); + vmx_set_segment(vcpu, &var, VCPU_SREG_FS); continue_rmode: kvm_mmu_reset_context(vcpu); @@ -3027,6 +3047,8 @@ static u64 construct_eptp(unsigned long root_hpa) /* TODO write the value reading from MSR */ eptp = VMX_EPT_DEFAULT_MT | VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; + if (enable_ept_ad_bits) + eptp |= VMX_EPT_AD_ENABLE_BIT; eptp |= (root_hpa & PAGE_MASK); return eptp; @@ -3229,6 +3251,44 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, vmcs_write32(sf->ar_bytes, ar); __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); + + /* + * Fix segments for real mode guest in hosts that don't have + * "unrestricted_mode" or it was disabled. + * This is done to allow migration of the guests from hosts with + * unrestricted guest like Westmere to older host that don't have + * unrestricted guest like Nehelem. + */ + if (!enable_unrestricted_guest && vmx->rmode.vm86_active) { + switch (seg) { + case VCPU_SREG_CS: + vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); + vmcs_write32(GUEST_CS_LIMIT, 0xffff); + if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) + vmcs_writel(GUEST_CS_BASE, 0xf0000); + vmcs_write16(GUEST_CS_SELECTOR, + vmcs_readl(GUEST_CS_BASE) >> 4); + break; + case VCPU_SREG_ES: + fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); + break; + case VCPU_SREG_DS: + fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); + break; + case VCPU_SREG_GS: + fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); + break; + case VCPU_SREG_FS: + fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); + break; + case VCPU_SREG_SS: + vmcs_write16(GUEST_SS_SELECTOR, + vmcs_readl(GUEST_SS_BASE) >> 4); + vmcs_write32(GUEST_SS_LIMIT, 0xffff); + vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); + break; + } + } } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -4489,7 +4549,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) break; } vcpu->run->exit_reason = 0; - pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", + vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", (int)(exit_qualification >> 4) & 3, cr); return 0; } @@ -7275,8 +7335,10 @@ static int __init vmx_init(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); if (enable_ept) { - kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, - VMX_EPT_EXECUTABLE_MASK); + kvm_mmu_set_mask_ptes(0ull, + (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, + (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, + 0ull, VMX_EPT_EXECUTABLE_MASK); ept_set_mmio_spte_mask(); kvm_enable_tdp(); } else diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index be6d54929fa7d..03ce38611657f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -795,6 +795,7 @@ static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, + MSR_KVM_PV_EOI_EN, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -1437,8 +1438,8 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; } default: - pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " - "data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); return 1; } return 0; @@ -1470,8 +1471,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); default: - pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " - "data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); return 1; } @@ -1551,15 +1552,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) data &= ~(u64)0x100; /* ignore ignne emulation enable */ data &= ~(u64)0x8; /* ignore TLB cache disable */ if (data != 0) { - pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", - data); + vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", + data); return 1; } break; case MSR_FAM10H_MMIO_CONF_BASE: if (data != 0) { - pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " - "0x%llx\n", data); + vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " + "0x%llx\n", data); return 1; } break; @@ -1574,8 +1575,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) thus reserved and should throw a #GP */ return 1; } - pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", - __func__, data); + vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", + __func__, data); break; case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_WRITE: @@ -1653,6 +1654,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); break; + case MSR_KVM_PV_EOI_EN: + if (kvm_lapic_enable_pv_eoi(vcpu, data)) + return 1; + break; case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: @@ -1671,8 +1676,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_K7_EVNTSEL2: case MSR_K7_EVNTSEL3: if (data != 0) - pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); break; /* at least RHEL 4 unconditionally writes to the perfctr registers, * so we ignore writes to make it happy. @@ -1681,8 +1686,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_K7_PERFCTR1: case MSR_K7_PERFCTR2: case MSR_K7_PERFCTR3: - pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); break; case MSR_P6_PERFCTR0: case MSR_P6_PERFCTR1: @@ -1693,8 +1698,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) return kvm_pmu_set_msr(vcpu, msr, data); if (pr || data != 0) - pr_unimpl(vcpu, "disabled perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); break; case MSR_K7_CLK_CTL: /* @@ -1720,7 +1725,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. */ - pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); + vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); break; case MSR_AMD64_OSVW_ID_LENGTH: if (!guest_cpuid_has_osvw(vcpu)) @@ -1738,12 +1743,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (kvm_pmu_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr, data); if (!ignore_msrs) { - pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", - msr, data); + vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", + msr, data); return 1; } else { - pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", - msr, data); + vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", + msr, data); break; } } @@ -1846,7 +1851,7 @@ static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data = kvm->arch.hv_hypercall; break; default: - pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; } @@ -1877,7 +1882,7 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data = vcpu->arch.hv_vapic; break; default: - pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; } *pdata = data; @@ -2030,10 +2035,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) if (kvm_pmu_msr(vcpu, msr)) return kvm_pmu_get_msr(vcpu, msr, pdata); if (!ignore_msrs) { - pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); return 1; } else { - pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); data = 0; } break; @@ -3144,6 +3149,16 @@ out: return r; } +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event) +{ + if (!irqchip_in_kernel(kvm)) + return -ENXIO; + + irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event->irq, irq_event->level); + return 0; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -3250,29 +3265,6 @@ long kvm_arch_vm_ioctl(struct file *filp, create_pit_unlock: mutex_unlock(&kvm->slots_lock); break; - case KVM_IRQ_LINE_STATUS: - case KVM_IRQ_LINE: { - struct kvm_irq_level irq_event; - - r = -EFAULT; - if (copy_from_user(&irq_event, argp, sizeof irq_event)) - goto out; - r = -ENXIO; - if (irqchip_in_kernel(kvm)) { - __s32 status; - status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, - irq_event.irq, irq_event.level); - if (ioctl == KVM_IRQ_LINE_STATUS) { - r = -EFAULT; - irq_event.status = status; - if (copy_to_user(argp, &irq_event, - sizeof irq_event)) - goto out; - } - r = 0; - } - break; - } case KVM_GET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ struct kvm_irqchip *chip; @@ -4116,7 +4108,7 @@ static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) value = kvm_get_cr8(vcpu); break; default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + kvm_err("%s: unexpected cr %u\n", __func__, cr); return 0; } @@ -4145,7 +4137,7 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) res = kvm_set_cr8(vcpu, val); break; default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + kvm_err("%s: unexpected cr %u\n", __func__, cr); res = -1; } @@ -5296,8 +5288,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = kvm_mmu_reload(vcpu); if (unlikely(r)) { - kvm_x86_ops->cancel_injection(vcpu); - goto out; + goto cancel_injection; } preempt_disable(); @@ -5322,9 +5313,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) smp_wmb(); local_irq_enable(); preempt_enable(); - kvm_x86_ops->cancel_injection(vcpu); r = 1; - goto out; + goto cancel_injection; } srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); @@ -5388,9 +5378,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (unlikely(vcpu->arch.tsc_always_catchup)) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - kvm_lapic_sync_from_vapic(vcpu); + if (vcpu->arch.apic_attention) + kvm_lapic_sync_from_vapic(vcpu); r = kvm_x86_ops->handle_exit(vcpu); + return r; + +cancel_injection: + kvm_x86_ops->cancel_injection(vcpu); + if (unlikely(vcpu->arch.apic_attention)) + kvm_lapic_sync_from_vapic(vcpu); out: return r; } @@ -6304,7 +6301,7 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free, for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { - vfree(free->arch.lpage_info[i]); + kvm_kvfree(free->arch.lpage_info[i]); free->arch.lpage_info[i] = NULL; } } @@ -6323,7 +6320,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) slot->base_gfn, level) + 1; slot->arch.lpage_info[i] = - vzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); + kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); if (!slot->arch.lpage_info[i]) goto out_free; @@ -6350,7 +6347,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) out_free: for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - vfree(slot->arch.lpage_info[i]); + kvm_kvfree(slot->arch.lpage_info[i]); slot->arch.lpage_info[i] = NULL; } return -ENOMEM; diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c index 30f29a0020a12..3fcc000efc535 100644 --- a/drivers/s390/char/sclp.c +++ b/drivers/s390/char/sclp.c @@ -654,16 +654,6 @@ sclp_remove_processed(struct sccb_header *sccb) EXPORT_SYMBOL(sclp_remove_processed); -struct init_sccb { - struct sccb_header header; - u16 _reserved; - u16 mask_length; - sccb_mask_t receive_mask; - sccb_mask_t send_mask; - sccb_mask_t sclp_receive_mask; - sccb_mask_t sclp_send_mask; -} __attribute__((packed)); - /* Prepare init mask request. Called while sclp_lock is locked. */ static inline void __sclp_make_init_req(u32 receive_mask, u32 send_mask) diff --git a/drivers/s390/char/sclp.h b/drivers/s390/char/sclp.h index 49a1bb52bc87a..d7e97ae9ef6de 100644 --- a/drivers/s390/char/sclp.h +++ b/drivers/s390/char/sclp.h @@ -88,6 +88,16 @@ struct sccb_header { u16 response_code; } __attribute__((packed)); +struct init_sccb { + struct sccb_header header; + u16 _reserved; + u16 mask_length; + sccb_mask_t receive_mask; + sccb_mask_t send_mask; + sccb_mask_t sclp_receive_mask; + sccb_mask_t sclp_send_mask; +} __attribute__((packed)); + extern u64 sclp_facilities; #define SCLP_HAS_CHP_INFO (sclp_facilities & 0x8000000000000000ULL) #define SCLP_HAS_CHP_RECONFIG (sclp_facilities & 0x2000000000000000ULL) diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c index 766cb7b19b403..71ea923c322d4 100644 --- a/drivers/s390/char/sclp_cmd.c +++ b/drivers/s390/char/sclp_cmd.c @@ -48,6 +48,7 @@ struct read_info_sccb { u8 _reserved5[4096 - 112]; /* 112-4095 */ } __attribute__((packed, aligned(PAGE_SIZE))); +static struct init_sccb __initdata early_event_mask_sccb __aligned(PAGE_SIZE); static struct read_info_sccb __initdata early_read_info_sccb; static int __initdata early_read_info_sccb_valid; @@ -104,6 +105,19 @@ static void __init sclp_read_info_early(void) } } +static void __init sclp_event_mask_early(void) +{ + struct init_sccb *sccb = &early_event_mask_sccb; + int rc; + + do { + memset(sccb, 0, sizeof(*sccb)); + sccb->header.length = sizeof(*sccb); + sccb->mask_length = sizeof(sccb_mask_t); + rc = sclp_cmd_sync_early(SCLP_CMDW_WRITE_EVENT_MASK, sccb); + } while (rc == -EBUSY); +} + void __init sclp_facilities_detect(void) { struct read_info_sccb *sccb; @@ -119,6 +133,30 @@ void __init sclp_facilities_detect(void) rnmax = sccb->rnmax ? sccb->rnmax : sccb->rnmax2; rzm = sccb->rnsize ? sccb->rnsize : sccb->rnsize2; rzm <<= 20; + + sclp_event_mask_early(); +} + +bool __init sclp_has_linemode(void) +{ + struct init_sccb *sccb = &early_event_mask_sccb; + + if (sccb->header.response_code != 0x20) + return 0; + if (sccb->sclp_send_mask & (EVTYP_MSG_MASK | EVTYP_PMSGCMD_MASK)) + return 1; + return 0; +} + +bool __init sclp_has_vt220(void) +{ + struct init_sccb *sccb = &early_event_mask_sccb; + + if (sccb->header.response_code != 0x20) + return 0; + if (sccb->sclp_send_mask & EVTYP_VT220MSG_MASK) + return 1; + return 0; } unsigned long long sclp_get_rnmax(void) diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c index d74e9ae6dfb3e..d3bdae49512f7 100644 --- a/drivers/s390/kvm/kvm_virtio.c +++ b/drivers/s390/kvm/kvm_virtio.c @@ -25,6 +25,7 @@ #include <asm/io.h> #include <asm/kvm_para.h> #include <asm/kvm_virtio.h> +#include <asm/sclp.h> #include <asm/setup.h> #include <asm/irq.h> @@ -468,7 +469,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) static int __init s390_virtio_console_init(void) { - if (!MACHINE_IS_KVM) + if (sclp_has_vt220() || sclp_has_linemode()) return -ENODEV; return virtio_cons_early_init(early_put_chars); } diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 09f2b3aa2da7e..f9784476c9267 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -111,6 +111,7 @@ struct kvm_irq_level { * ACPI gsi notion of irq. * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47.. * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23.. + * For ARM: IRQ: irq = (2*vcpu_index). FIQ: irq = (2*vcpu_indx + 1). */ union { __u32 irq; @@ -617,6 +618,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_SIGNAL_MSI 77 #define KVM_CAP_PPC_GET_SMMU_INFO 78 #define KVM_CAP_S390_COW 79 +#define KVM_CAP_PPC_ALLOC_HTAB 80 #ifdef KVM_CAP_IRQ_ROUTING @@ -828,6 +830,8 @@ struct kvm_s390_ucas_mapping { #define KVM_SIGNAL_MSI _IOW(KVMIO, 0xa5, struct kvm_msi) /* Available with KVM_CAP_PPC_GET_SMMU_INFO */ #define KVM_PPC_GET_SMMU_INFO _IOR(KVMIO, 0xa6, struct kvm_ppc_smmu_info) +/* Available with KVM_CAP_PPC_ALLOC_HTAB */ +#define KVM_PPC_ALLOCATE_HTAB _IOWR(KVMIO, 0xa7, __u32) /* * ioctls for vcpu fds diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c4464356b35b0..96aa7fb26f87b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -306,7 +306,7 @@ struct kvm { struct hlist_head irq_ack_notifier_list; #endif -#ifdef KVM_ARCH_WANT_MMU_NOTIFIER +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) struct mmu_notifier mmu_notifier; unsigned long mmu_notifier_seq; long mmu_notifier_count; @@ -314,13 +314,19 @@ struct kvm { long tlbs_dirty; }; -/* The guest did something we don't support. */ -#define pr_unimpl(vcpu, fmt, ...) \ - pr_err_ratelimited("kvm: %i: cpu%i " fmt, \ - current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__) +#define kvm_err(fmt, ...) \ + pr_err("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__) +#define kvm_info(fmt, ...) \ + pr_info("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__) +#define kvm_debug(fmt, ...) \ + pr_debug("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__) +#define kvm_pr_unimpl(fmt, ...) \ + pr_err_ratelimited("kvm [%i]: " fmt, \ + task_tgid_nr(current), ## __VA_ARGS__) -#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) -#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) +/* The guest did something we don't support. */ +#define vcpu_unimpl(vcpu, fmt, ...) \ + kvm_pr_unimpl("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__) static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) { @@ -488,6 +494,7 @@ int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, int user_alloc); +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level); long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); @@ -535,6 +542,9 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); void kvm_free_physmem(struct kvm *kvm); +void *kvm_kvzalloc(unsigned long size); +void kvm_kvfree(const void *addr); + #ifndef __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { @@ -771,7 +781,7 @@ struct kvm_stats_debugfs_item { extern struct kvm_stats_debugfs_item debugfs_entries[]; extern struct dentry *kvm_debugfs_dir; -#ifdef KVM_ARCH_WANT_MMU_NOTIFIER +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq) { if (unlikely(vcpu->kvm->mmu_notifier_count)) @@ -793,7 +803,7 @@ static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_se } #endif -#ifdef CONFIG_HAVE_KVM_IRQCHIP +#ifdef KVM_CAP_IRQ_ROUTING #define KVM_MAX_IRQ_ROUTES 1024 diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 46e3cd8e197a5..7ef9e759f4990 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -13,7 +13,8 @@ ERSN(DEBUG), ERSN(HLT), ERSN(MMIO), ERSN(IRQ_WINDOW_OPEN), \ ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \ ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\ - ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI) + ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL), \ + ERSN(S390_UCONTROL) TRACE_EVENT(kvm_userspace_exit, TP_PROTO(__u32 reason, int errno), @@ -36,7 +37,7 @@ TRACE_EVENT(kvm_userspace_exit, __entry->errno < 0 ? -__entry->errno : __entry->reason) ); -#if defined(__KVM_HAVE_IOAPIC) +#if defined(__KVM_HAVE_IRQ_LINE) TRACE_EVENT(kvm_set_irq, TP_PROTO(unsigned int gsi, int level, int irq_source_id), TP_ARGS(gsi, level, irq_source_id), @@ -56,7 +57,9 @@ TRACE_EVENT(kvm_set_irq, TP_printk("gsi %u level %d source %d", __entry->gsi, __entry->level, __entry->irq_source_id) ); +#endif +#if defined(__KVM_HAVE_IOAPIC) #define kvm_deliver_mode \ {0x0, "Fixed"}, \ {0x1, "LowPrio"}, \ diff --git a/mm/memory.c b/mm/memory.c index 2466d1250231f..2cdadb935ed81 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -379,12 +379,14 @@ void pgd_clear_bad(pgd_t *pgd) pgd_ERROR(*pgd); pgd_clear(pgd); } +EXPORT_SYMBOL_GPL(pgd_clear_bad); void pud_clear_bad(pud_t *pud) { pud_ERROR(*pud); pud_clear(pud); } +EXPORT_SYMBOL_GPL(pud_clear_bad); void pmd_clear_bad(pmd_t *pmd) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7e140683ff14d..1d338773b0c39 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -516,16 +516,32 @@ out_err_nodisable: return ERR_PTR(r); } +/* + * Avoid using vmalloc for a small buffer. + * Should not be used when the size is statically known. + */ +void *kvm_kvzalloc(unsigned long size) +{ + if (size > PAGE_SIZE) + return vzalloc(size); + else + return kzalloc(size, GFP_KERNEL); +} + +void kvm_kvfree(const void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} + static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) { if (!memslot->dirty_bitmap) return; - if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE) - vfree(memslot->dirty_bitmap); - else - kfree(memslot->dirty_bitmap); - + kvm_kvfree(memslot->dirty_bitmap); memslot->dirty_bitmap = NULL; } @@ -617,11 +633,7 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) #ifndef CONFIG_S390 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); - if (dirty_bytes > PAGE_SIZE) - memslot->dirty_bitmap = vzalloc(dirty_bytes); - else - memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL); - + memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes); if (!memslot->dirty_bitmap) return -ENOMEM; @@ -2081,6 +2093,25 @@ static long kvm_vm_ioctl(struct file *filp, break; } #endif +#ifdef __KVM_HAVE_IRQ_LINE + case KVM_IRQ_LINE_STATUS: + case KVM_IRQ_LINE: { + struct kvm_irq_level irq_event; + + r = -EFAULT; + if (copy_from_user(&irq_event, argp, sizeof irq_event)) + goto out; + + r = kvm_vm_ioctl_irq_line(kvm, &irq_event); + + if (ioctl == KVM_IRQ_LINE_STATUS) { + if (copy_to_user(argp, &irq_event, sizeof irq_event)) + r = -EFAULT; + } + + break; + } +#endif default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); if (r == -ENOTTY) @@ -2213,7 +2244,7 @@ static long kvm_dev_ioctl_check_extension_generic(long arg) case KVM_CAP_SIGNAL_MSI: #endif return 1; -#ifdef CONFIG_HAVE_KVM_IRQCHIP +#ifdef KVM_CAP_IRQ_ROUTING case KVM_CAP_IRQ_ROUTING: return KVM_MAX_IRQ_ROUTES; #endif |