aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorFan Du <fan.du@intel.com>2019-03-28 15:13:27 +0800
committerFengguang Wu <fengguang.wu@intel.com>2019-03-28 16:27:02 +0800
commitb90cc62670b1956431b4bf1e0b1ff2ebbffcc406 (patch)
treec88b5b4a44e3dcaa80f9a46ed6a999f8c1b4c5fe
parent9e17ab4ef02e3d7c84436d7f458409558d42092a (diff)
downloadlinux-ept-idle-v2-4.20.tar.gz
x86, ept: Track dirty page w/o page faultept-idle-v2-4.20
EPT Dirty Bit Use Scenario: Live Migration ------------------------------------------ Case 1: Enable VM dirty page track w/ PML ------ a. Write protect guest memory in VM memory region creation b. Update VM dirty_bitmap in page fault path for huge page c. Qemu queries VM dirty_bitmap by KVM_GET_DIRTY_LOG ioctl c1. Kick VM, sync dirty_bitmap with PML buffer in VM_EXIT c2. D bit will be cleared for 4K page c3. For huge page, write protect spte w/o PML ------- a. Write protect guest memory in VM memory region creation b. VM memslot dirty_bitmap will be updated in page fault path c. Qemu queries VM dirty_bitmap by KVM_GET_DIRTY_LOG ioctl c3. Write protect spte Case 2: Disable VM dirty page track w/ PML ------ a. Set spte Dirty bit, so PML full VM_EXIT is not triggered w/o PML ------- a. Nothing needs to be done. kvm ioctl w/ KVM_SET_USER_MEMORY_REGION kvm_vm_ioctl_set_memory_region -> __kvm_set_memory_region -> kvm_arch_commit_memory_region -> kvm_mmu_slot_apply_flags case 1:-> vmx_slot_enable_log_dirty -> kvm_mmu_slot_leaf_clear_dirty -> __rmap_clear_dirty -> spte_clear_dirty -> kvm_mmu_slot_largepage_remove_write_access -> __rmap_write_protect -> spte_write_protect case 2:-> vmx_slot_disable_log_dirty -> kvm_mmu_slot_set_dirty -> __rmap_set_dirty -> spte_set_dirty kvm ioctl w/ KVM_GET_DIRTY_LOG -> kvm_vm_ioctl_get_dirty_log -> vmx_flush_log_dirty -> kvm_flush_pml_buffers -> kvm_get_dirty_log_protect -> kvm_arch_mmu_enable_log_dirty_pt_masked -> vmx_enable_log_dirty_pt_masked -> kvm_mmu_clear_dirty_pt_masked -> __rmap_clear_dirty -> spte_clear_dirty Conclusion: The only user of the EPT Dirty bit is Qemu live migration; there is no other user inside kvm-kernel itself. This inspires us to use the EPT Dirty bit to track page hotness. The admin should be able to coordinate between the VM live migration use case and the pmem2dram case. Reuse EPT Dirty Bit for pmem2dram --------------------------------- Goal: Provide an interface for a user space daemon to migrate written pages. Motivation: a. 
AEP write latency is limited, while read latency can satisfy the workload. Tracking written pages is necessary. b. EPT A bit and D bit b1. By the SDM, the EPT A bit is a superset of the EPT D bit; the A bit does not necessarily imply a written page. b2. In the case of a cached write, writes are performed after loading (reading) the target address into the cache, then performing the write. Statistics of the A bit and D bit could possibly be the same. Are any cases of write directed down to the iMC? b3. In the case of an NT write, the cache is bypassed. The above cache case does not stand. Need a conclusive result on whether the A bit is enough to track written pages. Identify potential corner cases where the D bit is useful. This patch builds EPT D bit tracking upon the ept_idle module, so a user space daemon can easily leverage existing code to benchmark. In practice a user space daemon is expected to account the D bit as accessed as well. Signed-off-by: Fan Du <fan.du@intel.com> Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
-rw-r--r--arch/x86/kvm/ept_idle.c12
-rw-r--r--arch/x86/kvm/ept_idle.h3
2 files changed, 15 insertions, 0 deletions
diff --git a/arch/x86/kvm/ept_idle.c b/arch/x86/kvm/ept_idle.c
index 9ead3750ccc45..eca3e288c1636 100644
--- a/arch/x86/kvm/ept_idle.c
+++ b/arch/x86/kvm/ept_idle.c
@@ -180,6 +180,11 @@ static int ept_pte_range(struct ept_idle_ctrl *eic,
page_type = PTE_IDLE;
else {
page_type = PTE_ACCESSED;
+ if (eic->flags & SCAN_DIRTY_PAGE) {
+ if (test_and_clear_bit(_PAGE_BIT_EPT_DIRTY,
+ (unsigned long *) &pte->pte))
+ page_type = PTE_DIRTY;
+ }
}
err = eic_add_page(eic, addr, addr + PAGE_SIZE, page_type);
@@ -220,6 +225,12 @@ static int ept_pmd_range(struct ept_idle_ctrl *eic,
page_type = pte_page_type;
} else if (pmd_large(*pmd)) {
page_type = PMD_ACCESSED;
+ if (eic->flags & SCAN_DIRTY_PAGE) {
+ if (test_and_clear_bit(_PAGE_BIT_EPT_DIRTY,
+ (unsigned long *) pmd))
+ page_type = PMD_DIRTY;
+ }
+
} else
page_type = pte_page_type;
@@ -563,6 +574,7 @@ static ssize_t ept_idle_read(struct file *file, char *buf,
eic->buf = buf;
eic->buf_size = count;
eic->mm = mm;
+ eic->flags = file->f_flags;
eic->kvm = mm_kvm(mm);
if (!eic->kvm) {
ret = -EINVAL;
diff --git a/arch/x86/kvm/ept_idle.h b/arch/x86/kvm/ept_idle.h
index 0a6c493ee48fd..cf16e21c5b5f7 100644
--- a/arch/x86/kvm/ept_idle.h
+++ b/arch/x86/kvm/ept_idle.h
@@ -3,6 +3,7 @@
#define SCAN_HUGE_PAGE O_NONBLOCK /* only huge page */
#define SCAN_SKIM_IDLE O_NOFOLLOW /* stop on PMD_IDLE_PTES */
+#define SCAN_DIRTY_PAGE O_NOATIME /* report pte/pmd dirty bit */
enum ProcIdlePageType {
PTE_ACCESSED, /* 4k page */
@@ -31,7 +32,9 @@ enum ProcIdlePageType {
#define PIP_CMD_SET_HVA PIP_COMPOSE(PIP_CMD, 0)
#define _PAGE_BIT_EPT_ACCESSED 8
+#define _PAGE_BIT_EPT_DIRTY 9
#define _PAGE_EPT_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_EPT_ACCESSED)
+#define _PAGE_EPT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_EPT_DIRTY)
#define _PAGE_EPT_PRESENT (_AT(pteval_t, 7))