Merge branch 'msr-bitmaps' of git://git.kernel.org/pub/scm/virt/kvm/kvm into x86/pti

Pull the KVM prerequisites so the IBPB patches apply.
2024-10-10 21:36:41 +00:00 · 2018-02-03 22:30:16 +01:00 · 2018-02-03 22:30:16 +01:00 · a96223f192
parent af189c95a3 904e14fb7c
commit a96223f192
1 changed files with 186 additions and 254 deletions
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@ -112,6 +112,14 @@ static u64 __read_mostly host_xss;
 static bool __read_mostly enable_pml = 1;
 module_param_named(pml, enable_pml, bool, S_IRUGO);
 /*
 * Access-type flags for MSR intercept helpers.  They are tested bitwise
 * (type & MSR_TYPE_R, type & MSR_TYPE_W), so MSR_TYPE_RW selects both.
 */
 #define MSR_TYPE_R	1
 #define MSR_TYPE_W	2
 #define MSR_TYPE_RW	3
 /*
 * Bit flags OR'ed together into the u8 MSR bitmap mode of a vcpu
 * (see vmx_msr_bitmap_mode() and vcpu_vmx.msr_bitmap_mode).
 */
 #define MSR_BITMAP_MODE_X2APIC		1
 #define MSR_BITMAP_MODE_X2APIC_APICV	2
 #define MSR_BITMAP_MODE_LM		4
 #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
 /* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
@ -186,7 +194,6 @@ module_param(ple_window_max, int, S_IRUGO);
 extern const ulong vmx_return;
 #define NR_AUTOLOAD_MSRS 8
 #define VMCS02_POOL_SIZE 1
 struct vmcs {
 	u32 revision_id;
@ -211,6 +218,7 @@ struct loaded_vmcs {
 	int soft_vnmi_blocked;
 	ktime_t entry_time;
 	s64 vnmi_blocked_time;
 	unsigned long *msr_bitmap;
 	struct list_head loaded_vmcss_on_cpu_link;
 };
@ -227,7 +235,7 @@ struct shared_msr_entry {
 * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
 * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
 * More than one of these structures may exist, if L1 runs multiple L2 guests.
- * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
+ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
 * underlying hardware which will be used to run L2.
 * This structure is packed to ensure that its layout is identical across
 * machines (necessary for live migration).
@ -410,13 +418,6 @@ struct __packed vmcs12 {
 */
 #define VMCS12_SIZE 0x1000
 /* Used to remember the last vmcs02 used for some recently used vmcs12s */
 struct vmcs02_list {
 	/* Linkage on the per-vcpu nested.vmcs02_pool list. */
 	struct list_head list;
 	/* vmcs12 pointer this entry is kept for; compared with nested.current_vmptr. */
 	gpa_t vmptr;
 	/* The vmcs02 handed out for that vmcs12. */
 	struct loaded_vmcs vmcs02;
 };
 /*
 * The nested_vmx structure is part of vcpu_vmx, and holds information we need
 * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@ -441,15 +442,15 @@ struct nested_vmx {
 	 */
 	bool sync_shadow_vmcs;
 	/* vmcs02_list cache of VMCSs recently used to run L2 guests */
 	struct list_head vmcs02_pool;
 	int vmcs02_num;
 	bool change_vmcs01_virtual_x2apic_mode;
 	/* L2 must run next, and mustn't decide to exit to L1. */
 	bool nested_run_pending;
 	struct loaded_vmcs vmcs02;
 	/*
-	 * Guest pages referred to in vmcs02 with host-physical pointers, so
+	 * Guest pages referred to in the vmcs02 with host-physical
-	 * we must keep them pinned while L2 runs.
+	 * pointers, so we must keep them pinned while L2 runs.
 	 */
 	struct page *apic_access_page;
 	struct page *virtual_apic_page;
@ -458,8 +459,6 @@ struct nested_vmx {
 	bool pi_pending;
 	u16 posted_intr_nv;
 	unsigned long *msr_bitmap;
 	struct hrtimer preemption_timer;
 	bool preemption_timer_expired;
@ -582,6 +581,7 @@ struct vcpu_vmx {
 	struct kvm_vcpu       vcpu;
 	unsigned long         host_rsp;
 	u8                    fail;
 	u8		      msr_bitmap_mode;
 	u32                   exit_intr_info;
 	u32                   idt_vectoring_info;
 	ulong                 rflags;
@ -933,6 +933,7 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
 static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
 					    u16 error_code);
 static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@ -952,12 +953,6 @@ static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
 enum {
 	VMX_IO_BITMAP_A,
 	VMX_IO_BITMAP_B,
 	VMX_MSR_BITMAP_LEGACY,
 	VMX_MSR_BITMAP_LONGMODE,
 	VMX_MSR_BITMAP_LEGACY_X2APIC_APICV,
 	VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV,
 	VMX_MSR_BITMAP_LEGACY_X2APIC,
 	VMX_MSR_BITMAP_LONGMODE_X2APIC,
 	VMX_VMREAD_BITMAP,
 	VMX_VMWRITE_BITMAP,
 	VMX_BITMAP_NR
@ -967,12 +962,6 @@ static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
 /* Named shorthands for the individual slots of the vmx_bitmap[] array. */
 #define vmx_io_bitmap_a                      (vmx_bitmap[VMX_IO_BITMAP_A])
 #define vmx_io_bitmap_b                      (vmx_bitmap[VMX_IO_BITMAP_B])
 #define vmx_msr_bitmap_legacy                (vmx_bitmap[VMX_MSR_BITMAP_LEGACY])
 #define vmx_msr_bitmap_longmode              (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE])
 #define vmx_msr_bitmap_legacy_x2apic_apicv   (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV])
 #define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV])
 #define vmx_msr_bitmap_legacy_x2apic         (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC])
 #define vmx_msr_bitmap_longmode_x2apic       (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC])
 #define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
 #define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
@ -2570,36 +2559,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
 	vmx->guest_msrs[from] = tmp;
 }
 /*
 * Write the MSR_BITMAP VMCS field with the physical address of the bitmap
 * that fits the vcpu's state: the nested bitmap while in guest mode,
 * otherwise one of the global bitmaps, chosen by whether x2APIC
 * virtualization is enabled, whether APICv is active and whether the
 * vcpu is in long mode.
 */
 static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
 {
 	unsigned long *bitmap;
 	if (is_guest_mode(vcpu)) {
 		bitmap = to_vmx(vcpu)->nested.msr_bitmap;
 	} else if (!cpu_has_secondary_exec_ctrls() ||
 		   !(vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
 		     SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
 		/* x2APIC virtualization is off: use the plain bitmaps. */
 		bitmap = is_long_mode(vcpu) ? vmx_msr_bitmap_longmode :
 					      vmx_msr_bitmap_legacy;
 	} else if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
 		bitmap = is_long_mode(vcpu) ?
 			vmx_msr_bitmap_longmode_x2apic_apicv :
 			vmx_msr_bitmap_legacy_x2apic_apicv;
 	} else {
 		bitmap = is_long_mode(vcpu) ?
 			vmx_msr_bitmap_longmode_x2apic :
 			vmx_msr_bitmap_legacy_x2apic;
 	}
 	vmcs_write64(MSR_BITMAP, __pa(bitmap));
 }
 /*
 * Set up the vmcs to automatically save and restore system
 * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
@ -2640,7 +2599,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
 	vmx->save_nmsrs = save_nmsrs;
 	if (cpu_has_vmx_msr_bitmap())
-		vmx_set_msr_bitmap(&vmx->vcpu);
+		vmx_update_msr_bitmap(&vmx->vcpu);
 }
 /*
@ -3835,11 +3794,6 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
 	return vmcs;
 }
 /*
 * Allocate a VMCS through alloc_vmcs_cpu() for the CPU reported by
 * raw_smp_processor_id(); the result is passed back to the caller as is
 * (callers check it for NULL).
 */
 static struct vmcs *alloc_vmcs(void)
 {
 	return alloc_vmcs_cpu(raw_smp_processor_id());
 }
 static void free_vmcs(struct vmcs *vmcs)
 {
 	free_pages((unsigned long)vmcs, vmcs_config.order);
@ -3855,9 +3809,38 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
 	loaded_vmcs_clear(loaded_vmcs);
 	free_vmcs(loaded_vmcs->vmcs);
 	loaded_vmcs->vmcs = NULL;
 	if (loaded_vmcs->msr_bitmap)
 		free_page((unsigned long)loaded_vmcs->msr_bitmap);
 	WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
 }
 /*
 * Allocate a VMCS through alloc_vmcs_cpu() for the CPU reported by
 * raw_smp_processor_id(); the result is passed back to the caller as is
 * (callers check it for NULL).
 */
 static struct vmcs *alloc_vmcs(void)
 {
 	return alloc_vmcs_cpu(raw_smp_processor_id());
 }
 /*
 * Populate @loaded_vmcs: allocate its VMCS, clear the shadow VMCS pointer,
 * run loaded_vmcs_init() and, when the CPU has MSR bitmaps, allocate one
 * page for the bitmap and fill it with 0xff.
 *
 * Returns 0 on success or -ENOMEM when an allocation fails.  If the bitmap
 * page cannot be allocated, what was set up so far is torn down again with
 * free_loaded_vmcs().
 */
 static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
 {
 	unsigned long *bitmap;
 	loaded_vmcs->vmcs = alloc_vmcs();
 	if (!loaded_vmcs->vmcs)
 		return -ENOMEM;
 	loaded_vmcs->shadow_vmcs = NULL;
 	loaded_vmcs_init(loaded_vmcs);
 	/* Without MSR bitmap support there is nothing more to allocate. */
 	if (!cpu_has_vmx_msr_bitmap())
 		return 0;
 	bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
 	loaded_vmcs->msr_bitmap = bitmap;
 	if (!bitmap) {
 		free_loaded_vmcs(loaded_vmcs);
 		return -ENOMEM;
 	}
 	memset(bitmap, 0xff, PAGE_SIZE);
 	return 0;
 }
 static void free_kvm_area(void)
 {
 	int cpu;
@ -4916,10 +4899,8 @@ static void free_vpid(int vpid)
 	spin_unlock(&vmx_vpid_lock);
 }
-#define MSR_TYPE_R	1
+static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
-#define MSR_TYPE_W	2
+							  u32 msr, int type)
 static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
 						u32 msr, int type)
 {
 	int f = sizeof(unsigned long);
@ -4953,6 +4934,50 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
 	}
 }
 /*
 * Set the intercept bit(s) for @msr in @msr_bitmap for the access types
 * given in @type (MSR_TYPE_R and/or MSR_TYPE_W).  Does nothing when the
 * CPU has no MSR bitmap support or when @msr lies outside the two ranges
 * the bitmap can describe.
 */
 static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
 							 u32 msr, int type)
 {
 	int f = sizeof(unsigned long);
 	unsigned long *read_map, *write_map;
 	if (!cpu_has_vmx_msr_bitmap())
 		return;
 	/*
 	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address); early manuals
 	 * swapped the write-low and read-high offsets.  Only MSRs
 	 * 0x00000000-0x00001fff and 0xc0000000-0xc0001fff are controllable.
 	 */
 	if (msr <= 0x1fff) {
 		/* low range: read bitmap at 0x000, write bitmap at 0x800 */
 		read_map = msr_bitmap + 0x000 / f;
 		write_map = msr_bitmap + 0x800 / f;
 	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
 		/* high range: read bitmap at 0x400, write bitmap at 0xc00 */
 		msr &= 0x1fff;
 		read_map = msr_bitmap + 0x400 / f;
 		write_map = msr_bitmap + 0xc00 / f;
 	} else {
 		return;
 	}
 	if (type & MSR_TYPE_R)
 		__set_bit(msr, read_map);
 	if (type & MSR_TYPE_W)
 		__set_bit(msr, write_map);
 }
 /*
 * Turn interception of @msr on (@value true) or off (@value false) in
 * @msr_bitmap for the access types given in @type.
 */
 static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
 						      u32 msr, int type, bool value)
 {
 	if (!value) {
 		vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
 		return;
 	}
 	vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
 }
 /*
 * If a msr is allowed by L0, we should check whether it is allowed by L1.
 * The corresponding bit will be cleared unless both of L0 and L1 allow it.
@ -4999,28 +5024,68 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
 	}
 }
-static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
+static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
 {
-	if (!longmode_only)
+	u8 mode = 0;
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
+
-						msr, MSR_TYPE_R | MSR_TYPE_W);
+	if (cpu_has_secondary_exec_ctrls() &&
-	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
+	    (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
-						msr, MSR_TYPE_R | MSR_TYPE_W);
+	     SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
 		mode |= MSR_BITMAP_MODE_X2APIC;
 		if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
 			mode |= MSR_BITMAP_MODE_X2APIC_APICV;
 	}
 	if (is_long_mode(vcpu))
 		mode |= MSR_BITMAP_MODE_LM;
 	return mode;
 }
-static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active)
+#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
 static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
 					 u8 mode)
 {
-	if (apicv_active) {
+	int msr;
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv,
+
-				msr, type);
+	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv,
+		unsigned word = msr / BITS_PER_LONG;
-				msr, type);
+		msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
-	} else {
+		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
 		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
 				msr, type);
 		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
 				msr, type);
 	}
 	if (mode & MSR_BITMAP_MODE_X2APIC) {
 		/*
 		 * TPR reads and writes can be virtualized even if virtual interrupt
 		 * delivery is not in use.
 		 */
 		vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
 		if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
 			vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
 			vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
 			vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
 		}
 	}
 }
 static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
 	u8 mode = vmx_msr_bitmap_mode(vcpu);
 	u8 changed = mode ^ vmx->msr_bitmap_mode;
 	if (!changed)
 		return;
 	vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
 				  !(mode & MSR_BITMAP_MODE_LM));
 	if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
 		vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
 	vmx->msr_bitmap_mode = mode;
 }
 static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
@ -5272,7 +5337,7 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
 	}
 	if (cpu_has_vmx_msr_bitmap())
-		vmx_set_msr_bitmap(vcpu);
+		vmx_update_msr_bitmap(vcpu);
 }
 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
@ -5459,7 +5524,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
 		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
 	}
 	if (cpu_has_vmx_msr_bitmap())
-		vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
+		vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
 	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
@ -6742,7 +6807,7 @@ void vmx_enable_tdp(void)
 static __init int hardware_setup(void)
 {
-	int r = -ENOMEM, i, msr;
+	int r = -ENOMEM, i;
 	rdmsrl_safe(MSR_EFER, &host_efer);
@ -6762,9 +6827,6 @@ static __init int hardware_setup(void)
 	memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
 	memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
 	memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
 	if (setup_vmcs_config(&vmcs_config) < 0) {
 		r = -EIO;
 		goto out;
@ -6833,42 +6895,8 @@ static __init int hardware_setup(void)
 		kvm_tsc_scaling_ratio_frac_bits = 48;
 	}
 	vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
 	vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
 	vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
 	memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
 			vmx_msr_bitmap_legacy, PAGE_SIZE);
 	memcpy(vmx_msr_bitmap_longmode_x2apic_apicv,
 			vmx_msr_bitmap_longmode, PAGE_SIZE);
 	memcpy(vmx_msr_bitmap_legacy_x2apic,
 			vmx_msr_bitmap_legacy, PAGE_SIZE);
 	memcpy(vmx_msr_bitmap_longmode_x2apic,
 			vmx_msr_bitmap_longmode, PAGE_SIZE);
 	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 	for (msr = 0x800; msr <= 0x8ff; msr++) {
 		if (msr == 0x839 /* TMCCT */)
 			continue;
 		vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true);
 	}
 	/*
 	 * TPR reads and writes can be virtualized even if virtual interrupt
 	 * delivery is not in use.
 	 */
 	vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true);
 	vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false);
 	/* EOI */
 	vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true);
 	/* SELF-IPI */
 	vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true);
 	if (enable_ept)
 		vmx_enable_tdp();
 	else
@ -6971,94 +6999,6 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
 	return handle_nop(vcpu);
 }
 /*
 * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
 * We could reuse a single VMCS for all the L2 guests, but we also want the
 * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
 * allows keeping them loaded on the processor, and in the future will allow
 * optimizations where prepare_vmcs02 doesn't need to set all the fields on
 * every entry if they never change.
 * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
 * (>=0) with a vmcs02 for each recently loaded vmcs12, most recent first.
 *
 * The following functions allocate and free a vmcs02 in this pool.
 */
 /* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
 static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
 {
 	struct vmcs02_list *item;
 	list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
 		if (item->vmptr == vmx->nested.current_vmptr) {
 			list_move(&item->list, &vmx->nested.vmcs02_pool);
 			return &item->vmcs02;
 		}
 	if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
 		/* Recycle the least recently used VMCS. */
 		item = list_last_entry(&vmx->nested.vmcs02_pool,
 				       struct vmcs02_list, list);
 		item->vmptr = vmx->nested.current_vmptr;
 		list_move(&item->list, &vmx->nested.vmcs02_pool);
 		return &item->vmcs02;
 	}
 	/* Create a new VMCS */
 	item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
 	if (!item)
 		return NULL;
 	item->vmcs02.vmcs = alloc_vmcs();
 	item->vmcs02.shadow_vmcs = NULL;
 	if (!item->vmcs02.vmcs) {
 		kfree(item);
 		return NULL;
 	}
 	loaded_vmcs_init(&item->vmcs02);
 	item->vmptr = vmx->nested.current_vmptr;
 	list_add(&(item->list), &(vmx->nested.vmcs02_pool));
 	vmx->nested.vmcs02_num++;
 	return &item->vmcs02;
 }
 /*
 * Drop the pool entry kept for the vmcs12 at @vmptr, if there is one:
 * free its vmcs02, unlink and free the entry, and decrement the pool count.
 */
 static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
 {
 	struct vmcs02_list *entry;
 	list_for_each_entry(entry, &vmx->nested.vmcs02_pool, list) {
 		if (entry->vmptr != vmptr)
 			continue;
 		free_loaded_vmcs(&entry->vmcs02);
 		list_del(&entry->list);
 		kfree(entry);
 		vmx->nested.vmcs02_num--;
 		return;
 	}
 }
 /*
 * Release every vmcs02 cached for this vcpu apart from the one that
 * vmx->loaded_vmcs points to.  The caller is expected to be running L1,
 * i.e. vmx->loaded_vmcs == &vmx->vmcs01; a WARN fires otherwise.
 */
 static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
 {
 	struct vmcs02_list *entry, *next;
 	WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
 	list_for_each_entry_safe(entry, next, &vmx->nested.vmcs02_pool, list) {
 		/*
 		 * If the WARN above fired, the vmcs02 still in use is left
 		 * alone and leaks; that is preferable to a use-after-free.
 		 */
 		if (vmx->loaded_vmcs != &entry->vmcs02) {
 			free_loaded_vmcs(&entry->vmcs02);
 			list_del(&entry->list);
 			kfree(entry);
 			vmx->nested.vmcs02_num--;
 		}
 	}
 }
 /*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction, as specified
@ -7239,13 +7179,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs *shadow_vmcs;
 	int r;
-	if (cpu_has_vmx_msr_bitmap()) {
+	r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
-		vmx->nested.msr_bitmap =
+	if (r < 0)
-				(unsigned long *)__get_free_page(GFP_KERNEL);
+		goto out_vmcs02;
 		if (!vmx->nested.msr_bitmap)
 			goto out_msr_bitmap;
 	}
 	vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
 	if (!vmx->nested.cached_vmcs12)
@ -7262,9 +7200,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 		vmx->vmcs01.shadow_vmcs = shadow_vmcs;
 	}
 	INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
 	vmx->nested.vmcs02_num = 0;
 	hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
 		     HRTIMER_MODE_REL_PINNED);
 	vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
@ -7276,9 +7211,9 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 	kfree(vmx->nested.cached_vmcs12);
 out_cached_vmcs12:
-	free_page((unsigned long)vmx->nested.msr_bitmap);
+	free_loaded_vmcs(&vmx->nested.vmcs02);
-out_msr_bitmap:
+out_vmcs02:
 	return -ENOMEM;
 }
@ -7421,10 +7356,6 @@ static void free_nested(struct vcpu_vmx *vmx)
 	free_vpid(vmx->nested.vpid02);
 	vmx->nested.posted_intr_nv = -1;
 	vmx->nested.current_vmptr = -1ull;
 	if (vmx->nested.msr_bitmap) {
 		free_page((unsigned long)vmx->nested.msr_bitmap);
 		vmx->nested.msr_bitmap = NULL;
 	}
 	if (enable_shadow_vmcs) {
 		vmx_disable_shadow_vmcs(vmx);
 		vmcs_clear(vmx->vmcs01.shadow_vmcs);
@ -7432,7 +7363,7 @@ static void free_nested(struct vcpu_vmx *vmx)
 		vmx->vmcs01.shadow_vmcs = NULL;
 	}
 	kfree(vmx->nested.cached_vmcs12);
-	/* Unpin physical memory we referred to in current vmcs02 */
+	/* Unpin physical memory we referred to in the vmcs02 */
 	if (vmx->nested.apic_access_page) {
 		kvm_release_page_dirty(vmx->nested.apic_access_page);
 		vmx->nested.apic_access_page = NULL;
@ -7448,7 +7379,7 @@ static void free_nested(struct vcpu_vmx *vmx)
 		vmx->nested.pi_desc = NULL;
 	}
-	nested_free_all_saved_vmcss(vmx);
+	free_loaded_vmcs(&vmx->nested.vmcs02);
 }
 /* Emulate the VMXOFF instruction */
@ -7491,8 +7422,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 			vmptr + offsetof(struct vmcs12, launch_state),
 			&zero, sizeof(zero));
 	nested_free_vmcs02(vmx, vmptr);
 	nested_vmx_succeed(vcpu);
 	return kvm_skip_emulated_instruction(vcpu);
 }
@ -8404,10 +8333,11 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
 	/*
 	 * The host physical addresses of some pages of guest memory
-	 * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
+	 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
-	 * may write to these pages via their host physical address while
+	 * Page). The CPU may write to these pages via their host
-	 * L2 is running, bypassing any address-translation-based dirty
+	 * physical address while L2 is running, bypassing any
-	 * tracking (e.g. EPT write protection).
+	 * address-translation-based dirty tracking (e.g. EPT write
 	 * protection).
 	 *
 	 * Mark them dirty on every exit from L2 to prevent them from
 	 * getting out of sync with dirty tracking.
@ -8941,7 +8871,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
 	}
 	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
-	vmx_set_msr_bitmap(vcpu);
+	vmx_update_msr_bitmap(vcpu);
 }
 static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
@ -9602,6 +9532,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 {
 	int err;
 	struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
 	unsigned long *msr_bitmap;
 	int cpu;
 	if (!vmx)
@ -9634,13 +9565,20 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 	if (!vmx->guest_msrs)
 		goto free_pml;
-	vmx->loaded_vmcs = &vmx->vmcs01;
+	err = alloc_loaded_vmcs(&vmx->vmcs01);
-	vmx->loaded_vmcs->vmcs = alloc_vmcs();
+	if (err < 0)
 	vmx->loaded_vmcs->shadow_vmcs = NULL;
 	if (!vmx->loaded_vmcs->vmcs)
 		goto free_msrs;
 	loaded_vmcs_init(vmx->loaded_vmcs);
 	msr_bitmap = vmx->vmcs01.msr_bitmap;
 	vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
 	vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
 	vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
 	vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
 	vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
 	vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
 	vmx->msr_bitmap_mode = 0;
 	vmx->loaded_vmcs = &vmx->vmcs01;
 	cpu = get_cpu();
 	vmx_vcpu_load(&vmx->vcpu, cpu);
 	vmx->vcpu.cpu = cpu;
@ -10103,7 +10041,7 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
 	int msr;
 	struct page *page;
 	unsigned long *msr_bitmap_l1;
-	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
+	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
 	/* This shortcut is ok because we support only x2APIC MSRs so far. */
 	if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
@ -10680,6 +10618,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	if (kvm_has_tsc_control)
 		decache_tsc_multiplier(vmx);
 	if (cpu_has_vmx_msr_bitmap())
 		vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
 	if (enable_vpid) {
 		/*
 		 * There is no direct mapping between vpid02 and vpid12, the
@ -10901,20 +10842,15 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	struct loaded_vmcs *vmcs02;
 	u32 msr_entry_idx;
 	u32 exit_qual;
 	vmcs02 = nested_get_current_vmcs02(vmx);
 	if (!vmcs02)
 		return -ENOMEM;
 	enter_guest_mode(vcpu);
 	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
 		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
-	vmx_switch_vmcs(vcpu, vmcs02);
+	vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
 	vmx_segment_cache_clear(vmx);
 	if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
@ -11483,7 +11419,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
 	if (cpu_has_vmx_msr_bitmap())
-		vmx_set_msr_bitmap(vcpu);
+		vmx_update_msr_bitmap(vcpu);
 	if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
 				vmcs12->vm_exit_msr_load_count))
@ -11532,10 +11468,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	vm_exit_controls_reset_shadow(vmx);
 	vmx_segment_cache_clear(vmx);
 	/* if no vmcs02 cache requested, remove the one we used */
 	if (VMCS02_POOL_SIZE == 0)
 		nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
 	/* Update any VMCS fields that might have changed while L2 ran */
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);