Merge branch 'msr-bitmaps' of git://git.kernel.org/pub/scm/virt/kvm/kvm into x86/pti

Pull the KVM prerequisites so the IBPB patches apply.
This commit is contained in:
Thomas Gleixner 2018-02-03 22:30:16 +01:00
commit a96223f192

View file

@ -112,6 +112,14 @@ static u64 __read_mostly host_xss;
static bool __read_mostly enable_pml = 1; static bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO); module_param_named(pml, enable_pml, bool, S_IRUGO);
#define MSR_TYPE_R 1
#define MSR_TYPE_W 2
#define MSR_TYPE_RW 3
#define MSR_BITMAP_MODE_X2APIC 1
#define MSR_BITMAP_MODE_X2APIC_APICV 2
#define MSR_BITMAP_MODE_LM 4
#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
/* Guest_tsc -> host_tsc conversion requires 64-bit division. */ /* Guest_tsc -> host_tsc conversion requires 64-bit division. */
@ -186,7 +194,6 @@ module_param(ple_window_max, int, S_IRUGO);
extern const ulong vmx_return; extern const ulong vmx_return;
#define NR_AUTOLOAD_MSRS 8 #define NR_AUTOLOAD_MSRS 8
#define VMCS02_POOL_SIZE 1
struct vmcs { struct vmcs {
u32 revision_id; u32 revision_id;
@ -211,6 +218,7 @@ struct loaded_vmcs {
int soft_vnmi_blocked; int soft_vnmi_blocked;
ktime_t entry_time; ktime_t entry_time;
s64 vnmi_blocked_time; s64 vnmi_blocked_time;
unsigned long *msr_bitmap;
struct list_head loaded_vmcss_on_cpu_link; struct list_head loaded_vmcss_on_cpu_link;
}; };
@ -227,7 +235,7 @@ struct shared_msr_entry {
* stored in guest memory specified by VMPTRLD, but is opaque to the guest, * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
* which must access it using VMREAD/VMWRITE/VMCLEAR instructions. * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
* More than one of these structures may exist, if L1 runs multiple L2 guests. * More than one of these structures may exist, if L1 runs multiple L2 guests.
* nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
* underlying hardware which will be used to run L2. * underlying hardware which will be used to run L2.
* This structure is packed to ensure that its layout is identical across * This structure is packed to ensure that its layout is identical across
* machines (necessary for live migration). * machines (necessary for live migration).
@ -410,13 +418,6 @@ struct __packed vmcs12 {
*/ */
#define VMCS12_SIZE 0x1000 #define VMCS12_SIZE 0x1000
/* Used to remember the last vmcs02 used for some recently used vmcs12s */
struct vmcs02_list {
struct list_head list;
gpa_t vmptr;
struct loaded_vmcs vmcs02;
};
/* /*
* The nested_vmx structure is part of vcpu_vmx, and holds information we need * The nested_vmx structure is part of vcpu_vmx, and holds information we need
* for correct emulation of VMX (i.e., nested VMX) on this vcpu. * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@ -441,15 +442,15 @@ struct nested_vmx {
*/ */
bool sync_shadow_vmcs; bool sync_shadow_vmcs;
/* vmcs02_list cache of VMCSs recently used to run L2 guests */
struct list_head vmcs02_pool;
int vmcs02_num;
bool change_vmcs01_virtual_x2apic_mode; bool change_vmcs01_virtual_x2apic_mode;
/* L2 must run next, and mustn't decide to exit to L1. */ /* L2 must run next, and mustn't decide to exit to L1. */
bool nested_run_pending; bool nested_run_pending;
struct loaded_vmcs vmcs02;
/* /*
* Guest pages referred to in vmcs02 with host-physical pointers, so * Guest pages referred to in the vmcs02 with host-physical
* we must keep them pinned while L2 runs. * pointers, so we must keep them pinned while L2 runs.
*/ */
struct page *apic_access_page; struct page *apic_access_page;
struct page *virtual_apic_page; struct page *virtual_apic_page;
@ -458,8 +459,6 @@ struct nested_vmx {
bool pi_pending; bool pi_pending;
u16 posted_intr_nv; u16 posted_intr_nv;
unsigned long *msr_bitmap;
struct hrtimer preemption_timer; struct hrtimer preemption_timer;
bool preemption_timer_expired; bool preemption_timer_expired;
@ -582,6 +581,7 @@ struct vcpu_vmx {
struct kvm_vcpu vcpu; struct kvm_vcpu vcpu;
unsigned long host_rsp; unsigned long host_rsp;
u8 fail; u8 fail;
u8 msr_bitmap_mode;
u32 exit_intr_info; u32 exit_intr_info;
u32 idt_vectoring_info; u32 idt_vectoring_info;
ulong rflags; ulong rflags;
@ -933,6 +933,7 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
u16 error_code); u16 error_code);
static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@ -952,12 +953,6 @@ static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
enum { enum {
VMX_IO_BITMAP_A, VMX_IO_BITMAP_A,
VMX_IO_BITMAP_B, VMX_IO_BITMAP_B,
VMX_MSR_BITMAP_LEGACY,
VMX_MSR_BITMAP_LONGMODE,
VMX_MSR_BITMAP_LEGACY_X2APIC_APICV,
VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV,
VMX_MSR_BITMAP_LEGACY_X2APIC,
VMX_MSR_BITMAP_LONGMODE_X2APIC,
VMX_VMREAD_BITMAP, VMX_VMREAD_BITMAP,
VMX_VMWRITE_BITMAP, VMX_VMWRITE_BITMAP,
VMX_BITMAP_NR VMX_BITMAP_NR
@ -967,12 +962,6 @@ static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
#define vmx_io_bitmap_a (vmx_bitmap[VMX_IO_BITMAP_A]) #define vmx_io_bitmap_a (vmx_bitmap[VMX_IO_BITMAP_A])
#define vmx_io_bitmap_b (vmx_bitmap[VMX_IO_BITMAP_B]) #define vmx_io_bitmap_b (vmx_bitmap[VMX_IO_BITMAP_B])
#define vmx_msr_bitmap_legacy (vmx_bitmap[VMX_MSR_BITMAP_LEGACY])
#define vmx_msr_bitmap_longmode (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE])
#define vmx_msr_bitmap_legacy_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV])
#define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV])
#define vmx_msr_bitmap_legacy_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC])
#define vmx_msr_bitmap_longmode_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC])
#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
@ -2570,36 +2559,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
vmx->guest_msrs[from] = tmp; vmx->guest_msrs[from] = tmp;
} }
static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
{
unsigned long *msr_bitmap;
if (is_guest_mode(vcpu))
msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
else if (cpu_has_secondary_exec_ctrls() &&
(vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
if (is_long_mode(vcpu))
msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv;
else
msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv;
} else {
if (is_long_mode(vcpu))
msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
else
msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
}
} else {
if (is_long_mode(vcpu))
msr_bitmap = vmx_msr_bitmap_longmode;
else
msr_bitmap = vmx_msr_bitmap_legacy;
}
vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
}
/* /*
* Set up the vmcs to automatically save and restore system * Set up the vmcs to automatically save and restore system
* msrs. Don't touch the 64-bit msrs if the guest is in legacy * msrs. Don't touch the 64-bit msrs if the guest is in legacy
@ -2640,7 +2599,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
vmx->save_nmsrs = save_nmsrs; vmx->save_nmsrs = save_nmsrs;
if (cpu_has_vmx_msr_bitmap()) if (cpu_has_vmx_msr_bitmap())
vmx_set_msr_bitmap(&vmx->vcpu); vmx_update_msr_bitmap(&vmx->vcpu);
} }
/* /*
@ -3835,11 +3794,6 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
return vmcs; return vmcs;
} }
static struct vmcs *alloc_vmcs(void)
{
return alloc_vmcs_cpu(raw_smp_processor_id());
}
static void free_vmcs(struct vmcs *vmcs) static void free_vmcs(struct vmcs *vmcs)
{ {
free_pages((unsigned long)vmcs, vmcs_config.order); free_pages((unsigned long)vmcs, vmcs_config.order);
@ -3855,9 +3809,38 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
loaded_vmcs_clear(loaded_vmcs); loaded_vmcs_clear(loaded_vmcs);
free_vmcs(loaded_vmcs->vmcs); free_vmcs(loaded_vmcs->vmcs);
loaded_vmcs->vmcs = NULL; loaded_vmcs->vmcs = NULL;
if (loaded_vmcs->msr_bitmap)
free_page((unsigned long)loaded_vmcs->msr_bitmap);
WARN_ON(loaded_vmcs->shadow_vmcs != NULL); WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
} }
static struct vmcs *alloc_vmcs(void)
{
return alloc_vmcs_cpu(raw_smp_processor_id());
}
static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
{
loaded_vmcs->vmcs = alloc_vmcs();
if (!loaded_vmcs->vmcs)
return -ENOMEM;
loaded_vmcs->shadow_vmcs = NULL;
loaded_vmcs_init(loaded_vmcs);
if (cpu_has_vmx_msr_bitmap()) {
loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!loaded_vmcs->msr_bitmap)
goto out_vmcs;
memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
}
return 0;
out_vmcs:
free_loaded_vmcs(loaded_vmcs);
return -ENOMEM;
}
static void free_kvm_area(void) static void free_kvm_area(void)
{ {
int cpu; int cpu;
@ -4916,10 +4899,8 @@ static void free_vpid(int vpid)
spin_unlock(&vmx_vpid_lock); spin_unlock(&vmx_vpid_lock);
} }
#define MSR_TYPE_R 1 static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
#define MSR_TYPE_W 2 u32 msr, int type)
static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
u32 msr, int type)
{ {
int f = sizeof(unsigned long); int f = sizeof(unsigned long);
@ -4953,6 +4934,50 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
} }
} }
static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
u32 msr, int type)
{
int f = sizeof(unsigned long);
if (!cpu_has_vmx_msr_bitmap())
return;
/*
* See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
* have the write-low and read-high bitmap offsets the wrong way round.
* We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
*/
if (msr <= 0x1fff) {
if (type & MSR_TYPE_R)
/* read-low */
__set_bit(msr, msr_bitmap + 0x000 / f);
if (type & MSR_TYPE_W)
/* write-low */
__set_bit(msr, msr_bitmap + 0x800 / f);
} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
msr &= 0x1fff;
if (type & MSR_TYPE_R)
/* read-high */
__set_bit(msr, msr_bitmap + 0x400 / f);
if (type & MSR_TYPE_W)
/* write-high */
__set_bit(msr, msr_bitmap + 0xc00 / f);
}
}
static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
u32 msr, int type, bool value)
{
if (value)
vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
else
vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
}
/* /*
* If a msr is allowed by L0, we should check whether it is allowed by L1. * If a msr is allowed by L0, we should check whether it is allowed by L1.
* The corresponding bit will be cleared unless both of L0 and L1 allow it. * The corresponding bit will be cleared unless both of L0 and L1 allow it.
@ -4999,28 +5024,68 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
} }
} }
static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
{ {
if (!longmode_only) u8 mode = 0;
__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
msr, MSR_TYPE_R | MSR_TYPE_W); if (cpu_has_secondary_exec_ctrls() &&
__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
msr, MSR_TYPE_R | MSR_TYPE_W); SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
mode |= MSR_BITMAP_MODE_X2APIC;
if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
mode |= MSR_BITMAP_MODE_X2APIC_APICV;
}
if (is_long_mode(vcpu))
mode |= MSR_BITMAP_MODE_LM;
return mode;
} }
static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active) #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
u8 mode)
{ {
if (apicv_active) { int msr;
__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv,
msr, type); for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv, unsigned word = msr / BITS_PER_LONG;
msr, type); msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
} else { msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
msr, type);
__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
msr, type);
} }
if (mode & MSR_BITMAP_MODE_X2APIC) {
/*
* TPR reads and writes can be virtualized even if virtual interrupt
* delivery is not in use.
*/
vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
}
}
}
static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
u8 mode = vmx_msr_bitmap_mode(vcpu);
u8 changed = mode ^ vmx->msr_bitmap_mode;
if (!changed)
return;
vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
!(mode & MSR_BITMAP_MODE_LM));
if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
vmx->msr_bitmap_mode = mode;
} }
static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu) static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
@ -5272,7 +5337,7 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
} }
if (cpu_has_vmx_msr_bitmap()) if (cpu_has_vmx_msr_bitmap())
vmx_set_msr_bitmap(vcpu); vmx_update_msr_bitmap(vcpu);
} }
static u32 vmx_exec_control(struct vcpu_vmx *vmx) static u32 vmx_exec_control(struct vcpu_vmx *vmx)
@ -5459,7 +5524,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
} }
if (cpu_has_vmx_msr_bitmap()) if (cpu_has_vmx_msr_bitmap())
vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
@ -6742,7 +6807,7 @@ void vmx_enable_tdp(void)
static __init int hardware_setup(void) static __init int hardware_setup(void)
{ {
int r = -ENOMEM, i, msr; int r = -ENOMEM, i;
rdmsrl_safe(MSR_EFER, &host_efer); rdmsrl_safe(MSR_EFER, &host_efer);
@ -6762,9 +6827,6 @@ static __init int hardware_setup(void)
memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
if (setup_vmcs_config(&vmcs_config) < 0) { if (setup_vmcs_config(&vmcs_config) < 0) {
r = -EIO; r = -EIO;
goto out; goto out;
@ -6833,42 +6895,8 @@ static __init int hardware_setup(void)
kvm_tsc_scaling_ratio_frac_bits = 48; kvm_tsc_scaling_ratio_frac_bits = 48;
} }
vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
vmx_msr_bitmap_legacy, PAGE_SIZE);
memcpy(vmx_msr_bitmap_longmode_x2apic_apicv,
vmx_msr_bitmap_longmode, PAGE_SIZE);
memcpy(vmx_msr_bitmap_legacy_x2apic,
vmx_msr_bitmap_legacy, PAGE_SIZE);
memcpy(vmx_msr_bitmap_longmode_x2apic,
vmx_msr_bitmap_longmode, PAGE_SIZE);
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
for (msr = 0x800; msr <= 0x8ff; msr++) {
if (msr == 0x839 /* TMCCT */)
continue;
vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true);
}
/*
* TPR reads and writes can be virtualized even if virtual interrupt
* delivery is not in use.
*/
vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true);
vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false);
/* EOI */
vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true);
/* SELF-IPI */
vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true);
if (enable_ept) if (enable_ept)
vmx_enable_tdp(); vmx_enable_tdp();
else else
@ -6971,94 +6999,6 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
return handle_nop(vcpu); return handle_nop(vcpu);
} }
/*
* To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
* We could reuse a single VMCS for all the L2 guests, but we also want the
* option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
* allows keeping them loaded on the processor, and in the future will allow
* optimizations where prepare_vmcs02 doesn't need to set all the fields on
* every entry if they never change.
* So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
* (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
*
* The following functions allocate and free a vmcs02 in this pool.
*/
/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
{
struct vmcs02_list *item;
list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
if (item->vmptr == vmx->nested.current_vmptr) {
list_move(&item->list, &vmx->nested.vmcs02_pool);
return &item->vmcs02;
}
if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
/* Recycle the least recently used VMCS. */
item = list_last_entry(&vmx->nested.vmcs02_pool,
struct vmcs02_list, list);
item->vmptr = vmx->nested.current_vmptr;
list_move(&item->list, &vmx->nested.vmcs02_pool);
return &item->vmcs02;
}
/* Create a new VMCS */
item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
if (!item)
return NULL;
item->vmcs02.vmcs = alloc_vmcs();
item->vmcs02.shadow_vmcs = NULL;
if (!item->vmcs02.vmcs) {
kfree(item);
return NULL;
}
loaded_vmcs_init(&item->vmcs02);
item->vmptr = vmx->nested.current_vmptr;
list_add(&(item->list), &(vmx->nested.vmcs02_pool));
vmx->nested.vmcs02_num++;
return &item->vmcs02;
}
/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
{
struct vmcs02_list *item;
list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
if (item->vmptr == vmptr) {
free_loaded_vmcs(&item->vmcs02);
list_del(&item->list);
kfree(item);
vmx->nested.vmcs02_num--;
return;
}
}
/*
* Free all VMCSs saved for this vcpu, except the one pointed by
* vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
* must be &vmx->vmcs01.
*/
static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
{
struct vmcs02_list *item, *n;
WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
/*
* Something will leak if the above WARN triggers. Better than
* a use-after-free.
*/
if (vmx->loaded_vmcs == &item->vmcs02)
continue;
free_loaded_vmcs(&item->vmcs02);
list_del(&item->list);
kfree(item);
vmx->nested.vmcs02_num--;
}
}
/* /*
* The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
* set the success or error code of an emulated VMX instruction, as specified * set the success or error code of an emulated VMX instruction, as specified
@ -7239,13 +7179,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
{ {
struct vcpu_vmx *vmx = to_vmx(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs *shadow_vmcs; struct vmcs *shadow_vmcs;
int r;
if (cpu_has_vmx_msr_bitmap()) { r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
vmx->nested.msr_bitmap = if (r < 0)
(unsigned long *)__get_free_page(GFP_KERNEL); goto out_vmcs02;
if (!vmx->nested.msr_bitmap)
goto out_msr_bitmap;
}
vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
if (!vmx->nested.cached_vmcs12) if (!vmx->nested.cached_vmcs12)
@ -7262,9 +7200,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
vmx->vmcs01.shadow_vmcs = shadow_vmcs; vmx->vmcs01.shadow_vmcs = shadow_vmcs;
} }
INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
vmx->nested.vmcs02_num = 0;
hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
HRTIMER_MODE_REL_PINNED); HRTIMER_MODE_REL_PINNED);
vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
@ -7276,9 +7211,9 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
kfree(vmx->nested.cached_vmcs12); kfree(vmx->nested.cached_vmcs12);
out_cached_vmcs12: out_cached_vmcs12:
free_page((unsigned long)vmx->nested.msr_bitmap); free_loaded_vmcs(&vmx->nested.vmcs02);
out_msr_bitmap: out_vmcs02:
return -ENOMEM; return -ENOMEM;
} }
@ -7421,10 +7356,6 @@ static void free_nested(struct vcpu_vmx *vmx)
free_vpid(vmx->nested.vpid02); free_vpid(vmx->nested.vpid02);
vmx->nested.posted_intr_nv = -1; vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmptr = -1ull;
if (vmx->nested.msr_bitmap) {
free_page((unsigned long)vmx->nested.msr_bitmap);
vmx->nested.msr_bitmap = NULL;
}
if (enable_shadow_vmcs) { if (enable_shadow_vmcs) {
vmx_disable_shadow_vmcs(vmx); vmx_disable_shadow_vmcs(vmx);
vmcs_clear(vmx->vmcs01.shadow_vmcs); vmcs_clear(vmx->vmcs01.shadow_vmcs);
@ -7432,7 +7363,7 @@ static void free_nested(struct vcpu_vmx *vmx)
vmx->vmcs01.shadow_vmcs = NULL; vmx->vmcs01.shadow_vmcs = NULL;
} }
kfree(vmx->nested.cached_vmcs12); kfree(vmx->nested.cached_vmcs12);
/* Unpin physical memory we referred to in current vmcs02 */ /* Unpin physical memory we referred to in the vmcs02 */
if (vmx->nested.apic_access_page) { if (vmx->nested.apic_access_page) {
kvm_release_page_dirty(vmx->nested.apic_access_page); kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL; vmx->nested.apic_access_page = NULL;
@ -7448,7 +7379,7 @@ static void free_nested(struct vcpu_vmx *vmx)
vmx->nested.pi_desc = NULL; vmx->nested.pi_desc = NULL;
} }
nested_free_all_saved_vmcss(vmx); free_loaded_vmcs(&vmx->nested.vmcs02);
} }
/* Emulate the VMXOFF instruction */ /* Emulate the VMXOFF instruction */
@ -7491,8 +7422,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
vmptr + offsetof(struct vmcs12, launch_state), vmptr + offsetof(struct vmcs12, launch_state),
&zero, sizeof(zero)); &zero, sizeof(zero));
nested_free_vmcs02(vmx, vmptr);
nested_vmx_succeed(vcpu); nested_vmx_succeed(vcpu);
return kvm_skip_emulated_instruction(vcpu); return kvm_skip_emulated_instruction(vcpu);
} }
@ -8404,10 +8333,11 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
/* /*
* The host physical addresses of some pages of guest memory * The host physical addresses of some pages of guest memory
* are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
* may write to these pages via their host physical address while * Page). The CPU may write to these pages via their host
* L2 is running, bypassing any address-translation-based dirty * physical address while L2 is running, bypassing any
* tracking (e.g. EPT write protection). * address-translation-based dirty tracking (e.g. EPT write
* protection).
* *
* Mark them dirty on every exit from L2 to prevent them from * Mark them dirty on every exit from L2 to prevent them from
* getting out of sync with dirty tracking. * getting out of sync with dirty tracking.
@ -8941,7 +8871,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
} }
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
vmx_set_msr_bitmap(vcpu); vmx_update_msr_bitmap(vcpu);
} }
static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
@ -9602,6 +9532,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
{ {
int err; int err;
struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
unsigned long *msr_bitmap;
int cpu; int cpu;
if (!vmx) if (!vmx)
@ -9634,13 +9565,20 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (!vmx->guest_msrs) if (!vmx->guest_msrs)
goto free_pml; goto free_pml;
vmx->loaded_vmcs = &vmx->vmcs01; err = alloc_loaded_vmcs(&vmx->vmcs01);
vmx->loaded_vmcs->vmcs = alloc_vmcs(); if (err < 0)
vmx->loaded_vmcs->shadow_vmcs = NULL;
if (!vmx->loaded_vmcs->vmcs)
goto free_msrs; goto free_msrs;
loaded_vmcs_init(vmx->loaded_vmcs);
msr_bitmap = vmx->vmcs01.msr_bitmap;
vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
vmx->msr_bitmap_mode = 0;
vmx->loaded_vmcs = &vmx->vmcs01;
cpu = get_cpu(); cpu = get_cpu();
vmx_vcpu_load(&vmx->vcpu, cpu); vmx_vcpu_load(&vmx->vcpu, cpu);
vmx->vcpu.cpu = cpu; vmx->vcpu.cpu = cpu;
@ -10103,7 +10041,7 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
int msr; int msr;
struct page *page; struct page *page;
unsigned long *msr_bitmap_l1; unsigned long *msr_bitmap_l1;
unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap; unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
/* This shortcut is ok because we support only x2APIC MSRs so far. */ /* This shortcut is ok because we support only x2APIC MSRs so far. */
if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
@ -10680,6 +10618,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if (kvm_has_tsc_control) if (kvm_has_tsc_control)
decache_tsc_multiplier(vmx); decache_tsc_multiplier(vmx);
if (cpu_has_vmx_msr_bitmap())
vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
if (enable_vpid) { if (enable_vpid) {
/* /*
* There is no direct mapping between vpid02 and vpid12, the * There is no direct mapping between vpid02 and vpid12, the
@ -10901,20 +10842,15 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
{ {
struct vcpu_vmx *vmx = to_vmx(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct loaded_vmcs *vmcs02;
u32 msr_entry_idx; u32 msr_entry_idx;
u32 exit_qual; u32 exit_qual;
vmcs02 = nested_get_current_vmcs02(vmx);
if (!vmcs02)
return -ENOMEM;
enter_guest_mode(vcpu); enter_guest_mode(vcpu);
if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
vmx_switch_vmcs(vcpu, vmcs02); vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
vmx_segment_cache_clear(vmx); vmx_segment_cache_clear(vmx);
if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) { if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
@ -11483,7 +11419,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vmcs_write64(GUEST_IA32_DEBUGCTL, 0); vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
if (cpu_has_vmx_msr_bitmap()) if (cpu_has_vmx_msr_bitmap())
vmx_set_msr_bitmap(vcpu); vmx_update_msr_bitmap(vcpu);
if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
vmcs12->vm_exit_msr_load_count)) vmcs12->vm_exit_msr_load_count))
@ -11532,10 +11468,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vm_exit_controls_reset_shadow(vmx); vm_exit_controls_reset_shadow(vmx);
vmx_segment_cache_clear(vmx); vmx_segment_cache_clear(vmx);
/* if no vmcs02 cache requested, remove the one we used */
if (VMCS02_POOL_SIZE == 0)
nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
/* Update any VMCS fields that might have changed while L2 ran */ /* Update any VMCS fields that might have changed while L2 ran */
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);