vmm: Avoid embedding cpuset_t ioctl ABIs

Commit 0bda8d3e9f ("vmm: permit some IPIs to be handled by userspace")
embedded cpuset_t into the vmm(4) ioctl ABI.  This was a mistake: we
otherwise have some leeway to change cpuset_t for the whole system, but
the vmm ioctl ABI is supposed to stay stable.
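
To illustrate the hazard (a sketch, not part of this change): _IOWR()
encodes the size of the argument structure in the ioctl request number,
and sizeof(cpuset_t) follows CPU_MAXSIZE, which has changed between
releases (the FreeBSD 13 compat shim below assumes a 256-bit mask).
Growing the set therefore changes both the structure layout and the
VM_RUN request value seen by old binaries.  struct old_run here is a
hypothetical stand-in for the old, cpuset_t-embedding layout:

    #include <sys/param.h>
    #include <sys/cpuset.h>
    #include <sys/ioccom.h>

    #include <stdio.h>

    /* Hypothetical stand-in for the old cpuset_t-embedding layout. */
    struct old_run {
            int        cpuid;
            cpuset_t   dmask;        /* size tracks CPU_MAXSIZE */
    };

    int
    main(void)
    {
            /* Both values shift whenever cpuset_t grows. */
            printf("sizeof(cpuset_t) = %zu, request = %#lx\n",
                sizeof(cpuset_t),
                (unsigned long)_IOWR('v', 1, struct old_run));
            return (0);
    }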

Rework IPI reporting to avoid this problem.  Along the way, make VM_RUN
a bit more efficient:
- Split vmexit metadata out of the main VM_RUN structure.  This data is
  only written by the kernel.
- Have userspace pass a cpuset_t pointer and cpusetsize in the VM_RUN
  structure, as is done for cpuset syscalls (see the sketch after this
  list).
- Have the destination CPU mask for VM_EXITCODE_IPI exits live outside
  the vmexit info structure, and make VM_RUN copy it out separately.
  Zero out any extra bytes in the CPU mask, like cpuset syscalls do.
- Modify the vmexit handler prototype to take a full VM_RUN structure.
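
For reference, a minimal sketch of the new userspace calling convention,
modeled on the bhyve vm_loop() change in this diff.  The run_once()
helper is illustrative only; the vcpu handle is assumed to come from
vm_vcpu_open(), and handling of the individual exit codes is elided.

    #include <sys/types.h>
    #include <sys/cpuset.h>

    #include <machine/vmm.h>
    #include <machine/vmm_dev.h>

    #include <string.h>
    #include <vmmapi.h>

    static int
    run_once(struct vcpu *vcpu)
    {
            struct vm_exit vme;
            struct vm_run vmrun;
            cpuset_t dmask;
            int error;

            memset(&vmrun, 0, sizeof(vmrun));
            vmrun.vm_exit = &vme;          /* exit metadata lands here */
            vmrun.cpuset = &dmask;         /* IPI destination mask storage */
            vmrun.cpusetsize = sizeof(dmask);

            error = vm_run(vcpu, &vmrun);  /* wraps the VM_RUN ioctl */
            if (error == 0 && vme.exitcode == VM_EXITCODE_IPI) {
                    /*
                     * dmask now holds the target vCPU set; if cpusetsize
                     * were larger than the kernel's cpuset_t, the extra
                     * bytes would have been zeroed.
                     */
            }
            return (error);
    }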

PR:		271330
Reviewed by:	corvink, jhb (previous versions)
Sponsored by:	The FreeBSD Foundation
Differential Revision:	https://reviews.freebsd.org/D40113
Mark Johnston 2023-05-23 21:13:33 -04:00
parent 0be82d56b4
commit e17eca3276
10 changed files with 182 additions and 58 deletions

View file

@@ -721,16 +721,9 @@ vm_get_register_set(struct vcpu *vcpu, unsigned int count,
}
int
vm_run(struct vcpu *vcpu, struct vm_exit *vmexit)
vm_run(struct vcpu *vcpu, struct vm_run *vmrun)
{
int error;
struct vm_run vmrun;
bzero(&vmrun, sizeof(vmrun));
error = vcpu_ioctl(vcpu, VM_RUN, &vmrun);
bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit));
return (error);
return (vcpu_ioctl(vcpu, VM_RUN, vmrun));
}
int

View file

@@ -155,7 +155,7 @@ int vm_set_register_set(struct vcpu *vcpu, unsigned int count,
const int *regnums, uint64_t *regvals);
int vm_get_register_set(struct vcpu *vcpu, unsigned int count,
const int *regnums, uint64_t *regvals);
int vm_run(struct vcpu *vcpu, struct vm_exit *ret_vmexit);
int vm_run(struct vcpu *vcpu, struct vm_run *vmrun);
int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how);
int vm_reinit(struct vmctx *ctx);
int vm_apicid2vcpu(struct vmctx *ctx, int apicid);

View file

@@ -273,7 +273,7 @@ int vm_get_seg_desc(struct vcpu *vcpu, int reg,
struct seg_desc *ret_desc);
int vm_set_seg_desc(struct vcpu *vcpu, int reg,
struct seg_desc *desc);
int vm_run(struct vcpu *vcpu, struct vm_exit *vme_user);
int vm_run(struct vcpu *vcpu);
int vm_suspend(struct vm *vm, enum vm_suspend_how how);
int vm_inject_nmi(struct vcpu *vcpu);
int vm_nmi_pending(struct vcpu *vcpu);
@@ -297,6 +297,7 @@ int vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu);
int vm_resume_cpu(struct vm *vm, struct vcpu *vcpu);
int vm_restart_instruction(struct vcpu *vcpu);
struct vm_exit *vm_exitinfo(struct vcpu *vcpu);
cpuset_t *vm_exitinfo_cpuset(struct vcpu *vcpu);
void vm_exit_suspended(struct vcpu *vcpu, uint64_t rip);
void vm_exit_debug(struct vcpu *vcpu, uint64_t rip);
void vm_exit_rendezvous(struct vcpu *vcpu, uint64_t rip);
@@ -754,9 +755,13 @@ struct vm_exit {
enum vm_suspend_how how;
} suspended;
struct {
/*
* The destination vCPU mask is saved in vcpu->cpuset
* and is copied out to userspace separately to avoid
* ABI concerns.
*/
uint32_t mode;
uint8_t vector;
cpuset_t dmask;
} ipi;
struct vm_task_switch task_switch;
} u;

View file

@@ -89,7 +89,9 @@ struct vm_register_set {
struct vm_run {
int cpuid;
struct vm_exit vm_exit;
cpuset_t *cpuset; /* CPU set storage */
size_t cpusetsize;
struct vm_exit *vm_exit;
};
struct vm_exception {
@@ -349,7 +351,7 @@ enum {
};
#define VM_RUN \
_IOWR('v', IOCNUM_RUN, struct vm_run)
_IOW('v', IOCNUM_RUN, struct vm_run)
#define VM_SUSPEND \
_IOW('v', IOCNUM_SUSPEND, struct vm_suspend)
#define VM_REINIT \

View file

@@ -1146,7 +1146,7 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu)
vmexit->exitcode = VM_EXITCODE_IPI;
vmexit->u.ipi.mode = mode;
vmexit->u.ipi.vector = vec;
vmexit->u.ipi.dmask = ipimask;
*vm_exitinfo_cpuset(vlapic->vcpu) = ipimask;
*retu = true;
}
@@ -1166,7 +1166,7 @@ int
vm_handle_ipi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
{
struct vlapic *vlapic = vm_lapic(vcpu);
cpuset_t *dmask = &vme->u.ipi.dmask;
cpuset_t *dmask = vm_exitinfo_cpuset(vcpu);
uint8_t vec = vme->u.ipi.vector;
*retu = true;

View file

@@ -123,6 +123,7 @@ struct vcpu {
uint64_t guest_xcr0; /* (i) guest %xcr0 register */
void *stats; /* (a,i) statistics */
struct vm_exit exitinfo; /* (x) exit reason and collateral */
cpuset_t exitinfo_cpuset; /* (x) storage for vmexit handlers */
uint64_t nextrip; /* (x) next instruction to execute */
uint64_t tsc_offset; /* (o) TSC offsetting */
};
@@ -399,6 +400,12 @@ vm_exitinfo(struct vcpu *vcpu)
return (&vcpu->exitinfo);
}
cpuset_t *
vm_exitinfo_cpuset(struct vcpu *vcpu)
{
return (&vcpu->exitinfo_cpuset);
}
static int
vmm_init(void)
{
@@ -1837,7 +1844,7 @@ vm_exit_astpending(struct vcpu *vcpu, uint64_t rip)
}
int
vm_run(struct vcpu *vcpu, struct vm_exit *vme_user)
vm_run(struct vcpu *vcpu)
{
struct vm *vm = vcpu->vm;
struct vm_eventinfo evinfo;
@@ -1938,8 +1945,6 @@ vm_run(struct vcpu *vcpu, struct vm_exit *vme_user)
vmm_stat_incr(vcpu, VMEXIT_USERSPACE, 1);
VMM_CTR2(vcpu, "retu %d/%d", error, vme->exitcode);
/* copy the exit information */
*vme_user = *vme;
return (error);
}

View file

@@ -93,7 +93,29 @@ struct vm_snapshot_meta_old {
#define VM_SNAPSHOT_REQ_OLD \
_IOWR('v', IOCNUM_SNAPSHOT_REQ, struct vm_snapshot_meta_old)
#endif
struct vm_exit_ipi_13 {
uint32_t mode;
uint8_t vector;
__BITSET_DEFINE(, 256) dmask;
};
struct vm_exit_13 {
uint32_t exitcode;
int32_t inst_length;
uint64_t rip;
uint64_t u[120 / sizeof(uint64_t)];
};
struct vm_run_13 {
int cpuid;
struct vm_exit_13 vm_exit;
};
#define VM_RUN_13 \
_IOWR('v', IOCNUM_RUN, struct vm_run_13)
#endif /* COMPAT_FREEBSD13 */
struct devmem_softc {
int segid;
@@ -396,6 +418,9 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct vm_seg_desc *vmsegdesc;
struct vm_register_set *vmregset;
struct vm_run *vmrun;
#ifdef COMPAT_FREEBSD13
struct vm_run_13 *vmrun_13;
#endif
struct vm_exception *vmexc;
struct vm_lapic_irq *vmirq;
struct vm_lapic_msi *vmmsi;
@@ -459,6 +484,9 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
*/
switch (cmd) {
case VM_RUN:
#ifdef COMPAT_FREEBSD13
case VM_RUN_13:
#endif
case VM_GET_REGISTER:
case VM_SET_REGISTER:
case VM_GET_SEGMENT_DESCRIPTOR:
@@ -579,11 +607,73 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
break;
}
switch(cmd) {
case VM_RUN:
switch (cmd) {
case VM_RUN: {
struct vm_exit *vme;
vmrun = (struct vm_run *)data;
error = vm_run(vcpu, &vmrun->vm_exit);
vme = vm_exitinfo(vcpu);
error = vm_run(vcpu);
if (error != 0)
break;
error = copyout(vme, vmrun->vm_exit, sizeof(*vme));
if (error != 0)
break;
if (vme->exitcode == VM_EXITCODE_IPI) {
error = copyout(vm_exitinfo_cpuset(vcpu),
vmrun->cpuset,
min(vmrun->cpusetsize, sizeof(cpuset_t)));
if (error != 0)
break;
if (sizeof(cpuset_t) < vmrun->cpusetsize) {
uint8_t *p;
p = (uint8_t *)vmrun->cpuset +
sizeof(cpuset_t);
while (error == 0 &&
p < (uint8_t *)vmrun->cpuset +
vmrun->cpusetsize) {
error = subyte(p++, 0);
}
}
}
break;
}
#ifdef COMPAT_FREEBSD13
case VM_RUN_13: {
struct vm_exit *vme;
struct vm_exit_13 *vme_13;
vmrun_13 = (struct vm_run_13 *)data;
vme_13 = &vmrun_13->vm_exit;
vme = vm_exitinfo(vcpu);
error = vm_run(vcpu);
if (error == 0) {
vme_13->exitcode = vme->exitcode;
vme_13->inst_length = vme->inst_length;
vme_13->rip = vme->rip;
memcpy(vme_13->u, &vme->u, sizeof(vme_13->u));
if (vme->exitcode == VM_EXITCODE_IPI) {
struct vm_exit_ipi_13 *ipi;
cpuset_t *dmask;
int cpu;
dmask = vm_exitinfo_cpuset(vcpu);
ipi = (struct vm_exit_ipi_13 *)&vme_13->u[0];
BIT_ZERO(256, &ipi->dmask);
CPU_FOREACH_ISSET(cpu, dmask) {
if (cpu >= 256)
break;
BIT_SET(256, cpu, &ipi->dmask);
}
}
}
break;
}
#endif
case VM_SUSPEND:
vmsuspend = (struct vm_suspend *)data;
error = vm_suspend(sc->vm, vmsuspend->how);

View file

@@ -184,7 +184,7 @@ static const char * const vmx_exit_reason_desc[] = {
[EXIT_REASON_XRSTORS] = "XRSTORS"
};
typedef int (*vmexit_handler_t)(struct vmctx *, struct vcpu *, struct vm_exit *);
typedef int (*vmexit_handler_t)(struct vmctx *, struct vcpu *, struct vm_run *);
int guest_ncpus;
uint16_t cpu_cores, cpu_sockets, cpu_threads;
@@ -592,11 +592,13 @@ vmexit_handle_notify(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
}
static int
vmexit_inout(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vme)
vmexit_inout(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun)
{
struct vm_exit *vme;
int error;
int bytes, port, in, out;
vme = vmrun->vm_exit;
port = vme->u.inout.port;
bytes = vme->u.inout.bytes;
in = vme->u.inout.in;
@@ -621,12 +623,16 @@ vmexit_inout(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vme)
}
static int
vmexit_rdmsr(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_exit *vme)
vmexit_rdmsr(struct vmctx *ctx __unused, struct vcpu *vcpu,
struct vm_run *vmrun)
{
struct vm_exit *vme;
uint64_t val;
uint32_t eax, edx;
int error;
vme = vmrun->vm_exit;
val = 0;
error = emulate_rdmsr(vcpu, vme->u.msr.code, &val);
if (error != 0) {
@@ -650,10 +656,14 @@ vmexit_rdmsr(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_exit *vme)
}
static int
vmexit_wrmsr(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_exit *vme)
vmexit_wrmsr(struct vmctx *ctx __unused, struct vcpu *vcpu,
struct vm_run *vmrun)
{
struct vm_exit *vme;
int error;
vme = vmrun->vm_exit;
error = emulate_wrmsr(vcpu, vme->u.msr.code, vme->u.msr.wval);
if (error != 0) {
fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",
@@ -685,8 +695,11 @@ vmexit_vmx_desc(uint32_t exit_reason)
}
static int
vmexit_vmx(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vme)
vmexit_vmx(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun)
{
struct vm_exit *vme;
vme = vmrun->vm_exit;
fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu));
fprintf(stderr, "\treason\t\tVMX\n");
@@ -718,8 +731,11 @@ vmexit_vmx(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vme)
}
static int
vmexit_svm(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_exit *vme)
vmexit_svm(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun)
{
struct vm_exit *vme;
vme = vmrun->vm_exit;
fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu));
fprintf(stderr, "\treason\t\tSVM\n");
@@ -733,10 +749,9 @@ vmexit_svm(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_exit *vme)
static int
vmexit_bogus(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
struct vm_exit *vme)
struct vm_run *vmrun)
{
assert(vme->inst_length == 0);
assert(vmrun->vm_exit->inst_length == 0);
stats.vmexit_bogus++;
@@ -745,10 +760,9 @@ vmexit_bogus(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
static int
vmexit_reqidle(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
struct vm_exit *vme)
struct vm_run *vmrun)
{
assert(vme->inst_length == 0);
assert(vmrun->vm_exit->inst_length == 0);
stats.vmexit_reqidle++;
@@ -757,9 +771,8 @@ vmexit_reqidle(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
static int
vmexit_hlt(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
struct vm_exit *vme __unused)
struct vm_run *vmrun __unused)
{
stats.vmexit_hlt++;
/*
@@ -772,9 +785,8 @@ vmexit_hlt(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
static int
vmexit_pause(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
struct vm_exit *vme __unused)
struct vm_run *vmrun __unused)
{
stats.vmexit_pause++;
return (VMEXIT_CONTINUE);
@@ -782,10 +794,9 @@ vmexit_pause(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
static int
vmexit_mtrap(struct vmctx *ctx __unused, struct vcpu *vcpu,
struct vm_exit *vme)
struct vm_run *vmrun)
{
assert(vme->inst_length == 0);
assert(vmrun->vm_exit->inst_length == 0);
stats.vmexit_mtrap++;
@@ -802,12 +813,15 @@ vmexit_mtrap(struct vmctx *ctx __unused, struct vcpu *vcpu,
static int
vmexit_inst_emul(struct vmctx *ctx __unused, struct vcpu *vcpu,
struct vm_exit *vme)
struct vm_run *vmrun)
{
int err, i, cs_d;
struct vm_exit *vme;
struct vie *vie;
int err, i, cs_d;
enum vm_cpu_mode mode;
vme = vmrun->vm_exit;
stats.vmexit_inst_emul++;
vie = &vme->u.inst_emul.vie;
@@ -852,11 +866,14 @@ static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER;
static int
vmexit_suspend(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vme)
vmexit_suspend(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun)
{
struct vm_exit *vme;
enum vm_suspend_how how;
int vcpuid = vcpu_id(vcpu);
vme = vmrun->vm_exit;
how = vme->u.suspended.how;
fbsdrun_deletecpu(vcpuid);
@@ -894,7 +911,7 @@ vmexit_suspend(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vme)
static int
vmexit_debug(struct vmctx *ctx __unused, struct vcpu *vcpu,
struct vm_exit *vme __unused)
struct vm_run *vmrun __unused)
{
#ifdef BHYVE_SNAPSHOT
@@ -914,22 +931,27 @@ vmexit_debug(struct vmctx *ctx __unused, struct vcpu *vcpu,
static int
vmexit_breakpoint(struct vmctx *ctx __unused, struct vcpu *vcpu,
struct vm_exit *vme)
struct vm_run *vmrun)
{
gdb_cpu_breakpoint(vcpu, vme);
gdb_cpu_breakpoint(vcpu, vmrun->vm_exit);
return (VMEXIT_CONTINUE);
}
static int
vmexit_ipi(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
struct vm_exit *vme)
struct vm_run *vmrun)
{
struct vm_exit *vme;
cpuset_t *dmask;
int error = -1;
int i;
dmask = vmrun->cpuset;
vme = vmrun->vm_exit;
switch (vme->u.ipi.mode) {
case APIC_DELMODE_INIT:
CPU_FOREACH_ISSET(i, &vme->u.ipi.dmask) {
CPU_FOREACH_ISSET(i, dmask) {
error = vm_suspend_cpu(vcpu_info[i].vcpu);
if (error) {
warnx("%s: failed to suspend cpu %d\n",
@@ -939,7 +961,7 @@ vmexit_ipi(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
}
break;
case APIC_DELMODE_STARTUP:
CPU_FOREACH_ISSET(i, &vme->u.ipi.dmask) {
CPU_FOREACH_ISSET(i, dmask) {
spinup_ap(vcpu_info[i].vcpu,
vme->u.ipi.vector << PAGE_SHIFT);
}
@@ -974,15 +996,20 @@ static void
vm_loop(struct vmctx *ctx, struct vcpu *vcpu)
{
struct vm_exit vme;
struct vm_run vmrun;
int error, rc;
enum vm_exitcode exitcode;
cpuset_t active_cpus;
cpuset_t active_cpus, dmask;
error = vm_active_cpus(ctx, &active_cpus);
assert(CPU_ISSET(vcpu_id(vcpu), &active_cpus));
vmrun.vm_exit = &vme;
vmrun.cpuset = &dmask;
vmrun.cpusetsize = sizeof(dmask);
while (1) {
error = vm_run(vcpu, &vme);
error = vm_run(vcpu, &vmrun);
if (error != 0)
break;
@@ -993,7 +1020,7 @@ vm_loop(struct vmctx *ctx, struct vcpu *vcpu)
exit(4);
}
rc = (*handler[exitcode])(ctx, vcpu, &vme);
rc = (*handler[exitcode])(ctx, vcpu, &vmrun);
switch (rc) {
case VMEXIT_CONTINUE:

View file

@@ -39,7 +39,7 @@ extern uint16_t cpu_cores, cpu_sockets, cpu_threads;
struct vcpu;
struct vmctx;
struct vm_exit;
struct vm_run;
void *paddr_guest2host(struct vmctx *ctx, uintptr_t addr, size_t len);
#ifdef BHYVE_SNAPSHOT
@@ -48,6 +48,6 @@ uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr);
int fbsdrun_virtio_msix(void);
int vmexit_task_switch(struct vmctx *, struct vcpu *, struct vm_exit *);
int vmexit_task_switch(struct vmctx *, struct vcpu *, struct vm_run *);
#endif

View file

@@ -704,7 +704,7 @@ push_errcode(struct vcpu *vcpu, struct vm_guest_paging *paging,
} while (0)
int
vmexit_task_switch(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vmexit)
vmexit_task_switch(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun)
{
struct seg_desc nt;
struct tss32 oldtss, newtss;
@@ -712,12 +712,14 @@ vmexit_task_switch(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vmexit)
struct vm_guest_paging *paging, sup_paging;
struct user_segment_descriptor nt_desc, ot_desc;
struct iovec nt_iov[2], ot_iov[2];
struct vm_exit *vmexit;
uint64_t cr0, ot_base;
uint32_t eip, ot_lim, access;
int error, ext, fault, minlimit, nt_type, ot_type;
enum task_switch_reason reason;
uint16_t nt_sel, ot_sel;
vmexit = vmrun->vm_exit;
task_switch = &vmexit->u.task_switch;
nt_sel = task_switch->tsssel;
ext = vmexit->u.task_switch.ext;