vmm: Use an sx lock to protect the memory map.

Previously, bhyve obtained a "read lock" on the memory map for ioctls
needing to read the map by locking the last vCPU.  That scheme is now
replaced by a new per-VM sx lock.  Modifying the map requires
exclusively locking the sx lock as well as locking all existing vCPUs.
Reading the map requires either locking one vCPU or the sx lock.

This permits safely modifying or querying the memory map while some
vCPUs do not exist, which will become possible in a future commit.

Reviewed by:	corvink, markj
Differential Revision:	https://reviews.freebsd.org/D37172
commit 67b69e76e8
parent 08ebb36076
Author: John Baldwin
Date:   2022-11-18 10:04:37 -08:00

3 changed files with 72 additions and 52 deletions
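
For reference, here is a minimal sketch of the protocol described in the
message, expressed against the API this change introduces.  The wrapper
function names are invented for illustration and error handling is elided;
vm_slock_memsegs(), vm_xlock_memsegs(), vm_unlock_memsegs(),
vm_get_memseg(), and vm_mmap_memseg() are taken from the diff below.

/*
 * Reader: a shared lock (or, alternatively, one frozen vCPU) is
 * enough to inspect the memory map.
 */
static void
memmap_read_sketch(struct vm *vm, int segid)
{
	size_t len;
	bool sysmem;
	vm_object_t obj;

	vm_slock_memsegs(vm);
	(void)vm_get_memseg(vm, segid, &len, &sysmem, &obj);
	vm_unlock_memsegs(vm);
}

/*
 * Writer: an exclusive lock is required, and every existing vCPU
 * must also be frozen before the map is modified.
 */
static int
memmap_write_sketch(struct vm *vm, vm_paddr_t gpa, int segid,
    vm_ooffset_t off, size_t len, int prot, int flags)
{
	int error;

	vm_xlock_memsegs(vm);
	/* ... freeze all vCPUs here, as vmmdev_ioctl() does ... */
	error = vm_mmap_memseg(vm, gpa, segid, off, len, prot, flags);
	/* ... unfreeze the vCPUs ... */
	vm_unlock_memsegs(vm);
	return (error);
}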

sys/amd64/include/vmm.h

@@ -231,6 +231,9 @@ int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
/*
* APIs that modify the guest memory map require all vcpus to be frozen.
*/
void vm_slock_memsegs(struct vm *vm);
void vm_xlock_memsegs(struct vm *vm);
void vm_unlock_memsegs(struct vm *vm);
int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off,
size_t len, int prot, int flags);
int vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len);

sys/amd64/vmm/vmm.c

@@ -46,6 +46,7 @@ __FBSDID("$FreeBSD$");
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/vnode.h>
#include <vm/vm.h>
@@ -155,6 +156,11 @@ struct mem_map {
* (o) initialized the first time the VM is created
* (i) initialized when VM is created and when it is reinitialized
* (x) initialized before use
*
* Locking:
* [m] mem_segs_lock
* [r] rendezvous_mtx
* [v] reads require one frozen vcpu, writes require freezing all vcpus
*/
struct vm {
void *cookie; /* (i) cpu-specific data */
@@ -170,13 +176,13 @@ struct vm {
int suspend; /* (i) stop VM execution */
volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */
cpuset_t rendezvous_req_cpus; /* (x) rendezvous requested */
cpuset_t rendezvous_done_cpus; /* (x) rendezvous finished */
void *rendezvous_arg; /* (x) rendezvous func/arg */
cpuset_t rendezvous_req_cpus; /* (x) [r] rendezvous requested */
cpuset_t rendezvous_done_cpus; /* (x) [r] rendezvous finished */
void *rendezvous_arg; /* (x) [r] rendezvous func/arg */
vm_rendezvous_func_t rendezvous_func;
struct mtx rendezvous_mtx; /* (o) rendezvous lock */
struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) [m+v] guest address space */
struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) [m+v] guest memory regions */
struct vmspace *vmspace; /* (o) guest's address space */
char name[VM_MAX_NAMELEN+1]; /* (o) virtual machine name */
struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */
@@ -185,6 +191,7 @@ struct vm {
uint16_t cores; /* (o) num of cores/socket */
uint16_t threads; /* (o) num of threads/core */
uint16_t maxcpus; /* (o) max pluggable cpus */
struct sx mem_segs_lock; /* (o) */
};
#define VMM_CTR0(vcpu, format) \
@@ -518,6 +525,7 @@ vm_create(const char *name, struct vm **retvm)
strcpy(vm->name, name);
vm->vmspace = vmspace;
mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
sx_init(&vm->mem_segs_lock, "vm mem_segs");
vm->sockets = 1;
vm->cores = cores_per_package; /* XXX backwards compatibility */
@@ -609,6 +617,7 @@ vm_cleanup(struct vm *vm, bool destroy)
vmmops_vmspace_free(vm->vmspace);
vm->vmspace = NULL;
sx_destroy(&vm->mem_segs_lock);
mtx_destroy(&vm->rendezvous_mtx);
}
}
@@ -645,6 +654,24 @@ vm_name(struct vm *vm)
return (vm->name);
}
void
vm_slock_memsegs(struct vm *vm)
{
sx_slock(&vm->mem_segs_lock);
}
void
vm_xlock_memsegs(struct vm *vm)
{
sx_xlock(&vm->mem_segs_lock);
}
void
vm_unlock_memsegs(struct vm *vm)
{
sx_unlock(&vm->mem_segs_lock);
}
int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
@@ -702,6 +729,8 @@ vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
struct mem_seg *seg;
vm_object_t obj;
sx_assert(&vm->mem_segs_lock, SX_XLOCKED);
if (ident < 0 || ident >= VM_MAX_MEMSEGS)
return (EINVAL);
@@ -732,6 +761,8 @@ vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
{
struct mem_seg *seg;
sx_assert(&vm->mem_segs_lock, SX_LOCKED);
if (ident < 0 || ident >= VM_MAX_MEMSEGS)
return (EINVAL);
@@ -1075,19 +1106,7 @@ void *
vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
void **cookie)
{
#ifdef INVARIANTS
/*
* All vcpus are frozen by ioctls that modify the memory map
* (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is
* guaranteed if at least one vcpu is in the VCPU_FROZEN state.
*/
int state;
for (int i = 0; i < vm->maxcpus; i++) {
state = vcpu_get_state(vm_vcpu(vm, i), NULL);
KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
__func__, state));
}
#endif
sx_assert(&vm->mem_segs_lock, SX_LOCKED);
return (_vm_gpa_hold(vm, gpa, len, reqprot, cookie));
}
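
The sx_assert() above replaces the old per-vCPU freeze check under
INVARIANTS.  A minimal caller sketch (mirroring vmmdev_rw() in the next
file; the variable names are illustrative):

	void *cookie, *hpa;

	/* The shared lock keeps the map stable while the page is held. */
	vm_slock_memsegs(vm);
	hpa = vm_gpa_hold_global(vm, gpa, PAGE_SIZE, VM_PROT_READ, &cookie);
	if (hpa != NULL) {
		/* ... access the guest page through hpa ... */
		vm_gpa_release(cookie);
	}
	vm_unlock_memsegs(vm);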

sys/amd64/vmm/vmm_dev.c

@@ -221,8 +221,6 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
vm_paddr_t gpa, maxaddr;
void *hpa, *cookie;
struct vmmdev_softc *sc;
struct vcpu *vcpu;
uint16_t lastcpu;
error = vmm_priv_check(curthread->td_ucred);
if (error)
@@ -233,13 +231,9 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
return (ENXIO);
/*
* Get a read lock on the guest memory map by freezing any vcpu.
* Get a read lock on the guest memory map.
*/
lastcpu = vm_get_maxcpus(sc->vm) - 1;
error = vcpu_lock_one(sc, lastcpu);
if (error)
return (error);
vcpu = vm_vcpu(sc->vm, lastcpu);
vm_slock_memsegs(sc->vm);
prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
maxaddr = vmm_sysmem_maxaddr(sc->vm);
@@ -256,7 +250,7 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
* Since this device does not support lseek(2), dd(1) will
* read(2) blocks of data to simulate the lseek(2).
*/
hpa = vm_gpa_hold(vcpu, gpa, c, prot, &cookie);
hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie);
if (hpa == NULL) {
if (uio->uio_rw == UIO_READ && gpa < maxaddr)
error = uiomove(__DECONST(void *, zero_region),
@@ -268,7 +262,7 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
vm_gpa_release(cookie);
}
}
vcpu_unlock_one(sc, lastcpu);
vm_unlock_memsegs(sc->vm);
return (error);
}
@@ -420,6 +414,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct vm_readwrite_kernemu_device *kernemu;
uint64_t *regvals;
int *regnums;
bool memsegs_locked;
#ifdef BHYVE_SNAPSHOT
struct vm_snapshot_meta *snapshot_meta;
#endif
@@ -435,6 +430,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
vcpuid = -1;
vcpu = NULL;
state_changed = 0;
memsegs_locked = false;
/*
* For VMM ioctls that operate on a single vCPU, lookup the
@@ -476,10 +472,6 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
vcpu = vm_vcpu(sc->vm, vcpuid);
break;
case VM_MAP_PPTDEV_MMIO:
case VM_UNMAP_PPTDEV_MMIO:
case VM_BIND_PPTDEV:
case VM_UNBIND_PPTDEV:
#ifdef COMPAT_FREEBSD12
case VM_ALLOC_MEMSEG_FBSD12:
#endif
@@ -487,6 +479,21 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
case VM_MMAP_MEMSEG:
case VM_MUNMAP_MEMSEG:
case VM_REINIT:
/*
* ioctls that modify the memory map must lock memory
* segments exclusively.
*/
vm_xlock_memsegs(sc->vm);
memsegs_locked = true;
/* FALLTHROUGH */
case VM_MAP_PPTDEV_MMIO:
case VM_UNMAP_PPTDEV_MMIO:
case VM_BIND_PPTDEV:
case VM_UNBIND_PPTDEV:
#ifdef BHYVE_SNAPSHOT
case VM_SNAPSHOT_REQ:
case VM_RESTORE_TIME:
#endif
/*
* ioctls that operate on the entire virtual machine must
* prevent all vcpus from running.
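
The resulting lock order for map-modifying ioctls is the sx lock first,
then the vCPU locks, with release in the reverse order (see the cleanup
path below).  A sketch with error handling elided; vcpu_lock_all() is
the existing helper whose unlock counterpart appears later in this diff:

	vm_xlock_memsegs(sc->vm);	/* 1. exclusive memory-map lock  */
	error = vcpu_lock_all(sc);	/* 2. freeze every existing vCPU */
	/* ... handle VM_MMAP_MEMSEG, VM_MUNMAP_MEMSEG, VM_REINIT ... */
	vcpu_unlock_all(sc);		/* 3. thaw the vCPUs first ...   */
	vm_unlock_memsegs(sc->vm);	/* 4. ... then drop the sx lock  */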
@@ -503,14 +510,10 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
case VM_GET_MEMSEG:
case VM_MMAP_GETNEXT:
/*
* Lock a vcpu to make sure that the memory map cannot be
* modified while it is being inspected.
* Lock the memory map while it is being inspected.
*/
vcpuid = vm_get_maxcpus(sc->vm) - 1;
error = vcpu_lock_one(sc, vcpuid);
if (error)
goto done;
state_changed = 1;
vm_slock_memsegs(sc->vm);
memsegs_locked = true;
break;
#ifdef COMPAT_FREEBSD13
@@ -958,6 +961,8 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
vcpu_unlock_one(sc, vcpuid);
else if (state_changed == 2)
vcpu_unlock_all(sc);
if (memsegs_locked)
vm_unlock_memsegs(sc->vm);
done:
/*
@@ -978,7 +983,6 @@ vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
size_t len;
vm_ooffset_t segoff, first, last;
int error, found, segid;
uint16_t lastcpu;
bool sysmem;
error = vmm_priv_check(curthread->td_ucred);
@@ -997,12 +1001,9 @@ vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
}
/*
* Get a read lock on the guest memory map by freezing any vcpu.
* Get a read lock on the guest memory map.
*/
lastcpu = vm_get_maxcpus(sc->vm) - 1;
error = vcpu_lock_one(sc, lastcpu);
if (error)
return (error);
vm_slock_memsegs(sc->vm);
gpa = 0;
found = 0;
@@ -1029,7 +1030,7 @@ vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
error = EINVAL;
}
}
vcpu_unlock_one(sc, lastcpu);
vm_unlock_memsegs(sc->vm);
return (error);
}
@@ -1242,7 +1243,6 @@ devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
vm_ooffset_t first, last;
size_t seglen;
int error;
uint16_t lastcpu;
bool sysmem;
dsc = cdev->si_drv1;
@@ -1256,16 +1256,14 @@ devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
if ((nprot & PROT_EXEC) || first < 0 || first >= last)
return (EINVAL);
lastcpu = vm_get_maxcpus(dsc->sc->vm) - 1;
error = vcpu_lock_one(dsc->sc, lastcpu);
if (error)
return (error);
vm_slock_memsegs(dsc->sc->vm);
error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
KASSERT(error == 0 && !sysmem && *objp != NULL,
("%s: invalid devmem segment %d", __func__, dsc->segid));
vcpu_unlock_one(dsc->sc, lastcpu);
vm_unlock_memsegs(dsc->sc->vm);
if (seglen >= last) {
vm_object_reference(*objp);