2011-05-13 04:54:01 +00:00
|
|
|
/*-
|
2023-05-10 15:40:58 +00:00
|
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
2017-11-26 02:00:33 +00:00
|
|
|
*
|
2011-05-13 04:54:01 +00:00
|
|
|
* Copyright (c) 2011 NetApp, Inc.
|
|
|
|
* All rights reserved.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
|
2014-05-31 23:37:34 +00:00
|
|
|
#include <sys/param.h>
|
2022-10-24 21:31:11 +00:00
|
|
|
#include <sys/capsicum.h>
|
2011-05-13 04:54:01 +00:00
|
|
|
#include <sys/sysctl.h>
|
|
|
|
#include <sys/ioctl.h>
|
|
|
|
#include <sys/mman.h>
|
2024-04-03 16:52:00 +00:00
|
|
|
#include <sys/linker.h>
|
2019-12-17 01:33:26 +00:00
|
|
|
#include <sys/module.h>
|
2014-05-26 18:21:08 +00:00
|
|
|
#include <sys/_iovec.h>
|
2014-05-31 23:37:34 +00:00
|
|
|
#include <sys/cpuset.h>
|
2011-05-13 04:54:01 +00:00
|
|
|
|
2022-10-24 21:31:11 +00:00
|
|
|
#include <capsicum_helpers.h>
|
2015-05-06 16:25:20 +00:00
|
|
|
#include <errno.h>
|
Initial support for bhyve save and restore.
Save and restore (also known as suspend and resume) permits a snapshot
to be taken of a guest's state that can later be resumed. In the
current implementation, bhyve(8) creates a UNIX domain socket that is
used by bhyvectl(8) to send a request to save a snapshot (and
optionally exit after the snapshot has been taken). A snapshot
currently consists of two files: the first holds a copy of guest RAM,
and the second file holds other guest state such as vCPU register
values and device model state.
To resume a guest, bhyve(8) must be started with a matching pair of
command line arguments to instantiate the same set of device models as
well as a pointer to the saved snapshot.
While the current implementation is useful for several uses cases, it
has a few limitations. The file format for saving the guest state is
tied to the ABI of internal bhyve structures and is not
self-describing (in that it does not communicate the set of device
models present in the system). In addition, the state saved for some
device models closely matches the internal data structures which might
prove a challenge for compatibility of snapshot files across a range
of bhyve versions. The file format also does not currently support
versioning of individual chunks of state. As a result, the current
file format is not a fixed binary format and future revisions to save
and restore will break binary compatiblity of snapshot files. The
goal is to move to a more flexible format that adds versioning,
etc. and at that point to commit to providing a reasonable level of
compatibility. As a result, the current implementation is not enabled
by default. It can be enabled via the WITH_BHYVE_SNAPSHOT=yes option
for userland builds, and the kernel option BHYVE_SHAPSHOT.
Submitted by: Mihai Tiganus, Flavius Anton, Darius Mihai
Submitted by: Elena Mihailescu, Mihai Carabas, Sergiu Weisz
Relnotes: yes
Sponsored by: University Politehnica of Bucharest
Sponsored by: Matthew Grooms (student scholarships)
Sponsored by: iXsystems
Differential Revision: https://reviews.freebsd.org/D19495
2020-05-05 00:02:04 +00:00
|
|
|
#include <stdbool.h>
|
2011-05-13 04:54:01 +00:00
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <assert.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include <fcntl.h>
|
|
|
|
#include <unistd.h>
|
|
|
|
|
2013-10-09 03:56:07 +00:00
|
|
|
#include <libutil.h>
|
|
|
|
|
Initial support for bhyve save and restore.
Save and restore (also known as suspend and resume) permits a snapshot
to be taken of a guest's state that can later be resumed. In the
current implementation, bhyve(8) creates a UNIX domain socket that is
used by bhyvectl(8) to send a request to save a snapshot (and
optionally exit after the snapshot has been taken). A snapshot
currently consists of two files: the first holds a copy of guest RAM,
and the second file holds other guest state such as vCPU register
values and device model state.
To resume a guest, bhyve(8) must be started with a matching pair of
command line arguments to instantiate the same set of device models as
well as a pointer to the saved snapshot.
While the current implementation is useful for several uses cases, it
has a few limitations. The file format for saving the guest state is
tied to the ABI of internal bhyve structures and is not
self-describing (in that it does not communicate the set of device
models present in the system). In addition, the state saved for some
device models closely matches the internal data structures which might
prove a challenge for compatibility of snapshot files across a range
of bhyve versions. The file format also does not currently support
versioning of individual chunks of state. As a result, the current
file format is not a fixed binary format and future revisions to save
and restore will break binary compatiblity of snapshot files. The
goal is to move to a more flexible format that adds versioning,
etc. and at that point to commit to providing a reasonable level of
compatibility. As a result, the current implementation is not enabled
by default. It can be enabled via the WITH_BHYVE_SNAPSHOT=yes option
for userland builds, and the kernel option BHYVE_SHAPSHOT.
Submitted by: Mihai Tiganus, Flavius Anton, Darius Mihai
Submitted by: Elena Mihailescu, Mihai Carabas, Sergiu Weisz
Relnotes: yes
Sponsored by: University Politehnica of Bucharest
Sponsored by: Matthew Grooms (student scholarships)
Sponsored by: iXsystems
Differential Revision: https://reviews.freebsd.org/D19495
2020-05-05 00:02:04 +00:00
|
|
|
#include <vm/vm.h>
|
2011-05-13 04:54:01 +00:00
|
|
|
#include <machine/vmm.h>
|
|
|
|
#include <machine/vmm_dev.h>
|
2024-04-03 17:45:06 +00:00
|
|
|
#ifdef WITH_VMMAPI_SNAPSHOT
|
Initial support for bhyve save and restore.
Save and restore (also known as suspend and resume) permits a snapshot
to be taken of a guest's state that can later be resumed. In the
current implementation, bhyve(8) creates a UNIX domain socket that is
used by bhyvectl(8) to send a request to save a snapshot (and
optionally exit after the snapshot has been taken). A snapshot
currently consists of two files: the first holds a copy of guest RAM,
and the second file holds other guest state such as vCPU register
values and device model state.
To resume a guest, bhyve(8) must be started with a matching pair of
command line arguments to instantiate the same set of device models as
well as a pointer to the saved snapshot.
While the current implementation is useful for several uses cases, it
has a few limitations. The file format for saving the guest state is
tied to the ABI of internal bhyve structures and is not
self-describing (in that it does not communicate the set of device
models present in the system). In addition, the state saved for some
device models closely matches the internal data structures which might
prove a challenge for compatibility of snapshot files across a range
of bhyve versions. The file format also does not currently support
versioning of individual chunks of state. As a result, the current
file format is not a fixed binary format and future revisions to save
and restore will break binary compatiblity of snapshot files. The
goal is to move to a more flexible format that adds versioning,
etc. and at that point to commit to providing a reasonable level of
compatibility. As a result, the current implementation is not enabled
by default. It can be enabled via the WITH_BHYVE_SNAPSHOT=yes option
for userland builds, and the kernel option BHYVE_SHAPSHOT.
Submitted by: Mihai Tiganus, Flavius Anton, Darius Mihai
Submitted by: Elena Mihailescu, Mihai Carabas, Sergiu Weisz
Relnotes: yes
Sponsored by: University Politehnica of Bucharest
Sponsored by: Matthew Grooms (student scholarships)
Sponsored by: iXsystems
Differential Revision: https://reviews.freebsd.org/D19495
2020-05-05 00:02:04 +00:00
|
|
|
#include <machine/vmm_snapshot.h>
|
2024-04-03 17:45:06 +00:00
|
|
|
#endif
|
2011-05-13 04:54:01 +00:00
|
|
|
|
|
|
|
#include "vmmapi.h"
|
2023-03-24 18:49:06 +00:00
|
|
|
#include "internal.h"
|
2011-05-13 04:54:01 +00:00
|
|
|
|
2013-10-09 03:56:07 +00:00
|
|
|
#define MB (1024 * 1024UL)
|
2013-03-18 22:38:30 +00:00
|
|
|
#define GB (1024 * 1024 * 1024UL)
|
|
|
|
|
2024-04-03 17:07:51 +00:00
|
|
|
#ifdef __amd64__
|
2024-04-03 17:01:31 +00:00
|
|
|
#define VM_LOWMEM_LIMIT (3 * GB)
|
2024-04-03 17:07:51 +00:00
|
|
|
#else
|
|
|
|
#define VM_LOWMEM_LIMIT 0
|
|
|
|
#endif
|
2024-04-03 17:01:31 +00:00
|
|
|
#define VM_HIGHMEM_BASE (4 * GB)
|
|
|
|
|
2015-06-18 06:00:17 +00:00
|
|
|
/*
|
|
|
|
* Size of the guard region before and after the virtual address space
|
|
|
|
* mapping the guest physical memory. This must be a multiple of the
|
|
|
|
* superpage size for performance reasons.
|
|
|
|
*/
|
|
|
|
#define VM_MMAP_GUARD_SIZE (4 * MB)
|
|
|
|
|
|
|
|
#define PROT_RW (PROT_READ | PROT_WRITE)
|
|
|
|
#define PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC)
|
|
|
|
|
2011-05-13 04:54:01 +00:00
|
|
|
#define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x)))
|
|
|
|
#define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x)))
|
|
|
|
|
|
|
|
/*
 * Open the /dev/vmm/<name> device node for an existing virtual machine.
 *
 * Returns the open file descriptor on success, or -1 with errno set on
 * failure (ENOMEM if the path buffer cannot be allocated, otherwise the
 * errno from open(2)).
 */
static int
vm_device_open(const char *name)
{
	int fd, len;
	char *vmfile;

	len = strlen("/dev/vmm/") + strlen(name) + 1;
	vmfile = malloc(len);
	if (vmfile == NULL) {
		/*
		 * Report the failure to the caller instead of aborting the
		 * whole process via assert(); malloc has set errno = ENOMEM.
		 */
		return (-1);
	}
	snprintf(vmfile, len, "/dev/vmm/%s", name);

	/* Open the device file */
	fd = open(vmfile, O_RDWR, 0);

	free(vmfile);
	return (fd);
}
|
|
|
|
|
|
|
|
/*
 * Create a new virtual machine named 'name' via the hw.vmm.create sysctl.
 *
 * Returns 0 on success, -1 with errno set on failure (from sysctlbyname(3)).
 */
int
vm_create(const char *name)
{
	/* Try to load vmm(4) module before creating a guest. */
	if (modfind("vmm") < 0)
		kldload("vmm");	/* best effort; CREATE below reports failure */
	return (CREATE(name));
}
|
|
|
|
|
|
|
|
struct vmctx *
|
|
|
|
vm_open(const char *name)
|
|
|
|
{
|
|
|
|
struct vmctx *vm;
|
2021-03-11 19:27:43 +00:00
|
|
|
int saved_errno;
|
2011-05-13 04:54:01 +00:00
|
|
|
|
|
|
|
vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
|
|
|
|
assert(vm != NULL);
|
|
|
|
|
|
|
|
vm->fd = -1;
|
2014-05-13 16:40:27 +00:00
|
|
|
vm->memflags = 0;
|
2011-05-13 04:54:01 +00:00
|
|
|
vm->name = (char *)(vm + 1);
|
|
|
|
strcpy(vm->name, name);
|
2024-04-03 17:01:31 +00:00
|
|
|
memset(vm->memsegs, 0, sizeof(vm->memsegs));
|
2011-05-13 04:54:01 +00:00
|
|
|
|
|
|
|
if ((vm->fd = vm_device_open(vm->name)) < 0)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
return (vm);
|
|
|
|
err:
|
2021-03-11 19:27:43 +00:00
|
|
|
saved_errno = errno;
|
2021-03-07 06:19:30 +00:00
|
|
|
free(vm);
|
2021-03-11 19:27:43 +00:00
|
|
|
errno = saved_errno;
|
2011-05-13 04:54:01 +00:00
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
2022-06-30 21:21:57 +00:00
|
|
|
/*
 * Release a vmctx handle: close the device descriptor and free the
 * context.  The VM itself keeps existing; use vm_destroy() to remove it.
 */
void
vm_close(struct vmctx *vm)
{
	assert(vm != NULL);

	close(vm->fd);
	free(vm);
}
|
|
|
|
|
2011-05-13 04:54:01 +00:00
|
|
|
/*
 * Destroy the virtual machine referenced by 'vm' via the hw.vmm.destroy
 * sysctl and release the handle.
 */
void
vm_destroy(struct vmctx *vm)
{
	assert(vm != NULL);

	if (vm->fd >= 0)
		close(vm->fd);
	DESTROY(vm->name);

	free(vm);
}
|
|
|
|
|
2023-03-24 18:49:06 +00:00
|
|
|
struct vcpu *
|
|
|
|
vm_vcpu_open(struct vmctx *ctx, int vcpuid)
|
|
|
|
{
|
|
|
|
struct vcpu *vcpu;
|
|
|
|
|
|
|
|
vcpu = malloc(sizeof(*vcpu));
|
|
|
|
vcpu->ctx = ctx;
|
|
|
|
vcpu->vcpuid = vcpuid;
|
|
|
|
return (vcpu);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Release a vCPU handle obtained from vm_vcpu_open().
 */
void
vm_vcpu_close(struct vcpu *vcpu)
{
	free(vcpu);
}
|
|
|
|
|
|
|
|
/*
 * Return the numeric vCPU id stored in the handle.
 */
int
vcpu_id(struct vcpu *vcpu)
{
	return (vcpu->vcpuid);
}
|
|
|
|
|
2013-10-09 03:56:07 +00:00
|
|
|
int
|
2021-07-26 20:40:16 +00:00
|
|
|
vm_parse_memsize(const char *opt, size_t *ret_memsize)
|
2013-10-09 03:56:07 +00:00
|
|
|
{
|
|
|
|
char *endptr;
|
|
|
|
size_t optval;
|
|
|
|
int error;
|
|
|
|
|
2021-07-26 20:40:16 +00:00
|
|
|
optval = strtoul(opt, &endptr, 0);
|
|
|
|
if (*opt != '\0' && *endptr == '\0') {
|
2013-10-09 03:56:07 +00:00
|
|
|
/*
|
|
|
|
* For the sake of backward compatibility if the memory size
|
|
|
|
* specified on the command line is less than a megabyte then
|
|
|
|
* it is interpreted as being in units of MB.
|
|
|
|
*/
|
|
|
|
if (optval < MB)
|
|
|
|
optval *= MB;
|
|
|
|
*ret_memsize = optval;
|
|
|
|
error = 0;
|
|
|
|
} else
|
2021-07-26 20:40:16 +00:00
|
|
|
error = expand_number(opt, ret_memsize);
|
2013-10-09 03:56:07 +00:00
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2013-03-18 22:38:30 +00:00
|
|
|
/*
 * Return the fixed upper bound of the guest "lowmem" region
 * (VM_LOWMEM_LIMIT: 3 GB on amd64, 0 elsewhere).  The context argument
 * is unused; the limit is a compile-time constant.
 */
uint32_t
vm_get_lowmem_limit(struct vmctx *ctx __unused)
{

	return (VM_LOWMEM_LIMIT);
}
|
|
|
|
|
2014-05-13 16:40:27 +00:00
|
|
|
/*
 * Set the memory-mapping flags (e.g. VM_MEM_F_WIRED, VM_MEM_F_INCORE)
 * used by subsequent guest memory setup calls on this context.
 */
void
vm_set_memflags(struct vmctx *ctx, int flags)
{

	ctx->memflags = flags;
}
|
|
|
|
|
2015-06-18 06:00:17 +00:00
|
|
|
/*
 * Return the memory-mapping flags previously set with vm_set_memflags().
 */
int
vm_get_memflags(struct vmctx *ctx)
{

	return (ctx->memflags);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len).
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
|
|
|
|
size_t len, int prot)
|
|
|
|
{
|
|
|
|
struct vm_memmap memmap;
|
|
|
|
int error, flags;
|
|
|
|
|
|
|
|
memmap.gpa = gpa;
|
|
|
|
memmap.segid = segid;
|
|
|
|
memmap.segoff = off;
|
|
|
|
memmap.len = len;
|
|
|
|
memmap.prot = prot;
|
|
|
|
memmap.flags = 0;
|
|
|
|
|
|
|
|
if (ctx->memflags & VM_MEM_F_WIRED)
|
|
|
|
memmap.flags |= VM_MEMMAP_F_WIRED;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If this mapping already exists then don't create it again. This
|
|
|
|
* is the common case for SYSMEM mappings created by bhyveload(8).
|
|
|
|
*/
|
|
|
|
error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
|
|
|
|
if (error == 0 && gpa == memmap.gpa) {
|
|
|
|
if (segid != memmap.segid || off != memmap.segoff ||
|
|
|
|
prot != memmap.prot || flags != memmap.flags) {
|
|
|
|
errno = EEXIST;
|
|
|
|
return (-1);
|
|
|
|
} else {
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
Initial support for bhyve save and restore.
Save and restore (also known as suspend and resume) permits a snapshot
to be taken of a guest's state that can later be resumed. In the
current implementation, bhyve(8) creates a UNIX domain socket that is
used by bhyvectl(8) to send a request to save a snapshot (and
optionally exit after the snapshot has been taken). A snapshot
currently consists of two files: the first holds a copy of guest RAM,
and the second file holds other guest state such as vCPU register
values and device model state.
To resume a guest, bhyve(8) must be started with a matching pair of
command line arguments to instantiate the same set of device models as
well as a pointer to the saved snapshot.
While the current implementation is useful for several uses cases, it
has a few limitations. The file format for saving the guest state is
tied to the ABI of internal bhyve structures and is not
self-describing (in that it does not communicate the set of device
models present in the system). In addition, the state saved for some
device models closely matches the internal data structures which might
prove a challenge for compatibility of snapshot files across a range
of bhyve versions. The file format also does not currently support
versioning of individual chunks of state. As a result, the current
file format is not a fixed binary format and future revisions to save
and restore will break binary compatiblity of snapshot files. The
goal is to move to a more flexible format that adds versioning,
etc. and at that point to commit to providing a reasonable level of
compatibility. As a result, the current implementation is not enabled
by default. It can be enabled via the WITH_BHYVE_SNAPSHOT=yes option
for userland builds, and the kernel option BHYVE_SHAPSHOT.
Submitted by: Mihai Tiganus, Flavius Anton, Darius Mihai
Submitted by: Elena Mihailescu, Mihai Carabas, Sergiu Weisz
Relnotes: yes
Sponsored by: University Politehnica of Bucharest
Sponsored by: Matthew Grooms (student scholarships)
Sponsored by: iXsystems
Differential Revision: https://reviews.freebsd.org/D19495
2020-05-05 00:02:04 +00:00
|
|
|
/*
 * Report the host base address and the sizes of the low and high guest
 * memory segments recorded in the context.  Always succeeds.
 */
int
vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
    size_t *lowmem_size, size_t *highmem_size)
{

	*guest_baseaddr = ctx->baseaddr;
	*lowmem_size = ctx->memsegs[VM_MEMSEG_LOW].size;
	*highmem_size = ctx->memsegs[VM_MEMSEG_HIGH].size;
	return (0);
}
|
|
|
|
|
2021-03-18 16:08:52 +00:00
|
|
|
int
|
|
|
|
vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
|
|
|
|
{
|
|
|
|
struct vm_munmap munmap;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
munmap.gpa = gpa;
|
|
|
|
munmap.len = len;
|
|
|
|
|
|
|
|
error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2015-06-18 06:00:17 +00:00
|
|
|
int
|
|
|
|
vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
|
|
|
|
vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
|
|
|
|
{
|
|
|
|
struct vm_memmap memmap;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
bzero(&memmap, sizeof(struct vm_memmap));
|
|
|
|
memmap.gpa = *gpa;
|
|
|
|
error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
|
|
|
|
if (error == 0) {
|
|
|
|
*gpa = memmap.gpa;
|
|
|
|
*segid = memmap.segid;
|
|
|
|
*segoff = memmap.segoff;
|
|
|
|
*len = memmap.len;
|
|
|
|
*prot = memmap.prot;
|
|
|
|
*flags = memmap.flags;
|
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Return 0 if the segments are identical and non-zero otherwise.
 *
 * This is slightly complicated by the fact that only device memory
 * segments are named: a NULL name on both sides counts as a match.
 */
static int
cmpseg(size_t len, const char *str, size_t len2, const char *str2)
{
	if (len != len2)
		return (-1);
	if (str == NULL && str2 == NULL)
		return (0);
	if (str != NULL && str2 != NULL && strcmp(str, str2) == 0)
		return (0);
	return (-1);
}
|
|
|
|
|
|
|
|
/*
 * Allocate memory segment 'segid' of 'len' bytes, optionally named 'name'
 * (only device memory segments are named).
 *
 * Returns 0 on success or if an identical segment already exists, and
 * -1 with errno set otherwise (EINVAL for a conflicting existing segment,
 * ENAMETOOLONG for an oversized name, or the VM_ALLOC_MEMSEG ioctl error).
 */
static int
vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	/*
	 * If the memory segment has already been created then just return.
	 * This is the usual case for the SYSMEM segment created by userspace
	 * loaders like bhyveload(8).
	 */
	error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
	    sizeof(memseg.name));
	if (error)
		return (error);

	if (memseg.len != 0) {
		/* Existing segment must match both length and name. */
		if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
			errno = EINVAL;
			return (-1);
		} else {
			return (0);
		}
	}

	bzero(&memseg, sizeof(struct vm_memseg));
	memseg.segid = segid;
	memseg.len = len;
	if (name != NULL) {
		/* strlcpy reports the untruncated length; >= size means truncation. */
		n = strlcpy(memseg.name, name, sizeof(memseg.name));
		if (n >= sizeof(memseg.name)) {
			errno = ENAMETOOLONG;
			return (-1);
		}
	}

	error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
	return (error);
}
|
|
|
|
|
2013-03-18 22:38:30 +00:00
|
|
|
int
|
2015-06-18 06:00:17 +00:00
|
|
|
vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
|
|
|
|
size_t bufsize)
|
2013-03-18 22:38:30 +00:00
|
|
|
{
|
2015-06-18 06:00:17 +00:00
|
|
|
struct vm_memseg memseg;
|
|
|
|
size_t n;
|
2013-03-18 22:38:30 +00:00
|
|
|
int error;
|
|
|
|
|
2024-04-03 17:09:43 +00:00
|
|
|
bzero(&memseg, sizeof(memseg));
|
2015-06-18 06:00:17 +00:00
|
|
|
memseg.segid = segid;
|
|
|
|
error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
|
|
|
|
if (error == 0) {
|
|
|
|
*lenp = memseg.len;
|
|
|
|
n = strlcpy(namebuf, memseg.name, bufsize);
|
|
|
|
if (n >= bufsize) {
|
|
|
|
errno = ENAMETOOLONG;
|
|
|
|
error = -1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base)
|
|
|
|
{
|
|
|
|
char *ptr;
|
|
|
|
int error, flags;
|
|
|
|
|
|
|
|
/* Map 'len' bytes starting at 'gpa' in the guest address space */
|
|
|
|
error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL);
|
|
|
|
if (error)
|
|
|
|
return (error);
|
|
|
|
|
|
|
|
flags = MAP_SHARED | MAP_FIXED;
|
|
|
|
if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
|
|
|
|
flags |= MAP_NOCORE;
|
|
|
|
|
|
|
|
/* mmap into the process address space on the host */
|
|
|
|
ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa);
|
|
|
|
if (ptr == MAP_FAILED)
|
|
|
|
return (-1);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Allocate and map the guest's system memory.
 *
 * 'memsize' bytes are split into a "lowmem" segment starting at guest
 * physical address 0 (up to VM_LOWMEM_LIMIT) and, if needed, a "highmem"
 * segment starting at VM_HIGHMEM_BASE.  Only the VM_MMAP_ALL style is
 * supported.  Returns 0 on success, non-zero with errno set on failure.
 */
int
vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
{
	size_t objsize, len;
	vm_paddr_t gpa;
	char *baseaddr, *ptr;
	int error;

	assert(vms == VM_MMAP_ALL);

	/*
	 * If 'memsize' cannot fit entirely in the 'lowmem' segment then create
	 * another 'highmem' segment above VM_HIGHMEM_BASE for the remainder.
	 */
	if (memsize > VM_LOWMEM_LIMIT) {
		ctx->memsegs[VM_MEMSEG_LOW].size = VM_LOWMEM_LIMIT;
		ctx->memsegs[VM_MEMSEG_HIGH].size = memsize - VM_LOWMEM_LIMIT;
		/* The object spans the PCI hole between the two segments. */
		objsize = VM_HIGHMEM_BASE + ctx->memsegs[VM_MEMSEG_HIGH].size;
	} else {
		ctx->memsegs[VM_MEMSEG_LOW].size = memsize;
		ctx->memsegs[VM_MEMSEG_HIGH].size = 0;
		objsize = memsize;
	}

	error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL);
	if (error)
		return (error);

	/*
	 * Stake out a contiguous region covering the guest physical memory
	 * and the adjoining guard regions.
	 */
	len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE;
	ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0);
	if (ptr == MAP_FAILED)
		return (-1);

	/* Guest physical address 0 maps to 'baseaddr' in this process. */
	baseaddr = ptr + VM_MMAP_GUARD_SIZE;
	if (ctx->memsegs[VM_MEMSEG_HIGH].size > 0) {
		gpa = VM_HIGHMEM_BASE;
		len = ctx->memsegs[VM_MEMSEG_HIGH].size;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}

	if (ctx->memsegs[VM_MEMSEG_LOW].size > 0) {
		gpa = 0;
		len = ctx->memsegs[VM_MEMSEG_LOW].size;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}

	ctx->baseaddr = baseaddr;

	return (0);
}
|
|
|
|
|
2015-06-22 00:30:34 +00:00
|
|
|
/*
 * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in
 * the lowmem or highmem regions.
 *
 * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region.
 * The instruction emulation code depends on this behavior.
 */
void *
vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
{
	vm_size_t lowsize, highsize;

	lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
	if (lowsize > 0) {
		/*
		 * The standalone 'len <= lowsize' check keeps 'gaddr + len'
		 * from wrapping past the end of the address space before
		 * the containment test.
		 */
		if (gaddr < lowsize && len <= lowsize && gaddr + len <= lowsize)
			return (ctx->baseaddr + gaddr);
	}

	highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
	if (highsize > 0 && gaddr >= VM_HIGHMEM_BASE) {
		/* Same overflow guard ('len <= highsize') as for lowmem. */
		if (gaddr < VM_HIGHMEM_BASE + highsize && len <= highsize &&
		    gaddr + len <= VM_HIGHMEM_BASE + highsize)
			return (ctx->baseaddr + gaddr);
	}

	/* Range is (partly) outside RAM, e.g. in the MMIO hole. */
	return (NULL);
}
|
|
|
|
|
Initial support for bhyve save and restore.
Save and restore (also known as suspend and resume) permits a snapshot
to be taken of a guest's state that can later be resumed. In the
current implementation, bhyve(8) creates a UNIX domain socket that is
used by bhyvectl(8) to send a request to save a snapshot (and
optionally exit after the snapshot has been taken). A snapshot
currently consists of two files: the first holds a copy of guest RAM,
and the second file holds other guest state such as vCPU register
values and device model state.
To resume a guest, bhyve(8) must be started with a matching pair of
command line arguments to instantiate the same set of device models as
well as a pointer to the saved snapshot.
While the current implementation is useful for several uses cases, it
has a few limitations. The file format for saving the guest state is
tied to the ABI of internal bhyve structures and is not
self-describing (in that it does not communicate the set of device
models present in the system). In addition, the state saved for some
device models closely matches the internal data structures which might
prove a challenge for compatibility of snapshot files across a range
of bhyve versions. The file format also does not currently support
versioning of individual chunks of state. As a result, the current
file format is not a fixed binary format and future revisions to save
and restore will break binary compatiblity of snapshot files. The
goal is to move to a more flexible format that adds versioning,
etc. and at that point to commit to providing a reasonable level of
compatibility. As a result, the current implementation is not enabled
by default. It can be enabled via the WITH_BHYVE_SNAPSHOT=yes option
for userland builds, and the kernel option BHYVE_SHAPSHOT.
Submitted by: Mihai Tiganus, Flavius Anton, Darius Mihai
Submitted by: Elena Mihailescu, Mihai Carabas, Sergiu Weisz
Relnotes: yes
Sponsored by: University Politehnica of Bucharest
Sponsored by: Matthew Grooms (student scholarships)
Sponsored by: iXsystems
Differential Revision: https://reviews.freebsd.org/D19495
2020-05-05 00:02:04 +00:00
|
|
|
vm_paddr_t
|
|
|
|
vm_rev_map_gpa(struct vmctx *ctx, void *addr)
|
|
|
|
{
|
|
|
|
vm_paddr_t offaddr;
|
2024-04-03 17:01:31 +00:00
|
|
|
vm_size_t lowsize, highsize;
|
Initial support for bhyve save and restore.
Save and restore (also known as suspend and resume) permits a snapshot
to be taken of a guest's state that can later be resumed. In the
current implementation, bhyve(8) creates a UNIX domain socket that is
used by bhyvectl(8) to send a request to save a snapshot (and
optionally exit after the snapshot has been taken). A snapshot
currently consists of two files: the first holds a copy of guest RAM,
and the second file holds other guest state such as vCPU register
values and device model state.
To resume a guest, bhyve(8) must be started with a matching pair of
command line arguments to instantiate the same set of device models as
well as a pointer to the saved snapshot.
While the current implementation is useful for several uses cases, it
has a few limitations. The file format for saving the guest state is
tied to the ABI of internal bhyve structures and is not
self-describing (in that it does not communicate the set of device
models present in the system). In addition, the state saved for some
device models closely matches the internal data structures which might
prove a challenge for compatibility of snapshot files across a range
of bhyve versions. The file format also does not currently support
versioning of individual chunks of state. As a result, the current
file format is not a fixed binary format and future revisions to save
and restore will break binary compatiblity of snapshot files. The
goal is to move to a more flexible format that adds versioning,
etc. and at that point to commit to providing a reasonable level of
compatibility. As a result, the current implementation is not enabled
by default. It can be enabled via the WITH_BHYVE_SNAPSHOT=yes option
for userland builds, and the kernel option BHYVE_SHAPSHOT.
Submitted by: Mihai Tiganus, Flavius Anton, Darius Mihai
Submitted by: Elena Mihailescu, Mihai Carabas, Sergiu Weisz
Relnotes: yes
Sponsored by: University Politehnica of Bucharest
Sponsored by: Matthew Grooms (student scholarships)
Sponsored by: iXsystems
Differential Revision: https://reviews.freebsd.org/D19495
2020-05-05 00:02:04 +00:00
|
|
|
|
|
|
|
offaddr = (char *)addr - ctx->baseaddr;
|
|
|
|
|
2024-04-03 17:01:31 +00:00
|
|
|
lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
|
|
|
|
if (lowsize > 0)
|
|
|
|
if (offaddr <= lowsize)
|
Initial support for bhyve save and restore.
Save and restore (also known as suspend and resume) permits a snapshot
to be taken of a guest's state that can later be resumed. In the
current implementation, bhyve(8) creates a UNIX domain socket that is
used by bhyvectl(8) to send a request to save a snapshot (and
optionally exit after the snapshot has been taken). A snapshot
currently consists of two files: the first holds a copy of guest RAM,
and the second file holds other guest state such as vCPU register
values and device model state.
To resume a guest, bhyve(8) must be started with a matching pair of
command line arguments to instantiate the same set of device models as
well as a pointer to the saved snapshot.
While the current implementation is useful for several uses cases, it
has a few limitations. The file format for saving the guest state is
tied to the ABI of internal bhyve structures and is not
self-describing (in that it does not communicate the set of device
models present in the system). In addition, the state saved for some
device models closely matches the internal data structures which might
prove a challenge for compatibility of snapshot files across a range
of bhyve versions. The file format also does not currently support
versioning of individual chunks of state. As a result, the current
file format is not a fixed binary format and future revisions to save
and restore will break binary compatiblity of snapshot files. The
goal is to move to a more flexible format that adds versioning,
etc. and at that point to commit to providing a reasonable level of
compatibility. As a result, the current implementation is not enabled
by default. It can be enabled via the WITH_BHYVE_SNAPSHOT=yes option
for userland builds, and the kernel option BHYVE_SHAPSHOT.
Submitted by: Mihai Tiganus, Flavius Anton, Darius Mihai
Submitted by: Elena Mihailescu, Mihai Carabas, Sergiu Weisz
Relnotes: yes
Sponsored by: University Politehnica of Bucharest
Sponsored by: Matthew Grooms (student scholarships)
Sponsored by: iXsystems
Differential Revision: https://reviews.freebsd.org/D19495
2020-05-05 00:02:04 +00:00
|
|
|
return (offaddr);
|
|
|
|
|
2024-04-03 17:01:31 +00:00
|
|
|
highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
|
|
|
|
if (highsize > 0)
|
|
|
|
if (offaddr >= VM_HIGHMEM_BASE &&
|
|
|
|
offaddr < VM_HIGHMEM_BASE + highsize)
|
Initial support for bhyve save and restore.
Save and restore (also known as suspend and resume) permits a snapshot
to be taken of a guest's state that can later be resumed. In the
current implementation, bhyve(8) creates a UNIX domain socket that is
used by bhyvectl(8) to send a request to save a snapshot (and
optionally exit after the snapshot has been taken). A snapshot
currently consists of two files: the first holds a copy of guest RAM,
and the second file holds other guest state such as vCPU register
values and device model state.
To resume a guest, bhyve(8) must be started with a matching pair of
command line arguments to instantiate the same set of device models as
well as a pointer to the saved snapshot.
While the current implementation is useful for several uses cases, it
has a few limitations. The file format for saving the guest state is
tied to the ABI of internal bhyve structures and is not
self-describing (in that it does not communicate the set of device
models present in the system). In addition, the state saved for some
device models closely matches the internal data structures which might
prove a challenge for compatibility of snapshot files across a range
of bhyve versions. The file format also does not currently support
versioning of individual chunks of state. As a result, the current
file format is not a fixed binary format and future revisions to save
and restore will break binary compatiblity of snapshot files. The
goal is to move to a more flexible format that adds versioning,
etc. and at that point to commit to providing a reasonable level of
compatibility. As a result, the current implementation is not enabled
by default. It can be enabled via the WITH_BHYVE_SNAPSHOT=yes option
for userland builds, and the kernel option BHYVE_SHAPSHOT.
Submitted by: Mihai Tiganus, Flavius Anton, Darius Mihai
Submitted by: Elena Mihailescu, Mihai Carabas, Sergiu Weisz
Relnotes: yes
Sponsored by: University Politehnica of Bucharest
Sponsored by: Matthew Grooms (student scholarships)
Sponsored by: iXsystems
Differential Revision: https://reviews.freebsd.org/D19495
2020-05-05 00:02:04 +00:00
|
|
|
return (offaddr);
|
|
|
|
|
|
|
|
return ((vm_paddr_t)-1);
|
|
|
|
}
|
|
|
|
|
2022-03-18 05:26:54 +00:00
|
|
|
const char *
|
|
|
|
vm_get_name(struct vmctx *ctx)
|
Initial support for bhyve save and restore.
Save and restore (also known as suspend and resume) permits a snapshot
to be taken of a guest's state that can later be resumed. In the
current implementation, bhyve(8) creates a UNIX domain socket that is
used by bhyvectl(8) to send a request to save a snapshot (and
optionally exit after the snapshot has been taken). A snapshot
currently consists of two files: the first holds a copy of guest RAM,
and the second file holds other guest state such as vCPU register
values and device model state.
To resume a guest, bhyve(8) must be started with a matching pair of
command line arguments to instantiate the same set of device models as
well as a pointer to the saved snapshot.
While the current implementation is useful for several uses cases, it
has a few limitations. The file format for saving the guest state is
tied to the ABI of internal bhyve structures and is not
self-describing (in that it does not communicate the set of device
models present in the system). In addition, the state saved for some
device models closely matches the internal data structures which might
prove a challenge for compatibility of snapshot files across a range
of bhyve versions. The file format also does not currently support
versioning of individual chunks of state. As a result, the current
file format is not a fixed binary format and future revisions to save
and restore will break binary compatiblity of snapshot files. The
goal is to move to a more flexible format that adds versioning,
etc. and at that point to commit to providing a reasonable level of
compatibility. As a result, the current implementation is not enabled
by default. It can be enabled via the WITH_BHYVE_SNAPSHOT=yes option
for userland builds, and the kernel option BHYVE_SHAPSHOT.
Submitted by: Mihai Tiganus, Flavius Anton, Darius Mihai
Submitted by: Elena Mihailescu, Mihai Carabas, Sergiu Weisz
Relnotes: yes
Sponsored by: University Politehnica of Bucharest
Sponsored by: Matthew Grooms (student scholarships)
Sponsored by: iXsystems
Differential Revision: https://reviews.freebsd.org/D19495
2020-05-05 00:02:04 +00:00
|
|
|
{
|
|
|
|
|
2022-03-18 05:26:54 +00:00
|
|
|
return (ctx->name);
|
Initial support for bhyve save and restore.
Save and restore (also known as suspend and resume) permits a snapshot
to be taken of a guest's state that can later be resumed. In the
current implementation, bhyve(8) creates a UNIX domain socket that is
used by bhyvectl(8) to send a request to save a snapshot (and
optionally exit after the snapshot has been taken). A snapshot
currently consists of two files: the first holds a copy of guest RAM,
and the second file holds other guest state such as vCPU register
values and device model state.
To resume a guest, bhyve(8) must be started with a matching pair of
command line arguments to instantiate the same set of device models as
well as a pointer to the saved snapshot.
While the current implementation is useful for several uses cases, it
has a few limitations. The file format for saving the guest state is
tied to the ABI of internal bhyve structures and is not
self-describing (in that it does not communicate the set of device
models present in the system). In addition, the state saved for some
device models closely matches the internal data structures which might
prove a challenge for compatibility of snapshot files across a range
of bhyve versions. The file format also does not currently support
versioning of individual chunks of state. As a result, the current
file format is not a fixed binary format and future revisions to save
and restore will break binary compatiblity of snapshot files. The
goal is to move to a more flexible format that adds versioning,
etc. and at that point to commit to providing a reasonable level of
compatibility. As a result, the current implementation is not enabled
by default. It can be enabled via the WITH_BHYVE_SNAPSHOT=yes option
for userland builds, and the kernel option BHYVE_SHAPSHOT.
Submitted by: Mihai Tiganus, Flavius Anton, Darius Mihai
Submitted by: Elena Mihailescu, Mihai Carabas, Sergiu Weisz
Relnotes: yes
Sponsored by: University Politehnica of Bucharest
Sponsored by: Matthew Grooms (student scholarships)
Sponsored by: iXsystems
Differential Revision: https://reviews.freebsd.org/D19495
2020-05-05 00:02:04 +00:00
|
|
|
}
|
|
|
|
|
2014-06-24 02:02:51 +00:00
|
|
|
size_t
|
|
|
|
vm_get_lowmem_size(struct vmctx *ctx)
|
|
|
|
{
|
|
|
|
|
2024-04-03 17:01:31 +00:00
|
|
|
return (ctx->memsegs[VM_MEMSEG_LOW].size);
|
|
|
|
}
|
|
|
|
|
|
|
|
vm_paddr_t
|
|
|
|
vm_get_highmem_base(struct vmctx *ctx __unused)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (VM_HIGHMEM_BASE);
|
2014-06-24 02:02:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
size_t
|
|
|
|
vm_get_highmem_size(struct vmctx *ctx)
|
|
|
|
{
|
|
|
|
|
2024-04-03 17:01:31 +00:00
|
|
|
return (ctx->memsegs[VM_MEMSEG_HIGH].size);
|
2014-06-24 02:02:51 +00:00
|
|
|
}
|
|
|
|
|
2015-06-18 06:00:17 +00:00
|
|
|
/*
 * Create a device memory segment named 'name' of 'len' bytes for segment
 * id 'segid' and map it into the host address space, bracketed by guard
 * regions.
 *
 * Returns a host pointer to the start of the device memory on success,
 * or MAP_FAILED on failure with errno set.
 */
void *
vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len)
{
	char pathname[MAXPATHLEN];
	size_t len2;
	char *base, *ptr;
	int fd, error, flags;

	fd = -1;
	ptr = MAP_FAILED;
	if (name == NULL || strlen(name) == 0) {
		errno = EINVAL;
		goto done;
	}

	/* Ask the kernel to allocate the memory segment. */
	error = vm_alloc_memseg(ctx, segid, len, name);
	if (error)
		goto done;

	/* Devmem segments are exposed as /dev/vmm.io/<vmname>.<segname>. */
	strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname));
	strlcat(pathname, ctx->name, sizeof(pathname));
	strlcat(pathname, ".", sizeof(pathname));
	strlcat(pathname, name, sizeof(pathname));

	fd = open(pathname, O_RDWR);
	if (fd < 0)
		goto done;

	/*
	 * Stake out a contiguous region covering the device memory and the
	 * adjoining guard regions.
	 */
	len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE;
	base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1,
	    0);
	if (base == MAP_FAILED)
		goto done;

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap the devmem region in the host address space */
	ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0);
done:
	if (fd >= 0)
		close(fd);
	return (ptr);
}
|
|
|
|
|
2024-04-03 16:52:25 +00:00
|
|
|
/*
 * Issue a vcpu-scoped ioctl on the VM device descriptor, stamping the
 * target vcpuid into the request before dispatch.
 */
int
vcpu_ioctl(struct vcpu *vcpu, u_long cmd, void *arg)
{
	/*
	 * XXX: fragile, handle with care
	 * Assumes that the first field of the ioctl data
	 * is the vcpuid.  That field is overwritten here regardless of
	 * what the caller put in it.
	 */
	*(int *)arg = vcpu->vcpuid;
	return (ioctl(vcpu->ctx->fd, cmd, arg));
}
|
|
|
|
|
2011-05-13 04:54:01 +00:00
|
|
|
int
|
2023-03-24 18:49:06 +00:00
|
|
|
vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
|
2011-05-13 04:54:01 +00:00
|
|
|
{
|
|
|
|
int error;
|
|
|
|
struct vm_register vmreg;
|
|
|
|
|
|
|
|
bzero(&vmreg, sizeof(vmreg));
|
|
|
|
vmreg.regnum = reg;
|
|
|
|
vmreg.regval = val;
|
|
|
|
|
2023-03-24 18:49:06 +00:00
|
|
|
error = vcpu_ioctl(vcpu, VM_SET_REGISTER, &vmreg);
|
2011-05-13 04:54:01 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
2023-03-24 18:49:06 +00:00
|
|
|
vm_get_register(struct vcpu *vcpu, int reg, uint64_t *ret_val)
|
2011-05-13 04:54:01 +00:00
|
|
|
{
|
|
|
|
int error;
|
|
|
|
struct vm_register vmreg;
|
|
|
|
|
|
|
|
bzero(&vmreg, sizeof(vmreg));
|
|
|
|
vmreg.regnum = reg;
|
|
|
|
|
2023-03-24 18:49:06 +00:00
|
|
|
error = vcpu_ioctl(vcpu, VM_GET_REGISTER, &vmreg);
|
2011-05-13 04:54:01 +00:00
|
|
|
*ret_val = vmreg.regval;
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2018-02-22 00:39:25 +00:00
|
|
|
int
|
2023-03-24 18:49:06 +00:00
|
|
|
vm_set_register_set(struct vcpu *vcpu, unsigned int count,
|
2018-02-22 00:39:25 +00:00
|
|
|
const int *regnums, uint64_t *regvals)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
struct vm_register_set vmregset;
|
|
|
|
|
|
|
|
bzero(&vmregset, sizeof(vmregset));
|
|
|
|
vmregset.count = count;
|
|
|
|
vmregset.regnums = regnums;
|
|
|
|
vmregset.regvals = regvals;
|
|
|
|
|
2023-03-24 18:49:06 +00:00
|
|
|
error = vcpu_ioctl(vcpu, VM_SET_REGISTER_SET, &vmregset);
|
2018-02-22 00:39:25 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
2023-03-24 18:49:06 +00:00
|
|
|
vm_get_register_set(struct vcpu *vcpu, unsigned int count,
|
2018-02-22 00:39:25 +00:00
|
|
|
const int *regnums, uint64_t *regvals)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
struct vm_register_set vmregset;
|
|
|
|
|
|
|
|
bzero(&vmregset, sizeof(vmregset));
|
|
|
|
vmregset.count = count;
|
|
|
|
vmregset.regnums = regnums;
|
|
|
|
vmregset.regvals = regvals;
|
|
|
|
|
2023-03-24 18:49:06 +00:00
|
|
|
error = vcpu_ioctl(vcpu, VM_GET_REGISTER_SET, &vmregset);
|
2018-02-22 00:39:25 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2011-05-13 04:54:01 +00:00
|
|
|
int
|
2023-05-24 01:13:33 +00:00
|
|
|
vm_run(struct vcpu *vcpu, struct vm_run *vmrun)
|
2011-05-13 04:54:01 +00:00
|
|
|
{
|
2023-05-24 01:13:33 +00:00
|
|
|
return (vcpu_ioctl(vcpu, VM_RUN, vmrun));
|
2011-05-13 04:54:01 +00:00
|
|
|
}
|
|
|
|
|
2014-03-26 23:34:27 +00:00
|
|
|
int
|
2014-04-28 22:06:40 +00:00
|
|
|
vm_suspend(struct vmctx *ctx, enum vm_suspend_how how)
|
2014-03-26 23:34:27 +00:00
|
|
|
{
|
2014-04-28 22:06:40 +00:00
|
|
|
struct vm_suspend vmsuspend;
|
2014-03-26 23:34:27 +00:00
|
|
|
|
2014-04-28 22:06:40 +00:00
|
|
|
bzero(&vmsuspend, sizeof(vmsuspend));
|
|
|
|
vmsuspend.how = how;
|
|
|
|
return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend));
|
2014-03-26 23:34:27 +00:00
|
|
|
}
|
|
|
|
|
2014-06-07 21:36:52 +00:00
|
|
|
int
|
|
|
|
vm_reinit(struct vmctx *ctx)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (ioctl(ctx->fd, VM_REINIT, 0));
|
|
|
|
}
|
|
|
|
|
2011-05-13 04:54:01 +00:00
|
|
|
int
|
|
|
|
vm_capability_name2type(const char *capname)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2024-04-03 16:55:36 +00:00
|
|
|
for (i = 0; i < VM_CAP_MAX; i++) {
|
|
|
|
if (vm_capstrmap[i] != NULL &&
|
|
|
|
strcmp(vm_capstrmap[i], capname) == 0)
|
2020-04-21 17:30:56 +00:00
|
|
|
return (i);
|
2011-05-13 04:54:01 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
|
2012-10-12 17:39:28 +00:00
|
|
|
const char *
|
|
|
|
vm_capability_type2name(int type)
|
|
|
|
{
|
2024-04-03 16:55:36 +00:00
|
|
|
if (type >= 0 && type < VM_CAP_MAX)
|
|
|
|
return (vm_capstrmap[type]);
|
2012-10-12 17:39:28 +00:00
|
|
|
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
2011-05-13 04:54:01 +00:00
|
|
|
int
|
2023-03-24 18:49:06 +00:00
|
|
|
vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval)
|
2011-05-13 04:54:01 +00:00
|
|
|
{
|
|
|
|
int error;
|
|
|
|
struct vm_capability vmcap;
|
|
|
|
|
|
|
|
bzero(&vmcap, sizeof(vmcap));
|
|
|
|
vmcap.captype = cap;
|
|
|
|
|
2023-03-24 18:49:06 +00:00
|
|
|
error = vcpu_ioctl(vcpu, VM_GET_CAPABILITY, &vmcap);
|
2011-05-13 04:54:01 +00:00
|
|
|
*retval = vmcap.capval;
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
2023-03-24 18:49:06 +00:00
|
|
|
vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val)
|
2011-05-13 04:54:01 +00:00
|
|
|
{
|
|
|
|
struct vm_capability vmcap;
|
|
|
|
|
|
|
|
bzero(&vmcap, sizeof(vmcap));
|
|
|
|
vmcap.captype = cap;
|
|
|
|
vmcap.capval = val;
|
2018-06-14 01:28:55 +00:00
|
|
|
|
2023-03-24 18:49:06 +00:00
|
|
|
return (vcpu_ioctl(vcpu, VM_SET_CAPABILITY, &vmcap));
|
2011-05-13 04:54:01 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Fetch all statistics counters for 'vcpu'.
 *
 * The kernel returns stats in fixed-size batches (vmstats.statbuf), so we
 * loop, advancing 'index', until a short batch signals the final chunk.
 * Results accumulate into a thread-local buffer that is reused (and grown
 * with realloc) across calls; the caller must not free the returned
 * pointer, and a subsequent call from the same thread overwrites it.
 *
 * On success returns the buffer, setting *ret_entries (if non-NULL) to
 * the counter count and *ret_tv (if non-NULL) to the sample timestamp.
 * Returns NULL on failure.
 */
uint64_t *
vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv,
    int *ret_entries)
{
	static _Thread_local uint64_t *stats_buf;
	static _Thread_local u_int stats_count;
	uint64_t *new_stats;
	struct vm_stats vmstats;
	u_int count, index;
	bool have_stats;

	have_stats = false;
	count = 0;
	for (index = 0;; index += nitems(vmstats.statbuf)) {
		vmstats.index = index;
		if (vcpu_ioctl(vcpu, VM_STATS, &vmstats) != 0)
			break;
		/* Grow the thread-local buffer if this batch won't fit. */
		if (stats_count < index + vmstats.num_entries) {
			new_stats = realloc(stats_buf,
			    (index + vmstats.num_entries) * sizeof(uint64_t));
			if (new_stats == NULL) {
				errno = ENOMEM;
				return (NULL);
			}
			stats_count = index + vmstats.num_entries;
			stats_buf = new_stats;
		}
		memcpy(stats_buf + index, vmstats.statbuf,
		    vmstats.num_entries * sizeof(uint64_t));
		count += vmstats.num_entries;
		have_stats = true;

		/* A partial batch means the last counter has been read. */
		if (vmstats.num_entries != nitems(vmstats.statbuf))
			break;
	}
	if (have_stats) {
		if (ret_entries)
			*ret_entries = count;
		if (ret_tv)
			*ret_tv = vmstats.tv;
		return (stats_buf);
	} else
		return (NULL);
}
|
|
|
|
|
|
|
|
const char *
|
|
|
|
vm_get_stat_desc(struct vmctx *ctx, int index)
|
|
|
|
{
|
|
|
|
static struct vm_stat_desc statdesc;
|
|
|
|
|
|
|
|
statdesc.index = index;
|
|
|
|
if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
|
|
|
|
return (statdesc.desc);
|
|
|
|
else
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
2024-04-03 17:45:06 +00:00
|
|
|
#ifdef __amd64__
|
2013-10-05 21:22:35 +00:00
|
|
|
int
|
|
|
|
vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
|
|
|
|
{
|
|
|
|
int error, i;
|
|
|
|
struct vm_gpa_pte gpapte;
|
|
|
|
|
|
|
|
bzero(&gpapte, sizeof(gpapte));
|
|
|
|
gpapte.gpa = gpa;
|
|
|
|
|
|
|
|
error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);
|
|
|
|
|
|
|
|
if (error == 0) {
|
|
|
|
*num = gpapte.ptenum;
|
|
|
|
for (i = 0; i < gpapte.ptenum; i++)
|
|
|
|
pte[i] = gpapte.pte[i];
|
|
|
|
}
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
2013-11-25 19:04:51 +00:00
|
|
|
|
2015-05-06 16:25:20 +00:00
|
|
|
int
|
2023-03-24 18:49:06 +00:00
|
|
|
vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
|
2015-05-06 16:25:20 +00:00
|
|
|
uint64_t gla, int prot, uint64_t *gpa, int *fault)
|
2014-05-24 23:12:30 +00:00
|
|
|
{
|
|
|
|
struct vm_gla2gpa gg;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
bzero(&gg, sizeof(struct vm_gla2gpa));
|
|
|
|
gg.prot = prot;
|
|
|
|
gg.gla = gla;
|
|
|
|
gg.paging = *paging;
|
|
|
|
|
2023-03-24 18:49:06 +00:00
|
|
|
error = vcpu_ioctl(vcpu, VM_GLA2GPA, &gg);
|
2014-05-24 23:12:30 +00:00
|
|
|
if (error == 0) {
|
|
|
|
*fault = gg.fault;
|
|
|
|
*gpa = gg.gpa;
|
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
2024-04-03 17:45:06 +00:00
|
|
|
#endif
|
2014-05-24 23:12:30 +00:00
|
|
|
|
2018-02-26 19:19:05 +00:00
|
|
|
int
|
2023-03-24 18:49:06 +00:00
|
|
|
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
|
2018-02-26 19:19:05 +00:00
|
|
|
uint64_t gla, int prot, uint64_t *gpa, int *fault)
|
|
|
|
{
|
|
|
|
struct vm_gla2gpa gg;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
bzero(&gg, sizeof(struct vm_gla2gpa));
|
|
|
|
gg.prot = prot;
|
|
|
|
gg.gla = gla;
|
|
|
|
gg.paging = *paging;
|
|
|
|
|
2023-03-24 18:49:06 +00:00
|
|
|
error = vcpu_ioctl(vcpu, VM_GLA2GPA_NOFAULT, &gg);
|
2018-02-26 19:19:05 +00:00
|
|
|
if (error == 0) {
|
|
|
|
*fault = gg.fault;
|
|
|
|
*gpa = gg.gpa;
|
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2014-05-24 23:12:30 +00:00
|
|
|
#ifndef min
|
|
|
|
#define min(a,b) (((a) < (b)) ? (a) : (b))
|
|
|
|
#endif
|
|
|
|
|
2024-04-03 17:45:06 +00:00
|
|
|
#ifdef __amd64__
|
2014-05-24 23:12:30 +00:00
|
|
|
/*
 * Build a scatter/gather list ('iov', up to 'iovcnt' entries) of host
 * virtual address ranges covering the guest linear range [gla, gla+len).
 *
 * Returns 0 on success with *fault == 0 and 'iov' describing the whole
 * range; returns 0 with *fault != 0 when the address translation faults;
 * returns EFAULT when a translated gpa has no host mapping; otherwise
 * returns the error from vm_gla2gpa().
 */
int
vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
    int *fault)
{
	void *va;
	uint64_t gpa, off;
	int error, i, n;

	/* Zero the whole iovec so consumers can detect unused entries. */
	for (i = 0; i < iovcnt; i++) {
		iov[i].iov_base = 0;
		iov[i].iov_len = 0;
	}

	while (len) {
		assert(iovcnt > 0);
		error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault);
		if (error || *fault)
			return (error);

		/* Each iovec entry covers at most one guest page. */
		off = gpa & PAGE_MASK;
		n = MIN(len, PAGE_SIZE - off);

		va = vm_map_gpa(vcpu->ctx, gpa, n);
		if (va == NULL)
			return (EFAULT);

		iov->iov_base = va;
		iov->iov_len = n;
		iov++;
		iovcnt--;

		gla += n;
		len -= n;
	}
	return (0);
}
|
2024-04-03 17:45:06 +00:00
|
|
|
#endif
|
2014-05-24 23:12:30 +00:00
|
|
|
|
2015-01-19 06:51:04 +00:00
|
|
|
void
|
2022-11-18 18:01:44 +00:00
|
|
|
vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused)
|
2015-01-19 06:51:04 +00:00
|
|
|
{
|
2022-11-18 18:01:44 +00:00
|
|
|
/*
|
|
|
|
* Intentionally empty. This is used by the instruction
|
|
|
|
* emulation code shared with the kernel. The in-kernel
|
|
|
|
* version of this is non-empty.
|
|
|
|
*/
|
2015-01-19 06:51:04 +00:00
|
|
|
}
|
|
|
|
|
2014-05-26 18:21:08 +00:00
|
|
|
/*
 * Copy 'len' bytes of guest data described by the scatter/gather list
 * 'iov' (from vm_copy_setup()) into the flat buffer 'vp'.
 */
void
vm_copyin(struct iovec *iov, void *vp, size_t len)
{
	char *out = vp;

	while (len) {
		size_t chunk;

		assert(iov->iov_len);
		chunk = min(len, iov->iov_len);
		bcopy(iov->iov_base, out, chunk);

		out += chunk;
		len -= chunk;
		iov++;
	}
}
|
|
|
|
|
|
|
|
/*
 * Copy 'len' bytes from the flat buffer 'vp' out to the guest memory
 * described by the scatter/gather list 'iov' (from vm_copy_setup()).
 */
void
vm_copyout(const void *vp, struct iovec *iov, size_t len)
{
	const char *in = vp;

	while (len) {
		size_t chunk;

		assert(iov->iov_len);
		chunk = min(len, iov->iov_len);
		bcopy(in, iov->iov_base, chunk);

		in += chunk;
		len -= chunk;
		iov++;
	}
}
|
2014-05-31 23:37:34 +00:00
|
|
|
|
|
|
|
static int
|
|
|
|
vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus)
|
|
|
|
{
|
|
|
|
struct vm_cpuset vm_cpuset;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
bzero(&vm_cpuset, sizeof(struct vm_cpuset));
|
|
|
|
vm_cpuset.which = which;
|
|
|
|
vm_cpuset.cpusetsize = sizeof(cpuset_t);
|
|
|
|
vm_cpuset.cpus = cpus;
|
|
|
|
|
|
|
|
error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus));
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus));
|
|
|
|
}
|
|
|
|
|
2018-04-06 22:03:43 +00:00
|
|
|
int
|
|
|
|
vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus));
|
|
|
|
}
|
|
|
|
|
2014-05-31 23:37:34 +00:00
|
|
|
int
|
2023-03-24 18:49:06 +00:00
|
|
|
vm_activate_cpu(struct vcpu *vcpu)
|
2014-05-31 23:37:34 +00:00
|
|
|
{
|
|
|
|
struct vm_activate_cpu ac;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
bzero(&ac, sizeof(struct vm_activate_cpu));
|
2023-03-24 18:49:06 +00:00
|
|
|
error = vcpu_ioctl(vcpu, VM_ACTIVATE_CPU, &ac);
|
2014-05-31 23:37:34 +00:00
|
|
|
return (error);
|
|
|
|
}
|
2014-07-19 20:59:08 +00:00
|
|
|
|
2018-04-06 22:03:43 +00:00
|
|
|
int
|
2023-03-24 18:49:06 +00:00
|
|
|
vm_suspend_all_cpus(struct vmctx *ctx)
|
2018-04-06 22:03:43 +00:00
|
|
|
{
|
|
|
|
struct vm_activate_cpu ac;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
bzero(&ac, sizeof(struct vm_activate_cpu));
|
2023-03-24 18:49:06 +00:00
|
|
|
ac.vcpuid = -1;
|
2018-04-06 22:03:43 +00:00
|
|
|
error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
2023-03-24 18:49:06 +00:00
|
|
|
vm_suspend_cpu(struct vcpu *vcpu)
|
|
|
|
{
|
|
|
|
struct vm_activate_cpu ac;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
bzero(&ac, sizeof(struct vm_activate_cpu));
|
|
|
|
error = vcpu_ioctl(vcpu, VM_SUSPEND_CPU, &ac);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
vm_resume_cpu(struct vcpu *vcpu)
|
|
|
|
{
|
|
|
|
struct vm_activate_cpu ac;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
bzero(&ac, sizeof(struct vm_activate_cpu));
|
|
|
|
error = vcpu_ioctl(vcpu, VM_RESUME_CPU, &ac);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
vm_resume_all_cpus(struct vmctx *ctx)
|
2018-04-06 22:03:43 +00:00
|
|
|
{
|
|
|
|
struct vm_activate_cpu ac;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
bzero(&ac, sizeof(struct vm_activate_cpu));
|
2023-03-24 18:49:06 +00:00
|
|
|
ac.vcpuid = -1;
|
2018-04-06 22:03:43 +00:00
|
|
|
error = ioctl(ctx->fd, VM_RESUME_CPU, &ac);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2024-04-03 17:45:06 +00:00
|
|
|
#ifdef __amd64__
|
2014-07-19 20:59:08 +00:00
|
|
|
int
|
2023-03-24 18:49:06 +00:00
|
|
|
vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2)
|
2014-07-19 20:59:08 +00:00
|
|
|
{
|
|
|
|
struct vm_intinfo vmii;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
bzero(&vmii, sizeof(struct vm_intinfo));
|
2023-03-24 18:49:06 +00:00
|
|
|
error = vcpu_ioctl(vcpu, VM_GET_INTINFO, &vmii);
|
2014-07-19 20:59:08 +00:00
|
|
|
if (error == 0) {
|
|
|
|
*info1 = vmii.info1;
|
|
|
|
*info2 = vmii.info2;
|
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
2023-03-24 18:49:06 +00:00
|
|
|
vm_set_intinfo(struct vcpu *vcpu, uint64_t info1)
|
2014-07-19 20:59:08 +00:00
|
|
|
{
|
|
|
|
struct vm_intinfo vmii;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
bzero(&vmii, sizeof(struct vm_intinfo));
|
|
|
|
vmii.info1 = info1;
|
2023-03-24 18:49:06 +00:00
|
|
|
error = vcpu_ioctl(vcpu, VM_SET_INTINFO, &vmii);
|
2014-07-19 20:59:08 +00:00
|
|
|
return (error);
|
|
|
|
}
|
2024-04-03 17:45:06 +00:00
|
|
|
#endif
|
2014-12-30 22:19:34 +00:00
|
|
|
|
2024-04-03 17:45:06 +00:00
|
|
|
#ifdef WITH_VMMAPI_SNAPSHOT
|
2015-01-18 03:08:30 +00:00
|
|
|
int
|
2023-03-24 18:49:06 +00:00
|
|
|
vm_restart_instruction(struct vcpu *vcpu)
|
2015-01-18 03:08:30 +00:00
|
|
|
{
|
2023-03-24 18:49:06 +00:00
|
|
|
int arg;
|
2015-01-18 03:08:30 +00:00
|
|
|
|
2023-03-24 18:49:06 +00:00
|
|
|
return (vcpu_ioctl(vcpu, VM_RESTART_INSTRUCTION, &arg));
|
2015-01-18 03:08:30 +00:00
|
|
|
}
|
2017-02-14 13:35:59 +00:00
|
|
|
|
Initial support for bhyve save and restore.
Save and restore (also known as suspend and resume) permits a snapshot
to be taken of a guest's state that can later be resumed. In the
current implementation, bhyve(8) creates a UNIX domain socket that is
used by bhyvectl(8) to send a request to save a snapshot (and
optionally exit after the snapshot has been taken). A snapshot
currently consists of two files: the first holds a copy of guest RAM,
and the second file holds other guest state such as vCPU register
values and device model state.
To resume a guest, bhyve(8) must be started with a matching pair of
command line arguments to instantiate the same set of device models as
well as a pointer to the saved snapshot.
While the current implementation is useful for several uses cases, it
has a few limitations. The file format for saving the guest state is
tied to the ABI of internal bhyve structures and is not
self-describing (in that it does not communicate the set of device
models present in the system). In addition, the state saved for some
device models closely matches the internal data structures which might
prove a challenge for compatibility of snapshot files across a range
of bhyve versions. The file format also does not currently support
versioning of individual chunks of state. As a result, the current
file format is not a fixed binary format and future revisions to save
and restore will break binary compatibility of snapshot files. The
goal is to move to a more flexible format that adds versioning,
etc. and at that point to commit to providing a reasonable level of
compatibility. As a result, the current implementation is not enabled
by default. It can be enabled via the WITH_BHYVE_SNAPSHOT=yes option
for userland builds, and the kernel option BHYVE_SNAPSHOT.
Submitted by: Mihai Tiganus, Flavius Anton, Darius Mihai
Submitted by: Elena Mihailescu, Mihai Carabas, Sergiu Weisz
Relnotes: yes
Sponsored by: University Politehnica of Bucharest
Sponsored by: Matthew Grooms (student scholarships)
Sponsored by: iXsystems
Differential Revision: https://reviews.freebsd.org/D19495
2020-05-05 00:02:04 +00:00
|
|
|
int
|
2023-03-24 18:49:06 +00:00
|
|
|
vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta)
|
Initial support for bhyve save and restore.
Save and restore (also known as suspend and resume) permits a snapshot
to be taken of a guest's state that can later be resumed. In the
current implementation, bhyve(8) creates a UNIX domain socket that is
used by bhyvectl(8) to send a request to save a snapshot (and
optionally exit after the snapshot has been taken). A snapshot
currently consists of two files: the first holds a copy of guest RAM,
and the second file holds other guest state such as vCPU register
values and device model state.
To resume a guest, bhyve(8) must be started with a matching pair of
command line arguments to instantiate the same set of device models as
well as a pointer to the saved snapshot.
While the current implementation is useful for several uses cases, it
has a few limitations. The file format for saving the guest state is
tied to the ABI of internal bhyve structures and is not
self-describing (in that it does not communicate the set of device
models present in the system). In addition, the state saved for some
device models closely matches the internal data structures which might
prove a challenge for compatibility of snapshot files across a range
of bhyve versions. The file format also does not currently support
versioning of individual chunks of state. As a result, the current
file format is not a fixed binary format and future revisions to save
and restore will break binary compatiblity of snapshot files. The
goal is to move to a more flexible format that adds versioning,
etc. and at that point to commit to providing a reasonable level of
compatibility. As a result, the current implementation is not enabled
by default. It can be enabled via the WITH_BHYVE_SNAPSHOT=yes option
for userland builds, and the kernel option BHYVE_SHAPSHOT.
Submitted by: Mihai Tiganus, Flavius Anton, Darius Mihai
Submitted by: Elena Mihailescu, Mihai Carabas, Sergiu Weisz
Relnotes: yes
Sponsored by: University Politehnica of Bucharest
Sponsored by: Matthew Grooms (student scholarships)
Sponsored by: iXsystems
Differential Revision: https://reviews.freebsd.org/D19495
2020-05-05 00:02:04 +00:00
|
|
|
{
|
|
|
|
|
2023-03-24 18:49:06 +00:00
|
|
|
if (ioctl(ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) {
|
Initial support for bhyve save and restore.
Save and restore (also known as suspend and resume) permits a snapshot
to be taken of a guest's state that can later be resumed. In the
current implementation, bhyve(8) creates a UNIX domain socket that is
used by bhyvectl(8) to send a request to save a snapshot (and
optionally exit after the snapshot has been taken). A snapshot
currently consists of two files: the first holds a copy of guest RAM,
and the second file holds other guest state such as vCPU register
values and device model state.
To resume a guest, bhyve(8) must be started with a matching pair of
command line arguments to instantiate the same set of device models as
well as a pointer to the saved snapshot.
While the current implementation is useful for several uses cases, it
has a few limitations. The file format for saving the guest state is
tied to the ABI of internal bhyve structures and is not
self-describing (in that it does not communicate the set of device
models present in the system). In addition, the state saved for some
device models closely matches the internal data structures which might
prove a challenge for compatibility of snapshot files across a range
of bhyve versions. The file format also does not currently support
versioning of individual chunks of state. As a result, the current
file format is not a fixed binary format and future revisions to save
and restore will break binary compatiblity of snapshot files. The
goal is to move to a more flexible format that adds versioning,
etc. and at that point to commit to providing a reasonable level of
compatibility. As a result, the current implementation is not enabled
by default. It can be enabled via the WITH_BHYVE_SNAPSHOT=yes option
for userland builds, and the kernel option BHYVE_SHAPSHOT.
Submitted by: Mihai Tiganus, Flavius Anton, Darius Mihai
Submitted by: Elena Mihailescu, Mihai Carabas, Sergiu Weisz
Relnotes: yes
Sponsored by: University Politehnica of Bucharest
Sponsored by: Matthew Grooms (student scholarships)
Sponsored by: iXsystems
Differential Revision: https://reviews.freebsd.org/D19495
2020-05-05 00:02:04 +00:00
|
|
|
#ifdef SNAPSHOT_DEBUG
|
|
|
|
fprintf(stderr, "%s: snapshot failed for %s: %d\r\n",
|
|
|
|
__func__, meta->dev_name, errno);
|
|
|
|
#endif
|
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
vm_restore_time(struct vmctx *ctx)
|
|
|
|
{
|
|
|
|
int dummy;
|
|
|
|
|
|
|
|
dummy = 0;
|
|
|
|
return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
|
|
|
|
}
|
2024-04-03 17:45:06 +00:00
|
|
|
#endif
|
Initial support for bhyve save and restore.
Save and restore (also known as suspend and resume) permits a snapshot
to be taken of a guest's state that can later be resumed. In the
current implementation, bhyve(8) creates a UNIX domain socket that is
used by bhyvectl(8) to send a request to save a snapshot (and
optionally exit after the snapshot has been taken). A snapshot
currently consists of two files: the first holds a copy of guest RAM,
and the second file holds other guest state such as vCPU register
values and device model state.
To resume a guest, bhyve(8) must be started with a matching pair of
command line arguments to instantiate the same set of device models as
well as a pointer to the saved snapshot.
While the current implementation is useful for several uses cases, it
has a few limitations. The file format for saving the guest state is
tied to the ABI of internal bhyve structures and is not
self-describing (in that it does not communicate the set of device
models present in the system). In addition, the state saved for some
device models closely matches the internal data structures which might
prove a challenge for compatibility of snapshot files across a range
of bhyve versions. The file format also does not currently support
versioning of individual chunks of state. As a result, the current
file format is not a fixed binary format and future revisions to save
and restore will break binary compatibility of snapshot files. The
goal is to move to a more flexible format that adds versioning,
etc. and at that point to commit to providing a reasonable level of
compatibility. As a result, the current implementation is not enabled
by default. It can be enabled via the WITH_BHYVE_SNAPSHOT=yes option
for userland builds, and the kernel option BHYVE_SNAPSHOT.
Submitted by: Mihai Tiganus, Flavius Anton, Darius Mihai
Submitted by: Elena Mihailescu, Mihai Carabas, Sergiu Weisz
Relnotes: yes
Sponsored by: University Politehnica of Bucharest
Sponsored by: Matthew Grooms (student scholarships)
Sponsored by: iXsystems
Differential Revision: https://reviews.freebsd.org/D19495
2020-05-05 00:02:04 +00:00
|
|
|
|
Add the ability to control the CPU topology of created VMs
from userland without the need to use sysctls, it allows the old
sysctls to continue to function, but deprecates them at
FreeBSD_version 1200060 (Relnotes for deprecate).
The command line of bhyve is maintained in a backwards compatible way.
The API of libvmmapi is maintained in a backwards compatible way.
The sysctl's are maintained in a backwards compatible way.
Added command option looks like:
bhyve -c [[cpus=]n][,sockets=n][,cores=n][,threads=n][,maxcpus=n]
The optional parts can be specified in any order, but only a single
integer invokes the backwards compatible parse. [,maxcpus=n] is
hidden by #ifdef until kernel support is added, though the api
is put in place.
bhyvectl --get-cpu-topology option added.
Reviewed by: grehan (maintainer, earlier version),
Reviewed by: bcr (manpages)
Approved by: bde (mentor), phk (mentor)
Tested by: Oleg Ginzburg <olevole@olevole.ru> (cbsd)
MFC after: 1 week
Relnotes: Y
Differential Revision: https://reviews.freebsd.org/D9930
2018-04-08 19:24:49 +00:00
|
|
|
int
|
|
|
|
vm_set_topology(struct vmctx *ctx,
|
|
|
|
uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus)
|
|
|
|
{
|
|
|
|
struct vm_cpu_topology topology;
|
|
|
|
|
|
|
|
bzero(&topology, sizeof (struct vm_cpu_topology));
|
|
|
|
topology.sockets = sockets;
|
|
|
|
topology.cores = cores;
|
|
|
|
topology.threads = threads;
|
|
|
|
topology.maxcpus = maxcpus;
|
|
|
|
return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology));
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
vm_get_topology(struct vmctx *ctx,
|
|
|
|
uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus)
|
|
|
|
{
|
|
|
|
struct vm_cpu_topology topology;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
bzero(&topology, sizeof (struct vm_cpu_topology));
|
|
|
|
error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology);
|
|
|
|
if (error == 0) {
|
|
|
|
*sockets = topology.sockets;
|
|
|
|
*cores = topology.cores;
|
|
|
|
*threads = topology.threads;
|
|
|
|
*maxcpus = topology.maxcpus;
|
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2022-10-24 21:31:11 +00:00
|
|
|
/*
 * Restrict the VM device descriptor under Capsicum: allow only ioctl
 * and read/write mmap rights, then narrow the permitted ioctl commands
 * to the vm_ioctl_cmds table.
 * Returns 0 on success or -1 if either limit could not be applied.
 */
int
vm_limit_rights(struct vmctx *ctx)
{
	cap_rights_t rights;

	cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
	if (caph_rights_limit(ctx->fd, &rights) != 0 ||
	    caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, vm_ioctl_ncmds) != 0)
		return (-1);
	return (0);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Avoid using in new code. Operations on the fd should be wrapped here so that
|
|
|
|
* capability rights can be kept in sync.
|
|
|
|
*/
|
2017-02-14 13:35:59 +00:00
|
|
|
int
vm_get_device_fd(struct vmctx *ctx)
{

	/* Return the file descriptor backing this VM context. */
	return (ctx->fd);
}
|
|
|
|
|
2022-10-24 21:31:11 +00:00
|
|
|
/* Legacy interface, do not use. */
|
2017-02-14 13:35:59 +00:00
|
|
|
const cap_ioctl_t *
|
|
|
|
vm_get_ioctls(size_t *len)
|
|
|
|
{
|
|
|
|
cap_ioctl_t *cmds;
|
2024-04-03 16:55:54 +00:00
|
|
|
size_t sz;
|
2017-02-14 13:35:59 +00:00
|
|
|
|
|
|
|
if (len == NULL) {
|
2024-04-03 16:55:54 +00:00
|
|
|
sz = vm_ioctl_ncmds * sizeof(vm_ioctl_cmds[0]);
|
|
|
|
cmds = malloc(sz);
|
2017-02-14 13:35:59 +00:00
|
|
|
if (cmds == NULL)
|
|
|
|
return (NULL);
|
2024-04-03 16:55:54 +00:00
|
|
|
bcopy(vm_ioctl_cmds, cmds, sz);
|
2017-02-14 13:35:59 +00:00
|
|
|
return (cmds);
|
|
|
|
}
|
|
|
|
|
2024-04-03 16:55:54 +00:00
|
|
|
*len = vm_ioctl_ncmds;
|
2017-02-14 13:35:59 +00:00
|
|
|
return (NULL);
|
|
|
|
}
|