mirror of
https://github.com/freebsd/freebsd-src
synced 2024-10-15 04:43:53 +00:00
makefs: Add ZFS support
This allows one to take a staged directory tree and create a file consisting of a ZFS pool with one or more datasets that contain the contents of the directory tree. This is useful for creating virtual machine images without using the kernel to create a pool; "zpool create" requires root privileges and currently is not permitted in jails. makefs -t zfs also provides reproducible images by using a fixed seed for pseudo-random number generation, used for generating GUIDs and hash salts. makefs -t zfs requires relatively little by way of machine resources. The "zpool_reguid" rc.conf setting can be used to ask a FreeBSD guest to generate a unique pool GUID upon first boot. A small number of pool and dataset properties are supported. The pool is backed by a single disk vdev. Data is always checksummed using Fletcher-4, no redundant copies are made, and no compression is used. The manual page documents supported pool and filesystem properties. The implementation uses a few pieces of ZFS support from the boot loader, especially definitions for various on-disk structures, but is otherwise standalone and in particular doesn't depend on OpenZFS. This feature should be treated as experimental for now, i.e., important data shouldn't be trusted to a makefs-created pool, and the command-line interface is subject to change. Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D35248
This commit is contained in:
parent
3e1101f29b
commit
240afd8c1f
|
@ -19,6 +19,17 @@ MAN= makefs.8
|
|||
NO_WCAST_ALIGN=
|
||||
CSTD= c11
|
||||
|
||||
.if ${MK_ZFS} != "no"
|
||||
SRCS+= zfs.c
|
||||
CFLAGS+=-I${SRCDIR}/zfs \
|
||||
-I${SRCTOP}/stand/libsa \
|
||||
-I${SRCTOP}/sys/cddl/boot
|
||||
|
||||
CFLAGS+= -DHAVE_ZFS
|
||||
|
||||
.include "${SRCDIR}/zfs/Makefile.inc"
|
||||
.endif
|
||||
|
||||
.include "${SRCDIR}/cd9660/Makefile.inc"
|
||||
.include "${SRCDIR}/ffs/Makefile.inc"
|
||||
.include "${SRCDIR}/msdos/Makefile.inc"
|
||||
|
|
|
@ -35,7 +35,7 @@
|
|||
.\"
|
||||
.\" $FreeBSD$
|
||||
.\"
|
||||
.Dd September 17, 2020
|
||||
.Dd August 5, 2022
|
||||
.Dt MAKEFS 8
|
||||
.Os
|
||||
.Sh NAME
|
||||
|
@ -266,6 +266,8 @@ BSD fast file system (default).
|
|||
ISO 9660 file system.
|
||||
.It Sy msdos
|
||||
FAT12, FAT16, or FAT32 file system.
|
||||
.It Sy zfs
|
||||
ZFS pool containing one or more file systems.
|
||||
.El
|
||||
.It Fl x
|
||||
Exclude file system nodes not explicitly listed in the specfile.
|
||||
|
@ -494,10 +496,97 @@ Volume ID.
|
|||
.It Cm volume_label
|
||||
Volume Label.
|
||||
.El
|
||||
.Ss zfs-specific options
|
||||
Note: ZFS support is currently considered experimental.
|
||||
Do not use it for anything critical.
|
||||
.Pp
|
||||
The image created by
|
||||
.Nm
|
||||
contains a ZFS pool with a single vdev of type
|
||||
.Ql disk .
|
||||
The root dataset is always created implicitly and contains the entire input
|
||||
directory tree unless additional datasets are specified using the options
|
||||
described below.
|
||||
.Pp
|
||||
The arguments consist of a keyword, an equal sign
|
||||
.Pq Ql = ,
|
||||
and a value.
|
||||
The following keywords are supported:
|
||||
.Pp
|
||||
.Bl -tag -width omit-trailing-period -offset indent -compact
|
||||
.It ashift
|
||||
The base-2 logarithm of the minimum block size.
|
||||
Typical values are 9 (512B blocks) and 12 (4KB blocks).
|
||||
The default value is 12.
|
||||
.It bootfs
|
||||
The name of the bootable dataset for the pool.
|
||||
Specifying this option causes the
|
||||
.Ql bootfs
|
||||
property to be set in the created pool.
|
||||
.It mssize
|
||||
The size of metaslabs in the created pool.
|
||||
By default,
|
||||
.Nm
|
||||
allocates large (up to 512MB) metaslabs with the expectation that
|
||||
the image will be auto-expanded upon first use.
|
||||
This option allows the default heuristic to be overridden.
|
||||
.It poolname
|
||||
The name of the ZFS pool.
|
||||
This option must be specified.
|
||||
.It rootpath
|
||||
An implicit path prefix added to dataset mountpoints.
|
||||
By default it is
|
||||
.Pa /<poolname> .
|
||||
For creating bootable pools, the
|
||||
.Va rootpath
|
||||
should be set to
|
||||
.Pa / .
|
||||
At least one dataset must have a mountpoint equal to
|
||||
.Va rootpath .
|
||||
.It fs
|
||||
Create an additional dataset.
|
||||
This option may be specified multiple times.
|
||||
The argument value must be of the form
|
||||
.Ar <dataset>[;<prop1=v1>[;<prop2=v2>[;...]]] ,
|
||||
where
|
||||
.Ar dataset
|
||||
is the name of the dataset and must belong to the pool's namespace.
|
||||
For example, with a pool name of
|
||||
.Ql test
|
||||
all dataset names must be prefixed by
|
||||
.Ql test/ .
|
||||
A dataset must exist at each level of the pool's namespace.
|
||||
For example, to create
|
||||
.Ql test/foo/bar ,
|
||||
.Ql test/foo
|
||||
must be created as well.
|
||||
.Pp
|
||||
The dataset mountpoints determine how the datasets are populated with
|
||||
files from the staged directory tree.
|
||||
Conceptually, all datasets are mounted before any are populated with files.
|
||||
The root of the staged directory tree is mapped to
|
||||
.Va rootpath .
|
||||
.Pp
|
||||
Dataset properties, as described in
|
||||
.Xr zfsprops 8 ,
|
||||
may be specified following the dataset name.
|
||||
The following properties may be set for a dataset:
|
||||
.Pp
|
||||
.Bl -tag -compact -offset indent
|
||||
.It atime
|
||||
.It canmount
|
||||
.It exec
|
||||
.It mountpoint
|
||||
.It setuid
|
||||
.El
|
||||
.El
|
||||
.Sh SEE ALSO
|
||||
.Xr mtree 5 ,
|
||||
.Xr mtree 8 ,
|
||||
.Xr newfs 8
|
||||
.Xr newfs 8 ,
|
||||
.Xr zfsconcepts 8 ,
|
||||
.Xr zfsprops 8 ,
|
||||
.Xr zpoolprops 8
|
||||
.Sh HISTORY
|
||||
The
|
||||
.Nm
|
||||
|
@ -518,4 +607,6 @@ and first appeared in
|
|||
.An Ram Vedam
|
||||
(cd9660 support),
|
||||
.An Christos Zoulas
|
||||
(msdos support).
|
||||
(msdos support),
|
||||
.An Mark Johnston
|
||||
(zfs support).
|
||||
|
|
|
@ -77,6 +77,9 @@ static fstype_t fstypes[] = {
|
|||
ENTRY(cd9660),
|
||||
ENTRY(ffs),
|
||||
ENTRY(msdos),
|
||||
#ifdef HAVE_ZFS
|
||||
ENTRY(zfs),
|
||||
#endif
|
||||
{ .type = NULL },
|
||||
};
|
||||
|
||||
|
|
|
@ -78,12 +78,14 @@ enum fi_flags {
|
|||
FI_SIZED = 1<<0, /* inode sized */
|
||||
FI_ALLOCATED = 1<<1, /* fsinode->ino allocated */
|
||||
FI_WRITTEN = 1<<2, /* inode written */
|
||||
FI_ROOT = 1<<3, /* root of a ZFS dataset */
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
uint32_t ino; /* inode number used on target fs */
|
||||
uint32_t nlink; /* number of links to this entry */
|
||||
enum fi_flags flags; /* flags used by fs specific code */
|
||||
void *param; /* for use by individual fs impls */
|
||||
struct stat st; /* stat entry */
|
||||
} fsinode;
|
||||
|
||||
|
@ -186,6 +188,9 @@ void fs ## _makefs(const char *, const char *, fsnode *, fsinfo_t *)
|
|||
DECLARE_FUN(cd9660);
|
||||
DECLARE_FUN(ffs);
|
||||
DECLARE_FUN(msdos);
|
||||
#ifdef HAVE_ZFS
|
||||
DECLARE_FUN(zfs);
|
||||
#endif
|
||||
|
||||
extern u_int debug;
|
||||
extern int dupsok;
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
ATF_TESTS_SH+= makefs_cd9660_tests
|
||||
ATF_TESTS_SH+= makefs_ffs_tests
|
||||
ATF_TESTS_SH+= makefs_zfs_tests
|
||||
|
||||
BINDIR= ${TESTSDIR}
|
||||
|
||||
|
|
634
usr.sbin/makefs/tests/makefs_zfs_tests.sh
Normal file
634
usr.sbin/makefs/tests/makefs_zfs_tests.sh
Normal file
|
@ -0,0 +1,634 @@
|
|||
#-
|
||||
# SPDX-License-Identifier: BSD-2-Clause-FreeBSD
|
||||
#
|
||||
# Copyright (c) 2022 The FreeBSD Foundation
|
||||
#
|
||||
# This software was developed by Mark Johnston under sponsorship from
|
||||
# the FreeBSD Foundation.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
# 1. Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in
|
||||
# the documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
# SUCH DAMAGE.
|
||||
#
|
||||
|
||||
MAKEFS="makefs -t zfs -o nowarn=true"
|
||||
ZFS_POOL_NAME="makefstest$$"
|
||||
TEST_ZFS_POOL_NAME="$TMPDIR/poolname"
|
||||
|
||||
. "$(dirname "$0")/makefs_tests_common.sh"
|
||||
|
||||
# Tear down any state left behind by a test: destroy the imported pool and
# detach the md(4) device.  The state files are only written once the image
# has been imported, so tolerate their absence — a test may fail before
# reaching that point, and cleanup must not emit spurious errors.
common_cleanup()
{
	local pool md

	# Try to force a TXG, this can help catch bugs by triggering a panic.
	sync

	if [ -f "$TEST_ZFS_POOL_NAME" ]; then
		pool=$(cat "$TEST_ZFS_POOL_NAME")
		if zpool list "$pool" >/dev/null 2>&1; then
			zpool destroy "$pool"
		fi
	fi

	if [ -f "$TEST_MD_DEVICE_FILE" ]; then
		md=$(cat "$TEST_MD_DEVICE_FILE")
		if [ -c /dev/"$md" ]; then
			mdconfig -d -u "$md"
		fi
	fi
}
|
||||
|
||||
# Attach the generated image to an md(4) device, import the pool under the
# test mount directory, and record the pool name for later cleanup.
import_image()
{
	atf_check -e empty -s exit:0 -o save:$TEST_MD_DEVICE_FILE \
	    mdconfig -a -f $TEST_IMAGE
	atf_check zpool import -R $TEST_MOUNT_DIR $ZFS_POOL_NAME
	echo "$ZFS_POOL_NAME" > $TEST_ZFS_POOL_NAME
}
|
||||
|
||||
#
# Test autoexpansion of the vdev.
#
# The pool is initially 10GB, so we get 10GB minus one metaslab's worth of
# usable space for data.  Then the pool is expanded to 50GB, and the amount of
# usable space is 50GB minus one metaslab.
#
atf_test_case autoexpand cleanup
autoexpand_body()
{
	local mssize origsize grownsize actual

	create_test_inputs

	mssize=$((128 * 1024 * 1024))
	origsize=$((10 * 1024 * 1024 * 1024))
	atf_check $MAKEFS -s $origsize -o mssize=$mssize -o rootpath=/ \
	    -o poolname=$ZFS_POOL_NAME \
	    $TEST_IMAGE $TEST_INPUTS_DIR

	# Enlarge the backing file before importing the pool.
	grownsize=$((50 * 1024 * 1024 * 1024))
	truncate -s $grownsize $TEST_IMAGE

	import_image

	check_image_contents

	# Before expansion, the pool size is the image size less one metaslab.
	actual=$(zpool list -Hp -o size $ZFS_POOL_NAME)
	atf_check [ $((actual + mssize)) -eq $origsize ]

	atf_check zpool online -e $ZFS_POOL_NAME /dev/$(cat $TEST_MD_DEVICE_FILE)

	check_image_contents

	# After expansion, the same relation holds for the new size.
	actual=$(zpool list -Hp -o size $ZFS_POOL_NAME)
	atf_check [ $((actual + mssize)) -eq $grownsize ]
}
autoexpand_cleanup()
{
	common_cleanup
}
|
||||
|
||||
#
# Test with some default layout defined by the common code.
#
atf_test_case basic cleanup
basic_body()
{
	create_test_inputs

	atf_check $MAKEFS -s 10g -o rootpath=/ \
	    -o poolname=$ZFS_POOL_NAME \
	    $TEST_IMAGE $TEST_INPUTS_DIR

	import_image

	check_image_contents
}
basic_cleanup()
{
	common_cleanup
}
|
||||
|
||||
atf_test_case dataset_removal cleanup
dataset_removal_body()
{
	create_test_dirs

	cd $TEST_INPUTS_DIR
	mkdir dir
	cd -

	atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
	    -o fs=${ZFS_POOL_NAME}/dir \
	    $TEST_IMAGE $TEST_INPUTS_DIR

	import_image

	check_image_contents

	# A dataset generated by makefs must be destroyable by the kernel.
	atf_check zfs destroy ${ZFS_POOL_NAME}/dir
}
dataset_removal_cleanup()
{
	common_cleanup
}
|
||||
|
||||
#
# Make sure that we can create and remove an empty directory.
#
atf_test_case empty_dir cleanup
empty_dir_body()
{
	create_test_dirs

	cd $TEST_INPUTS_DIR
	mkdir dir
	cd -

	atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
	    $TEST_IMAGE $TEST_INPUTS_DIR

	import_image

	check_image_contents

	# The generated empty directory must be removable from the live pool.
	atf_check rmdir ${TEST_MOUNT_DIR}/dir
}
empty_dir_cleanup()
{
	common_cleanup
}
|
||||
|
||||
atf_test_case empty_fs cleanup
empty_fs_body()
{
	create_test_dirs

	# Build an image from an empty staging directory.
	atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
	    $TEST_IMAGE $TEST_INPUTS_DIR

	import_image

	check_image_contents
}
empty_fs_cleanup()
{
	common_cleanup
}
|
||||
|
||||
atf_test_case file_sizes cleanup
file_sizes_body()
{
	local sz

	create_test_dirs
	cd $TEST_INPUTS_DIR

	# Create files sized just below, at, and just above each power of
	# two up to 1MB.
	sz=1
	while [ $sz -lt $((1 << 20)) ]; do
		truncate -s $sz ${sz}.1
		truncate -s $((sz - 1)) ${sz}.2
		truncate -s $((sz + 1)) ${sz}.3
		sz=$(($sz << 1))
	done

	cd -

	# XXXMJ this creates sparse files, make sure makefs doesn't
	# preserve the sparseness.
	# XXXMJ need to test with larger files (at least 128MB for L2 indirs)
	atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
	    $TEST_IMAGE $TEST_INPUTS_DIR

	import_image

	check_image_contents
}
file_sizes_cleanup()
{
	common_cleanup
}
|
||||
|
||||
atf_test_case hard_links cleanup
hard_links_body()
{
	local path

	create_test_dirs
	cd $TEST_INPUTS_DIR

	mkdir dir
	echo "hello" > 1
	ln 1 2
	ln 1 dir/1

	echo "goodbye" > dir/a
	ln dir/a dir/b
	ln dir/a a

	cd -

	atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
	    $TEST_IMAGE $TEST_INPUTS_DIR

	import_image

	check_image_contents

	# Every link to "1" must share its inode number, link count, and
	# contents.
	stat -f '%i' ${TEST_MOUNT_DIR}/1 > ./ino
	stat -f '%l' ${TEST_MOUNT_DIR}/1 > ./nlink
	for path in 1 2 dir/1; do
		atf_check -o file:./nlink -e empty -s exit:0 \
		    stat -f '%l' ${TEST_MOUNT_DIR}/${path}
		atf_check -o file:./ino -e empty -s exit:0 \
		    stat -f '%i' ${TEST_MOUNT_DIR}/${path}
		atf_check cmp -s ${TEST_INPUTS_DIR}/1 ${TEST_MOUNT_DIR}/${path}
	done

	# Likewise for every link to "dir/a".
	stat -f '%i' ${TEST_MOUNT_DIR}/dir/a > ./ino
	stat -f '%l' ${TEST_MOUNT_DIR}/dir/a > ./nlink
	for path in dir/a dir/b a; do
		atf_check -o file:./nlink -e empty -s exit:0 \
		    stat -f '%l' ${TEST_MOUNT_DIR}/${path}
		atf_check -o file:./ino -e empty -s exit:0 \
		    stat -f '%i' ${TEST_MOUNT_DIR}/${path}
		atf_check cmp -s ${TEST_INPUTS_DIR}/dir/a ${TEST_MOUNT_DIR}/${path}
	done
}
hard_links_cleanup()
{
	common_cleanup
}
|
||||
|
||||
# Allocate enough dnodes from an object set that the meta dnode needs to use
# indirect blocks.
atf_test_case indirect_dnode_array cleanup
indirect_dnode_array_body()
{
	local count n

	# How many dnodes do we need to allocate?  Well, the data block size
	# for meta dnodes is always 16KB, so with a dnode size of 512B we get
	# 32 dnodes per direct block.  The maximum indirect block size is 128KB
	# and that can fit 1024 block pointers, so we need at least 32 * 1024
	# files to force the use of two levels of indirection.
	#
	# Unfortunately that number of files makes the test run quite slowly,
	# so we settle for a single indirect block for now...
	count=$(jot -r 1 32 1024)

	create_test_dirs
	cd $TEST_INPUTS_DIR
	for n in $(seq 1 $count); do
		touch $n
	done
	cd -

	atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
	    $TEST_IMAGE $TEST_INPUTS_DIR

	import_image

	check_image_contents
}
indirect_dnode_array_cleanup()
{
	common_cleanup
}
|
||||
|
||||
#
# Create some files with long names, so as to test fat ZAP handling.
#
atf_test_case long_file_name cleanup
long_file_name_body()
{
	local subdir len

	create_test_dirs
	cd $TEST_INPUTS_DIR

	# micro ZAP keys can be at most 50 bytes, so names of length 1..60
	# straddle the micro/fat ZAP boundary.
	for len in $(seq 1 60); do
		touch $(jot -s '' $len 1 1)
	done
	subdir=$(jot -s '' 61 1 1)
	mkdir $subdir
	for len in $(seq 1 60); do
		touch ${subdir}/$(jot -s '' $len 1 1)
	done

	cd -

	atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
	    $TEST_IMAGE $TEST_INPUTS_DIR

	import_image

	check_image_contents

	# Add a directory entry in the hope that OpenZFS might catch a bug
	# in makefs' fat ZAP encoding.
	touch ${TEST_MOUNT_DIR}/foo
}
long_file_name_cleanup()
{
	common_cleanup
}
|
||||
|
||||
#
# Exercise handling of multiple datasets.
#
atf_test_case multi_dataset_1 cleanup
multi_dataset_1_body()
{
	local ds

	create_test_dirs
	cd $TEST_INPUTS_DIR

	mkdir dir1
	echo a > dir1/a
	mkdir dir2
	echo b > dir2/b

	cd -

	atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
	    -o fs=${ZFS_POOL_NAME}/dir1 -o fs=${ZFS_POOL_NAME}/dir2 \
	    $TEST_IMAGE $TEST_INPUTS_DIR

	import_image

	check_image_contents

	# Make sure that we have three datasets with the expected mount points.
	for ds in "" /dir1 /dir2; do
		atf_check -o inline:${ZFS_POOL_NAME}${ds}\\n -e empty -s exit:0 \
		    zfs list -H -o name ${ZFS_POOL_NAME}${ds}
		atf_check -o inline:${TEST_MOUNT_DIR}${ds}\\n -e empty -s exit:0 \
		    zfs list -H -o mountpoint ${ZFS_POOL_NAME}${ds}
	done
}
multi_dataset_1_cleanup()
{
	common_cleanup
}
|
||||
|
||||
#
# Create a pool with two datasets, where the root dataset is mounted below
# the child dataset.
#
atf_test_case multi_dataset_2 cleanup
multi_dataset_2_body()
{
	create_test_dirs
	cd $TEST_INPUTS_DIR

	mkdir dir1
	echo a > dir1/a
	mkdir dir2
	echo b > dir2/b

	cd -

	# The child dataset gets the root mountpoint, while the root dataset
	# ends up mounted underneath it.
	atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
	    -o fs=${ZFS_POOL_NAME}/dir1\;mountpoint=/ \
	    -o fs=${ZFS_POOL_NAME}\;mountpoint=/dir1 \
	    $TEST_IMAGE $TEST_INPUTS_DIR

	import_image

	check_image_contents
}
multi_dataset_2_cleanup()
{
	common_cleanup
}
|
||||
|
||||
#
# Create a dataset with a non-existent mount point.
#
atf_test_case multi_dataset_3 cleanup
multi_dataset_3_body()
{
	create_test_dirs
	cd $TEST_INPUTS_DIR

	mkdir dir1
	echo a > dir1/a

	cd -

	# dir2 has no counterpart in the staging tree.
	atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
	    -o fs=${ZFS_POOL_NAME}/dir1 \
	    -o fs=${ZFS_POOL_NAME}/dir2 \
	    $TEST_IMAGE $TEST_INPUTS_DIR

	import_image

	atf_check -o inline:${TEST_MOUNT_DIR}/dir2\\n -e empty -s exit:0 \
	    zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir2

	# Mounting dir2 should have created a directory called dir2.  Go
	# back and create it in the staging tree before comparing.
	atf_check mkdir ${TEST_INPUTS_DIR}/dir2

	check_image_contents
}
multi_dataset_3_cleanup()
{
	common_cleanup
}
|
||||
|
||||
#
# Create an unmounted dataset.
#
atf_test_case multi_dataset_4 cleanup
multi_dataset_4_body()
{
	create_test_dirs
	cd $TEST_INPUTS_DIR

	mkdir dir1
	echo a > dir1/a

	cd -

	atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
	    -o fs=${ZFS_POOL_NAME}/dir1\;canmount=noauto\;mountpoint=none \
	    $TEST_IMAGE $TEST_INPUTS_DIR

	import_image

	atf_check -o inline:none\\n -e empty -s exit:0 \
	    zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir1

	check_image_contents

	atf_check zfs set mountpoint=/dir1 ${ZFS_POOL_NAME}/dir1
	atf_check zfs mount ${ZFS_POOL_NAME}/dir1
	atf_check -o inline:${TEST_MOUNT_DIR}/dir1\\n -e empty -s exit:0 \
	    zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir1

	# dir1/a should be part of the root dataset, not dir1, so it must be
	# hidden by the dataset now mounted over ${TEST_MOUNT_DIR}/dir1.
	# (Note the path separator: without it the stat would target a
	# never-existing sibling path and the check would be vacuous.)
	atf_check -s not-exit:0 -e not-empty stat ${TEST_MOUNT_DIR}/dir1/a
}
multi_dataset_4_cleanup()
{
	common_cleanup
}
|
||||
|
||||
#
# Rudimentary test to verify that two ZFS images created using the same
# parameters and input hierarchy are byte-identical.  In particular, makefs(1)
# does not preserve file access times.
#
atf_test_case reproducible cleanup
reproducible_body()
{
	create_test_inputs

	atf_check $MAKEFS -s 512m -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
	    ${TEST_IMAGE}.1 $TEST_INPUTS_DIR

	atf_check $MAKEFS -s 512m -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
	    ${TEST_IMAGE}.2 $TEST_INPUTS_DIR

	# XXX-MJ cmp(1) is really slow
	atf_check cmp ${TEST_IMAGE}.1 ${TEST_IMAGE}.2
}
reproducible_cleanup()
{
	# No pool is imported and no md device is created, so there is
	# nothing to tear down.  A completely empty function body is a
	# POSIX sh syntax error, hence the no-op.
	:
}
|
||||
|
||||
#
# Verify that we can take a snapshot of a generated dataset.
#
atf_test_case snapshot cleanup
snapshot_body()
{
	create_test_dirs
	cd $TEST_INPUTS_DIR

	mkdir dir
	echo "hello" > dir/hello
	echo "goodbye" > goodbye

	cd -

	atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
	    $TEST_IMAGE $TEST_INPUTS_DIR

	import_image

	# The kernel must be willing to snapshot a makefs-generated dataset.
	atf_check zfs snapshot ${ZFS_POOL_NAME}@1
}
snapshot_cleanup()
{
	common_cleanup
}
|
||||
|
||||
#
# Check handling of symbolic links.
#
atf_test_case soft_links cleanup
soft_links_body()
{
	create_test_dirs
	cd $TEST_INPUTS_DIR

	mkdir dir
	# A self-referential (dangling) link.
	ln -s a a
	# Links into and through the subdirectory, including one to a
	# regular file.  (The original sequence re-created "a" — which
	# already exists — and redirected output into the directory "dir";
	# both commands would fail before the image is even built.)
	ln -s dir/../a dir/b
	ln -s dir/b b
	echo 'c' > dir/c
	ln -s dir/c c
	# XXX-MJ overflows bonus buffer ln -s $(jot -s '' 320 1 1) 1

	cd -

	atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
	    $TEST_IMAGE $TEST_INPUTS_DIR

	import_image

	check_image_contents
}
soft_links_cleanup()
{
	common_cleanup
}
|
||||
|
||||
#
# Verify that we can set properties on the root dataset.
#
atf_test_case root_props cleanup
root_props_body()
{
	local prop

	create_test_inputs

	atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
	    -o fs=${ZFS_POOL_NAME}\;atime=off\;setuid=off \
	    $TEST_IMAGE $TEST_INPUTS_DIR

	import_image

	check_image_contents

	# Both properties should read back as "off" with a "local" source.
	for prop in atime setuid; do
		atf_check -o inline:off\\n -e empty -s exit:0 \
		    zfs get -H -o value $prop $ZFS_POOL_NAME
		atf_check -o inline:local\\n -e empty -s exit:0 \
		    zfs get -H -o source $prop $ZFS_POOL_NAME
	done
}
root_props_cleanup()
{
	common_cleanup
}
|
||||
|
||||
# Register every test case with the ATF framework.
atf_init_test_cases()
{
	atf_add_test_case autoexpand
	atf_add_test_case basic
	atf_add_test_case dataset_removal
	atf_add_test_case empty_dir
	atf_add_test_case empty_fs
	atf_add_test_case file_sizes
	atf_add_test_case hard_links
	atf_add_test_case indirect_dnode_array
	atf_add_test_case long_file_name
	atf_add_test_case multi_dataset_1
	atf_add_test_case multi_dataset_2
	atf_add_test_case multi_dataset_3
	atf_add_test_case multi_dataset_4
	atf_add_test_case reproducible
	atf_add_test_case snapshot
	atf_add_test_case soft_links
	atf_add_test_case root_props

	# XXXMJ tests:
	# - test with different ashifts (at least, 9 and 12), different image sizes
	# - create datasets in imported pool
}
|
758
usr.sbin/makefs/zfs.c
Normal file
758
usr.sbin/makefs/zfs.c
Normal file
|
@ -0,0 +1,758 @@
|
|||
/*-
|
||||
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
|
||||
*
|
||||
* Copyright (c) 2022 The FreeBSD Foundation
|
||||
*
|
||||
* This software was developed by Mark Johnston under sponsorship from
|
||||
* the FreeBSD Foundation.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <sys/param.h>
|
||||
#include <sys/errno.h>
|
||||
#include <sys/queue.h>
|
||||
|
||||
#include <assert.h>
|
||||
#include <fcntl.h>
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <util.h>
|
||||
|
||||
#include "makefs.h"
|
||||
#include "zfs.h"
|
||||
|
||||
#define VDEV_LABEL_SPACE \
|
||||
((off_t)(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE))
|
||||
_Static_assert(VDEV_LABEL_SPACE <= MINDEVSIZE, "");
|
||||
|
||||
#define MINMSSIZE ((off_t)1 << 24) /* 16MB */
|
||||
#define DFLTMSSIZE ((off_t)1 << 29) /* 512MB */
|
||||
#define MAXMSSIZE ((off_t)1 << 34) /* 16GB */
|
||||
|
||||
#define INDIR_LEVELS 6
|
||||
/* Indirect blocks are always 128KB. */
|
||||
#define BLKPTR_PER_INDIR (MAXBLOCKSIZE / sizeof(blkptr_t))
|
||||
|
||||
struct dnode_cursor {
|
||||
char inddir[INDIR_LEVELS][MAXBLOCKSIZE];
|
||||
off_t indloc;
|
||||
off_t indspace;
|
||||
dnode_phys_t *dnode;
|
||||
off_t dataoff;
|
||||
off_t datablksz;
|
||||
};
|
||||
|
||||
void
|
||||
zfs_prep_opts(fsinfo_t *fsopts)
|
||||
{
|
||||
zfs_opt_t *zfs = ecalloc(1, sizeof(*zfs));
|
||||
|
||||
const option_t zfs_options[] = {
|
||||
{ '\0', "bootfs", &zfs->bootfs, OPT_STRPTR,
|
||||
0, 0, "Bootable dataset" },
|
||||
{ '\0', "mssize", &zfs->mssize, OPT_INT64,
|
||||
MINMSSIZE, MAXMSSIZE, "Metaslab size" },
|
||||
{ '\0', "poolname", &zfs->poolname, OPT_STRPTR,
|
||||
0, 0, "ZFS pool name" },
|
||||
{ '\0', "rootpath", &zfs->rootpath, OPT_STRPTR,
|
||||
0, 0, "Prefix for all dataset mount points" },
|
||||
{ '\0', "ashift", &zfs->ashift, OPT_INT32,
|
||||
MINBLOCKSHIFT, MAXBLOCKSHIFT, "ZFS pool ashift" },
|
||||
{ '\0', "nowarn", &zfs->nowarn, OPT_BOOL,
|
||||
0, 0, "Suppress warning about experimental ZFS support" },
|
||||
{ .name = NULL }
|
||||
};
|
||||
|
||||
STAILQ_INIT(&zfs->datasetdescs);
|
||||
|
||||
fsopts->fs_specific = zfs;
|
||||
fsopts->fs_options = copy_opts(zfs_options);
|
||||
}
|
||||
|
||||
int
|
||||
zfs_parse_opts(const char *option, fsinfo_t *fsopts)
|
||||
{
|
||||
zfs_opt_t *zfs;
|
||||
struct dataset_desc *dsdesc;
|
||||
char buf[BUFSIZ], *opt, *val;
|
||||
int rv;
|
||||
|
||||
zfs = fsopts->fs_specific;
|
||||
|
||||
opt = val = estrdup(option);
|
||||
opt = strsep(&val, "=");
|
||||
if (strcmp(opt, "fs") == 0) {
|
||||
if (val == NULL)
|
||||
errx(1, "invalid filesystem parameters `%s'", option);
|
||||
|
||||
/*
|
||||
* Dataset descriptions will be parsed later, in dsl_init().
|
||||
* Just stash them away for now.
|
||||
*/
|
||||
dsdesc = ecalloc(1, sizeof(*dsdesc));
|
||||
dsdesc->params = estrdup(val);
|
||||
free(opt);
|
||||
STAILQ_INSERT_TAIL(&zfs->datasetdescs, dsdesc, next);
|
||||
return (1);
|
||||
}
|
||||
free(opt);
|
||||
|
||||
rv = set_option(fsopts->fs_options, option, buf, sizeof(buf));
|
||||
return (rv == -1 ? 0 : 1);
|
||||
}
|
||||
|
||||
static void
|
||||
zfs_size_vdev(fsinfo_t *fsopts)
|
||||
{
|
||||
zfs_opt_t *zfs;
|
||||
off_t asize, mssize, vdevsize, vdevsize1;
|
||||
|
||||
zfs = fsopts->fs_specific;
|
||||
|
||||
assert(fsopts->maxsize != 0);
|
||||
assert(zfs->ashift != 0);
|
||||
|
||||
/*
|
||||
* Figure out how big the vdev should be.
|
||||
*/
|
||||
vdevsize = rounddown2(fsopts->maxsize, 1 << zfs->ashift);
|
||||
if (vdevsize < MINDEVSIZE)
|
||||
errx(1, "maximum image size is too small");
|
||||
if (vdevsize < fsopts->minsize || vdevsize > fsopts->maxsize) {
|
||||
errx(1, "image size bounds must be multiples of %d",
|
||||
1 << zfs->ashift);
|
||||
}
|
||||
asize = vdevsize - VDEV_LABEL_SPACE;
|
||||
|
||||
/*
|
||||
* Size metaslabs according to the following heuristic:
|
||||
* - provide at least 8 metaslabs,
|
||||
* - without using a metaslab size larger than 512MB.
|
||||
* This approximates what OpenZFS does without being complicated. In
|
||||
* practice we expect pools to be expanded upon first use, and OpenZFS
|
||||
* does not resize metaslabs in that case, so there is no right answer
|
||||
* here. In general we want to provide large metaslabs even if the
|
||||
* image size is small, and 512MB is a reasonable size for pools up to
|
||||
* several hundred gigabytes.
|
||||
*
|
||||
* The user may override this heuristic using the "-o mssize" option.
|
||||
*/
|
||||
mssize = zfs->mssize;
|
||||
if (mssize == 0) {
|
||||
mssize = MAX(MIN(asize / 8, DFLTMSSIZE), MINMSSIZE);
|
||||
if (!powerof2(mssize))
|
||||
mssize = 1l << (flsll(mssize) - 1);
|
||||
}
|
||||
if (!powerof2(mssize))
|
||||
errx(1, "metaslab size must be a power of 2");
|
||||
|
||||
/*
|
||||
* If we have some slop left over, try to cover it by resizing the vdev,
|
||||
* subject to the maxsize and minsize parameters.
|
||||
*/
|
||||
if (asize % mssize != 0) {
|
||||
vdevsize1 = rounddown2(asize, mssize) + VDEV_LABEL_SPACE;
|
||||
if (vdevsize1 < fsopts->minsize)
|
||||
vdevsize1 = roundup2(asize, mssize) + VDEV_LABEL_SPACE;
|
||||
if (vdevsize1 <= fsopts->maxsize)
|
||||
vdevsize = vdevsize1;
|
||||
}
|
||||
asize = vdevsize - VDEV_LABEL_SPACE;
|
||||
|
||||
zfs->asize = asize;
|
||||
zfs->vdevsize = vdevsize;
|
||||
zfs->mssize = mssize;
|
||||
zfs->msshift = flsll(mssize) - 1;
|
||||
zfs->mscount = asize / mssize;
|
||||
}
|
||||
|
||||
/*
 * Validate options and set some default values.
 */
static void
zfs_check_opts(fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;

	zfs = fsopts->fs_specific;

	/* The ZFS backend does not support writing at an offset. */
	if (fsopts->offset != 0)
		errx(1, "unhandled offset option");
	/* An explicit size is mandatory; we cannot infer one. */
	if (fsopts->maxsize == 0)
		errx(1, "an image size must be specified");

	if (zfs->poolname == NULL)
		errx(1, "a pool name must be specified");

	/* Default root dataset mountpoint: "/<poolname>". */
	if (zfs->rootpath == NULL)
		easprintf(&zfs->rootpath, "/%s", zfs->poolname);
	if (zfs->rootpath[0] != '/')
		errx(1, "mountpoint `%s' must be absolute", zfs->rootpath);

	/* Default to 4KB sectors (ashift of 12). */
	if (zfs->ashift == 0)
		zfs->ashift = 12;

	/* Must come after ashift is settled: vdev sizing depends on it. */
	zfs_size_vdev(fsopts);
}
|
||||
|
||||
/*
 * Release all option-related allocations, including the per-dataset
 * descriptors accumulated while parsing "-o fs" arguments.
 */
void
zfs_cleanup_opts(fsinfo_t *fsopts)
{
	struct dataset_desc *d, *tmp;
	zfs_opt_t *zfs;

	zfs = fsopts->fs_specific;
	free(zfs->rootpath);
	free(zfs->bootfs);
	/* poolname is stored as const char *; cast away the qualifier. */
	free(__DECONST(void *, zfs->poolname));
	/* _SAFE variant: each element is freed while iterating. */
	STAILQ_FOREACH_SAFE(d, &zfs->datasetdescs, next, tmp) {
		free(d->params);
		free(d);
	}
	free(zfs);
	free(fsopts->fs_options);
}
|
||||
|
||||
static size_t
|
||||
nvlist_size(const nvlist_t *nvl)
|
||||
{
|
||||
return (sizeof(nvl->nv_header) + nvl->nv_size);
|
||||
}
|
||||
|
||||
static void
|
||||
nvlist_copy(const nvlist_t *nvl, char *buf, size_t sz)
|
||||
{
|
||||
assert(sz >= nvlist_size(nvl));
|
||||
|
||||
memcpy(buf, &nvl->nv_header, sizeof(nvl->nv_header));
|
||||
memcpy(buf + sizeof(nvl->nv_header), nvl->nv_data, nvl->nv_size);
|
||||
}
|
||||
|
||||
/*
 * Build the pool-level configuration nvlist: identity (name, GUIDs, txg,
 * on-disk version, exported state) and an empty features-for-read set.
 * Callers attach a vdev tree before exporting it to a label or to the MOS
 * config object.
 */
static nvlist_t *
pool_config_nvcreate(zfs_opt_t *zfs)
{
	nvlist_t *featuresnv, *poolnv;

	poolnv = nvlist_create(NV_UNIQUE_NAME);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_TXG, TXG);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VERSION, SPA_VERSION);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_EXPORTED);
	nvlist_add_string(poolnv, ZPOOL_CONFIG_POOL_NAME, zfs->poolname);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_GUID, zfs->poolguid);
	/* A single top-level vdev: top GUID and leaf GUID coincide. */
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_TOP_GUID, zfs->vdevguid);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VDEV_CHILDREN, 1);

	/* No optional features are enabled; the set is deliberately empty. */
	featuresnv = nvlist_create(NV_UNIQUE_NAME);
	nvlist_add_nvlist(poolnv, ZPOOL_CONFIG_FEATURES_FOR_READ, featuresnv);
	nvlist_destroy(featuresnv);

	return (poolnv);
}
|
||||
|
||||
/*
 * Describe the single disk vdev backing the pool.  The device path is a
 * placeholder ("/dev/null") since the image is not attached to a real device.
 * The metaslab array object must already have been allocated.
 */
static nvlist_t *
pool_disk_vdev_config_nvcreate(zfs_opt_t *zfs)
{
	nvlist_t *diskvdevnv;

	assert(zfs->objarrid != 0);

	diskvdevnv = nvlist_create(NV_UNIQUE_NAME);
	nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASHIFT, zfs->ashift);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASIZE, zfs->asize);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ID, 0);
	nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_PATH, "/dev/null");
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_WHOLE_DISK, 1);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
	/* Point at the metaslab (space map) object array in the MOS. */
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_ARRAY,
	    zfs->objarrid);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_SHIFT,
	    zfs->msshift);

	return (diskvdevnv);
}
|
||||
|
||||
/*
 * Build the full vdev tree rooted at the "root" meta-vdev, with the single
 * disk vdev as its only child.  Used for the MOS config object; vdev labels
 * use the disk vdev config directly.
 */
static nvlist_t *
pool_root_vdev_config_nvcreate(zfs_opt_t *zfs)
{
	nvlist_t *diskvdevnv, *rootvdevnv;

	diskvdevnv = pool_disk_vdev_config_nvcreate(zfs);
	rootvdevnv = nvlist_create(NV_UNIQUE_NAME);

	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_ID, 0);
	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_GUID, zfs->poolguid);
	nvlist_add_string(rootvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
	nvlist_add_nvlist_array(rootvdevnv, ZPOOL_CONFIG_CHILDREN, &diskvdevnv,
	    1);
	/* The array call copies the child, so the original can be freed. */
	nvlist_destroy(diskvdevnv);

	return (rootvdevnv);
}
|
||||
|
||||
/*
 * Create the pool's "config" object, which contains an nvlist describing pool
 * parameters and the vdev topology.  It is similar but not identical to the
 * nvlist stored in vdev labels.  The main difference is that vdev labels do not
 * describe the full vdev tree and in particular do not contain the "root"
 * meta-vdev.
 */
static void
pool_init_objdir_config(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	dnode_phys_t *dnode;
	nvlist_t *poolconfig, *vdevconfig;
	void *configbuf;
	uint64_t dnid;
	off_t configloc, configblksz;
	int error;

	/* The dnode's bonus buffer records the packed nvlist's size. */
	dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_PACKED_NVLIST,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof(uint64_t), &dnid);

	poolconfig = pool_config_nvcreate(zfs);

	vdevconfig = pool_root_vdev_config_nvcreate(zfs);
	nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
	nvlist_destroy(vdevconfig);

	error = nvlist_export(poolconfig);
	if (error != 0)
		errc(1, error, "nvlist_export");

	/* Pack the nvlist into a freshly allocated MOS block and write it. */
	configblksz = nvlist_size(poolconfig);
	configloc = objset_space_alloc(zfs, zfs->mos, &configblksz);
	configbuf = ecalloc(1, configblksz);
	nvlist_copy(poolconfig, configbuf, configblksz);

	vdev_pwrite_dnode_data(zfs, dnode, configbuf, configblksz, configloc);

	dnode->dn_datablkszsec = configblksz >> MINBLOCKSHIFT;
	dnode->dn_flags = DNODE_FLAG_USED_BYTES;
	*(uint64_t *)DN_BONUS(dnode) = nvlist_size(poolconfig);

	/* Make the config object reachable from the object directory. */
	zap_add_uint64(objdir, DMU_POOL_CONFIG, dnid);

	nvlist_destroy(poolconfig);
	free(configbuf);
}
|
||||
|
||||
/*
|
||||
* Add objects block pointer list objects, used for deferred frees. We don't do
|
||||
* anything with them, but they need to be present or OpenZFS will refuse to
|
||||
* import the pool.
|
||||
*/
|
||||
static void
|
||||
pool_init_objdir_bplists(zfs_opt_t *zfs __unused, zfs_zap_t *objdir)
|
||||
{
|
||||
uint64_t dnid;
|
||||
|
||||
(void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
|
||||
BPOBJ_SIZE_V2, &dnid);
|
||||
zap_add_uint64(objdir, DMU_POOL_FREE_BPOBJ, dnid);
|
||||
|
||||
(void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
|
||||
BPOBJ_SIZE_V2, &dnid);
|
||||
zap_add_uint64(objdir, DMU_POOL_SYNC_BPLIST, dnid);
|
||||
}
|
||||
|
||||
/*
 * Add required feature metadata objects.  We don't know anything about ZFS
 * features, so the objects are just empty ZAPs.
 */
static void
pool_init_objdir_feature_maps(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	dnode_phys_t *dnode;
	uint64_t dnid;

	/* Each map is allocated, linked into the objdir, and written empty. */
	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_READ, dnid);
	zap_write(zfs, zap_alloc(zfs->mos, dnode));

	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_WRITE, dnid);
	zap_write(zfs, zap_alloc(zfs->mos, dnode));

	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FEATURE_DESCRIPTIONS, dnid);
	zap_write(zfs, zap_alloc(zfs->mos, dnode));
}
|
||||
|
||||
/*
 * Link the root DSL directory into the object directory so the DSL hierarchy
 * is reachable from the MOS.
 */
static void
pool_init_objdir_dsl(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	zap_add_uint64(objdir, DMU_POOL_ROOT_DATASET,
	    dsl_dir_id(zfs->rootdsldir));
}
|
||||
|
||||
/*
 * Allocate the pool properties ZAP and link it into the object directory.
 * The ZAP is left open in zfs->poolprops so properties can be added later;
 * it is finally written in pool_fini().
 */
static void
pool_init_objdir_poolprops(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	dnode_phys_t *dnode;
	uint64_t id;

	dnode = objset_dnode_alloc(zfs->mos, DMU_OT_POOL_PROPS, &id);
	zap_add_uint64(objdir, DMU_POOL_PROPS, id);

	zfs->poolprops = zap_alloc(zfs->mos, dnode);
}
|
||||
|
||||
/*
 * Initialize the MOS object directory, the root of virtually all of the pool's
 * data and metadata.
 */
static void
pool_init_objdir(zfs_opt_t *zfs)
{
	zfs_zap_t *zap;
	dnode_phys_t *objdir;

	/* pool_init() already allocated this dnode at a fixed object ID. */
	objdir = objset_dnode_lookup(zfs->mos, DMU_POOL_DIRECTORY_OBJECT);

	zap = zap_alloc(zfs->mos, objdir);
	pool_init_objdir_config(zfs, zap);
	pool_init_objdir_bplists(zfs, zap);
	pool_init_objdir_feature_maps(zfs, zap);
	pool_init_objdir_dsl(zfs, zap);
	pool_init_objdir_poolprops(zfs, zap);
	zap_write(zfs, zap);
}
|
||||
|
||||
/*
|
||||
* Initialize the meta-object set (MOS) and immediately write out several
|
||||
* special objects whose contents are already finalized, including the object
|
||||
* directory.
|
||||
*
|
||||
* Once the MOS is finalized, it'll look roughly like this:
|
||||
*
|
||||
* object directory (ZAP)
|
||||
* |-> vdev config object (nvlist)
|
||||
* |-> features for read
|
||||
* |-> features for write
|
||||
* |-> feature descriptions
|
||||
* |-> sync bplist
|
||||
* |-> free bplist
|
||||
* |-> pool properties
|
||||
* L-> root DSL directory
|
||||
* |-> DSL child directory (ZAP)
|
||||
* | |-> $MOS (DSL dir)
|
||||
* | | |-> child map
|
||||
* | | L-> props (ZAP)
|
||||
* | |-> $FREE (DSL dir)
|
||||
* | | |-> child map
|
||||
* | | L-> props (ZAP)
|
||||
* | |-> $ORIGIN (DSL dir)
|
||||
* | | |-> child map
|
||||
* | | |-> dataset
|
||||
* | | | L-> deadlist
|
||||
* | | |-> snapshot
|
||||
* | | | |-> deadlist
|
||||
* | | | L-> snapshot names
|
||||
* | | |-> props (ZAP)
|
||||
* | | L-> clones (ZAP)
|
||||
* | |-> dataset 1 (DSL dir)
|
||||
* | | |-> DSL dataset
|
||||
* | | | |-> snapshot names
|
||||
* | | | L-> deadlist
|
||||
* | | |-> child map
|
||||
* | | | L-> ...
|
||||
* | | L-> props
|
||||
* | |-> dataset 2
|
||||
* | | L-> ...
|
||||
* | |-> ...
|
||||
* | L-> dataset n
|
||||
* |-> DSL root dataset
|
||||
* | |-> snapshot names
|
||||
* | L-> deadlist
|
||||
* L-> props (ZAP)
|
||||
* space map object array
|
||||
* |-> space map 1
|
||||
* |-> space map 2
|
||||
* |-> ...
|
||||
* L-> space map n (zfs->mscount)
|
||||
*
|
||||
* The space map object array is pointed to by the "msarray" property in the
|
||||
* pool configuration.
|
||||
*/
|
||||
static void
pool_init(zfs_opt_t *zfs)
{
	uint64_t dnid;

	/*
	 * GUIDs come from random(), which zfs_makefs() seeds with a fixed
	 * value so that images are reproducible.
	 */
	zfs->poolguid = ((uint64_t)random() << 32) | random();
	zfs->vdevguid = ((uint64_t)random() << 32) | random();

	zfs->mos = objset_alloc(zfs, DMU_OST_META);

	/* The object directory must land at its well-known object ID. */
	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_DIRECTORY, &dnid);
	assert(dnid == DMU_POOL_DIRECTORY_OBJECT);

	/* Metaslab (space map) object array, referenced by the vdev config. */
	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_ARRAY, &zfs->objarrid);

	dsl_init(zfs);

	pool_init_objdir(zfs);
}
|
||||
|
||||
/*
 * Populate a vdev label with the config nvlist and an array of identical
 * uberblocks, then write it to all four standard label locations.
 */
static void
pool_labels_write(zfs_opt_t *zfs)
{
	uberblock_t *ub;
	vdev_label_t *label;
	nvlist_t *poolconfig, *vdevconfig;
	int error;

	label = ecalloc(1, sizeof(*label));

	/*
	 * Assemble the vdev configuration and store it in the label.
	 */
	poolconfig = pool_config_nvcreate(zfs);
	vdevconfig = pool_disk_vdev_config_nvcreate(zfs);
	nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
	nvlist_destroy(vdevconfig);

	error = nvlist_export(poolconfig);
	if (error != 0)
		errc(1, error, "nvlist_export");
	nvlist_copy(poolconfig, label->vl_vdev_phys.vp_nvlist,
	    sizeof(label->vl_vdev_phys.vp_nvlist));
	nvlist_destroy(poolconfig);

	/*
	 * Fill out the uberblock.  Just make each one the same.  The embedded
	 * checksum is calculated in vdev_label_write().
	 */
	for (size_t uoff = 0; uoff < sizeof(label->vl_uberblock);
	    uoff += (1 << zfs->ashift)) {
		ub = (uberblock_t *)(&label->vl_uberblock[0] + uoff);
		ub->ub_magic = UBERBLOCK_MAGIC;
		ub->ub_version = SPA_VERSION;
		ub->ub_txg = TXG;
		ub->ub_guid_sum = zfs->poolguid + zfs->vdevguid;
		ub->ub_timestamp = 0;	/* zero keeps the image reproducible */

		ub->ub_software_version = SPA_VERSION;
		ub->ub_mmp_magic = MMP_MAGIC;
		ub->ub_mmp_delay = 0;
		ub->ub_mmp_config = 0;
		ub->ub_checkpoint_txg = 0;
		/* Every uberblock points at the finalized MOS root. */
		objset_root_blkptr_copy(zfs->mos, &ub->ub_rootbp);
	}

	/*
	 * Write out four copies of the label: two at the beginning of the vdev
	 * and two at the end.
	 */
	for (int i = 0; i < VDEV_LABELS; i++)
		vdev_label_write(zfs, i, label);

	free(label);
}
|
||||
|
||||
/*
 * Finalize the pool: flush the pool properties ZAP and DSL state, write out
 * the MOS, and stamp the vdev labels last, since the uberblocks embed the
 * final MOS root block pointer.
 */
static void
pool_fini(zfs_opt_t *zfs)
{
	zap_write(zfs, zfs->poolprops);
	dsl_write(zfs);
	objset_write(zfs, zfs->mos);
	pool_labels_write(zfs);
}
|
||||
|
||||
/*
 * Set up a cursor for writing a dnode's data blocks in order, choosing a data
 * block size (when "blksz" is zero, one is derived from "size") and
 * pre-allocating space for however many indirect blocks the object will need.
 */
struct dnode_cursor *
dnode_cursor_init(zfs_opt_t *zfs, zfs_objset_t *os, dnode_phys_t *dnode,
    off_t size, off_t blksz)
{
	struct dnode_cursor *c;
	uint64_t nbppindir, indlevel, ndatablks, nindblks;

	assert(dnode->dn_nblkptr == 1);
	assert(blksz <= MAXBLOCKSIZE);

	if (blksz == 0) {
		/* Must be between 1<<ashift and 128KB. */
		blksz = MIN(MAXBLOCKSIZE, MAX(1 << zfs->ashift,
		    powerof2(size) ? size : (1ul << flsll(size))));
	}
	assert(powerof2(blksz));

	/*
	 * Do we need indirect blocks?  Figure out how many levels are needed
	 * (indlevel == 1 means no indirect blocks) and how much space is needed
	 * (it has to be allocated up-front to break the dependency cycle
	 * described in objset_write()).
	 */
	ndatablks = size == 0 ? 0 : howmany(size, blksz);
	nindblks = 0;
	for (indlevel = 1, nbppindir = 1; ndatablks > nbppindir; indlevel++) {
		nbppindir *= BLKPTR_PER_INDIR;
		nindblks += howmany(ndatablks, indlevel * nbppindir);
	}
	assert(indlevel < INDIR_LEVELS);

	dnode->dn_nlevels = (uint8_t)indlevel;
	dnode->dn_maxblkid = ndatablks > 0 ? ndatablks - 1 : 0;
	dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;

	c = ecalloc(1, sizeof(*c));
	if (nindblks > 0) {
		/* Indirect blocks are always written at MAXBLOCKSIZE each. */
		c->indspace = nindblks * MAXBLOCKSIZE;
		c->indloc = objset_space_alloc(zfs, os, &c->indspace);
	}
	c->dnode = dnode;
	c->dataoff = 0;
	c->datablksz = blksz;

	return (c);
}
|
||||
|
||||
/*
 * Write out the lowest "levels" levels of the cursor's accumulated indirect
 * blocks, recording each block's pointer (with its fill count) in the parent
 * level, or in the dnode itself at the top level.
 */
static void
_dnode_cursor_flush(zfs_opt_t *zfs, struct dnode_cursor *c, int levels)
{
	blkptr_t *bp, *pbp;
	void *buf;
	uint64_t fill;
	off_t blkid, blksz, loc;

	assert(levels > 0);
	assert(levels <= c->dnode->dn_nlevels - 1);

	blksz = MAXBLOCKSIZE;
	blkid = (c->dataoff / c->datablksz) / BLKPTR_PER_INDIR;
	for (int level = 1; level <= levels; level++) {
		buf = c->inddir[level - 1];

		if (level == c->dnode->dn_nlevels - 1) {
			/* Topmost level: the parent slot is in the dnode. */
			pbp = &c->dnode->dn_blkptr[0];
		} else {
			uint64_t iblkid;

			iblkid = blkid & (BLKPTR_PER_INDIR - 1);
			pbp = (blkptr_t *)
			    &c->inddir[level][iblkid * sizeof(blkptr_t)];
		}

		/*
		 * Space for indirect blocks is allocated up-front; see the
		 * comment in objset_write().
		 */
		loc = c->indloc;
		c->indloc += blksz;
		assert(c->indspace >= blksz);
		c->indspace -= blksz;

		/* Sum the child fill counts for this block's blkptr. */
		bp = buf;
		fill = 0;
		for (size_t i = 0; i < BLKPTR_PER_INDIR; i++)
			fill += BP_GET_FILL(&bp[i]);

		vdev_pwrite_dnode_indir(zfs, c->dnode, level, fill, buf, blksz,
		    loc, pbp);
		/* Reset the buffer for the next batch of block pointers. */
		memset(buf, 0, MAXBLOCKSIZE);

		blkid /= BLKPTR_PER_INDIR;
	}
}
|
||||
|
||||
/*
 * Return the level-1 block pointer slot for the data block at offset "off",
 * first flushing any indirect blocks that became full.  Callers are expected
 * to advance "off" monotonically; the cursor only tracks forward progress.
 */
blkptr_t *
dnode_cursor_next(zfs_opt_t *zfs, struct dnode_cursor *c, off_t off)
{
	off_t blkid, l1id;
	int levels;

	if (c->dnode->dn_nlevels == 1) {
		/* No indirect blocks: the dnode's blkptr is the only slot. */
		assert(off < MAXBLOCKSIZE);
		return (&c->dnode->dn_blkptr[0]);
	}

	assert(off % c->datablksz == 0);

	/* Do we need to flush any full indirect blocks? */
	if (off > 0) {
		blkid = off / c->datablksz;
		/* Count how many levels just crossed a block boundary. */
		for (levels = 0; levels < c->dnode->dn_nlevels - 1; levels++) {
			if (blkid % BLKPTR_PER_INDIR != 0)
				break;
			blkid /= BLKPTR_PER_INDIR;
		}
		if (levels > 0)
			_dnode_cursor_flush(zfs, c, levels);
	}

	c->dataoff = off;
	l1id = (off / c->datablksz) & (BLKPTR_PER_INDIR - 1);
	return ((blkptr_t *)&c->inddir[0][l1id * sizeof(blkptr_t)]);
}
|
||||
|
||||
/*
 * Flush all remaining indirect-block levels and free the cursor.  By this
 * point every byte of pre-allocated indirect space must have been consumed.
 */
void
dnode_cursor_finish(zfs_opt_t *zfs, struct dnode_cursor *c)
{
	int levels;

	levels = c->dnode->dn_nlevels - 1;
	if (levels > 0)
		_dnode_cursor_flush(zfs, c, levels);
	assert(c->indspace == 0);
	free(c);
}
|
||||
|
||||
/*
 * makefs(8) entry point for ZFS: build a single-vdev pool image at "image"
 * from the staged directory tree rooted at "dir"/"root".
 */
void
zfs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	int dirfd;

	zfs = fsopts->fs_specific;

	/*
	 * Use a fixed seed to provide reproducible pseudo-random numbers for
	 * on-disk structures when needed (e.g., GUIDs, ZAP hash salts).
	 */
	srandom(1729);

	zfs_check_opts(fsopts);

	if (!zfs->nowarn) {
		fprintf(stderr,
		    "ZFS support is currently considered experimental. "
		    "Do not use it for anything critical.\n");
	}

	dirfd = open(dir, O_DIRECTORY | O_RDONLY);
	if (dirfd < 0)
		err(1, "open(%s)", dir);

	vdev_init(zfs, image);
	pool_init(zfs);
	/* NOTE(review): dirfd is handed to fs_build(); confirm it closes it. */
	fs_build(zfs, dirfd, root);
	pool_fini(zfs);
	vdev_fini(zfs);
}
|
12
usr.sbin/makefs/zfs/Makefile.inc
Normal file
12
usr.sbin/makefs/zfs/Makefile.inc
Normal file
|
@ -0,0 +1,12 @@
|
|||
# makefs -t zfs sources.  nvlist.c is shared with the boot loader's libsa
# and so is picked up from stand/libsa/zfs.
.PATH:	${SRCDIR}/zfs
.PATH:	${SRCTOP}/stand/libsa/zfs

SRCS+=	dsl.c \
	fs.c \
	objset.c \
	vdev.c \
	zap.c

SRCS+=	nvlist.c

# nvlist.c needs the libsa headers and is built with -Wno-cast-qual.
CFLAGS.nvlist.c+= -I${SRCTOP}/stand/libsa -Wno-cast-qual
|
598
usr.sbin/makefs/zfs/dsl.c
Normal file
598
usr.sbin/makefs/zfs/dsl.c
Normal file
|
@ -0,0 +1,598 @@
|
|||
/*-
|
||||
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
|
||||
*
|
||||
* Copyright (c) 2022 The FreeBSD Foundation
|
||||
*
|
||||
* This software was developed by Mark Johnston under sponsorship from
|
||||
* the FreeBSD Foundation.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <util.h>
|
||||
|
||||
#include "makefs.h"
|
||||
#include "zfs.h"
|
||||
|
||||
/*
 * In-memory representation of a DSL dataset: the head of an object set
 * belonging to a DSL directory.
 */
typedef struct zfs_dsl_dataset {
	zfs_objset_t *os;		/* referenced objset, may be null */
	dsl_dataset_phys_t *phys;	/* on-disk representation */
	uint64_t dsid;			/* DSL dataset dnode */

	struct zfs_dsl_dir *dir;	/* containing parent */
} zfs_dsl_dataset_t;

typedef STAILQ_HEAD(zfs_dsl_dir_list, zfs_dsl_dir) zfs_dsl_dir_list_t;

/*
 * In-memory representation of a DSL directory: one entry in the pool's
 * dataset namespace, together with its property and child ZAPs.
 */
typedef struct zfs_dsl_dir {
	char *fullname;			/* full dataset name */
	char *name;			/* basename(fullname) */
	dsl_dir_phys_t *phys;		/* on-disk representation */
	nvlist_t *propsnv;		/* properties saved in propszap */

	zfs_dsl_dataset_t *headds;	/* principal dataset, may be null */

	uint64_t dirid;			/* DSL directory dnode */
	zfs_zap_t *propszap;		/* dataset properties */
	zfs_zap_t *childzap;		/* child directories */

	/* DSL directory tree linkage. */
	struct zfs_dsl_dir *parent;
	zfs_dsl_dir_list_t children;
	STAILQ_ENTRY(zfs_dsl_dir) next;
} zfs_dsl_dir_t;
|
||||
|
||||
static zfs_dsl_dir_t *dsl_dir_alloc(zfs_opt_t *zfs, const char *name);
|
||||
static zfs_dsl_dataset_t *dsl_dataset_alloc(zfs_opt_t *zfs, zfs_dsl_dir_t *dir);
|
||||
|
||||
/*
 * Look up a string-valued pair in "nvl".  On success, return zero and store a
 * NUL-terminated copy of the value in *retp; the caller owns the copy and
 * must free it.  Otherwise, the error from nvlist_find() is returned and
 * *retp is untouched.
 */
static int
nvlist_find_string(nvlist_t *nvl, const char *key, char **retp)
{
	char *str;
	int error, len;

	error = nvlist_find(nvl, key, DATA_TYPE_STRING, NULL, &str, &len);
	if (error == 0) {
		/* Copy exactly "len" bytes; ecalloc() provides the NUL. */
		*retp = ecalloc(1, len + 1);
		memcpy(*retp, str, len);
	}
	return (error);
}
|
||||
|
||||
/*
 * Look up a uint64-valued pair in "nvl", storing the value in *retp on
 * success.  Returns the error from nvlist_find().
 */
static int
nvlist_find_uint64(nvlist_t *nvl, const char *key, uint64_t *retp)
{
	return (nvlist_find(nvl, key, DATA_TYPE_UINT64, NULL, retp, NULL));
}
|
||||
|
||||
/*
|
||||
* Return an allocated string containing the head dataset's mountpoint,
|
||||
* including the root path prefix.
|
||||
*
|
||||
* If the dataset has a mountpoint property, it is returned. Otherwise we have
|
||||
* to follow ZFS' inheritance rules.
|
||||
*/
|
||||
char *
|
||||
dsl_dir_get_mountpoint(zfs_opt_t *zfs, zfs_dsl_dir_t *dir)
|
||||
{
|
||||
zfs_dsl_dir_t *pdir;
|
||||
char *mountpoint, *origmountpoint;
|
||||
|
||||
if (nvlist_find_string(dir->propsnv, "mountpoint", &mountpoint) == 0) {
|
||||
if (strcmp(mountpoint, "none") == 0)
|
||||
return (NULL);
|
||||
|
||||
/*
|
||||
* nvlist_find_string() does not make a copy.
|
||||
*/
|
||||
mountpoint = estrdup(mountpoint);
|
||||
} else {
|
||||
/*
|
||||
* If we don't have a mountpoint, it's inherited from one of our
|
||||
* ancestors. Walk up the hierarchy until we find it, building
|
||||
* up our mountpoint along the way. The mountpoint property is
|
||||
* always set for the root dataset.
|
||||
*/
|
||||
for (pdir = dir->parent, mountpoint = estrdup(dir->name);;) {
|
||||
origmountpoint = mountpoint;
|
||||
|
||||
if (nvlist_find_string(pdir->propsnv, "mountpoint",
|
||||
&mountpoint) == 0) {
|
||||
easprintf(&mountpoint, "%s%s%s", mountpoint,
|
||||
mountpoint[strlen(mountpoint) - 1] == '/' ?
|
||||
"" : "/", origmountpoint);
|
||||
free(origmountpoint);
|
||||
break;
|
||||
}
|
||||
|
||||
easprintf(&mountpoint, "%s/%s", pdir->name,
|
||||
origmountpoint);
|
||||
free(origmountpoint);
|
||||
pdir = pdir->parent;
|
||||
}
|
||||
}
|
||||
assert(mountpoint[0] == '/');
|
||||
assert(strstr(mountpoint, zfs->rootpath) == mountpoint);
|
||||
|
||||
return (mountpoint);
|
||||
}
|
||||
|
||||
/*
 * Fetch the directory's "canmount" property value, if set.  Returns zero on
 * success with the value stored in *canmountp.
 */
int
dsl_dir_get_canmount(zfs_dsl_dir_t *dir, uint64_t *canmountp)
{
	return (nvlist_find_uint64(dir->propsnv, "canmount", canmountp));
}
|
||||
|
||||
/*
 * Handle dataset properties that we know about; stash them into an nvlist to be
 * written later to the properties ZAP object.
 *
 * If the set of properties we handle grows too much, we should probably explore
 * using libzfs to manage them.
 */
static void
dsl_dir_set_prop(zfs_opt_t *zfs, zfs_dsl_dir_t *dir, const char *key,
    const char *val)
{
	nvlist_t *nvl;

	nvl = dir->propsnv;
	if (val == NULL || val[0] == '\0')
		errx(1, "missing value for property `%s'", key);
	/* Each property may be specified at most once per dataset. */
	if (nvpair_find(nvl, key) != NULL)
		errx(1, "property `%s' already set", key);

	if (strcmp(key, "mountpoint") == 0) {
		if (strcmp(val, "none") != 0) {
			if (val[0] != '/')
				errx(1, "mountpoint `%s' is not absolute", val);
			/*
			 * The mountpoint must equal the root path, or be a
			 * path strictly below it (unless the root is "/").
			 */
			if (strcmp(val, zfs->rootpath) != 0 &&
			    strcmp(zfs->rootpath, "/") != 0 &&
			    (strstr(val, zfs->rootpath) != val ||
			    val[strlen(zfs->rootpath)] != '/')) {
				errx(1, "mountpoint `%s' is not prefixed by "
				    "the root path `%s'", val, zfs->rootpath);
			}
		}
		nvlist_add_string(nvl, key, val);
	} else if (strcmp(key, "atime") == 0 || strcmp(key, "exec") == 0 ||
	    strcmp(key, "setuid") == 0) {
		/* Boolean properties are stored as 0/1. */
		if (strcmp(val, "on") == 0)
			nvlist_add_uint64(nvl, key, 1);
		else if (strcmp(val, "off") == 0)
			nvlist_add_uint64(nvl, key, 0);
		else
			errx(1, "invalid value `%s' for %s", val, key);
	} else if (strcmp(key, "canmount") == 0) {
		/* 2 is the on-disk encoding used for "noauto" here. */
		if (strcmp(val, "noauto") == 0)
			nvlist_add_uint64(nvl, key, 2);
		else if (strcmp(val, "on") == 0)
			nvlist_add_uint64(nvl, key, 1);
		else if (strcmp(val, "off") == 0)
			nvlist_add_uint64(nvl, key, 0);
		else
			errx(1, "invalid value `%s' for %s", val, key);
	} else {
		errx(1, "unknown property `%s'", key);
	}
}
|
||||
|
||||
/*
 * Create a DSL directory named "<pool>/<name>".  Used for the $MOS, $FREE and
 * $ORIGIN metadata directories.
 */
static zfs_dsl_dir_t *
dsl_metadir_alloc(zfs_opt_t *zfs, const char *name)
{
	zfs_dsl_dir_t *dir;
	char *path;

	easprintf(&path, "%s/%s", zfs->poolname, name);
	dir = dsl_dir_alloc(zfs, path);
	/* dsl_dir_alloc() copies the name, so the scratch path is freed. */
	free(path);
	return (dir);
}
|
||||
|
||||
/*
 * Set up the $ORIGIN metadata directory together with its origin and snapshot
 * datasets and the clones ZAP linked from dd_clones.
 */
static void
dsl_origindir_init(zfs_opt_t *zfs)
{
	dnode_phys_t *clones;
	uint64_t clonesid;

	zfs->origindsldir = dsl_metadir_alloc(zfs, "$ORIGIN");
	zfs->originds = dsl_dataset_alloc(zfs, zfs->origindsldir);
	zfs->snapds = dsl_dataset_alloc(zfs, zfs->origindsldir);

	clones = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_CLONES, &clonesid);
	zfs->cloneszap = zap_alloc(zfs->mos, clones);
	zfs->origindsldir->phys->dd_clones = clonesid;
}
|
||||
|
||||
/*
 * Build the DSL hierarchy: the root DSL directory and dataset, the standard
 * metadata directories ($MOS, $FREE, $ORIGIN), plus one directory/dataset per
 * user-specified dataset descriptor, with any per-dataset properties applied.
 */
void
dsl_init(zfs_opt_t *zfs)
{
	zfs_dsl_dir_t *dir;
	struct dataset_desc *d;
	const char *dspropdelim;

	/* Dataset parameters are separated by ';' on the command line. */
	dspropdelim = ";";

	zfs->rootdsldir = dsl_dir_alloc(zfs, NULL);

	/* No compression is used for makefs-created pools. */
	nvlist_add_uint64(zfs->rootdsldir->propsnv, "compression",
	    ZIO_COMPRESS_OFF);

	zfs->rootds = dsl_dataset_alloc(zfs, zfs->rootdsldir);
	zfs->rootdsldir->headds = zfs->rootds;

	zfs->mosdsldir = dsl_metadir_alloc(zfs, "$MOS");
	zfs->freedsldir = dsl_metadir_alloc(zfs, "$FREE");
	dsl_origindir_init(zfs);

	/*
	 * Go through the list of user-specified datasets and create DSL objects
	 * for them.
	 */
	STAILQ_FOREACH(d, &zfs->datasetdescs, next) {
		char *dsname, *next, *params, *param, *nextparam;

		params = d->params;
		dsname = strsep(&params, dspropdelim);

		if (strcmp(dsname, zfs->poolname) == 0) {
			/*
			 * This is the root dataset; it's already created, so
			 * we're just setting options.
			 */
			dir = zfs->rootdsldir;
		} else {
			/*
			 * This dataset must be a child of the root dataset.
			 */
			if (strstr(dsname, zfs->poolname) != dsname ||
			    (next = strchr(dsname, '/')) == NULL ||
			    (size_t)(next - dsname) != strlen(zfs->poolname)) {
				errx(1, "dataset `%s' must be a child of `%s'",
				    dsname, zfs->poolname);
			}
			dir = dsl_dir_alloc(zfs, dsname);
			dir->headds = dsl_dataset_alloc(zfs, dir);
		}

		/* Remaining tokens are key=value property assignments. */
		for (nextparam = param = params; nextparam != NULL;) {
			char *key, *val;

			param = strsep(&nextparam, dspropdelim);

			key = val = param;
			key = strsep(&val, "=");
			dsl_dir_set_prop(zfs, dir, key, val);
		}
	}

	/*
	 * Set the root dataset's mount point if the user didn't override the
	 * default.
	 */
	if (nvpair_find(zfs->rootdsldir->propsnv, "mountpoint") == NULL) {
		nvlist_add_string(zfs->rootdsldir->propsnv, "mountpoint",
		    zfs->rootpath);
	}
}
|
||||
|
||||
/* Return the MOS dnode ID of the DSL directory. */
uint64_t
dsl_dir_id(zfs_dsl_dir_t *dir)
{
	return (dir->dirid);
}
|
||||
|
||||
/* Return the MOS dnode ID of the directory's head dataset. */
uint64_t
dsl_dir_dataset_id(zfs_dsl_dir_t *dir)
{
	return (dir->headds->dsid);
}
|
||||
|
||||
/*
 * Apply "cb" to "dsldir" and all of its descendants, visiting children before
 * their parent (post-order).
 */
static void
dsl_dir_foreach_post(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir,
    void (*cb)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *arg)
{
	zfs_dsl_dir_t *cdsldir;

	STAILQ_FOREACH(cdsldir, &dsldir->children, next) {
		dsl_dir_foreach_post(zfs, cdsldir, cb, arg);
	}
	cb(zfs, dsldir, arg);
}
|
||||
|
||||
/*
 * Used when the caller doesn't care about the order one way or another.
 * Currently delegates to the post-order traversal.
 */
void
dsl_dir_foreach(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir,
    void (*cb)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *arg)
{
	dsl_dir_foreach_post(zfs, dsldir, cb, arg);
}
|
||||
|
||||
/* Return the directory's full dataset name, e.g. "pool/a/b". */
const char *
dsl_dir_fullname(const zfs_dsl_dir_t *dir)
{
	return (dir->fullname);
}
|
||||
|
||||
/*
 * Create a DSL directory, which is effectively an entry in the ZFS namespace.
 * We always create a root DSL directory, whose name is the pool's name, and
 * several metadata directories.
 *
 * Each directory has two ZAP objects, one pointing to child directories, and
 * one for properties (which are inherited by children unless overridden).
 * Directories typically reference a DSL dataset, the "head dataset", which
 * points to an object set.
 *
 * "name" is the full dataset name including the pool name (e.g. "pool/a/b"),
 * or NULL to create the root DSL directory.  Exits via errx() if an
 * intermediate component does not already exist.
 */
static zfs_dsl_dir_t *
dsl_dir_alloc(zfs_opt_t *zfs, const char *name)
{
	zfs_dsl_dir_list_t l, *lp;
	zfs_dsl_dir_t *dir, *parent;
	dnode_phys_t *dnode;
	char *dirname, *nextdir, *origname;
	uint64_t childid, propsid;

	dir = ecalloc(1, sizeof(*dir));

	/* The directory's own dnode, with dsl_dir_phys_t in the bonus area. */
	dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_DSL_DIR,
	    DMU_OT_DSL_DIR, sizeof(dsl_dir_phys_t), &dir->dirid);
	dir->phys = (dsl_dir_phys_t *)DN_BONUS(dnode);

	/* Properties ZAP, filled in later by dsl_dir_finalize_props(). */
	dnode = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_PROPS, &propsid);
	dir->propszap = zap_alloc(zfs->mos, dnode);

	/* Child-map ZAP, mapping child names to their directory IDs. */
	dnode = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_DIR_CHILD_MAP,
	    &childid);
	dir->childzap = zap_alloc(zfs->mos, dnode);

	dir->propsnv = nvlist_create(NV_UNIQUE_NAME);
	STAILQ_INIT(&dir->children);

	dir->phys->dd_child_dir_zapobj = childid;
	dir->phys->dd_props_zapobj = propsid;

	if (name == NULL) {
		/*
		 * This is the root DSL directory.
		 */
		dir->name = estrdup(zfs->poolname);
		dir->fullname = estrdup(zfs->poolname);
		dir->parent = NULL;
		dir->phys->dd_parent_obj = 0;

		assert(zfs->rootdsldir == NULL);
		zfs->rootdsldir = dir;
		return (dir);
	}

	/*
	 * Insert the new directory into the hierarchy.  Currently this must be
	 * done in order, e.g., when creating pool/a/b, pool/a must already
	 * exist.
	 *
	 * NOTE(review): the walk below assumes "name" contains at least one
	 * '/' (it always starts with the pool name); otherwise "parent" would
	 * be used uninitialized — confirm callers always pass full names.
	 */
	STAILQ_INIT(&l);
	STAILQ_INSERT_HEAD(&l, zfs->rootdsldir, next);
	origname = dirname = nextdir = estrdup(name);
	for (lp = &l;; lp = &parent->children) {
		dirname = strsep(&nextdir, "/");
		if (nextdir == NULL)
			break;

		/* Look up the current component in the current sibling list. */
		STAILQ_FOREACH(parent, lp, next) {
			if (strcmp(parent->name, dirname) == 0)
				break;
		}
		if (parent == NULL) {
			errx(1, "no parent at `%s' for filesystem `%s'",
			    dirname, name);
		}
	}

	/* "dirname" is now the final path component. */
	dir->fullname = estrdup(name);
	dir->name = estrdup(dirname);
	free(origname);
	STAILQ_INSERT_TAIL(lp, dir, next);
	zap_add_uint64(parent->childzap, dir->name, dir->dirid);

	dir->parent = parent;
	dir->phys->dd_parent_obj = parent->dirid;
	return (dir);
}
|
||||
|
||||
void
|
||||
dsl_dir_size_set(zfs_dsl_dir_t *dir, uint64_t bytes)
|
||||
{
|
||||
dir->phys->dd_used_bytes = bytes;
|
||||
dir->phys->dd_compressed_bytes = bytes;
|
||||
dir->phys->dd_uncompressed_bytes = bytes;
|
||||
}
|
||||
|
||||
/*
 * Convert dataset properties into entries in the DSL directory's properties
 * ZAP.
 *
 * The properties were accumulated in an nvlist (dir->propsnv); this walks
 * the nvlist's on-disk encoding directly, so the pointer arithmetic below
 * mirrors the nvpair memory layout used by the boot-loader nvlist code.
 */
static void
dsl_dir_finalize_props(zfs_dsl_dir_t *dir)
{
	for (nvp_header_t *nvh = NULL;
	    (nvh = nvlist_next_nvpair(dir->propsnv, nvh)) != NULL;) {
		nv_string_t *nvname;
		nv_pair_data_t *nvdata;
		const char *name;

		/* The pair's name immediately follows the header... */
		nvname = (nv_string_t *)(nvh + 1);
		/* ...and its data follows the 4-byte-aligned name. */
		nvdata = (nv_pair_data_t *)(&nvname->nv_data[0] +
		    NV_ALIGN4(nvname->nv_size));

		name = nvstring_get(nvname);
		switch (nvdata->nv_type) {
		case DATA_TYPE_UINT64: {
			uint64_t val;

			/* memcpy: the encoded value may be unaligned. */
			memcpy(&val, &nvdata->nv_data[0], sizeof(uint64_t));
			zap_add_uint64(dir->propszap, name, val);
			break;
		}
		case DATA_TYPE_STRING: {
			nv_string_t *nvstr;

			nvstr = (nv_string_t *)&nvdata->nv_data[0];
			zap_add_string(dir->propszap, name,
			    nvstring_get(nvstr));
			break;
		}
		default:
			/* Only uint64 and string properties are supported. */
			assert(0);
		}
	}
}
|
||||
|
||||
/*
 * Finish off a DSL directory: write its properties and child-map ZAPs, link
 * its head dataset (if any) to the $ORIGIN snapshot, and perform space
 * accounting.  Called in post order, so every child's dd_used_bytes is final
 * by the time the parent is visited.
 */
static void
dsl_dir_finalize(zfs_opt_t *zfs, zfs_dsl_dir_t *dir, void *arg __unused)
{
	char key[32];
	zfs_dsl_dir_t *cdir;
	dnode_phys_t *snapnames;
	zfs_dsl_dataset_t *headds;
	zfs_objset_t *os;
	uint64_t bytes, snapnamesid;

	dsl_dir_finalize_props(dir);
	zap_write(zfs, dir->propszap);
	zap_write(zfs, dir->childzap);

	/* Metadata-only directories have no head dataset; nothing more to do. */
	headds = dir->headds;
	if (headds == NULL)
		return;
	os = headds->os;
	if (os == NULL)
		return;

	/* An empty snapshot-names ZAP; makefs never creates snapshots. */
	snapnames = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_DS_SNAP_MAP,
	    &snapnamesid);
	zap_write(zfs, zap_alloc(zfs->mos, snapnames));

	/* Every head dataset is a clone of the $ORIGIN snapshot. */
	dir->phys->dd_head_dataset_obj = headds->dsid;
	dir->phys->dd_clone_parent_obj = zfs->snapds->dsid;
	headds->phys->ds_prev_snap_obj = zfs->snapds->dsid;
	headds->phys->ds_snapnames_zapobj = snapnamesid;
	objset_root_blkptr_copy(os, &headds->phys->ds_bp);

	zfs->snapds->phys->ds_num_children++;
	/* The clones ZAP is keyed by the dataset ID rendered in hex. */
	snprintf(key, sizeof(key), "%jx", (uintmax_t)headds->dsid);
	zap_add_uint64(zfs->cloneszap, key, headds->dsid);

	/* Uncompressed == compressed == used since makefs never compresses. */
	bytes = objset_space(os);
	headds->phys->ds_used_bytes = bytes;
	headds->phys->ds_uncompressed_bytes = bytes;
	headds->phys->ds_compressed_bytes = bytes;

	/* A directory's usage includes that of all of its children. */
	STAILQ_FOREACH(cdir, &dir->children, next)
		bytes += cdir->phys->dd_used_bytes;
	dsl_dir_size_set(dir, bytes);
}
|
||||
|
||||
/*
 * Write out the DSL layer: finalize every DSL directory and dataset, and
 * wire up the $ORIGIN dataset/snapshot bookkeeping that OpenZFS expects.
 */
void
dsl_write(zfs_opt_t *zfs)
{
	zfs_zap_t *snapnameszap;
	dnode_phys_t *snapnames;
	uint64_t snapmapid;

	/*
	 * Perform accounting, starting from the leaves of the DSL directory
	 * tree.  Accounting for $MOS is done later, once we've finished
	 * allocating space.
	 */
	dsl_dir_foreach_post(zfs, zfs->rootdsldir, dsl_dir_finalize, NULL);

	/* $ORIGIN's snapshot-names ZAP holds the lone "$ORIGIN" snapshot. */
	snapnames = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_DS_SNAP_MAP,
	    &snapmapid);
	snapnameszap = zap_alloc(zfs->mos, snapnames);
	zap_add_uint64(snapnameszap, "$ORIGIN", zfs->snapds->dsid);
	zap_write(zfs, snapnameszap);

	zfs->origindsldir->phys->dd_head_dataset_obj = zfs->originds->dsid;
	zfs->originds->phys->ds_prev_snap_obj = zfs->snapds->dsid;
	zfs->originds->phys->ds_snapnames_zapobj = snapmapid;

	zfs->snapds->phys->ds_next_snap_obj = zfs->originds->dsid;
	/* dsl_dir_finalize() must have added at least one clone by now. */
	assert(zfs->snapds->phys->ds_num_children > 0);
	zfs->snapds->phys->ds_num_children++;

	zap_write(zfs, zfs->cloneszap);

	/* XXX-MJ dirs and datasets are leaked */
}
|
||||
|
||||
void
|
||||
dsl_dir_dataset_write(zfs_opt_t *zfs, zfs_objset_t *os, zfs_dsl_dir_t *dir)
|
||||
{
|
||||
dir->headds->os = os;
|
||||
objset_write(zfs, os);
|
||||
}
|
||||
|
||||
bool
|
||||
dsl_dir_has_dataset(zfs_dsl_dir_t *dir)
|
||||
{
|
||||
return (dir->headds != NULL);
|
||||
}
|
||||
|
||||
bool
|
||||
dsl_dir_dataset_has_objset(zfs_dsl_dir_t *dir)
|
||||
{
|
||||
return (dsl_dir_has_dataset(dir) && dir->headds->os != NULL);
|
||||
}
|
||||
|
||||
/*
 * Allocate a DSL dataset belonging to directory "dir": a MOS dnode whose
 * bonus buffer holds the dsl_dataset_phys_t, plus an (empty) deadlist ZAP.
 */
static zfs_dsl_dataset_t *
dsl_dataset_alloc(zfs_opt_t *zfs, zfs_dsl_dir_t *dir)
{
	zfs_dsl_dataset_t *ds;
	dnode_phys_t *dnode;
	uint64_t deadlistid;

	ds = ecalloc(1, sizeof(*ds));

	dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_DSL_DATASET,
	    DMU_OT_DSL_DATASET, sizeof(dsl_dataset_phys_t), &ds->dsid);
	ds->phys = (dsl_dataset_phys_t *)DN_BONUS(dnode);

	/* Deadlists track freed blocks; ours is always empty. */
	dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_DEADLIST,
	    DMU_OT_DEADLIST_HDR, sizeof(dsl_deadlist_phys_t), &deadlistid);
	zap_write(zfs, zap_alloc(zfs->mos, dnode));

	ds->phys->ds_dir_obj = dir->dirid;
	ds->phys->ds_deadlist_obj = deadlistid;
	ds->phys->ds_creation_txg = TXG - 1;
	/*
	 * NOTE(review): this condition appears intended to exempt the
	 * $ORIGIN snapshot dataset from getting a prev-snap TXG, but
	 * zfs->snapds is presumably not yet set while snapds itself is being
	 * allocated — confirm against the caller's assignment order.
	 */
	if (ds != zfs->snapds)
		ds->phys->ds_prev_snap_txg = TXG - 1;
	/* random() yields 31 bits per call, so the GUID has 62 random bits. */
	ds->phys->ds_guid = ((uint64_t)random() << 32) | random();
	ds->dir = dir;

	return (ds);
}
|
981
usr.sbin/makefs/zfs/fs.c
Normal file
981
usr.sbin/makefs/zfs/fs.c
Normal file
|
@ -0,0 +1,981 @@
|
|||
/*-
|
||||
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
|
||||
*
|
||||
* Copyright (c) 2022 The FreeBSD Foundation
|
||||
*
|
||||
* This software was developed by Mark Johnston under sponsorship from
|
||||
* the FreeBSD Foundation.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <sys/dirent.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include <assert.h>
|
||||
#include <fcntl.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <util.h>
|
||||
|
||||
#include "makefs.h"
|
||||
#include "zfs.h"
|
||||
|
||||
/*
 * Describes one ZPL system attribute: its registry name, its zpl_attr_t
 * index, its fixed size in bytes (0 for variable-length attributes), and its
 * byteswap type as recorded in the SA registry.
 */
typedef struct {
	const char *name;	/* registry key, e.g. "ZPL_SIZE" */
	unsigned int id;	/* zpl_attr_t index into zpl_attrs[] */
	uint16_t size;		/* fixed size in bytes; 0 if variable */
	sa_bswap_type_t bs;	/* byteswap class for the SA registry */
} zfs_sattr_t;
|
||||
|
||||
/*
 * Per-filesystem (dataset) state used while populating a ZPL object set.
 */
typedef struct zfs_fs {
	zfs_objset_t *os;	/* the object set being populated */

	/* Offset table for system attributes, indexed by a zpl_attr_t. */
	uint16_t *saoffs;	/* 0xffff marks attributes not in the layout */
	size_t sacnt;		/* number of entries in saoffs */
	const zfs_sattr_t *satab;	/* attribute descriptors (zpl_attrs) */
} zfs_fs_t;
|
||||
|
||||
/*
 * The order of the attributes doesn't matter, this is simply the one hard-coded
 * by OpenZFS, based on a zdb dump of the SA_REGISTRY table.  Do not reorder:
 * each enumerator is the attribute's index in zpl_attrs[] (asserted in
 * fs_set_zpl_attrs()).
 */
typedef enum zpl_attr {
	ZPL_ATIME,
	ZPL_MTIME,
	ZPL_CTIME,
	ZPL_CRTIME,
	ZPL_GEN,
	ZPL_MODE,
	ZPL_SIZE,
	ZPL_PARENT,
	ZPL_LINKS,
	ZPL_XATTR,
	ZPL_RDEV,
	ZPL_FLAGS,
	ZPL_UID,
	ZPL_GID,
	ZPL_PAD,
	ZPL_ZNODE_ACL,
	ZPL_DACL_COUNT,
	ZPL_SYMLINK,
	ZPL_SCANSTAMP,
	ZPL_DACL_ACES,
	ZPL_DXATTR,
	ZPL_PROJID,
} zpl_attr_t;
|
||||
|
||||
/*
|
||||
* This table must be kept in sync with zpl_attr_layout[] and zpl_attr_t.
|
||||
*/
|
||||
static const zfs_sattr_t zpl_attrs[] = {
|
||||
#define _ZPL_ATTR(n, s, b) { .name = #n, .id = n, .size = s, .bs = b }
|
||||
_ZPL_ATTR(ZPL_ATIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
|
||||
_ZPL_ATTR(ZPL_MTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
|
||||
_ZPL_ATTR(ZPL_CTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
|
||||
_ZPL_ATTR(ZPL_CRTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
|
||||
_ZPL_ATTR(ZPL_GEN, sizeof(uint64_t), SA_UINT64_ARRAY),
|
||||
_ZPL_ATTR(ZPL_MODE, sizeof(uint64_t), SA_UINT64_ARRAY),
|
||||
_ZPL_ATTR(ZPL_SIZE, sizeof(uint64_t), SA_UINT64_ARRAY),
|
||||
_ZPL_ATTR(ZPL_PARENT, sizeof(uint64_t), SA_UINT64_ARRAY),
|
||||
_ZPL_ATTR(ZPL_LINKS, sizeof(uint64_t), SA_UINT64_ARRAY),
|
||||
_ZPL_ATTR(ZPL_XATTR, sizeof(uint64_t), SA_UINT64_ARRAY),
|
||||
_ZPL_ATTR(ZPL_RDEV, sizeof(uint64_t), SA_UINT64_ARRAY),
|
||||
_ZPL_ATTR(ZPL_FLAGS, sizeof(uint64_t), SA_UINT64_ARRAY),
|
||||
_ZPL_ATTR(ZPL_UID, sizeof(uint64_t), SA_UINT64_ARRAY),
|
||||
_ZPL_ATTR(ZPL_GID, sizeof(uint64_t), SA_UINT64_ARRAY),
|
||||
_ZPL_ATTR(ZPL_PAD, sizeof(uint64_t), SA_UINT64_ARRAY),
|
||||
_ZPL_ATTR(ZPL_ZNODE_ACL, 88, SA_UINT64_ARRAY),
|
||||
_ZPL_ATTR(ZPL_DACL_COUNT, sizeof(uint64_t), SA_UINT64_ARRAY),
|
||||
_ZPL_ATTR(ZPL_SYMLINK, 0, SA_UINT8_ARRAY),
|
||||
_ZPL_ATTR(ZPL_SCANSTAMP, sizeof(uint64_t) * 4, SA_UINT8_ARRAY),
|
||||
_ZPL_ATTR(ZPL_DACL_ACES, 0, SA_ACL),
|
||||
_ZPL_ATTR(ZPL_DXATTR, 0, SA_UINT8_ARRAY),
|
||||
_ZPL_ATTR(ZPL_PROJID, sizeof(uint64_t), SA_UINT64_ARRAY),
|
||||
#undef ZPL_ATTR
|
||||
};
|
||||
|
||||
/*
 * This layout matches that of a filesystem created using OpenZFS on FreeBSD.
 * It need not match in general, but FreeBSD's loader doesn't bother parsing the
 * layout and just hard-codes attribute offsets.  Do not reorder: on-disk SA
 * offsets are derived from this order, and ZPL_SYMLINK must remain last so
 * that the default layout can simply omit the final entry.
 */
static const sa_attr_type_t zpl_attr_layout[] = {
	ZPL_MODE,
	ZPL_SIZE,
	ZPL_GEN,
	ZPL_UID,
	ZPL_GID,
	ZPL_PARENT,
	ZPL_FLAGS,
	ZPL_ATIME,
	ZPL_MTIME,
	ZPL_CTIME,
	ZPL_CRTIME,
	ZPL_LINKS,
	ZPL_DACL_COUNT,
	ZPL_DACL_ACES,
	ZPL_SYMLINK,
};
|
||||
|
||||
/*
 * Keys for the ZPL attribute tables in the SA layout ZAP.  The first two
 * indices are reserved for legacy attribute encoding.  The default layout
 * covers all attributes except ZPL_SYMLINK; the symlink layout appends it.
 */
#define	SA_LAYOUT_INDEX_DEFAULT	2
#define	SA_LAYOUT_INDEX_SYMLINK	3
|
||||
|
||||
/*
 * One level of the directory stack maintained while populating a dataset.
 */
struct fs_populate_dir {
	SLIST_ENTRY(fs_populate_dir) next;
	int dirfd;		/* open fd for the staged directory */
	uint64_t objid;		/* dnode ID of the ZPL directory object */
	zfs_zap_t *zap;		/* dirent ZAP, written when the dir is popped */
};
|
||||
|
||||
/*
 * State threaded through fs_foreach_populate() while copying a staged tree
 * into a single dataset's object set.
 */
struct fs_populate_arg {
	zfs_opt_t *zfs;
	zfs_fs_t *fs;			/* owning filesystem */
	int dirfd;			/* current directory fd */
	uint64_t rootdirid;		/* root directory dnode ID */
	SLIST_HEAD(, fs_populate_dir) dirs;	/* stack of directories */
};
|
||||
|
||||
static void fs_build_one(zfs_opt_t *, zfs_dsl_dir_t *, fsnode *, int);
|
||||
|
||||
static bool
|
||||
fsnode_isroot(const fsnode *cur)
|
||||
{
|
||||
return (strcmp(cur->name, ".") == 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Visit each node in a directory hierarchy, in pre-order depth-first order.
|
||||
*/
|
||||
static void
|
||||
fsnode_foreach(fsnode *root, int (*cb)(fsnode *, void *), void *arg)
|
||||
{
|
||||
assert(root->type == S_IFDIR);
|
||||
|
||||
for (fsnode *cur = root; cur != NULL; cur = cur->next) {
|
||||
assert(cur->type == S_IFREG || cur->type == S_IFDIR ||
|
||||
cur->type == S_IFLNK);
|
||||
|
||||
if (cb(cur, arg) == 0)
|
||||
continue;
|
||||
if (cur->type == S_IFDIR && cur->child != NULL)
|
||||
fsnode_foreach(cur->child, cb, arg);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Add a directory entry for "cur", whose ZPL object has dnode ID "dnid", to
 * the ZAP of the directory currently on top of the stack.
 */
static void
fs_populate_dirent(struct fs_populate_arg *arg, fsnode *cur, uint64_t dnid)
{
	struct fs_populate_dir *parent;
	uint64_t type;

	if (cur->type == S_IFREG) {
		type = DT_REG;
	} else if (cur->type == S_IFDIR) {
		type = DT_DIR;
	} else if (cur->type == S_IFLNK) {
		type = DT_LNK;
	} else {
		type = 0;
		assert(0);
	}

	parent = SLIST_FIRST(&arg->dirs);
	zap_add_uint64(parent->zap, cur->name, ZFS_DIRENT_MAKE(type, dnid));
}
|
||||
|
||||
static void
|
||||
fs_populate_attr(zfs_fs_t *fs, char *attrbuf, const void *val, uint16_t ind,
|
||||
size_t *szp)
|
||||
{
|
||||
assert(ind < fs->sacnt);
|
||||
assert(fs->saoffs[ind] != 0xffff);
|
||||
|
||||
memcpy(attrbuf + fs->saoffs[ind], val, fs->satab[ind].size);
|
||||
*szp += fs->satab[ind].size;
|
||||
}
|
||||
|
||||
static void
|
||||
fs_populate_varszattr(zfs_fs_t *fs, char *attrbuf, const void *val,
|
||||
size_t valsz, size_t varoff, uint16_t ind, size_t *szp)
|
||||
{
|
||||
assert(ind < fs->sacnt);
|
||||
assert(fs->saoffs[ind] != 0xffff);
|
||||
assert(fs->satab[ind].size == 0);
|
||||
|
||||
memcpy(attrbuf + fs->saoffs[ind] + varoff, val, valsz);
|
||||
*szp += valsz;
|
||||
}
|
||||
|
||||
/*
 * Fill the dnode's bonus buffer with the ZPL system attributes (mode, size,
 * timestamps, a trivial ACL, etc.) for the staged node "cur".  For symlinks
 * the target string itself is also stored as an SA, read via readlinkat().
 */
static void
fs_populate_sattrs(struct fs_populate_arg *arg, const fsnode *cur,
    dnode_phys_t *dnode)
{
	char target[PATH_MAX];
	zfs_fs_t *fs;
	zfs_ace_hdr_t aces[3];
	struct stat *sb;
	sa_hdr_phys_t *sahdr;
	uint64_t daclcount, flags, gen, gid, links, mode, parent, objsize, uid;
	char *attrbuf;
	size_t bonussz, hdrsz;
	int layout;

	assert(dnode->dn_bonustype == DMU_OT_SA);
	assert(dnode->dn_nblkptr == 1);

	fs = arg->fs;
	sb = &cur->inode->st;

	/* Type-specific attributes: layout, link count, size, parent ID. */
	switch (cur->type) {
	case S_IFREG:
		layout = SA_LAYOUT_INDEX_DEFAULT;
		links = cur->inode->nlink;
		objsize = sb->st_size;
		parent = SLIST_FIRST(&arg->dirs)->objid;
		break;
	case S_IFDIR:
		layout = SA_LAYOUT_INDEX_DEFAULT;
		links = 1; /* .. */
		objsize = 1; /* .. */

		/*
		 * The size of a ZPL directory is the number of entries
		 * (including "." and ".."), and the link count is the number of
		 * entries which are directories (including "." and "..").
		 */
		for (fsnode *c = fsnode_isroot(cur) ? cur->next : cur->child;
		    c != NULL; c = c->next) {
			if (c->type == S_IFDIR)
				links++;
			objsize++;
		}

		/* The root directory is its own parent. */
		parent = SLIST_EMPTY(&arg->dirs) ?
		    arg->rootdirid : SLIST_FIRST(&arg->dirs)->objid;
		break;
	case S_IFLNK: {
		ssize_t n;

		if ((n = readlinkat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name,
		    target, sizeof(target) - 1)) == -1)
			err(1, "readlinkat(%s)", cur->name);
		target[n] = '\0';

		layout = SA_LAYOUT_INDEX_SYMLINK;
		links = 1;
		objsize = strlen(target);
		parent = SLIST_FIRST(&arg->dirs)->objid;
		break;
	}
	default:
		assert(0);
	}

	daclcount = nitems(aces);
	flags = ZFS_ACL_TRIVIAL | ZFS_ACL_AUTO_INHERIT | ZFS_NO_EXECS_DENIED |
	    ZFS_ARCHIVE | ZFS_AV_MODIFIED; /* XXX-MJ */
	gen = 1;
	gid = sb->st_gid;
	mode = sb->st_mode;
	uid = sb->st_uid;

	/*
	 * Build a trivial three-entry ACL (owner@/group@/everyone@) that
	 * mirrors the POSIX mode bits.
	 */
	memset(aces, 0, sizeof(aces));
	aces[0].z_flags = ACE_OWNER;
	aces[0].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
	aces[0].z_access_mask = ACE_WRITE_ATTRIBUTES | ACE_WRITE_OWNER |
	    ACE_WRITE_ACL | ACE_WRITE_NAMED_ATTRS | ACE_READ_ACL |
	    ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
	if ((mode & S_IRUSR) != 0)
		aces[0].z_access_mask |= ACE_READ_DATA;
	if ((mode & S_IWUSR) != 0)
		aces[0].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
	if ((mode & S_IXUSR) != 0)
		aces[0].z_access_mask |= ACE_EXECUTE;

	aces[1].z_flags = ACE_GROUP | ACE_IDENTIFIER_GROUP;
	aces[1].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
	aces[1].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES |
	    ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
	if ((mode & S_IRGRP) != 0)
		aces[1].z_access_mask |= ACE_READ_DATA;
	if ((mode & S_IWGRP) != 0)
		aces[1].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
	if ((mode & S_IXGRP) != 0)
		aces[1].z_access_mask |= ACE_EXECUTE;

	aces[2].z_flags = ACE_EVERYONE;
	aces[2].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
	aces[2].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES |
	    ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
	if ((mode & S_IROTH) != 0)
		aces[2].z_access_mask |= ACE_READ_DATA;
	if ((mode & S_IWOTH) != 0)
		aces[2].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
	if ((mode & S_IXOTH) != 0)
		aces[2].z_access_mask |= ACE_EXECUTE;

	/* Size the SA header for the number of variable-length lengths. */
	switch (layout) {
	case SA_LAYOUT_INDEX_DEFAULT:
		/* At most one variable-length attribute. */
		hdrsz = sizeof(uint64_t);
		break;
	case SA_LAYOUT_INDEX_SYMLINK:
		/* At most five variable-length attributes. */
		hdrsz = sizeof(uint64_t) * 2;
		break;
	default:
		assert(0);
	}

	sahdr = (sa_hdr_phys_t *)DN_BONUS(dnode);
	sahdr->sa_magic = SA_MAGIC;
	SA_HDR_LAYOUT_INFO_ENCODE(sahdr->sa_layout_info, layout, hdrsz);

	bonussz = SA_HDR_SIZE(sahdr);
	attrbuf = (char *)sahdr + SA_HDR_SIZE(sahdr);

	/* Fixed-size attributes, at the offsets recorded in fs->saoffs. */
	fs_populate_attr(fs, attrbuf, &daclcount, ZPL_DACL_COUNT, &bonussz);
	fs_populate_attr(fs, attrbuf, &flags, ZPL_FLAGS, &bonussz);
	fs_populate_attr(fs, attrbuf, &gen, ZPL_GEN, &bonussz);
	fs_populate_attr(fs, attrbuf, &gid, ZPL_GID, &bonussz);
	fs_populate_attr(fs, attrbuf, &links, ZPL_LINKS, &bonussz);
	fs_populate_attr(fs, attrbuf, &mode, ZPL_MODE, &bonussz);
	fs_populate_attr(fs, attrbuf, &parent, ZPL_PARENT, &bonussz);
	fs_populate_attr(fs, attrbuf, &objsize, ZPL_SIZE, &bonussz);
	fs_populate_attr(fs, attrbuf, &uid, ZPL_UID, &bonussz);

	/*
	 * We deliberately set atime = mtime here to ensure that images are
	 * reproducible.
	 */
	assert(sizeof(sb->st_mtim) == fs->satab[ZPL_ATIME].size);
	fs_populate_attr(fs, attrbuf, &sb->st_mtim, ZPL_ATIME, &bonussz);
	assert(sizeof(sb->st_ctim) == fs->satab[ZPL_CTIME].size);
	fs_populate_attr(fs, attrbuf, &sb->st_ctim, ZPL_CTIME, &bonussz);
	assert(sizeof(sb->st_mtim) == fs->satab[ZPL_MTIME].size);
	fs_populate_attr(fs, attrbuf, &sb->st_mtim, ZPL_MTIME, &bonussz);
	assert(sizeof(sb->st_birthtim) == fs->satab[ZPL_CRTIME].size);
	fs_populate_attr(fs, attrbuf, &sb->st_birthtim, ZPL_CRTIME, &bonussz);

	/* Variable-length attributes: the ACL, then (for symlinks) target. */
	fs_populate_varszattr(fs, attrbuf, aces, sizeof(aces), 0,
	    ZPL_DACL_ACES, &bonussz);
	sahdr->sa_lengths[0] = sizeof(aces);

	if (cur->type == S_IFLNK) {
		assert(layout == SA_LAYOUT_INDEX_SYMLINK);
		/* Need to use a spill block pointer if the target is long. */
		assert(bonussz + objsize <= DN_OLD_MAX_BONUSLEN);
		fs_populate_varszattr(fs, attrbuf, target, objsize,
		    sahdr->sa_lengths[0], ZPL_SYMLINK, &bonussz);
		sahdr->sa_lengths[1] = (uint16_t)objsize;
	}

	dnode->dn_bonuslen = bonussz;
}
|
||||
|
||||
static void
|
||||
fs_populate_file(fsnode *cur, struct fs_populate_arg *arg)
|
||||
{
|
||||
struct dnode_cursor *c;
|
||||
dnode_phys_t *dnode;
|
||||
zfs_opt_t *zfs;
|
||||
char *buf;
|
||||
uint64_t dnid;
|
||||
ssize_t n;
|
||||
size_t bufsz;
|
||||
off_t size, target;
|
||||
int fd;
|
||||
|
||||
assert(cur->type == S_IFREG);
|
||||
assert((cur->inode->flags & FI_ROOT) == 0);
|
||||
|
||||
zfs = arg->zfs;
|
||||
|
||||
assert(cur->inode->ino != 0);
|
||||
if ((cur->inode->flags & FI_ALLOCATED) != 0) {
|
||||
/*
|
||||
* This is a hard link of an existing file.
|
||||
*
|
||||
* XXX-MJ need to check whether it crosses datasets, add a test
|
||||
* case for that
|
||||
*/
|
||||
fs_populate_dirent(arg, cur, cur->inode->ino);
|
||||
return;
|
||||
}
|
||||
|
||||
dnode = objset_dnode_bonus_alloc(arg->fs->os,
|
||||
DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid);
|
||||
cur->inode->ino = dnid;
|
||||
cur->inode->flags |= FI_ALLOCATED;
|
||||
|
||||
fd = openat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name, O_RDONLY);
|
||||
if (fd == -1)
|
||||
err(1, "openat(%s)", cur->name);
|
||||
|
||||
buf = zfs->filebuf;
|
||||
bufsz = sizeof(zfs->filebuf);
|
||||
size = cur->inode->st.st_size;
|
||||
c = dnode_cursor_init(zfs, arg->fs->os, dnode, size, 0);
|
||||
for (off_t foff = 0; foff < size; foff += target) {
|
||||
off_t loc, sofar;
|
||||
|
||||
/*
|
||||
* Fill up our buffer, handling partial reads.
|
||||
*
|
||||
* It might be profitable to use copy_file_range(2) here.
|
||||
*/
|
||||
sofar = 0;
|
||||
target = MIN(size - foff, (off_t)bufsz);
|
||||
do {
|
||||
n = read(fd, buf + sofar, target);
|
||||
if (n < 0)
|
||||
err(1, "reading from '%s'", cur->name);
|
||||
if (n == 0)
|
||||
errx(1, "unexpected EOF reading '%s'",
|
||||
cur->name);
|
||||
sofar += n;
|
||||
} while (sofar < target);
|
||||
|
||||
if (target < (off_t)bufsz)
|
||||
memset(buf + target, 0, bufsz - target);
|
||||
|
||||
loc = objset_space_alloc(zfs, arg->fs->os, &target);
|
||||
vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, buf, target, loc,
|
||||
dnode_cursor_next(zfs, c, foff));
|
||||
}
|
||||
if (close(fd) != 0)
|
||||
err(1, "close");
|
||||
dnode_cursor_finish(zfs, c);
|
||||
|
||||
fs_populate_sattrs(arg, cur, dnode);
|
||||
fs_populate_dirent(arg, cur, dnid);
|
||||
}
|
||||
|
||||
/*
 * Allocate a ZPL directory object for "cur", link it into its parent, set
 * its attributes, and either push it onto the directory stack (normal case)
 * or — if it is the root of a nested dataset — leave it empty here and
 * recurse into fs_build_one() to populate the child dataset.
 */
static void
fs_populate_dir(fsnode *cur, struct fs_populate_arg *arg)
{
	dnode_phys_t *dnode;
	zfs_objset_t *os;
	uint64_t dnid;
	int dirfd;

	assert(cur->type == S_IFDIR);
	assert((cur->inode->flags & FI_ALLOCATED) == 0);

	os = arg->fs->os;

	dnode = objset_dnode_bonus_alloc(os, DMU_OT_DIRECTORY_CONTENTS,
	    DMU_OT_SA, 0, &dnid);

	/*
	 * Add an entry to the parent directory and open this directory.
	 */
	if (!SLIST_EMPTY(&arg->dirs)) {
		fs_populate_dirent(arg, cur, dnid);
		dirfd = openat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name,
		    O_DIRECTORY);
		if (dirfd < 0)
			err(1, "open(%s)", cur->name);
	} else {
		/* Empty stack: this is the dataset's root directory. */
		arg->rootdirid = dnid;
		dirfd = arg->dirfd;
	}

	/*
	 * Set ZPL attributes.
	 */
	fs_populate_sattrs(arg, cur, dnode);

	/*
	 * If this is a root directory, then its children belong to a different
	 * dataset and this directory remains empty in the current objset.
	 */
	if ((cur->inode->flags & FI_ROOT) == 0) {
		struct fs_populate_dir *dir;

		dir = ecalloc(1, sizeof(*dir));
		dir->dirfd = dirfd;
		dir->objid = dnid;
		dir->zap = zap_alloc(os, dnode);
		SLIST_INSERT_HEAD(&arg->dirs, dir, next);
	} else {
		/* Write an empty dirent ZAP and build the child dataset. */
		zap_write(arg->zfs, zap_alloc(os, dnode));
		fs_build_one(arg->zfs, cur->inode->param, cur->child, dirfd);
	}
}
|
||||
|
||||
/*
 * Allocate a dnode for a symlink and add a directory entry for it.  The
 * link target is stored as a system attribute by fs_populate_sattrs(),
 * which reads it via readlinkat().
 */
static void
fs_populate_symlink(fsnode *cur, struct fs_populate_arg *arg)
{
	dnode_phys_t *dnode;
	uint64_t dnid;

	assert(cur->type == S_IFLNK);
	/* Symlinks are never hard-linked or dataset roots. */
	assert((cur->inode->flags & (FI_ALLOCATED | FI_ROOT)) == 0);

	dnode = objset_dnode_bonus_alloc(arg->fs->os,
	    DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid);

	fs_populate_dirent(arg, cur, dnid);

	fs_populate_sattrs(arg, cur, dnode);
}
|
||||
|
||||
/*
 * fsnode_foreach() callback: populate one staged node into the dataset.
 * Returns 0 (prune) for dataset roots, whose subtrees are handled by a
 * recursive fs_build_one() call, and 1 otherwise.  Also unwinds the
 * directory stack, flushing dirent ZAPs, whenever a subtree is exhausted.
 */
static int
fs_foreach_populate(fsnode *cur, void *_arg)
{
	struct fs_populate_arg *arg;
	struct fs_populate_dir *dir;
	int ret;

	arg = _arg;
	switch (cur->type) {
	case S_IFREG:
		fs_populate_file(cur, arg);
		break;
	case S_IFDIR:
		/* The root "." was already handled by fs_populate_dir(). */
		if (fsnode_isroot(cur))
			break;
		fs_populate_dir(cur, arg);
		break;
	case S_IFLNK:
		fs_populate_symlink(cur, arg);
		break;
	default:
		assert(0);
	}

	/* Don't descend into the roots of nested datasets. */
	ret = (cur->inode->flags & FI_ROOT) != 0 ? 0 : 1;

	if (cur->next == NULL &&
	    (cur->child == NULL || (cur->inode->flags & FI_ROOT) != 0)) {
		/*
		 * We reached a terminal node in a subtree.  Walk back up and
		 * write out directories.  We're done once we hit the root of a
		 * dataset or find a level where we're not on the edge of the
		 * tree.
		 */
		do {
			dir = SLIST_FIRST(&arg->dirs);
			SLIST_REMOVE_HEAD(&arg->dirs, next);
			zap_write(arg->zfs, dir->zap);
			if (dir->dirfd != -1 && close(dir->dirfd) != 0)
				err(1, "close");
			free(dir);
			cur = cur->parent;
		} while (cur != NULL && cur->next == NULL &&
		    (cur->inode->flags & FI_ROOT) == 0);
	}

	return (ret);
}
|
||||
|
||||
static void
|
||||
fs_add_zpl_attr_layout(zfs_zap_t *zap, unsigned int index,
|
||||
const sa_attr_type_t layout[], size_t sacnt)
|
||||
{
|
||||
char ti[16];
|
||||
|
||||
assert(sizeof(layout[0]) == 2);
|
||||
|
||||
snprintf(ti, sizeof(ti), "%u", index);
|
||||
zap_add(zap, ti, sizeof(sa_attr_type_t), sacnt,
|
||||
(const uint8_t *)layout);
|
||||
}
|
||||
|
||||
/*
 * Initialize system attribute tables.
 *
 * There are two elements to this.  First, we write the zpl_attrs[] and
 * zpl_attr_layout[] tables to disk.  Then we create a lookup table which
 * allows us to set file attributes quickly.
 *
 * Returns the dnode ID of the SA master node object.
 */
static uint64_t
fs_set_zpl_attrs(zfs_opt_t *zfs, zfs_fs_t *fs)
{
	zfs_zap_t *sazap, *salzap, *sarzap;
	zfs_objset_t *os;
	dnode_phys_t *saobj, *salobj, *sarobj;
	uint64_t saobjid, salobjid, sarobjid;
	uint16_t offset;

	os = fs->os;

	/*
	 * The on-disk tables are stored in two ZAP objects, the registry object
	 * and the layout object.  Individual attributes are described by
	 * entries in the registry object; for example, the value for the
	 * "ZPL_SIZE" key gives the size and encoding of the ZPL_SIZE attribute.
	 * The attributes of a file are ordered according to one of the layouts
	 * defined in the layout object.  The master node object is simply used
	 * to locate the registry and layout objects.
	 */
	saobj = objset_dnode_alloc(os, DMU_OT_SA_MASTER_NODE, &saobjid);
	salobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_LAYOUTS, &salobjid);
	sarobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_REGISTRATION, &sarobjid);

	/* Registry: one packed (index, size, byteswap) word per attribute. */
	sarzap = zap_alloc(os, sarobj);
	for (size_t i = 0; i < nitems(zpl_attrs); i++) {
		const zfs_sattr_t *sa;
		uint64_t attr;

		attr = 0;
		sa = &zpl_attrs[i];
		SA_ATTR_ENCODE(attr, (uint64_t)i, sa->size, sa->bs);
		zap_add_uint64(sarzap, sa->name, attr);
	}
	zap_write(zfs, sarzap);

	/*
	 * Layouts are arrays of indices into the registry.  We define two
	 * layouts for use by the ZPL, one for non-symlinks and one for
	 * symlinks.  They are identical except that the symlink layout includes
	 * ZPL_SYMLINK as its final attribute.
	 */
	salzap = zap_alloc(os, salobj);
	assert(zpl_attr_layout[nitems(zpl_attr_layout) - 1] == ZPL_SYMLINK);
	fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_DEFAULT,
	    zpl_attr_layout, nitems(zpl_attr_layout) - 1);
	fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_SYMLINK,
	    zpl_attr_layout, nitems(zpl_attr_layout));
	zap_write(zfs, salzap);

	/* Master node: pointers to the layout and registry objects. */
	sazap = zap_alloc(os, saobj);
	zap_add_uint64(sazap, SA_LAYOUTS, salobjid);
	zap_add_uint64(sazap, SA_REGISTRY, sarobjid);
	zap_write(zfs, sazap);

	/* Sanity check. */
	for (size_t i = 0; i < nitems(zpl_attrs); i++)
		assert(i == zpl_attrs[i].id);

	/*
	 * Build the offset table used when setting file attributes.  File
	 * attributes are stored in the object's bonus buffer; this table
	 * provides the buffer offset of attributes referenced by the layout
	 * table.
	 */
	fs->sacnt = nitems(zpl_attrs);
	fs->saoffs = ecalloc(fs->sacnt, sizeof(*fs->saoffs));
	/* 0xffff marks attributes that don't appear in the layout. */
	for (size_t i = 0; i < fs->sacnt; i++)
		fs->saoffs[i] = 0xffff;
	offset = 0;
	for (size_t i = 0; i < nitems(zpl_attr_layout); i++) {
		uint16_t size;

		assert(zpl_attr_layout[i] < fs->sacnt);

		fs->saoffs[zpl_attr_layout[i]] = offset;
		size = zpl_attrs[zpl_attr_layout[i]].size;
		offset += size;
	}
	fs->satab = zpl_attrs;

	return (saobjid);
}
|
||||
|
||||
static void
|
||||
fs_layout_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg)
|
||||
{
|
||||
char *mountpoint, *origmountpoint, *name, *next;
|
||||
fsnode *cur, *root;
|
||||
uint64_t canmount;
|
||||
|
||||
if (!dsl_dir_has_dataset(dsldir))
|
||||
return;
|
||||
|
||||
mountpoint = dsl_dir_get_mountpoint(zfs, dsldir);
|
||||
if (mountpoint == NULL)
|
||||
return;
|
||||
if (dsl_dir_get_canmount(dsldir, &canmount) == 0 && canmount == 0)
|
||||
return;
|
||||
|
||||
/*
|
||||
* If we were asked to specify a bootfs, set it here.
|
||||
*/
|
||||
if (zfs->bootfs != NULL && strcmp(zfs->bootfs,
|
||||
dsl_dir_fullname(dsldir)) == 0) {
|
||||
zap_add_uint64(zfs->poolprops, "bootfs",
|
||||
dsl_dir_dataset_id(dsldir));
|
||||
}
|
||||
|
||||
origmountpoint = mountpoint;
|
||||
|
||||
/*
|
||||
* Figure out which fsnode corresponds to our mountpoint.
|
||||
*/
|
||||
root = arg;
|
||||
cur = root;
|
||||
if (strcmp(mountpoint, zfs->rootpath) != 0) {
|
||||
mountpoint += strlen(zfs->rootpath);
|
||||
|
||||
/*
|
||||
* Look up the directory in the staged tree. For example, if
|
||||
* the dataset's mount point is /foo/bar/baz, we'll search the
|
||||
* root directory for "foo", search "foo" for "baz", and so on.
|
||||
* Each intermediate name must refer to a directory; the final
|
||||
* component need not exist.
|
||||
*/
|
||||
cur = root;
|
||||
for (next = name = mountpoint; next != NULL;) {
|
||||
for (; *next == '/'; next++)
|
||||
;
|
||||
name = strsep(&next, "/");
|
||||
|
||||
for (; cur != NULL && strcmp(cur->name, name) != 0;
|
||||
cur = cur->next)
|
||||
;
|
||||
if (cur == NULL) {
|
||||
if (next == NULL)
|
||||
break;
|
||||
errx(1, "missing mountpoint directory for `%s'",
|
||||
dsl_dir_fullname(dsldir));
|
||||
}
|
||||
if (cur->type != S_IFDIR) {
|
||||
errx(1,
|
||||
"mountpoint for `%s' is not a directory",
|
||||
dsl_dir_fullname(dsldir));
|
||||
}
|
||||
if (next != NULL)
|
||||
cur = cur->child;
|
||||
}
|
||||
}
|
||||
|
||||
if (cur != NULL) {
|
||||
assert(cur->type == S_IFDIR);
|
||||
|
||||
/*
|
||||
* Multiple datasets shouldn't share a mountpoint. It's
|
||||
* technically allowed, but it's not clear what makefs should do
|
||||
* in that case.
|
||||
*/
|
||||
assert((cur->inode->flags & FI_ROOT) == 0);
|
||||
if (cur != root)
|
||||
cur->inode->flags |= FI_ROOT;
|
||||
assert(cur->inode->param == NULL);
|
||||
cur->inode->param = dsldir;
|
||||
}
|
||||
|
||||
free(origmountpoint);
|
||||
}
|
||||
|
||||
static int
|
||||
fs_foreach_mark(fsnode *cur, void *arg)
|
||||
{
|
||||
uint64_t *countp;
|
||||
|
||||
countp = arg;
|
||||
if (cur->type == S_IFDIR && fsnode_isroot(cur))
|
||||
return (1);
|
||||
|
||||
if (cur->inode->ino == 0) {
|
||||
cur->inode->ino = ++(*countp);
|
||||
cur->inode->nlink = 1;
|
||||
} else {
|
||||
cur->inode->nlink++;
|
||||
}
|
||||
|
||||
return ((cur->inode->flags & FI_ROOT) != 0 ? 0 : 1);
|
||||
}
|
||||
|
||||
/*
 * Create a filesystem dataset.  More specifically:
 * - create an object set for the dataset,
 * - add required metadata (SA tables, property definitions, etc.) to that
 *   object set,
 * - optionally populate the object set with file objects, using "root" as the
 *   root directory.
 *
 * "dirfd" is a directory descriptor for the directory referenced by "root".
 * It is closed before returning.
 */
static void
fs_build_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, fsnode *root, int dirfd)
{
	struct fs_populate_arg arg;
	zfs_fs_t fs;
	zfs_zap_t *masterzap;
	zfs_objset_t *os;
	dnode_phys_t *deleteq, *masterobj;
	uint64_t deleteqid, dnodecount, moid, rootdirid, saobjid;
	bool fakedroot;

	/*
	 * This dataset's mountpoint doesn't exist in the staging tree, or the
	 * dataset doesn't have a mountpoint at all.  In either case we still
	 * need a root directory.  Fake up a root fsnode to handle this case.
	 */
	fakedroot = root == NULL;
	if (fakedroot) {
		struct stat *stp;

		/* An unmounted dataset has no staged directory to read. */
		assert(dirfd == -1);

		root = ecalloc(1, sizeof(*root));
		root->inode = ecalloc(1, sizeof(*root->inode));
		root->name = estrdup(".");
		root->type = S_IFDIR;

		stp = &root->inode->st;
		stp->st_uid = 0;
		stp->st_gid = 0;
		stp->st_mode = S_IFDIR | 0755;
	}
	assert(root->type == S_IFDIR);
	assert(fsnode_isroot(root));

	/*
	 * Initialize the object set for this dataset.  The master node must
	 * end up with the well-known object ID MASTER_NODE_OBJ; it is the
	 * first object allocated after the reserved meta-dnode slot.
	 */
	os = objset_alloc(zfs, DMU_OST_ZFS);
	masterobj = objset_dnode_alloc(os, DMU_OT_MASTER_NODE, &moid);
	assert(moid == MASTER_NODE_OBJ);

	memset(&fs, 0, sizeof(fs));
	fs.os = os;

	/*
	 * Create the ZAP SA layout now since filesystem object dnodes will
	 * refer to those attributes.
	 */
	saobjid = fs_set_zpl_attrs(zfs, &fs);

	/*
	 * Make a pass over the staged directory to detect hard links and assign
	 * virtual dnode numbers.
	 */
	dnodecount = 1; /* root directory */
	fsnode_foreach(root, fs_foreach_mark, &dnodecount);

	/*
	 * Make a second pass to populate the dataset with files from the
	 * staged directory.  Most of our runtime is spent here.
	 */
	arg.dirfd = dirfd;
	arg.zfs = zfs;
	arg.fs = &fs;
	SLIST_INIT(&arg.dirs);
	fs_populate_dir(root, &arg);
	assert(!SLIST_EMPTY(&arg.dirs));
	fsnode_foreach(root, fs_foreach_populate, &arg);
	assert(SLIST_EMPTY(&arg.dirs));
	rootdirid = arg.rootdirid;

	/*
	 * Create an empty delete queue.  We don't do anything with it, but
	 * OpenZFS will refuse to mount filesystems that don't have one.
	 */
	deleteq = objset_dnode_alloc(os, DMU_OT_UNLINKED_SET, &deleteqid);
	zap_write(zfs, zap_alloc(os, deleteq));

	/*
	 * Populate and write the master node object.  This is a ZAP object
	 * containing various dataset properties and the object IDs of the root
	 * directory and delete queue.
	 */
	masterzap = zap_alloc(os, masterobj);
	zap_add_uint64(masterzap, ZFS_ROOT_OBJ, rootdirid);
	zap_add_uint64(masterzap, ZFS_UNLINKED_SET, deleteqid);
	zap_add_uint64(masterzap, ZFS_SA_ATTRS, saobjid);
	zap_add_uint64(masterzap, ZPL_VERSION_OBJ, 5 /* ZPL_VERSION_SA */);
	zap_add_uint64(masterzap, "normalization", 0 /* off */);
	zap_add_uint64(masterzap, "utf8only", 0 /* off */);
	zap_add_uint64(masterzap, "casesensitivity", 0 /* case sensitive */);
	zap_add_uint64(masterzap, "acltype", 2 /* NFSv4 */);
	zap_write(zfs, masterzap);

	/*
	 * All finished with this object set, we may as well write it now.
	 * The DSL layer will sum up the bytes consumed by each dataset using
	 * information stored in the object set, so it can't be freed just yet.
	 */
	dsl_dir_dataset_write(zfs, os, dsldir);

	if (fakedroot) {
		/* Undo the allocations made at the top of this function. */
		free(root->inode);
		free(root->name);
		free(root);
	}
	free(fs.saoffs);
}
|
||||
|
||||
/*
|
||||
* Create an object set for each DSL directory which has a dataset and doesn't
|
||||
* already have an object set.
|
||||
*/
|
||||
static void
|
||||
fs_build_unmounted(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg __unused)
|
||||
{
|
||||
if (dsl_dir_has_dataset(dsldir) && !dsl_dir_dataset_has_objset(dsldir))
|
||||
fs_build_one(zfs, dsldir, NULL, -1);
|
||||
}
|
||||
|
||||
/*
 * Create our datasets and populate them with files.
 *
 * "dirfd" is a descriptor for the staged directory tree rooted at "root"; it
 * is consumed by fs_build_one().
 */
void
fs_build(zfs_opt_t *zfs, int dirfd, fsnode *root)
{
	/*
	 * Run through our datasets and find the root fsnode for each one.  Each
	 * root fsnode is flagged so that we can figure out which dataset it
	 * belongs to.
	 */
	dsl_dir_foreach(zfs, zfs->rootdsldir, fs_layout_one, root);

	/*
	 * Did we find our boot filesystem?  fs_layout_one() adds the pool
	 * property when a dataset's full name matches.
	 */
	if (zfs->bootfs != NULL && !zap_entry_exists(zfs->poolprops, "bootfs"))
		errx(1, "no mounted dataset matches bootfs property `%s'",
		    zfs->bootfs);

	/*
	 * Traverse the file hierarchy starting from the root fsnode.  One
	 * dataset, not necessarily the root dataset, must "own" the root
	 * directory by having its mountpoint be equal to the root path.
	 *
	 * As roots of other datasets are encountered during the traversal,
	 * fs_build_one() recursively creates the corresponding object sets and
	 * populates them.  Once this function has returned, all datasets will
	 * have been fully populated.
	 */
	fs_build_one(zfs, root->inode->param, root, dirfd);

	/*
	 * Now create object sets for datasets whose mountpoints weren't found
	 * in the staging directory, either because there is no mountpoint, or
	 * because the mountpoint doesn't correspond to an existing directory.
	 */
	dsl_dir_foreach(zfs, zfs->rootdsldir, fs_build_unmounted, NULL);
}
|
259
usr.sbin/makefs/zfs/objset.c
Normal file
259
usr.sbin/makefs/zfs/objset.c
Normal file
|
@ -0,0 +1,259 @@
|
|||
/*-
|
||||
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
|
||||
*
|
||||
* Copyright (c) 2022 The FreeBSD Foundation
|
||||
*
|
||||
* This software was developed by Mark Johnston under sponsorship from
|
||||
* the FreeBSD Foundation.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <util.h>
|
||||
|
||||
#include "zfs.h"
|
||||
|
||||
/* Number of dnodes that fit in one maximally-sized (128KB) chunk. */
#define DNODES_PER_CHUNK	(MAXBLOCKSIZE / sizeof(dnode_phys_t))

/*
 * Dnodes are allocated from fixed-size chunks so that previously returned
 * dnode_phys_t pointers remain valid as the array grows.
 */
struct objset_dnode_chunk {
	dnode_phys_t buf[DNODES_PER_CHUNK];	/* dnode storage */
	unsigned int nextfree;			/* index of next unused slot */
	STAILQ_ENTRY(objset_dnode_chunk) next;
};

typedef struct zfs_objset {
	/* Physical object set. */
	objset_phys_t *phys;
	off_t osloc;		/* vdev offset of the objset block */
	off_t osblksz;		/* size of the objset block */
	blkptr_t osbp;		/* set in objset_write() */

	/* Accounting. */
	off_t space;		/* bytes allocated to this objset */

	/* dnode allocator. */
	uint64_t dnodecount;	/* next object ID; 0 is the meta-dnode */
	STAILQ_HEAD(, objset_dnode_chunk) dnodechunks;
} zfs_objset_t;
|
||||
|
||||
static void
|
||||
dnode_init(dnode_phys_t *dnode, uint8_t type, uint8_t bonustype,
|
||||
uint16_t bonuslen)
|
||||
{
|
||||
dnode->dn_indblkshift = MAXBLOCKSHIFT;
|
||||
dnode->dn_type = type;
|
||||
dnode->dn_bonustype = bonustype;
|
||||
dnode->dn_bonuslen = bonuslen;
|
||||
dnode->dn_checksum = ZIO_CHECKSUM_FLETCHER_4;
|
||||
dnode->dn_nlevels = 1;
|
||||
dnode->dn_nblkptr = 1;
|
||||
dnode->dn_flags = DNODE_FLAG_USED_BYTES;
|
||||
}
|
||||
|
||||
/*
 * Allocate a new object set of the given DMU type and reserve vdev space for
 * its objset_phys_t block.  The caller populates it with dnodes and finally
 * calls objset_write().
 */
zfs_objset_t *
objset_alloc(zfs_opt_t *zfs, uint64_t type)
{
	struct objset_dnode_chunk *chunk;
	zfs_objset_t *os;

	os = ecalloc(1, sizeof(*os));
	os->osblksz = sizeof(objset_phys_t);
	os->osloc = objset_space_alloc(zfs, os, &os->osblksz);

	/*
	 * Object ID zero is always reserved for the meta dnode, which is
	 * embedded in the objset itself.
	 */
	STAILQ_INIT(&os->dnodechunks);
	chunk = ecalloc(1, sizeof(*chunk));
	chunk->nextfree = 1;
	STAILQ_INSERT_HEAD(&os->dnodechunks, chunk, next);
	os->dnodecount = 1;

	os->phys = ecalloc(1, os->osblksz);
	os->phys->os_type = type;

	/* The meta-dnode's data blocks are always DNODE_BLOCK_SIZE (16KB). */
	dnode_init(&os->phys->os_meta_dnode, DMU_OT_DNODE, DMU_OT_NONE, 0);
	os->phys->os_meta_dnode.dn_datablkszsec =
	    DNODE_BLOCK_SIZE >> MINBLOCKSHIFT;

	return (os);
}
|
||||
|
||||
/*
 * Write the dnode array and physical object set to disk.
 *
 * "c" is a cursor over the meta-dnode's indirect blocks, pre-allocated by
 * objset_write(); "loc" is the vdev offset reserved for the dnode array.
 * Frees the dnode chunks as they are written.
 */
static void
_objset_write(zfs_opt_t *zfs, zfs_objset_t *os, struct dnode_cursor *c,
    off_t loc)
{
	struct objset_dnode_chunk *chunk, *tmp;
	unsigned int total;

	/*
	 * Write out the dnode array, i.e., the meta-dnode.  For some reason its
	 * data blocks must be 16KB in size no matter how large the array is.
	 */
	total = 0;
	STAILQ_FOREACH_SAFE(chunk, &os->dnodechunks, next, tmp) {
		unsigned int i;

		assert(chunk->nextfree <= os->dnodecount);
		assert(chunk->nextfree <= DNODES_PER_CHUNK);

		for (i = 0; i < chunk->nextfree; i += DNODES_PER_BLOCK) {
			blkptr_t *bp;
			uint64_t fill;

			/*
			 * NOTE(review): for a partial final block this sets
			 * "fill" to the number of *unused* dnode slots rather
			 * than the number of used ones — confirm against the
			 * OpenZFS blk_fill convention for meta-dnode L0 blocks.
			 */
			if (chunk->nextfree - i < DNODES_PER_BLOCK)
				fill = DNODES_PER_BLOCK - (chunk->nextfree - i);
			else
				fill = 0;
			bp = dnode_cursor_next(zfs, c,
			    (total + i) * sizeof(dnode_phys_t));
			vdev_pwrite_dnode_indir(zfs, &os->phys->os_meta_dnode,
			    0, fill, chunk->buf + i, DNODE_BLOCK_SIZE, loc, bp);
			loc += DNODE_BLOCK_SIZE;
		}
		total += i;

		free(chunk);
	}
	dnode_cursor_finish(zfs, c);
	/* All chunks were freed above; reset the list to a valid empty state. */
	STAILQ_INIT(&os->dnodechunks);

	/*
	 * Write the object set itself.  The saved block pointer will be copied
	 * into the referencing DSL dataset or the uberblocks.
	 */
	vdev_pwrite_data(zfs, DMU_OT_OBJSET, ZIO_CHECKSUM_FLETCHER_4, 0, 1,
	    os->phys, os->osblksz, os->osloc, &os->osbp);
}
|
||||
|
||||
/*
 * Finalize an object set: allocate space for the dnode array, write any
 * pending space maps (MOS only), and write the array and objset block.
 */
void
objset_write(zfs_opt_t *zfs, zfs_objset_t *os)
{
	struct dnode_cursor *c;
	off_t dnodeloc, dnodesz;
	uint64_t dnodecount;

	/*
	 * There is a chicken-and-egg problem here when writing the MOS: we
	 * cannot write space maps before we're finished allocating space from
	 * the vdev, and we can't write the MOS without having allocated space
	 * for indirect dnode blocks.  Thus, rather than lazily allocating
	 * indirect blocks for the meta-dnode (which would be simpler), they are
	 * allocated up-front and before writing space maps.
	 */
	dnodecount = os->dnodecount;
	if (os == zfs->mos)
		dnodecount += zfs->mscount;	/* space map dnodes, added below */
	dnodesz = dnodecount * sizeof(dnode_phys_t);
	c = dnode_cursor_init(zfs, os, &os->phys->os_meta_dnode, dnodesz,
	    DNODE_BLOCK_SIZE);
	dnodesz = roundup2(dnodesz, DNODE_BLOCK_SIZE);
	dnodeloc = objset_space_alloc(zfs, os, &dnodesz);

	if (os == zfs->mos) {
		/* Allocates the space map dnodes counted above. */
		vdev_spacemap_write(zfs);

		/*
		 * We've finished allocating space, account for it in $MOS.
		 */
		dsl_dir_size_set(zfs->mosdsldir, os->space);
	}
	_objset_write(zfs, os, c, dnodeloc);
}
|
||||
|
||||
dnode_phys_t *
|
||||
objset_dnode_bonus_alloc(zfs_objset_t *os, uint8_t type, uint8_t bonustype,
|
||||
uint16_t bonuslen, uint64_t *idp)
|
||||
{
|
||||
struct objset_dnode_chunk *chunk;
|
||||
dnode_phys_t *dnode;
|
||||
|
||||
assert(bonuslen <= DN_OLD_MAX_BONUSLEN);
|
||||
assert(!STAILQ_EMPTY(&os->dnodechunks));
|
||||
|
||||
chunk = STAILQ_LAST(&os->dnodechunks, objset_dnode_chunk, next);
|
||||
if (chunk->nextfree == DNODES_PER_CHUNK) {
|
||||
chunk = ecalloc(1, sizeof(*chunk));
|
||||
STAILQ_INSERT_TAIL(&os->dnodechunks, chunk, next);
|
||||
}
|
||||
*idp = os->dnodecount++;
|
||||
dnode = &chunk->buf[chunk->nextfree++];
|
||||
dnode_init(dnode, type, bonustype, bonuslen);
|
||||
dnode->dn_datablkszsec = os->osblksz >> MINBLOCKSHIFT;
|
||||
return (dnode);
|
||||
}
|
||||
|
||||
/*
 * Allocate a dnode with no bonus buffer; see objset_dnode_bonus_alloc().
 */
dnode_phys_t *
objset_dnode_alloc(zfs_objset_t *os, uint8_t type, uint64_t *idp)
{
	return (objset_dnode_bonus_alloc(os, type, DMU_OT_NONE, 0, idp));
}
|
||||
|
||||
/*
|
||||
* Look up a physical dnode by ID. This is not used often so a linear search is
|
||||
* fine.
|
||||
*/
|
||||
dnode_phys_t *
|
||||
objset_dnode_lookup(zfs_objset_t *os, uint64_t id)
|
||||
{
|
||||
struct objset_dnode_chunk *chunk;
|
||||
|
||||
assert(id > 0);
|
||||
assert(id < os->dnodecount);
|
||||
|
||||
STAILQ_FOREACH(chunk, &os->dnodechunks, next) {
|
||||
if (id < DNODES_PER_CHUNK)
|
||||
return (&chunk->buf[id]);
|
||||
id -= DNODES_PER_CHUNK;
|
||||
}
|
||||
assert(0);
|
||||
return (NULL);
|
||||
}
|
||||
|
||||
off_t
|
||||
objset_space_alloc(zfs_opt_t *zfs, zfs_objset_t *os, off_t *lenp)
|
||||
{
|
||||
off_t loc;
|
||||
|
||||
loc = vdev_space_alloc(zfs, lenp);
|
||||
os->space += *lenp;
|
||||
return (loc);
|
||||
}
|
||||
|
||||
/*
 * Return the number of bytes of vdev space allocated to this object set.
 */
uint64_t
objset_space(const zfs_objset_t *os)
{
	return (os->space);
}
|
||||
|
||||
void
|
||||
objset_root_blkptr_copy(const zfs_objset_t *os, blkptr_t *bp)
|
||||
{
|
||||
memcpy(bp, &os->osbp, sizeof(blkptr_t));
|
||||
}
|
435
usr.sbin/makefs/zfs/vdev.c
Normal file
435
usr.sbin/makefs/zfs/vdev.c
Normal file
|
@ -0,0 +1,435 @@
|
|||
/*-
|
||||
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
|
||||
*
|
||||
* Copyright (c) 2022 The FreeBSD Foundation
|
||||
*
|
||||
* This software was developed by Mark Johnston under sponsorship from
|
||||
* the FreeBSD Foundation.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <fcntl.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <util.h>
|
||||
|
||||
#include "zfs.h"
|
||||
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wunused-function"
|
||||
#include "zfs/fletcher.c"
|
||||
#include "zfs/sha256.c"
|
||||
#pragma clang diagnostic pop
|
||||
|
||||
/*
 * Initialize a block pointer describing a single-DVA block on vdev 0.
 * Logical and physical sizes are equal since makefs never compresses.
 */
static void
blkptr_set(blkptr_t *bp, off_t off, off_t size, uint8_t dntype, uint8_t level,
    uint64_t fill, enum zio_checksum cksumt, zio_cksum_t *cksum)
{
	dva_t *dva;

	assert(powerof2(size));

	BP_ZERO(bp);
	BP_SET_LSIZE(bp, size);
	BP_SET_PSIZE(bp, size);
	BP_SET_CHECKSUM(bp, cksumt);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
	/* Everything is written in a single fixed transaction group. */
	BP_SET_BIRTH(bp, TXG, TXG);
	BP_SET_LEVEL(bp, level);
	BP_SET_FILL(bp, fill);
	BP_SET_TYPE(bp, dntype);

	dva = BP_IDENTITY(bp);
	DVA_SET_VDEV(dva, 0);
	DVA_SET_OFFSET(dva, off);
	DVA_SET_ASIZE(dva, size);
	memcpy(&bp->blk_cksum, cksum, sizeof(*cksum));
}
|
||||
|
||||
/*
 * Write a block of data to the vdev.  The offset is always relative to the end
 * of the second leading vdev label.
 *
 * Consumers should generally use the helpers below, which provide block
 * pointers and update dnode accounting, rather than calling this function
 * directly.
 */
static void
vdev_pwrite(const zfs_opt_t *zfs, const void *buf, size_t len, off_t off)
{
	ssize_t n;

	assert(off >= 0 && off < zfs->asize);
	assert(powerof2(len));
	assert((off_t)len > 0 && off + (off_t)len > off &&
	    off + (off_t)len < zfs->asize);
	if (zfs->spacemap != NULL) {
		/*
		 * Verify that the blocks being written were in fact allocated.
		 *
		 * The space map isn't available once the on-disk space map is
		 * finalized, so this check doesn't quite catch everything.
		 */
		assert(bit_ntest(zfs->spacemap, off >> zfs->ashift,
		    (off + len - 1) >> zfs->ashift, 1));
	}

	/* Translate to an absolute file offset past the leading labels. */
	off += VDEV_LABEL_START_SIZE;
	/* pwrite() may write fewer bytes than requested; loop until done. */
	for (size_t sofar = 0; sofar < len; sofar += n) {
		n = pwrite(zfs->fd, (const char *)buf + sofar, len - sofar,
		    off + sofar);
		if (n < 0)
			err(1, "pwrite");
		assert(n > 0);
	}
}
|
||||
|
||||
/*
 * Checksum and write a data block, filling in "bp" to describe it.  Only
 * fletcher4 checksums are supported.
 */
void
vdev_pwrite_data(zfs_opt_t *zfs, uint8_t datatype, uint8_t cksumtype,
    uint8_t level, uint64_t fill, const void *data, off_t sz, off_t loc,
    blkptr_t *bp)
{
	zio_cksum_t cksum;

	assert(cksumtype == ZIO_CHECKSUM_FLETCHER_4);

	fletcher_4_native(data, sz, NULL, &cksum);
	blkptr_set(bp, loc, sz, datatype, level, fill, cksumtype, &cksum);
	vdev_pwrite(zfs, data, sz, loc);
}
|
||||
|
||||
/*
 * Write a block (data or indirect) belonging to a dnode and charge the bytes
 * to the dnode's space accounting.
 */
void
vdev_pwrite_dnode_indir(zfs_opt_t *zfs, dnode_phys_t *dnode, uint8_t level,
    uint64_t fill, const void *data, off_t sz, off_t loc, blkptr_t *bp)
{
	vdev_pwrite_data(zfs, dnode->dn_type, dnode->dn_checksum, level, fill,
	    data, sz, loc, bp);

	/* dnode_init() always sets this flag, so dn_used counts bytes. */
	assert((dnode->dn_flags & DNODE_FLAG_USED_BYTES) != 0);
	dnode->dn_used += sz;
}
|
||||
|
||||
/*
 * Write a dnode's single level-0 data block via its first block pointer.
 */
void
vdev_pwrite_dnode_data(zfs_opt_t *zfs, dnode_phys_t *dnode, const void *data,
    off_t sz, off_t loc)
{
	vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, data, sz, loc,
	    &dnode->dn_blkptr[0]);
}
|
||||
|
||||
/*
 * Compute and embed the SHA-256 verifier checksum for a label region.  The
 * zio_eck_t trailer at the end of the buffer is seeded with the region's
 * on-disk offset before the checksum is computed over the whole region.
 */
static void
vdev_label_set_checksum(void *buf, off_t off, off_t size)
{
	zio_cksum_t cksum;
	zio_eck_t *eck;

	assert(size > 0 && (size_t)size >= sizeof(zio_eck_t));

	eck = (zio_eck_t *)((char *)buf + size) - 1;
	eck->zec_magic = ZEC_MAGIC;
	ZIO_SET_CHECKSUM(&eck->zec_cksum, off, 0, 0, 0);
	zio_checksum_SHA256(buf, size, NULL, &cksum);
	eck->zec_cksum = cksum;
}
|
||||
|
||||
/*
 * Set embedded checksums and write the label at the specified index.
 */
void
vdev_label_write(zfs_opt_t *zfs, int ind, const vdev_label_t *labelp)
{
	vdev_label_t *label;
	ssize_t n;
	off_t blksz, loff;

	assert(ind >= 0 && ind < VDEV_LABELS);

	/*
	 * Make a copy since we have to modify the label to set checksums.
	 */
	label = ecalloc(1, sizeof(*label));
	memcpy(label, labelp, sizeof(*label));

	/* Labels 0 and 1 are at the front of the vdev, 2 and 3 at the end. */
	if (ind < 2)
		loff = ind * sizeof(*label);
	else
		loff = zfs->vdevsize - (VDEV_LABELS - ind) * sizeof(*label);

	/*
	 * Set the verifier checksum for the boot block.  We don't use it, but
	 * the FreeBSD loader reads it and will complain if the checksum isn't
	 * valid.
	 */
	vdev_label_set_checksum(&label->vl_be,
	    loff + __offsetof(vdev_label_t, vl_be), sizeof(label->vl_be));

	/*
	 * Set the verifier checksum for the label.
	 */
	vdev_label_set_checksum(&label->vl_vdev_phys,
	    loff + __offsetof(vdev_label_t, vl_vdev_phys),
	    sizeof(label->vl_vdev_phys));

	/*
	 * Set the verifier checksum for the uberblocks.  There is one uberblock
	 * per sector; for example, with an ashift of 12 we end up with
	 * 128KB/4KB=32 copies of the uberblock in the ring.
	 */
	blksz = 1 << zfs->ashift;
	assert(sizeof(label->vl_uberblock) % blksz == 0);
	for (size_t roff = 0; roff < sizeof(label->vl_uberblock);
	    roff += blksz) {
		vdev_label_set_checksum(&label->vl_uberblock[0] + roff,
		    loff + __offsetof(vdev_label_t, vl_uberblock) + roff,
		    blksz);
	}

	/* Labels bypass vdev_pwrite(): loff is an absolute file offset. */
	n = pwrite(zfs->fd, label, sizeof(*label), loff);
	if (n < 0)
		err(1, "writing vdev label");
	assert(n == sizeof(*label));

	free(label);
}
|
||||
|
||||
/*
 * Find a chunk of contiguous free space of length *lenp, according to the
 * following rules:
 * 1. If the length is less than or equal to 128KB, the returned run's length
 *    will be the smallest power of 2 equal to or larger than the length.
 * 2. If the length is larger than 128KB, the returned run's length will be
 *    the smallest multiple of 128KB that is larger than the length.
 * 3. The returned run's length will be size-aligned up to 128KB.
 *
 * Returns the vdev byte offset of the run and updates *lenp to the rounded
 * length.  Exits on allocation failure.
 *
 * XXX-MJ the third rule isn't actually required, so this can just be a dumb
 * bump allocator.  Maybe there's some benefit to keeping large blocks aligned,
 * so let's keep it for now and hope we don't get too much fragmentation.
 * Alternately we could try to allocate all blocks of a certain size from the
 * same metaslab.
 */
off_t
vdev_space_alloc(zfs_opt_t *zfs, off_t *lenp)
{
	off_t len;
	int align, loc, minblksz, nbits;

	minblksz = 1 << zfs->ashift;
	len = roundup2(*lenp, minblksz);

	assert(len != 0);
	assert(len / minblksz <= INT_MAX);

	if (len < MAXBLOCKSIZE) {
		/* Round a non-power-of-2 length up to the next power of 2. */
		if ((len & (len - 1)) != 0)
			len = (off_t)1 << flsll(len);
		align = len / minblksz;
	} else {
		len = roundup2(len, MAXBLOCKSIZE);
		align = MAXBLOCKSIZE / minblksz;
	}

	/* Search the bitmap for an aligned run of "nbits" clear bits. */
	for (loc = 0, nbits = len / minblksz;; loc = roundup2(loc, align)) {
		bit_ffc_area_at(zfs->spacemap, loc, zfs->spacemapbits, nbits,
		    &loc);
		if (loc == -1) {
			errx(1, "failed to find %ju bytes of space",
			    (uintmax_t)len);
		}
		if ((loc & (align - 1)) == 0)
			break;
	}
	assert(loc + nbits > loc);
	bit_nset(zfs->spacemap, loc, loc + nbits - 1);
	*lenp = len;

	return ((off_t)loc << zfs->ashift);
}
|
||||
|
||||
/*
 * Allocate the in-memory allocation bitmap, one bit per (1 << ashift)-sized
 * block of the vdev, covering whole metaslabs only.
 */
static void
vdev_spacemap_init(zfs_opt_t *zfs)
{
	uint64_t nbits;

	assert(powerof2(zfs->mssize));

	nbits = rounddown2(zfs->asize, zfs->mssize) >> zfs->ashift;
	if (nbits > INT_MAX) {
		/*
		 * With the smallest block size of 512B, the limit on the image
		 * size is 2TB.  That should be enough for anyone.
		 */
		errx(1, "image size is too large");
	}
	zfs->spacemapbits = (int)nbits;
	zfs->spacemap = bit_alloc(zfs->spacemapbits);
	if (zfs->spacemap == NULL)
		err(1, "bitstring allocation failed");
}
|
||||
|
||||
/*
 * Convert the in-memory allocation bitmap into one on-disk space map per
 * metaslab, plus the object array that references them.  This finalizes
 * space allocation: zfs->spacemap is consumed and set to NULL.
 */
void
vdev_spacemap_write(zfs_opt_t *zfs)
{
	dnode_phys_t *objarr;
	bitstr_t *spacemap;
	uint64_t *objarrblk;
	off_t smblksz, objarrblksz, objarrloc;

	/* Per-metaslab space map bookkeeping. */
	struct {
		dnode_phys_t *dnode;
		uint64_t dnid;
		off_t loc;
	} *sma;

	objarrblksz = sizeof(uint64_t) * zfs->mscount;
	assert(objarrblksz <= MAXBLOCKSIZE);
	objarrloc = objset_space_alloc(zfs, zfs->mos, &objarrblksz);
	objarrblk = ecalloc(1, objarrblksz);

	objarr = objset_dnode_lookup(zfs->mos, zfs->objarrid);
	objarr->dn_datablkszsec = objarrblksz >> MINBLOCKSHIFT;

	/*
	 * Use the smallest block size for space maps.  The space allocation
	 * algorithm should aim to minimize the number of holes.
	 */
	smblksz = 1 << zfs->ashift;

	/*
	 * First allocate dnodes and space for all of our space maps.  No more
	 * space can be allocated from the vdev after this point.
	 */
	sma = ecalloc(zfs->mscount, sizeof(*sma));
	for (uint64_t i = 0; i < zfs->mscount; i++) {
		sma[i].dnode = objset_dnode_bonus_alloc(zfs->mos,
		    DMU_OT_SPACE_MAP, DMU_OT_SPACE_MAP_HEADER,
		    sizeof(space_map_phys_t), &sma[i].dnid);
		sma[i].loc = objset_space_alloc(zfs, zfs->mos, &smblksz);
	}
	/* Detach the bitmap so vdev_pwrite() stops validating against it. */
	spacemap = zfs->spacemap;
	zfs->spacemap = NULL;

	/*
	 * Now that the set of allocated space is finalized, populate each space
	 * map and write it to the vdev.
	 */
	for (uint64_t i = 0; i < zfs->mscount; i++) {
		space_map_phys_t *sm;
		uint64_t alloc, length, *smblk;
		int shift, startb, endb, srunb, erunb;

		/*
		 * We only allocate a single block for this space map, but
		 * OpenZFS assumes that a space map object with sufficient bonus
		 * space supports histograms.
		 */
		sma[i].dnode->dn_nblkptr = 3;
		sma[i].dnode->dn_datablkszsec = smblksz >> MINBLOCKSHIFT;

		smblk = ecalloc(1, smblksz);

		alloc = length = 0;
		shift = zfs->msshift - zfs->ashift;
		/* Walk bit runs within this metaslab's slice of the bitmap. */
		for (srunb = startb = i * (1 << shift),
		    endb = (i + 1) * (1 << shift);
		    srunb < endb; srunb = erunb) {
			uint64_t runlen, runoff;

			/* Find a run of allocated space. */
			bit_ffs_at(spacemap, srunb, zfs->spacemapbits, &srunb);
			if (srunb == -1 || srunb >= endb)
				break;

			bit_ffc_at(spacemap, srunb, zfs->spacemapbits, &erunb);
			if (erunb == -1 || erunb > endb)
				erunb = endb;

			/*
			 * The space represented by [srunb, erunb) has been
			 * allocated.  Add a record to the space map to indicate
			 * this.  Run offsets are relative to the beginning of
			 * the metaslab.
			 */
			runlen = erunb - srunb;
			runoff = srunb - startb;

			/* Each record is a two-word SM2 entry. */
			assert(length * sizeof(uint64_t) < (uint64_t)smblksz);
			smblk[length] = SM_PREFIX_ENCODE(SM2_PREFIX) |
			    SM2_RUN_ENCODE(runlen) | SM2_VDEV_ENCODE(0);
			smblk[length + 1] = SM2_TYPE_ENCODE(SM_ALLOC) |
			    SM2_OFFSET_ENCODE(runoff);

			alloc += runlen << zfs->ashift;
			length += 2;
		}

		/* Fill the space map header stored in the bonus buffer. */
		sm = DN_BONUS(sma[i].dnode);
		sm->smp_length = length * sizeof(uint64_t);
		sm->smp_alloc = alloc;

		vdev_pwrite_dnode_data(zfs, sma[i].dnode, smblk, smblksz,
		    sma[i].loc);
		free(smblk);

		/* Record this space map in the space map object array. */
		objarrblk[i] = sma[i].dnid;
	}

	/*
	 * All of the space maps are written, now write the object array.
	 */
	vdev_pwrite_dnode_data(zfs, objarr, objarrblk, objarrblksz, objarrloc);
	free(objarrblk);

	assert(zfs->spacemap == NULL);
	free(spacemap);
	free(sma);
}
|
||||
|
||||
void
|
||||
vdev_init(zfs_opt_t *zfs, const char *image)
|
||||
{
|
||||
assert(zfs->ashift >= MINBLOCKSHIFT);
|
||||
|
||||
zfs->fd = open(image, O_RDWR | O_CREAT | O_TRUNC, 0644);
|
||||
if (zfs->fd == -1)
|
||||
err(1, "Can't open `%s' for writing", image);
|
||||
if (ftruncate(zfs->fd, zfs->vdevsize) != 0)
|
||||
err(1, "Failed to extend image file `%s'", image);
|
||||
|
||||
vdev_spacemap_init(zfs);
|
||||
}
|
||||
|
||||
void
|
||||
vdev_fini(zfs_opt_t *zfs)
|
||||
{
|
||||
assert(zfs->spacemap == NULL);
|
||||
|
||||
if (zfs->fd != -1) {
|
||||
if (close(zfs->fd) != 0)
|
||||
err(1, "close");
|
||||
zfs->fd = -1;
|
||||
}
|
||||
}
|
551
usr.sbin/makefs/zfs/zap.c
Normal file
551
usr.sbin/makefs/zfs/zap.c
Normal file
|
@ -0,0 +1,551 @@
|
|||
/*-
|
||||
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
|
||||
*
|
||||
* Copyright (c) 2022 The FreeBSD Foundation
|
||||
*
|
||||
* This software was developed by Mark Johnston under sponsorship from
|
||||
* the FreeBSD Foundation.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/endian.h>
|
||||
|
||||
#include <assert.h>
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <util.h>
|
||||
|
||||
#include "makefs.h"
|
||||
#include "zfs.h"
|
||||
|
||||
/*
 * One key-value pair in a ZAP object under construction.  The value is an
 * array of "intcnt" integers, each "intsz" bytes wide; for the common case of
 * a single uint64_t value, "val64" is used and "val64p" points at it,
 * avoiding a separate allocation.
 */
typedef struct zfs_zap_entry {
	char		*name;		/* entry key, private copy */
	uint64_t	hash;		/* key hash */
	union {
		uint8_t		*valp;
		uint16_t	*val16p;
		uint32_t	*val32p;
		uint64_t	*val64p;
	};				/* entry value, an integer array */
	uint64_t	val64;		/* embedded value for a common case */
	size_t		intsz;		/* array element size; 1, 2, 4 or 8 */
	size_t		intcnt;		/* array size */
	STAILQ_ENTRY(zfs_zap_entry) next;
} zfs_zap_entry_t;
|
||||
|
||||
/*
 * An in-memory ZAP (name/value attribute object) being accumulated before
 * being serialized to disk as either a micro ZAP or a fat ZAP.
 */
struct zfs_zap {
	STAILQ_HEAD(, zfs_zap_entry) kvps;
	uint64_t	hashsalt;	/* key hash input */
	unsigned long	kvpcnt;		/* number of key-value pairs */
	unsigned long	chunks;		/* count of chunks needed for fat ZAP */
	bool		micro;		/* can this be a micro ZAP? */

	dnode_phys_t	*dnode;		/* backpointer */
	zfs_objset_t	*os;		/* backpointer */
};
|
||||
|
||||
static uint16_t
|
||||
zap_entry_chunks(zfs_zap_entry_t *ent)
|
||||
{
|
||||
return (1 + howmany(strlen(ent->name) + 1, ZAP_LEAF_ARRAY_BYTES) +
|
||||
howmany(ent->intsz * ent->intcnt, ZAP_LEAF_ARRAY_BYTES));
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
zap_hash(uint64_t salt, const char *name)
|
||||
{
|
||||
static uint64_t crc64_table[256];
|
||||
const uint64_t crc64_poly = 0xC96C5795D7870F42UL;
|
||||
const uint8_t *cp;
|
||||
uint64_t crc;
|
||||
uint8_t c;
|
||||
|
||||
assert(salt != 0);
|
||||
if (crc64_table[128] == 0) {
|
||||
for (int i = 0; i < 256; i++) {
|
||||
uint64_t *t;
|
||||
|
||||
t = crc64_table + i;
|
||||
*t = i;
|
||||
for (int j = 8; j > 0; j--)
|
||||
*t = (*t >> 1) ^ (-(*t & 1) & crc64_poly);
|
||||
}
|
||||
}
|
||||
assert(crc64_table[128] == crc64_poly);
|
||||
|
||||
for (cp = (const uint8_t *)name, crc = salt; (c = *cp) != '\0'; cp++)
|
||||
crc = (crc >> 8) ^ crc64_table[(crc ^ c) & 0xFF];
|
||||
|
||||
/*
|
||||
* Only use 28 bits, since we need 4 bits in the cookie for the
|
||||
* collision differentiator. We MUST use the high bits, since
|
||||
* those are the ones that we first pay attention to when
|
||||
* choosing the bucket.
|
||||
*/
|
||||
crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
|
||||
|
||||
return (crc);
|
||||
}
|
||||
|
||||
zfs_zap_t *
|
||||
zap_alloc(zfs_objset_t *os, dnode_phys_t *dnode)
|
||||
{
|
||||
zfs_zap_t *zap;
|
||||
|
||||
zap = ecalloc(1, sizeof(*zap));
|
||||
STAILQ_INIT(&zap->kvps);
|
||||
zap->hashsalt = ((uint64_t)random() << 32) | random();
|
||||
zap->micro = true;
|
||||
zap->kvpcnt = 0;
|
||||
zap->chunks = 0;
|
||||
zap->dnode = dnode;
|
||||
zap->os = os;
|
||||
return (zap);
|
||||
}
|
||||
|
||||
void
|
||||
zap_add(zfs_zap_t *zap, const char *name, size_t intsz, size_t intcnt,
|
||||
const uint8_t *val)
|
||||
{
|
||||
zfs_zap_entry_t *ent;
|
||||
|
||||
assert(intsz == 1 || intsz == 2 || intsz == 4 || intsz == 8);
|
||||
assert(strlen(name) + 1 <= ZAP_MAXNAMELEN);
|
||||
assert(intcnt <= ZAP_MAXVALUELEN && intcnt * intsz <= ZAP_MAXVALUELEN);
|
||||
|
||||
ent = ecalloc(1, sizeof(*ent));
|
||||
ent->name = estrdup(name);
|
||||
ent->hash = zap_hash(zap->hashsalt, ent->name);
|
||||
ent->intsz = intsz;
|
||||
ent->intcnt = intcnt;
|
||||
if (intsz == sizeof(uint64_t) && intcnt == 1) {
|
||||
/*
|
||||
* Micro-optimization to elide a memory allocation in that most
|
||||
* common case where this is a directory entry.
|
||||
*/
|
||||
ent->val64p = &ent->val64;
|
||||
} else {
|
||||
ent->valp = ecalloc(intcnt, intsz);
|
||||
}
|
||||
memcpy(ent->valp, val, intcnt * intsz);
|
||||
zap->kvpcnt++;
|
||||
zap->chunks += zap_entry_chunks(ent);
|
||||
STAILQ_INSERT_TAIL(&zap->kvps, ent, next);
|
||||
|
||||
if (zap->micro && (intcnt != 1 || intsz != sizeof(uint64_t) ||
|
||||
strlen(name) + 1 > MZAP_NAME_LEN || zap->kvpcnt > MZAP_ENT_MAX))
|
||||
zap->micro = false;
|
||||
}
|
||||
|
||||
void
|
||||
zap_add_uint64(zfs_zap_t *zap, const char *name, uint64_t val)
|
||||
{
|
||||
zap_add(zap, name, sizeof(uint64_t), 1, (uint8_t *)&val);
|
||||
}
|
||||
|
||||
void
|
||||
zap_add_string(zfs_zap_t *zap, const char *name, const char *val)
|
||||
{
|
||||
zap_add(zap, name, 1, strlen(val) + 1, val);
|
||||
}
|
||||
|
||||
bool
|
||||
zap_entry_exists(zfs_zap_t *zap, const char *name)
|
||||
{
|
||||
zfs_zap_entry_t *ent;
|
||||
|
||||
STAILQ_FOREACH(ent, &zap->kvps, next) {
|
||||
if (strcmp(ent->name, name) == 0)
|
||||
return (true);
|
||||
}
|
||||
return (false);
|
||||
}
|
||||
|
||||
/*
 * Serialize the ZAP as a micro ZAP: a single block holding an array of
 * fixed-size name/value records.  The caller must have verified that every
 * entry fits the micro ZAP constraints (zap->micro is true).
 */
static void
zap_micro_write(zfs_opt_t *zfs, zfs_zap_t *zap)
{
	dnode_phys_t *dnode;
	zfs_zap_entry_t *ent;
	mzap_phys_t *mzap;
	mzap_ent_phys_t *ment;
	off_t bytes, loc;

	memset(zfs->filebuf, 0, sizeof(zfs->filebuf));
	mzap = (mzap_phys_t *)&zfs->filebuf[0];
	mzap->mz_block_type = ZBT_MICRO;
	mzap->mz_salt = zap->hashsalt;
	mzap->mz_normflags = 0;

	/*
	 * "kvpcnt - 1" because sizeof(*mzap) already accounts for the first
	 * chunk (mz_chunk[0] is filled in below).
	 */
	bytes = sizeof(*mzap) + (zap->kvpcnt - 1) * sizeof(*ment);
	assert(bytes <= (off_t)MZAP_MAX_BLKSZ);

	ment = &mzap->mz_chunk[0];
	STAILQ_FOREACH(ent, &zap->kvps, next) {
		memcpy(&ment->mze_value, ent->valp, ent->intsz * ent->intcnt);
		ment->mze_cd = 0; /* XXX-MJ */
		strlcpy(ment->mze_name, ent->name, sizeof(ment->mze_name));
		ment++;
	}

	/* objset_space_alloc() takes &bytes and may update the size. */
	loc = objset_space_alloc(zfs, zap->os, &bytes);

	dnode = zap->dnode;
	dnode->dn_maxblkid = 0;
	dnode->dn_datablkszsec = bytes >> MINBLOCKSHIFT;
	dnode->dn_flags = DNODE_FLAG_USED_BYTES;

	vdev_pwrite_dnode_data(zfs, dnode, zfs->filebuf, bytes, loc);
}
|
||||
|
||||
/*
|
||||
* Write some data to the fat ZAP leaf chunk starting at index "li".
|
||||
*
|
||||
* Note that individual integers in the value may be split among consecutive
|
||||
* leaves.
|
||||
*/
|
||||
static void
|
||||
zap_fat_write_array_chunk(zap_leaf_t *l, uint16_t li, size_t sz,
|
||||
const uint8_t *val)
|
||||
{
|
||||
struct zap_leaf_array *la;
|
||||
|
||||
assert(sz <= ZAP_MAXVALUELEN);
|
||||
|
||||
for (uint16_t n, resid = sz; resid > 0; resid -= n, val += n, li++) {
|
||||
n = MIN(resid, ZAP_LEAF_ARRAY_BYTES);
|
||||
|
||||
la = &ZAP_LEAF_CHUNK(l, li).l_array;
|
||||
assert(la->la_type == ZAP_CHUNK_FREE);
|
||||
la->la_type = ZAP_CHUNK_ARRAY;
|
||||
memcpy(la->la_array, val, n);
|
||||
la->la_next = li + 1;
|
||||
}
|
||||
la->la_next = 0xffff;
|
||||
}
|
||||
|
||||
/*
 * Find the shortest hash prefix length which lets us distribute keys without
 * overflowing a leaf block.  This is not (space) optimal, but is simple, and
 * directories large enough to overflow a single 128KB leaf block are uncommon.
 */
static unsigned int
zap_fat_write_prefixlen(zfs_zap_t *zap, zap_leaf_t *l)
{
	zfs_zap_entry_t *ent;
	unsigned int prefixlen;

	if (zap->chunks <= ZAP_LEAF_NUMCHUNKS(l)) {
		/*
		 * All chunks will fit in a single leaf block.
		 */
		return (0);
	}

	/*
	 * Trial-fit: for each candidate prefix length, simulate distributing
	 * every entry's chunks into the 2^prefixlen leaves and see whether
	 * any leaf overflows.
	 */
	for (prefixlen = 1; prefixlen < (unsigned int)l->l_bs; prefixlen++) {
		uint32_t *leafchunks;

		/* Per-leaf chunk usage counters for this trial. */
		leafchunks = ecalloc(1u << prefixlen, sizeof(*leafchunks));
		STAILQ_FOREACH(ent, &zap->kvps, next) {
			uint64_t li;
			uint16_t chunks;

			li = ZAP_HASH_IDX(ent->hash, prefixlen);

			chunks = zap_entry_chunks(ent);
			if (ZAP_LEAF_NUMCHUNKS(l) - leafchunks[li] < chunks) {
				/*
				 * Not enough space, grow the prefix and retry.
				 */
				break;
			}
			leafchunks[li] += chunks;
		}
		free(leafchunks);

		/* ent == NULL means the FOREACH ran to completion. */
		if (ent == NULL) {
			/*
			 * Everything fits, we're done.
			 */
			break;
		}
	}

	/*
	 * If this fails, then we need to expand the pointer table.  For now
	 * this situation is unhandled since it is hard to trigger.
	 */
	assert(prefixlen < (unsigned int)l->l_bs);

	return (prefixlen);
}
|
||||
|
||||
/*
|
||||
* Initialize a fat ZAP leaf block.
|
||||
*/
|
||||
static void
|
||||
zap_fat_write_leaf_init(zap_leaf_t *l, uint64_t prefix, int prefixlen)
|
||||
{
|
||||
zap_leaf_phys_t *leaf;
|
||||
|
||||
leaf = l->l_phys;
|
||||
|
||||
leaf->l_hdr.lh_block_type = ZBT_LEAF;
|
||||
leaf->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
|
||||
leaf->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
|
||||
leaf->l_hdr.lh_prefix = prefix;
|
||||
leaf->l_hdr.lh_prefix_len = prefixlen;
|
||||
|
||||
/* Initialize the leaf hash table. */
|
||||
assert(leaf->l_hdr.lh_nfree < 0xffff);
|
||||
memset(leaf->l_hash, 0xff,
|
||||
ZAP_LEAF_HASH_NUMENTRIES(l) * sizeof(*leaf->l_hash));
|
||||
|
||||
/* Initialize the leaf chunks. */
|
||||
for (uint16_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
|
||||
struct zap_leaf_free *lf;
|
||||
|
||||
lf = &ZAP_LEAF_CHUNK(l, i).l_free;
|
||||
lf->lf_type = ZAP_CHUNK_FREE;
|
||||
if (i + 1 == ZAP_LEAF_NUMCHUNKS(l))
|
||||
lf->lf_next = 0xffff;
|
||||
else
|
||||
lf->lf_next = i + 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Serialize the ZAP as a fat ZAP: a header block with an embedded pointer
 * table, followed by one or more leaf blocks holding the entries.  Entries
 * are bucketed by the high bits of their key hashes.
 */
static void
zap_fat_write(zfs_opt_t *zfs, zfs_zap_t *zap)
{
	struct dnode_cursor *c;
	zap_leaf_t l;
	zap_phys_t *zaphdr;
	struct zap_table_phys *zt;
	zfs_zap_entry_t *ent;
	dnode_phys_t *dnode;
	uint8_t *leafblks;
	uint64_t lblkcnt, *ptrhasht;
	off_t loc, blksz;
	size_t blkshift;
	unsigned int prefixlen;
	int ptrcnt;

	/*
	 * For simplicity, always use the largest block size.  This should be ok
	 * since most directories will be micro ZAPs, but it's space inefficient
	 * for small ZAPs and might need to be revisited.
	 */
	blkshift = MAXBLOCKSHIFT;
	blksz = (off_t)1 << blkshift;

	/*
	 * Embedded pointer tables give up to 8192 entries.  This ought to be
	 * enough for anything except massive directories.
	 */
	ptrcnt = (blksz / 2) / sizeof(uint64_t);

	memset(zfs->filebuf, 0, sizeof(zfs->filebuf));
	zaphdr = (zap_phys_t *)&zfs->filebuf[0];
	zaphdr->zap_block_type = ZBT_HEADER;
	zaphdr->zap_magic = ZAP_MAGIC;
	zaphdr->zap_num_entries = zap->kvpcnt;
	zaphdr->zap_salt = zap->hashsalt;

	l.l_bs = blkshift;
	l.l_phys = NULL;

	/* The pointer table is embedded in the header block (zt_numblks == 0). */
	zt = &zaphdr->zap_ptrtbl;
	zt->zt_blk = 0;
	zt->zt_numblks = 0;
	zt->zt_shift = flsll(ptrcnt) - 1;
	zt->zt_nextblk = 0;
	zt->zt_blks_copied = 0;

	/*
	 * How many leaf blocks do we need?  Initialize them and update the
	 * header.
	 */
	prefixlen = zap_fat_write_prefixlen(zap, &l);
	lblkcnt = 1 << prefixlen;
	leafblks = ecalloc(lblkcnt, blksz);
	for (unsigned int li = 0; li < lblkcnt; li++) {
		l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz);
		zap_fat_write_leaf_init(&l, li, prefixlen);
	}
	zaphdr->zap_num_leafs = lblkcnt;
	zaphdr->zap_freeblk = lblkcnt + 1;

	/*
	 * For each entry, figure out which leaf block it belongs to based on
	 * the upper bits of its hash, allocate chunks from that leaf, and fill
	 * them out.
	 */
	ptrhasht = (uint64_t *)(&zfs->filebuf[0] + blksz / 2);
	STAILQ_FOREACH(ent, &zap->kvps, next) {
		struct zap_leaf_entry *le;
		uint16_t *lptr;
		uint64_t hi, li;
		uint16_t namelen, nchunks, nnamechunks, nvalchunks;

		/* Pointer-table index and leaf index for this entry's hash. */
		hi = ZAP_HASH_IDX(ent->hash, zt->zt_shift);
		li = ZAP_HASH_IDX(ent->hash, prefixlen);
		assert(ptrhasht[hi] == 0 || ptrhasht[hi] == li + 1);
		ptrhasht[hi] = li + 1;
		l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz);

		namelen = strlen(ent->name) + 1;

		/*
		 * How many leaf chunks do we need for this entry?
		 */
		nnamechunks = howmany(namelen, ZAP_LEAF_ARRAY_BYTES);
		nvalchunks = howmany(ent->intcnt,
		    ZAP_LEAF_ARRAY_BYTES / ent->intsz);
		nchunks = 1 + nnamechunks + nvalchunks;

		/*
		 * Allocate a run of free leaf chunks for this entry,
		 * potentially extending a hash chain.
		 */
		assert(l.l_phys->l_hdr.lh_nfree >= nchunks);
		l.l_phys->l_hdr.lh_nfree -= nchunks;
		l.l_phys->l_hdr.lh_nentries++;
		/* Walk to the end of the bucket's chain, bumping collision
		 * differentiators (le_cd) along the way. */
		lptr = ZAP_LEAF_HASH_ENTPTR(&l, ent->hash);
		while (*lptr != 0xffff) {
			assert(*lptr < ZAP_LEAF_NUMCHUNKS(&l));
			le = ZAP_LEAF_ENTRY(&l, *lptr);
			assert(le->le_type == ZAP_CHUNK_ENTRY);
			le->le_cd++;
			lptr = &le->le_next;
		}
		*lptr = l.l_phys->l_hdr.lh_freelist;
		l.l_phys->l_hdr.lh_freelist += nchunks;
		assert(l.l_phys->l_hdr.lh_freelist <=
		    ZAP_LEAF_NUMCHUNKS(&l));
		if (l.l_phys->l_hdr.lh_freelist ==
		    ZAP_LEAF_NUMCHUNKS(&l))
			l.l_phys->l_hdr.lh_freelist = 0xffff;

		/*
		 * Integer values must be stored in big-endian format.
		 */
		switch (ent->intsz) {
		case 1:
			break;
		case 2:
			for (uint16_t *v = ent->val16p;
			    v - ent->val16p < (ptrdiff_t)ent->intcnt;
			    v++)
				*v = htobe16(*v);
			break;
		case 4:
			for (uint32_t *v = ent->val32p;
			    v - ent->val32p < (ptrdiff_t)ent->intcnt;
			    v++)
				*v = htobe32(*v);
			break;
		case 8:
			for (uint64_t *v = ent->val64p;
			    v - ent->val64p < (ptrdiff_t)ent->intcnt;
			    v++)
				*v = htobe64(*v);
			break;
		default:
			assert(0);
		}

		/*
		 * Finally, write out the leaf chunks for this entry.
		 */
		le = ZAP_LEAF_ENTRY(&l, *lptr);
		assert(le->le_type == ZAP_CHUNK_FREE);
		le->le_type = ZAP_CHUNK_ENTRY;
		le->le_next = 0xffff;
		le->le_name_chunk = *lptr + 1;
		le->le_name_numints = namelen;
		le->le_value_chunk = *lptr + 1 + nnamechunks;
		le->le_value_intlen = ent->intsz;
		le->le_value_numints = ent->intcnt;
		le->le_hash = ent->hash;
		zap_fat_write_array_chunk(&l, *lptr + 1, namelen, ent->name);
		zap_fat_write_array_chunk(&l, *lptr + 1 + nnamechunks,
		    ent->intcnt * ent->intsz, ent->valp);
	}

	/*
	 * Initialize unused slots of the pointer table.
	 */
	for (int i = 0; i < ptrcnt; i++)
		if (ptrhasht[i] == 0)
			ptrhasht[i] = (i >> (zt->zt_shift - prefixlen)) + 1;

	/*
	 * Write the whole thing to disk.
	 */
	dnode = zap->dnode;
	dnode->dn_nblkptr = 1;
	dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;
	dnode->dn_maxblkid = lblkcnt + 1;
	dnode->dn_flags = DNODE_FLAG_USED_BYTES;

	c = dnode_cursor_init(zfs, zap->os, zap->dnode,
	    (lblkcnt + 1) * blksz, blksz);

	/* Block 0 is the ZAP header + embedded pointer table. */
	loc = objset_space_alloc(zfs, zap->os, &blksz);
	vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, zfs->filebuf, blksz, loc,
	    dnode_cursor_next(zfs, c, 0));

	/* Blocks 1..lblkcnt are the leaves. */
	for (uint64_t i = 0; i < lblkcnt; i++) {
		loc = objset_space_alloc(zfs, zap->os, &blksz);
		vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, leafblks + i * blksz,
		    blksz, loc, dnode_cursor_next(zfs, c, (i + 1) * blksz));
	}

	dnode_cursor_finish(zfs, c);

	free(leafblks);
}
|
||||
|
||||
void
|
||||
zap_write(zfs_opt_t *zfs, zfs_zap_t *zap)
|
||||
{
|
||||
zfs_zap_entry_t *ent;
|
||||
|
||||
if (zap->micro) {
|
||||
zap_micro_write(zfs, zap);
|
||||
} else {
|
||||
assert(!STAILQ_EMPTY(&zap->kvps));
|
||||
assert(zap->kvpcnt > 0);
|
||||
zap_fat_write(zfs, zap);
|
||||
}
|
||||
|
||||
while ((ent = STAILQ_FIRST(&zap->kvps)) != NULL) {
|
||||
STAILQ_REMOVE_HEAD(&zap->kvps, next);
|
||||
if (ent->val64p != &ent->val64)
|
||||
free(ent->valp);
|
||||
free(ent->name);
|
||||
free(ent);
|
||||
}
|
||||
free(zap);
|
||||
}
|
167
usr.sbin/makefs/zfs/zfs.h
Normal file
167
usr.sbin/makefs/zfs/zfs.h
Normal file
|
@ -0,0 +1,167 @@
|
|||
/*-
|
||||
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
|
||||
*
|
||||
* Copyright (c) 2022 The FreeBSD Foundation
|
||||
*
|
||||
* This software was developed by Mark Johnston under sponsorship from
|
||||
* the FreeBSD Foundation.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef _MAKEFS_ZFS_H_
#define	_MAKEFS_ZFS_H_

#include <sys/types.h>
#include <sys/queue.h>

#include <bitstring.h>
#include <stdbool.h>

#include "makefs.h"

#include "zfs/nvlist.h"
#define	ASSERT	assert
#include "zfs/zfsimpl.h"

#define	MAXBLOCKSHIFT	17	/* 128KB */
#define	MAXBLOCKSIZE	((off_t)(1 << MAXBLOCKSHIFT))
_Static_assert(MAXBLOCKSIZE == SPA_OLDMAXBLOCKSIZE, "");
#define	MINBLOCKSHIFT	9	/* 512B */
#define	MINBLOCKSIZE	((off_t)(1 << MINBLOCKSHIFT))
_Static_assert(MINBLOCKSIZE == SPA_MINBLOCKSIZE, "");
#define	MINDEVSIZE	((off_t)SPA_MINDEVSIZE)

/* All data was written in this transaction group. */
#define	TXG	4

typedef struct zfs_dsl_dataset zfs_dsl_dataset_t;
typedef struct zfs_dsl_dir zfs_dsl_dir_t;
typedef struct zfs_objset zfs_objset_t;
typedef struct zfs_zap zfs_zap_t;

/* A dataset described on the command line ("-o fs=<params>"). */
struct dataset_desc {
	char		*params;
	STAILQ_ENTRY(dataset_desc) next;
};

/*
 * Global state for a single makefs ZFS run: pool parameters parsed from the
 * command line plus the in-progress on-disk state of the pool being built.
 */
typedef struct {
	bool		nowarn;

	/* I/O buffer, just for convenience. */
	char		filebuf[MAXBLOCKSIZE];

	/* Pool parameters. */
	const char	*poolname;
	char		*rootpath;	/* implicit mount point prefix */
	char		*bootfs;	/* bootable dataset, pool property */
	int		ashift;		/* vdev block size */
	uint64_t	mssize;		/* metaslab size */
	STAILQ_HEAD(, dataset_desc) datasetdescs; /* non-root dataset descrs */

	/* Pool state. */
	uint64_t	poolguid;	/* pool and root vdev GUID */
	zfs_zap_t	*poolprops;

	/* MOS state. */
	zfs_objset_t	*mos;		/* meta object set */
	uint64_t	objarrid;	/* space map object array */

	/* DSL state. */
	zfs_dsl_dir_t	*rootdsldir;	/* root DSL directory */
	zfs_dsl_dataset_t *rootds;
	zfs_dsl_dir_t	*origindsldir;	/* $ORIGIN */
	zfs_dsl_dataset_t *originds;
	zfs_dsl_dataset_t *snapds;
	zfs_zap_t	*cloneszap;
	zfs_dsl_dir_t	*freedsldir;	/* $FREE */
	zfs_dsl_dir_t	*mosdsldir;	/* $MOS */

	/* vdev state. */
	int		fd;		/* vdev disk fd */
	uint64_t	vdevguid;	/* disk vdev GUID */
	off_t		vdevsize;	/* vdev size, including labels */
	off_t		asize;		/* vdev size, excluding labels */
	bitstr_t	*spacemap;	/* space allocation tracking */
	int		spacemapbits;	/* one bit per ashift-sized block */
	uint64_t	msshift;	/* log2(metaslab size) */
	uint64_t	mscount;	/* number of metaslabs for this vdev */
} zfs_opt_t;

/* dsl.c */
void dsl_init(zfs_opt_t *);
const char *dsl_dir_fullname(const zfs_dsl_dir_t *);
uint64_t dsl_dir_id(zfs_dsl_dir_t *);
uint64_t dsl_dir_dataset_id(zfs_dsl_dir_t *);
void dsl_dir_foreach(zfs_opt_t *, zfs_dsl_dir_t *,
    void (*)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *);
int dsl_dir_get_canmount(zfs_dsl_dir_t *, uint64_t *);
char *dsl_dir_get_mountpoint(zfs_opt_t *, zfs_dsl_dir_t *);
bool dsl_dir_has_dataset(zfs_dsl_dir_t *);
bool dsl_dir_dataset_has_objset(zfs_dsl_dir_t *);
void dsl_dir_dataset_write(zfs_opt_t *, zfs_objset_t *, zfs_dsl_dir_t *);
void dsl_dir_size_set(zfs_dsl_dir_t *, uint64_t);
void dsl_write(zfs_opt_t *);

/* fs.c */
void fs_build(zfs_opt_t *, int, fsnode *);

/* objset.c */
zfs_objset_t *objset_alloc(zfs_opt_t *zfs, uint64_t type);
off_t objset_space_alloc(zfs_opt_t *, zfs_objset_t *, off_t *);
dnode_phys_t *objset_dnode_alloc(zfs_objset_t *, uint8_t, uint64_t *);
dnode_phys_t *objset_dnode_bonus_alloc(zfs_objset_t *, uint8_t, uint8_t,
    uint16_t, uint64_t *);
dnode_phys_t *objset_dnode_lookup(zfs_objset_t *, uint64_t);
void objset_root_blkptr_copy(const zfs_objset_t *, blkptr_t *);
uint64_t objset_space(const zfs_objset_t *);
void objset_write(zfs_opt_t *zfs, zfs_objset_t *os);

/* vdev.c */
void vdev_init(zfs_opt_t *, const char *);
off_t vdev_space_alloc(zfs_opt_t *zfs, off_t *lenp);
void vdev_pwrite_data(zfs_opt_t *zfs, uint8_t datatype, uint8_t cksumtype,
    uint8_t level, uint64_t fill, const void *data, off_t sz, off_t loc,
    blkptr_t *bp);
void vdev_pwrite_dnode_indir(zfs_opt_t *zfs, dnode_phys_t *dnode, uint8_t level,
    uint64_t fill, const void *data, off_t sz, off_t loc, blkptr_t *bp);
void vdev_pwrite_dnode_data(zfs_opt_t *zfs, dnode_phys_t *dnode, const void *data,
    off_t sz, off_t loc);
void vdev_label_write(zfs_opt_t *zfs, int ind, const vdev_label_t *labelp);
void vdev_spacemap_write(zfs_opt_t *);
void vdev_fini(zfs_opt_t *zfs);

/* zap.c */
zfs_zap_t *zap_alloc(zfs_objset_t *, dnode_phys_t *);
void zap_add(zfs_zap_t *, const char *, size_t, size_t, const uint8_t *);
void zap_add_uint64(zfs_zap_t *, const char *, uint64_t);
void zap_add_string(zfs_zap_t *, const char *, const char *);
bool zap_entry_exists(zfs_zap_t *, const char *);
void zap_write(zfs_opt_t *, zfs_zap_t *);

/* zfs.c */
struct dnode_cursor *dnode_cursor_init(zfs_opt_t *, zfs_objset_t *,
    dnode_phys_t *, off_t, off_t);
blkptr_t *dnode_cursor_next(zfs_opt_t *, struct dnode_cursor *, off_t);
void dnode_cursor_finish(zfs_opt_t *, struct dnode_cursor *);

#endif /* !_MAKEFS_ZFS_H_ */
|
Loading…
Reference in a new issue