bcachefs: Freespace, need_discard btrees

This adds two new btrees for the upcoming allocator rewrite: an extents
btree of free buckets, and a btree for buckets awaiting discards.

We also add a new trigger for alloc keys to keep the new btrees up to
date, and a compatibility path to initialize them on existing
filesystems.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
commit c6b2826cd1, parent 3d48a7f85f
Kent Overstreet, 2021-12-11 17:13:09 -05:00; committed by Kent Overstreet
16 changed files with 377 additions and 54 deletions
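
An illustrative aside: the freespace btree's key position is not just the bucket number; the top four bits of the bucket's "gc gen" (gen minus oldest_gen) are packed into the high bits of the offset. A minimal standalone sketch of that encoding, mirroring alloc_freespace_genbits() and alloc_freespace_pos() from the diff below (not bcachefs code):

#include <stdint.h>
#include <stdio.h>

static uint64_t freespace_genbits(uint8_t gen, uint8_t oldest_gen)
{
	uint8_t gc_gen = gen - oldest_gen;	/* u8 arithmetic wraps, as in the kernel */

	return ((uint64_t) gc_gen >> 4) << 56;	/* top 4 bits -> bits 56..59 */
}

int main(void)
{
	uint64_t bucket = 1234;
	uint8_t  gen = 0x37, oldest = 0x10;

	/* gc_gen = 0x27, top four bits = 0x2 -> 0x02000000000004d2 */
	printf("freespace offset: 0x%016llx\n",
	       (unsigned long long) (bucket | freespace_genbits(gen, oldest)));
	return 0;
}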


@@ -14,6 +14,7 @@
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "lru.h"
#include "recovery.h"
#include "trace.h"
#include "varint.h"
@@ -41,6 +42,15 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#undef x
};
const char * const bch2_bucket_states[] = {
"free",
"need gc gens",
"need discard",
"cached",
"dirty",
NULL
};
struct bkey_alloc_unpacked {
u64 journal_seq;
u64 bucket;
@@ -448,6 +458,217 @@ int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only)
return ret;
}
/* Free space/discard btree: */
static int bch2_bucket_do_index(struct btree_trans *trans,
struct bkey_s_c alloc_k,
struct bch_alloc_v4 a,
bool set)
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
struct btree_iter iter;
struct bkey_s_c old;
struct bkey_i *k;
enum bucket_state state = bucket_state(a);
enum btree_id btree;
enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
struct printbuf buf = PRINTBUF;
int ret;
if (state != BUCKET_free &&
state != BUCKET_need_discard)
return 0;
k = bch2_trans_kmalloc(trans, sizeof(*k));
if (IS_ERR(k))
return PTR_ERR(k);
bkey_init(&k->k);
k->k.type = new_type;
switch (state) {
case BUCKET_free:
btree = BTREE_ID_freespace;
k->k.p = alloc_freespace_pos(alloc_k.k->p, a);
bch2_key_resize(&k->k, 1);
break;
case BUCKET_need_discard:
btree = BTREE_ID_need_discard;
k->k.p = alloc_k.k->p;
break;
default:
return 0;
}
bch2_trans_iter_init(trans, &iter, btree,
bkey_start_pos(&k->k),
BTREE_ITER_INTENT);
old = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(old);
if (ret)
goto err;
if (ca->mi.freespace_initialized &&
bch2_fs_inconsistent_on(old.k->type != old_type, c,
"incorrect key when %s %s btree (got %s should be %s)\n"
" for %s",
set ? "setting" : "clearing",
bch2_btree_ids[btree],
bch2_bkey_types[old.k->type],
bch2_bkey_types[old_type],
(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
ret = -EIO;
goto err;
}
ret = bch2_trans_update(trans, &iter, k, 0);
err:
bch2_trans_iter_exit(trans, &iter);
printbuf_exit(&buf);
return ret;
}
int bch2_trans_mark_alloc(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_i *new,
unsigned flags)
{
struct bch_fs *c = trans->c;
struct bch_alloc_v4 old_a, *new_a;
u64 old_lru, new_lru;
int ret = 0;
/*
* Deletion only happens in the device removal path, with
* BTREE_TRIGGER_NORUN:
*/
BUG_ON(new->k.type != KEY_TYPE_alloc_v4);
bch2_alloc_to_v4(old, &old_a);
new_a = &bkey_i_to_alloc_v4(new)->v;
if (new_a->dirty_sectors > old_a.dirty_sectors ||
new_a->cached_sectors > old_a.cached_sectors) {
new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
new_a->io_time[WRITE] = max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
}
if (old_a.data_type && !new_a->data_type &&
old_a.gen == new_a->gen &&
!bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) {
new_a->gen++;
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
}
if (bucket_state(old_a) != bucket_state(*new_a) ||
(bucket_state(*new_a) == BUCKET_free &&
alloc_freespace_genbits(old_a) != alloc_freespace_genbits(*new_a))) {
ret = bch2_bucket_do_index(trans, old, old_a, false) ?:
bch2_bucket_do_index(trans, bkey_i_to_s_c(new), *new_a, true);
if (ret)
return ret;
}
old_lru = alloc_lru_idx(old_a);
new_lru = alloc_lru_idx(*new_a);
if (old_lru != new_lru) {
ret = bch2_lru_change(trans, new->k.p.inode, new->k.p.offset,
old_lru, &new_lru);
if (ret)
return ret;
if (new_lru && new_a->io_time[READ] != new_lru)
new_a->io_time[READ] = new_lru;
}
return 0;
}
static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca)
{
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
struct bch_alloc_v4 a;
struct bch_member *m;
int ret;
bch2_trans_init(&trans, c, 0, 0);
for_each_btree_key(&trans, iter, BTREE_ID_alloc,
POS(ca->dev_idx, ca->mi.first_bucket),
BTREE_ITER_SLOTS|
BTREE_ITER_PREFETCH, k, ret) {
if (iter.pos.offset >= ca->mi.nbuckets)
break;
bch2_alloc_to_v4(k, &a);
ret = __bch2_trans_do(&trans, NULL, NULL,
BTREE_INSERT_LAZY_RW,
bch2_bucket_do_index(&trans, k, a, true));
if (ret)
break;
}
bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
if (ret) {
bch_err(ca, "error initializing free space: %i", ret);
return ret;
}
mutex_lock(&c->sb_lock);
m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx;
SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
mutex_unlock(&c->sb_lock);
return ret;
}
int bch2_fs_freespace_init(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
int ret = 0;
bool doing_init = false;
/*
* We can crash during the device add path, so we need to check this on
* every mount:
*/
for_each_member_device(ca, c, i) {
if (ca->mi.freespace_initialized)
continue;
if (!doing_init) {
bch_info(c, "initializing freespace");
doing_init = true;
}
ret = bch2_dev_freespace_init(c, ca);
if (ret) {
percpu_ref_put(&ca->ref);
return ret;
}
}
if (doing_init) {
mutex_lock(&c->sb_lock);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
bch_verbose(c, "done initializing freespace");
}
return ret;
}
/* Bucket IO clocks: */
int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
@@ -485,6 +706,16 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
* commands to the newly free buckets, then puts them on the various freelists.
*/
/*
* bucket_gc_gen() returns the difference between the bucket's current gen and
* the oldest gen of any pointer into that bucket in the btree.
*/
static inline u8 bucket_gc_gen(struct bucket *g)
{
return g->mark.gen - g->oldest_gen;
}
static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
struct bucket_mark m)
{
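
Worth noting from this file: bch2_trans_mark_alloc() bumps the bucket's gen when a bucket empties and isn't open, and bucket_gc_gen() above measures how far pointer gens may lag behind. A toy model (not bcachefs code) of why bumping the gen on reuse invalidates stale pointers without touching the pointers themselves:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ptr    { uint8_t gen; };	/* gen stamped into an extent pointer */
struct bucket { uint8_t gen; };	/* current gen of the bucket */

static bool ptr_stale(struct bucket b, struct ptr p)
{
	return (uint8_t) (b.gen - p.gen) != 0;	/* u8 wraparound, as in the kernel */
}

int main(void)
{
	struct bucket b = { .gen = 5 };
	struct ptr    p = { .gen = 5 };	/* written while the bucket held our data */

	printf("before reuse: stale=%d\n", ptr_stale(b, p));	/* 0 */
	b.gen++;	/* bucket emptied, gen bumped, bucket reused */
	printf("after reuse:  stale=%d\n", ptr_stale(b, p));	/* 1 */
	return 0;
}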


@@ -13,6 +13,51 @@ extern const char * const bch2_allocator_states[];
/* How out of date a pointer gen is allowed to be: */
#define BUCKET_GC_GEN_MAX 96U
static inline u8 alloc_gc_gen(struct bch_alloc_v4 a)
{
return a.gen - a.oldest_gen;
}
enum bucket_state {
BUCKET_free,
BUCKET_need_gc_gens,
BUCKET_need_discard,
BUCKET_cached,
BUCKET_dirty,
};
extern const char * const bch2_bucket_states[];
static inline enum bucket_state bucket_state(struct bch_alloc_v4 a)
{
if (a.dirty_sectors || a.stripe)
return BUCKET_dirty;
if (a.cached_sectors)
return BUCKET_cached;
BUG_ON(a.data_type);
if (BCH_ALLOC_V4_NEED_DISCARD(&a))
return BUCKET_need_discard;
if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX)
return BUCKET_need_gc_gens;
return BUCKET_free;
}
static inline u64 alloc_lru_idx(struct bch_alloc_v4 a)
{
return bucket_state(a) == BUCKET_cached ? a.io_time[READ] : 0;
}
static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a)
{
return ((u64) alloc_gc_gen(a) >> 4) << 56;
}
static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a)
{
pos.offset |= alloc_freespace_genbits(a);
return pos;
}
struct bkey_i_alloc_v4 *
bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos);
@@ -33,18 +78,21 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_alloc (struct bkey_ops) { \
.key_invalid = bch2_alloc_v1_invalid, \
.val_to_text = bch2_alloc_to_text, \
.trans_trigger = bch2_trans_mark_alloc, \
.atomic_trigger = bch2_mark_alloc, \
}
#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \
.key_invalid = bch2_alloc_v2_invalid, \
.val_to_text = bch2_alloc_to_text, \
.trans_trigger = bch2_trans_mark_alloc, \
.atomic_trigger = bch2_mark_alloc, \
}
#define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \
.key_invalid = bch2_alloc_v3_invalid, \
.val_to_text = bch2_alloc_to_text, \
.trans_trigger = bch2_trans_mark_alloc, \
.atomic_trigger = bch2_mark_alloc, \
}
@@ -52,6 +100,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
.key_invalid = bch2_alloc_v4_invalid, \
.val_to_text = bch2_alloc_to_text, \
.swab = bch2_alloc_v4_swab, \
.trans_trigger = bch2_trans_mark_alloc, \
.atomic_trigger = bch2_mark_alloc, \
}
@@ -64,6 +113,10 @@ static inline bool bkey_is_alloc(const struct bkey *k)
int bch2_alloc_read(struct bch_fs *, bool, bool);
int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c,
struct bkey_i *, unsigned);
int bch2_fs_freespace_init(struct bch_fs *);
static inline void bch2_wake_allocator(struct bch_dev *ca)
{
struct task_struct *p;
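
bucket_state() above is a strict priority ladder: live data beats cached data, which beats a pending discard, which beats gen wraparound. A standalone model of the same ladder (simplified stand-in struct, BUCKET_GC_GEN_MAX inlined as 96; not bcachefs code):

#include <stdint.h>
#include <stdio.h>

enum bucket_state { BUCKET_free, BUCKET_need_gc_gens, BUCKET_need_discard,
		    BUCKET_cached, BUCKET_dirty };

struct alloc {	/* simplified stand-in for struct bch_alloc_v4 */
	uint32_t dirty_sectors, cached_sectors, stripe;
	uint8_t  gen, oldest_gen;
	int      need_discard;
};

static enum bucket_state bucket_state(struct alloc a)
{
	if (a.dirty_sectors || a.stripe)
		return BUCKET_dirty;		/* holds live or stripe data */
	if (a.cached_sectors)
		return BUCKET_cached;		/* only evictable cached data left */
	if (a.need_discard)
		return BUCKET_need_discard;	/* empty, discard not yet issued */
	if ((uint8_t) (a.gen - a.oldest_gen) >= 96)
		return BUCKET_need_gc_gens;	/* gen too far ahead; gc must catch up */
	return BUCKET_free;
}

int main(void)
{
	struct alloc a = { .cached_sectors = 8 };

	printf("state=%d (BUCKET_cached is 3)\n", bucket_state(a));
	return 0;
}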


@@ -117,6 +117,20 @@ static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket)
return false;
}
static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket)
{
bool ret;
if (bch2_bucket_is_open(c, dev, bucket))
return true;
spin_lock(&c->freelist_lock);
ret = bch2_bucket_is_open(c, dev, bucket);
spin_unlock(&c->freelist_lock);
return ret;
}
int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
struct dev_stripe_state *, struct bch_devs_mask *,
unsigned, unsigned *, bool *, enum alloc_reserve,
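
bch2_bucket_is_open_safe() is the familiar check-then-lock-and-recheck shape: a "true" from the unlocked check is already the conservative answer, while a "false" could race with a concurrent allocation and must be confirmed under freelist_lock. A generic sketch of the same pattern, with a pthread mutex standing in for the spinlock (not bcachefs code):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool open_flag;	/* stand-in for "bucket is currently open" */

static bool is_open(void) { return open_flag; }

static bool is_open_safe(void)
{
	bool ret;

	if (is_open())	/* unlocked fast path: "true" is safe to act on */
		return true;

	pthread_mutex_lock(&lock);	/* "false" may race: recheck locked */
	ret = is_open();
	pthread_mutex_unlock(&lock);
	return ret;
}

int main(void)
{
	printf("open=%d\n", is_open_safe());
	return 0;
}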


@@ -392,6 +392,8 @@ enum gc_phase {
GC_PHASE_BTREE_subvolumes,
GC_PHASE_BTREE_snapshots,
GC_PHASE_BTREE_lru,
GC_PHASE_BTREE_freespace,
GC_PHASE_BTREE_need_discard,
GC_PHASE_PENDING_DELETE,
};


@@ -947,7 +947,6 @@ enum {
#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
BCH_ALLOC_FIELDS_V1()
#undef x
- BCH_ALLOC_FIELD_NR
};
/* Quotas: */
@@ -1146,6 +1145,8 @@ LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15)
LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20)
LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28)
LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30)
LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
struct bch_member, flags[0], 30, 31)
#if 0
LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
@@ -1361,7 +1362,9 @@ struct bch_sb_field_journal_seq_blacklist {
x(snapshot_2, 15) \
x(reflink_p_fix, 16) \
x(subvol_dirent, 17) \
- x(inode_v2, 18)
+ x(inode_v2, 18) \
+ x(freespace, 19) \
+ x(alloc_v4, 20)
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -1889,7 +1892,9 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
x(reflink, 7) \
x(subvolumes, 8) \
x(snapshots, 9) \
- x(lru, 10)
+ x(lru, 10) \
+ x(freespace, 11) \
+ x(need_discard, 12)
enum btree_id {
#define x(kwd, val) BTREE_ID_##kwd = val,
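
The LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, ..., 30, 31) line above generates get/set accessors for bit 30 of the member's flags word. Roughly what a one-bit accessor like that reduces to, ignoring the __le64/le64_to_cpu handling the real kernel macro includes (a sketch, not the actual expansion):

#include <stdint.h>
#include <stdio.h>

static uint64_t get_freespace_initialized(uint64_t flags)
{
	return (flags >> 30) & 1;	/* field occupies bits [30, 31) */
}

static uint64_t set_freespace_initialized(uint64_t flags, uint64_t v)
{
	return (flags & ~(1ULL << 30)) | ((v & 1) << 30);
}

int main(void)
{
	uint64_t flags = 0;

	flags = set_freespace_initialized(flags, 1);
	printf("freespace_initialized=%llu\n",
	       (unsigned long long) get_freespace_initialized(flags));
	return 0;
}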


@@ -170,6 +170,12 @@ static unsigned bch2_key_types_allowed[] = {
[BKEY_TYPE_lru] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_lru),
[BKEY_TYPE_freespace] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_set),
[BKEY_TYPE_need_discard] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_set),
[BKEY_TYPE_btree] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_btree_ptr)|
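
The two new btrees only ever hold KEY_TYPE_set markers (and whiteouts), and the table above is what key validation checks a key's type against. A minimal model of the lookup, with illustrative enum values rather than the real ones:

#include <stdbool.h>
#include <stdio.h>

enum key_type  { KEY_TYPE_deleted, KEY_TYPE_set };
enum bkey_type { BKEY_TYPE_freespace, BKEY_TYPE_need_discard, BKEY_TYPE_NR };

static const unsigned allowed[BKEY_TYPE_NR] = {
	[BKEY_TYPE_freespace]    = (1U << KEY_TYPE_deleted)|(1U << KEY_TYPE_set),
	[BKEY_TYPE_need_discard] = (1U << KEY_TYPE_deleted)|(1U << KEY_TYPE_set),
};

static bool key_type_ok(enum bkey_type btree, enum key_type type)
{
	return allowed[btree] & (1U << type);
}

int main(void)
{
	printf("set on freespace: %d\n",
	       key_type_ok(BKEY_TYPE_freespace, KEY_TYPE_set));	/* 1 */
	return 0;
}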


@@ -925,7 +925,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
"error decrypting btree node: %i", ret))
goto fsck_err;
- btree_err_on(btree_node_is_extents(b) &&
+ btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
!BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
BTREE_ERR_FATAL, c, NULL, b, NULL,
"btree node does not have NEW_EXTENT_OVERWRITE set");


@@ -595,24 +595,9 @@ static inline enum btree_node_type btree_node_type(struct btree *b)
return __btree_node_type(b->c.level, b->c.btree_id);
}
- static inline bool btree_node_type_is_extents(enum btree_node_type type)
- {
- switch (type) {
- case BKEY_TYPE_extents:
- case BKEY_TYPE_reflink:
- return true;
- default:
- return false;
- }
- }
- static inline bool btree_node_is_extents(struct btree *b)
- {
- return btree_node_type_is_extents(btree_node_type(b));
- }
#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
((1U << BKEY_TYPE_extents)| \
+ (1U << BKEY_TYPE_alloc)| \
(1U << BKEY_TYPE_inodes)| \
(1U << BKEY_TYPE_stripes)| \
(1U << BKEY_TYPE_reflink)| \
@@ -628,6 +613,16 @@ static inline bool btree_node_is_extents(struct btree *b)
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
#define BTREE_ID_IS_EXTENTS \
((1U << BTREE_ID_extents)| \
(1U << BTREE_ID_reflink)| \
(1U << BTREE_ID_freespace))
static inline bool btree_node_type_is_extents(enum btree_node_type type)
{
return (1U << type) & BTREE_ID_IS_EXTENTS;
}
#define BTREE_ID_HAS_SNAPSHOTS \
((1U << BTREE_ID_extents)| \
(1U << BTREE_ID_inodes)| \


@@ -560,6 +560,11 @@ int bch2_mark_alloc(struct btree_trans *trans,
}
}
if (bucket_state(new_a) == BUCKET_need_gc_gens) {
atomic_inc(&c->kick_gc);
wake_up_process(c->gc_thread);
}
percpu_down_read(&c->mark_lock);
if (!gc && new_a.gen != old_a.gen)
*bucket_gen(ca, new.k->p.offset) = new_a.gen;
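
The hunk above is the "kick" idiom: bump a counter, wake the gc thread, and let the thread itself recheck so no kick is lost. A userspace model of the same handshake, using a pthread condition variable in place of wake_up_process() (not kernel code):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
static int kick_gc;	/* like atomic_inc(&c->kick_gc) */

static void kick(void)
{
	pthread_mutex_lock(&lock);
	kick_gc++;
	pthread_cond_signal(&wake);	/* like wake_up_process(c->gc_thread) */
	pthread_mutex_unlock(&lock);
}

static void *gc_thread(void *arg)
{
	int seen = 0;

	(void) arg;
	pthread_mutex_lock(&lock);
	while (seen == kick_gc)		/* recheck the counter: a kick that
					   arrives early is never lost */
		pthread_cond_wait(&wake, &lock);
	seen = kick_gc;
	pthread_mutex_unlock(&lock);
	printf("gc pass after kick %d\n", seen);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, gc_thread, NULL);
	kick();
	pthread_join(t, NULL);
	return 0;
}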


@@ -81,16 +81,6 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
return gens->b + b;
}
- /*
- * bucket_gc_gen() returns the difference between the bucket's current gen and
- * the oldest gen of any pointer into that bucket in the btree.
- */
- static inline u8 bucket_gc_gen(struct bucket *g)
- {
- return g->mark.gen - g->oldest_gen;
- }
static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
const struct bch_extent_ptr *ptr)
{


@@ -15,17 +15,26 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
- unsigned ret = 0;
+ unsigned ret = 0, lru = 0;
bkey_extent_entry_for_each(ptrs, entry) {
switch (__extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
+ /* Might also be updating LRU btree */
+ if (entry->ptr.cached)
+ lru++;
+ fallthrough;
case BCH_EXTENT_ENTRY_stripe_ptr:
ret++;
}
}
- return ret;
+ /*
+ * Updating keys in the alloc btree may also update keys in the
+ * freespace or discard btrees:
+ */
+ return lru + ret * 2;
}
static int count_iters_for_insert(struct btree_trans *trans,
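
The change above widens the reservation count: every pointer may cascade into an alloc update plus a freespace/need_discard update (hence ret * 2), and every cached pointer may additionally touch the LRU btree (hence + lru). A worked example with two dirty pointers and one cached pointer (hypothetical numbers, not bcachefs code):

#include <stdio.h>

int main(void)
{
	unsigned dirty_ptrs = 2, cached_ptrs = 1, stripe_ptrs = 0;
	unsigned ret = dirty_ptrs + cached_ptrs + stripe_ptrs;	/* 3 */
	unsigned lru = cached_ptrs;				/* 1 */

	printf("reserved updates: %u\n", lru + ret * 2);	/* 7 */
	return 0;
}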


@@ -1028,8 +1028,8 @@ int bch2_fs_recovery(struct bch_fs *c)
bch_info(c, "filesystem version is prior to subvol_dirent - upgrading");
c->opts.version_upgrade = true;
c->opts.fsck = true;
- } else if (c->sb.version < bcachefs_metadata_version_inode_v2) {
- bch_info(c, "filesystem version is prior to inode_v2 - upgrading");
+ } else if (c->sb.version < bcachefs_metadata_version_alloc_v4) {
+ bch_info(c, "filesystem version is prior to alloc_v4 - upgrading");
c->opts.version_upgrade = true;
}
}
@@ -1197,6 +1197,11 @@ int bch2_fs_recovery(struct bch_fs *c)
if (c->opts.verbose || !c->sb.clean)
bch_info(c, "journal replay done");
err = "error initializing freespace";
ret = bch2_fs_freespace_init(c);
if (ret)
goto err;
if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
bch2_fs_lazy_rw(c);
@@ -1380,6 +1385,11 @@ int bch2_fs_initialize(struct bch_fs *c)
ca->new_fs_bucket_idx = 0;
}
err = "error initializing freespace";
ret = bch2_fs_freespace_init(c);
if (ret)
goto err;
err = "error creating root snapshot node";
ret = bch2_fs_initialize_subvolumes(c);
if (ret)


@@ -1083,6 +1083,11 @@ static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb,
pr_buf(out, "%llu", BCH_MEMBER_DISCARD(m));
pr_newline(out);
pr_buf(out, "Freespace initialized:");
pr_tab(out);
pr_buf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m));
pr_newline(out);
pr_indent_pop(out, 2);
}
}


@@ -103,6 +103,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
.durability = BCH_MEMBER_DURABILITY(mi)
? BCH_MEMBER_DURABILITY(mi) - 1
: 1,
.freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
.valid = bch2_member_exists(mi),
};
}


@@ -1471,30 +1471,20 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
{
- struct btree_trans trans;
- size_t i;
+ struct bpos start = POS(ca->dev_idx, 0);
+ struct bpos end = POS(ca->dev_idx, U64_MAX);
int ret;
- bch2_trans_init(&trans, c, 0, 0);
- for (i = 0; i < ca->mi.nbuckets; i++) {
- ret = lockrestart_do(&trans,
- bch2_btree_key_cache_flush(&trans,
- BTREE_ID_alloc, POS(ca->dev_idx, i)));
- if (ret)
- break;
- }
- bch2_trans_exit(&trans);
- if (ret) {
+ ret = bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
+ BTREE_TRIGGER_NORUN, NULL);
+ if (ret)
bch_err(c, "error %i removing dev alloc info", ret);
- return ret;
- }
- return bch2_btree_delete_range(c, BTREE_ID_alloc,
- POS(ca->dev_idx, 0),
- POS(ca->dev_idx + 1, 0),
- 0, NULL);
+ return ret;
}
int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
@@ -1712,6 +1702,12 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err_late;
}
ret = bch2_fs_freespace_init(c);
if (ret) {
bch_err(c, "device add error: error initializing free space: %i", ret);
goto err_late;
}
ca->new_fs_bucket_idx = 0;
if (ca->mi.state == BCH_MEMBER_STATE_rw) {
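
bch2_dev_remove_alloc() above chains its three deletions with the GNU "?:" extension: "a ?: b" yields a unless a is zero, so the chain runs each delete until one returns a nonzero error and skips the rest. A portable equivalent, with hypothetical step functions:

#include <stdio.h>

static int step1(void) { return 0; }	/* succeeds */
static int step2(void) { return -5; }	/* fails */
static int step3(void) { return 0; }	/* never called below */

int main(void)
{
	int ret = step1();

	if (!ret)
		ret = step2();
	if (!ret)
		ret = step3();	/* skipped: step2() already failed */

	printf("ret = %d\n", ret);	/* -5 */
	return 0;
}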


@@ -33,6 +33,7 @@ struct bch_member_cpu {
u8 discard;
u8 data_allowed;
u8 durability;
u8 freespace_initialized;
u8 valid;
};