Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs into for-linus

This commit is contained in:
Li Zefan 2012-01-11 09:54:49 +08:00
commit d25223a0d2
34 changed files with 6361 additions and 2033 deletions

View file

@ -63,8 +63,8 @@ IRC network.
Userspace tools for creating and manipulating Btrfs file systems are
available from the git repository at the following location:
http://git.kernel.org/?p=linux/kernel/git/mason/btrfs-progs-unstable.git
git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-progs-unstable.git
http://git.kernel.org/?p=linux/kernel/git/mason/btrfs-progs.git
git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-progs.git
These include the following tools:

View file

@ -7,6 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
reada.o backref.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o

View file

@ -59,22 +59,19 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
if (!value)
return ERR_PTR(-ENOMEM);
size = __btrfs_getxattr(inode, name, value, size);
if (size > 0) {
acl = posix_acl_from_xattr(value, size);
if (IS_ERR(acl)) {
kfree(value);
return acl;
}
set_cached_acl(inode, type, acl);
}
kfree(value);
}
if (size > 0) {
acl = posix_acl_from_xattr(value, size);
} else if (size == -ENOENT || size == -ENODATA || size == 0) {
/* FIXME, who returns -ENOENT? I think nobody */
acl = NULL;
set_cached_acl(inode, type, acl);
} else {
acl = ERR_PTR(-EIO);
}
kfree(value);
if (!IS_ERR(acl))
set_cached_acl(inode, type, acl);
return acl;
}

View file

@ -64,6 +64,8 @@ struct btrfs_worker_thread {
int idle;
};
static int __btrfs_start_workers(struct btrfs_workers *workers);
/*
* btrfs_start_workers uses kthread_run, which can block waiting for memory
* for a very long time. It will actually throttle on page writeback,
@ -88,27 +90,10 @@ static void start_new_worker_func(struct btrfs_work *work)
{
struct worker_start *start;
start = container_of(work, struct worker_start, work);
btrfs_start_workers(start->queue, 1);
__btrfs_start_workers(start->queue);
kfree(start);
}
static int start_new_worker(struct btrfs_workers *queue)
{
struct worker_start *start;
int ret;
start = kzalloc(sizeof(*start), GFP_NOFS);
if (!start)
return -ENOMEM;
start->work.func = start_new_worker_func;
start->queue = queue;
ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
if (ret)
kfree(start);
return ret;
}
/*
* helper function to move a thread onto the idle list after it
* has finished some requests.
@ -153,12 +138,20 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
{
struct btrfs_workers *workers = worker->workers;
struct worker_start *start;
unsigned long flags;
rmb();
if (!workers->atomic_start_pending)
return;
start = kzalloc(sizeof(*start), GFP_NOFS);
if (!start)
return;
start->work.func = start_new_worker_func;
start->queue = workers;
spin_lock_irqsave(&workers->lock, flags);
if (!workers->atomic_start_pending)
goto out;
@ -170,10 +163,11 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
workers->num_workers_starting += 1;
spin_unlock_irqrestore(&workers->lock, flags);
start_new_worker(workers);
btrfs_queue_worker(workers->atomic_worker_start, &start->work);
return;
out:
kfree(start);
spin_unlock_irqrestore(&workers->lock, flags);
}
@ -331,7 +325,7 @@ static int worker_loop(void *arg)
run_ordered_completions(worker->workers, work);
check_pending_worker_creates(worker);
cond_resched();
}
spin_lock_irq(&worker->lock);
@ -462,56 +456,55 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
* starts new worker threads. This does not enforce the max worker
* count in case you need to temporarily go past it.
*/
static int __btrfs_start_workers(struct btrfs_workers *workers,
int num_workers)
static int __btrfs_start_workers(struct btrfs_workers *workers)
{
struct btrfs_worker_thread *worker;
int ret = 0;
int i;
for (i = 0; i < num_workers; i++) {
worker = kzalloc(sizeof(*worker), GFP_NOFS);
if (!worker) {
ret = -ENOMEM;
goto fail;
}
INIT_LIST_HEAD(&worker->pending);
INIT_LIST_HEAD(&worker->prio_pending);
INIT_LIST_HEAD(&worker->worker_list);
spin_lock_init(&worker->lock);
atomic_set(&worker->num_pending, 0);
atomic_set(&worker->refs, 1);
worker->workers = workers;
worker->task = kthread_run(worker_loop, worker,
"btrfs-%s-%d", workers->name,
workers->num_workers + i);
if (IS_ERR(worker->task)) {
ret = PTR_ERR(worker->task);
kfree(worker);
goto fail;
}
spin_lock_irq(&workers->lock);
list_add_tail(&worker->worker_list, &workers->idle_list);
worker->idle = 1;
workers->num_workers++;
workers->num_workers_starting--;
WARN_ON(workers->num_workers_starting < 0);
spin_unlock_irq(&workers->lock);
worker = kzalloc(sizeof(*worker), GFP_NOFS);
if (!worker) {
ret = -ENOMEM;
goto fail;
}
INIT_LIST_HEAD(&worker->pending);
INIT_LIST_HEAD(&worker->prio_pending);
INIT_LIST_HEAD(&worker->worker_list);
spin_lock_init(&worker->lock);
atomic_set(&worker->num_pending, 0);
atomic_set(&worker->refs, 1);
worker->workers = workers;
worker->task = kthread_run(worker_loop, worker,
"btrfs-%s-%d", workers->name,
workers->num_workers + 1);
if (IS_ERR(worker->task)) {
ret = PTR_ERR(worker->task);
kfree(worker);
goto fail;
}
spin_lock_irq(&workers->lock);
list_add_tail(&worker->worker_list, &workers->idle_list);
worker->idle = 1;
workers->num_workers++;
workers->num_workers_starting--;
WARN_ON(workers->num_workers_starting < 0);
spin_unlock_irq(&workers->lock);
return 0;
fail:
btrfs_stop_workers(workers);
spin_lock_irq(&workers->lock);
workers->num_workers_starting--;
spin_unlock_irq(&workers->lock);
return ret;
}
int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
int btrfs_start_workers(struct btrfs_workers *workers)
{
spin_lock_irq(&workers->lock);
workers->num_workers_starting += num_workers;
workers->num_workers_starting++;
spin_unlock_irq(&workers->lock);
return __btrfs_start_workers(workers, num_workers);
return __btrfs_start_workers(workers);
}
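
With this change btrfs_start_workers() starts exactly one thread and reports allocation or kthread_run() failures instead of hiding them, so callers are now expected to check the return value. A minimal caller sketch, assuming a btrfs_fs_info is already at hand (the helper name is made up for illustration):

/* illustrative only: start two worker queues, undo the first on failure */
static int example_start_two_queues(struct btrfs_fs_info *fs_info)
{
	int ret;

	ret = btrfs_start_workers(&fs_info->workers);
	if (ret)
		return ret;		/* -ENOMEM or kthread_run() error */

	ret = btrfs_start_workers(&fs_info->submit_workers);
	if (ret)
		btrfs_stop_workers(&fs_info->workers);

	return ret;
}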
/*
@ -568,9 +561,10 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
struct btrfs_worker_thread *worker;
unsigned long flags;
struct list_head *fallback;
int ret;
again:
spin_lock_irqsave(&workers->lock, flags);
again:
worker = next_worker(workers);
if (!worker) {
@ -584,7 +578,10 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
workers->num_workers_starting++;
spin_unlock_irqrestore(&workers->lock, flags);
/* we're below the limit, start another worker */
__btrfs_start_workers(workers, 1);
ret = __btrfs_start_workers(workers);
spin_lock_irqsave(&workers->lock, flags);
if (ret)
goto fallback;
goto again;
}
}
@ -665,7 +662,7 @@ void btrfs_set_work_high_prio(struct btrfs_work *work)
/*
* places a struct btrfs_work into the pending queue of one of the kthreads
*/
int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
{
struct btrfs_worker_thread *worker;
unsigned long flags;
@ -673,7 +670,7 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
/* don't requeue something already on a list */
if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
goto out;
return;
worker = find_worker(workers);
if (workers->ordered) {
@ -712,7 +709,4 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
if (wake)
wake_up_process(worker->task);
spin_unlock_irqrestore(&worker->lock, flags);
out:
return 0;
}

View file

@ -109,8 +109,8 @@ struct btrfs_workers {
char *name;
};
int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
int btrfs_start_workers(struct btrfs_workers *workers);
int btrfs_stop_workers(struct btrfs_workers *workers);
void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
struct btrfs_workers *async_starter);

fs/btrfs/backref.c (new file, 776 lines added)

View file

@ -0,0 +1,776 @@
/*
* Copyright (C) 2011 STRATO. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include "ctree.h"
#include "disk-io.h"
#include "backref.h"
struct __data_ref {
struct list_head list;
u64 inum;
u64 root;
u64 extent_data_item_offset;
};
struct __shared_ref {
struct list_head list;
u64 disk_byte;
};
static int __inode_info(u64 inum, u64 ioff, u8 key_type,
struct btrfs_root *fs_root, struct btrfs_path *path,
struct btrfs_key *found_key)
{
int ret;
struct btrfs_key key;
struct extent_buffer *eb;
key.type = key_type;
key.objectid = inum;
key.offset = ioff;
ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
if (ret < 0)
return ret;
eb = path->nodes[0];
if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
ret = btrfs_next_leaf(fs_root, path);
if (ret)
return ret;
eb = path->nodes[0];
}
btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
if (found_key->type != key.type || found_key->objectid != key.objectid)
return 1;
return 0;
}
/*
* this makes the path point to (inum INODE_ITEM ioff)
*/
int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
struct btrfs_path *path)
{
struct btrfs_key key;
return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path,
&key);
}
static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
struct btrfs_path *path,
struct btrfs_key *found_key)
{
return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path,
found_key);
}
/*
* this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
* of the path are separated by '/' and the path is guaranteed to be
* 0-terminated. the path is only given within the current file system.
* Therefore, it never starts with a '/'. the caller is responsible to provide
* "size" bytes in "dest". the dest buffer will be filled backwards. finally,
* the start point of the resulting string is returned. this pointer is within
* dest, normally.
* in case the path buffer would overflow, the pointer is decremented further
* as if output was written to the buffer, though no more output is actually
* generated. that way, the caller can determine how much space would be
* required for the path to fit into the buffer. in that case, the returned
* value will be smaller than dest. callers must check this!
*/
static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
struct btrfs_inode_ref *iref,
struct extent_buffer *eb_in, u64 parent,
char *dest, u32 size)
{
u32 len;
int slot;
u64 next_inum;
int ret;
s64 bytes_left = size - 1;
struct extent_buffer *eb = eb_in;
struct btrfs_key found_key;
if (bytes_left >= 0)
dest[bytes_left] = '\0';
while (1) {
len = btrfs_inode_ref_name_len(eb, iref);
bytes_left -= len;
if (bytes_left >= 0)
read_extent_buffer(eb, dest + bytes_left,
(unsigned long)(iref + 1), len);
if (eb != eb_in)
free_extent_buffer(eb);
ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
if (ret)
break;
next_inum = found_key.offset;
/* regular exit ahead */
if (parent == next_inum)
break;
slot = path->slots[0];
eb = path->nodes[0];
/* make sure we can use eb after releasing the path */
if (eb != eb_in)
atomic_inc(&eb->refs);
btrfs_release_path(path);
iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
parent = next_inum;
--bytes_left;
if (bytes_left >= 0)
dest[bytes_left] = '/';
}
btrfs_release_path(path);
if (ret)
return ERR_PTR(ret);
return dest + bytes_left;
}
/*
* this makes the path point to (logical EXTENT_ITEM *)
* returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
* tree blocks and <0 on error.
*/
int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
struct btrfs_path *path, struct btrfs_key *found_key)
{
int ret;
u64 flags;
u32 item_size;
struct extent_buffer *eb;
struct btrfs_extent_item *ei;
struct btrfs_key key;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.objectid = logical;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
ret = btrfs_previous_item(fs_info->extent_root, path,
0, BTRFS_EXTENT_ITEM_KEY);
if (ret < 0)
return ret;
btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
found_key->objectid > logical ||
found_key->objectid + found_key->offset <= logical)
return -ENOENT;
eb = path->nodes[0];
item_size = btrfs_item_size_nr(eb, path->slots[0]);
BUG_ON(item_size < sizeof(*ei));
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
flags = btrfs_extent_flags(eb, ei);
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
return BTRFS_EXTENT_FLAG_TREE_BLOCK;
if (flags & BTRFS_EXTENT_FLAG_DATA)
return BTRFS_EXTENT_FLAG_DATA;
return -EIO;
}
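
Note that extent_from_logical() multiplexes its return value: a positive BTRFS_EXTENT_FLAG_* on success and a negative errno on failure, so callers test the flag bits before the sign, exactly as iterate_inodes_from_logical() does later in this file. A hedged sketch of that calling pattern (the wrapper is illustrative, not part of this commit):

/* illustrative wrapper: accept only data extents at 'logical' */
static int example_lookup_data_extent(struct btrfs_fs_info *fs_info,
				      u64 logical, struct btrfs_path *path,
				      struct btrfs_key *found_key)
{
	int ret = extent_from_logical(fs_info, logical, path, found_key);

	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
		return -EINVAL;		/* a tree block, not file data */
	if (ret < 0)
		return ret;		/* search error */

	/* ret == BTRFS_EXTENT_FLAG_DATA; path points at the extent item */
	return 0;
}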
/*
* helper function to iterate extent inline refs. ptr must point to a 0 value
* for the first call and may be modified. it is used to track state.
* if more refs exist, 0 is returned and the next call to
* __get_extent_inline_ref must pass the modified ptr parameter to get the
* next ref. after the last ref was processed, 1 is returned.
* returns <0 on error
*/
static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
struct btrfs_extent_item *ei, u32 item_size,
struct btrfs_extent_inline_ref **out_eiref,
int *out_type)
{
unsigned long end;
u64 flags;
struct btrfs_tree_block_info *info;
if (!*ptr) {
/* first call */
flags = btrfs_extent_flags(eb, ei);
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
info = (struct btrfs_tree_block_info *)(ei + 1);
*out_eiref =
(struct btrfs_extent_inline_ref *)(info + 1);
} else {
*out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
}
*ptr = (unsigned long)*out_eiref;
if ((void *)*ptr >= (void *)ei + item_size)
return -ENOENT;
}
end = (unsigned long)ei + item_size;
*out_eiref = (struct btrfs_extent_inline_ref *)*ptr;
*out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
*ptr += btrfs_extent_inline_ref_size(*out_type);
WARN_ON(*ptr > end);
if (*ptr == end)
return 1; /* last */
return 0;
}
/*
* reads the tree block backref for an extent. tree level and root are returned
* through out_level and out_root. ptr must point to a 0 value for the first
* call and may be modified (see __get_extent_inline_ref comment).
* returns 0 if data was provided, 1 if there was no more data to provide or
* <0 on error.
*/
int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
struct btrfs_extent_item *ei, u32 item_size,
u64 *out_root, u8 *out_level)
{
int ret;
int type;
struct btrfs_tree_block_info *info;
struct btrfs_extent_inline_ref *eiref;
if (*ptr == (unsigned long)-1)
return 1;
while (1) {
ret = __get_extent_inline_ref(ptr, eb, ei, item_size,
&eiref, &type);
if (ret < 0)
return ret;
if (type == BTRFS_TREE_BLOCK_REF_KEY ||
type == BTRFS_SHARED_BLOCK_REF_KEY)
break;
if (ret == 1)
return 1;
}
/* we can treat both ref types equally here */
info = (struct btrfs_tree_block_info *)(ei + 1);
*out_root = btrfs_extent_inline_ref_offset(eb, eiref);
*out_level = btrfs_tree_block_level(eb, info);
if (ret == 1)
*ptr = (unsigned long)-1;
return 0;
}
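
tree_backref_for_extent() is designed to be called in a loop with the same ptr cursor until it reports that no more inline backrefs exist. A minimal sketch of such a loop, assuming eb/ei/item_size were obtained via extent_from_logical() as above (the printk is purely illustrative):

/* illustrative: list every (root, level) that references a tree block */
static void example_dump_tree_block_refs(struct extent_buffer *eb,
					 struct btrfs_extent_item *ei,
					 u32 item_size)
{
	unsigned long ptr = 0;	/* must start at 0, see comment above */
	u64 ref_root;
	u8 ref_level;
	int ret;

	while (1) {
		ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
					      &ref_root, &ref_level);
		if (ret < 0)
			return;		/* lookup error */
		if (ret > 0)
			break;		/* no more backrefs */
		printk(KERN_INFO "ref from root %llu at level %u\n",
		       (unsigned long long)ref_root, ref_level);
	}
}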
static int __data_list_add(struct list_head *head, u64 inum,
u64 extent_data_item_offset, u64 root)
{
struct __data_ref *ref;
ref = kmalloc(sizeof(*ref), GFP_NOFS);
if (!ref)
return -ENOMEM;
ref->inum = inum;
ref->extent_data_item_offset = extent_data_item_offset;
ref->root = root;
list_add_tail(&ref->list, head);
return 0;
}
static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
struct btrfs_extent_data_ref *dref)
{
return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
btrfs_extent_data_ref_offset(eb, dref),
btrfs_extent_data_ref_root(eb, dref));
}
static int __shared_list_add(struct list_head *head, u64 disk_byte)
{
struct __shared_ref *ref;
ref = kmalloc(sizeof(*ref), GFP_NOFS);
if (!ref)
return -ENOMEM;
ref->disk_byte = disk_byte;
list_add_tail(&ref->list, head);
return 0;
}
static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
u64 logical, u64 inum,
u64 extent_data_item_offset,
u64 extent_offset,
struct btrfs_path *path,
struct list_head *data_refs,
iterate_extent_inodes_t *iterate,
void *ctx)
{
u64 ref_root;
u32 item_size;
struct btrfs_key key;
struct extent_buffer *eb;
struct btrfs_extent_item *ei;
struct btrfs_extent_inline_ref *eiref;
struct __data_ref *ref;
int ret;
int type;
int last;
unsigned long ptr = 0;
WARN_ON(!list_empty(data_refs));
ret = extent_from_logical(fs_info, logical, path, &key);
if (ret & BTRFS_EXTENT_FLAG_DATA)
ret = -EIO;
if (ret < 0)
goto out;
eb = path->nodes[0];
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
item_size = btrfs_item_size_nr(eb, path->slots[0]);
ret = 0;
ref_root = 0;
/*
* as done in iterate_extent_inodes, we first build a list of refs to
* iterate, then free the path and then iterate them to avoid deadlocks.
*/
do {
last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
&eiref, &type);
if (last < 0) {
ret = last;
goto out;
}
if (type == BTRFS_TREE_BLOCK_REF_KEY ||
type == BTRFS_SHARED_BLOCK_REF_KEY) {
ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
ret = __data_list_add(data_refs, inum,
extent_data_item_offset,
ref_root);
}
} while (!ret && !last);
btrfs_release_path(path);
if (ref_root == 0) {
printk(KERN_ERR "btrfs: failed to find tree block ref "
"for shared data backref %llu\n", logical);
WARN_ON(1);
ret = -EIO;
}
out:
while (!list_empty(data_refs)) {
ref = list_first_entry(data_refs, struct __data_ref, list);
list_del(&ref->list);
if (!ret)
ret = iterate(ref->inum, extent_offset +
ref->extent_data_item_offset,
ref->root, ctx);
kfree(ref);
}
return ret;
}
static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
u64 logical, u64 orig_extent_item_objectid,
u64 extent_offset, struct btrfs_path *path,
struct list_head *data_refs,
iterate_extent_inodes_t *iterate,
void *ctx)
{
u64 disk_byte;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
struct extent_buffer *eb;
int slot;
int nritems;
int ret;
int found = 0;
eb = read_tree_block(fs_info->tree_root, logical,
fs_info->tree_root->leafsize, 0);
if (!eb)
return -EIO;
/*
* from the shared data ref, we only have the leaf but we need
* the key. thus, we must look into all items and see that we
* find one (some) with a reference to our extent item.
*/
nritems = btrfs_header_nritems(eb);
for (slot = 0; slot < nritems; ++slot) {
btrfs_item_key_to_cpu(eb, &key, slot);
if (key.type != BTRFS_EXTENT_DATA_KEY)
continue;
fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
if (!fi) {
free_extent_buffer(eb);
return -EIO;
}
disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
if (disk_byte != orig_extent_item_objectid) {
if (found)
break;
else
continue;
}
++found;
ret = __iter_shared_inline_ref_inodes(fs_info, logical,
key.objectid,
key.offset,
extent_offset, path,
data_refs,
iterate, ctx);
if (ret)
break;
}
if (!found) {
printk(KERN_ERR "btrfs: failed to follow shared data backref "
"to parent %llu\n", logical);
WARN_ON(1);
ret = -EIO;
}
free_extent_buffer(eb);
return ret;
}
/*
* calls iterate() for every inode that references the extent identified by
* the given parameters. will use the path given as a parameter and return it
* released.
* when the iterator function returns a non-zero value, iteration stops.
*/
int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
u64 extent_item_objectid,
u64 extent_offset,
iterate_extent_inodes_t *iterate, void *ctx)
{
unsigned long ptr = 0;
int last;
int ret;
int type;
u64 logical;
u32 item_size;
struct btrfs_extent_inline_ref *eiref;
struct btrfs_extent_data_ref *dref;
struct extent_buffer *eb;
struct btrfs_extent_item *ei;
struct btrfs_key key;
struct list_head data_refs = LIST_HEAD_INIT(data_refs);
struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
struct __data_ref *ref_d;
struct __shared_ref *ref_s;
eb = path->nodes[0];
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
item_size = btrfs_item_size_nr(eb, path->slots[0]);
/* first we iterate the inline refs, ... */
do {
last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
&eiref, &type);
if (last == -ENOENT) {
ret = 0;
break;
}
if (last < 0) {
ret = last;
break;
}
if (type == BTRFS_EXTENT_DATA_REF_KEY) {
dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
ret = __data_list_add_eb(&data_refs, eb, dref);
} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
logical = btrfs_extent_inline_ref_offset(eb, eiref);
ret = __shared_list_add(&shared_refs, logical);
}
} while (!ret && !last);
/* ... then we proceed to in-tree references and ... */
while (!ret) {
++path->slots[0];
if (path->slots[0] > btrfs_header_nritems(eb)) {
ret = btrfs_next_leaf(fs_info->extent_root, path);
if (ret) {
if (ret == 1)
ret = 0; /* we're done */
break;
}
eb = path->nodes[0];
}
btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
if (key.objectid != extent_item_objectid)
break;
if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
dref = btrfs_item_ptr(eb, path->slots[0],
struct btrfs_extent_data_ref);
ret = __data_list_add_eb(&data_refs, eb, dref);
} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
ret = __shared_list_add(&shared_refs, key.offset);
}
}
btrfs_release_path(path);
/*
* ... only at the very end we can process the refs we found. this is
* because the iterator function we call is allowed to make tree lookups
* and we have to avoid deadlocks. additionally, we need more tree
* lookups ourselves for shared data refs.
*/
while (!list_empty(&data_refs)) {
ref_d = list_first_entry(&data_refs, struct __data_ref, list);
list_del(&ref_d->list);
if (!ret)
ret = iterate(ref_d->inum, extent_offset +
ref_d->extent_data_item_offset,
ref_d->root, ctx);
kfree(ref_d);
}
while (!list_empty(&shared_refs)) {
ref_s = list_first_entry(&shared_refs, struct __shared_ref,
list);
list_del(&ref_s->list);
if (!ret)
ret = __iter_shared_inline_ref(fs_info,
ref_s->disk_byte,
extent_item_objectid,
extent_offset, path,
&data_refs,
iterate, ctx);
kfree(ref_s);
}
return ret;
}
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
iterate_extent_inodes_t *iterate, void *ctx)
{
int ret;
u64 offset;
struct btrfs_key found_key;
ret = extent_from_logical(fs_info, logical, path,
&found_key);
if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
ret = -EINVAL;
if (ret < 0)
return ret;
offset = logical - found_key.objectid;
ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
offset, iterate, ctx);
return ret;
}
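
A callback for these iterators receives one (inode, file offset, root) triple per data reference and can stop the walk by returning non-zero. A hedged usage sketch, assuming it lives where backref.h is visible (the context struct and the limit of 16 are inventions for the example):

struct example_ref_ctx {
	u64 seen;
	u64 limit;
};

/* illustrative iterate_extent_inodes_t callback: count and print refs */
static int example_note_ref(u64 inum, u64 offset, u64 root, void *ctx)
{
	struct example_ref_ctx *c = ctx;

	printk(KERN_INFO "inode %llu, file offset %llu, root %llu\n",
	       (unsigned long long)inum, (unsigned long long)offset,
	       (unsigned long long)root);
	return ++c->seen >= c->limit;	/* non-zero return stops iteration */
}

static int example_walk_refs(struct btrfs_fs_info *fs_info, u64 logical,
			     struct btrfs_path *path)
{
	struct example_ref_ctx c = { .seen = 0, .limit = 16 };

	return iterate_inodes_from_logical(logical, fs_info, path,
					   example_note_ref, &c);
}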
static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
struct btrfs_path *path,
iterate_irefs_t *iterate, void *ctx)
{
int ret;
int slot;
u32 cur;
u32 len;
u32 name_len;
u64 parent = 0;
int found = 0;
struct extent_buffer *eb;
struct btrfs_item *item;
struct btrfs_inode_ref *iref;
struct btrfs_key found_key;
while (1) {
ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
&found_key);
if (ret < 0)
break;
if (ret) {
ret = found ? 0 : -ENOENT;
break;
}
++found;
parent = found_key.offset;
slot = path->slots[0];
eb = path->nodes[0];
/* make sure we can use eb after releasing the path */
atomic_inc(&eb->refs);
btrfs_release_path(path);
item = btrfs_item_nr(eb, slot);
iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
name_len = btrfs_inode_ref_name_len(eb, iref);
/* path must be released before calling iterate()! */
ret = iterate(parent, iref, eb, ctx);
if (ret) {
free_extent_buffer(eb);
break;
}
len = sizeof(*iref) + name_len;
iref = (struct btrfs_inode_ref *)((char *)iref + len);
}
free_extent_buffer(eb);
}
btrfs_release_path(path);
return ret;
}
/*
* returns 0 if the path could be dumped (probably truncated)
* returns <0 in case of an error
*/
static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
struct extent_buffer *eb, void *ctx)
{
struct inode_fs_paths *ipath = ctx;
char *fspath;
char *fspath_min;
int i = ipath->fspath->elem_cnt;
const int s_ptr = sizeof(char *);
u32 bytes_left;
bytes_left = ipath->fspath->bytes_left > s_ptr ?
ipath->fspath->bytes_left - s_ptr : 0;
fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
inum, fspath_min, bytes_left);
if (IS_ERR(fspath))
return PTR_ERR(fspath);
if (fspath > fspath_min) {
ipath->fspath->val[i] = (u64)(unsigned long)fspath;
++ipath->fspath->elem_cnt;
ipath->fspath->bytes_left = fspath - fspath_min;
} else {
++ipath->fspath->elem_missed;
ipath->fspath->bytes_missing += fspath_min - fspath;
ipath->fspath->bytes_left = 0;
}
return 0;
}
/*
* this dumps all file system paths to the inode into the ipath struct, provided
* it has been created large enough. each path is zero-terminated and accessed
* from ipath->fspath->val[i].
* when it returns, there are ipath->fspath->elem_cnt number of paths available
* in ipath->fspath->val[]. when the allocated space wasn't sufficient, the
* number of missed paths is recorded in ipath->fspath->elem_missed, otherwise,
* it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
* have been needed to return all paths.
*/
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
{
return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
inode_to_path, ipath);
}
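
End to end, an ipath user allocates the container, runs paths_from_inode(), reads the strings out of fspath->val[] and consults elem_missed/bytes_missing to decide whether a larger buffer is needed. A minimal sketch of that flow, assuming a 4 KiB container is acceptable (size and error handling are illustrative):

/* illustrative: print every filesystem path that reaches inode 'inum' */
static int example_print_inode_paths(struct btrfs_root *fs_root, u64 inum)
{
	struct btrfs_path *path;
	struct inode_fs_paths *ipath;
	int ret;
	u32 i;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ipath = init_ipath(4096, fs_root, path);
	if (IS_ERR(ipath)) {
		btrfs_free_path(path);
		return PTR_ERR(ipath);
	}

	ret = paths_from_inode(inum, ipath);
	for (i = 0; !ret && i < ipath->fspath->elem_cnt; i++)
		printk(KERN_INFO "path: %s\n",
		       (char *)(unsigned long)ipath->fspath->val[i]);

	if (!ret && ipath->fspath->elem_missed)
		printk(KERN_INFO "%u path(s) did not fit in the buffer\n",
		       ipath->fspath->elem_missed);

	free_ipath(ipath);
	btrfs_free_path(path);
	return ret;
}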
/*
* allocates space to return multiple file system paths for an inode.
* total_bytes to allocate are passed, note that space usable for actual path
* information will be total_bytes - sizeof(struct inode_fs_paths).
* the returned pointer must be freed with free_ipath() in the end.
*/
struct btrfs_data_container *init_data_container(u32 total_bytes)
{
struct btrfs_data_container *data;
size_t alloc_bytes;
alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
data = kmalloc(alloc_bytes, GFP_NOFS);
if (!data)
return ERR_PTR(-ENOMEM);
if (total_bytes >= sizeof(*data)) {
data->bytes_left = total_bytes - sizeof(*data);
data->bytes_missing = 0;
} else {
data->bytes_missing = sizeof(*data) - total_bytes;
data->bytes_left = 0;
}
data->elem_cnt = 0;
data->elem_missed = 0;
return data;
}
/*
* allocates space to return multiple file system paths for an inode.
* total_bytes to allocate are passed, note that space usable for actual path
* information will be total_bytes - sizeof(struct inode_fs_paths).
* the returned pointer must be freed with free_ipath() in the end.
*/
struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
struct btrfs_path *path)
{
struct inode_fs_paths *ifp;
struct btrfs_data_container *fspath;
fspath = init_data_container(total_bytes);
if (IS_ERR(fspath))
return (void *)fspath;
ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
if (!ifp) {
kfree(fspath);
return ERR_PTR(-ENOMEM);
}
ifp->btrfs_path = path;
ifp->fspath = fspath;
ifp->fs_root = fs_root;
return ifp;
}
void free_ipath(struct inode_fs_paths *ipath)
{
kfree(ipath);
}

fs/btrfs/backref.h (new file, 62 lines added)
View file

@ -0,0 +1,62 @@
/*
* Copyright (C) 2011 STRATO. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef __BTRFS_BACKREF__
#define __BTRFS_BACKREF__
#include "ioctl.h"
struct inode_fs_paths {
struct btrfs_path *btrfs_path;
struct btrfs_root *fs_root;
struct btrfs_data_container *fspath;
};
typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
void *ctx);
typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
struct extent_buffer *eb, void *ctx);
int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
struct btrfs_path *path);
int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
struct btrfs_path *path, struct btrfs_key *found_key);
int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
struct btrfs_extent_item *ei, u32 item_size,
u64 *out_root, u8 *out_level);
int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
u64 extent_item_objectid,
u64 extent_offset,
iterate_extent_inodes_t *iterate, void *ctx);
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
iterate_extent_inodes_t *iterate, void *ctx);
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
struct btrfs_data_container *init_data_container(u32 total_bytes);
struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
struct btrfs_path *path);
void free_ipath(struct inode_fs_paths *ipath);
#endif

View file

@ -103,11 +103,6 @@ struct btrfs_inode {
*/
u64 delalloc_bytes;
/* total number of bytes that may be used for this inode for
* delalloc
*/
u64 reserved_bytes;
/*
* the size of the file stored in the metadata on disk. data=ordered
* means the in-memory i_size might be larger than the size on disk
@ -115,9 +110,6 @@ struct btrfs_inode {
*/
u64 disk_i_size;
/* flags field from the on disk inode */
u32 flags;
/*
* if this is a directory then index_cnt is the counter for the index
* number for new files that are created
@ -131,6 +123,15 @@ struct btrfs_inode {
*/
u64 last_unlink_trans;
/*
* Number of bytes outstanding that are going to need csums. This is
* used in ENOSPC accounting.
*/
u64 csum_bytes;
/* flags field from the on disk inode */
u32 flags;
/*
* Counters to keep track of the number of extent item's we may use due
* to delalloc and such. outstanding_extents is the number of extent
@ -146,14 +147,12 @@ struct btrfs_inode {
* the btrfs file release call will add this inode to the
* ordered operations list so that we make sure to flush out any
* new data the application may have written before commit.
*
* yes, its silly to have a single bitflag, but we might grow more
* of these.
*/
unsigned ordered_data_close:1;
unsigned orphan_meta_reserved:1;
unsigned dummy_inode:1;
unsigned in_defrag:1;
unsigned delalloc_meta_reserved:1;
/*
* always compress this one file

View file

@ -85,7 +85,8 @@ struct compressed_bio {
static inline int compressed_bio_size(struct btrfs_root *root,
unsigned long disk_size)
{
u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
return sizeof(struct compressed_bio) +
((disk_size + root->sectorsize - 1) / root->sectorsize) *
csum_size;

View file

@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf)
{
/* ensure we can see the force_cow */
smp_rmb();
/*
* We do not need to cow a block if
* 1) this block is not created or changed in this transaction;
* 2) this block does not belong to TREE_RELOC tree;
* 3) the root is not forced COW.
*
* What is forced COW:
* when we create snapshot during committing the transaction,
* after we've finished copying src root, we must COW the shared
* block to ensure the metadata consistency.
*/
if (btrfs_header_generation(buf) == trans->transid &&
!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
!(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
!root->force_cow)
return 0;
return 1;
}
@ -902,9 +917,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
orig_ptr = btrfs_node_blockptr(mid, orig_slot);
if (level < BTRFS_MAX_LEVEL - 1)
if (level < BTRFS_MAX_LEVEL - 1) {
parent = path->nodes[level + 1];
pslot = path->slots[level + 1];
pslot = path->slots[level + 1];
}
/*
* deal with the case where there is only one pointer in the root
@ -1107,9 +1123,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
mid = path->nodes[level];
WARN_ON(btrfs_header_generation(mid) != trans->transid);
if (level < BTRFS_MAX_LEVEL - 1)
if (level < BTRFS_MAX_LEVEL - 1) {
parent = path->nodes[level + 1];
pslot = path->slots[level + 1];
pslot = path->slots[level + 1];
}
if (!parent)
return 1;

View file

@ -30,6 +30,7 @@
#include <linux/kobject.h>
#include <trace/events/btrfs.h>
#include <asm/kmap_types.h>
#include <linux/pagemap.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
@ -359,6 +360,47 @@ struct btrfs_header {
#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
#define BTRFS_LABEL_SIZE 256
/*
* just in case we somehow lose the roots and are not able to mount,
* we store an array of the roots from previous transactions
* in the super.
*/
#define BTRFS_NUM_BACKUP_ROOTS 4
struct btrfs_root_backup {
__le64 tree_root;
__le64 tree_root_gen;
__le64 chunk_root;
__le64 chunk_root_gen;
__le64 extent_root;
__le64 extent_root_gen;
__le64 fs_root;
__le64 fs_root_gen;
__le64 dev_root;
__le64 dev_root_gen;
__le64 csum_root;
__le64 csum_root_gen;
__le64 total_bytes;
__le64 bytes_used;
__le64 num_devices;
/* future */
__le64 unsed_64[4];
u8 tree_root_level;
u8 chunk_root_level;
u8 extent_root_level;
u8 fs_root_level;
u8 dev_root_level;
u8 csum_root_level;
/* future and to align */
u8 unused_8[10];
} __attribute__ ((__packed__));
/*
* the super block basically lists the main trees of the FS
* it currently lacks any block count etc etc
@ -405,6 +447,7 @@ struct btrfs_super_block {
/* future expansion */
__le64 reserved[31];
u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
} __attribute__ ((__packed__));
/*
@ -772,14 +815,8 @@ struct btrfs_space_info {
struct btrfs_block_rsv {
u64 size;
u64 reserved;
u64 freed[2];
struct btrfs_space_info *space_info;
struct list_head list;
spinlock_t lock;
atomic_t usage;
unsigned int priority:8;
unsigned int durable:1;
unsigned int refill_used:1;
unsigned int full:1;
};
@ -811,7 +848,8 @@ struct btrfs_free_cluster {
enum btrfs_caching_type {
BTRFS_CACHE_NO = 0,
BTRFS_CACHE_STARTED = 1,
BTRFS_CACHE_FINISHED = 2,
BTRFS_CACHE_FAST = 2,
BTRFS_CACHE_FINISHED = 3,
};
enum btrfs_disk_cache_state {
@ -840,10 +878,10 @@ struct btrfs_block_group_cache {
spinlock_t lock;
u64 pinned;
u64 reserved;
u64 reserved_pinned;
u64 bytes_super;
u64 flags;
u64 sectorsize;
u64 cache_generation;
unsigned int ro:1;
unsigned int dirty:1;
unsigned int iref:1;
@ -899,6 +937,10 @@ struct btrfs_fs_info {
spinlock_t block_group_cache_lock;
struct rb_root block_group_cache_tree;
/* keep track of unallocated space */
spinlock_t free_chunk_lock;
u64 free_chunk_space;
struct extent_io_tree freed_extents[2];
struct extent_io_tree *pinned_extents;
@ -916,14 +958,11 @@ struct btrfs_fs_info {
struct btrfs_block_rsv trans_block_rsv;
/* block reservation for chunk tree */
struct btrfs_block_rsv chunk_block_rsv;
/* block reservation for delayed operations */
struct btrfs_block_rsv delayed_block_rsv;
struct btrfs_block_rsv empty_block_rsv;
/* list of block reservations that cross multiple transactions */
struct list_head durable_block_rsv_list;
struct mutex durable_block_rsv_mutex;
u64 generation;
u64 last_trans_committed;
@ -942,8 +981,8 @@ struct btrfs_fs_info {
wait_queue_head_t transaction_blocked_wait;
wait_queue_head_t async_submit_wait;
struct btrfs_super_block super_copy;
struct btrfs_super_block super_for_commit;
struct btrfs_super_block *super_copy;
struct btrfs_super_block *super_for_commit;
struct block_device *__bdev;
struct super_block *sb;
struct inode *btree_inode;
@ -1036,6 +1075,7 @@ struct btrfs_fs_info {
struct btrfs_workers endio_freespace_worker;
struct btrfs_workers submit_workers;
struct btrfs_workers caching_workers;
struct btrfs_workers readahead_workers;
/*
* fixup workers take dirty pages that didn't properly go through
@ -1119,6 +1159,13 @@ struct btrfs_fs_info {
u64 fs_state;
struct btrfs_delayed_root *delayed_root;
/* readahead tree */
spinlock_t reada_lock;
struct radix_tree_root reada_tree;
/* next backup root to be overwritten */
int backup_root_index;
};
/*
@ -1225,6 +1272,8 @@ struct btrfs_root {
* for stat. It may be used for more later
*/
dev_t anon_dev;
int force_cow;
};
struct btrfs_ioctl_defrag_range_args {
@ -1363,6 +1412,7 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
#define BTRFS_MOUNT_RECOVERY (1 << 18)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@ -1978,6 +2028,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root)
return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
}
/* struct btrfs_root_backup */
BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
tree_root, 64);
BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
tree_root_gen, 64);
BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
tree_root_level, 8);
BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
chunk_root, 64);
BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
chunk_root_gen, 64);
BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
chunk_root_level, 8);
BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
extent_root, 64);
BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
extent_root_gen, 64);
BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
extent_root_level, 8);
BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
fs_root, 64);
BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
fs_root_gen, 64);
BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
fs_root_level, 8);
BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
dev_root, 64);
BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
dev_root_gen, 64);
BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
dev_root_level, 8);
BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
csum_root, 64);
BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
csum_root_gen, 64);
BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
csum_root_level, 8);
BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
total_bytes, 64);
BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
bytes_used, 64);
BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
num_devices, 64);
/* struct btrfs_super_block */
BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@ -2129,6 +2228,11 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
}
static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
{
return mapping_gfp_mask(mapping) & ~__GFP_FS;
}
/* extent-tree.c */
static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
unsigned num_items)
@ -2137,6 +2241,17 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
3 * num_items;
}
/*
* Doing a truncate won't result in new nodes or leaves, just what we need for
* COW.
*/
static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
unsigned num_items)
{
return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
num_items;
}
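
For example, with the 4 KiB leaf and node size that was still typical at the time and BTRFS_MAX_LEVEL == 8, this reserves (4096 + 4096 * 7) * num_items = 32 KiB per item, one third of what btrfs_calc_trans_metadata_size() above reserves, since a truncate only rewrites the existing path from root to leaf.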
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
@ -2146,6 +2261,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
u64 num_bytes, u64 *refs, u64 *flags);
int btrfs_pin_extent(struct btrfs_root *root,
u64 bytenr, u64 num, int reserved);
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes);
int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 offset, u64 bytenr);
@ -2196,8 +2314,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 root_objectid, u64 owner, u64 offset);
int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
u64 num_bytes, int reserve, int sinfo);
int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
u64 start, u64 len);
int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@ -2240,25 +2358,26 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
void btrfs_free_block_rsv(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
int btrfs_block_rsv_add(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
int btrfs_block_rsv_check(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, int min_factor);
int btrfs_block_rsv_refill(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 min_reserved, int min_factor);
u64 min_reserved);
int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 min_reserved);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv,
u64 num_bytes);
void btrfs_block_rsv_release(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
int btrfs_set_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
int btrfs_set_block_group_rw(struct btrfs_root *root,
@ -2379,6 +2498,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
smp_mb();
return fs_info->closing;
}
static inline void free_fs_info(struct btrfs_fs_info *fs_info)
{
kfree(fs_info->delayed_root);
kfree(fs_info->extent_root);
kfree(fs_info->tree_root);
kfree(fs_info->chunk_root);
kfree(fs_info->dev_root);
kfree(fs_info->csum_root);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
kfree(fs_info);
}
/* root-item.c */
int btrfs_find_root_ref(struct btrfs_root *tree_root,
@ -2561,7 +2692,8 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
void btrfs_evict_inode(struct inode *inode);
int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
void btrfs_dirty_inode(struct inode *inode, int flags);
int btrfs_dirty_inode(struct inode *inode);
int btrfs_update_time(struct file *file);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
@ -2579,11 +2711,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
int btrfs_orphan_cleanup(struct btrfs_root *root);
void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending,
u64 *bytes_to_reserve);
void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending);
void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
@ -2697,4 +2824,20 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
struct btrfs_scrub_progress *progress);
/* reada.c */
struct reada_control {
struct btrfs_root *root; /* tree to prefetch */
struct btrfs_key key_start;
struct btrfs_key key_end; /* exclusive */
atomic_t elems;
struct kref refcnt;
wait_queue_head_t wait;
};
struct reada_control *btrfs_reada_add(struct btrfs_root *root,
struct btrfs_key *start, struct btrfs_key *end);
int btrfs_reada_wait(void *handle);
void btrfs_reada_detach(void *handle);
int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
u64 start, int err);
#endif
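
The readahead interface declared here (and implemented in the new reada.c) is used by scrub to prefetch whole key ranges in the background. A hedged sketch of the intended call pattern, with placeholder key values that are not taken from this commit:

/* illustrative: prefetch the extent items covering [start, start + len) */
static void example_readahead_extents(struct btrfs_root *extent_root,
				      u64 start, u64 len)
{
	struct reada_control *rc;
	struct btrfs_key key_start;
	struct btrfs_key key_end;

	key_start.objectid = start;
	key_start.type = BTRFS_EXTENT_ITEM_KEY;
	key_start.offset = 0;

	key_end.objectid = start + len;
	key_end.type = BTRFS_EXTENT_ITEM_KEY;
	key_end.offset = 0;

	rc = btrfs_reada_add(extent_root, &key_start, &key_end);
	if (IS_ERR(rc))
		return;

	/* wait for the background readahead of the range to finish */
	btrfs_reada_wait(rc);
}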

View file

@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
return 0;
src_rsv = trans->block_rsv;
dst_rsv = &root->fs_info->global_block_rsv;
dst_rsv = &root->fs_info->delayed_block_rsv;
num_bytes = btrfs_calc_trans_metadata_size(root, 1);
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
if (!item->bytes_reserved)
return;
rsv = &root->fs_info->global_block_rsv;
rsv = &root->fs_info->delayed_block_rsv;
btrfs_block_rsv_release(root, rsv,
item->bytes_reserved);
}
@ -617,24 +617,102 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
static int btrfs_delayed_inode_reserve_metadata(
struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode,
struct btrfs_delayed_node *node)
{
struct btrfs_block_rsv *src_rsv;
struct btrfs_block_rsv *dst_rsv;
u64 num_bytes;
int ret;
if (!trans->bytes_reserved)
return 0;
int release = false;
src_rsv = trans->block_rsv;
dst_rsv = &root->fs_info->global_block_rsv;
dst_rsv = &root->fs_info->delayed_block_rsv;
num_bytes = btrfs_calc_trans_metadata_size(root, 1);
/*
* btrfs_dirty_inode will update the inode under btrfs_join_transaction
* which doesn't reserve space for speed. This is a problem since we
* still need to reserve space for this update, so try to reserve the
* space.
*
* Now if src_rsv == delalloc_block_rsv we'll let it just steal since
* we're accounted for.
*/
if (!src_rsv || (!trans->bytes_reserved &&
src_rsv != &root->fs_info->delalloc_block_rsv)) {
ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
/*
* Since we're under a transaction reserve_metadata_bytes could
* try to commit the transaction which will make it return
* EAGAIN to make us stop the transaction we have, so return
* ENOSPC instead so that btrfs_dirty_inode knows what to do.
*/
if (ret == -EAGAIN)
ret = -ENOSPC;
if (!ret)
node->bytes_reserved = num_bytes;
return ret;
} else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
spin_lock(&BTRFS_I(inode)->lock);
if (BTRFS_I(inode)->delalloc_meta_reserved) {
BTRFS_I(inode)->delalloc_meta_reserved = 0;
spin_unlock(&BTRFS_I(inode)->lock);
release = true;
goto migrate;
}
spin_unlock(&BTRFS_I(inode)->lock);
/* Ok we didn't have space pre-reserved. This shouldn't happen
* too often but it can happen if we do delalloc to an existing
* inode which gets dirtied because of the time update, and then
* isn't touched again until after the transaction commits and
* then we try to write out the data. First try to be nice and
* reserve something strictly for us. If not be a pain and try
* to steal from the delalloc block rsv.
*/
ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
if (!ret)
goto out;
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
if (!ret)
goto out;
/*
* Ok this is a problem, let's just steal from the global rsv
* since this really shouldn't happen that often.
*/
WARN_ON(1);
ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
dst_rsv, num_bytes);
goto out;
}
migrate:
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
out:
/*
* Migrate only takes a reservation, it doesn't touch the size of the
* block_rsv. This is to simplify people who don't normally have things
* migrated from their block rsv. If they go to release their
* reservation, that will decrease the size as well, so if migrate
* reduced size we'd end up with a negative size. But for the
* delalloc_meta_reserved stuff we will only know to drop 1 reservation,
* but we could in fact do this reserve/migrate dance several times
* between the time we did the original reservation and we'd clean it
* up. So to take care of this, release the space for the meta
* reservation here. I think it may be time for a documentation page on
* how block rsvs. work.
*/
if (!ret)
node->bytes_reserved = num_bytes;
if (release)
btrfs_block_rsv_release(root, src_rsv, num_bytes);
return ret;
}
@ -646,7 +724,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
if (!node->bytes_reserved)
return;
rsv = &root->fs_info->global_block_rsv;
rsv = &root->fs_info->delayed_block_rsv;
btrfs_block_rsv_release(root, rsv,
node->bytes_reserved);
node->bytes_reserved = 0;
@ -1026,7 +1104,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
block_rsv = trans->block_rsv;
trans->block_rsv = &root->fs_info->global_block_rsv;
trans->block_rsv = &root->fs_info->delayed_block_rsv;
delayed_root = btrfs_get_delayed_root(root);
@ -1069,7 +1147,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
block_rsv = trans->block_rsv;
trans->block_rsv = &node->root->fs_info->global_block_rsv;
trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
ret = btrfs_insert_delayed_items(trans, path, node->root, node);
if (!ret)
@ -1149,7 +1227,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
goto free_path;
block_rsv = trans->block_rsv;
trans->block_rsv = &root->fs_info->global_block_rsv;
trans->block_rsv = &root->fs_info->delayed_block_rsv;
ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
if (!ret)
@ -1685,12 +1763,10 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
goto release_node;
}
ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
/*
* we must reserve enough space when we start a new transaction,
* so reserving metadata failure is impossible
*/
BUG_ON(ret);
ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
delayed_node);
if (ret)
goto release_node;
fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
delayed_node->inode_dirty = 1;

View file

@ -256,8 +256,7 @@ void btrfs_csum_final(u32 crc, char *result)
static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
int verify)
{
u16 csum_size =
btrfs_super_csum_size(&root->fs_info->super_copy);
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
char *result = NULL;
unsigned long len;
unsigned long cur_len;
@ -367,7 +366,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
while (1) {
ret = read_extent_buffer_pages(io_tree, eb, start, 1,
ret = read_extent_buffer_pages(io_tree, eb, start,
WAIT_COMPLETE,
btree_get_extent, mirror_num);
if (!ret &&
!verify_parent_transid(io_tree, eb, parent_transid))
@ -608,11 +608,48 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
end = eb->start + end - 1;
err:
if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
btree_readahead_hook(root, eb, eb->start, ret);
}
free_extent_buffer(eb);
out:
return ret;
}
static int btree_io_failed_hook(struct bio *failed_bio,
struct page *page, u64 start, u64 end,
int mirror_num, struct extent_state *state)
{
struct extent_io_tree *tree;
unsigned long len;
struct extent_buffer *eb;
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
tree = &BTRFS_I(page->mapping->host)->io_tree;
if (page->private == EXTENT_PAGE_PRIVATE)
goto out;
if (!page->private)
goto out;
len = page->private >> 2;
WARN_ON(len == 0);
eb = alloc_extent_buffer(tree, start, len, page);
if (eb == NULL)
goto out;
if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
btree_readahead_hook(root, eb, eb->start, -EIO);
}
free_extent_buffer(eb);
out:
return -EIO; /* we fixed nothing */
}
static void end_workqueue_bio(struct bio *bio, int err)
{
struct end_io_wq *end_io_wq = bio->bi_private;
@ -908,7 +945,7 @@ static int btree_readpage(struct file *file, struct page *page)
{
struct extent_io_tree *tree;
tree = &BTRFS_I(page->mapping->host)->io_tree;
return extent_read_full_page(tree, page, btree_get_extent);
return extent_read_full_page(tree, page, btree_get_extent, 0);
}
static int btree_releasepage(struct page *page, gfp_t gfp_flags)
@ -974,11 +1011,43 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
if (!buf)
return 0;
read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
buf, 0, 0, btree_get_extent, 0);
buf, 0, WAIT_NONE, btree_get_extent, 0);
free_extent_buffer(buf);
return ret;
}
int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
int mirror_num, struct extent_buffer **eb)
{
struct extent_buffer *buf = NULL;
struct inode *btree_inode = root->fs_info->btree_inode;
struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
int ret;
buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
if (!buf)
return 0;
set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
btree_get_extent, mirror_num);
if (ret) {
free_extent_buffer(buf);
return ret;
}
if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
free_extent_buffer(buf);
return -EIO;
} else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
*eb = buf;
} else {
free_extent_buffer(buf);
}
return 0;
}
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize)
{
@ -1135,10 +1204,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
generation = btrfs_root_generation(&root->root_item);
blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
root->commit_root = NULL;
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
blocksize, generation);
if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
free_extent_buffer(root->node);
root->node = NULL;
return -EIO;
}
root->commit_root = btrfs_root_node(root);
@ -1577,6 +1648,235 @@ static int transaction_kthread(void *arg)
return 0;
}
/*
* this will find the highest generation in the array of
* root backups. The index of the highest array is returned,
* or -1 if we can't find anything.
*
* We check to make sure the array is valid by comparing the
* generation of the latest root in the array with the generation
* in the super block. If they don't match we pitch it.
*/
static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
{
u64 cur;
int newest_index = -1;
struct btrfs_root_backup *root_backup;
int i;
for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
root_backup = info->super_copy->super_roots + i;
cur = btrfs_backup_tree_root_gen(root_backup);
if (cur == newest_gen)
newest_index = i;
}
/* check to see if we actually wrapped around */
if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
root_backup = info->super_copy->super_roots;
cur = btrfs_backup_tree_root_gen(root_backup);
if (cur == newest_gen)
newest_index = 0;
}
return newest_index;
}
/*
* find the oldest backup so we know where to store new entries
* in the backup array. This will set the backup_root_index
* field in the fs_info struct
*/
static void find_oldest_super_backup(struct btrfs_fs_info *info,
u64 newest_gen)
{
int newest_index = -1;
newest_index = find_newest_super_backup(info, newest_gen);
/* if there was garbage in there, just move along */
if (newest_index == -1) {
info->backup_root_index = 0;
} else {
info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
}
}
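A minimal userspace sketch of the slot arithmetic above, assuming BTRFS_NUM_BACKUP_ROOTS is 4 and using a plain array of generations in place of the on-disk backup slots. It models the basic newest-slot lookup and the wrap-around to the slot that gets overwritten next; the extra wrap check that find_newest_super_backup does for slot 0 is left out.

#include <stdio.h>

#define NUM_BACKUP_ROOTS 4	/* assumption: mirrors BTRFS_NUM_BACKUP_ROOTS */

/* return the slot whose generation matches the super's generation, or -1 */
static int find_newest(const unsigned long long gens[], unsigned long long newest_gen)
{
	int newest = -1;
	int i;

	for (i = 0; i < NUM_BACKUP_ROOTS; i++)
		if (gens[i] == newest_gen)
			newest = i;
	return newest;
}

/* the next slot to overwrite sits one past the newest, wrapping around */
static int oldest_slot(const unsigned long long gens[], unsigned long long newest_gen)
{
	int newest = find_newest(gens, newest_gen);

	if (newest == -1)
		return 0;	/* garbage in the array: start over at slot 0 */
	return (newest + 1) % NUM_BACKUP_ROOTS;
}

int main(void)
{
	/* generations written by the last four commits; 103 is the newest */
	unsigned long long gens[NUM_BACKUP_ROOTS] = { 100, 101, 102, 103 };

	printf("newest slot: %d\n", find_newest(gens, 103));		/* 3 */
	printf("next slot to overwrite: %d\n", oldest_slot(gens, 103));	/* wraps to 0 */
	return 0;
}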
/*
* copy all the root pointers into the super backup array.
* this will bump the backup pointer by one when it is
* done
*/
static void backup_super_roots(struct btrfs_fs_info *info)
{
int next_backup;
struct btrfs_root_backup *root_backup;
int last_backup;
next_backup = info->backup_root_index;
last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
BTRFS_NUM_BACKUP_ROOTS;
/*
* just overwrite the last backup if we're at the same generation;
* this happens only at umount
*/
root_backup = info->super_for_commit->super_roots + last_backup;
if (btrfs_backup_tree_root_gen(root_backup) ==
btrfs_header_generation(info->tree_root->node))
next_backup = last_backup;
root_backup = info->super_for_commit->super_roots + next_backup;
/*
* make sure all of our padding and empty slots get zero filled
* regardless of which ones we use today
*/
memset(root_backup, 0, sizeof(*root_backup));
info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
btrfs_set_backup_tree_root_gen(root_backup,
btrfs_header_generation(info->tree_root->node));
btrfs_set_backup_tree_root_level(root_backup,
btrfs_header_level(info->tree_root->node));
btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
btrfs_set_backup_chunk_root_gen(root_backup,
btrfs_header_generation(info->chunk_root->node));
btrfs_set_backup_chunk_root_level(root_backup,
btrfs_header_level(info->chunk_root->node));
btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
btrfs_set_backup_extent_root_gen(root_backup,
btrfs_header_generation(info->extent_root->node));
btrfs_set_backup_extent_root_level(root_backup,
btrfs_header_level(info->extent_root->node));
/*
* we might commit during log recovery, which happens before we set
* the fs_root. Make sure it is valid before we fill it in.
*/
if (info->fs_root && info->fs_root->node) {
btrfs_set_backup_fs_root(root_backup,
info->fs_root->node->start);
btrfs_set_backup_fs_root_gen(root_backup,
btrfs_header_generation(info->fs_root->node));
btrfs_set_backup_fs_root_level(root_backup,
btrfs_header_level(info->fs_root->node));
}
btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
btrfs_set_backup_dev_root_gen(root_backup,
btrfs_header_generation(info->dev_root->node));
btrfs_set_backup_dev_root_level(root_backup,
btrfs_header_level(info->dev_root->node));
btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
btrfs_set_backup_csum_root_gen(root_backup,
btrfs_header_generation(info->csum_root->node));
btrfs_set_backup_csum_root_level(root_backup,
btrfs_header_level(info->csum_root->node));
btrfs_set_backup_total_bytes(root_backup,
btrfs_super_total_bytes(info->super_copy));
btrfs_set_backup_bytes_used(root_backup,
btrfs_super_bytes_used(info->super_copy));
btrfs_set_backup_num_devices(root_backup,
btrfs_super_num_devices(info->super_copy));
/*
* if we don't copy this out to the super_copy, it won't get remembered
* for the next commit
*/
memcpy(&info->super_copy->super_roots,
&info->super_for_commit->super_roots,
sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
}
/*
* this copies info out of the root backup array and back into
* the in-memory super block. It is meant to help iterate through
* the array, so you send it the number of backups you've already
* tried and the last backup index you used.
*
* this returns -1 when it has tried all the backups
*/
static noinline int next_root_backup(struct btrfs_fs_info *info,
struct btrfs_super_block *super,
int *num_backups_tried, int *backup_index)
{
struct btrfs_root_backup *root_backup;
int newest = *backup_index;
if (*num_backups_tried == 0) {
u64 gen = btrfs_super_generation(super);
newest = find_newest_super_backup(info, gen);
if (newest == -1)
return -1;
*backup_index = newest;
*num_backups_tried = 1;
} else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
/* we've tried all the backups, all done */
return -1;
} else {
/* jump to the next oldest backup */
newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
BTRFS_NUM_BACKUP_ROOTS;
*backup_index = newest;
*num_backups_tried += 1;
}
root_backup = super->super_roots + newest;
btrfs_set_super_generation(super,
btrfs_backup_tree_root_gen(root_backup));
btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
btrfs_set_super_root_level(super,
btrfs_backup_tree_root_level(root_backup));
btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
/*
* fixme: the total bytes and num_devices need to match, otherwise we
* should require an fsck
*/
btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
return 0;
}
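To make the iteration order concrete, here is a standalone sketch of how the retry loop in open_ctree (retry_root_backup, further down) steps through the slots via the logic above: start at the newest backup, then walk backwards modulo the ring size until every slot has been tried. The starting slot is invented for illustration.

#include <stdio.h>

#define NUM_BACKUP_ROOTS 4	/* assumption: mirrors BTRFS_NUM_BACKUP_ROOTS */

int main(void)
{
	int backup_index = 2;		/* pretend slot 2 holds the newest backup */
	int num_backups_tried = 0;

	while (1) {
		if (num_backups_tried == 0) {
			/* first failure: jump straight to the newest backup */
			num_backups_tried = 1;
		} else if (num_backups_tried == NUM_BACKUP_ROOTS) {
			/* every slot has been tried: give up, the mount fails */
			break;
		} else {
			/* step to the next-oldest slot, wrapping around the ring */
			backup_index = (backup_index + NUM_BACKUP_ROOTS - 1) %
				       NUM_BACKUP_ROOTS;
			num_backups_tried++;
		}
		printf("trying backup slot %d\n", backup_index);
		/* open_ctree would now re-read the tree roots from this slot */
	}
	return 0;
}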
/* helper to cleanup tree roots */
static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
{
free_extent_buffer(info->tree_root->node);
free_extent_buffer(info->tree_root->commit_root);
free_extent_buffer(info->dev_root->node);
free_extent_buffer(info->dev_root->commit_root);
free_extent_buffer(info->extent_root->node);
free_extent_buffer(info->extent_root->commit_root);
free_extent_buffer(info->csum_root->node);
free_extent_buffer(info->csum_root->commit_root);
info->tree_root->node = NULL;
info->tree_root->commit_root = NULL;
info->dev_root->node = NULL;
info->dev_root->commit_root = NULL;
info->extent_root->node = NULL;
info->extent_root->commit_root = NULL;
info->csum_root->node = NULL;
info->csum_root->commit_root = NULL;
if (chunk_root) {
free_extent_buffer(info->chunk_root->node);
free_extent_buffer(info->chunk_root->commit_root);
info->chunk_root->node = NULL;
info->chunk_root->commit_root = NULL;
}
}
struct btrfs_root *open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options)
@ -1590,29 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
u64 features;
struct btrfs_key location;
struct buffer_head *bh;
struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
GFP_NOFS);
struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
GFP_NOFS);
struct btrfs_super_block *disk_super;
struct btrfs_root *tree_root = btrfs_sb(sb);
struct btrfs_fs_info *fs_info = NULL;
struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
GFP_NOFS);
struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
GFP_NOFS);
struct btrfs_fs_info *fs_info = tree_root->fs_info;
struct btrfs_root *extent_root;
struct btrfs_root *csum_root;
struct btrfs_root *chunk_root;
struct btrfs_root *dev_root;
struct btrfs_root *log_tree_root;
int ret;
int err = -EINVAL;
int num_backups_tried = 0;
int backup_index = 0;
struct btrfs_super_block *disk_super;
extent_root = fs_info->extent_root =
kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
csum_root = fs_info->csum_root =
kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
chunk_root = fs_info->chunk_root =
kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
dev_root = fs_info->dev_root =
kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
if (!extent_root || !tree_root || !tree_root->fs_info ||
!chunk_root || !dev_root || !csum_root) {
if (!extent_root || !csum_root || !chunk_root || !dev_root) {
err = -ENOMEM;
goto fail;
}
fs_info = tree_root->fs_info;
ret = init_srcu_struct(&fs_info->subvol_srcu);
if (ret) {
@ -1648,15 +1951,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
spin_lock_init(&fs_info->free_chunk_lock);
mutex_init(&fs_info->reloc_mutex);
init_completion(&fs_info->kobj_unregister);
fs_info->tree_root = tree_root;
fs_info->extent_root = extent_root;
fs_info->csum_root = csum_root;
fs_info->chunk_root = chunk_root;
fs_info->dev_root = dev_root;
fs_info->fs_devices = fs_devices;
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
btrfs_mapping_init(&fs_info->mapping_tree);
@ -1665,8 +1963,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_init_block_rsv(&fs_info->trans_block_rsv);
btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
btrfs_init_block_rsv(&fs_info->empty_block_rsv);
INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
mutex_init(&fs_info->durable_block_rsv_mutex);
btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
atomic_set(&fs_info->nr_async_submits, 0);
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->async_submit_draining, 0);
@ -1677,6 +1974,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
fs_info->trans_no_join = 0;
fs_info->free_chunk_space = 0;
/* readahead state */
INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
spin_lock_init(&fs_info->reada_lock);
fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
@ -1766,14 +2068,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
goto fail_alloc;
}
memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
sizeof(fs_info->super_for_commit));
memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
memcpy(fs_info->super_for_commit, fs_info->super_copy,
sizeof(*fs_info->super_for_commit));
brelse(bh);
memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
disk_super = &fs_info->super_copy;
disk_super = fs_info->super_copy;
if (!btrfs_super_root(disk_super))
goto fail_alloc;
@ -1782,6 +2084,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
/*
* run through our array of backup supers and setup
* our ring pointer to the oldest one
*/
generation = btrfs_super_generation(disk_super);
find_oldest_super_backup(fs_info, generation);
/*
* In the long term, we'll store the compression type in the super
* block, and it'll be used for per file compression control.
@ -1870,6 +2179,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
fs_info->thread_pool_size,
&fs_info->generic_worker);
btrfs_init_workers(&fs_info->readahead_workers, "readahead",
fs_info->thread_pool_size,
&fs_info->generic_worker);
/*
* endios are largely parallel and should have a very
@ -1880,19 +2192,29 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->endio_write_workers.idle_thresh = 2;
fs_info->endio_meta_write_workers.idle_thresh = 2;
fs_info->readahead_workers.idle_thresh = 2;
btrfs_start_workers(&fs_info->workers, 1);
btrfs_start_workers(&fs_info->generic_worker, 1);
btrfs_start_workers(&fs_info->submit_workers, 1);
btrfs_start_workers(&fs_info->delalloc_workers, 1);
btrfs_start_workers(&fs_info->fixup_workers, 1);
btrfs_start_workers(&fs_info->endio_workers, 1);
btrfs_start_workers(&fs_info->endio_meta_workers, 1);
btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
btrfs_start_workers(&fs_info->endio_write_workers, 1);
btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
btrfs_start_workers(&fs_info->delayed_workers, 1);
btrfs_start_workers(&fs_info->caching_workers, 1);
/*
* btrfs_start_workers can really only fail because of ENOMEM so just
* return -ENOMEM if any of these fail.
*/
ret = btrfs_start_workers(&fs_info->workers);
ret |= btrfs_start_workers(&fs_info->generic_worker);
ret |= btrfs_start_workers(&fs_info->submit_workers);
ret |= btrfs_start_workers(&fs_info->delalloc_workers);
ret |= btrfs_start_workers(&fs_info->fixup_workers);
ret |= btrfs_start_workers(&fs_info->endio_workers);
ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
ret |= btrfs_start_workers(&fs_info->endio_write_workers);
ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
ret |= btrfs_start_workers(&fs_info->delayed_workers);
ret |= btrfs_start_workers(&fs_info->caching_workers);
ret |= btrfs_start_workers(&fs_info->readahead_workers);
if (ret) {
ret = -ENOMEM;
goto fail_sb_buffer;
}
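The OR-accumulation above only records that some worker pool failed to start, not which errno it returned (OR-ing two different negative errnos can even produce a value that is no errno at all), which is why the result is collapsed to -ENOMEM before jumping to the error path. A small standalone sketch of the pattern, with invented stand-ins for the worker starters:

#include <errno.h>
#include <stdio.h>

/* stand-ins for btrfs_start_workers(); imagine one of them failing */
static int start_pool_a(void) { return 0; }
static int start_pool_b(void) { return -ENOMEM; }
static int start_pool_c(void) { return 0; }

int main(void)
{
	int ret;

	ret = start_pool_a();
	ret |= start_pool_b();
	ret |= start_pool_c();

	if (ret) {
		/* the OR result is not a reliable errno by itself; pick one */
		ret = -ENOMEM;
		fprintf(stderr, "worker start failed: %d\n", ret);
		return 1;
	}
	return 0;
}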
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@ -1939,7 +2261,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
sb->s_id);
goto fail_chunk_root;
goto fail_tree_roots;
}
btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
chunk_root->commit_root = btrfs_root_node(chunk_root);
@ -1954,11 +2276,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
if (ret) {
printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
sb->s_id);
goto fail_chunk_root;
goto fail_tree_roots;
}
btrfs_close_extra_devices(fs_devices);
retry_root_backup:
blocksize = btrfs_level_size(tree_root,
btrfs_super_root_level(disk_super));
generation = btrfs_super_generation(disk_super);
@ -1966,32 +2289,33 @@ struct btrfs_root *open_ctree(struct super_block *sb,
tree_root->node = read_tree_block(tree_root,
btrfs_super_root(disk_super),
blocksize, generation);
if (!tree_root->node)
goto fail_chunk_root;
if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
if (!tree_root->node ||
!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
sb->s_id);
goto fail_tree_root;
goto recovery_tree_root;
}
btrfs_set_root_node(&tree_root->root_item, tree_root->node);
tree_root->commit_root = btrfs_root_node(tree_root);
ret = find_and_setup_root(tree_root, fs_info,
BTRFS_EXTENT_TREE_OBJECTID, extent_root);
if (ret)
goto fail_tree_root;
goto recovery_tree_root;
extent_root->track_dirty = 1;
ret = find_and_setup_root(tree_root, fs_info,
BTRFS_DEV_TREE_OBJECTID, dev_root);
if (ret)
goto fail_extent_root;
goto recovery_tree_root;
dev_root->track_dirty = 1;
ret = find_and_setup_root(tree_root, fs_info,
BTRFS_CSUM_TREE_OBJECTID, csum_root);
if (ret)
goto fail_dev_root;
goto recovery_tree_root;
csum_root->track_dirty = 1;
@ -2124,22 +2448,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fail_block_groups:
btrfs_free_block_groups(fs_info);
free_extent_buffer(csum_root->node);
free_extent_buffer(csum_root->commit_root);
fail_dev_root:
free_extent_buffer(dev_root->node);
free_extent_buffer(dev_root->commit_root);
fail_extent_root:
free_extent_buffer(extent_root->node);
free_extent_buffer(extent_root->commit_root);
fail_tree_root:
free_extent_buffer(tree_root->node);
free_extent_buffer(tree_root->commit_root);
fail_chunk_root:
free_extent_buffer(chunk_root->node);
free_extent_buffer(chunk_root->commit_root);
fail_tree_roots:
free_root_pointers(fs_info, 1);
fail_sb_buffer:
btrfs_stop_workers(&fs_info->generic_worker);
btrfs_stop_workers(&fs_info->readahead_workers);
btrfs_stop_workers(&fs_info->fixup_workers);
btrfs_stop_workers(&fs_info->delalloc_workers);
btrfs_stop_workers(&fs_info->workers);
@ -2152,25 +2467,37 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_stop_workers(&fs_info->delayed_workers);
btrfs_stop_workers(&fs_info->caching_workers);
fail_alloc:
kfree(fs_info->delayed_root);
fail_iput:
btrfs_mapping_tree_free(&fs_info->mapping_tree);
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
iput(fs_info->btree_inode);
btrfs_close_devices(fs_info->fs_devices);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
fail_bdi:
bdi_destroy(&fs_info->bdi);
fail_srcu:
cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
kfree(extent_root);
kfree(tree_root);
kfree(fs_info);
kfree(chunk_root);
kfree(dev_root);
kfree(csum_root);
btrfs_close_devices(fs_info->fs_devices);
free_fs_info(fs_info);
return ERR_PTR(err);
recovery_tree_root:
if (!btrfs_test_opt(tree_root, RECOVERY))
goto fail_tree_roots;
free_root_pointers(fs_info, 0);
/* don't use the log in recovery mode, it won't be valid */
btrfs_set_super_log_root(disk_super, 0);
/* we can't trust the free space cache either */
btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
ret = next_root_backup(fs_info, fs_info->super_copy,
&num_backups_tried, &backup_index);
if (ret == -1)
goto fail_block_groups;
goto retry_root_backup;
}
static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
@ -2254,22 +2581,10 @@ static int write_dev_supers(struct btrfs_device *device,
int errors = 0;
u32 crc;
u64 bytenr;
int last_barrier = 0;
if (max_mirrors == 0)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
/* make sure only the last submit_bh does a barrier */
if (do_barriers) {
for (i = 0; i < max_mirrors; i++) {
bytenr = btrfs_sb_offset(i);
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
device->total_bytes)
break;
last_barrier = i;
}
}
for (i = 0; i < max_mirrors; i++) {
bytenr = btrfs_sb_offset(i);
if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
@ -2315,17 +2630,136 @@ static int write_dev_supers(struct btrfs_device *device,
bh->b_end_io = btrfs_end_buffer_write_sync;
}
if (i == last_barrier && do_barriers)
ret = submit_bh(WRITE_FLUSH_FUA, bh);
else
ret = submit_bh(WRITE_SYNC, bh);
/*
* we fua the first super. The others we allow
* to go down lazy.
*/
ret = submit_bh(WRITE_FUA, bh);
if (ret)
errors++;
}
return errors < i ? 0 : -1;
}
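The final "errors < i" test above means the super-block write is treated as a success as long as at least one of the attempted copies made it to disk. A tiny standalone illustration of that rule; the per-copy results and the mirror count (BTRFS_SUPER_MIRROR_MAX, assumed to be 3) are invented:

#include <stdio.h>

/* pretend results for each super block copy: 1 = error, 0 = written */
static int write_one_super(int copy)
{
	return copy == 0 ? 1 : 0;	/* say the primary copy failed */
}

int main(void)
{
	int max_mirrors = 3;	/* assumption: BTRFS_SUPER_MIRROR_MAX */
	int errors = 0;
	int i;

	for (i = 0; i < max_mirrors; i++)
		errors += write_one_super(i);

	/* as in write_dev_supers: ok as long as at least one copy made it */
	printf("%s\n", errors < i ? "success" : "failure");
	return 0;
}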
/*
* endio for the write_dev_flush, this will wake anyone waiting
* for the barrier when it is done
*/
static void btrfs_end_empty_barrier(struct bio *bio, int err)
{
if (err) {
if (err == -EOPNOTSUPP)
set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
clear_bit(BIO_UPTODATE, &bio->bi_flags);
}
if (bio->bi_private)
complete(bio->bi_private);
bio_put(bio);
}
/*
* trigger flushes for one of the devices. If you pass wait == 0, the flush is
* sent down without waiting for it to finish. With wait == 1, it waits for
* the previously submitted flush to complete.
*
* any device where the flush fails with EOPNOTSUPP is flagged as not
* barrier capable
*/
static int write_dev_flush(struct btrfs_device *device, int wait)
{
struct bio *bio;
int ret = 0;
if (device->nobarriers)
return 0;
if (wait) {
bio = device->flush_bio;
if (!bio)
return 0;
wait_for_completion(&device->flush_wait);
if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
printk("btrfs: disabling barriers on dev %s\n",
device->name);
device->nobarriers = 1;
}
if (!bio_flagged(bio, BIO_UPTODATE)) {
ret = -EIO;
}
/* drop the reference from the wait == 0 run */
bio_put(bio);
device->flush_bio = NULL;
return ret;
}
/*
* one reference for us, and we leave it for the
* caller
*/
device->flush_bio = NULL;
bio = bio_alloc(GFP_NOFS, 0);
if (!bio)
return -ENOMEM;
bio->bi_end_io = btrfs_end_empty_barrier;
bio->bi_bdev = device->bdev;
init_completion(&device->flush_wait);
bio->bi_private = &device->flush_wait;
device->flush_bio = bio;
bio_get(bio);
submit_bio(WRITE_FLUSH, bio);
return 0;
}
/*
* send an empty flush down to each device in parallel,
* then wait for them
*/
static int barrier_all_devices(struct btrfs_fs_info *info)
{
struct list_head *head;
struct btrfs_device *dev;
int errors = 0;
int ret;
/* send down all the barriers */
head = &info->fs_devices->devices;
list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev) {
errors++;
continue;
}
if (!dev->in_fs_metadata || !dev->writeable)
continue;
ret = write_dev_flush(dev, 0);
if (ret)
errors++;
}
/* wait for all the barriers */
list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev) {
errors++;
continue;
}
if (!dev->in_fs_metadata || !dev->writeable)
continue;
ret = write_dev_flush(dev, 1);
if (ret)
errors++;
}
if (errors)
return -EIO;
return 0;
}
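A rough userspace model of the submit-then-wait structure above, using pthreads in place of bios and completions: the "flushes" for all devices are kicked off first and only waited on afterwards, so the slowest device bounds the total time instead of the flushes serializing. Device names and delays are made up.

/* build with: cc -pthread flush_model.c (file name is just an example) */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define NUM_DEVS 3

struct fake_dev {
	const char *name;
	unsigned int delay_ms;	/* pretend flush latency */
	pthread_t thread;
	int error;
};

static void *flush_one(void *arg)
{
	struct fake_dev *dev = arg;

	usleep(dev->delay_ms * 1000);	/* stand-in for the actual cache flush */
	dev->error = 0;
	return NULL;
}

int main(void)
{
	struct fake_dev devs[NUM_DEVS] = {
		{ "sda", 30 }, { "sdb", 50 }, { "sdc", 10 },
	};
	int errors = 0;
	int i;

	/* phase 1: send down all the "barriers" without waiting */
	for (i = 0; i < NUM_DEVS; i++)
		pthread_create(&devs[i].thread, NULL, flush_one, &devs[i]);

	/* phase 2: wait for every flush and collect errors */
	for (i = 0; i < NUM_DEVS; i++) {
		pthread_join(devs[i].thread, NULL);
		if (devs[i].error)
			errors++;
		printf("flushed %s\n", devs[i].name);
	}

	return errors ? 1 : 0;
}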
int write_all_supers(struct btrfs_root *root, int max_mirrors)
{
struct list_head *head;
@ -2338,14 +2772,19 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
int total_errors = 0;
u64 flags;
max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
do_barriers = !btrfs_test_opt(root, NOBARRIER);
backup_super_roots(root->fs_info);
sb = &root->fs_info->super_for_commit;
sb = root->fs_info->super_for_commit;
dev_item = &sb->dev_item;
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
head = &root->fs_info->fs_devices->devices;
if (do_barriers)
barrier_all_devices(root->fs_info);
list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev) {
total_errors++;
@ -2545,8 +2984,6 @@ int close_ctree(struct btrfs_root *root)
/* clear out the rbtree of defraggable inodes */
btrfs_run_defrag_inodes(root->fs_info);
btrfs_put_block_group_cache(fs_info);
/*
* Here come 2 situations when btrfs is broken to flip readonly:
*
@ -2572,6 +3009,8 @@ int close_ctree(struct btrfs_root *root)
printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
}
btrfs_put_block_group_cache(fs_info);
kthread_stop(root->fs_info->transaction_kthread);
kthread_stop(root->fs_info->cleaner_kthread);
@ -2603,7 +3042,6 @@ int close_ctree(struct btrfs_root *root)
del_fs_roots(fs_info);
iput(fs_info->btree_inode);
kfree(fs_info->delayed_root);
btrfs_stop_workers(&fs_info->generic_worker);
btrfs_stop_workers(&fs_info->fixup_workers);
@ -2617,6 +3055,7 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->submit_workers);
btrfs_stop_workers(&fs_info->delayed_workers);
btrfs_stop_workers(&fs_info->caching_workers);
btrfs_stop_workers(&fs_info->readahead_workers);
btrfs_close_devices(fs_info->fs_devices);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
@ -2624,12 +3063,7 @@ int close_ctree(struct btrfs_root *root)
bdi_destroy(&fs_info->bdi);
cleanup_srcu_struct(&fs_info->subvol_srcu);
kfree(fs_info->extent_root);
kfree(fs_info->tree_root);
kfree(fs_info->chunk_root);
kfree(fs_info->dev_root);
kfree(fs_info->csum_root);
kfree(fs_info);
free_fs_info(fs_info);
return 0;
}
@ -2735,7 +3169,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
return ret;
}
int btree_lock_page_hook(struct page *page)
static int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *))
{
struct inode *inode = page->mapping->host;
struct btrfs_root *root = BTRFS_I(inode)->root;
@ -2752,7 +3187,10 @@ int btree_lock_page_hook(struct page *page)
if (!eb)
goto out;
btrfs_tree_lock(eb);
if (!btrfs_try_tree_write_lock(eb)) {
flush_fn(data);
btrfs_tree_lock(eb);
}
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
@ -2767,7 +3205,10 @@ int btree_lock_page_hook(struct page *page)
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
out:
lock_page(page);
if (!trylock_page(page)) {
flush_fn(data);
lock_page(page);
}
return 0;
}
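A standalone model of the try-lock-then-flush pattern used twice above, with a pthread mutex standing in for the tree lock and the page lock: try the lock first, and only if it is contended flush whatever writes we have queued before blocking, so we never sit on unsubmitted IO that the lock holder might be waiting for. The flush is just a stub here.

/* build with: cc -pthread lock_model.c (file name is just an example) */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void flush_pending_work(void *data)
{
	/* stand-in for flush_fn(): submit any bios we are sitting on */
	printf("flushing pending work before blocking\n");
	(void)data;
}

static void lock_with_flush(void *data)
{
	if (pthread_mutex_trylock(&lock) != 0) {
		/* contended: push out our queued IO, then wait for the lock */
		flush_pending_work(data);
		pthread_mutex_lock(&lock);
	}
}

int main(void)
{
	lock_with_flush(NULL);
	printf("lock acquired\n");
	pthread_mutex_unlock(&lock);
	return 0;
}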
@ -3123,6 +3564,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
static struct extent_io_ops btree_extent_io_ops = {
.write_cache_pages_lock_hook = btree_lock_page_hook,
.readpage_end_io_hook = btree_readpage_end_io_hook,
.readpage_io_failed_hook = btree_io_failed_hook,
.submit_bio_hook = btree_submit_bio_hook,
/* note we're sharing with inode.c for the merge bio hook */
.merge_bio_hook = btrfs_merge_bio_hook,


@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
u32 blocksize, u64 parent_transid);
int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
u64 parent_transid);
int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
int mirror_num, struct extent_buffer **eb);
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize);
int clean_tree_block(struct btrfs_trans_handle *trans,
@ -83,8 +85,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btree_lock_page_hook(struct page *page);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_init_lockdep(void);

File diff suppressed because it is too large


@ -17,6 +17,7 @@
#include "compat.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
@ -894,6 +895,202 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
goto again;
}
/**
* convert_extent_bit - convert all bits in a given range from one bit to another
* @tree: the io tree to search
* @start: the start offset in bytes
* @end: the end offset in bytes (inclusive)
* @bits: the bits to set in this range
* @clear_bits: the bits to clear in this range
* @mask: the allocation mask
*
* This will go through and set bits for the given range. If any states exist
* already in this range they are set with the given bit and cleared of the
* clear_bits. This is only meant to be used by things that are mergeable, ie
* converting from say DELALLOC to DIRTY. This is not meant to be used with
* boundary bits like LOCK.
*/
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int bits, int clear_bits, gfp_t mask)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
struct rb_node *node;
int err = 0;
u64 last_start;
u64 last_end;
again:
if (!prealloc && (mask & __GFP_WAIT)) {
prealloc = alloc_extent_state(mask);
if (!prealloc)
return -ENOMEM;
}
spin_lock(&tree->lock);
/*
* this search will find all the extents that end after
* our range starts.
*/
node = tree_search(tree, start);
if (!node) {
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
err = -ENOMEM;
goto out;
}
err = insert_state(tree, prealloc, start, end, &bits);
prealloc = NULL;
BUG_ON(err == -EEXIST);
goto out;
}
state = rb_entry(node, struct extent_state, rb_node);
hit_next:
last_start = state->start;
last_end = state->end;
/*
* | ---- desired range ---- |
* | state |
*
* Just lock what we found and keep going
*/
if (state->start == start && state->end <= end) {
struct rb_node *next_node;
set_state_bits(tree, state, &bits);
clear_state_bit(tree, state, &clear_bits, 0);
merge_state(tree, state);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
next_node = rb_next(&state->rb_node);
if (next_node && start < end && prealloc && !need_resched()) {
state = rb_entry(next_node, struct extent_state,
rb_node);
if (state->start == start)
goto hit_next;
}
goto search_again;
}
/*
* | ---- desired range ---- |
* | state |
* or
* | ------------- state -------------- |
*
* We need to split the extent we found, and may flip bits on
* second half.
*
* If the extent we found extends past our
* range, we just split and search again. It'll get split
* again the next time though.
*
* If the extent we found is inside our range, we set the
* desired bit on it.
*/
if (state->start < start) {
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
err = -ENOMEM;
goto out;
}
err = split_state(tree, state, prealloc, start);
BUG_ON(err == -EEXIST);
prealloc = NULL;
if (err)
goto out;
if (state->end <= end) {
set_state_bits(tree, state, &bits);
clear_state_bit(tree, state, &clear_bits, 0);
merge_state(tree, state);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
}
goto search_again;
}
/*
* | ---- desired range ---- |
* | state | or | state |
*
* There's a hole, we need to insert something in it and
* ignore the extent we found.
*/
if (state->start > start) {
u64 this_end;
if (end < last_start)
this_end = end;
else
this_end = last_start - 1;
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
err = -ENOMEM;
goto out;
}
/*
* Avoid freeing 'prealloc' if it can be merged with
* the later extent.
*/
err = insert_state(tree, prealloc, start, this_end,
&bits);
BUG_ON(err == -EEXIST);
if (err) {
free_extent_state(prealloc);
prealloc = NULL;
goto out;
}
prealloc = NULL;
start = this_end + 1;
goto search_again;
}
/*
* | ---- desired range ---- |
* | state |
* We need to split the extent, and set the bit
* on the first half
*/
if (state->start <= end && state->end > end) {
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
err = -ENOMEM;
goto out;
}
err = split_state(tree, state, prealloc, end + 1);
BUG_ON(err == -EEXIST);
set_state_bits(tree, prealloc, &bits);
clear_state_bit(tree, prealloc, &clear_bits, 0);
merge_state(tree, prealloc);
prealloc = NULL;
goto out;
}
goto search_again;
out:
spin_unlock(&tree->lock);
if (prealloc)
free_extent_state(prealloc);
return err;
search_again:
if (start > end)
goto out;
spin_unlock(&tree->lock);
if (mask & __GFP_WAIT)
cond_resched();
goto again;
}
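Per extent_state, the conversion boils down to OR-ing in the new bits and masking off the old ones (the set_state_bits/clear_state_bit pair above); the rest of the function is about splitting and merging states so the operation lands on exactly the requested range. A standalone sketch of the per-state step, with made-up bit values standing in for the EXTENT_* flags:

#include <stdio.h>

/* made-up stand-ins for the EXTENT_* flags */
#define FAKE_DIRTY    (1 << 0)
#define FAKE_UPTODATE (1 << 3)
#define FAKE_DELALLOC (1 << 6)

/* what convert_extent_bit does to the ->state field of each extent_state */
static unsigned long convert_bits(unsigned long state, unsigned long bits,
				  unsigned long clear_bits)
{
	state |= bits;		/* set_state_bits() */
	state &= ~clear_bits;	/* clear_state_bit() */
	return state;
}

int main(void)
{
	unsigned long state = FAKE_DELALLOC | FAKE_UPTODATE;

	state = convert_bits(state, FAKE_DIRTY, FAKE_DELALLOC);
	printf("dirty=%d delalloc=%d uptodate=%d\n",
	       !!(state & FAKE_DIRTY), !!(state & FAKE_DELALLOC),
	       !!(state & FAKE_UPTODATE));	/* dirty=1 delalloc=0 uptodate=1 */
	return 0;
}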
/* wrappers around set/clear extent bit */
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
@ -919,7 +1116,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask)
{
return set_extent_bit(tree, start, end,
EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
EXTENT_DELALLOC | EXTENT_UPTODATE,
0, NULL, cached_state, mask);
}
@ -1599,6 +1796,368 @@ static int check_page_writeback(struct extent_io_tree *tree,
return 0;
}
/*
* When IO fails, either with EIO or csum verification fails, we
* try other mirrors that might have a good copy of the data. This
* io_failure_record is used to record state as we go through all the
* mirrors. If another mirror has good data, the page is set up to date
* and things continue. If a good mirror can't be found, the original
* bio end_io callback is called to indicate things have failed.
*/
struct io_failure_record {
struct page *page;
u64 start;
u64 len;
u64 logical;
unsigned long bio_flags;
int this_mirror;
int failed_mirror;
int in_validation;
};
static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
int did_repair)
{
int ret;
int err = 0;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
set_state_private(failure_tree, rec->start, 0);
ret = clear_extent_bits(failure_tree, rec->start,
rec->start + rec->len - 1,
EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
if (ret)
err = ret;
if (did_repair) {
ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
rec->start + rec->len - 1,
EXTENT_DAMAGED, GFP_NOFS);
if (ret && !err)
err = ret;
}
kfree(rec);
return err;
}
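The failure record is not tracked in a separate lookup structure: its pointer is stashed directly in the failure tree's per-range private u64 via set_state_private() and cast back when it is looked up again (see clean_io_failure and bio_readpage_error below). A standalone sketch of that round-trip, assuming pointers fit in an unsigned long as they do on the kernel's supported configurations; the struct and field names here are invented:

#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>

struct fake_failrec {
	uint64_t start;
	uint64_t len;
	int failed_mirror;
};

int main(void)
{
	struct fake_failrec *rec = calloc(1, sizeof(*rec));
	uint64_t private;
	struct fake_failrec *back;

	rec->start = 4096;
	rec->len = 4096;
	rec->failed_mirror = 1;

	/* what set_state_private() stores for the range */
	private = (uint64_t)(unsigned long)rec;

	/* what get_state_private() plus the cast in clean_io_failure() recover */
	back = (struct fake_failrec *)(unsigned long)private;

	printf("start=%" PRIu64 " mirror=%d\n", back->start, back->failed_mirror);
	free(rec);
	return 0;
}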
static void repair_io_failure_callback(struct bio *bio, int err)
{
complete(bio->bi_private);
}
/*
* this bypasses the standard btrfs submit functions deliberately, as
* the standard behavior is to write all copies in a raid setup. here we only
* want to write the one bad copy. so we do the mapping for ourselves and issue
* submit_bio directly.
* to avoid any synchronization issues, wait for the data after writing, which
* actually prevents the read that triggered the error from finishing.
* currently, there can be no more than two copies of every data bit. thus,
* exactly one rewrite is required.
*/
int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
u64 length, u64 logical, struct page *page,
int mirror_num)
{
struct bio *bio;
struct btrfs_device *dev;
DECLARE_COMPLETION_ONSTACK(compl);
u64 map_length = 0;
u64 sector;
struct btrfs_bio *bbio = NULL;
int ret;
BUG_ON(!mirror_num);
bio = bio_alloc(GFP_NOFS, 1);
if (!bio)
return -EIO;
bio->bi_private = &compl;
bio->bi_end_io = repair_io_failure_callback;
bio->bi_size = 0;
map_length = length;
ret = btrfs_map_block(map_tree, WRITE, logical,
&map_length, &bbio, mirror_num);
if (ret) {
bio_put(bio);
return -EIO;
}
BUG_ON(mirror_num != bbio->mirror_num);
sector = bbio->stripes[mirror_num-1].physical >> 9;
bio->bi_sector = sector;
dev = bbio->stripes[mirror_num-1].dev;
kfree(bbio);
if (!dev || !dev->bdev || !dev->writeable) {
bio_put(bio);
return -EIO;
}
bio->bi_bdev = dev->bdev;
bio_add_page(bio, page, length, start-page_offset(page));
submit_bio(WRITE_SYNC, bio);
wait_for_completion(&compl);
if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
/* try to remap that extent elsewhere? */
bio_put(bio);
return -EIO;
}
printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
"sector %llu)\n", page->mapping->host->i_ino, start,
dev->name, sector);
bio_put(bio);
return 0;
}
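Two details above are easy to misread: the physical offset that btrfs_map_block returns is in bytes and must be shifted down to 512-byte sectors for bi_sector, and mirror_num is 1-based, so it indexes stripes[mirror_num - 1]. A quick standalone check with invented offsets:

#include <stdio.h>
#include <inttypes.h>

struct fake_stripe {
	uint64_t physical;	/* byte offset on the device */
};

int main(void)
{
	struct fake_stripe stripes[2] = {
		{ .physical =  1048576ULL },	/* mirror 1 */
		{ .physical = 73400320ULL },	/* mirror 2 */
	};
	int mirror_num = 2;	/* 1-based, as in the btrfs code */
	uint64_t sector = stripes[mirror_num - 1].physical >> 9;

	printf("mirror %d -> sector %" PRIu64 "\n", mirror_num, sector);
	/* 73400320 / 512 = 143360 */
	return 0;
}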
/*
* each time an IO finishes, we do a fast check in the IO failure tree
* to see if we need to process or clean up an io_failure_record
*/
static int clean_io_failure(u64 start, struct page *page)
{
u64 private;
u64 private_failure;
struct io_failure_record *failrec;
struct btrfs_mapping_tree *map_tree;
struct extent_state *state;
int num_copies;
int did_repair = 0;
int ret;
struct inode *inode = page->mapping->host;
private = 0;
ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
(u64)-1, 1, EXTENT_DIRTY, 0);
if (!ret)
return 0;
ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
&private_failure);
if (ret)
return 0;
failrec = (struct io_failure_record *)(unsigned long) private_failure;
BUG_ON(!failrec->this_mirror);
if (failrec->in_validation) {
/* there was no real error, just free the record */
pr_debug("clean_io_failure: freeing dummy error at %llu\n",
failrec->start);
did_repair = 1;
goto out;
}
spin_lock(&BTRFS_I(inode)->io_tree.lock);
state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
failrec->start,
EXTENT_LOCKED);
spin_unlock(&BTRFS_I(inode)->io_tree.lock);
if (state && state->start == failrec->start) {
map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
num_copies = btrfs_num_copies(map_tree, failrec->logical,
failrec->len);
if (num_copies > 1) {
ret = repair_io_failure(map_tree, start, failrec->len,
failrec->logical, page,
failrec->failed_mirror);
did_repair = !ret;
}
}
out:
if (!ret)
ret = free_io_failure(inode, failrec, did_repair);
return ret;
}
/*
* this is a generic handler for readpage errors (default
* readpage_io_failed_hook). if other copies exist, read those and write back
* good data to the failed position. it does not attempt to remap the failed
* extent elsewhere, hoping the device will be smart enough to do this as
* needed
*/
static int bio_readpage_error(struct bio *failed_bio, struct page *page,
u64 start, u64 end, int failed_mirror,
struct extent_state *state)
{
struct io_failure_record *failrec = NULL;
u64 private;
struct extent_map *em;
struct inode *inode = page->mapping->host;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct bio *bio;
int num_copies;
int ret;
int read_mode;
u64 logical;
BUG_ON(failed_bio->bi_rw & REQ_WRITE);
ret = get_state_private(failure_tree, start, &private);
if (ret) {
failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
if (!failrec)
return -ENOMEM;
failrec->start = start;
failrec->len = end - start + 1;
failrec->this_mirror = 0;
failrec->bio_flags = 0;
failrec->in_validation = 0;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, failrec->len);
if (!em) {
read_unlock(&em_tree->lock);
kfree(failrec);
return -EIO;
}
if (em->start > start || em->start + em->len < start) {
free_extent_map(em);
em = NULL;
}
read_unlock(&em_tree->lock);
if (!em || IS_ERR(em)) {
kfree(failrec);
return -EIO;
}
logical = start - em->start;
logical = em->block_start + logical;
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
logical = em->block_start;
failrec->bio_flags = EXTENT_BIO_COMPRESSED;
extent_set_compress_type(&failrec->bio_flags,
em->compress_type);
}
pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
"len=%llu\n", logical, start, failrec->len);
failrec->logical = logical;
free_extent_map(em);
/* set the bits in the private failure tree */
ret = set_extent_bits(failure_tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
if (ret >= 0)
ret = set_state_private(failure_tree, start,
(u64)(unsigned long)failrec);
/* set the bits in the inode's tree */
if (ret >= 0)
ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
GFP_NOFS);
if (ret < 0) {
kfree(failrec);
return ret;
}
} else {
failrec = (struct io_failure_record *)(unsigned long)private;
pr_debug("bio_readpage_error: (found) logical=%llu, "
"start=%llu, len=%llu, validation=%d\n",
failrec->logical, failrec->start, failrec->len,
failrec->in_validation);
/*
* when data can be on disk more than twice, add to failrec here
* (e.g. with a list for failed_mirror) to make
* clean_io_failure() clean all those errors at once.
*/
}
num_copies = btrfs_num_copies(
&BTRFS_I(inode)->root->fs_info->mapping_tree,
failrec->logical, failrec->len);
if (num_copies == 1) {
/*
* we only have a single copy of the data, so don't bother with
* all the retry and error correction code that follows. no
* matter what the error is, it is very likely to persist.
*/
pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
"state=%p, num_copies=%d, next_mirror %d, "
"failed_mirror %d\n", state, num_copies,
failrec->this_mirror, failed_mirror);
free_io_failure(inode, failrec, 0);
return -EIO;
}
if (!state) {
spin_lock(&tree->lock);
state = find_first_extent_bit_state(tree, failrec->start,
EXTENT_LOCKED);
if (state && state->start != failrec->start)
state = NULL;
spin_unlock(&tree->lock);
}
/*
* there are two premises:
* a) deliver good data to the caller
* b) correct the bad sectors on disk
*/
if (failed_bio->bi_vcnt > 1) {
/*
* to fulfill b), we need to know the exact failing sectors, as
* we don't want to rewrite any more than the failed ones. thus,
* we need separate read requests for the failed bio
*
* if the following BUG_ON triggers, our validation request got
* merged. we need separate requests for our algorithm to work.
*/
BUG_ON(failrec->in_validation);
failrec->in_validation = 1;
failrec->this_mirror = failed_mirror;
read_mode = READ_SYNC | REQ_FAILFAST_DEV;
} else {
/*
* we're ready to fulfill a) and b) at the same time. get a good copy
* of the failed sector and if we succeed, we have set up
* everything for repair_io_failure to do the rest for us.
*/
if (failrec->in_validation) {
BUG_ON(failrec->this_mirror != failed_mirror);
failrec->in_validation = 0;
failrec->this_mirror = 0;
}
failrec->failed_mirror = failed_mirror;
failrec->this_mirror++;
if (failrec->this_mirror == failed_mirror)
failrec->this_mirror++;
read_mode = READ_SYNC;
}
if (!state || failrec->this_mirror > num_copies) {
pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
"next_mirror %d, failed_mirror %d\n", state,
num_copies, failrec->this_mirror, failed_mirror);
free_io_failure(inode, failrec, 0);
return -EIO;
}
bio = bio_alloc(GFP_NOFS, 1);
bio->bi_private = state;
bio->bi_end_io = failed_bio->bi_end_io;
bio->bi_sector = failrec->logical >> 9;
bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
bio->bi_size = 0;
bio_add_page(bio, page, failrec->len, start - page_offset(page));
pr_debug("bio_readpage_error: submitting new read[%#x] to "
"this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
failrec->this_mirror, num_copies, failrec->in_validation);
tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
failrec->bio_flags, 0);
return 0;
}
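The retry logic walks the mirrors in increasing order, skips the mirror that already failed, and gives up with -EIO once this_mirror exceeds num_copies. The sketch below compresses that into a single loop (in the kernel each step happens when the next read fails); num_copies and failed_mirror are picked arbitrarily:

#include <stdio.h>

int main(void)
{
	int num_copies = 3;	/* invented for illustration */
	int failed_mirror = 2;	/* the mirror the original read came from */
	int this_mirror = 0;

	while (1) {
		/* same stepping as bio_readpage_error() */
		this_mirror++;
		if (this_mirror == failed_mirror)
			this_mirror++;
		if (this_mirror > num_copies) {
			printf("no good copy left, give up with -EIO\n");
			break;
		}
		printf("retry read from mirror %d\n", this_mirror);
	}
	return 0;
}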
/* lots and lots of room for performance fixes in the end_bio funcs */
/*
@ -1697,6 +2256,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
struct extent_state *cached = NULL;
struct extent_state *state;
pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
"mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
(long int)bio->bi_bdev);
tree = &BTRFS_I(page->mapping->host)->io_tree;
start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@ -1727,12 +2289,26 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
state);
if (ret)
uptodate = 0;
else
clean_io_failure(start, page);
}
if (!uptodate && tree->ops &&
tree->ops->readpage_io_failed_hook) {
ret = tree->ops->readpage_io_failed_hook(bio, page,
start, end, NULL);
if (!uptodate) {
int failed_mirror;
failed_mirror = (int)(unsigned long)bio->bi_bdev;
/*
* The generic bio_readpage_error handles errors the
* following way: If possible, new read requests are
* created and submitted and will end up in
* end_bio_extent_readpage as well (if we're lucky, not
* in the !uptodate case). In that case it returns 0 and
* we just go on with the next page in our bio. If it
* can't handle the error it will return -EIO and we
* remain responsible for that page.
*/
ret = bio_readpage_error(bio, page, start, end,
failed_mirror, NULL);
if (ret == 0) {
error_handled:
uptodate =
test_bit(BIO_UPTODATE, &bio->bi_flags);
if (err)
@ -1740,6 +2316,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
uncache_state(&cached);
continue;
}
if (tree->ops && tree->ops->readpage_io_failed_hook) {
ret = tree->ops->readpage_io_failed_hook(
bio, page, start, end,
failed_mirror, state);
if (ret == 0)
goto error_handled;
}
}
if (uptodate) {
@ -1811,6 +2394,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
mirror_num, bio_flags, start);
else
submit_bio(rw, bio);
if (bio_flagged(bio, BIO_EOPNOTSUPP))
ret = -EOPNOTSUPP;
bio_put(bio);
@ -2076,16 +2660,16 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
}
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
get_extent_t *get_extent)
get_extent_t *get_extent, int mirror_num)
{
struct bio *bio = NULL;
unsigned long bio_flags = 0;
int ret;
ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
&bio_flags);
if (bio)
ret = submit_one_bio(READ, bio, 0, bio_flags);
ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
return ret;
}
@ -2136,6 +2720,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
int compressed;
int write_flags;
unsigned long nr_written = 0;
bool fill_delalloc = true;
if (wbc->sync_mode == WB_SYNC_ALL)
write_flags = WRITE_SYNC;
@ -2145,6 +2730,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
trace___extent_writepage(page, inode, wbc);
WARN_ON(!PageLocked(page));
ClearPageError(page);
pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
if (page->index > end_index ||
(page->index == end_index && !pg_offset)) {
@ -2166,10 +2754,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
set_page_extent_mapped(page);
if (!tree->ops || !tree->ops->fill_delalloc)
fill_delalloc = false;
delalloc_start = start;
delalloc_end = 0;
page_started = 0;
if (!epd->extent_locked) {
if (!epd->extent_locked && fill_delalloc) {
u64 delalloc_to_write = 0;
/*
* make sure the wbc mapping index is at least updated
@ -2421,10 +3012,16 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
* swizzled back from swapper_space to tmpfs file
* mapping
*/
if (tree->ops && tree->ops->write_cache_pages_lock_hook)
tree->ops->write_cache_pages_lock_hook(page);
else
lock_page(page);
if (tree->ops &&
tree->ops->write_cache_pages_lock_hook) {
tree->ops->write_cache_pages_lock_hook(page,
data, flush_fn);
} else {
if (!trylock_page(page)) {
flush_fn(data);
lock_page(page);
}
}
if (unlikely(page->mapping != mapping)) {
unlock_page(page);
@ -2790,6 +3387,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return -ENOMEM;
path->leave_spinning = 1;
start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
/*
* lookup the last file extent. We're not using i_size here
* because there might be preallocation past i_size
@ -2837,7 +3437,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
&cached_state, GFP_NOFS);
em = get_extent_skip_holes(inode, off, last_for_get_extent,
em = get_extent_skip_holes(inode, start, last_for_get_extent,
get_extent);
if (!em)
goto out;
@ -2926,7 +3526,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return ret;
}
static inline struct page *extent_buffer_page(struct extent_buffer *eb,
inline struct page *extent_buffer_page(struct extent_buffer *eb,
unsigned long i)
{
struct page *p;
@ -2951,7 +3551,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
return p;
}
static inline unsigned long num_extent_pages(u64 start, u64 len)
inline unsigned long num_extent_pages(u64 start, u64 len)
{
return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
(start >> PAGE_CACHE_SHIFT);
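num_extent_pages is just "index of the page past the end of the range, minus index of the first page". A worked standalone example, assuming 4K pages (PAGE_CACHE_SIZE 4096, PAGE_CACHE_SHIFT 12):

#include <stdio.h>

#define PAGE_SIZE_BYTES 4096ULL	/* assumption: 4K pages */
#define PAGE_SHIFT_BITS 12

static unsigned long num_extent_pages(unsigned long long start,
				      unsigned long long len)
{
	return (unsigned long)(((start + len + PAGE_SIZE_BYTES - 1) >> PAGE_SHIFT_BITS) -
			       (start >> PAGE_SHIFT_BITS));
}

int main(void)
{
	/* an extent buffer starting 1K into page 1 and spanning 8K */
	printf("%lu\n", num_extent_pages(5120, 8192));	/* pages 1..3 -> 3 */
	return 0;
}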
@ -3204,6 +3804,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
PAGECACHE_TAG_DIRTY);
}
spin_unlock_irq(&page->mapping->tree_lock);
ClearPageError(page);
unlock_page(page);
}
return 0;
@ -3349,8 +3950,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
}
int read_extent_buffer_pages(struct extent_io_tree *tree,
struct extent_buffer *eb,
u64 start, int wait,
struct extent_buffer *eb, u64 start, int wait,
get_extent_t *get_extent, int mirror_num)
{
unsigned long i;
@ -3386,7 +3986,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
num_pages = num_extent_pages(eb->start, eb->len);
for (i = start_i; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
if (!wait) {
if (wait == WAIT_NONE) {
if (!trylock_page(page))
goto unlock_exit;
} else {
@ -3430,7 +4030,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
if (bio)
submit_one_bio(READ, bio, mirror_num, bio_flags);
if (ret || !wait)
if (ret || wait != WAIT_COMPLETE)
return ret;
for (i = start_i; i < num_pages; i++) {


@ -17,6 +17,8 @@
#define EXTENT_NODATASUM (1 << 10)
#define EXTENT_DO_ACCOUNTING (1 << 11)
#define EXTENT_FIRST_DELALLOC (1 << 12)
#define EXTENT_NEED_WAIT (1 << 13)
#define EXTENT_DAMAGED (1 << 14)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
@ -32,6 +34,7 @@
#define EXTENT_BUFFER_BLOCKING 1
#define EXTENT_BUFFER_DIRTY 2
#define EXTENT_BUFFER_CORRUPT 3
#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
/* these are flags for extent_clear_unlock_delalloc */
#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@ -67,7 +70,7 @@ struct extent_io_ops {
unsigned long bio_flags);
int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
u64 start, u64 end,
u64 start, u64 end, int failed_mirror,
struct extent_state *state);
int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
u64 start, u64 end,
@ -85,7 +88,8 @@ struct extent_io_ops {
struct extent_state *other);
void (*split_extent_hook)(struct inode *inode,
struct extent_state *orig, u64 split);
int (*write_cache_pages_lock_hook)(struct page *page);
int (*write_cache_pages_lock_hook)(struct page *page, void *data,
void (*flush_fn)(void *));
};
struct extent_io_tree {
@ -185,7 +189,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
get_extent_t *get_extent);
get_extent_t *get_extent, int mirror_num);
int __init extent_io_init(void);
void extent_io_exit(void);
@ -214,6 +218,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int bits, int clear_bits, gfp_t mask);
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@ -248,9 +254,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
u64 start, unsigned long len);
void free_extent_buffer(struct extent_buffer *eb);
#define WAIT_NONE 0
#define WAIT_COMPLETE 1
#define WAIT_PAGE_LOCK 2
int read_extent_buffer_pages(struct extent_io_tree *tree,
struct extent_buffer *eb, u64 start, int wait,
get_extent_t *get_extent, int mirror_num);
unsigned long num_extent_pages(u64 start, u64 len);
struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);
static inline void extent_buffer_get(struct extent_buffer *eb)
{
@ -300,4 +311,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
struct bio *
btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
gfp_t gfp_flags);
struct btrfs_mapping_tree;
int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
u64 length, u64 logical, struct page *page,
int mirror_num);
#endif


@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
struct btrfs_csum_item *item;
struct extent_buffer *leaf;
u64 csum_offset = 0;
u16 csum_size =
btrfs_super_csum_size(&root->fs_info->super_copy);
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
int csums_in_item;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
u64 item_last_offset = 0;
u64 disk_bytenr;
u32 diff;
u16 csum_size =
btrfs_super_csum_size(&root->fs_info->super_copy);
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
int ret;
struct btrfs_path *path;
struct btrfs_csum_item *item = NULL;
@ -290,7 +288,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
int ret;
size_t size;
u64 csum_end;
u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
path = btrfs_alloc_path();
if (!path)
@ -492,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
u64 bytenr, u64 len)
{
struct extent_buffer *leaf;
u16 csum_size =
btrfs_super_csum_size(&root->fs_info->super_copy);
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
u64 csum_end;
u64 end_byte = bytenr + len;
u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
@ -549,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
u64 csum_end;
struct extent_buffer *leaf;
int ret;
u16 csum_size =
btrfs_super_csum_size(&root->fs_info->super_copy);
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
root = root->fs_info->csum_root;
@ -676,8 +672,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_sector_sum *sector_sum;
u32 nritems;
u32 ins_size;
u16 csum_size =
btrfs_super_csum_size(&root->fs_info->super_copy);
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
path = btrfs_alloc_path();
if (!path)


@ -1069,6 +1069,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
int i;
unsigned long index = pos >> PAGE_CACHE_SHIFT;
struct inode *inode = fdentry(file)->d_inode;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int err = 0;
int faili = 0;
u64 start_pos;
@ -1080,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
again:
for (i = 0; i < num_pages; i++) {
pages[i] = find_or_create_page(inode->i_mapping, index + i,
GFP_NOFS);
mask);
if (!pages[i]) {
faili = i - 1;
err = -ENOMEM;
@ -1386,7 +1387,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
goto out;
}
file_update_time(file);
err = btrfs_update_time(file);
if (err) {
mutex_unlock(&inode->i_mutex);
goto out;
}
BTRFS_I(inode)->sequence++;
start_pos = round_down(pos, root->sectorsize);
@ -1615,10 +1620,6 @@ static long btrfs_fallocate(struct file *file, int mode,
goto out;
}
ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
if (ret)
goto out;
locked_end = alloc_end - 1;
while (1) {
struct btrfs_ordered_extent *ordered;
@ -1664,11 +1665,27 @@ static long btrfs_fallocate(struct file *file, int mode,
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
/*
* Make sure we have enough space before we do the
* allocation.
*/
ret = btrfs_check_data_free_space(inode, last_byte -
cur_offset);
if (ret) {
free_extent_map(em);
break;
}
ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
last_byte - cur_offset,
1 << inode->i_blkbits,
offset + len,
&alloc_hint);
/* Let go of our reservation. */
btrfs_free_reserved_data_space(inode, last_byte -
cur_offset);
if (ret < 0) {
free_extent_map(em);
break;
@ -1694,8 +1711,6 @@ static long btrfs_fallocate(struct file *file, int mode,
}
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state, GFP_NOFS);
btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
out:
mutex_unlock(&inode->i_mutex);
return ret;

File diff suppressed because it is too large


@ -398,6 +398,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
struct btrfs_path *path;
struct inode *inode;
struct btrfs_block_rsv *rsv;
u64 num_bytes;
u64 alloc_hint = 0;
int ret;
int prealloc;
@ -421,11 +423,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
if (!path)
return -ENOMEM;
rsv = trans->block_rsv;
trans->block_rsv = &root->fs_info->trans_block_rsv;
num_bytes = trans->bytes_reserved;
/*
* 1 item for inode item insertion if needed
* 3 items for inode item update (in the worst case)
* 1 item for free space object
* 3 items for pre-allocation
*/
trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv,
trans->bytes_reserved);
if (ret)
goto out;
again:
inode = lookup_free_ino_inode(root, path);
if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
ret = PTR_ERR(inode);
goto out;
goto out_release;
}
if (IS_ERR(inode)) {
@ -434,7 +451,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
ret = create_free_ino_inode(root, trans, path);
if (ret)
goto out;
goto out_release;
goto again;
}
@ -465,21 +482,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
/* Just to make sure we have enough space */
prealloc += 8 * PAGE_CACHE_SIZE;
ret = btrfs_check_data_free_space(inode, prealloc);
ret = btrfs_delalloc_reserve_space(inode, prealloc);
if (ret)
goto out_put;
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
prealloc, prealloc, &alloc_hint);
if (ret)
if (ret) {
btrfs_delalloc_release_space(inode, prealloc);
goto out_put;
}
btrfs_free_reserved_data_space(inode, prealloc);
ret = btrfs_write_out_ino_cache(root, trans, path);
out_put:
iput(inode);
out_release:
btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
out:
if (ret == 0)
ret = btrfs_write_out_ino_cache(root, trans, path);
trans->block_rsv = rsv;
trans->bytes_reserved = num_bytes;
btrfs_free_path(path);
return ret;

File diff suppressed because it is too large


@ -51,6 +51,7 @@
#include "volumes.h"
#include "locking.h"
#include "inode-map.h"
#include "backref.h"
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@ -117,7 +118,7 @@ void btrfs_update_iflags(struct inode *inode)
/*
* Inherit flags from the parent inode.
*
* Unlike extN we don't have any flags we don't want to inherit currently.
* Currently only the compression flags and the cow flags are inherited.
*/
void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
{
@ -128,12 +129,17 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
flags = BTRFS_I(dir)->flags;
if (S_ISREG(inode->i_mode))
flags &= ~BTRFS_INODE_DIRSYNC;
else if (!S_ISDIR(inode->i_mode))
flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME);
if (flags & BTRFS_INODE_NOCOMPRESS) {
BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
} else if (flags & BTRFS_INODE_COMPRESS) {
BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
}
if (flags & BTRFS_INODE_NODATACOW)
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
BTRFS_I(inode)->flags = flags;
btrfs_update_iflags(inode);
}
@ -246,11 +252,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
trans = btrfs_join_transaction(root);
BUG_ON(IS_ERR(trans));
btrfs_update_iflags(inode);
inode->i_ctime = CURRENT_TIME;
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
btrfs_update_iflags(inode);
inode->i_ctime = CURRENT_TIME;
btrfs_end_transaction(trans, root);
mnt_drop_write(file->f_path.mnt);
@ -277,6 +283,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
struct fstrim_range range;
u64 minlen = ULLONG_MAX;
u64 num_devices = 0;
u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
int ret;
if (!capable(CAP_SYS_ADMIN))
@ -295,12 +302,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
}
}
rcu_read_unlock();
if (!num_devices)
return -EOPNOTSUPP;
if (copy_from_user(&range, arg, sizeof(range)))
return -EFAULT;
if (range.start > total_bytes)
return -EINVAL;
range.len = min(range.len, total_bytes - range.start);
range.minlen = max(range.minlen, minlen);
ret = btrfs_trim_fs(root, &range);
if (ret < 0)
@ -760,7 +770,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
int ret = 1;
/*
* make sure that once we start defragging and extent, we keep on
* make sure that once we start defragging an extent, we keep on
* defragging it
*/
if (start < *defrag_end)
@ -805,7 +815,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
* extent will force at least part of that big extent to be defragged.
*/
if (ret) {
*last_len += len;
*defrag_end = extent_map_end(em);
} else {
*last_len = 0;
@ -843,13 +852,16 @@ static int cluster_pages_for_defrag(struct inode *inode,
int i_done;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
if (isize == 0)
return 0;
file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
mutex_lock(&inode->i_mutex);
ret = btrfs_delalloc_reserve_space(inode,
num_pages << PAGE_CACHE_SHIFT);
mutex_unlock(&inode->i_mutex);
if (ret)
return ret;
again:
@ -860,7 +872,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
for (i = 0; i < num_pages; i++) {
struct page *page;
page = find_or_create_page(inode->i_mapping,
start_index + i, GFP_NOFS);
start_index + i, mask);
if (!page)
break;
@ -972,18 +984,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
struct btrfs_super_block *disk_super;
struct file_ra_state *ra = NULL;
unsigned long last_index;
u64 isize = i_size_read(inode);
u64 features;
u64 last_len = 0;
u64 skip = 0;
u64 defrag_end = 0;
u64 newer_off = range->start;
int newer_left = 0;
unsigned long i;
unsigned long ra_index = 0;
int ret;
int defrag_count = 0;
int compress_type = BTRFS_COMPRESS_ZLIB;
int extent_thresh = range->extent_thresh;
int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
int cluster = max_cluster;
u64 new_align = ~((u64)128 * 1024 - 1);
struct page **pages = NULL;
@ -997,7 +1011,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
compress_type = range->compress_type;
}
if (inode->i_size == 0)
if (isize == 0)
return 0;
/*
@ -1013,7 +1027,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
ra = &file->f_ra;
}
pages = kmalloc(sizeof(struct page *) * newer_cluster,
pages = kmalloc(sizeof(struct page *) * max_cluster,
GFP_NOFS);
if (!pages) {
ret = -ENOMEM;
@ -1022,10 +1036,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
/* find the last page to defrag */
if (range->start + range->len > range->start) {
last_index = min_t(u64, inode->i_size - 1,
last_index = min_t(u64, isize - 1,
range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
} else {
last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
}
if (newer_than) {
@ -1038,14 +1052,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
* the extents in the file evenly spaced
*/
i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
newer_left = newer_cluster;
} else
goto out_ra;
} else {
i = range->start >> PAGE_CACHE_SHIFT;
}
if (!max_to_defrag)
max_to_defrag = last_index - 1;
max_to_defrag = last_index;
/*
* make writeback starts from i, so the defrag range can be
@ -1079,18 +1092,31 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
i = max(i + 1, next);
continue;
}
if (!newer_than) {
cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
PAGE_CACHE_SHIFT) - i;
cluster = min(cluster, max_cluster);
} else {
cluster = max_cluster;
}
if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
BTRFS_I(inode)->force_compress = compress_type;
btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster);
if (i + cluster > ra_index) {
ra_index = max(i, ra_index);
btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
cluster);
ra_index += max_cluster;
}
ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster);
ret = cluster_pages_for_defrag(inode, pages, i, cluster);
if (ret < 0)
goto out_ra;
defrag_count += ret;
balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
i += ret;
if (newer_than) {
if (newer_off == (u64)-1)
@ -1105,12 +1131,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
if (!ret) {
range->start = newer_off;
i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
newer_left = newer_cluster;
} else {
break;
}
} else {
i++;
if (ret > 0) {
i += ret;
last_len += ret << PAGE_CACHE_SHIFT;
} else {
i++;
last_len = 0;
}
}
}
@ -1136,16 +1167,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
mutex_unlock(&inode->i_mutex);
}
disk_super = &root->fs_info->super_copy;
disk_super = root->fs_info->super_copy;
features = btrfs_super_incompat_flags(disk_super);
if (range->compress_type == BTRFS_COMPRESS_LZO) {
features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
btrfs_set_super_incompat_flags(disk_super, features);
}
if (!file)
kfree(ra);
return defrag_count;
ret = defrag_count;
out_ra:
if (!file)
@ -1189,12 +1218,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
*devstr = '\0';
devstr = vol_args->name;
devid = simple_strtoull(devstr, &end, 10);
printk(KERN_INFO "resizing devid %llu\n",
printk(KERN_INFO "btrfs: resizing devid %llu\n",
(unsigned long long)devid);
}
device = btrfs_find_device(root, devid, NULL, NULL);
if (!device) {
printk(KERN_INFO "resizer unable to find device %llu\n",
printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
(unsigned long long)devid);
ret = -EINVAL;
goto out_unlock;
@ -1240,7 +1269,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
do_div(new_size, root->sectorsize);
new_size *= root->sectorsize;
printk(KERN_INFO "new size for %s is %llu\n",
printk(KERN_INFO "btrfs: new size for %s is %llu\n",
device->name, (unsigned long long)new_size);
if (new_size > old_size) {
@ -1251,7 +1280,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
}
ret = btrfs_grow_device(trans, device, new_size);
btrfs_commit_transaction(trans, root);
} else {
} else if (new_size < old_size) {
ret = btrfs_shrink_device(device, new_size);
}
@ -2587,7 +2616,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
return PTR_ERR(trans);
}
dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
dir_id, "default", 7, 1);
if (IS_ERR_OR_NULL(di)) {
@ -2603,7 +2632,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
disk_super = &root->fs_info->super_copy;
disk_super = root->fs_info->super_copy;
features = btrfs_super_incompat_flags(disk_super);
if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
@ -2864,6 +2893,147 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
return ret;
}
static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
{
int ret = 0;
int i;
u64 rel_ptr;
int size;
struct btrfs_ioctl_ino_path_args *ipa = NULL;
struct inode_fs_paths *ipath = NULL;
struct btrfs_path *path;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
ipa = memdup_user(arg, sizeof(*ipa));
if (IS_ERR(ipa)) {
ret = PTR_ERR(ipa);
ipa = NULL;
goto out;
}
size = min_t(u32, ipa->size, 4096);
ipath = init_ipath(size, root, path);
if (IS_ERR(ipath)) {
ret = PTR_ERR(ipath);
ipath = NULL;
goto out;
}
ret = paths_from_inode(ipa->inum, ipath);
if (ret < 0)
goto out;
for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
rel_ptr = ipath->fspath->val[i] -
(u64)(unsigned long)ipath->fspath->val;
ipath->fspath->val[i] = rel_ptr;
}
ret = copy_to_user((void *)(unsigned long)ipa->fspath,
(void *)(unsigned long)ipath->fspath, size);
if (ret) {
ret = -EFAULT;
goto out;
}
out:
btrfs_free_path(path);
free_ipath(ipath);
kfree(ipa);
return ret;
}
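
A hedged userspace sketch of driving the new ioctl. The two structures are restated locally from the ioctl.h hunk further down so the example is self-contained; a real tool would take them from the kernel or btrfs-progs headers. The decode loop undoes the rel_ptr conversion above: after the call, each val[i] holds the byte offset of a NUL-terminated path relative to the start of val[]. CAP_SYS_ADMIN is required, the fd may be any file on the filesystem, and if the result does not fit the (kernel-capped) 4 KiB buffer, elem_missed and bytes_missing report what was dropped.

/* Hedged example: resolve an inode number to its path(s) via BTRFS_IOC_INO_PATHS. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>

#define BTRFS_IOCTL_MAGIC 0x94

struct btrfs_data_container {
	__u32 bytes_left, bytes_missing, elem_cnt, elem_missed;
	__u64 val[0];
};

struct btrfs_ioctl_ino_path_args {
	__u64 inum;	/* in */
	__u32 size;	/* in */
	__u64 reserved[4];
	__u64 fspath;	/* out: user pointer to a btrfs_data_container */
};

#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
				  struct btrfs_ioctl_ino_path_args)

int main(int argc, char **argv)
{
	struct btrfs_data_container *fspath;
	struct btrfs_ioctl_ino_path_args ipa;
	__u32 i;
	int fd;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <file on btrfs> <inode number>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);	/* any fd on the mounted filesystem */
	fspath = calloc(1, 4096);	/* the kernel caps the copy at 4096 bytes */
	if (fd < 0 || !fspath)
		return 1;

	memset(&ipa, 0, sizeof(ipa));
	ipa.inum = strtoull(argv[2], NULL, 0);
	ipa.size = 4096;
	ipa.fspath = (uintptr_t)fspath;

	if (ioctl(fd, BTRFS_IOC_INO_PATHS, &ipa) < 0) {
		perror("BTRFS_IOC_INO_PATHS");
		return 1;
	}
	/* each val[i] is now an offset relative to the start of val[] */
	for (i = 0; i < fspath->elem_cnt; i++)
		printf("%s\n", (char *)fspath->val + fspath->val[i]);

	free(fspath);
	close(fd);
	return 0;
}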
static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
{
struct btrfs_data_container *inodes = ctx;
const size_t c = 3 * sizeof(u64);
if (inodes->bytes_left >= c) {
inodes->bytes_left -= c;
inodes->val[inodes->elem_cnt] = inum;
inodes->val[inodes->elem_cnt + 1] = offset;
inodes->val[inodes->elem_cnt + 2] = root;
inodes->elem_cnt += 3;
} else {
inodes->bytes_missing += c - inodes->bytes_left;
inodes->bytes_left = 0;
inodes->elem_missed += 3;
}
return 0;
}
static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
void __user *arg)
{
int ret = 0;
int size;
u64 extent_offset;
struct btrfs_ioctl_logical_ino_args *loi;
struct btrfs_data_container *inodes = NULL;
struct btrfs_path *path = NULL;
struct btrfs_key key;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
loi = memdup_user(arg, sizeof(*loi));
if (IS_ERR(loi)) {
ret = PTR_ERR(loi);
loi = NULL;
goto out;
}
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
size = min_t(u32, loi->size, 4096);
inodes = init_data_container(size);
if (IS_ERR(inodes)) {
ret = PTR_ERR(inodes);
inodes = NULL;
goto out;
}
ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
ret = -ENOENT;
if (ret < 0)
goto out;
extent_offset = loi->logical - key.objectid;
ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
extent_offset, build_ino_list, inodes);
if (ret < 0)
goto out;
ret = copy_to_user((void *)(unsigned long)loi->inodes,
(void *)(unsigned long)inodes, size);
if (ret)
ret = -EFAULT;
out:
btrfs_free_path(path);
kfree(inodes);
kfree(loi);
return ret;
}
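
BTRFS_IOC_LOGICAL_INO is driven the same way as the INO_PATHS sketch above (magic 0x94, request number 36, struct btrfs_ioctl_logical_ino_args carrying a logical byte address instead of an inode number); only the decoding differs, because build_ino_list() above packs each reference as an (inode, offset, root) triple. A small helper, assuming the btrfs_data_container declaration and <stdio.h> from the previous sketch:

/* Hedged example: walk the container filled by BTRFS_IOC_LOGICAL_INO. */
static void print_logical_ino_result(const struct btrfs_data_container *inodes)
{
	__u32 i;

	for (i = 0; i < inodes->elem_cnt; i += 3)
		printf("inode %llu, extent offset %llu, in root %llu\n",
		       (unsigned long long)inodes->val[i],
		       (unsigned long long)inodes->val[i + 1],
		       (unsigned long long)inodes->val[i + 2]);

	if (inodes->elem_missed)	/* container was too small, see bytes_missing */
		fprintf(stderr, "%u references did not fit\n", inodes->elem_missed / 3);
}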
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@ -2921,6 +3091,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_tree_search(file, argp);
case BTRFS_IOC_INO_LOOKUP:
return btrfs_ioctl_ino_lookup(file, argp);
case BTRFS_IOC_INO_PATHS:
return btrfs_ioctl_ino_to_path(root, argp);
case BTRFS_IOC_LOGICAL_INO:
return btrfs_ioctl_logical_to_ino(root, argp);
case BTRFS_IOC_SPACE_INFO:
return btrfs_ioctl_space_info(root, argp);
case BTRFS_IOC_SYNC:

View file

@ -193,6 +193,30 @@ struct btrfs_ioctl_space_args {
struct btrfs_ioctl_space_info spaces[0];
};
struct btrfs_data_container {
__u32 bytes_left; /* out -- bytes not needed to deliver output */
__u32 bytes_missing; /* out -- additional bytes needed for result */
__u32 elem_cnt; /* out */
__u32 elem_missed; /* out */
__u64 val[0]; /* out */
};
struct btrfs_ioctl_ino_path_args {
__u64 inum; /* in */
__u32 size; /* in */
__u64 reserved[4];
/* struct btrfs_data_container *fspath; out */
__u64 fspath; /* out */
};
struct btrfs_ioctl_logical_ino_args {
__u64 logical; /* in */
__u32 size; /* in */
__u64 reserved[4];
/* struct btrfs_data_container *inodes; out */
__u64 inodes;
};
#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@ -248,4 +272,9 @@ struct btrfs_ioctl_space_args {
struct btrfs_ioctl_dev_info_args)
#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
struct btrfs_ioctl_fs_info_args)
#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
struct btrfs_ioctl_ino_path_args)
#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
struct btrfs_ioctl_ino_path_args)
#endif

View file

@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
{
int i;
u32 type;
u32 nr = btrfs_header_nritems(l);
u32 type, nr;
struct btrfs_item *item;
struct btrfs_root_item *ri;
struct btrfs_dir_item *di;
@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
struct btrfs_key key;
struct btrfs_key found_key;
if (!l)
return;
nr = btrfs_header_nritems(l);
printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
(unsigned long long)btrfs_header_bytenr(l), nr,
btrfs_leaf_free_space(root, l));

fs/btrfs/reada.c Normal file
View file

@ -0,0 +1,951 @@
/*
* Copyright (C) 2011 STRATO. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "transaction.h"
#undef DEBUG
/*
* This is the implementation for the generic read ahead framework.
*
* To trigger a readahead, btrfs_reada_add must be called. It will start
* a read ahead for the given range [start, end) on tree root. The returned
* handle can either be used to wait on the readahead to finish
* (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
*
* The read ahead works as follows:
* On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
* reada_start_machine will then search for extents to prefetch and trigger
* some reads. When a read finishes for a node, all contained node/leaf
* pointers that lie in the given range will also be enqueued. The reads will
* be triggered in sequential order, thus giving a big win over a naive
* enumeration. It will also make use of multi-device layouts. Each disk
* will have its own read pointer and all disks will be utilized in parallel.
* Also, no two disks will read both sides of a mirror simultaneously, as this
* would waste seeking capacity. Instead both disks will read different parts
* of the filesystem.
* Any number of readaheads can be started in parallel. The read order will be
* determined globally, i.e. 2 parallel readaheads will normally finish faster
* than the 2 started one after another.
*/
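
A minimal kernel-side sketch of a caller of this interface, shown out of context and closely modelled on how scrub_stripe() uses it later in this series: build a key range, kick off the readahead, then either wait for it or detach it.

/* Illustrative kernel-side usage, not part of this patch. */
static void prefetch_extent_tree(struct btrfs_root *extent_root)
{
	struct btrfs_key key_start, key_end;
	struct reada_control *rc;

	key_start.objectid = 0;			/* whole extent tree */
	key_start.type = BTRFS_EXTENT_ITEM_KEY;
	key_start.offset = 0;
	key_end.objectid = (u64)-1;
	key_end.type = (u8)-1;
	key_end.offset = (u64)-1;

	rc = btrfs_reada_add(extent_root, &key_start, &key_end);
	if (IS_ERR(rc))
		return;			/* currently only -ENOMEM */

	btrfs_reada_wait(rc);		/* block until everything is cached... */
	/* ...or call btrfs_reada_detach(rc) instead to let it run in the background */
}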
#define MAX_MIRRORS 2
#define MAX_IN_FLIGHT 6
struct reada_extctl {
struct list_head list;
struct reada_control *rc;
u64 generation;
};
struct reada_extent {
u64 logical;
struct btrfs_key top;
u32 blocksize;
int err;
struct list_head extctl;
struct kref refcnt;
spinlock_t lock;
struct reada_zone *zones[MAX_MIRRORS];
int nzones;
struct btrfs_device *scheduled_for;
};
struct reada_zone {
u64 start;
u64 end;
u64 elems;
struct list_head list;
spinlock_t lock;
int locked;
struct btrfs_device *device;
struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */
int ndevs;
struct kref refcnt;
};
struct reada_machine_work {
struct btrfs_work work;
struct btrfs_fs_info *fs_info;
};
static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
static void reada_control_release(struct kref *kref);
static void reada_zone_release(struct kref *kref);
static void reada_start_machine(struct btrfs_fs_info *fs_info);
static void __reada_start_machine(struct btrfs_fs_info *fs_info);
static int reada_add_block(struct reada_control *rc, u64 logical,
struct btrfs_key *top, int level, u64 generation);
/* recurses */
/* in case of err, eb might be NULL */
static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
u64 start, int err)
{
int level = 0;
int nritems;
int i;
u64 bytenr;
u64 generation;
struct reada_extent *re;
struct btrfs_fs_info *fs_info = root->fs_info;
struct list_head list;
unsigned long index = start >> PAGE_CACHE_SHIFT;
struct btrfs_device *for_dev;
if (eb)
level = btrfs_header_level(eb);
/* find extent */
spin_lock(&fs_info->reada_lock);
re = radix_tree_lookup(&fs_info->reada_tree, index);
if (re)
kref_get(&re->refcnt);
spin_unlock(&fs_info->reada_lock);
if (!re)
return -1;
spin_lock(&re->lock);
/*
* just take the full list from the extent. afterwards we
* don't need the lock anymore
*/
list_replace_init(&re->extctl, &list);
for_dev = re->scheduled_for;
re->scheduled_for = NULL;
spin_unlock(&re->lock);
if (err == 0) {
nritems = level ? btrfs_header_nritems(eb) : 0;
generation = btrfs_header_generation(eb);
/*
* FIXME: currently we just set nritems to 0 if this is a leaf,
* effectively ignoring the content. In a next step we could
* trigger more readahead depending on the content, e.g.
* fetch the checksums for the extents in the leaf.
*/
} else {
/*
* this is the error case, the extent buffer has not been
* read correctly. We won't access anything from it and
* just cleanup our data structures. Effectively this will
* cut the branch below this node from read ahead.
*/
nritems = 0;
generation = 0;
}
for (i = 0; i < nritems; i++) {
struct reada_extctl *rec;
u64 n_gen;
struct btrfs_key key;
struct btrfs_key next_key;
btrfs_node_key_to_cpu(eb, &key, i);
if (i + 1 < nritems)
btrfs_node_key_to_cpu(eb, &next_key, i + 1);
else
next_key = re->top;
bytenr = btrfs_node_blockptr(eb, i);
n_gen = btrfs_node_ptr_generation(eb, i);
list_for_each_entry(rec, &list, list) {
struct reada_control *rc = rec->rc;
/*
* if the generation doesn't match, just ignore this
* extctl. This will probably cut off a branch from
* prefetch. Alternatively one could start a new (sub-)
* prefetch for this branch, starting again from root.
* FIXME: move the generation check out of this loop
*/
#ifdef DEBUG
if (rec->generation != generation) {
printk(KERN_DEBUG "generation mismatch for "
"(%llu,%d,%llu) %llu != %llu\n",
key.objectid, key.type, key.offset,
rec->generation, generation);
}
#endif
if (rec->generation == generation &&
btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
reada_add_block(rc, bytenr, &next_key,
level - 1, n_gen);
}
}
/*
* free extctl records
*/
while (!list_empty(&list)) {
struct reada_control *rc;
struct reada_extctl *rec;
rec = list_first_entry(&list, struct reada_extctl, list);
list_del(&rec->list);
rc = rec->rc;
kfree(rec);
kref_get(&rc->refcnt);
if (atomic_dec_and_test(&rc->elems)) {
kref_put(&rc->refcnt, reada_control_release);
wake_up(&rc->wait);
}
kref_put(&rc->refcnt, reada_control_release);
reada_extent_put(fs_info, re); /* one ref for each entry */
}
reada_extent_put(fs_info, re); /* our ref */
if (for_dev)
atomic_dec(&for_dev->reada_in_flight);
return 0;
}
/*
* start is passed separately in case eb is NULL, which may be the case with
* failed I/O
*/
int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
u64 start, int err)
{
int ret;
ret = __readahead_hook(root, eb, start, err);
reada_start_machine(root->fs_info);
return ret;
}
static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
struct btrfs_device *dev, u64 logical,
struct btrfs_bio *bbio)
{
int ret;
int looped = 0;
struct reada_zone *zone;
struct btrfs_block_group_cache *cache = NULL;
u64 start;
u64 end;
int i;
again:
zone = NULL;
spin_lock(&fs_info->reada_lock);
ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
logical >> PAGE_CACHE_SHIFT, 1);
if (ret == 1)
kref_get(&zone->refcnt);
spin_unlock(&fs_info->reada_lock);
if (ret == 1) {
if (logical >= zone->start && logical < zone->end)
return zone;
spin_lock(&fs_info->reada_lock);
kref_put(&zone->refcnt, reada_zone_release);
spin_unlock(&fs_info->reada_lock);
}
if (looped)
return NULL;
cache = btrfs_lookup_block_group(fs_info, logical);
if (!cache)
return NULL;
start = cache->key.objectid;
end = start + cache->key.offset - 1;
btrfs_put_block_group(cache);
zone = kzalloc(sizeof(*zone), GFP_NOFS);
if (!zone)
return NULL;
zone->start = start;
zone->end = end;
INIT_LIST_HEAD(&zone->list);
spin_lock_init(&zone->lock);
zone->locked = 0;
kref_init(&zone->refcnt);
zone->elems = 0;
zone->device = dev; /* our device always sits at index 0 */
for (i = 0; i < bbio->num_stripes; ++i) {
/* bounds have already been checked */
zone->devs[i] = bbio->stripes[i].dev;
}
zone->ndevs = bbio->num_stripes;
spin_lock(&fs_info->reada_lock);
ret = radix_tree_insert(&dev->reada_zones,
(unsigned long)zone->end >> PAGE_CACHE_SHIFT,
zone);
spin_unlock(&fs_info->reada_lock);
if (ret) {
kfree(zone);
looped = 1;
goto again;
}
return zone;
}
static struct reada_extent *reada_find_extent(struct btrfs_root *root,
u64 logical,
struct btrfs_key *top, int level)
{
int ret;
int looped = 0;
struct reada_extent *re = NULL;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
struct btrfs_bio *bbio = NULL;
struct btrfs_device *dev;
u32 blocksize;
u64 length;
int nzones = 0;
int i;
unsigned long index = logical >> PAGE_CACHE_SHIFT;
again:
spin_lock(&fs_info->reada_lock);
re = radix_tree_lookup(&fs_info->reada_tree, index);
if (re)
kref_get(&re->refcnt);
spin_unlock(&fs_info->reada_lock);
if (re || looped)
return re;
re = kzalloc(sizeof(*re), GFP_NOFS);
if (!re)
return NULL;
blocksize = btrfs_level_size(root, level);
re->logical = logical;
re->blocksize = blocksize;
re->top = *top;
INIT_LIST_HEAD(&re->extctl);
spin_lock_init(&re->lock);
kref_init(&re->refcnt);
/*
* map block
*/
length = blocksize;
ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0);
if (ret || !bbio || length < blocksize)
goto error;
if (bbio->num_stripes > MAX_MIRRORS) {
printk(KERN_ERR "btrfs readahead: more than %d copies not "
"supported", MAX_MIRRORS);
goto error;
}
for (nzones = 0; nzones < bbio->num_stripes; ++nzones) {
struct reada_zone *zone;
dev = bbio->stripes[nzones].dev;
zone = reada_find_zone(fs_info, dev, logical, bbio);
if (!zone)
break;
re->zones[nzones] = zone;
spin_lock(&zone->lock);
if (!zone->elems)
kref_get(&zone->refcnt);
++zone->elems;
spin_unlock(&zone->lock);
spin_lock(&fs_info->reada_lock);
kref_put(&zone->refcnt, reada_zone_release);
spin_unlock(&fs_info->reada_lock);
}
re->nzones = nzones;
if (nzones == 0) {
/* not a single zone found, error and out */
goto error;
}
/* insert extent in reada_tree + all per-device trees, all or nothing */
spin_lock(&fs_info->reada_lock);
ret = radix_tree_insert(&fs_info->reada_tree, index, re);
if (ret) {
spin_unlock(&fs_info->reada_lock);
if (ret != -ENOMEM) {
/* someone inserted the extent in the meantime */
looped = 1;
}
goto error;
}
for (i = 0; i < nzones; ++i) {
dev = bbio->stripes[i].dev;
ret = radix_tree_insert(&dev->reada_extents, index, re);
if (ret) {
while (--i >= 0) {
dev = bbio->stripes[i].dev;
BUG_ON(dev == NULL);
radix_tree_delete(&dev->reada_extents, index);
}
BUG_ON(fs_info == NULL);
radix_tree_delete(&fs_info->reada_tree, index);
spin_unlock(&fs_info->reada_lock);
goto error;
}
}
spin_unlock(&fs_info->reada_lock);
kfree(bbio);
return re;
error:
while (nzones) {
struct reada_zone *zone;
--nzones;
zone = re->zones[nzones];
kref_get(&zone->refcnt);
spin_lock(&zone->lock);
--zone->elems;
if (zone->elems == 0) {
/*
* no fs_info->reada_lock needed, as this can't be
* the last ref
*/
kref_put(&zone->refcnt, reada_zone_release);
}
spin_unlock(&zone->lock);
spin_lock(&fs_info->reada_lock);
kref_put(&zone->refcnt, reada_zone_release);
spin_unlock(&fs_info->reada_lock);
}
kfree(bbio);
kfree(re);
if (looped)
goto again;
return NULL;
}
static void reada_kref_dummy(struct kref *kr)
{
}
static void reada_extent_put(struct btrfs_fs_info *fs_info,
struct reada_extent *re)
{
int i;
unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
spin_lock(&fs_info->reada_lock);
if (!kref_put(&re->refcnt, reada_kref_dummy)) {
spin_unlock(&fs_info->reada_lock);
return;
}
radix_tree_delete(&fs_info->reada_tree, index);
for (i = 0; i < re->nzones; ++i) {
struct reada_zone *zone = re->zones[i];
radix_tree_delete(&zone->device->reada_extents, index);
}
spin_unlock(&fs_info->reada_lock);
for (i = 0; i < re->nzones; ++i) {
struct reada_zone *zone = re->zones[i];
kref_get(&zone->refcnt);
spin_lock(&zone->lock);
--zone->elems;
if (zone->elems == 0) {
/* no fs_info->reada_lock needed, as this can't be
* the last ref */
kref_put(&zone->refcnt, reada_zone_release);
}
spin_unlock(&zone->lock);
spin_lock(&fs_info->reada_lock);
kref_put(&zone->refcnt, reada_zone_release);
spin_unlock(&fs_info->reada_lock);
}
if (re->scheduled_for)
atomic_dec(&re->scheduled_for->reada_in_flight);
kfree(re);
}
static void reada_zone_release(struct kref *kref)
{
struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
radix_tree_delete(&zone->device->reada_zones,
zone->end >> PAGE_CACHE_SHIFT);
kfree(zone);
}
static void reada_control_release(struct kref *kref)
{
struct reada_control *rc = container_of(kref, struct reada_control,
refcnt);
kfree(rc);
}
static int reada_add_block(struct reada_control *rc, u64 logical,
struct btrfs_key *top, int level, u64 generation)
{
struct btrfs_root *root = rc->root;
struct reada_extent *re;
struct reada_extctl *rec;
re = reada_find_extent(root, logical, top, level); /* takes one ref */
if (!re)
return -1;
rec = kzalloc(sizeof(*rec), GFP_NOFS);
if (!rec) {
reada_extent_put(root->fs_info, re);
return -1;
}
rec->rc = rc;
rec->generation = generation;
atomic_inc(&rc->elems);
spin_lock(&re->lock);
list_add_tail(&rec->list, &re->extctl);
spin_unlock(&re->lock);
/* leave the ref on the extent */
return 0;
}
/*
* called with fs_info->reada_lock held
*/
static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
{
int i;
unsigned long index = zone->end >> PAGE_CACHE_SHIFT;
for (i = 0; i < zone->ndevs; ++i) {
struct reada_zone *peer;
peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
if (peer && peer->device != zone->device)
peer->locked = lock;
}
}
/*
* called with fs_info->reada_lock held
*/
static int reada_pick_zone(struct btrfs_device *dev)
{
struct reada_zone *top_zone = NULL;
struct reada_zone *top_locked_zone = NULL;
u64 top_elems = 0;
u64 top_locked_elems = 0;
unsigned long index = 0;
int ret;
if (dev->reada_curr_zone) {
reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
dev->reada_curr_zone = NULL;
}
/* pick the zone with the most elements */
while (1) {
struct reada_zone *zone;
ret = radix_tree_gang_lookup(&dev->reada_zones,
(void **)&zone, index, 1);
if (ret == 0)
break;
index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
if (zone->locked) {
if (zone->elems > top_locked_elems) {
top_locked_elems = zone->elems;
top_locked_zone = zone;
}
} else {
if (zone->elems > top_elems) {
top_elems = zone->elems;
top_zone = zone;
}
}
}
if (top_zone)
dev->reada_curr_zone = top_zone;
else if (top_locked_zone)
dev->reada_curr_zone = top_locked_zone;
else
return 0;
dev->reada_next = dev->reada_curr_zone->start;
kref_get(&dev->reada_curr_zone->refcnt);
reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
return 1;
}
static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
struct btrfs_device *dev)
{
struct reada_extent *re = NULL;
int mirror_num = 0;
struct extent_buffer *eb = NULL;
u64 logical;
u32 blocksize;
int ret;
int i;
int need_kick = 0;
spin_lock(&fs_info->reada_lock);
if (dev->reada_curr_zone == NULL) {
ret = reada_pick_zone(dev);
if (!ret) {
spin_unlock(&fs_info->reada_lock);
return 0;
}
}
/*
* FIXME currently we issue the reads one extent at a time. If we have
* a contiguous block of extents, we could also coagulate them or use
* plugging to speed things up
*/
ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
dev->reada_next >> PAGE_CACHE_SHIFT, 1);
if (ret == 0 || re->logical >= dev->reada_curr_zone->end) {
ret = reada_pick_zone(dev);
if (!ret) {
spin_unlock(&fs_info->reada_lock);
return 0;
}
re = NULL;
ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
dev->reada_next >> PAGE_CACHE_SHIFT, 1);
}
if (ret == 0) {
spin_unlock(&fs_info->reada_lock);
return 0;
}
dev->reada_next = re->logical + re->blocksize;
kref_get(&re->refcnt);
spin_unlock(&fs_info->reada_lock);
/*
* find mirror num
*/
for (i = 0; i < re->nzones; ++i) {
if (re->zones[i]->device == dev) {
mirror_num = i + 1;
break;
}
}
logical = re->logical;
blocksize = re->blocksize;
spin_lock(&re->lock);
if (re->scheduled_for == NULL) {
re->scheduled_for = dev;
need_kick = 1;
}
spin_unlock(&re->lock);
reada_extent_put(fs_info, re);
if (!need_kick)
return 0;
atomic_inc(&dev->reada_in_flight);
ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize,
mirror_num, &eb);
if (ret)
__readahead_hook(fs_info->extent_root, NULL, logical, ret);
else if (eb)
__readahead_hook(fs_info->extent_root, eb, eb->start, ret);
if (eb)
free_extent_buffer(eb);
return 1;
}
static void reada_start_machine_worker(struct btrfs_work *work)
{
struct reada_machine_work *rmw;
struct btrfs_fs_info *fs_info;
rmw = container_of(work, struct reada_machine_work, work);
fs_info = rmw->fs_info;
kfree(rmw);
__reada_start_machine(fs_info);
}
static void __reada_start_machine(struct btrfs_fs_info *fs_info)
{
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
u64 enqueued;
u64 total = 0;
int i;
do {
enqueued = 0;
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (atomic_read(&device->reada_in_flight) <
MAX_IN_FLIGHT)
enqueued += reada_start_machine_dev(fs_info,
device);
}
total += enqueued;
} while (enqueued && total < 10000);
if (enqueued == 0)
return;
/*
* If everything is already in the cache, this is effectively single
* threaded. To a) not hold the caller for too long and b) to utilize
* more cores, we broke the loop above after 10000 iterations and now
* enqueue to workers to finish it. This will distribute the load to
* the cores.
*/
for (i = 0; i < 2; ++i)
reada_start_machine(fs_info);
}
static void reada_start_machine(struct btrfs_fs_info *fs_info)
{
struct reada_machine_work *rmw;
rmw = kzalloc(sizeof(*rmw), GFP_NOFS);
if (!rmw) {
/* FIXME we cannot handle this properly right now */
BUG();
}
rmw->work.func = reada_start_machine_worker;
rmw->fs_info = fs_info;
btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work);
}
#ifdef DEBUG
static void dump_devs(struct btrfs_fs_info *fs_info, int all)
{
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
unsigned long index;
int ret;
int i;
int j;
int cnt;
spin_lock(&fs_info->reada_lock);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid,
atomic_read(&device->reada_in_flight));
index = 0;
while (1) {
struct reada_zone *zone;
ret = radix_tree_gang_lookup(&device->reada_zones,
(void **)&zone, index, 1);
if (ret == 0)
break;
printk(KERN_DEBUG " zone %llu-%llu elems %llu locked "
"%d devs", zone->start, zone->end, zone->elems,
zone->locked);
for (j = 0; j < zone->ndevs; ++j) {
printk(KERN_CONT " %lld",
zone->devs[j]->devid);
}
if (device->reada_curr_zone == zone)
printk(KERN_CONT " curr off %llu",
device->reada_next - zone->start);
printk(KERN_CONT "\n");
index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
}
cnt = 0;
index = 0;
while (all) {
struct reada_extent *re = NULL;
ret = radix_tree_gang_lookup(&device->reada_extents,
(void **)&re, index, 1);
if (ret == 0)
break;
printk(KERN_DEBUG
" re: logical %llu size %u empty %d for %lld",
re->logical, re->blocksize,
list_empty(&re->extctl), re->scheduled_for ?
re->scheduled_for->devid : -1);
for (i = 0; i < re->nzones; ++i) {
printk(KERN_CONT " zone %llu-%llu devs",
re->zones[i]->start,
re->zones[i]->end);
for (j = 0; j < re->zones[i]->ndevs; ++j) {
printk(KERN_CONT " %lld",
re->zones[i]->devs[j]->devid);
}
}
printk(KERN_CONT "\n");
index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
if (++cnt > 15)
break;
}
}
index = 0;
cnt = 0;
while (all) {
struct reada_extent *re = NULL;
ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
index, 1);
if (ret == 0)
break;
if (!re->scheduled_for) {
index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
continue;
}
printk(KERN_DEBUG
"re: logical %llu size %u list empty %d for %lld",
re->logical, re->blocksize, list_empty(&re->extctl),
re->scheduled_for ? re->scheduled_for->devid : -1);
for (i = 0; i < re->nzones; ++i) {
printk(KERN_CONT " zone %llu-%llu devs",
re->zones[i]->start,
re->zones[i]->end);
for (j = 0; j < re->zones[i]->ndevs; ++j) {
printk(KERN_CONT " %lld",
re->zones[i]->devs[j]->devid);
}
}
printk(KERN_CONT "\n");
index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
}
spin_unlock(&fs_info->reada_lock);
}
#endif
/*
* interface
*/
struct reada_control *btrfs_reada_add(struct btrfs_root *root,
struct btrfs_key *key_start, struct btrfs_key *key_end)
{
struct reada_control *rc;
u64 start;
u64 generation;
int level;
struct extent_buffer *node;
static struct btrfs_key max_key = {
.objectid = (u64)-1,
.type = (u8)-1,
.offset = (u64)-1
};
rc = kzalloc(sizeof(*rc), GFP_NOFS);
if (!rc)
return ERR_PTR(-ENOMEM);
rc->root = root;
rc->key_start = *key_start;
rc->key_end = *key_end;
atomic_set(&rc->elems, 0);
init_waitqueue_head(&rc->wait);
kref_init(&rc->refcnt);
kref_get(&rc->refcnt); /* one ref for having elements */
node = btrfs_root_node(root);
start = node->start;
level = btrfs_header_level(node);
generation = btrfs_header_generation(node);
free_extent_buffer(node);
reada_add_block(rc, start, &max_key, level, generation);
reada_start_machine(root->fs_info);
return rc;
}
#ifdef DEBUG
int btrfs_reada_wait(void *handle)
{
struct reada_control *rc = handle;
while (atomic_read(&rc->elems)) {
wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
5 * HZ);
dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
}
dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
kref_put(&rc->refcnt, reada_control_release);
return 0;
}
#else
int btrfs_reada_wait(void *handle)
{
struct reada_control *rc = handle;
while (atomic_read(&rc->elems)) {
wait_event(rc->wait, atomic_read(&rc->elems) == 0);
}
kref_put(&rc->refcnt, reada_control_release);
return 0;
}
#endif
void btrfs_reada_detach(void *handle)
{
struct reada_control *rc = handle;
kref_put(&rc->refcnt, reada_control_release);
}

View file

@ -1174,6 +1174,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
list_add_tail(&new_edge->list[UPPER],
&new_node->lower);
}
} else {
list_add_tail(&new_node->lower, &cache->leaves);
}
rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
@ -2041,8 +2043,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
BUG_ON(IS_ERR(trans));
trans->block_rsv = rc->block_rsv;
ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
min_reserved, 0);
ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved);
if (ret) {
BUG_ON(ret != -EAGAIN);
ret = btrfs_commit_transaction(trans, root);
@ -2152,8 +2153,7 @@ int prepare_to_merge(struct reloc_control *rc, int err)
again:
if (!err) {
num_bytes = rc->merging_rsv_size;
ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
num_bytes);
ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
if (ret)
err = ret;
}
@ -2427,7 +2427,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
num_bytes = calcu_metadata_size(rc, node, 1) * 2;
trans->block_rsv = rc->block_rsv;
ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes);
ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
if (ret) {
if (ret == -EAGAIN)
rc->commit_transaction = 1;
@ -2922,6 +2922,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
unsigned long last_index;
struct page *page;
struct file_ra_state *ra;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int nr = 0;
int ret = 0;
@ -2946,7 +2947,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
while (index <= last_index) {
mutex_lock(&inode->i_mutex);
ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
mutex_unlock(&inode->i_mutex);
if (ret)
goto out;
@ -2956,7 +2959,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
ra, NULL, index,
last_index + 1 - index);
page = find_or_create_page(inode->i_mapping, index,
GFP_NOFS);
mask);
if (!page) {
btrfs_delalloc_release_metadata(inode,
PAGE_CACHE_SIZE);
@ -3323,8 +3326,11 @@ static int find_data_references(struct reloc_control *rc,
}
key.objectid = ref_objectid;
key.offset = ref_offset;
key.type = BTRFS_EXTENT_DATA_KEY;
if (ref_offset > ((u64)-1 << 32))
key.offset = 0;
else
key.offset = ref_offset;
path->search_commit_root = 1;
path->skip_locking = 1;
@ -3645,14 +3651,11 @@ int prepare_to_relocate(struct reloc_control *rc)
* btrfs_init_reloc_root will use them when there
* is no reservation in transaction handle.
*/
ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
rc->extent_root->nodesize * 256);
if (ret)
return ret;
rc->block_rsv->refill_used = 1;
btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
memset(&rc->cluster, 0, sizeof(rc->cluster));
rc->search_start = rc->block_group->key.objectid;
rc->extents_found = 0;
@ -3777,8 +3780,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
}
}
ret = btrfs_block_rsv_check(trans, rc->extent_root,
rc->block_rsv, 0, 5);
ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
if (ret < 0) {
if (ret != -EAGAIN) {
err = ret;

View file

@ -17,10 +17,14 @@
*/
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
/*
* This is only the first step towards a full-featured scrub. It reads all
@ -29,15 +33,12 @@
* any can be found.
*
* Future enhancements:
* - To enhance the performance, better read-ahead strategies for the
* extent-tree can be employed.
* - In case an unrepairable extent is encountered, track which files are
* affected and report them
* - In case of a read error on files with nodatasum, map the file and read
* the extent to trigger a writeback of the good copy
* - track and record media errors, throw out bad devices
* - add a mode to also read unallocated space
* - make the prefetch cancellable
*/
struct scrub_bio;
@ -63,7 +64,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix);
struct scrub_page {
u64 flags; /* extent flags */
u64 generation;
u64 mirror_num;
int mirror_num;
int have_csum;
u8 csum[BTRFS_CSUM_SIZE];
};
@ -87,6 +88,7 @@ struct scrub_dev {
int first_free;
int curr;
atomic_t in_flight;
atomic_t fixup_cnt;
spinlock_t list_lock;
wait_queue_head_t list_wait;
u16 csum_size;
@ -100,6 +102,27 @@ struct scrub_dev {
spinlock_t stat_lock;
};
struct scrub_fixup_nodatasum {
struct scrub_dev *sdev;
u64 logical;
struct btrfs_root *root;
struct btrfs_work work;
int mirror_num;
};
struct scrub_warning {
struct btrfs_path *path;
u64 extent_item_size;
char *scratch_buf;
char *msg_buf;
const char *errstr;
sector_t sector;
u64 logical;
struct btrfs_device *dev;
int msg_bufsize;
int scratch_bufsize;
};
static void scrub_free_csums(struct scrub_dev *sdev)
{
while (!list_empty(&sdev->csum_list)) {
@ -175,14 +198,15 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
if (i != SCRUB_BIOS_PER_DEV-1)
sdev->bios[i]->next_free = i + 1;
else
sdev->bios[i]->next_free = -1;
}
sdev->first_free = 0;
sdev->curr = -1;
atomic_set(&sdev->in_flight, 0);
atomic_set(&sdev->fixup_cnt, 0);
atomic_set(&sdev->cancel_req, 0);
sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
INIT_LIST_HEAD(&sdev->csum_list);
spin_lock_init(&sdev->list_lock);
@ -195,24 +219,366 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
return ERR_PTR(-ENOMEM);
}
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
{
u64 isize;
u32 nlink;
int ret;
int i;
struct extent_buffer *eb;
struct btrfs_inode_item *inode_item;
struct scrub_warning *swarn = ctx;
struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
struct inode_fs_paths *ipath = NULL;
struct btrfs_root *local_root;
struct btrfs_key root_key;
root_key.objectid = root;
root_key.type = BTRFS_ROOT_ITEM_KEY;
root_key.offset = (u64)-1;
local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
if (IS_ERR(local_root)) {
ret = PTR_ERR(local_root);
goto err;
}
ret = inode_item_info(inum, 0, local_root, swarn->path);
if (ret) {
btrfs_release_path(swarn->path);
goto err;
}
eb = swarn->path->nodes[0];
inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
struct btrfs_inode_item);
isize = btrfs_inode_size(eb, inode_item);
nlink = btrfs_inode_nlink(eb, inode_item);
btrfs_release_path(swarn->path);
ipath = init_ipath(4096, local_root, swarn->path);
if (IS_ERR(ipath)) {
ret = PTR_ERR(ipath);
ipath = NULL;
goto err;
}
ret = paths_from_inode(inum, ipath);
if (ret < 0)
goto err;
/*
* we deliberately ignore the fact that ipath might have been too small to
* hold all of the paths here
*/
for (i = 0; i < ipath->fspath->elem_cnt; ++i)
printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
"%s, sector %llu, root %llu, inode %llu, offset %llu, "
"length %llu, links %u (path: %s)\n", swarn->errstr,
swarn->logical, swarn->dev->name,
(unsigned long long)swarn->sector, root, inum, offset,
min(isize - offset, (u64)PAGE_SIZE), nlink,
(char *)(unsigned long)ipath->fspath->val[i]);
free_ipath(ipath);
return 0;
err:
printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
"resolving failed with ret=%d\n", swarn->errstr,
swarn->logical, swarn->dev->name,
(unsigned long long)swarn->sector, root, inum, offset, ret);
free_ipath(ipath);
return 0;
}
static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
int ix)
{
struct btrfs_device *dev = sbio->sdev->dev;
struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
struct btrfs_path *path;
struct btrfs_key found_key;
struct extent_buffer *eb;
struct btrfs_extent_item *ei;
struct scrub_warning swarn;
u32 item_size;
int ret;
u64 ref_root;
u8 ref_level;
unsigned long ptr = 0;
const int bufsize = 4096;
u64 extent_offset;
path = btrfs_alloc_path();
swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
swarn.logical = sbio->logical + ix * PAGE_SIZE;
swarn.errstr = errstr;
swarn.dev = dev;
swarn.msg_bufsize = bufsize;
swarn.scratch_bufsize = bufsize;
if (!path || !swarn.scratch_buf || !swarn.msg_buf)
goto out;
ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
if (ret < 0)
goto out;
extent_offset = swarn.logical - found_key.objectid;
swarn.extent_item_size = found_key.offset;
eb = path->nodes[0];
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
item_size = btrfs_item_size_nr(eb, path->slots[0]);
if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
do {
ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
&ref_root, &ref_level);
printk(KERN_WARNING "%s at logical %llu on dev %s, "
"sector %llu: metadata %s (level %d) in tree "
"%llu\n", errstr, swarn.logical, dev->name,
(unsigned long long)swarn.sector,
ref_level ? "node" : "leaf",
ret < 0 ? -1 : ref_level,
ret < 0 ? -1 : ref_root);
} while (ret != 1);
} else {
swarn.path = path;
iterate_extent_inodes(fs_info, path, found_key.objectid,
extent_offset,
scrub_print_warning_inode, &swarn);
}
out:
btrfs_free_path(path);
kfree(swarn.scratch_buf);
kfree(swarn.msg_buf);
}
static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
{
struct page *page = NULL;
unsigned long index;
struct scrub_fixup_nodatasum *fixup = ctx;
int ret;
int corrected = 0;
struct btrfs_key key;
struct inode *inode = NULL;
u64 end = offset + PAGE_SIZE - 1;
struct btrfs_root *local_root;
key.objectid = root;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
if (IS_ERR(local_root))
return PTR_ERR(local_root);
key.type = BTRFS_INODE_ITEM_KEY;
key.objectid = inum;
key.offset = 0;
inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
if (IS_ERR(inode))
return PTR_ERR(inode);
index = offset >> PAGE_CACHE_SHIFT;
page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
if (!page) {
ret = -ENOMEM;
goto out;
}
if (PageUptodate(page)) {
struct btrfs_mapping_tree *map_tree;
if (PageDirty(page)) {
/*
* we need to write the data to the defect sector. the
* data that was in that sector is not in memory,
* because the page was modified. we must not write the
* modified page to that sector.
*
* TODO: what could be done here: wait for the delalloc
* runner to write out that page (might involve
* COW) and see whether the sector is still
* referenced afterwards.
*
* For the meantime, we'll treat this error as
* uncorrectable, although there is a chance that a
* later scrub will find the bad sector again and that
* there's no dirty page in memory, then.
*/
ret = -EIO;
goto out;
}
map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
fixup->logical, page,
fixup->mirror_num);
unlock_page(page);
corrected = !ret;
} else {
/*
* we need to get good data first. the general readpage path
* will call repair_io_failure for us, we just have to make
* sure we read the bad mirror.
*/
ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
EXTENT_DAMAGED, GFP_NOFS);
if (ret) {
/* set_extent_bits should give proper error */
WARN_ON(ret > 0);
if (ret > 0)
ret = -EFAULT;
goto out;
}
ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
btrfs_get_extent,
fixup->mirror_num);
wait_on_page_locked(page);
corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
end, EXTENT_DAMAGED, 0, NULL);
if (!corrected)
clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
EXTENT_DAMAGED, GFP_NOFS);
}
out:
if (page)
put_page(page);
if (inode)
iput(inode);
if (ret < 0)
return ret;
if (ret == 0 && corrected) {
/*
* we only need to call readpage for one of the inodes belonging
* to this extent. so make iterate_extent_inodes stop
*/
return 1;
}
return -EIO;
}
static void scrub_fixup_nodatasum(struct btrfs_work *work)
{
int ret;
struct scrub_fixup_nodatasum *fixup;
struct scrub_dev *sdev;
struct btrfs_trans_handle *trans = NULL;
struct btrfs_fs_info *fs_info;
struct btrfs_path *path;
int uncorrectable = 0;
fixup = container_of(work, struct scrub_fixup_nodatasum, work);
sdev = fixup->sdev;
fs_info = fixup->root->fs_info;
path = btrfs_alloc_path();
if (!path) {
spin_lock(&sdev->stat_lock);
++sdev->stat.malloc_errors;
spin_unlock(&sdev->stat_lock);
uncorrectable = 1;
goto out;
}
trans = btrfs_join_transaction(fixup->root);
if (IS_ERR(trans)) {
uncorrectable = 1;
goto out;
}
/*
* the idea is to trigger a regular read through the standard path. we
* read a page from the (failed) logical address by specifying the
* corresponding copynum of the failed sector. thus, that readpage is
* expected to fail.
* that is the point where on-the-fly error correction will kick in
* (once it's finished) and rewrite the failed sector if a good copy
* can be found.
*/
ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
path, scrub_fixup_readpage,
fixup);
if (ret < 0) {
uncorrectable = 1;
goto out;
}
WARN_ON(ret != 1);
spin_lock(&sdev->stat_lock);
++sdev->stat.corrected_errors;
spin_unlock(&sdev->stat_lock);
out:
if (trans && !IS_ERR(trans))
btrfs_end_transaction(trans, fixup->root);
if (uncorrectable) {
spin_lock(&sdev->stat_lock);
++sdev->stat.uncorrectable_errors;
spin_unlock(&sdev->stat_lock);
printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
"(nodatasum) error at logical %llu\n",
fixup->logical);
}
btrfs_free_path(path);
kfree(fixup);
/* see caller why we're pretending to be paused in the scrub counters */
mutex_lock(&fs_info->scrub_lock);
atomic_dec(&fs_info->scrubs_running);
atomic_dec(&fs_info->scrubs_paused);
mutex_unlock(&fs_info->scrub_lock);
atomic_dec(&sdev->fixup_cnt);
wake_up(&fs_info->scrub_pause_wait);
wake_up(&sdev->list_wait);
}
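
build_ino_list() in the ioctl code above and scrub_fixup_readpage() here implement the same callback contract for iterate_extent_inodes()/iterate_inodes_from_logical(): the callback runs once per (inode, offset, root) reference, returns 0 to keep iterating, and a non-zero value ends the walk (scrub_fixup_readpage() returns 1 once a page has been repaired). A hypothetical callback that merely counts references, for illustration:

/* Illustrative only: count how many (inode, offset, root) references exist. */
static int count_inode_refs(u64 inum, u64 offset, u64 root, void *ctx)
{
	u64 *count = ctx;

	(*count)++;
	return 0;	/* 0 = keep iterating; 1 would stop early, negative reports an error */
}

It would be passed in the same way the fixup worker passes scrub_fixup_readpage() above, e.g. iterate_inodes_from_logical(logical, fs_info, path, count_inode_refs, &count).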
/*
* scrub_recheck_error gets called when either verification of the page
* failed or the bio failed to read, e.g. with EIO. In the latter case,
* recheck_error gets called for every page in the bio, even though only
* one may be bad
*/
static void scrub_recheck_error(struct scrub_bio *sbio, int ix)
static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
{
struct scrub_dev *sdev = sbio->sdev;
u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
if (sbio->err) {
if (scrub_fixup_io(READ, sbio->sdev->dev->bdev,
(sbio->physical + ix * PAGE_SIZE) >> 9,
if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
sbio->bio->bi_io_vec[ix].bv_page) == 0) {
if (scrub_fixup_check(sbio, ix) == 0)
return;
return 0;
}
if (__ratelimit(&_rs))
scrub_print_warning("i/o error", sbio, ix);
} else {
if (__ratelimit(&_rs))
scrub_print_warning("checksum error", sbio, ix);
}
spin_lock(&sdev->stat_lock);
++sdev->stat.read_errors;
spin_unlock(&sdev->stat_lock);
scrub_fixup(sbio, ix);
return 1;
}
static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
@ -250,7 +616,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
struct scrub_dev *sdev = sbio->sdev;
struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
struct btrfs_multi_bio *multi = NULL;
struct btrfs_bio *bbio = NULL;
struct scrub_fixup_nodatasum *fixup;
u64 logical = sbio->logical + ix * PAGE_SIZE;
u64 length;
int i;
@ -259,38 +626,57 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
(sbio->spag[ix].have_csum == 0)) {
fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
if (!fixup)
goto uncorrectable;
fixup->sdev = sdev;
fixup->logical = logical;
fixup->root = fs_info->extent_root;
fixup->mirror_num = sbio->spag[ix].mirror_num;
/*
* nodatasum, don't try to fix anything
* FIXME: we can do better, open the inode and trigger a
* writeback
* increment scrubs_running to prevent cancel requests from
* completing as long as a fixup worker is running. we must also
* increment scrubs_paused to prevent deadlocking on pause
* requests used for transactions commits (as the worker uses a
* transaction context). it is safe to regard the fixup worker
* as paused for all matters practical. effectively, we only
* avoid cancellation requests from completing.
*/
goto uncorrectable;
mutex_lock(&fs_info->scrub_lock);
atomic_inc(&fs_info->scrubs_running);
atomic_inc(&fs_info->scrubs_paused);
mutex_unlock(&fs_info->scrub_lock);
atomic_inc(&sdev->fixup_cnt);
fixup->work.func = scrub_fixup_nodatasum;
btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
return;
}
length = PAGE_SIZE;
ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
&multi, 0);
if (ret || !multi || length < PAGE_SIZE) {
&bbio, 0);
if (ret || !bbio || length < PAGE_SIZE) {
printk(KERN_ERR
"scrub_fixup: btrfs_map_block failed us for %llu\n",
(unsigned long long)logical);
WARN_ON(1);
kfree(bbio);
return;
}
if (multi->num_stripes == 1)
if (bbio->num_stripes == 1)
/* there aren't any replicas */
goto uncorrectable;
/*
* first find a good copy
*/
for (i = 0; i < multi->num_stripes; ++i) {
if (i == sbio->spag[ix].mirror_num)
for (i = 0; i < bbio->num_stripes; ++i) {
if (i + 1 == sbio->spag[ix].mirror_num)
continue;
if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev,
multi->stripes[i].physical >> 9,
if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
bbio->stripes[i].physical >> 9,
sbio->bio->bi_io_vec[ix].bv_page)) {
/* I/O-error, this is not a good copy */
continue;
@ -299,7 +685,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
if (scrub_fixup_check(sbio, ix) == 0)
break;
}
if (i == multi->num_stripes)
if (i == bbio->num_stripes)
goto uncorrectable;
if (!sdev->readonly) {
@ -314,25 +700,23 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
}
}
kfree(multi);
kfree(bbio);
spin_lock(&sdev->stat_lock);
++sdev->stat.corrected_errors;
spin_unlock(&sdev->stat_lock);
if (printk_ratelimit())
printk(KERN_ERR "btrfs: fixed up at %llu\n",
(unsigned long long)logical);
printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
(unsigned long long)logical);
return;
uncorrectable:
kfree(multi);
kfree(bbio);
spin_lock(&sdev->stat_lock);
++sdev->stat.uncorrectable_errors;
spin_unlock(&sdev->stat_lock);
if (printk_ratelimit())
printk(KERN_ERR "btrfs: unable to fixup at %llu\n",
(unsigned long long)logical);
printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
"logical %llu\n", (unsigned long long)logical);
}
static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
@ -382,8 +766,14 @@ static void scrub_checksum(struct btrfs_work *work)
int ret;
if (sbio->err) {
ret = 0;
for (i = 0; i < sbio->count; ++i)
scrub_recheck_error(sbio, i);
ret |= scrub_recheck_error(sbio, i);
if (!ret) {
spin_lock(&sdev->stat_lock);
++sdev->stat.unverified_errors;
spin_unlock(&sdev->stat_lock);
}
sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
@ -396,10 +786,6 @@ static void scrub_checksum(struct btrfs_work *work)
bi->bv_offset = 0;
bi->bv_len = PAGE_SIZE;
}
spin_lock(&sdev->stat_lock);
++sdev->stat.read_errors;
spin_unlock(&sdev->stat_lock);
goto out;
}
for (i = 0; i < sbio->count; ++i) {
@ -420,8 +806,14 @@ static void scrub_checksum(struct btrfs_work *work)
WARN_ON(1);
}
kunmap_atomic(buffer, KM_USER0);
if (ret)
scrub_recheck_error(sbio, i);
if (ret) {
ret = scrub_recheck_error(sbio, i);
if (!ret) {
spin_lock(&sdev->stat_lock);
++sdev->stat.unverified_errors;
spin_unlock(&sdev->stat_lock);
}
}
}
out:
@ -557,57 +949,27 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
static int scrub_submit(struct scrub_dev *sdev)
{
struct scrub_bio *sbio;
struct bio *bio;
int i;
if (sdev->curr == -1)
return 0;
sbio = sdev->bios[sdev->curr];
bio = bio_alloc(GFP_NOFS, sbio->count);
if (!bio)
goto nomem;
bio->bi_private = sbio;
bio->bi_end_io = scrub_bio_end_io;
bio->bi_bdev = sdev->dev->bdev;
bio->bi_sector = sbio->physical >> 9;
for (i = 0; i < sbio->count; ++i) {
struct page *page;
int ret;
page = alloc_page(GFP_NOFS);
if (!page)
goto nomem;
ret = bio_add_page(bio, page, PAGE_SIZE, 0);
if (!ret) {
__free_page(page);
goto nomem;
}
}
sbio->err = 0;
sdev->curr = -1;
atomic_inc(&sdev->in_flight);
submit_bio(READ, bio);
submit_bio(READ, sbio->bio);
return 0;
nomem:
scrub_free_bio(bio);
return -ENOMEM;
}
static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
u64 physical, u64 flags, u64 gen, u64 mirror_num,
u64 physical, u64 flags, u64 gen, int mirror_num,
u8 *csum, int force)
{
struct scrub_bio *sbio;
struct page *page;
int ret;
again:
/*
@ -628,12 +990,22 @@ static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
}
sbio = sdev->bios[sdev->curr];
if (sbio->count == 0) {
struct bio *bio;
sbio->physical = physical;
sbio->logical = logical;
bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
if (!bio)
return -ENOMEM;
bio->bi_private = sbio;
bio->bi_end_io = scrub_bio_end_io;
bio->bi_bdev = sdev->dev->bdev;
bio->bi_sector = sbio->physical >> 9;
sbio->err = 0;
sbio->bio = bio;
} else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
sbio->logical + sbio->count * PAGE_SIZE != logical) {
int ret;
ret = scrub_submit(sdev);
if (ret)
return ret;
@ -643,6 +1015,20 @@ static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
sbio->spag[sbio->count].generation = gen;
sbio->spag[sbio->count].have_csum = 0;
sbio->spag[sbio->count].mirror_num = mirror_num;
page = alloc_page(GFP_NOFS);
if (!page)
return -ENOMEM;
ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
if (!ret) {
__free_page(page);
ret = scrub_submit(sdev);
if (ret)
return ret;
goto again;
}
if (csum) {
sbio->spag[sbio->count].have_csum = 1;
memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
@ -701,7 +1087,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
/* scrub extent tries to collect up to 64 kB for each bio */
static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
u64 physical, u64 flags, u64 gen, u64 mirror_num)
u64 physical, u64 flags, u64 gen, int mirror_num)
{
int ret;
u8 csum[BTRFS_CSUM_SIZE];
@ -741,13 +1127,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
int slot;
int i;
u64 nstripes;
int start_stripe;
struct extent_buffer *l;
struct btrfs_key key;
u64 physical;
u64 logical;
u64 generation;
u64 mirror_num;
int mirror_num;
struct reada_control *reada1;
struct reada_control *reada2;
struct btrfs_key key_start;
struct btrfs_key key_end;
u64 increment = map->stripe_len;
u64 offset;
@ -758,102 +1147,88 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
offset = map->stripe_len * num;
increment = map->stripe_len * map->num_stripes;
mirror_num = 0;
mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
int factor = map->num_stripes / map->sub_stripes;
offset = map->stripe_len * (num / map->sub_stripes);
increment = map->stripe_len * factor;
mirror_num = num % map->sub_stripes;
mirror_num = num % map->sub_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
increment = map->stripe_len;
mirror_num = num % map->num_stripes;
mirror_num = num % map->num_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
increment = map->stripe_len;
mirror_num = num % map->num_stripes;
mirror_num = num % map->num_stripes + 1;
} else {
increment = map->stripe_len;
mirror_num = 0;
mirror_num = 1;
}
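
Every mirror_num assignment just above gains a +1 and the default case now uses 1 instead of 0, so the mirror numbers handed down the I/O path are 1-based and 0 is left to mean "no specific mirror". A tiny standalone sketch of the resulting mapping, assuming the RAID1/RAID10/DUP cases shown in this hunk:

#include <stdio.h>

/* mirrors the 1-based mirror_num selection in scrub_stripe() above */
static int scrub_mirror_num(int stripe_index, int num_stripes, int sub_stripes,
                            int is_raid10)
{
        if (is_raid10)
                return stripe_index % sub_stripes + 1;  /* RAID10 */
        return stripe_index % num_stripes + 1;          /* RAID1 / DUP */
}

int main(void)
{
        /* two-device RAID1: stripes 0 and 1 scrub mirrors 1 and 2 */
        for (int num = 0; num < 2; num++)
                printf("stripe %d -> mirror_num %d\n", num,
                       scrub_mirror_num(num, 2, 0, 0));
        return 0;
}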
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path->reada = 2;
path->search_commit_root = 1;
path->skip_locking = 1;
/*
* find all extents for each stripe and just read them to get
* them into the page cache
* FIXME: we can do better. build a more intelligent prefetching
* trigger the readahead for the extent tree and csum tree and wait for
* completion. During readahead, the scrub is officially paused
* to not hold off transaction commits
*/
logical = base + offset;
physical = map->stripes[num].physical;
ret = 0;
for (i = 0; i < nstripes; ++i) {
key.objectid = logical;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = (u64)0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out_noplug;
wait_event(sdev->list_wait,
atomic_read(&sdev->in_flight) == 0);
atomic_inc(&fs_info->scrubs_paused);
wake_up(&fs_info->scrub_pause_wait);
/*
* we might miss half an extent here, but that doesn't matter,
* as it's only the prefetch
*/
while (1) {
l = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(l)) {
ret = btrfs_next_leaf(root, path);
if (ret == 0)
continue;
if (ret < 0)
goto out_noplug;
/* FIXME it might be better to start readahead at commit root */
key_start.objectid = logical;
key_start.type = BTRFS_EXTENT_ITEM_KEY;
key_start.offset = (u64)0;
key_end.objectid = base + offset + nstripes * increment;
key_end.type = BTRFS_EXTENT_ITEM_KEY;
key_end.offset = (u64)0;
reada1 = btrfs_reada_add(root, &key_start, &key_end);
break;
}
btrfs_item_key_to_cpu(l, &key, slot);
key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key_start.type = BTRFS_EXTENT_CSUM_KEY;
key_start.offset = logical;
key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key_end.type = BTRFS_EXTENT_CSUM_KEY;
key_end.offset = base + offset + nstripes * increment;
reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
if (key.objectid >= logical + map->stripe_len)
break;
if (!IS_ERR(reada1))
btrfs_reada_wait(reada1);
if (!IS_ERR(reada2))
btrfs_reada_wait(reada2);
path->slots[0]++;
}
btrfs_release_path(path);
logical += increment;
physical += map->stripe_len;
cond_resched();
mutex_lock(&fs_info->scrub_lock);
while (atomic_read(&fs_info->scrub_pause_req)) {
mutex_unlock(&fs_info->scrub_lock);
wait_event(fs_info->scrub_pause_wait,
atomic_read(&fs_info->scrub_pause_req) == 0);
mutex_lock(&fs_info->scrub_lock);
}
atomic_dec(&fs_info->scrubs_paused);
mutex_unlock(&fs_info->scrub_lock);
wake_up(&fs_info->scrub_pause_wait);
/*
* collect all data csums for the stripe to avoid seeking during
* the scrub. This might currently (crc32) end up to be about 1MB
*/
start_stripe = 0;
blk_start_plug(&plug);
again:
logical = base + offset + start_stripe * increment;
for (i = start_stripe; i < nstripes; ++i) {
ret = btrfs_lookup_csums_range(csum_root, logical,
logical + map->stripe_len - 1,
&sdev->csum_list, 1);
if (ret)
goto out;
logical += increment;
cond_resched();
}
/*
* now find all extents for each stripe and scrub them
*/
logical = base + offset + start_stripe * increment;
physical = map->stripes[num].physical + start_stripe * map->stripe_len;
logical = base + offset;
physical = map->stripes[num].physical;
ret = 0;
for (i = start_stripe; i < nstripes; ++i) {
for (i = 0; i < nstripes; ++i) {
/*
* canceled?
*/
@ -882,11 +1257,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
atomic_dec(&fs_info->scrubs_paused);
mutex_unlock(&fs_info->scrub_lock);
wake_up(&fs_info->scrub_pause_wait);
scrub_free_csums(sdev);
start_stripe = i;
goto again;
}
ret = btrfs_lookup_csums_range(csum_root, logical,
logical + map->stripe_len - 1,
&sdev->csum_list, 1);
if (ret)
goto out;
key.objectid = logical;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = (u64)0;
@ -982,7 +1360,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
out:
blk_finish_plug(&plug);
out_noplug:
btrfs_free_path(path);
return ret < 0 ? ret : 0;
}
@ -1158,18 +1535,22 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
mutex_lock(&fs_info->scrub_lock);
if (fs_info->scrub_workers_refcnt == 0) {
btrfs_init_workers(&fs_info->scrub_workers, "scrub",
fs_info->thread_pool_size, &fs_info->generic_worker);
fs_info->scrub_workers.idle_thresh = 4;
btrfs_start_workers(&fs_info->scrub_workers, 1);
ret = btrfs_start_workers(&fs_info->scrub_workers);
if (ret)
goto out;
}
++fs_info->scrub_workers_refcnt;
out:
mutex_unlock(&fs_info->scrub_lock);
return 0;
return ret;
}
static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
@ -1253,10 +1634,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
ret = scrub_enumerate_chunks(sdev, start, end);
wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
atomic_dec(&fs_info->scrubs_running);
wake_up(&fs_info->scrub_pause_wait);
wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
if (progress)
memcpy(progress, &sdev->stat, sizeof(*progress));

View file

@ -40,6 +40,8 @@
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/cleancache.h>
#include <linux/mnt_namespace.h>
#include <linux/ratelimit.h>
#include "compat.h"
#include "delayed-inode.h"
#include "ctree.h"
@ -58,6 +60,7 @@
#include <trace/events/btrfs.h>
static const struct super_operations btrfs_super_ops;
static struct file_system_type btrfs_fs_type;
static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
char nbuf[16])
@ -162,7 +165,7 @@ enum {
Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
Opt_inode_cache, Opt_err,
Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
};
static match_table_t tokens = {
@ -195,6 +198,8 @@ static match_table_t tokens = {
{Opt_subvolrootid, "subvolrootid=%d"},
{Opt_defrag, "autodefrag"},
{Opt_inode_cache, "inode_cache"},
{Opt_no_space_cache, "nospace_cache"},
{Opt_recovery, "recovery"},
{Opt_err, NULL},
};
@ -206,14 +211,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
{
struct btrfs_fs_info *info = root->fs_info;
substring_t args[MAX_OPT_ARGS];
char *p, *num, *orig;
char *p, *num, *orig = NULL;
u64 cache_gen;
int intarg;
int ret = 0;
char *compress_type;
bool compress_force = false;
cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
if (cache_gen)
btrfs_set_opt(info->mount_opt, SPACE_CACHE);
if (!options)
return 0;
goto out;
/*
* strsep changes the string, duplicate it because parse_options
@ -360,9 +370,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
btrfs_set_opt(info->mount_opt, DISCARD);
break;
case Opt_space_cache:
printk(KERN_INFO "btrfs: enabling disk space caching\n");
btrfs_set_opt(info->mount_opt, SPACE_CACHE);
break;
case Opt_no_space_cache:
printk(KERN_INFO "btrfs: disabling disk space caching\n");
btrfs_clear_opt(info->mount_opt, SPACE_CACHE);
break;
case Opt_inode_cache:
printk(KERN_INFO "btrfs: enabling inode map caching\n");
btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
@ -381,6 +394,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
printk(KERN_INFO "btrfs: enabling auto defrag");
btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
break;
case Opt_recovery:
printk(KERN_INFO "btrfs: enabling auto recovery");
btrfs_set_opt(info->mount_opt, RECOVERY);
break;
case Opt_err:
printk(KERN_INFO "btrfs: unrecognized mount option "
"'%s'\n", p);
@ -391,6 +408,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
}
}
out:
if (!ret && btrfs_test_opt(root, SPACE_CACHE))
printk(KERN_INFO "btrfs: disk space caching is enabled\n");
kfree(orig);
return ret;
}
@ -406,12 +425,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
{
substring_t args[MAX_OPT_ARGS];
char *opts, *orig, *p;
char *device_name, *opts, *orig, *p;
int error = 0;
int intarg;
if (!options)
goto out;
return 0;
/*
* strsep changes the string, duplicate it because parse_options
@ -430,6 +449,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
token = match_token(p, tokens, args);
switch (token) {
case Opt_subvol:
kfree(*subvol_name);
*subvol_name = match_strdup(&args[0]);
break;
case Opt_subvolid:
@ -457,29 +477,24 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
}
break;
case Opt_device:
error = btrfs_scan_one_device(match_strdup(&args[0]),
device_name = match_strdup(&args[0]);
if (!device_name) {
error = -ENOMEM;
goto out;
}
error = btrfs_scan_one_device(device_name,
flags, holder, fs_devices);
kfree(device_name);
if (error)
goto out_free_opts;
goto out;
break;
default:
break;
}
}
out_free_opts:
out:
kfree(orig);
out:
/*
* If no subvolume name is specified we use the default one. Allocate
* a copy of the string "." here so that code later in the
* mount path doesn't care if it's the default volume or another one.
*/
if (!*subvol_name) {
*subvol_name = kstrdup(".", GFP_KERNEL);
if (!*subvol_name)
return -ENOMEM;
}
return error;
}
@ -492,7 +507,6 @@ static struct dentry *get_default_root(struct super_block *sb,
struct btrfs_path *path;
struct btrfs_key location;
struct inode *inode;
struct dentry *dentry;
u64 dir_id;
int new = 0;
@ -517,7 +531,7 @@ static struct dentry *get_default_root(struct super_block *sb,
* will mount by default if we haven't been given a specific subvolume
* to mount.
*/
dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
if (IS_ERR(di)) {
btrfs_free_path(path);
@ -566,29 +580,7 @@ static struct dentry *get_default_root(struct super_block *sb,
return dget(sb->s_root);
}
if (new) {
const struct qstr name = { .name = "/", .len = 1 };
/*
* New inode, we need to make the dentry a sibling of s_root so
* everything gets cleaned up properly on unmount.
*/
dentry = d_alloc(sb->s_root, &name);
if (!dentry) {
iput(inode);
return ERR_PTR(-ENOMEM);
}
d_splice_alias(inode, dentry);
} else {
/*
* We found the inode in cache, just find a dentry for it and
* put the reference to the inode we just got.
*/
dentry = d_find_alias(inode);
iput(inode);
}
return dentry;
return d_obtain_alias(inode);
}
static int btrfs_fill_super(struct super_block *sb,
@ -719,6 +711,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",noacl");
if (btrfs_test_opt(root, SPACE_CACHE))
seq_puts(seq, ",space_cache");
else
seq_puts(seq, ",nospace_cache");
if (btrfs_test_opt(root, CLEAR_CACHE))
seq_puts(seq, ",clear_cache");
if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
@ -753,6 +747,137 @@ static int btrfs_set_super(struct super_block *s, void *data)
return set_anon_super(s, data);
}
/*
* subvolumes are identified by ino 256
*/
static inline int is_subvolume_inode(struct inode *inode)
{
if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
return 1;
return 0;
}
/*
* This will strip out the subvol=%s argument from an argument string and add
* subvolid=0 to make sure we get the actual tree root for path walking to the
* subvol we want.
*/
static char *setup_root_args(char *args)
{
unsigned copied = 0;
unsigned len = strlen(args) + 2;
char *pos;
char *ret;
/*
* We need the same args as before, but minus
*
* subvol=a
*
* and add
*
* subvolid=0
*
* which is a difference of 2 characters, so we allocate strlen(args) +
* 2 characters.
*/
ret = kzalloc(len * sizeof(char), GFP_NOFS);
if (!ret)
return NULL;
pos = strstr(args, "subvol=");
/* This shouldn't happen, but just in case.. */
if (!pos) {
kfree(ret);
return NULL;
}
/*
* The subvol=<> arg is not at the front of the string, copy everything
* up to that into ret.
*/
if (pos != args) {
*pos = '\0';
strcpy(ret, args);
copied += strlen(args);
pos++;
}
strncpy(ret + copied, "subvolid=0", len - copied);
/* Length of subvolid=0 */
copied += 10;
/*
* If there is no ',' after the subvol= option then we know there are no
* other options and we can just return.
*/
pos = strchr(pos, ',');
if (!pos)
return ret;
/* Copy the rest of the arguments into our buffer */
strncpy(ret + copied, pos, len - copied);
copied += strlen(pos);
return ret;
}
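
setup_root_args() above rewrites the mount option string so that the subvol=<name> token is replaced by subvolid=0, which makes the vfs_kern_mount() in mount_subvol() start from the real tree root. A standalone userspace sketch of the same rewrite; calloc()/strdup() stand in for the kernel allocators and the option strings are made up:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Userspace model of the subvol= -> subvolid=0 rewrite; the extra +1
 * byte simply leaves explicit room for the NUL terminator. */
static char *rewrite_subvol_arg(const char *args)
{
        size_t len = strlen(args) + 2;          /* worst-case growth, as in the patch */
        size_t copied = 0;
        char *dup = strdup(args);
        char *ret = dup ? calloc(len + 1, 1) : NULL;
        char *pos;

        if (!ret) {
                free(dup);
                return NULL;
        }

        pos = strstr(dup, "subvol=");
        if (!pos) {                             /* nothing to rewrite */
                free(dup);
                free(ret);
                return NULL;
        }

        if (pos != dup) {                       /* copy the options before subvol= */
                *pos = '\0';
                strcpy(ret, dup);
                copied = strlen(dup);
        }

        memcpy(ret + copied, "subvolid=0", 10);
        copied += 10;

        pos = strchr(pos + 1, ',');             /* options after subvol=<name>, if any */
        if (pos)
                strcpy(ret + copied, pos);

        free(dup);
        return ret;
}

int main(void)
{
        char *out = rewrite_subvol_arg("compress=lzo,subvol=myvol,ro");
        printf("%s\n", out);    /* prints: compress=lzo,subvolid=0,ro */
        free(out);
        return 0;
}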
static struct dentry *mount_subvol(const char *subvol_name, int flags,
const char *device_name, char *data)
{
struct super_block *s;
struct dentry *root;
struct vfsmount *mnt;
struct mnt_namespace *ns_private;
char *newargs;
struct path path;
int error;
newargs = setup_root_args(data);
if (!newargs)
return ERR_PTR(-ENOMEM);
mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
newargs);
kfree(newargs);
if (IS_ERR(mnt))
return ERR_CAST(mnt);
ns_private = create_mnt_ns(mnt);
if (IS_ERR(ns_private)) {
mntput(mnt);
return ERR_CAST(ns_private);
}
/*
* This will trigger the automount of the subvol so we can just
* drop the mnt we have here and return the dentry that we
* found.
*/
error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name,
LOOKUP_FOLLOW, &path);
put_mnt_ns(ns_private);
if (error)
return ERR_PTR(error);
if (!is_subvolume_inode(path.dentry->d_inode)) {
path_put(&path);
mntput(mnt);
error = -EINVAL;
printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
subvol_name);
return ERR_PTR(-EINVAL);
}
/* Get a ref to the sb and the dentry we found and return it */
s = path.mnt->mnt_sb;
atomic_inc(&s->s_active);
root = dget(path.dentry);
path_put(&path);
down_write(&s->s_umount);
return root;
}
/*
* Find a superblock for the given device / mount point.
@ -767,7 +892,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
struct super_block *s;
struct dentry *root;
struct btrfs_fs_devices *fs_devices = NULL;
struct btrfs_root *tree_root = NULL;
struct btrfs_fs_info *fs_info = NULL;
fmode_t mode = FMODE_READ;
char *subvol_name = NULL;
@ -781,21 +905,20 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
error = btrfs_parse_early_options(data, mode, fs_type,
&subvol_name, &subvol_objectid,
&subvol_rootid, &fs_devices);
if (error)
if (error) {
kfree(subvol_name);
return ERR_PTR(error);
}
if (subvol_name) {
root = mount_subvol(subvol_name, flags, device_name, data);
kfree(subvol_name);
return root;
}
error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
if (error)
goto error_free_subvol_name;
error = btrfs_open_devices(fs_devices, mode, fs_type);
if (error)
goto error_free_subvol_name;
if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
error = -EACCES;
goto error_close_devices;
}
return ERR_PTR(error);
/*
* Setup a dummy root and fs_info for test/set super. This is because
@ -804,19 +927,40 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
* then open_ctree will properly initialize everything later.
*/
fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
if (!fs_info || !tree_root) {
if (!fs_info)
return ERR_PTR(-ENOMEM);
fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
if (!fs_info->tree_root) {
error = -ENOMEM;
goto error_fs_info;
}
fs_info->tree_root->fs_info = fs_info;
fs_info->fs_devices = fs_devices;
fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
if (!fs_info->super_copy || !fs_info->super_for_commit) {
error = -ENOMEM;
goto error_fs_info;
}
error = btrfs_open_devices(fs_devices, mode, fs_type);
if (error)
goto error_fs_info;
if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
error = -EACCES;
goto error_close_devices;
}
fs_info->tree_root = tree_root;
fs_info->fs_devices = fs_devices;
tree_root->fs_info = fs_info;
bdev = fs_devices->latest_bdev;
s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
if (IS_ERR(s))
goto error_s;
s = sget(fs_type, btrfs_test_super, btrfs_set_super,
fs_info->tree_root);
if (IS_ERR(s)) {
error = PTR_ERR(s);
goto error_close_devices;
}
if (s->s_root) {
if ((flags ^ s->s_flags) & MS_RDONLY) {
@ -826,75 +970,35 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
}
btrfs_close_devices(fs_devices);
kfree(fs_info);
kfree(tree_root);
free_fs_info(fs_info);
} else {
char b[BDEVNAME_SIZE];
s->s_flags = flags | MS_NOSEC;
strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
btrfs_sb(s)->fs_info->bdev_holder = fs_type;
error = btrfs_fill_super(s, fs_devices, data,
flags & MS_SILENT ? 1 : 0);
if (error) {
deactivate_locked_super(s);
goto error_free_subvol_name;
return ERR_PTR(error);
}
btrfs_sb(s)->fs_info->bdev_holder = fs_type;
s->s_flags |= MS_ACTIVE;
}
/* if they gave us a subvolume name bind mount into that */
if (strcmp(subvol_name, ".")) {
struct dentry *new_root;
root = get_default_root(s, subvol_rootid);
if (IS_ERR(root)) {
error = PTR_ERR(root);
deactivate_locked_super(s);
goto error_free_subvol_name;
}
mutex_lock(&root->d_inode->i_mutex);
new_root = lookup_one_len(subvol_name, root,
strlen(subvol_name));
mutex_unlock(&root->d_inode->i_mutex);
if (IS_ERR(new_root)) {
dput(root);
deactivate_locked_super(s);
error = PTR_ERR(new_root);
goto error_free_subvol_name;
}
if (!new_root->d_inode) {
dput(root);
dput(new_root);
deactivate_locked_super(s);
error = -ENXIO;
goto error_free_subvol_name;
}
dput(root);
root = new_root;
} else {
root = get_default_root(s, subvol_objectid);
if (IS_ERR(root)) {
error = PTR_ERR(root);
deactivate_locked_super(s);
goto error_free_subvol_name;
}
root = get_default_root(s, subvol_objectid);
if (IS_ERR(root)) {
deactivate_locked_super(s);
return root;
}
kfree(subvol_name);
return root;
error_s:
error = PTR_ERR(s);
error_close_devices:
btrfs_close_devices(fs_devices);
kfree(fs_info);
kfree(tree_root);
error_free_subvol_name:
kfree(subvol_name);
error_fs_info:
free_fs_info(fs_info);
return ERR_PTR(error);
}
@ -919,7 +1023,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (root->fs_info->fs_devices->rw_devices == 0)
return -EACCES;
if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
return -EINVAL;
ret = btrfs_cleanup_fs_roots(root->fs_info);
@ -976,11 +1080,11 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
u64 avail_space;
u64 used_space;
u64 min_stripe_size;
int min_stripes = 1;
int min_stripes = 1, num_stripes = 1;
int i = 0, nr_devices;
int ret;
nr_devices = fs_info->fs_devices->rw_devices;
nr_devices = fs_info->fs_devices->open_devices;
BUG_ON(!nr_devices);
devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
@ -990,20 +1094,24 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
/* calc min stripe number for data space allocation */
type = btrfs_get_alloc_profile(root, 1);
if (type & BTRFS_BLOCK_GROUP_RAID0)
if (type & BTRFS_BLOCK_GROUP_RAID0) {
min_stripes = 2;
else if (type & BTRFS_BLOCK_GROUP_RAID1)
num_stripes = nr_devices;
} else if (type & BTRFS_BLOCK_GROUP_RAID1) {
min_stripes = 2;
else if (type & BTRFS_BLOCK_GROUP_RAID10)
num_stripes = 2;
} else if (type & BTRFS_BLOCK_GROUP_RAID10) {
min_stripes = 4;
num_stripes = 4;
}
if (type & BTRFS_BLOCK_GROUP_DUP)
min_stripe_size = 2 * BTRFS_STRIPE_LEN;
else
min_stripe_size = BTRFS_STRIPE_LEN;
list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
if (!device->in_fs_metadata)
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (!device->in_fs_metadata || !device->bdev)
continue;
avail_space = device->total_bytes - device->bytes_used;
@ -1064,13 +1172,16 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
i = nr_devices - 1;
avail_space = 0;
while (nr_devices >= min_stripes) {
if (num_stripes > nr_devices)
num_stripes = nr_devices;
if (devices_info[i].max_avail >= min_stripe_size) {
int j;
u64 alloc_size;
avail_space += devices_info[i].max_avail * min_stripes;
avail_space += devices_info[i].max_avail * num_stripes;
alloc_size = devices_info[i].max_avail;
for (j = i + 1 - min_stripes; j <= i; j++)
for (j = i + 1 - num_stripes; j <= i; j++)
devices_info[j].max_avail -= alloc_size;
}
i--;
@ -1085,7 +1196,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct btrfs_root *root = btrfs_sb(dentry->d_sb);
struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
struct btrfs_super_block *disk_super = root->fs_info->super_copy;
struct list_head *head = &root->fs_info->space_info;
struct btrfs_space_info *found;
u64 total_used = 0;
@ -1187,6 +1298,16 @@ static int btrfs_unfreeze(struct super_block *sb)
return 0;
}
static void btrfs_fs_dirty_inode(struct inode *inode, int flags)
{
int ret;
ret = btrfs_dirty_inode(inode);
if (ret)
printk_ratelimited(KERN_ERR "btrfs: fail to dirty inode %Lu "
"error %d\n", btrfs_ino(inode), ret);
}
static const struct super_operations btrfs_super_ops = {
.drop_inode = btrfs_drop_inode,
.evict_inode = btrfs_evict_inode,
@ -1194,7 +1315,7 @@ static const struct super_operations btrfs_super_ops = {
.sync_fs = btrfs_sync_fs,
.show_options = btrfs_show_options,
.write_inode = btrfs_write_inode,
.dirty_inode = btrfs_dirty_inode,
.dirty_inode = btrfs_fs_dirty_inode,
.alloc_inode = btrfs_alloc_inode,
.destroy_inode = btrfs_destroy_inode,
.statfs = btrfs_statfs,

View file

@ -55,6 +55,7 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
struct btrfs_transaction *cur_trans;
spin_lock(&root->fs_info->trans_lock);
loop:
if (root->fs_info->trans_no_join) {
if (!nofail) {
spin_unlock(&root->fs_info->trans_lock);
@ -75,16 +76,18 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
if (!cur_trans)
return -ENOMEM;
spin_lock(&root->fs_info->trans_lock);
if (root->fs_info->running_transaction) {
/*
* someone started a transaction after we unlocked. Make sure
* to redo the trans_no_join checks above
*/
kmem_cache_free(btrfs_transaction_cachep, cur_trans);
cur_trans = root->fs_info->running_transaction;
atomic_inc(&cur_trans->use_count);
atomic_inc(&cur_trans->num_writers);
cur_trans->num_joined++;
spin_unlock(&root->fs_info->trans_lock);
return 0;
goto loop;
}
atomic_set(&cur_trans->num_writers, 1);
cur_trans->num_joined = 0;
init_waitqueue_head(&cur_trans->writer_wait);
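
The join_transaction() hunk above adds the loop label and the goto: trans_lock has to be dropped to allocate a new transaction, so after re-taking it the code must recheck whether another writer installed a running transaction in the meantime and, if so, free its own allocation and start over. A minimal userspace sketch of that drop-lock/allocate/recheck pattern, with a pthread mutex and malloc() standing in for trans_lock and the slab cache (refcounting omitted):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static void *running;          /* stands in for fs_info->running_transaction */

static void *join(void)
{
        void *mine;

        pthread_mutex_lock(&lock);
loop:
        if (running) {                          /* join the existing transaction */
                mine = running;
                pthread_mutex_unlock(&lock);
                return mine;
        }
        pthread_mutex_unlock(&lock);            /* cannot allocate under the lock */

        mine = malloc(64);                      /* kmem_cache_alloc() stand-in */
        if (!mine)
                return NULL;

        pthread_mutex_lock(&lock);
        if (running) {                          /* someone beat us to it */
                free(mine);
                goto loop;                      /* recheck from the top */
        }
        running = mine;
        pthread_mutex_unlock(&lock);
        return mine;
}

int main(void)
{
        printf("joined %p\n", join());
        return 0;
}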
@ -275,7 +278,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
*/
if (num_items > 0 && root != root->fs_info->chunk_root) {
num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
ret = btrfs_block_rsv_add(NULL, root,
ret = btrfs_block_rsv_add(root,
&root->fs_info->trans_block_rsv,
num_bytes);
if (ret)
@ -418,8 +421,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
int ret;
ret = btrfs_block_rsv_check(trans, root,
&root->fs_info->global_block_rsv, 0, 5);
ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
return ret ? 1 : 0;
}
@ -427,17 +430,26 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_block_rsv *rsv = trans->block_rsv;
int updates;
smp_mb();
if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
return 1;
/*
* We need to do this in case we're deleting csums so the global block
* rsv get's used instead of the csum block rsv.
*/
trans->block_rsv = NULL;
updates = trans->delayed_ref_updates;
trans->delayed_ref_updates = 0;
if (updates)
btrfs_run_delayed_refs(trans, root, updates);
trans->block_rsv = rsv;
return should_end_transaction(trans, root);
}
@ -453,6 +465,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
return 0;
}
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
while (count < 4) {
unsigned long cur = trans->delayed_ref_updates;
trans->delayed_ref_updates = 0;
@ -473,8 +487,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
count++;
}
btrfs_trans_release_metadata(trans, root);
if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
should_end_transaction(trans, root)) {
trans->transaction->blocked = 1;
@ -562,50 +574,21 @@ int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
int btrfs_write_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages, int mark)
{
int ret;
int err = 0;
int werr = 0;
struct page *page;
struct inode *btree_inode = root->fs_info->btree_inode;
struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
u64 start = 0;
u64 end;
unsigned long index;
while (1) {
ret = find_first_extent_bit(dirty_pages, start, &start, &end,
mark);
if (ret)
break;
while (start <= end) {
cond_resched();
index = start >> PAGE_CACHE_SHIFT;
start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
page = find_get_page(btree_inode->i_mapping, index);
if (!page)
continue;
btree_lock_page_hook(page);
if (!page->mapping) {
unlock_page(page);
page_cache_release(page);
continue;
}
if (PageWriteback(page)) {
if (PageDirty(page))
wait_on_page_writeback(page);
else {
unlock_page(page);
page_cache_release(page);
continue;
}
}
err = write_one_page(page, 0);
if (err)
werr = err;
page_cache_release(page);
}
while (!find_first_extent_bit(dirty_pages, start, &start, &end,
mark)) {
convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
GFP_NOFS);
err = filemap_fdatawrite_range(mapping, start, end);
if (err)
werr = err;
cond_resched();
start = end + 1;
}
if (err)
werr = err;
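
btrfs_write_marked_extents() now tags each dirty range with EXTENT_NEED_WAIT while kicking off writeback with filemap_fdatawrite_range(); btrfs_wait_marked_extents() below then walks exactly those tagged ranges, clears the bit and waits with filemap_fdatawait_range(). A toy userspace model of that two-pass write/wait split follows; the range array and flag values are illustrative only:

#include <stdio.h>

#define DIRTY      0x1
#define NEED_WAIT  0x2

struct range { unsigned long start, end; int bits; };

static void write_pass(struct range *r, int n)
{
        for (int i = 0; i < n; i++) {
                if (!(r[i].bits & DIRTY))
                        continue;
                r[i].bits = (r[i].bits & ~DIRTY) | NEED_WAIT;       /* convert_extent_bit() */
                printf("start writeback %lu-%lu\n", r[i].start, r[i].end); /* filemap_fdatawrite_range() */
        }
}

static void wait_pass(struct range *r, int n)
{
        for (int i = 0; i < n; i++) {
                if (!(r[i].bits & NEED_WAIT))
                        continue;
                r[i].bits &= ~NEED_WAIT;                            /* clear_extent_bits() */
                printf("wait on %lu-%lu\n", r[i].start, r[i].end);  /* filemap_fdatawait_range() */
        }
}

int main(void)
{
        struct range ranges[] = { {0, 4095, DIRTY}, {8192, 12287, DIRTY} };

        write_pass(ranges, 2);
        wait_pass(ranges, 2);
        return 0;
}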
@ -621,39 +604,20 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
int btrfs_wait_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages, int mark)
{
int ret;
int err = 0;
int werr = 0;
struct page *page;
struct inode *btree_inode = root->fs_info->btree_inode;
struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
u64 start = 0;
u64 end;
unsigned long index;
while (1) {
ret = find_first_extent_bit(dirty_pages, start, &start, &end,
mark);
if (ret)
break;
clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
while (start <= end) {
index = start >> PAGE_CACHE_SHIFT;
start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
page = find_get_page(btree_inode->i_mapping, index);
if (!page)
continue;
if (PageDirty(page)) {
btree_lock_page_hook(page);
wait_on_page_writeback(page);
err = write_one_page(page, 0);
if (err)
werr = err;
}
wait_on_page_writeback(page);
page_cache_release(page);
cond_resched();
}
while (!find_first_extent_bit(dirty_pages, start, &start, &end,
EXTENT_NEED_WAIT)) {
clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
err = filemap_fdatawait_range(mapping, start, end);
if (err)
werr = err;
cond_resched();
start = end + 1;
}
if (err)
werr = err;
@ -673,7 +637,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
ret = btrfs_write_marked_extents(root, dirty_pages, mark);
ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
return ret || ret2;
if (ret)
return ret;
if (ret2)
return ret2;
return 0;
}
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
@ -816,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
btrfs_save_ino_cache(root, trans);
/* see comments in should_cow_block() */
root->force_cow = 0;
smp_wmb();
if (root->commit_root != root->node) {
mutex_lock(&root->fs_commit_mutex);
switch_commit_root(root);
@ -911,11 +884,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
if (to_reserve > 0) {
ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
to_reserve);
ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
to_reserve);
if (ret) {
pending->error = ret;
goto fail;
@ -979,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(old);
free_extent_buffer(old);
/* see comments in should_cow_block() */
root->force_cow = 1;
smp_wmb();
btrfs_set_root_node(new_root_item, tmp);
/* record when the snapshot was created in key.offset */
key.offset = trans->transid;
@ -1002,7 +978,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
BUG_ON(IS_ERR(pending->snap));
btrfs_reloc_post_snapshot(trans, pending);
btrfs_orphan_post_snapshot(trans, pending);
fail:
kfree(new_root_item);
trans->block_rsv = rsv;
@ -1032,7 +1007,7 @@ static void update_super_roots(struct btrfs_root *root)
struct btrfs_root_item *root_item;
struct btrfs_super_block *super;
super = &root->fs_info->super_copy;
super = root->fs_info->super_copy;
root_item = &root->fs_info->chunk_root->root_item;
super->chunk_root = root_item->bytenr;
@ -1043,7 +1018,7 @@ static void update_super_roots(struct btrfs_root *root)
super->root = root_item->bytenr;
super->generation = root_item->generation;
super->root_level = root_item->level;
if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE))
if (btrfs_test_opt(root, SPACE_CACHE))
super->cache_generation = root_item->generation;
}
@ -1168,14 +1143,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_run_ordered_operations(root, 0);
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
/* make a pass through all the delayed refs we have so far
* any runnings procs may add more while we are here
*/
ret = btrfs_run_delayed_refs(trans, root, 0);
BUG_ON(ret);
btrfs_trans_release_metadata(trans, root);
cur_trans = trans->transaction;
/*
* set the flushing flag so procs in this transaction have to
@ -1341,12 +1317,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
update_super_roots(root);
if (!root->fs_info->log_root_recovering) {
btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
btrfs_set_super_log_root(root->fs_info->super_copy, 0);
btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
}
memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
sizeof(root->fs_info->super_copy));
memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
sizeof(*root->fs_info->super_copy));
trans->transaction->blocked = 0;
spin_lock(&root->fs_info->trans_lock);

View file

@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log,
struct walk_control *wc, u64 gen)
{
if (wc->pin)
btrfs_pin_extent(log->fs_info->extent_root,
eb->start, eb->len, 0);
btrfs_pin_extent_for_log_replay(wc->trans,
log->fs_info->extent_root,
eb->start, eb->len);
if (btrfs_buffer_uptodate(eb, gen)) {
if (wc->write)
@ -1760,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(root_owner !=
BTRFS_TREE_LOG_OBJECTID);
ret = btrfs_free_reserved_extent(root,
ret = btrfs_free_and_pin_reserved_extent(root,
bytenr, blocksize);
BUG_ON(ret);
}
@ -1828,7 +1829,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(next);
WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
ret = btrfs_free_reserved_extent(root,
ret = btrfs_free_and_pin_reserved_extent(root,
path->nodes[*level]->start,
path->nodes[*level]->len);
BUG_ON(ret);
@ -1897,7 +1898,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(log->root_key.objectid !=
BTRFS_TREE_LOG_OBJECTID);
ret = btrfs_free_reserved_extent(log, next->start,
ret = btrfs_free_and_pin_reserved_extent(log, next->start,
next->len);
BUG_ON(ret);
}
@ -2013,10 +2014,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
/* wait for previous tree log sync to complete */
if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
wait_log_commit(trans, root, root->log_transid - 1);
while (1) {
unsigned long batch = root->log_batch;
if (root->log_multiple_pids) {
/* when we're on an ssd, just kick the log commit out */
if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
mutex_unlock(&root->log_mutex);
schedule_timeout_uninterruptible(1);
mutex_lock(&root->log_mutex);
@ -2117,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
BUG_ON(ret);
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
btrfs_set_super_log_root(&root->fs_info->super_for_commit,
btrfs_set_super_log_root(root->fs_info->super_for_commit,
log_root_tree->node->start);
btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
btrfs_header_level(log_root_tree->node));
log_root_tree->log_batch = 0;

View file

@ -295,6 +295,12 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
btrfs_requeue_work(&device->work);
goto done;
}
/* unplug every 64 requests just for good measure */
if (batch_run % 64 == 0) {
blk_finish_plug(&plug);
blk_start_plug(&plug);
sync_pending = 0;
}
}
cond_resched();
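
The hunk above flushes the block-layer plug every 64 queued requests so batched bios do not linger while run_scheduled_bios() keeps looping. The same periodic-flush idea in a standalone sketch, with printf standing in for blk_finish_plug()/blk_start_plug():

#include <stdio.h>

static void flush_and_restart_plug(void)
{
        /* blk_finish_plug() submits what has been batched so far,
         * blk_start_plug() opens a fresh batch */
        printf("flush plug, start a new one\n");
}

int main(void)
{
        int batch_run = 0;

        for (int i = 0; i < 200; i++) {
                batch_run++;                    /* one request dispatched */
                if (batch_run % 64 == 0)        /* unplug every 64 requests */
                        flush_and_restart_plug();
        }
        return 0;
}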
@ -366,6 +372,14 @@ static noinline int device_list_add(const char *path,
}
INIT_LIST_HEAD(&device->dev_alloc_list);
/* init readahead state */
spin_lock_init(&device->reada_lock);
device->reada_curr_zone = NULL;
atomic_set(&device->reada_in_flight, 0);
device->reada_next = 0;
INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
mutex_lock(&fs_devices->device_list_mutex);
list_add_rcu(&device->dev_list, &fs_devices->devices);
mutex_unlock(&fs_devices->device_list_mutex);
@ -597,10 +611,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
set_blocksize(bdev, 4096);
bh = btrfs_read_dev_super(bdev);
if (!bh) {
ret = -EINVAL;
if (!bh)
goto error_close;
}
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
@ -655,7 +667,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
continue;
}
if (fs_devices->open_devices == 0) {
ret = -EIO;
ret = -EINVAL;
goto out;
}
fs_devices->seeding = seeding;
@ -993,7 +1005,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
key.objectid = device->devid;
key.offset = start;
key.type = BTRFS_DEV_EXTENT_KEY;
again:
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
ret = btrfs_previous_item(root, path, key.objectid,
@ -1006,6 +1018,9 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_dev_extent);
BUG_ON(found_key.offset > start || found_key.offset +
btrfs_dev_extent_length(leaf, extent) < start);
key = found_key;
btrfs_release_path(path);
goto again;
} else if (ret == 0) {
leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0],
@ -1013,8 +1028,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
}
BUG_ON(ret);
if (device->bytes_used > 0)
device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
if (device->bytes_used > 0) {
u64 len = btrfs_dev_extent_length(leaf, extent);
device->bytes_used -= len;
spin_lock(&root->fs_info->free_chunk_lock);
root->fs_info->free_chunk_space += len;
spin_unlock(&root->fs_info->free_chunk_lock);
}
ret = btrfs_del_item(trans, root, path);
out:
@ -1356,6 +1376,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (ret)
goto error_undo;
spin_lock(&root->fs_info->free_chunk_lock);
root->fs_info->free_chunk_space = device->total_bytes -
device->bytes_used;
spin_unlock(&root->fs_info->free_chunk_lock);
device->in_fs_metadata = 0;
btrfs_scrub_cancel_dev(root, device);
@ -1387,8 +1412,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
call_rcu(&device->rcu, free_device);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
if (cur_devices->open_devices == 0) {
struct btrfs_fs_devices *fs_devices;
@ -1450,7 +1475,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
struct btrfs_fs_devices *old_devices;
struct btrfs_fs_devices *seed_devices;
struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
struct btrfs_super_block *disk_super = root->fs_info->super_copy;
struct btrfs_device *device;
u64 super_flags;
@ -1592,7 +1617,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
return -EINVAL;
bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
root->fs_info->bdev_holder);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
@ -1691,15 +1716,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
root->fs_info->fs_devices->num_can_discard++;
root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
spin_lock(&root->fs_info->free_chunk_lock);
root->fs_info->free_chunk_space += device->total_bytes;
spin_unlock(&root->fs_info->free_chunk_lock);
if (!blk_queue_nonrot(bdev_get_queue(bdev)))
root->fs_info->fs_devices->rotating = 1;
total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
btrfs_set_super_total_bytes(&root->fs_info->super_copy,
total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
btrfs_set_super_total_bytes(root->fs_info->super_copy,
total_bytes + device->total_bytes);
total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
btrfs_set_super_num_devices(&root->fs_info->super_copy,
total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
btrfs_set_super_num_devices(root->fs_info->super_copy,
total_bytes + 1);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
@ -1790,7 +1819,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size)
{
struct btrfs_super_block *super_copy =
&device->dev_root->fs_info->super_copy;
device->dev_root->fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
u64 diff = new_size - device->total_bytes;
@ -1849,7 +1878,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
chunk_offset)
{
struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
struct btrfs_super_block *super_copy = root->fs_info->super_copy;
struct btrfs_disk_key *disk_key;
struct btrfs_chunk *chunk;
u8 *ptr;
@ -2175,7 +2204,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
bool retried = false;
struct extent_buffer *l;
struct btrfs_key key;
struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
struct btrfs_super_block *super_copy = root->fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
u64 old_size = device->total_bytes;
u64 diff = device->total_bytes - new_size;
@ -2192,8 +2221,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
lock_chunks(root);
device->total_bytes = new_size;
if (device->writeable)
if (device->writeable) {
device->fs_devices->total_rw_bytes -= diff;
spin_lock(&root->fs_info->free_chunk_lock);
root->fs_info->free_chunk_space -= diff;
spin_unlock(&root->fs_info->free_chunk_lock);
}
unlock_chunks(root);
again:
@ -2257,6 +2290,9 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
device->total_bytes = old_size;
if (device->writeable)
device->fs_devices->total_rw_bytes += diff;
spin_lock(&root->fs_info->free_chunk_lock);
root->fs_info->free_chunk_space += diff;
spin_unlock(&root->fs_info->free_chunk_lock);
unlock_chunks(root);
goto done;
}
@ -2292,7 +2328,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
struct btrfs_key *key,
struct btrfs_chunk *chunk, int item_size)
{
struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
struct btrfs_super_block *super_copy = root->fs_info->super_copy;
struct btrfs_disk_key disk_key;
u32 array_size;
u8 *ptr;
@ -2615,6 +2651,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
index++;
}
spin_lock(&extent_root->fs_info->free_chunk_lock);
extent_root->fs_info->free_chunk_space -= (stripe_size *
map->num_stripes);
spin_unlock(&extent_root->fs_info->free_chunk_lock);
index = 0;
stripe = &chunk->stripe;
while (index < map->num_stripes) {
@ -2848,7 +2889,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
u64 logical, u64 *length,
struct btrfs_multi_bio **multi_ret,
struct btrfs_bio **bbio_ret,
int mirror_num)
{
struct extent_map *em;
@ -2866,18 +2907,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
int i;
int num_stripes;
int max_errors = 0;
struct btrfs_multi_bio *multi = NULL;
struct btrfs_bio *bbio = NULL;
if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
stripes_allocated = 1;
again:
if (multi_ret) {
multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
if (bbio_ret) {
bbio = kzalloc(btrfs_bio_size(stripes_allocated),
GFP_NOFS);
if (!multi)
if (!bbio)
return -ENOMEM;
atomic_set(&multi->error, 0);
atomic_set(&bbio->error, 0);
}
read_lock(&em_tree->lock);
@ -2898,7 +2939,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
if (mirror_num > map->num_stripes)
mirror_num = 0;
/* if our multi bio struct is too small, back off and try again */
/* if our btrfs_bio struct is too small, back off and try again */
if (rw & REQ_WRITE) {
if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_DUP)) {
@ -2917,11 +2958,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
stripes_required = map->num_stripes;
}
}
if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
stripes_allocated < stripes_required) {
stripes_allocated = map->num_stripes;
free_extent_map(em);
kfree(multi);
kfree(bbio);
goto again;
}
stripe_nr = offset;
@ -2950,7 +2991,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
*length = em->len - offset;
}
if (!multi_ret)
if (!bbio_ret)
goto out;
num_stripes = 1;
@ -2975,13 +3016,17 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
stripe_index = find_live_mirror(map, 0,
map->num_stripes,
current->pid % map->num_stripes);
mirror_num = stripe_index + 1;
}
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
if (rw & (REQ_WRITE | REQ_DISCARD))
if (rw & (REQ_WRITE | REQ_DISCARD)) {
num_stripes = map->num_stripes;
else if (mirror_num)
} else if (mirror_num) {
stripe_index = mirror_num - 1;
} else {
mirror_num = 1;
}
} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
int factor = map->num_stripes / map->sub_stripes;
@ -3001,6 +3046,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
stripe_index = find_live_mirror(map, stripe_index,
map->sub_stripes, stripe_index +
current->pid % map->sub_stripes);
mirror_num = stripe_index + 1;
}
} else {
/*
@ -3009,15 +3055,16 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
* stripe_index is the number of our device in the stripe array
*/
stripe_index = do_div(stripe_nr, map->num_stripes);
mirror_num = stripe_index + 1;
}
BUG_ON(stripe_index >= map->num_stripes);
if (rw & REQ_DISCARD) {
for (i = 0; i < num_stripes; i++) {
multi->stripes[i].physical =
bbio->stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
multi->stripes[i].dev = map->stripes[stripe_index].dev;
bbio->stripes[i].dev = map->stripes[stripe_index].dev;
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
u64 stripes;
@ -3038,16 +3085,16 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
}
stripes = stripe_nr_end - 1 - j;
do_div(stripes, map->num_stripes);
multi->stripes[i].length = map->stripe_len *
bbio->stripes[i].length = map->stripe_len *
(stripes - stripe_nr + 1);
if (i == 0) {
multi->stripes[i].length -=
bbio->stripes[i].length -=
stripe_offset;
stripe_offset = 0;
}
if (stripe_index == last_stripe)
multi->stripes[i].length -=
bbio->stripes[i].length -=
stripe_end_offset;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
u64 stripes;
@ -3072,11 +3119,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
}
stripes = stripe_nr_end - 1 - j;
do_div(stripes, factor);
multi->stripes[i].length = map->stripe_len *
bbio->stripes[i].length = map->stripe_len *
(stripes - stripe_nr + 1);
if (i < map->sub_stripes) {
multi->stripes[i].length -=
bbio->stripes[i].length -=
stripe_offset;
if (i == map->sub_stripes - 1)
stripe_offset = 0;
@ -3084,11 +3131,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
if (stripe_index >= last_stripe &&
stripe_index <= (last_stripe +
map->sub_stripes - 1)) {
multi->stripes[i].length -=
bbio->stripes[i].length -=
stripe_end_offset;
}
} else
multi->stripes[i].length = *length;
bbio->stripes[i].length = *length;
stripe_index++;
if (stripe_index == map->num_stripes) {
@ -3099,19 +3146,20 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
}
} else {
for (i = 0; i < num_stripes; i++) {
multi->stripes[i].physical =
bbio->stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset +
stripe_nr * map->stripe_len;
multi->stripes[i].dev =
bbio->stripes[i].dev =
map->stripes[stripe_index].dev;
stripe_index++;
}
}
if (multi_ret) {
*multi_ret = multi;
multi->num_stripes = num_stripes;
multi->max_errors = max_errors;
if (bbio_ret) {
*bbio_ret = bbio;
bbio->num_stripes = num_stripes;
bbio->max_errors = max_errors;
bbio->mirror_num = mirror_num;
}
out:
free_extent_map(em);
@ -3120,9 +3168,9 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
u64 logical, u64 *length,
struct btrfs_multi_bio **multi_ret, int mirror_num)
struct btrfs_bio **bbio_ret, int mirror_num)
{
return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
mirror_num);
}
@ -3191,30 +3239,32 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
return 0;
}
static void end_bio_multi_stripe(struct bio *bio, int err)
static void btrfs_end_bio(struct bio *bio, int err)
{
struct btrfs_multi_bio *multi = bio->bi_private;
struct btrfs_bio *bbio = bio->bi_private;
int is_orig_bio = 0;
if (err)
atomic_inc(&multi->error);
atomic_inc(&bbio->error);
if (bio == multi->orig_bio)
if (bio == bbio->orig_bio)
is_orig_bio = 1;
if (atomic_dec_and_test(&multi->stripes_pending)) {
if (atomic_dec_and_test(&bbio->stripes_pending)) {
if (!is_orig_bio) {
bio_put(bio);
bio = multi->orig_bio;
bio = bbio->orig_bio;
}
bio->bi_private = multi->private;
bio->bi_end_io = multi->end_io;
bio->bi_private = bbio->private;
bio->bi_end_io = bbio->end_io;
bio->bi_bdev = (struct block_device *)
(unsigned long)bbio->mirror_num;
/* only send an error to the higher layers if it is
* beyond the tolerance of the multi-bio
*/
if (atomic_read(&multi->error) > multi->max_errors) {
if (atomic_read(&bbio->error) > bbio->max_errors) {
err = -EIO;
} else if (err) {
} else {
/*
* this bio is actually up to date, we didn't
* go over the max number of errors
@ -3222,7 +3272,7 @@ static void end_bio_multi_stripe(struct bio *bio, int err)
set_bit(BIO_UPTODATE, &bio->bi_flags);
err = 0;
}
kfree(multi);
kfree(bbio);
bio_endio(bio, err);
} else if (!is_orig_bio) {
@ -3302,20 +3352,20 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
u64 logical = (u64)bio->bi_sector << 9;
u64 length = 0;
u64 map_length;
struct btrfs_multi_bio *multi = NULL;
int ret;
int dev_nr = 0;
int total_devs = 1;
struct btrfs_bio *bbio = NULL;
length = bio->bi_size;
map_tree = &root->fs_info->mapping_tree;
map_length = length;
ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
mirror_num);
BUG_ON(ret);
total_devs = multi->num_stripes;
total_devs = bbio->num_stripes;
if (map_length < length) {
printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
"len %llu\n", (unsigned long long)logical,
@ -3323,25 +3373,28 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
(unsigned long long)map_length);
BUG();
}
multi->end_io = first_bio->bi_end_io;
multi->private = first_bio->bi_private;
multi->orig_bio = first_bio;
atomic_set(&multi->stripes_pending, multi->num_stripes);
bbio->orig_bio = first_bio;
bbio->private = first_bio->bi_private;
bbio->end_io = first_bio->bi_end_io;
atomic_set(&bbio->stripes_pending, bbio->num_stripes);
while (dev_nr < total_devs) {
if (total_devs > 1) {
if (dev_nr < total_devs - 1) {
bio = bio_clone(first_bio, GFP_NOFS);
BUG_ON(!bio);
} else {
bio = first_bio;
}
bio->bi_private = multi;
bio->bi_end_io = end_bio_multi_stripe;
if (dev_nr < total_devs - 1) {
bio = bio_clone(first_bio, GFP_NOFS);
BUG_ON(!bio);
} else {
bio = first_bio;
}
bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
dev = multi->stripes[dev_nr].dev;
bio->bi_private = bbio;
bio->bi_end_io = btrfs_end_bio;
bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
dev = bbio->stripes[dev_nr].dev;
if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
"(%s id %llu), size=%u\n", rw,
(u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
dev->name, dev->devid, bio->bi_size);
bio->bi_bdev = dev->bdev;
if (async_submit)
schedule_bio(root, dev, rw, bio);
@ -3354,8 +3407,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
}
dev_nr++;
}
if (total_devs == 1)
kfree(multi);
return 0;
}
@ -3616,15 +3667,20 @@ static int read_one_dev(struct btrfs_root *root,
fill_device_from_item(leaf, dev_item, device);
device->dev_root = root->fs_info->dev_root;
device->in_fs_metadata = 1;
if (device->writeable)
if (device->writeable) {
device->fs_devices->total_rw_bytes += device->total_bytes;
spin_lock(&root->fs_info->free_chunk_lock);
root->fs_info->free_chunk_space += device->total_bytes -
device->bytes_used;
spin_unlock(&root->fs_info->free_chunk_lock);
}
ret = 0;
return ret;
}
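
Several hunks in this file maintain the new fs_info->free_chunk_space counter under free_chunk_lock: read_one_dev() above adds each writable device's unallocated bytes, __finish_chunk_alloc() earlier subtracts stripe_size for every stripe of a newly carved chunk, and device add/remove/resize adjust the counter accordingly. A small standalone sketch of that bookkeeping; the device sizes are made up:

#include <stdio.h>

struct dev { unsigned long long total_bytes, bytes_used; int writeable; };

int main(void)
{
        struct dev devs[] = {
                { 100ULL << 30, 20ULL << 30, 1 },   /* hypothetical 100 GiB device, 20 GiB allocated */
                {  50ULL << 30, 50ULL << 30, 1 },   /* hypothetical 50 GiB device, fully allocated */
        };
        unsigned long long free_chunk_space = 0;

        /* read_one_dev(): count what each writable device still has free */
        for (int i = 0; i < 2; i++)
                if (devs[i].writeable)
                        free_chunk_space += devs[i].total_bytes - devs[i].bytes_used;

        /* __finish_chunk_alloc(): a new chunk consumes stripe_size on every stripe */
        unsigned long long stripe_size = 1ULL << 30;
        int num_stripes = 2;
        free_chunk_space -= stripe_size * num_stripes;

        printf("free_chunk_space = %llu GiB\n", free_chunk_space >> 30);
        return 0;
}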
int btrfs_read_sys_array(struct btrfs_root *root)
{
struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
struct btrfs_super_block *super_copy = root->fs_info->super_copy;
struct extent_buffer *sb;
struct btrfs_disk_key *disk_key;
struct btrfs_chunk *chunk;

View file

@ -92,6 +92,20 @@ struct btrfs_device {
struct btrfs_work work;
struct rcu_head rcu;
struct work_struct rcu_work;
/* readahead state */
spinlock_t reada_lock;
atomic_t reada_in_flight;
u64 reada_next;
struct reada_zone *reada_curr_zone;
struct radix_tree_root reada_zones;
struct radix_tree_root reada_extents;
/* for sending down flush barriers */
struct bio *flush_bio;
struct completion flush_wait;
int nobarriers;
};
struct btrfs_fs_devices {
@ -136,7 +150,10 @@ struct btrfs_bio_stripe {
u64 length; /* only used for discard mappings */
};
struct btrfs_multi_bio {
struct btrfs_bio;
typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
struct btrfs_bio {
atomic_t stripes_pending;
bio_end_io_t *end_io;
struct bio *orig_bio;
@ -144,6 +161,7 @@ struct btrfs_multi_bio {
atomic_t error;
int max_errors;
int num_stripes;
int mirror_num;
struct btrfs_bio_stripe stripes[];
};
@ -171,7 +189,7 @@ struct map_lookup {
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
u64 end, u64 *length);
#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \
(sizeof(struct btrfs_bio_stripe) * (n)))
int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
@ -180,7 +198,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
u64 chunk_offset, u64 start, u64 num_bytes);
int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
u64 logical, u64 *length,
struct btrfs_multi_bio **multi_ret, int mirror_num);
struct btrfs_bio **bbio_ret, int mirror_num);
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
u64 chunk_start, u64 physical, u64 devid,
u64 **logical, int *naddrs, int *stripe_len);

View file

@ -127,6 +127,17 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
again:
ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
name, name_len, value, size);
/*
* If we're setting an xattr to a new value but the new value is, say,
* exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
* back from split_leaf. This is because it thinks we'll be extending
* the existing item size, but we're asking for enough space to add the
* item itself. So if we get EOVERFLOW just set ret to EEXIST and let
* the rest of the function figure it out.
*/
if (ret == -EOVERFLOW)
ret = -EEXIST;
if (ret == -EEXIST) {
if (flags & XATTR_CREATE)
goto out;