[PATCH] jbd2: initial copy of files from jbd

This is a simple copy of the files in fs/jbd to fs/jbd2 and
/usr/incude/linux/[ext4_]jbd.h to /usr/include/[ext4_]jbd2.h

Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
This commit is contained in:
Dave Kleikamp 2006-10-11 01:20:57 -07:00 committed by Linus Torvalds
parent 02ea2104c5
commit 470decc613
9 changed files with 8428 additions and 0 deletions

7
fs/jbd2/Makefile Normal file
View file

@ -0,0 +1,7 @@
#
# Makefile for the linux journaling routines.
#
obj-$(CONFIG_JBD) += jbd.o
jbd-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o

697
fs/jbd2/checkpoint.c Normal file
View file

@ -0,0 +1,697 @@
/*
* linux/fs/checkpoint.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 1999
*
* Copyright 1999 Red Hat Software --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Checkpoint routines for the generic filesystem journaling code.
* Part of the ext2fs journaling system.
*
* Checkpointing is the process of ensuring that a section of the log is
* committed fully to disk, so that that portion of the log can be
* reused.
*/
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/slab.h>
/*
* Unlink a buffer from a transaction checkpoint list.
*
* Called with j_list_lock held.
*/
static inline void __buffer_unlink_first(struct journal_head *jh)
{
transaction_t *transaction = jh->b_cp_transaction;
jh->b_cpnext->b_cpprev = jh->b_cpprev;
jh->b_cpprev->b_cpnext = jh->b_cpnext;
if (transaction->t_checkpoint_list == jh) {
transaction->t_checkpoint_list = jh->b_cpnext;
if (transaction->t_checkpoint_list == jh)
transaction->t_checkpoint_list = NULL;
}
}
/*
* Unlink a buffer from a transaction checkpoint(io) list.
*
* Called with j_list_lock held.
*/
static inline void __buffer_unlink(struct journal_head *jh)
{
transaction_t *transaction = jh->b_cp_transaction;
__buffer_unlink_first(jh);
if (transaction->t_checkpoint_io_list == jh) {
transaction->t_checkpoint_io_list = jh->b_cpnext;
if (transaction->t_checkpoint_io_list == jh)
transaction->t_checkpoint_io_list = NULL;
}
}
/*
* Move a buffer from the checkpoint list to the checkpoint io list
*
* Called with j_list_lock held
*/
static inline void __buffer_relink_io(struct journal_head *jh)
{
transaction_t *transaction = jh->b_cp_transaction;
__buffer_unlink_first(jh);
if (!transaction->t_checkpoint_io_list) {
jh->b_cpnext = jh->b_cpprev = jh;
} else {
jh->b_cpnext = transaction->t_checkpoint_io_list;
jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
jh->b_cpprev->b_cpnext = jh;
jh->b_cpnext->b_cpprev = jh;
}
transaction->t_checkpoint_io_list = jh;
}
/*
* Try to release a checkpointed buffer from its transaction.
* Returns 1 if we released it and 2 if we also released the
* whole transaction.
*
* Requires j_list_lock
* Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
*/
static int __try_to_free_cp_buf(struct journal_head *jh)
{
int ret = 0;
struct buffer_head *bh = jh2bh(jh);
if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
JBUFFER_TRACE(jh, "remove from checkpoint list");
ret = __journal_remove_checkpoint(jh) + 1;
jbd_unlock_bh_state(bh);
journal_remove_journal_head(bh);
BUFFER_TRACE(bh, "release");
__brelse(bh);
} else {
jbd_unlock_bh_state(bh);
}
return ret;
}
/*
* __log_wait_for_space: wait until there is space in the journal.
*
* Called under j-state_lock *only*. It will be unlocked if we have to wait
* for a checkpoint to free up some space in the log.
*/
void __log_wait_for_space(journal_t *journal)
{
int nblocks;
assert_spin_locked(&journal->j_state_lock);
nblocks = jbd_space_needed(journal);
while (__log_space_left(journal) < nblocks) {
if (journal->j_flags & JFS_ABORT)
return;
spin_unlock(&journal->j_state_lock);
mutex_lock(&journal->j_checkpoint_mutex);
/*
* Test again, another process may have checkpointed while we
* were waiting for the checkpoint lock
*/
spin_lock(&journal->j_state_lock);
nblocks = jbd_space_needed(journal);
if (__log_space_left(journal) < nblocks) {
spin_unlock(&journal->j_state_lock);
log_do_checkpoint(journal);
spin_lock(&journal->j_state_lock);
}
mutex_unlock(&journal->j_checkpoint_mutex);
}
}
/*
* We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
* The caller must restart a list walk. Wait for someone else to run
* jbd_unlock_bh_state().
*/
static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
__releases(journal->j_list_lock)
{
get_bh(bh);
spin_unlock(&journal->j_list_lock);
jbd_lock_bh_state(bh);
jbd_unlock_bh_state(bh);
put_bh(bh);
}
/*
* Clean up transaction's list of buffers submitted for io.
* We wait for any pending IO to complete and remove any clean
* buffers. Note that we take the buffers in the opposite ordering
* from the one in which they were submitted for IO.
*
* Called with j_list_lock held.
*/
static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
{
struct journal_head *jh;
struct buffer_head *bh;
tid_t this_tid;
int released = 0;
this_tid = transaction->t_tid;
restart:
/* Did somebody clean up the transaction in the meanwhile? */
if (journal->j_checkpoint_transactions != transaction ||
transaction->t_tid != this_tid)
return;
while (!released && transaction->t_checkpoint_io_list) {
jh = transaction->t_checkpoint_io_list;
bh = jh2bh(jh);
if (!jbd_trylock_bh_state(bh)) {
jbd_sync_bh(journal, bh);
spin_lock(&journal->j_list_lock);
goto restart;
}
if (buffer_locked(bh)) {
atomic_inc(&bh->b_count);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
wait_on_buffer(bh);
/* the journal_head may have gone by now */
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
spin_lock(&journal->j_list_lock);
goto restart;
}
/*
* Now in whatever state the buffer currently is, we know that
* it has been written out and so we can drop it from the list
*/
released = __journal_remove_checkpoint(jh);
jbd_unlock_bh_state(bh);
journal_remove_journal_head(bh);
__brelse(bh);
}
}
#define NR_BATCH 64
static void
__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
{
int i;
ll_rw_block(SWRITE, *batch_count, bhs);
for (i = 0; i < *batch_count; i++) {
struct buffer_head *bh = bhs[i];
clear_buffer_jwrite(bh);
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
}
*batch_count = 0;
}
/*
* Try to flush one buffer from the checkpoint list to disk.
*
* Return 1 if something happened which requires us to abort the current
* scan of the checkpoint list.
*
* Called with j_list_lock held and drops it if 1 is returned
* Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
*/
static int __process_buffer(journal_t *journal, struct journal_head *jh,
struct buffer_head **bhs, int *batch_count)
{
struct buffer_head *bh = jh2bh(jh);
int ret = 0;
if (buffer_locked(bh)) {
atomic_inc(&bh->b_count);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
wait_on_buffer(bh);
/* the journal_head may have gone by now */
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
ret = 1;
} else if (jh->b_transaction != NULL) {
transaction_t *t = jh->b_transaction;
tid_t tid = t->t_tid;
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
log_start_commit(journal, tid);
log_wait_commit(journal, tid);
ret = 1;
} else if (!buffer_dirty(bh)) {
J_ASSERT_JH(jh, !buffer_jbddirty(bh));
BUFFER_TRACE(bh, "remove from checkpoint");
__journal_remove_checkpoint(jh);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
journal_remove_journal_head(bh);
__brelse(bh);
ret = 1;
} else {
/*
* Important: we are about to write the buffer, and
* possibly block, while still holding the journal lock.
* We cannot afford to let the transaction logic start
* messing around with this buffer before we write it to
* disk, as that would break recoverability.
*/
BUFFER_TRACE(bh, "queue");
get_bh(bh);
J_ASSERT_BH(bh, !buffer_jwrite(bh));
set_buffer_jwrite(bh);
bhs[*batch_count] = bh;
__buffer_relink_io(jh);
jbd_unlock_bh_state(bh);
(*batch_count)++;
if (*batch_count == NR_BATCH) {
spin_unlock(&journal->j_list_lock);
__flush_batch(journal, bhs, batch_count);
ret = 1;
}
}
return ret;
}
/*
* Perform an actual checkpoint. We take the first transaction on the
* list of transactions to be checkpointed and send all its buffers
* to disk. We submit larger chunks of data at once.
*
* The journal should be locked before calling this function.
*/
int log_do_checkpoint(journal_t *journal)
{
transaction_t *transaction;
tid_t this_tid;
int result;
jbd_debug(1, "Start checkpoint\n");
/*
* First thing: if there are any transactions in the log which
* don't need checkpointing, just eliminate them from the
* journal straight away.
*/
result = cleanup_journal_tail(journal);
jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
if (result <= 0)
return result;
/*
* OK, we need to start writing disk blocks. Take one transaction
* and write it.
*/
spin_lock(&journal->j_list_lock);
if (!journal->j_checkpoint_transactions)
goto out;
transaction = journal->j_checkpoint_transactions;
this_tid = transaction->t_tid;
restart:
/*
* If someone cleaned up this transaction while we slept, we're
* done (maybe it's a new transaction, but it fell at the same
* address).
*/
if (journal->j_checkpoint_transactions == transaction &&
transaction->t_tid == this_tid) {
int batch_count = 0;
struct buffer_head *bhs[NR_BATCH];
struct journal_head *jh;
int retry = 0;
while (!retry && transaction->t_checkpoint_list) {
struct buffer_head *bh;
jh = transaction->t_checkpoint_list;
bh = jh2bh(jh);
if (!jbd_trylock_bh_state(bh)) {
jbd_sync_bh(journal, bh);
retry = 1;
break;
}
retry = __process_buffer(journal, jh, bhs,&batch_count);
if (!retry && lock_need_resched(&journal->j_list_lock)){
spin_unlock(&journal->j_list_lock);
retry = 1;
break;
}
}
if (batch_count) {
if (!retry) {
spin_unlock(&journal->j_list_lock);
retry = 1;
}
__flush_batch(journal, bhs, &batch_count);
}
if (retry) {
spin_lock(&journal->j_list_lock);
goto restart;
}
/*
* Now we have cleaned up the first transaction's checkpoint
* list. Let's clean up the second one
*/
__wait_cp_io(journal, transaction);
}
out:
spin_unlock(&journal->j_list_lock);
result = cleanup_journal_tail(journal);
if (result < 0)
return result;
return 0;
}
/*
* Check the list of checkpoint transactions for the journal to see if
* we have already got rid of any since the last update of the log tail
* in the journal superblock. If so, we can instantly roll the
* superblock forward to remove those transactions from the log.
*
* Return <0 on error, 0 on success, 1 if there was nothing to clean up.
*
* Called with the journal lock held.
*
* This is the only part of the journaling code which really needs to be
* aware of transaction aborts. Checkpointing involves writing to the
* main filesystem area rather than to the journal, so it can proceed
* even in abort state, but we must not update the journal superblock if
* we have an abort error outstanding.
*/
int cleanup_journal_tail(journal_t *journal)
{
transaction_t * transaction;
tid_t first_tid;
unsigned long blocknr, freed;
/* OK, work out the oldest transaction remaining in the log, and
* the log block it starts at.
*
* If the log is now empty, we need to work out which is the
* next transaction ID we will write, and where it will
* start. */
spin_lock(&journal->j_state_lock);
spin_lock(&journal->j_list_lock);
transaction = journal->j_checkpoint_transactions;
if (transaction) {
first_tid = transaction->t_tid;
blocknr = transaction->t_log_start;
} else if ((transaction = journal->j_committing_transaction) != NULL) {
first_tid = transaction->t_tid;
blocknr = transaction->t_log_start;
} else if ((transaction = journal->j_running_transaction) != NULL) {
first_tid = transaction->t_tid;
blocknr = journal->j_head;
} else {
first_tid = journal->j_transaction_sequence;
blocknr = journal->j_head;
}
spin_unlock(&journal->j_list_lock);
J_ASSERT(blocknr != 0);
/* If the oldest pinned transaction is at the tail of the log
already then there's not much we can do right now. */
if (journal->j_tail_sequence == first_tid) {
spin_unlock(&journal->j_state_lock);
return 1;
}
/* OK, update the superblock to recover the freed space.
* Physical blocks come first: have we wrapped beyond the end of
* the log? */
freed = blocknr - journal->j_tail;
if (blocknr < journal->j_tail)
freed = freed + journal->j_last - journal->j_first;
jbd_debug(1,
"Cleaning journal tail from %d to %d (offset %lu), "
"freeing %lu\n",
journal->j_tail_sequence, first_tid, blocknr, freed);
journal->j_free += freed;
journal->j_tail_sequence = first_tid;
journal->j_tail = blocknr;
spin_unlock(&journal->j_state_lock);
if (!(journal->j_flags & JFS_ABORT))
journal_update_superblock(journal, 1);
return 0;
}
/* Checkpoint list management */
/*
* journal_clean_one_cp_list
*
* Find all the written-back checkpoint buffers in the given list and release them.
*
* Called with the journal locked.
* Called with j_list_lock held.
* Returns number of bufers reaped (for debug)
*/
static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
{
struct journal_head *last_jh;
struct journal_head *next_jh = jh;
int ret, freed = 0;
*released = 0;
if (!jh)
return 0;
last_jh = jh->b_cpprev;
do {
jh = next_jh;
next_jh = jh->b_cpnext;
/* Use trylock because of the ranking */
if (jbd_trylock_bh_state(jh2bh(jh))) {
ret = __try_to_free_cp_buf(jh);
if (ret) {
freed++;
if (ret == 2) {
*released = 1;
return freed;
}
}
}
/*
* This function only frees up some memory
* if possible so we dont have an obligation
* to finish processing. Bail out if preemption
* requested:
*/
if (need_resched())
return freed;
} while (jh != last_jh);
return freed;
}
/*
* journal_clean_checkpoint_list
*
* Find all the written-back checkpoint buffers in the journal and release them.
*
* Called with the journal locked.
* Called with j_list_lock held.
* Returns number of buffers reaped (for debug)
*/
int __journal_clean_checkpoint_list(journal_t *journal)
{
transaction_t *transaction, *last_transaction, *next_transaction;
int ret = 0;
int released;
transaction = journal->j_checkpoint_transactions;
if (!transaction)
goto out;
last_transaction = transaction->t_cpprev;
next_transaction = transaction;
do {
transaction = next_transaction;
next_transaction = transaction->t_cpnext;
ret += journal_clean_one_cp_list(transaction->
t_checkpoint_list, &released);
/*
* This function only frees up some memory if possible so we
* dont have an obligation to finish processing. Bail out if
* preemption requested:
*/
if (need_resched())
goto out;
if (released)
continue;
/*
* It is essential that we are as careful as in the case of
* t_checkpoint_list with removing the buffer from the list as
* we can possibly see not yet submitted buffers on io_list
*/
ret += journal_clean_one_cp_list(transaction->
t_checkpoint_io_list, &released);
if (need_resched())
goto out;
} while (transaction != last_transaction);
out:
return ret;
}
/*
* journal_remove_checkpoint: called after a buffer has been committed
* to disk (either by being write-back flushed to disk, or being
* committed to the log).
*
* We cannot safely clean a transaction out of the log until all of the
* buffer updates committed in that transaction have safely been stored
* elsewhere on disk. To achieve this, all of the buffers in a
* transaction need to be maintained on the transaction's checkpoint
* lists until they have been rewritten, at which point this function is
* called to remove the buffer from the existing transaction's
* checkpoint lists.
*
* The function returns 1 if it frees the transaction, 0 otherwise.
*
* This function is called with the journal locked.
* This function is called with j_list_lock held.
* This function is called with jbd_lock_bh_state(jh2bh(jh))
*/
int __journal_remove_checkpoint(struct journal_head *jh)
{
transaction_t *transaction;
journal_t *journal;
int ret = 0;
JBUFFER_TRACE(jh, "entry");
if ((transaction = jh->b_cp_transaction) == NULL) {
JBUFFER_TRACE(jh, "not on transaction");
goto out;
}
journal = transaction->t_journal;
__buffer_unlink(jh);
jh->b_cp_transaction = NULL;
if (transaction->t_checkpoint_list != NULL ||
transaction->t_checkpoint_io_list != NULL)
goto out;
JBUFFER_TRACE(jh, "transaction has no more buffers");
/*
* There is one special case to worry about: if we have just pulled the
* buffer off a committing transaction's forget list, then even if the
* checkpoint list is empty, the transaction obviously cannot be
* dropped!
*
* The locking here around j_committing_transaction is a bit sleazy.
* See the comment at the end of journal_commit_transaction().
*/
if (transaction == journal->j_committing_transaction) {
JBUFFER_TRACE(jh, "belongs to committing transaction");
goto out;
}
/* OK, that was the last buffer for the transaction: we can now
safely remove this transaction from the log */
__journal_drop_transaction(journal, transaction);
/* Just in case anybody was waiting for more transactions to be
checkpointed... */
wake_up(&journal->j_wait_logspace);
ret = 1;
out:
JBUFFER_TRACE(jh, "exit");
return ret;
}
/*
* journal_insert_checkpoint: put a committed buffer onto a checkpoint
* list so that we know when it is safe to clean the transaction out of
* the log.
*
* Called with the journal locked.
* Called with j_list_lock held.
*/
void __journal_insert_checkpoint(struct journal_head *jh,
transaction_t *transaction)
{
JBUFFER_TRACE(jh, "entry");
J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
jh->b_cp_transaction = transaction;
if (!transaction->t_checkpoint_list) {
jh->b_cpnext = jh->b_cpprev = jh;
} else {
jh->b_cpnext = transaction->t_checkpoint_list;
jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
jh->b_cpprev->b_cpnext = jh;
jh->b_cpnext->b_cpprev = jh;
}
transaction->t_checkpoint_list = jh;
}
/*
* We've finished with this transaction structure: adios...
*
* The transaction must have no links except for the checkpoint by this
* point.
*
* Called with the journal locked.
* Called with j_list_lock held.
*/
void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
{
assert_spin_locked(&journal->j_list_lock);
if (transaction->t_cpnext) {
transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
if (journal->j_checkpoint_transactions == transaction)
journal->j_checkpoint_transactions =
transaction->t_cpnext;
if (journal->j_checkpoint_transactions == transaction)
journal->j_checkpoint_transactions = NULL;
}
J_ASSERT(transaction->t_state == T_FINISHED);
J_ASSERT(transaction->t_buffers == NULL);
J_ASSERT(transaction->t_sync_datalist == NULL);
J_ASSERT(transaction->t_forget == NULL);
J_ASSERT(transaction->t_iobuf_list == NULL);
J_ASSERT(transaction->t_shadow_list == NULL);
J_ASSERT(transaction->t_log_list == NULL);
J_ASSERT(transaction->t_checkpoint_list == NULL);
J_ASSERT(transaction->t_checkpoint_io_list == NULL);
J_ASSERT(transaction->t_updates == 0);
J_ASSERT(journal->j_committing_transaction != transaction);
J_ASSERT(journal->j_running_transaction != transaction);
jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
kfree(transaction);
}

911
fs/jbd2/commit.c Normal file
View file

@ -0,0 +1,911 @@
/*
* linux/fs/jbd/commit.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 1998
*
* Copyright 1998 Red Hat corp --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Journal commit routines for the generic filesystem journaling code;
* part of the ext2fs journaling system.
*/
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
/*
* Default IO end handler for temporary BJ_IO buffer_heads.
*/
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
BUFFER_TRACE(bh, "");
if (uptodate)
set_buffer_uptodate(bh);
else
clear_buffer_uptodate(bh);
unlock_buffer(bh);
}
/*
* When an ext3-ordered file is truncated, it is possible that many pages are
* not sucessfully freed, because they are attached to a committing transaction.
* After the transaction commits, these pages are left on the LRU, with no
* ->mapping, and with attached buffers. These pages are trivially reclaimable
* by the VM, but their apparent absence upsets the VM accounting, and it makes
* the numbers in /proc/meminfo look odd.
*
* So here, we have a buffer which has just come off the forget list. Look to
* see if we can strip all buffers from the backing page.
*
* Called under lock_journal(), and possibly under journal_datalist_lock. The
* caller provided us with a ref against the buffer, and we drop that here.
*/
static void release_buffer_page(struct buffer_head *bh)
{
struct page *page;
if (buffer_dirty(bh))
goto nope;
if (atomic_read(&bh->b_count) != 1)
goto nope;
page = bh->b_page;
if (!page)
goto nope;
if (page->mapping)
goto nope;
/* OK, it's a truncated page */
if (TestSetPageLocked(page))
goto nope;
page_cache_get(page);
__brelse(bh);
try_to_free_buffers(page);
unlock_page(page);
page_cache_release(page);
return;
nope:
__brelse(bh);
}
/*
* Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
* held. For ranking reasons we must trylock. If we lose, schedule away and
* return 0. j_list_lock is dropped in this case.
*/
static int inverted_lock(journal_t *journal, struct buffer_head *bh)
{
if (!jbd_trylock_bh_state(bh)) {
spin_unlock(&journal->j_list_lock);
schedule();
return 0;
}
return 1;
}
/* Done it all: now write the commit record. We should have
* cleaned up our previous buffers by now, so if we are in abort
* mode we can now just skip the rest of the journal write
* entirely.
*
* Returns 1 if the journal needs to be aborted or 0 on success
*/
static int journal_write_commit_record(journal_t *journal,
transaction_t *commit_transaction)
{
struct journal_head *descriptor;
struct buffer_head *bh;
int i, ret;
int barrier_done = 0;
if (is_journal_aborted(journal))
return 0;
descriptor = journal_get_descriptor_buffer(journal);
if (!descriptor)
return 1;
bh = jh2bh(descriptor);
/* AKPM: buglet - add `i' to tmp! */
for (i = 0; i < bh->b_size; i += 512) {
journal_header_t *tmp = (journal_header_t*)bh->b_data;
tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
}
JBUFFER_TRACE(descriptor, "write commit block");
set_buffer_dirty(bh);
if (journal->j_flags & JFS_BARRIER) {
set_buffer_ordered(bh);
barrier_done = 1;
}
ret = sync_dirty_buffer(bh);
/* is it possible for another commit to fail at roughly
* the same time as this one? If so, we don't want to
* trust the barrier flag in the super, but instead want
* to remember if we sent a barrier request
*/
if (ret == -EOPNOTSUPP && barrier_done) {
char b[BDEVNAME_SIZE];
printk(KERN_WARNING
"JBD: barrier-based sync failed on %s - "
"disabling barriers\n",
bdevname(journal->j_dev, b));
spin_lock(&journal->j_state_lock);
journal->j_flags &= ~JFS_BARRIER;
spin_unlock(&journal->j_state_lock);
/* And try again, without the barrier */
clear_buffer_ordered(bh);
set_buffer_uptodate(bh);
set_buffer_dirty(bh);
ret = sync_dirty_buffer(bh);
}
put_bh(bh); /* One for getblk() */
journal_put_journal_head(descriptor);
return (ret == -EIO);
}
static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
{
int i;
for (i = 0; i < bufs; i++) {
wbuf[i]->b_end_io = end_buffer_write_sync;
/* We use-up our safety reference in submit_bh() */
submit_bh(WRITE, wbuf[i]);
}
}
/*
* Submit all the data buffers to disk
*/
static void journal_submit_data_buffers(journal_t *journal,
transaction_t *commit_transaction)
{
struct journal_head *jh;
struct buffer_head *bh;
int locked;
int bufs = 0;
struct buffer_head **wbuf = journal->j_wbuf;
/*
* Whenever we unlock the journal and sleep, things can get added
* onto ->t_sync_datalist, so we have to keep looping back to
* write_out_data until we *know* that the list is empty.
*
* Cleanup any flushed data buffers from the data list. Even in
* abort mode, we want to flush this out as soon as possible.
*/
write_out_data:
cond_resched();
spin_lock(&journal->j_list_lock);
while (commit_transaction->t_sync_datalist) {
jh = commit_transaction->t_sync_datalist;
bh = jh2bh(jh);
locked = 0;
/* Get reference just to make sure buffer does not disappear
* when we are forced to drop various locks */
get_bh(bh);
/* If the buffer is dirty, we need to submit IO and hence
* we need the buffer lock. We try to lock the buffer without
* blocking. If we fail, we need to drop j_list_lock and do
* blocking lock_buffer().
*/
if (buffer_dirty(bh)) {
if (test_set_buffer_locked(bh)) {
BUFFER_TRACE(bh, "needs blocking lock");
spin_unlock(&journal->j_list_lock);
/* Write out all data to prevent deadlocks */
journal_do_submit_data(wbuf, bufs);
bufs = 0;
lock_buffer(bh);
spin_lock(&journal->j_list_lock);
}
locked = 1;
}
/* We have to get bh_state lock. Again out of order, sigh. */
if (!inverted_lock(journal, bh)) {
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
}
/* Someone already cleaned up the buffer? */
if (!buffer_jbd(bh)
|| jh->b_transaction != commit_transaction
|| jh->b_jlist != BJ_SyncData) {
jbd_unlock_bh_state(bh);
if (locked)
unlock_buffer(bh);
BUFFER_TRACE(bh, "already cleaned up");
put_bh(bh);
continue;
}
if (locked && test_clear_buffer_dirty(bh)) {
BUFFER_TRACE(bh, "needs writeout, adding to array");
wbuf[bufs++] = bh;
__journal_file_buffer(jh, commit_transaction,
BJ_Locked);
jbd_unlock_bh_state(bh);
if (bufs == journal->j_wbufsize) {
spin_unlock(&journal->j_list_lock);
journal_do_submit_data(wbuf, bufs);
bufs = 0;
goto write_out_data;
}
}
else {
BUFFER_TRACE(bh, "writeout complete: unfile");
__journal_unfile_buffer(jh);
jbd_unlock_bh_state(bh);
if (locked)
unlock_buffer(bh);
journal_remove_journal_head(bh);
/* Once for our safety reference, once for
* journal_remove_journal_head() */
put_bh(bh);
put_bh(bh);
}
if (lock_need_resched(&journal->j_list_lock)) {
spin_unlock(&journal->j_list_lock);
goto write_out_data;
}
}
spin_unlock(&journal->j_list_lock);
journal_do_submit_data(wbuf, bufs);
}
/*
* journal_commit_transaction
*
* The primary function for committing a transaction to the log. This
* function is called by the journal thread to begin a complete commit.
*/
void journal_commit_transaction(journal_t *journal)
{
transaction_t *commit_transaction;
struct journal_head *jh, *new_jh, *descriptor;
struct buffer_head **wbuf = journal->j_wbuf;
int bufs;
int flags;
int err;
unsigned long blocknr;
char *tagp = NULL;
journal_header_t *header;
journal_block_tag_t *tag = NULL;
int space_left = 0;
int first_tag = 0;
int tag_flag;
int i;
/*
* First job: lock down the current transaction and wait for
* all outstanding updates to complete.
*/
#ifdef COMMIT_STATS
spin_lock(&journal->j_list_lock);
summarise_journal_usage(journal);
spin_unlock(&journal->j_list_lock);
#endif
/* Do we need to erase the effects of a prior journal_flush? */
if (journal->j_flags & JFS_FLUSHED) {
jbd_debug(3, "super block updated\n");
journal_update_superblock(journal, 1);
} else {
jbd_debug(3, "superblock not updated\n");
}
J_ASSERT(journal->j_running_transaction != NULL);
J_ASSERT(journal->j_committing_transaction == NULL);
commit_transaction = journal->j_running_transaction;
J_ASSERT(commit_transaction->t_state == T_RUNNING);
jbd_debug(1, "JBD: starting commit of transaction %d\n",
commit_transaction->t_tid);
spin_lock(&journal->j_state_lock);
commit_transaction->t_state = T_LOCKED;
spin_lock(&commit_transaction->t_handle_lock);
while (commit_transaction->t_updates) {
DEFINE_WAIT(wait);
prepare_to_wait(&journal->j_wait_updates, &wait,
TASK_UNINTERRUPTIBLE);
if (commit_transaction->t_updates) {
spin_unlock(&commit_transaction->t_handle_lock);
spin_unlock(&journal->j_state_lock);
schedule();
spin_lock(&journal->j_state_lock);
spin_lock(&commit_transaction->t_handle_lock);
}
finish_wait(&journal->j_wait_updates, &wait);
}
spin_unlock(&commit_transaction->t_handle_lock);
J_ASSERT (commit_transaction->t_outstanding_credits <=
journal->j_max_transaction_buffers);
/*
* First thing we are allowed to do is to discard any remaining
* BJ_Reserved buffers. Note, it is _not_ permissible to assume
* that there are no such buffers: if a large filesystem
* operation like a truncate needs to split itself over multiple
* transactions, then it may try to do a journal_restart() while
* there are still BJ_Reserved buffers outstanding. These must
* be released cleanly from the current transaction.
*
* In this case, the filesystem must still reserve write access
* again before modifying the buffer in the new transaction, but
* we do not require it to remember exactly which old buffers it
* has reserved. This is consistent with the existing behaviour
* that multiple journal_get_write_access() calls to the same
* buffer are perfectly permissable.
*/
while (commit_transaction->t_reserved_list) {
jh = commit_transaction->t_reserved_list;
JBUFFER_TRACE(jh, "reserved, unused: refile");
/*
* A journal_get_undo_access()+journal_release_buffer() may
* leave undo-committed data.
*/
if (jh->b_committed_data) {
struct buffer_head *bh = jh2bh(jh);
jbd_lock_bh_state(bh);
jbd_slab_free(jh->b_committed_data, bh->b_size);
jh->b_committed_data = NULL;
jbd_unlock_bh_state(bh);
}
journal_refile_buffer(journal, jh);
}
/*
* Now try to drop any written-back buffers from the journal's
* checkpoint lists. We do this *before* commit because it potentially
* frees some memory
*/
spin_lock(&journal->j_list_lock);
__journal_clean_checkpoint_list(journal);
spin_unlock(&journal->j_list_lock);
jbd_debug (3, "JBD: commit phase 1\n");
/*
* Switch to a new revoke table.
*/
journal_switch_revoke_table(journal);
commit_transaction->t_state = T_FLUSH;
journal->j_committing_transaction = commit_transaction;
journal->j_running_transaction = NULL;
commit_transaction->t_log_start = journal->j_head;
wake_up(&journal->j_wait_transaction_locked);
spin_unlock(&journal->j_state_lock);
jbd_debug (3, "JBD: commit phase 2\n");
/*
* First, drop modified flag: all accesses to the buffers
* will be tracked for a new trasaction only -bzzz
*/
spin_lock(&journal->j_list_lock);
if (commit_transaction->t_buffers) {
new_jh = jh = commit_transaction->t_buffers->b_tnext;
do {
J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
new_jh->b_modified == 0);
new_jh->b_modified = 0;
new_jh = new_jh->b_tnext;
} while (new_jh != jh);
}
spin_unlock(&journal->j_list_lock);
/*
* Now start flushing things to disk, in the order they appear
* on the transaction lists. Data blocks go first.
*/
err = 0;
journal_submit_data_buffers(journal, commit_transaction);
/*
* Wait for all previously submitted IO to complete.
*/
spin_lock(&journal->j_list_lock);
while (commit_transaction->t_locked_list) {
struct buffer_head *bh;
jh = commit_transaction->t_locked_list->b_tprev;
bh = jh2bh(jh);
get_bh(bh);
if (buffer_locked(bh)) {
spin_unlock(&journal->j_list_lock);
wait_on_buffer(bh);
if (unlikely(!buffer_uptodate(bh)))
err = -EIO;
spin_lock(&journal->j_list_lock);
}
if (!inverted_lock(journal, bh)) {
put_bh(bh);
spin_lock(&journal->j_list_lock);
continue;
}
if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
__journal_unfile_buffer(jh);
jbd_unlock_bh_state(bh);
journal_remove_journal_head(bh);
put_bh(bh);
} else {
jbd_unlock_bh_state(bh);
}
put_bh(bh);
cond_resched_lock(&journal->j_list_lock);
}
spin_unlock(&journal->j_list_lock);
if (err)
__journal_abort_hard(journal);
journal_write_revoke_records(journal, commit_transaction);
jbd_debug(3, "JBD: commit phase 2\n");
/*
* If we found any dirty or locked buffers, then we should have
* looped back up to the write_out_data label. If there weren't
* any then journal_clean_data_list should have wiped the list
* clean by now, so check that it is in fact empty.
*/
J_ASSERT (commit_transaction->t_sync_datalist == NULL);
jbd_debug (3, "JBD: commit phase 3\n");
/*
* Way to go: we have now written out all of the data for a
* transaction! Now comes the tricky part: we need to write out
* metadata. Loop over the transaction's entire buffer list:
*/
commit_transaction->t_state = T_COMMIT;
descriptor = NULL;
bufs = 0;
while (commit_transaction->t_buffers) {
/* Find the next buffer to be journaled... */
jh = commit_transaction->t_buffers;
/* If we're in abort mode, we just un-journal the buffer and
release it for background writing. */
if (is_journal_aborted(journal)) {
JBUFFER_TRACE(jh, "journal is aborting: refile");
journal_refile_buffer(journal, jh);
/* If that was the last one, we need to clean up
* any descriptor buffers which may have been
* already allocated, even if we are now
* aborting. */
if (!commit_transaction->t_buffers)
goto start_journal_io;
continue;
}
/* Make sure we have a descriptor block in which to
record the metadata buffer. */
if (!descriptor) {
struct buffer_head *bh;
J_ASSERT (bufs == 0);
jbd_debug(4, "JBD: get descriptor\n");
descriptor = journal_get_descriptor_buffer(journal);
if (!descriptor) {
__journal_abort_hard(journal);
continue;
}
bh = jh2bh(descriptor);
jbd_debug(4, "JBD: got buffer %llu (%p)\n",
(unsigned long long)bh->b_blocknr, bh->b_data);
header = (journal_header_t *)&bh->b_data[0];
header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
tagp = &bh->b_data[sizeof(journal_header_t)];
space_left = bh->b_size - sizeof(journal_header_t);
first_tag = 1;
set_buffer_jwrite(bh);
set_buffer_dirty(bh);
wbuf[bufs++] = bh;
/* Record it so that we can wait for IO
completion later */
BUFFER_TRACE(bh, "ph3: file as descriptor");
journal_file_buffer(descriptor, commit_transaction,
BJ_LogCtl);
}
/* Where is the buffer to be written? */
err = journal_next_log_block(journal, &blocknr);
/* If the block mapping failed, just abandon the buffer
and repeat this loop: we'll fall into the
refile-on-abort condition above. */
if (err) {
__journal_abort_hard(journal);
continue;
}
/*
* start_this_handle() uses t_outstanding_credits to determine
* the free space in the log, but this counter is changed
* by journal_next_log_block() also.
*/
commit_transaction->t_outstanding_credits--;
/* Bump b_count to prevent truncate from stumbling over
the shadowed buffer! @@@ This can go if we ever get
rid of the BJ_IO/BJ_Shadow pairing of buffers. */
atomic_inc(&jh2bh(jh)->b_count);
/* Make a temporary IO buffer with which to write it out
(this will requeue both the metadata buffer and the
temporary IO buffer). new_bh goes on BJ_IO*/
set_bit(BH_JWrite, &jh2bh(jh)->b_state);
/*
* akpm: journal_write_metadata_buffer() sets
* new_bh->b_transaction to commit_transaction.
* We need to clean this up before we release new_bh
* (which is of type BJ_IO)
*/
JBUFFER_TRACE(jh, "ph3: write metadata");
flags = journal_write_metadata_buffer(commit_transaction,
jh, &new_jh, blocknr);
set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
wbuf[bufs++] = jh2bh(new_jh);
/* Record the new block's tag in the current descriptor
buffer */
tag_flag = 0;
if (flags & 1)
tag_flag |= JFS_FLAG_ESCAPE;
if (!first_tag)
tag_flag |= JFS_FLAG_SAME_UUID;
tag = (journal_block_tag_t *) tagp;
tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
tag->t_flags = cpu_to_be32(tag_flag);
tagp += sizeof(journal_block_tag_t);
space_left -= sizeof(journal_block_tag_t);
if (first_tag) {
memcpy (tagp, journal->j_uuid, 16);
tagp += 16;
space_left -= 16;
first_tag = 0;
}
/* If there's no more to do, or if the descriptor is full,
let the IO rip! */
if (bufs == journal->j_wbufsize ||
commit_transaction->t_buffers == NULL ||
space_left < sizeof(journal_block_tag_t) + 16) {
jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
/* Write an end-of-descriptor marker before
submitting the IOs. "tag" still points to
the last tag we set up. */
tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
start_journal_io:
for (i = 0; i < bufs; i++) {
struct buffer_head *bh = wbuf[i];
lock_buffer(bh);
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
bh->b_end_io = journal_end_buffer_io_sync;
submit_bh(WRITE, bh);
}
cond_resched();
/* Force a new descriptor to be generated next
time round the loop. */
descriptor = NULL;
bufs = 0;
}
}
/* Lo and behold: we have just managed to send a transaction to
the log. Before we can commit it, wait for the IO so far to
complete. Control buffers being written are on the
transaction's t_log_list queue, and metadata buffers are on
the t_iobuf_list queue.
Wait for the buffers in reverse order. That way we are
less likely to be woken up until all IOs have completed, and
so we incur less scheduling load.
*/
jbd_debug(3, "JBD: commit phase 4\n");
/*
* akpm: these are BJ_IO, and j_list_lock is not needed.
* See __journal_try_to_free_buffer.
*/
wait_for_iobuf:
while (commit_transaction->t_iobuf_list != NULL) {
struct buffer_head *bh;
jh = commit_transaction->t_iobuf_list->b_tprev;
bh = jh2bh(jh);
if (buffer_locked(bh)) {
wait_on_buffer(bh);
goto wait_for_iobuf;
}
if (cond_resched())
goto wait_for_iobuf;
if (unlikely(!buffer_uptodate(bh)))
err = -EIO;
clear_buffer_jwrite(bh);
JBUFFER_TRACE(jh, "ph4: unfile after journal write");
journal_unfile_buffer(journal, jh);
/*
* ->t_iobuf_list should contain only dummy buffer_heads
* which were created by journal_write_metadata_buffer().
*/
BUFFER_TRACE(bh, "dumping temporary bh");
journal_put_journal_head(jh);
__brelse(bh);
J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
free_buffer_head(bh);
/* We also have to unlock and free the corresponding
shadowed buffer */
jh = commit_transaction->t_shadow_list->b_tprev;
bh = jh2bh(jh);
clear_bit(BH_JWrite, &bh->b_state);
J_ASSERT_BH(bh, buffer_jbddirty(bh));
/* The metadata is now released for reuse, but we need
to remember it against this transaction so that when
we finally commit, we can do any checkpointing
required. */
JBUFFER_TRACE(jh, "file as BJ_Forget");
journal_file_buffer(jh, commit_transaction, BJ_Forget);
/* Wake up any transactions which were waiting for this
IO to complete */
wake_up_bit(&bh->b_state, BH_Unshadow);
JBUFFER_TRACE(jh, "brelse shadowed buffer");
__brelse(bh);
}
J_ASSERT (commit_transaction->t_shadow_list == NULL);
jbd_debug(3, "JBD: commit phase 5\n");
/* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
while (commit_transaction->t_log_list != NULL) {
struct buffer_head *bh;
jh = commit_transaction->t_log_list->b_tprev;
bh = jh2bh(jh);
if (buffer_locked(bh)) {
wait_on_buffer(bh);
goto wait_for_ctlbuf;
}
if (cond_resched())
goto wait_for_ctlbuf;
if (unlikely(!buffer_uptodate(bh)))
err = -EIO;
BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
clear_buffer_jwrite(bh);
journal_unfile_buffer(journal, jh);
journal_put_journal_head(jh);
__brelse(bh); /* One for getblk */
/* AKPM: bforget here */
}
jbd_debug(3, "JBD: commit phase 6\n");
if (journal_write_commit_record(journal, commit_transaction))
err = -EIO;
if (err)
__journal_abort_hard(journal);
/* End of a transaction! Finally, we can do checkpoint
processing: any buffers committed as a result of this
transaction can be removed from any checkpoint list it was on
before. */
jbd_debug(3, "JBD: commit phase 7\n");
J_ASSERT(commit_transaction->t_sync_datalist == NULL);
J_ASSERT(commit_transaction->t_buffers == NULL);
J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
J_ASSERT(commit_transaction->t_iobuf_list == NULL);
J_ASSERT(commit_transaction->t_shadow_list == NULL);
J_ASSERT(commit_transaction->t_log_list == NULL);
restart_loop:
/*
* As there are other places (journal_unmap_buffer()) adding buffers
* to this list we have to be careful and hold the j_list_lock.
*/
spin_lock(&journal->j_list_lock);
while (commit_transaction->t_forget) {
transaction_t *cp_transaction;
struct buffer_head *bh;
jh = commit_transaction->t_forget;
spin_unlock(&journal->j_list_lock);
bh = jh2bh(jh);
jbd_lock_bh_state(bh);
J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
jh->b_transaction == journal->j_running_transaction);
/*
* If there is undo-protected committed data against
* this buffer, then we can remove it now. If it is a
* buffer needing such protection, the old frozen_data
* field now points to a committed version of the
* buffer, so rotate that field to the new committed
* data.
*
* Otherwise, we can just throw away the frozen data now.
*/
if (jh->b_committed_data) {
jbd_slab_free(jh->b_committed_data, bh->b_size);
jh->b_committed_data = NULL;
if (jh->b_frozen_data) {
jh->b_committed_data = jh->b_frozen_data;
jh->b_frozen_data = NULL;
}
} else if (jh->b_frozen_data) {
jbd_slab_free(jh->b_frozen_data, bh->b_size);
jh->b_frozen_data = NULL;
}
spin_lock(&journal->j_list_lock);
cp_transaction = jh->b_cp_transaction;
if (cp_transaction) {
JBUFFER_TRACE(jh, "remove from old cp transaction");
__journal_remove_checkpoint(jh);
}
/* Only re-checkpoint the buffer_head if it is marked
* dirty. If the buffer was added to the BJ_Forget list
* by journal_forget, it may no longer be dirty and
* there's no point in keeping a checkpoint record for
* it. */
/* A buffer which has been freed while still being
* journaled by a previous transaction may end up still
* being dirty here, but we want to avoid writing back
* that buffer in the future now that the last use has
* been committed. That's not only a performance gain,
* it also stops aliasing problems if the buffer is left
* behind for writeback and gets reallocated for another
* use in a different page. */
if (buffer_freed(bh)) {
clear_buffer_freed(bh);
clear_buffer_jbddirty(bh);
}
if (buffer_jbddirty(bh)) {
JBUFFER_TRACE(jh, "add to new checkpointing trans");
__journal_insert_checkpoint(jh, commit_transaction);
JBUFFER_TRACE(jh, "refile for checkpoint writeback");
__journal_refile_buffer(jh);
jbd_unlock_bh_state(bh);
} else {
J_ASSERT_BH(bh, !buffer_dirty(bh));
/* The buffer on BJ_Forget list and not jbddirty means
* it has been freed by this transaction and hence it
* could not have been reallocated until this
* transaction has committed. *BUT* it could be
* reallocated once we have written all the data to
* disk and before we process the buffer on BJ_Forget
* list. */
JBUFFER_TRACE(jh, "refile or unfile freed buffer");
__journal_refile_buffer(jh);
if (!jh->b_transaction) {
jbd_unlock_bh_state(bh);
/* needs a brelse */
journal_remove_journal_head(bh);
release_buffer_page(bh);
} else
jbd_unlock_bh_state(bh);
}
cond_resched_lock(&journal->j_list_lock);
}
spin_unlock(&journal->j_list_lock);
/*
* This is a bit sleazy. We borrow j_list_lock to protect
* journal->j_committing_transaction in __journal_remove_checkpoint.
* Really, __journal_remove_checkpoint should be using j_state_lock but
* it's a bit hassle to hold that across __journal_remove_checkpoint
*/
spin_lock(&journal->j_state_lock);
spin_lock(&journal->j_list_lock);
/*
* Now recheck if some buffers did not get attached to the transaction
* while the lock was dropped...
*/
if (commit_transaction->t_forget) {
spin_unlock(&journal->j_list_lock);
spin_unlock(&journal->j_state_lock);
goto restart_loop;
}
/* Done with this transaction! */
jbd_debug(3, "JBD: commit phase 8\n");
J_ASSERT(commit_transaction->t_state == T_COMMIT);
commit_transaction->t_state = T_FINISHED;
J_ASSERT(commit_transaction == journal->j_committing_transaction);
journal->j_commit_sequence = commit_transaction->t_tid;
journal->j_committing_transaction = NULL;
spin_unlock(&journal->j_state_lock);
if (commit_transaction->t_checkpoint_list == NULL) {
__journal_drop_transaction(journal, commit_transaction);
} else {
if (journal->j_checkpoint_transactions == NULL) {
journal->j_checkpoint_transactions = commit_transaction;
commit_transaction->t_cpnext = commit_transaction;
commit_transaction->t_cpprev = commit_transaction;
} else {
commit_transaction->t_cpnext =
journal->j_checkpoint_transactions;
commit_transaction->t_cpprev =
commit_transaction->t_cpnext->t_cpprev;
commit_transaction->t_cpnext->t_cpprev =
commit_transaction;
commit_transaction->t_cpprev->t_cpnext =
commit_transaction;
}
}
spin_unlock(&journal->j_list_lock);
jbd_debug(1, "JBD: commit %d complete, head %d\n",
journal->j_commit_sequence, journal->j_tail_sequence);
wake_up(&journal->j_wait_done_commit);
}

2072
fs/jbd2/journal.c Normal file

File diff suppressed because it is too large Load diff

592
fs/jbd2/recovery.c Normal file
View file

@ -0,0 +1,592 @@
/*
* linux/fs/recovery.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 1999
*
* Copyright 1999-2000 Red Hat Software --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Journal recovery routines for the generic filesystem journaling code;
* part of the ext2fs journaling system.
*/
#ifndef __KERNEL__
#include "jfs_user.h"
#else
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/slab.h>
#endif
/*
* Maintain information about the progress of the recovery job, so that
* the different passes can carry information between them.
*/
struct recovery_info
{
tid_t start_transaction;
tid_t end_transaction;
int nr_replays;
int nr_revokes;
int nr_revoke_hits;
};
enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass);
static int scan_revoke_records(journal_t *, struct buffer_head *,
tid_t, struct recovery_info *);
#ifdef __KERNEL__
/* Release readahead buffers after use */
static void journal_brelse_array(struct buffer_head *b[], int n)
{
while (--n >= 0)
brelse (b[n]);
}
/*
* When reading from the journal, we are going through the block device
* layer directly and so there is no readahead being done for us. We
* need to implement any readahead ourselves if we want it to happen at
* all. Recovery is basically one long sequential read, so make sure we
* do the IO in reasonably large chunks.
*
* This is not so critical that we need to be enormously clever about
* the readahead size, though. 128K is a purely arbitrary, good-enough
* fixed value.
*/
#define MAXBUF 8
static int do_readahead(journal_t *journal, unsigned int start)
{
int err;
unsigned int max, nbufs, next;
unsigned long blocknr;
struct buffer_head *bh;
struct buffer_head * bufs[MAXBUF];
/* Do up to 128K of readahead */
max = start + (128 * 1024 / journal->j_blocksize);
if (max > journal->j_maxlen)
max = journal->j_maxlen;
/* Do the readahead itself. We'll submit MAXBUF buffer_heads at
* a time to the block device IO layer. */
nbufs = 0;
for (next = start; next < max; next++) {
err = journal_bmap(journal, next, &blocknr);
if (err) {
printk (KERN_ERR "JBD: bad block at offset %u\n",
next);
goto failed;
}
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
if (!bh) {
err = -ENOMEM;
goto failed;
}
if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
bufs[nbufs++] = bh;
if (nbufs == MAXBUF) {
ll_rw_block(READ, nbufs, bufs);
journal_brelse_array(bufs, nbufs);
nbufs = 0;
}
} else
brelse(bh);
}
if (nbufs)
ll_rw_block(READ, nbufs, bufs);
err = 0;
failed:
if (nbufs)
journal_brelse_array(bufs, nbufs);
return err;
}
#endif /* __KERNEL__ */
/*
* Read a block from the journal
*/
static int jread(struct buffer_head **bhp, journal_t *journal,
unsigned int offset)
{
int err;
unsigned long blocknr;
struct buffer_head *bh;
*bhp = NULL;
if (offset >= journal->j_maxlen) {
printk(KERN_ERR "JBD: corrupted journal superblock\n");
return -EIO;
}
err = journal_bmap(journal, offset, &blocknr);
if (err) {
printk (KERN_ERR "JBD: bad block at offset %u\n",
offset);
return err;
}
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
if (!bh)
return -ENOMEM;
if (!buffer_uptodate(bh)) {
/* If this is a brand new buffer, start readahead.
Otherwise, we assume we are already reading it. */
if (!buffer_req(bh))
do_readahead(journal, offset);
wait_on_buffer(bh);
}
if (!buffer_uptodate(bh)) {
printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
offset);
brelse(bh);
return -EIO;
}
*bhp = bh;
return 0;
}
/*
* Count the number of in-use tags in a journal descriptor block.
*/
static int count_tags(struct buffer_head *bh, int size)
{
char * tagp;
journal_block_tag_t * tag;
int nr = 0;
tagp = &bh->b_data[sizeof(journal_header_t)];
while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) {
tag = (journal_block_tag_t *) tagp;
nr++;
tagp += sizeof(journal_block_tag_t);
if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID)))
tagp += 16;
if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG))
break;
}
return nr;
}
/* Make sure we wrap around the log correctly! */
#define wrap(journal, var) \
do { \
if (var >= (journal)->j_last) \
var -= ((journal)->j_last - (journal)->j_first); \
} while (0)
/**
* journal_recover - recovers a on-disk journal
* @journal: the journal to recover
*
* The primary function for recovering the log contents when mounting a
* journaled device.
*
* Recovery is done in three passes. In the first pass, we look for the
* end of the log. In the second, we assemble the list of revoke
* blocks. In the third and final pass, we replay any un-revoked blocks
* in the log.
*/
int journal_recover(journal_t *journal)
{
int err;
journal_superblock_t * sb;
struct recovery_info info;
memset(&info, 0, sizeof(info));
sb = journal->j_superblock;
/*
* The journal superblock's s_start field (the current log head)
* is always zero if, and only if, the journal was cleanly
* unmounted.
*/
if (!sb->s_start) {
jbd_debug(1, "No recovery required, last transaction %d\n",
be32_to_cpu(sb->s_sequence));
journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
return 0;
}
err = do_one_pass(journal, &info, PASS_SCAN);
if (!err)
err = do_one_pass(journal, &info, PASS_REVOKE);
if (!err)
err = do_one_pass(journal, &info, PASS_REPLAY);
jbd_debug(0, "JBD: recovery, exit status %d, "
"recovered transactions %u to %u\n",
err, info.start_transaction, info.end_transaction);
jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n",
info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
/* Restart the log at the next transaction ID, thus invalidating
* any existing commit records in the log. */
journal->j_transaction_sequence = ++info.end_transaction;
journal_clear_revoke(journal);
sync_blockdev(journal->j_fs_dev);
return err;
}
/**
* journal_skip_recovery - Start journal and wipe exiting records
* @journal: journal to startup
*
* Locate any valid recovery information from the journal and set up the
* journal structures in memory to ignore it (presumably because the
* caller has evidence that it is out of date).
* This function does'nt appear to be exorted..
*
* We perform one pass over the journal to allow us to tell the user how
* much recovery information is being erased, and to let us initialise
* the journal transaction sequence numbers to the next unused ID.
*/
int journal_skip_recovery(journal_t *journal)
{
int err;
journal_superblock_t * sb;
struct recovery_info info;
memset (&info, 0, sizeof(info));
sb = journal->j_superblock;
err = do_one_pass(journal, &info, PASS_SCAN);
if (err) {
printk(KERN_ERR "JBD: error %d scanning journal\n", err);
++journal->j_transaction_sequence;
} else {
#ifdef CONFIG_JBD_DEBUG
int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
#endif
jbd_debug(0,
"JBD: ignoring %d transaction%s from the journal.\n",
dropped, (dropped == 1) ? "" : "s");
journal->j_transaction_sequence = ++info.end_transaction;
}
journal->j_tail = 0;
return err;
}
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass)
{
unsigned int first_commit_ID, next_commit_ID;
unsigned long next_log_block;
int err, success = 0;
journal_superblock_t * sb;
journal_header_t * tmp;
struct buffer_head * bh;
unsigned int sequence;
int blocktype;
/* Precompute the maximum metadata descriptors in a descriptor block */
int MAX_BLOCKS_PER_DESC;
MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
/ sizeof(journal_block_tag_t));
/*
* First thing is to establish what we expect to find in the log
* (in terms of transaction IDs), and where (in terms of log
* block offsets): query the superblock.
*/
sb = journal->j_superblock;
next_commit_ID = be32_to_cpu(sb->s_sequence);
next_log_block = be32_to_cpu(sb->s_start);
first_commit_ID = next_commit_ID;
if (pass == PASS_SCAN)
info->start_transaction = first_commit_ID;
jbd_debug(1, "Starting recovery pass %d\n", pass);
/*
* Now we walk through the log, transaction by transaction,
* making sure that each transaction has a commit block in the
* expected place. Each complete transaction gets replayed back
* into the main filesystem.
*/
while (1) {
int flags;
char * tagp;
journal_block_tag_t * tag;
struct buffer_head * obh;
struct buffer_head * nbh;
cond_resched(); /* We're under lock_kernel() */
/* If we already know where to stop the log traversal,
* check right now that we haven't gone past the end of
* the log. */
if (pass != PASS_SCAN)
if (tid_geq(next_commit_ID, info->end_transaction))
break;
jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
next_commit_ID, next_log_block, journal->j_last);
/* Skip over each chunk of the transaction looking
* either the next descriptor block or the final commit
* record. */
jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
err = jread(&bh, journal, next_log_block);
if (err)
goto failed;
next_log_block++;
wrap(journal, next_log_block);
/* What kind of buffer is it?
*
* If it is a descriptor block, check that it has the
* expected sequence number. Otherwise, we're all done
* here. */
tmp = (journal_header_t *)bh->b_data;
if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) {
brelse(bh);
break;
}
blocktype = be32_to_cpu(tmp->h_blocktype);
sequence = be32_to_cpu(tmp->h_sequence);
jbd_debug(3, "Found magic %d, sequence %d\n",
blocktype, sequence);
if (sequence != next_commit_ID) {
brelse(bh);
break;
}
/* OK, we have a valid descriptor block which matches
* all of the sequence number checks. What are we going
* to do with it? That depends on the pass... */
switch(blocktype) {
case JFS_DESCRIPTOR_BLOCK:
/* If it is a valid descriptor block, replay it
* in pass REPLAY; otherwise, just skip over the
* blocks it describes. */
if (pass != PASS_REPLAY) {
next_log_block +=
count_tags(bh, journal->j_blocksize);
wrap(journal, next_log_block);
brelse(bh);
continue;
}
/* A descriptor block: we can now write all of
* the data blocks. Yay, useful work is finally
* getting done here! */
tagp = &bh->b_data[sizeof(journal_header_t)];
while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
<= journal->j_blocksize) {
unsigned long io_block;
tag = (journal_block_tag_t *) tagp;
flags = be32_to_cpu(tag->t_flags);
io_block = next_log_block++;
wrap(journal, next_log_block);
err = jread(&obh, journal, io_block);
if (err) {
/* Recover what we can, but
* report failure at the end. */
success = err;
printk (KERN_ERR
"JBD: IO error %d recovering "
"block %ld in log\n",
err, io_block);
} else {
unsigned long blocknr;
J_ASSERT(obh != NULL);
blocknr = be32_to_cpu(tag->t_blocknr);
/* If the block has been
* revoked, then we're all done
* here. */
if (journal_test_revoke
(journal, blocknr,
next_commit_ID)) {
brelse(obh);
++info->nr_revoke_hits;
goto skip_write;
}
/* Find a buffer for the new
* data being restored */
nbh = __getblk(journal->j_fs_dev,
blocknr,
journal->j_blocksize);
if (nbh == NULL) {
printk(KERN_ERR
"JBD: Out of memory "
"during recovery.\n");
err = -ENOMEM;
brelse(bh);
brelse(obh);
goto failed;
}
lock_buffer(nbh);
memcpy(nbh->b_data, obh->b_data,
journal->j_blocksize);
if (flags & JFS_FLAG_ESCAPE) {
*((__be32 *)bh->b_data) =
cpu_to_be32(JFS_MAGIC_NUMBER);
}
BUFFER_TRACE(nbh, "marking dirty");
set_buffer_uptodate(nbh);
mark_buffer_dirty(nbh);
BUFFER_TRACE(nbh, "marking uptodate");
++info->nr_replays;
/* ll_rw_block(WRITE, 1, &nbh); */
unlock_buffer(nbh);
brelse(obh);
brelse(nbh);
}
skip_write:
tagp += sizeof(journal_block_tag_t);
if (!(flags & JFS_FLAG_SAME_UUID))
tagp += 16;
if (flags & JFS_FLAG_LAST_TAG)
break;
}
brelse(bh);
continue;
case JFS_COMMIT_BLOCK:
/* Found an expected commit block: not much to
* do other than move on to the next sequence
* number. */
brelse(bh);
next_commit_ID++;
continue;
case JFS_REVOKE_BLOCK:
/* If we aren't in the REVOKE pass, then we can
* just skip over this block. */
if (pass != PASS_REVOKE) {
brelse(bh);
continue;
}
err = scan_revoke_records(journal, bh,
next_commit_ID, info);
brelse(bh);
if (err)
goto failed;
continue;
default:
jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
blocktype);
brelse(bh);
goto done;
}
}
done:
/*
* We broke out of the log scan loop: either we came to the
* known end of the log or we found an unexpected block in the
* log. If the latter happened, then we know that the "current"
* transaction marks the end of the valid log.
*/
if (pass == PASS_SCAN)
info->end_transaction = next_commit_ID;
else {
/* It's really bad news if different passes end up at
* different places (but possible due to IO errors). */
if (info->end_transaction != next_commit_ID) {
printk (KERN_ERR "JBD: recovery pass %d ended at "
"transaction %u, expected %u\n",
pass, next_commit_ID, info->end_transaction);
if (!success)
success = -EIO;
}
}
return success;
failed:
return err;
}
/* Scan a revoke record, marking all blocks mentioned as revoked. */
static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
tid_t sequence, struct recovery_info *info)
{
journal_revoke_header_t *header;
int offset, max;
header = (journal_revoke_header_t *) bh->b_data;
offset = sizeof(journal_revoke_header_t);
max = be32_to_cpu(header->r_count);
while (offset < max) {
unsigned long blocknr;
int err;
blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
offset += 4;
err = journal_set_revoke(journal, blocknr, sequence);
if (err)
return err;
++info->nr_revokes;
}
return 0;
}

703
fs/jbd2/revoke.c Normal file
View file

@ -0,0 +1,703 @@
/*
* linux/fs/revoke.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 2000
*
* Copyright 2000 Red Hat corp --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Journal revoke routines for the generic filesystem journaling code;
* part of the ext2fs journaling system.
*
* Revoke is the mechanism used to prevent old log records for deleted
* metadata from being replayed on top of newer data using the same
* blocks. The revoke mechanism is used in two separate places:
*
* + Commit: during commit we write the entire list of the current
* transaction's revoked blocks to the journal
*
* + Recovery: during recovery we record the transaction ID of all
* revoked blocks. If there are multiple revoke records in the log
* for a single block, only the last one counts, and if there is a log
* entry for a block beyond the last revoke, then that log entry still
* gets replayed.
*
* We can get interactions between revokes and new log data within a
* single transaction:
*
* Block is revoked and then journaled:
* The desired end result is the journaling of the new block, so we
* cancel the revoke before the transaction commits.
*
* Block is journaled and then revoked:
* The revoke must take precedence over the write of the block, so we
* need either to cancel the journal entry or to write the revoke
* later in the log than the log block. In this case, we choose the
* latter: journaling a block cancels any revoke record for that block
* in the current transaction, so any revoke for that block in the
* transaction must have happened after the block was journaled and so
* the revoke must take precedence.
*
* Block is revoked and then written as data:
* The data write is allowed to succeed, but the revoke is _not_
* cancelled. We still need to prevent old log records from
* overwriting the new data. We don't even need to clear the revoke
* bit here.
*
* Revoke information on buffers is a tri-state value:
*
* RevokeValid clear: no cached revoke status, need to look it up
* RevokeValid set, Revoked clear:
* buffer has not been revoked, and cancel_revoke
* need do nothing.
* RevokeValid set, Revoked set:
* buffer has been revoked.
*/
#ifndef __KERNEL__
#include "jfs_user.h"
#else
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#endif
static kmem_cache_t *revoke_record_cache;
static kmem_cache_t *revoke_table_cache;
/* Each revoke record represents one single revoked block. During
journal replay, this involves recording the transaction ID of the
last transaction to revoke this block. */
struct jbd_revoke_record_s
{
struct list_head hash;
tid_t sequence; /* Used for recovery only */
unsigned long blocknr;
};
/* The revoke table is just a simple hash table of revoke records. */
struct jbd_revoke_table_s
{
/* It is conceivable that we might want a larger hash table
* for recovery. Must be a power of two. */
int hash_size;
int hash_shift;
struct list_head *hash_table;
};
#ifdef __KERNEL__
static void write_one_revoke_record(journal_t *, transaction_t *,
struct journal_head **, int *,
struct jbd_revoke_record_s *);
static void flush_descriptor(journal_t *, struct journal_head *, int);
#endif
/* Utility functions to maintain the revoke table */
/* Borrowed from buffer.c: this is a tried and tested block hash function */
static inline int hash(journal_t *journal, unsigned long block)
{
struct jbd_revoke_table_s *table = journal->j_revoke;
int hash_shift = table->hash_shift;
return ((block << (hash_shift - 6)) ^
(block >> 13) ^
(block << (hash_shift - 12))) & (table->hash_size - 1);
}
static int insert_revoke_hash(journal_t *journal, unsigned long blocknr,
tid_t seq)
{
struct list_head *hash_list;
struct jbd_revoke_record_s *record;
repeat:
record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS);
if (!record)
goto oom;
record->sequence = seq;
record->blocknr = blocknr;
hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
spin_lock(&journal->j_revoke_lock);
list_add(&record->hash, hash_list);
spin_unlock(&journal->j_revoke_lock);
return 0;
oom:
if (!journal_oom_retry)
return -ENOMEM;
jbd_debug(1, "ENOMEM in %s, retrying\n", __FUNCTION__);
yield();
goto repeat;
}
/* Find a revoke record in the journal's hash table. */
static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
unsigned long blocknr)
{
struct list_head *hash_list;
struct jbd_revoke_record_s *record;
hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
spin_lock(&journal->j_revoke_lock);
record = (struct jbd_revoke_record_s *) hash_list->next;
while (&(record->hash) != hash_list) {
if (record->blocknr == blocknr) {
spin_unlock(&journal->j_revoke_lock);
return record;
}
record = (struct jbd_revoke_record_s *) record->hash.next;
}
spin_unlock(&journal->j_revoke_lock);
return NULL;
}
int __init journal_init_revoke_caches(void)
{
revoke_record_cache = kmem_cache_create("revoke_record",
sizeof(struct jbd_revoke_record_s),
0, SLAB_HWCACHE_ALIGN, NULL, NULL);
if (revoke_record_cache == 0)
return -ENOMEM;
revoke_table_cache = kmem_cache_create("revoke_table",
sizeof(struct jbd_revoke_table_s),
0, 0, NULL, NULL);
if (revoke_table_cache == 0) {
kmem_cache_destroy(revoke_record_cache);
revoke_record_cache = NULL;
return -ENOMEM;
}
return 0;
}
void journal_destroy_revoke_caches(void)
{
kmem_cache_destroy(revoke_record_cache);
revoke_record_cache = NULL;
kmem_cache_destroy(revoke_table_cache);
revoke_table_cache = NULL;
}
/* Initialise the revoke table for a given journal to a given size. */
int journal_init_revoke(journal_t *journal, int hash_size)
{
int shift, tmp;
J_ASSERT (journal->j_revoke_table[0] == NULL);
shift = 0;
tmp = hash_size;
while((tmp >>= 1UL) != 0UL)
shift++;
journal->j_revoke_table[0] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
if (!journal->j_revoke_table[0])
return -ENOMEM;
journal->j_revoke = journal->j_revoke_table[0];
/* Check that the hash_size is a power of two */
J_ASSERT ((hash_size & (hash_size-1)) == 0);
journal->j_revoke->hash_size = hash_size;
journal->j_revoke->hash_shift = shift;
journal->j_revoke->hash_table =
kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
if (!journal->j_revoke->hash_table) {
kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
journal->j_revoke = NULL;
return -ENOMEM;
}
for (tmp = 0; tmp < hash_size; tmp++)
INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
journal->j_revoke_table[1] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
if (!journal->j_revoke_table[1]) {
kfree(journal->j_revoke_table[0]->hash_table);
kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
return -ENOMEM;
}
journal->j_revoke = journal->j_revoke_table[1];
/* Check that the hash_size is a power of two */
J_ASSERT ((hash_size & (hash_size-1)) == 0);
journal->j_revoke->hash_size = hash_size;
journal->j_revoke->hash_shift = shift;
journal->j_revoke->hash_table =
kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
if (!journal->j_revoke->hash_table) {
kfree(journal->j_revoke_table[0]->hash_table);
kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
kmem_cache_free(revoke_table_cache, journal->j_revoke_table[1]);
journal->j_revoke = NULL;
return -ENOMEM;
}
for (tmp = 0; tmp < hash_size; tmp++)
INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
spin_lock_init(&journal->j_revoke_lock);
return 0;
}
/* Destoy a journal's revoke table. The table must already be empty! */
void journal_destroy_revoke(journal_t *journal)
{
struct jbd_revoke_table_s *table;
struct list_head *hash_list;
int i;
table = journal->j_revoke_table[0];
if (!table)
return;
for (i=0; i<table->hash_size; i++) {
hash_list = &table->hash_table[i];
J_ASSERT (list_empty(hash_list));
}
kfree(table->hash_table);
kmem_cache_free(revoke_table_cache, table);
journal->j_revoke = NULL;
table = journal->j_revoke_table[1];
if (!table)
return;
for (i=0; i<table->hash_size; i++) {
hash_list = &table->hash_table[i];
J_ASSERT (list_empty(hash_list));
}
kfree(table->hash_table);
kmem_cache_free(revoke_table_cache, table);
journal->j_revoke = NULL;
}
#ifdef __KERNEL__
/*
* journal_revoke: revoke a given buffer_head from the journal. This
* prevents the block from being replayed during recovery if we take a
* crash after this current transaction commits. Any subsequent
* metadata writes of the buffer in this transaction cancel the
* revoke.
*
* Note that this call may block --- it is up to the caller to make
* sure that there are no further calls to journal_write_metadata
* before the revoke is complete. In ext3, this implies calling the
* revoke before clearing the block bitmap when we are deleting
* metadata.
*
* Revoke performs a journal_forget on any buffer_head passed in as a
* parameter, but does _not_ forget the buffer_head if the bh was only
* found implicitly.
*
* bh_in may not be a journalled buffer - it may have come off
* the hash tables without an attached journal_head.
*
* If bh_in is non-zero, journal_revoke() will decrement its b_count
* by one.
*/
int journal_revoke(handle_t *handle, unsigned long blocknr,
struct buffer_head *bh_in)
{
struct buffer_head *bh = NULL;
journal_t *journal;
struct block_device *bdev;
int err;
might_sleep();
if (bh_in)
BUFFER_TRACE(bh_in, "enter");
journal = handle->h_transaction->t_journal;
if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){
J_ASSERT (!"Cannot set revoke feature!");
return -EINVAL;
}
bdev = journal->j_fs_dev;
bh = bh_in;
if (!bh) {
bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
if (bh)
BUFFER_TRACE(bh, "found on hash");
}
#ifdef JBD_EXPENSIVE_CHECKING
else {
struct buffer_head *bh2;
/* If there is a different buffer_head lying around in
* memory anywhere... */
bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
if (bh2) {
/* ... and it has RevokeValid status... */
if (bh2 != bh && buffer_revokevalid(bh2))
/* ...then it better be revoked too,
* since it's illegal to create a revoke
* record against a buffer_head which is
* not marked revoked --- that would
* risk missing a subsequent revoke
* cancel. */
J_ASSERT_BH(bh2, buffer_revoked(bh2));
put_bh(bh2);
}
}
#endif
/* We really ought not ever to revoke twice in a row without
first having the revoke cancelled: it's illegal to free a
block twice without allocating it in between! */
if (bh) {
if (!J_EXPECT_BH(bh, !buffer_revoked(bh),
"inconsistent data on disk")) {
if (!bh_in)
brelse(bh);
return -EIO;
}
set_buffer_revoked(bh);
set_buffer_revokevalid(bh);
if (bh_in) {
BUFFER_TRACE(bh_in, "call journal_forget");
journal_forget(handle, bh_in);
} else {
BUFFER_TRACE(bh, "call brelse");
__brelse(bh);
}
}
jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in);
err = insert_revoke_hash(journal, blocknr,
handle->h_transaction->t_tid);
BUFFER_TRACE(bh_in, "exit");
return err;
}
/*
* Cancel an outstanding revoke. For use only internally by the
* journaling code (called from journal_get_write_access).
*
* We trust buffer_revoked() on the buffer if the buffer is already
* being journaled: if there is no revoke pending on the buffer, then we
* don't do anything here.
*
* This would break if it were possible for a buffer to be revoked and
* discarded, and then reallocated within the same transaction. In such
* a case we would have lost the revoked bit, but when we arrived here
* the second time we would still have a pending revoke to cancel. So,
* do not trust the Revoked bit on buffers unless RevokeValid is also
* set.
*
* The caller must have the journal locked.
*/
int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
{
struct jbd_revoke_record_s *record;
journal_t *journal = handle->h_transaction->t_journal;
int need_cancel;
int did_revoke = 0; /* akpm: debug */
struct buffer_head *bh = jh2bh(jh);
jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
/* Is the existing Revoke bit valid? If so, we trust it, and
* only perform the full cancel if the revoke bit is set. If
* not, we can't trust the revoke bit, and we need to do the
* full search for a revoke record. */
if (test_set_buffer_revokevalid(bh)) {
need_cancel = test_clear_buffer_revoked(bh);
} else {
need_cancel = 1;
clear_buffer_revoked(bh);
}
if (need_cancel) {
record = find_revoke_record(journal, bh->b_blocknr);
if (record) {
jbd_debug(4, "cancelled existing revoke on "
"blocknr %llu\n", (unsigned long long)bh->b_blocknr);
spin_lock(&journal->j_revoke_lock);
list_del(&record->hash);
spin_unlock(&journal->j_revoke_lock);
kmem_cache_free(revoke_record_cache, record);
did_revoke = 1;
}
}
#ifdef JBD_EXPENSIVE_CHECKING
/* There better not be one left behind by now! */
record = find_revoke_record(journal, bh->b_blocknr);
J_ASSERT_JH(jh, record == NULL);
#endif
/* Finally, have we just cleared revoke on an unhashed
* buffer_head? If so, we'd better make sure we clear the
* revoked status on any hashed alias too, otherwise the revoke
* state machine will get very upset later on. */
if (need_cancel) {
struct buffer_head *bh2;
bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
if (bh2) {
if (bh2 != bh)
clear_buffer_revoked(bh2);
__brelse(bh2);
}
}
return did_revoke;
}
/* journal_switch_revoke table select j_revoke for next transaction
* we do not want to suspend any processing until all revokes are
* written -bzzz
*/
void journal_switch_revoke_table(journal_t *journal)
{
int i;
if (journal->j_revoke == journal->j_revoke_table[0])
journal->j_revoke = journal->j_revoke_table[1];
else
journal->j_revoke = journal->j_revoke_table[0];
for (i = 0; i < journal->j_revoke->hash_size; i++)
INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]);
}
/*
* Write revoke records to the journal for all entries in the current
* revoke hash, deleting the entries as we go.
*
* Called with the journal lock held.
*/
void journal_write_revoke_records(journal_t *journal,
transaction_t *transaction)
{
struct journal_head *descriptor;
struct jbd_revoke_record_s *record;
struct jbd_revoke_table_s *revoke;
struct list_head *hash_list;
int i, offset, count;
descriptor = NULL;
offset = 0;
count = 0;
/* select revoke table for committing transaction */
revoke = journal->j_revoke == journal->j_revoke_table[0] ?
journal->j_revoke_table[1] : journal->j_revoke_table[0];
for (i = 0; i < revoke->hash_size; i++) {
hash_list = &revoke->hash_table[i];
while (!list_empty(hash_list)) {
record = (struct jbd_revoke_record_s *)
hash_list->next;
write_one_revoke_record(journal, transaction,
&descriptor, &offset,
record);
count++;
list_del(&record->hash);
kmem_cache_free(revoke_record_cache, record);
}
}
if (descriptor)
flush_descriptor(journal, descriptor, offset);
jbd_debug(1, "Wrote %d revoke records\n", count);
}
/*
* Write out one revoke record. We need to create a new descriptor
* block if the old one is full or if we have not already created one.
*/
static void write_one_revoke_record(journal_t *journal,
transaction_t *transaction,
struct journal_head **descriptorp,
int *offsetp,
struct jbd_revoke_record_s *record)
{
struct journal_head *descriptor;
int offset;
journal_header_t *header;
/* If we are already aborting, this all becomes a noop. We
still need to go round the loop in
journal_write_revoke_records in order to free all of the
revoke records: only the IO to the journal is omitted. */
if (is_journal_aborted(journal))
return;
descriptor = *descriptorp;
offset = *offsetp;
/* Make sure we have a descriptor with space left for the record */
if (descriptor) {
if (offset == journal->j_blocksize) {
flush_descriptor(journal, descriptor, offset);
descriptor = NULL;
}
}
if (!descriptor) {
descriptor = journal_get_descriptor_buffer(journal);
if (!descriptor)
return;
header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
header->h_blocktype = cpu_to_be32(JFS_REVOKE_BLOCK);
header->h_sequence = cpu_to_be32(transaction->t_tid);
/* Record it so that we can wait for IO completion later */
JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
journal_file_buffer(descriptor, transaction, BJ_LogCtl);
offset = sizeof(journal_revoke_header_t);
*descriptorp = descriptor;
}
* ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
cpu_to_be32(record->blocknr);
offset += 4;
*offsetp = offset;
}
/*
* Flush a revoke descriptor out to the journal. If we are aborting,
* this is a noop; otherwise we are generating a buffer which needs to
* be waited for during commit, so it has to go onto the appropriate
* journal buffer list.
*/
static void flush_descriptor(journal_t *journal,
struct journal_head *descriptor,
int offset)
{
journal_revoke_header_t *header;
struct buffer_head *bh = jh2bh(descriptor);
if (is_journal_aborted(journal)) {
put_bh(bh);
return;
}
header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data;
header->r_count = cpu_to_be32(offset);
set_buffer_jwrite(bh);
BUFFER_TRACE(bh, "write");
set_buffer_dirty(bh);
ll_rw_block(SWRITE, 1, &bh);
}
#endif
/*
* Revoke support for recovery.
*
* Recovery needs to be able to:
*
* record all revoke records, including the tid of the latest instance
* of each revoke in the journal
*
* check whether a given block in a given transaction should be replayed
* (ie. has not been revoked by a revoke record in that or a subsequent
* transaction)
*
* empty the revoke table after recovery.
*/
/*
* First, setting revoke records. We create a new revoke record for
* every block ever revoked in the log as we scan it for recovery, and
* we update the existing records if we find multiple revokes for a
* single block.
*/
int journal_set_revoke(journal_t *journal,
unsigned long blocknr,
tid_t sequence)
{
struct jbd_revoke_record_s *record;
record = find_revoke_record(journal, blocknr);
if (record) {
/* If we have multiple occurrences, only record the
* latest sequence number in the hashed record */
if (tid_gt(sequence, record->sequence))
record->sequence = sequence;
return 0;
}
return insert_revoke_hash(journal, blocknr, sequence);
}
/*
* Test revoke records. For a given block referenced in the log, has
* that block been revoked? A revoke record with a given transaction
* sequence number revokes all blocks in that transaction and earlier
* ones, but later transactions still need replayed.
*/
int journal_test_revoke(journal_t *journal,
unsigned long blocknr,
tid_t sequence)
{
struct jbd_revoke_record_s *record;
record = find_revoke_record(journal, blocknr);
if (!record)
return 0;
if (tid_gt(sequence, record->sequence))
return 0;
return 1;
}
/*
* Finally, once recovery is over, we need to clear the revoke table so
* that it can be reused by the running filesystem.
*/
void journal_clear_revoke(journal_t *journal)
{
int i;
struct list_head *hash_list;
struct jbd_revoke_record_s *record;
struct jbd_revoke_table_s *revoke;
revoke = journal->j_revoke;
for (i = 0; i < revoke->hash_size; i++) {
hash_list = &revoke->hash_table[i];
while (!list_empty(hash_list)) {
record = (struct jbd_revoke_record_s*) hash_list->next;
list_del(&record->hash);
kmem_cache_free(revoke_record_cache, record);
}
}
}

2080
fs/jbd2/transaction.c Normal file

File diff suppressed because it is too large Load diff

268
include/linux/ext4_jbd2.h Normal file
View file

@ -0,0 +1,268 @@
/*
* linux/include/linux/ext4_jbd.h
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 1999
*
* Copyright 1998--1999 Red Hat corp --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Ext4-specific journaling extensions.
*/
#ifndef _LINUX_EXT4_JBD_H
#define _LINUX_EXT4_JBD_H
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/ext4_fs.h>
#define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal)
/* Define the number of blocks we need to account to a transaction to
* modify one block of data.
*
* We may have to touch one inode, one bitmap buffer, up to three
* indirection blocks, the group and superblock summaries, and the data
* block to complete the transaction. */
#define EXT4_SINGLEDATA_TRANS_BLOCKS 8U
/* Extended attribute operations touch at most two data buffers,
* two bitmap buffers, and two group summaries, in addition to the inode
* and the superblock, which are already accounted for. */
#define EXT4_XATTR_TRANS_BLOCKS 6U
/* Define the minimum size for a transaction which modifies data. This
* needs to take into account the fact that we may end up modifying two
* quota files too (one for the group, one for the user quota). The
* superblock only gets updated once, of course, so don't bother
* counting that again for the quota updates. */
#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS + \
EXT4_XATTR_TRANS_BLOCKS - 2 + \
2*EXT4_QUOTA_TRANS_BLOCKS(sb))
/* Delete operations potentially hit one directory's namespace plus an
* entire inode, plus arbitrary amounts of bitmap/indirection data. Be
* generous. We can grow the delete transaction later if necessary. */
#define EXT4_DELETE_TRANS_BLOCKS(sb) (2 * EXT4_DATA_TRANS_BLOCKS(sb) + 64)
/* Define an arbitrary limit for the amount of data we will anticipate
* writing to any given transaction. For unbounded transactions such as
* write(2) and truncate(2) we can write more than this, but we always
* start off at the maximum transaction size and grow the transaction
* optimistically as we go. */
#define EXT4_MAX_TRANS_DATA 64U
/* We break up a large truncate or write transaction once the handle's
* buffer credits gets this low, we need either to extend the
* transaction or to start a new one. Reserve enough space here for
* inode, bitmap, superblock, group and indirection updates for at least
* one block, plus two quota updates. Quota allocations are not
* needed. */
#define EXT4_RESERVE_TRANS_BLOCKS 12U
#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8
#ifdef CONFIG_QUOTA
/* Amount of blocks needed for quota update - we know that the structure was
* allocated so we need to update only inode+data */
#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0)
/* Amount of blocks needed for quota insert/delete - we do some block writes
* but inode, sb and group updates are done only once */
#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
(EXT4_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_INIT_REWRITE) : 0)
#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
(EXT4_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_DEL_REWRITE) : 0)
#else
#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0
#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
#define EXT4_QUOTA_DEL_BLOCKS(sb) 0
#endif
int
ext4_mark_iloc_dirty(handle_t *handle,
struct inode *inode,
struct ext4_iloc *iloc);
/*
* On success, We end up with an outstanding reference count against
* iloc->bh. This _must_ be cleaned up later.
*/
int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
struct ext4_iloc *iloc);
int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
/*
* Wrapper functions with which ext4 calls into JBD. The intent here is
* to allow these to be turned into appropriate stubs so ext4 can control
* ext2 filesystems, so ext2+ext4 systems only nee one fs. This work hasn't
* been done yet.
*/
void ext4_journal_abort_handle(const char *caller, const char *err_fn,
struct buffer_head *bh, handle_t *handle, int err);
static inline int
__ext4_journal_get_undo_access(const char *where, handle_t *handle,
struct buffer_head *bh)
{
int err = journal_get_undo_access(handle, bh);
if (err)
ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
return err;
}
static inline int
__ext4_journal_get_write_access(const char *where, handle_t *handle,
struct buffer_head *bh)
{
int err = journal_get_write_access(handle, bh);
if (err)
ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
return err;
}
static inline void
ext4_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
{
journal_release_buffer(handle, bh);
}
static inline int
__ext4_journal_forget(const char *where, handle_t *handle, struct buffer_head *bh)
{
int err = journal_forget(handle, bh);
if (err)
ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
return err;
}
static inline int
__ext4_journal_revoke(const char *where, handle_t *handle,
unsigned long blocknr, struct buffer_head *bh)
{
int err = journal_revoke(handle, blocknr, bh);
if (err)
ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
return err;
}
static inline int
__ext4_journal_get_create_access(const char *where,
handle_t *handle, struct buffer_head *bh)
{
int err = journal_get_create_access(handle, bh);
if (err)
ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
return err;
}
static inline int
__ext4_journal_dirty_metadata(const char *where,
handle_t *handle, struct buffer_head *bh)
{
int err = journal_dirty_metadata(handle, bh);
if (err)
ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
return err;
}
#define ext4_journal_get_undo_access(handle, bh) \
__ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh))
#define ext4_journal_get_write_access(handle, bh) \
__ext4_journal_get_write_access(__FUNCTION__, (handle), (bh))
#define ext4_journal_revoke(handle, blocknr, bh) \
__ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
#define ext4_journal_get_create_access(handle, bh) \
__ext4_journal_get_create_access(__FUNCTION__, (handle), (bh))
#define ext4_journal_dirty_metadata(handle, bh) \
__ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh))
#define ext4_journal_forget(handle, bh) \
__ext4_journal_forget(__FUNCTION__, (handle), (bh))
int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
int __ext4_journal_stop(const char *where, handle_t *handle);
static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
{
return ext4_journal_start_sb(inode->i_sb, nblocks);
}
#define ext4_journal_stop(handle) \
__ext4_journal_stop(__FUNCTION__, (handle))
static inline handle_t *ext4_journal_current_handle(void)
{
return journal_current_handle();
}
static inline int ext4_journal_extend(handle_t *handle, int nblocks)
{
return journal_extend(handle, nblocks);
}
static inline int ext4_journal_restart(handle_t *handle, int nblocks)
{
return journal_restart(handle, nblocks);
}
static inline int ext4_journal_blocks_per_page(struct inode *inode)
{
return journal_blocks_per_page(inode);
}
static inline int ext4_journal_force_commit(journal_t *journal)
{
return journal_force_commit(journal);
}
/* super.c */
int ext4_force_commit(struct super_block *sb);
static inline int ext4_should_journal_data(struct inode *inode)
{
if (!S_ISREG(inode->i_mode))
return 1;
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
return 1;
if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
return 1;
return 0;
}
static inline int ext4_should_order_data(struct inode *inode)
{
if (!S_ISREG(inode->i_mode))
return 0;
if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
return 0;
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
return 1;
return 0;
}
static inline int ext4_should_writeback_data(struct inode *inode)
{
if (!S_ISREG(inode->i_mode))
return 0;
if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
return 0;
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
return 1;
return 0;
}
#endif /* _LINUX_EXT4_JBD_H */

1098
include/linux/jbd2.h Normal file

File diff suppressed because it is too large Load diff