linux/fs/f2fs/gc.c
Chao Yu 6aa58d8ad2 f2fs: readahead encrypted block during GC
During GC, for each encrypted block, we will read block synchronously
into meta page, and then submit it into current cold data log area.

So this block read model with 4k granularity can make poor performance,
like migrating non-encrypted block, let's readahead encrypted block
as well to improve migration performance.

To implement this, we choose meta page that its index is old block
address of the encrypted block, and readahead ciphertext into this
page, later, if readaheaded page is still updated, we will load its
data into target meta page, and submit the write IO.

Note that for OPU, truncation, deletion, we need to invalid meta
page after we invalid old block address, to make sure we won't load
invalid data from target meta page during encrypted block migration.

for ((i = 0; i < 1000; i++))
do {
        xfs_io -f /mnt/f2fs/dir/$i -c "pwrite 0 128k" -c "fsync";
} done

for ((i = 0; i < 1000; i+=2))
do {
        rm /mnt/f2fs/dir/$i;
} done

ret = ioctl(fd, F2FS_IOC_GARBAGE_COLLECT, 0);

Before:
              gc-6549  [001] d..1 214682.212797: block_rq_insert: 8,32 RA 32768 () 786400 + 64 [gc]
              gc-6549  [001] d..1 214682.212802: block_unplug: [gc] 1
              gc-6549  [001] .... 214682.213892: block_bio_queue: 8,32 R 67494144 + 8 [gc]
              gc-6549  [001] .... 214682.213899: block_getrq: 8,32 R 67494144 + 8 [gc]
              gc-6549  [001] .... 214682.213902: block_plug: [gc]
              gc-6549  [001] d..1 214682.213905: block_rq_insert: 8,32 R 4096 () 67494144 + 8 [gc]
              gc-6549  [001] d..1 214682.213908: block_unplug: [gc] 1
              gc-6549  [001] .... 214682.226405: block_bio_queue: 8,32 R 67494152 + 8 [gc]
              gc-6549  [001] .... 214682.226412: block_getrq: 8,32 R 67494152 + 8 [gc]
              gc-6549  [001] .... 214682.226414: block_plug: [gc]
              gc-6549  [001] d..1 214682.226417: block_rq_insert: 8,32 R 4096 () 67494152 + 8 [gc]
              gc-6549  [001] d..1 214682.226420: block_unplug: [gc] 1
              gc-6549  [001] .... 214682.226904: block_bio_queue: 8,32 R 67494160 + 8 [gc]
              gc-6549  [001] .... 214682.226910: block_getrq: 8,32 R 67494160 + 8 [gc]
              gc-6549  [001] .... 214682.226911: block_plug: [gc]
              gc-6549  [001] d..1 214682.226914: block_rq_insert: 8,32 R 4096 () 67494160 + 8 [gc]
              gc-6549  [001] d..1 214682.226916: block_unplug: [gc] 1

After:
              gc-5678  [003] .... 214327.025906: block_bio_queue: 8,32 R 67493824 + 8 [gc]
              gc-5678  [003] .... 214327.025908: block_bio_backmerge: 8,32 R 67493824 + 8 [gc]
              gc-5678  [003] .... 214327.025915: block_bio_queue: 8,32 R 67493832 + 8 [gc]
              gc-5678  [003] .... 214327.025917: block_bio_backmerge: 8,32 R 67493832 + 8 [gc]
              gc-5678  [003] .... 214327.025923: block_bio_queue: 8,32 R 67493840 + 8 [gc]
              gc-5678  [003] .... 214327.025925: block_bio_backmerge: 8,32 R 67493840 + 8 [gc]
              gc-5678  [003] .... 214327.025932: block_bio_queue: 8,32 R 67493848 + 8 [gc]
              gc-5678  [003] .... 214327.025934: block_bio_backmerge: 8,32 R 67493848 + 8 [gc]
              gc-5678  [003] .... 214327.025941: block_bio_queue: 8,32 R 67493856 + 8 [gc]
              gc-5678  [003] .... 214327.025943: block_bio_backmerge: 8,32 R 67493856 + 8 [gc]
              gc-5678  [003] .... 214327.025953: block_bio_queue: 8,32 R 67493864 + 8 [gc]
              gc-5678  [003] .... 214327.025955: block_bio_backmerge: 8,32 R 67493864 + 8 [gc]
              gc-5678  [003] .... 214327.025962: block_bio_queue: 8,32 R 67493872 + 8 [gc]
              gc-5678  [003] .... 214327.025964: block_bio_backmerge: 8,32 R 67493872 + 8 [gc]
              gc-5678  [003] .... 214327.025970: block_bio_queue: 8,32 R 67493880 + 8 [gc]
              gc-5678  [003] .... 214327.025972: block_bio_backmerge: 8,32 R 67493880 + 8 [gc]
              gc-5678  [003] .... 214327.026000: block_bio_queue: 8,32 WS 34123776 + 2048 [gc]
              gc-5678  [003] .... 214327.026019: block_getrq: 8,32 WS 34123776 + 2048 [gc]
              gc-5678  [003] d..1 214327.026021: block_rq_insert: 8,32 R 131072 () 67493632 + 256 [gc]
              gc-5678  [003] d..1 214327.026023: block_unplug: [gc] 1
              gc-5678  [003] d..1 214327.026026: block_rq_issue: 8,32 R 131072 () 67493632 + 256 [gc]
              gc-5678  [003] .... 214327.026046: block_plug: [gc]

Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-20 23:13:42 -07:00

1263 lines
31 KiB
C

/*
* fs/f2fs/gc.c
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/f2fs_fs.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include "f2fs.h"
#include "node.h"
#include "segment.h"
#include "gc.h"
#include <trace/events/f2fs.h>
static int gc_thread_func(void *data)
{
struct f2fs_sb_info *sbi = data;
struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
unsigned int wait_ms;
wait_ms = gc_th->min_sleep_time;
set_freezable();
do {
wait_event_interruptible_timeout(*wq,
kthread_should_stop() || freezing(current) ||
gc_th->gc_wake,
msecs_to_jiffies(wait_ms));
/* give it a try one time */
if (gc_th->gc_wake)
gc_th->gc_wake = 0;
if (try_to_freeze())
continue;
if (kthread_should_stop())
break;
if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
increase_sleep_time(gc_th, &wait_ms);
continue;
}
if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
f2fs_show_injection_info(FAULT_CHECKPOINT);
f2fs_stop_checkpoint(sbi, false);
}
if (!sb_start_write_trylock(sbi->sb))
continue;
/*
* [GC triggering condition]
* 0. GC is not conducted currently.
* 1. There are enough dirty segments.
* 2. IO subsystem is idle by checking the # of writeback pages.
* 3. IO subsystem is idle by checking the # of requests in
* bdev's request list.
*
* Note) We have to avoid triggering GCs frequently.
* Because it is possible that some segments can be
* invalidated soon after by user update or deletion.
* So, I'd like to wait some time to collect dirty segments.
*/
if (sbi->gc_mode == GC_URGENT) {
wait_ms = gc_th->urgent_sleep_time;
mutex_lock(&sbi->gc_mutex);
goto do_gc;
}
if (!mutex_trylock(&sbi->gc_mutex))
goto next;
if (!is_idle(sbi)) {
increase_sleep_time(gc_th, &wait_ms);
mutex_unlock(&sbi->gc_mutex);
goto next;
}
if (has_enough_invalid_blocks(sbi))
decrease_sleep_time(gc_th, &wait_ms);
else
increase_sleep_time(gc_th, &wait_ms);
do_gc:
stat_inc_bggc_count(sbi);
/* if return value is not zero, no victim was selected */
if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO))
wait_ms = gc_th->no_gc_sleep_time;
trace_f2fs_background_gc(sbi->sb, wait_ms,
prefree_segments(sbi), free_segments(sbi));
/* balancing f2fs's metadata periodically */
f2fs_balance_fs_bg(sbi);
next:
sb_end_write(sbi->sb);
} while (!kthread_should_stop());
return 0;
}
int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
{
struct f2fs_gc_kthread *gc_th;
dev_t dev = sbi->sb->s_bdev->bd_dev;
int err = 0;
gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
if (!gc_th) {
err = -ENOMEM;
goto out;
}
gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME;
gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;
gc_th->gc_wake= 0;
sbi->gc_thread = gc_th;
init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(gc_th->f2fs_gc_task)) {
err = PTR_ERR(gc_th->f2fs_gc_task);
kfree(gc_th);
sbi->gc_thread = NULL;
}
out:
return err;
}
void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi)
{
struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
if (!gc_th)
return;
kthread_stop(gc_th->f2fs_gc_task);
kfree(gc_th);
sbi->gc_thread = NULL;
}
static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type)
{
int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY;
switch (sbi->gc_mode) {
case GC_IDLE_CB:
gc_mode = GC_CB;
break;
case GC_IDLE_GREEDY:
case GC_URGENT:
gc_mode = GC_GREEDY;
break;
}
return gc_mode;
}
static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
int type, struct victim_sel_policy *p)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
if (p->alloc_mode == SSR) {
p->gc_mode = GC_GREEDY;
p->dirty_segmap = dirty_i->dirty_segmap[type];
p->max_search = dirty_i->nr_dirty[type];
p->ofs_unit = 1;
} else {
p->gc_mode = select_gc_type(sbi, gc_type);
p->dirty_segmap = dirty_i->dirty_segmap[DIRTY];
p->max_search = dirty_i->nr_dirty[DIRTY];
p->ofs_unit = sbi->segs_per_sec;
}
/* we need to check every dirty segments in the FG_GC case */
if (gc_type != FG_GC &&
(sbi->gc_mode != GC_URGENT) &&
p->max_search > sbi->max_victim_search)
p->max_search = sbi->max_victim_search;
/* let's select beginning hot/small space first in no_heap mode*/
if (test_opt(sbi, NOHEAP) &&
(type == CURSEG_HOT_DATA || IS_NODESEG(type)))
p->offset = 0;
else
p->offset = SIT_I(sbi)->last_victim[p->gc_mode];
}
static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
struct victim_sel_policy *p)
{
/* SSR allocates in a segment unit */
if (p->alloc_mode == SSR)
return sbi->blocks_per_seg;
if (p->gc_mode == GC_GREEDY)
return 2 * sbi->blocks_per_seg * p->ofs_unit;
else if (p->gc_mode == GC_CB)
return UINT_MAX;
else /* No other gc_mode */
return 0;
}
static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned int secno;
/*
* If the gc_type is FG_GC, we can select victim segments
* selected by background GC before.
* Those segments guarantee they have small valid blocks.
*/
for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) {
if (sec_usage_check(sbi, secno))
continue;
clear_bit(secno, dirty_i->victim_secmap);
return GET_SEG_FROM_SEC(sbi, secno);
}
return NULL_SEGNO;
}
static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
{
struct sit_info *sit_i = SIT_I(sbi);
unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
unsigned int start = GET_SEG_FROM_SEC(sbi, secno);
unsigned long long mtime = 0;
unsigned int vblocks;
unsigned char age = 0;
unsigned char u;
unsigned int i;
for (i = 0; i < sbi->segs_per_sec; i++)
mtime += get_seg_entry(sbi, start + i)->mtime;
vblocks = get_valid_blocks(sbi, segno, true);
mtime = div_u64(mtime, sbi->segs_per_sec);
vblocks = div_u64(vblocks, sbi->segs_per_sec);
u = (vblocks * 100) >> sbi->log_blocks_per_seg;
/* Handle if the system time has changed by the user */
if (mtime < sit_i->min_mtime)
sit_i->min_mtime = mtime;
if (mtime > sit_i->max_mtime)
sit_i->max_mtime = mtime;
if (sit_i->max_mtime != sit_i->min_mtime)
age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime),
sit_i->max_mtime - sit_i->min_mtime);
return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
}
static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
unsigned int segno, struct victim_sel_policy *p)
{
if (p->alloc_mode == SSR)
return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
/* alloc_mode == LFS */
if (p->gc_mode == GC_GREEDY)
return get_valid_blocks(sbi, segno, true);
else
return get_cb_cost(sbi, segno);
}
static unsigned int count_bits(const unsigned long *addr,
unsigned int offset, unsigned int len)
{
unsigned int end = offset + len, sum = 0;
while (offset < end) {
if (test_bit(offset++, addr))
++sum;
}
return sum;
}
/*
* This function is called from two paths.
* One is garbage collection and the other is SSR segment selection.
* When it is called during GC, it just gets a victim segment
* and it does not remove it from dirty seglist.
* When it is called from SSR segment selection, it finds a segment
* which has minimum valid blocks and removes it from dirty seglist.
*/
static int get_victim_by_default(struct f2fs_sb_info *sbi,
unsigned int *result, int gc_type, int type, char alloc_mode)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct sit_info *sm = SIT_I(sbi);
struct victim_sel_policy p;
unsigned int secno, last_victim;
unsigned int last_segment = MAIN_SEGS(sbi);
unsigned int nsearched = 0;
mutex_lock(&dirty_i->seglist_lock);
p.alloc_mode = alloc_mode;
select_policy(sbi, gc_type, type, &p);
p.min_segno = NULL_SEGNO;
p.min_cost = get_max_cost(sbi, &p);
if (*result != NULL_SEGNO) {
if (IS_DATASEG(get_seg_entry(sbi, *result)->type) &&
get_valid_blocks(sbi, *result, false) &&
!sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result)))
p.min_segno = *result;
goto out;
}
if (p.max_search == 0)
goto out;
last_victim = sm->last_victim[p.gc_mode];
if (p.alloc_mode == LFS && gc_type == FG_GC) {
p.min_segno = check_bg_victims(sbi);
if (p.min_segno != NULL_SEGNO)
goto got_it;
}
while (1) {
unsigned long cost;
unsigned int segno;
segno = find_next_bit(p.dirty_segmap, last_segment, p.offset);
if (segno >= last_segment) {
if (sm->last_victim[p.gc_mode]) {
last_segment =
sm->last_victim[p.gc_mode];
sm->last_victim[p.gc_mode] = 0;
p.offset = 0;
continue;
}
break;
}
p.offset = segno + p.ofs_unit;
if (p.ofs_unit > 1) {
p.offset -= segno % p.ofs_unit;
nsearched += count_bits(p.dirty_segmap,
p.offset - p.ofs_unit,
p.ofs_unit);
} else {
nsearched++;
}
secno = GET_SEC_FROM_SEG(sbi, segno);
if (sec_usage_check(sbi, secno))
goto next;
if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
goto next;
cost = get_gc_cost(sbi, segno, &p);
if (p.min_cost > cost) {
p.min_segno = segno;
p.min_cost = cost;
}
next:
if (nsearched >= p.max_search) {
if (!sm->last_victim[p.gc_mode] && segno <= last_victim)
sm->last_victim[p.gc_mode] = last_victim + 1;
else
sm->last_victim[p.gc_mode] = segno + 1;
sm->last_victim[p.gc_mode] %= MAIN_SEGS(sbi);
break;
}
}
if (p.min_segno != NULL_SEGNO) {
got_it:
if (p.alloc_mode == LFS) {
secno = GET_SEC_FROM_SEG(sbi, p.min_segno);
if (gc_type == FG_GC)
sbi->cur_victim_sec = secno;
else
set_bit(secno, dirty_i->victim_secmap);
}
*result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
trace_f2fs_get_victim(sbi->sb, type, gc_type, &p,
sbi->cur_victim_sec,
prefree_segments(sbi), free_segments(sbi));
}
out:
mutex_unlock(&dirty_i->seglist_lock);
return (p.min_segno == NULL_SEGNO) ? 0 : 1;
}
static const struct victim_selection default_v_ops = {
.get_victim = get_victim_by_default,
};
static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino)
{
struct inode_entry *ie;
ie = radix_tree_lookup(&gc_list->iroot, ino);
if (ie)
return ie->inode;
return NULL;
}
static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
{
struct inode_entry *new_ie;
if (inode == find_gc_inode(gc_list, inode->i_ino)) {
iput(inode);
return;
}
new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, GFP_NOFS);
new_ie->inode = inode;
f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
list_add_tail(&new_ie->list, &gc_list->ilist);
}
static void put_gc_inode(struct gc_inode_list *gc_list)
{
struct inode_entry *ie, *next_ie;
list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) {
radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
iput(ie->inode);
list_del(&ie->list);
kmem_cache_free(f2fs_inode_entry_slab, ie);
}
}
static int check_valid_map(struct f2fs_sb_info *sbi,
unsigned int segno, int offset)
{
struct sit_info *sit_i = SIT_I(sbi);
struct seg_entry *sentry;
int ret;
down_read(&sit_i->sentry_lock);
sentry = get_seg_entry(sbi, segno);
ret = f2fs_test_bit(offset, sentry->cur_valid_map);
up_read(&sit_i->sentry_lock);
return ret;
}
/*
* This function compares node address got in summary with that in NAT.
* On validity, copy that node with cold status, otherwise (invalid node)
* ignore that.
*/
static void gc_node_segment(struct f2fs_sb_info *sbi,
struct f2fs_summary *sum, unsigned int segno, int gc_type)
{
struct f2fs_summary *entry;
block_t start_addr;
int off;
int phase = 0;
bool fggc = (gc_type == FG_GC);
start_addr = START_BLOCK(sbi, segno);
next_step:
entry = sum;
if (fggc && phase == 2)
atomic_inc(&sbi->wb_sync_req[NODE]);
for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
nid_t nid = le32_to_cpu(entry->nid);
struct page *node_page;
struct node_info ni;
/* stop BG_GC if there is not enough free sections. */
if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
return;
if (check_valid_map(sbi, segno, off) == 0)
continue;
if (phase == 0) {
f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
META_NAT, true);
continue;
}
if (phase == 1) {
f2fs_ra_node_page(sbi, nid);
continue;
}
/* phase == 2 */
node_page = f2fs_get_node_page(sbi, nid);
if (IS_ERR(node_page))
continue;
/* block may become invalid during f2fs_get_node_page */
if (check_valid_map(sbi, segno, off) == 0) {
f2fs_put_page(node_page, 1);
continue;
}
if (f2fs_get_node_info(sbi, nid, &ni)) {
f2fs_put_page(node_page, 1);
continue;
}
if (ni.blk_addr != start_addr + off) {
f2fs_put_page(node_page, 1);
continue;
}
f2fs_move_node_page(node_page, gc_type);
stat_inc_node_blk_count(sbi, 1, gc_type);
}
if (++phase < 3)
goto next_step;
if (fggc)
atomic_dec(&sbi->wb_sync_req[NODE]);
}
/*
* Calculate start block index indicating the given node offset.
* Be careful, caller should give this node offset only indicating direct node
* blocks. If any node offsets, which point the other types of node blocks such
* as indirect or double indirect node blocks, are given, it must be a caller's
* bug.
*/
block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode)
{
unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
unsigned int bidx;
if (node_ofs == 0)
return 0;
if (node_ofs <= 2) {
bidx = node_ofs - 1;
} else if (node_ofs <= indirect_blks) {
int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1);
bidx = node_ofs - 2 - dec;
} else {
int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
bidx = node_ofs - 5 - dec;
}
return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode);
}
static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
struct node_info *dni, block_t blkaddr, unsigned int *nofs)
{
struct page *node_page;
nid_t nid;
unsigned int ofs_in_node;
block_t source_blkaddr;
nid = le32_to_cpu(sum->nid);
ofs_in_node = le16_to_cpu(sum->ofs_in_node);
node_page = f2fs_get_node_page(sbi, nid);
if (IS_ERR(node_page))
return false;
if (f2fs_get_node_info(sbi, nid, dni)) {
f2fs_put_page(node_page, 1);
return false;
}
if (sum->version != dni->version) {
f2fs_msg(sbi->sb, KERN_WARNING,
"%s: valid data with mismatched node version.",
__func__);
set_sbi_flag(sbi, SBI_NEED_FSCK);
}
*nofs = ofs_of_node(node_page);
source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node);
f2fs_put_page(node_page, 1);
if (source_blkaddr != blkaddr)
return false;
return true;
}
static int ra_data_block(struct inode *inode, pgoff_t index)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
struct page *page;
struct extent_info ei = {0, 0, 0};
struct f2fs_io_info fio = {
.sbi = sbi,
.ino = inode->i_ino,
.type = DATA,
.temp = COLD,
.op = REQ_OP_READ,
.op_flags = 0,
.encrypted_page = NULL,
.in_list = false,
.retry = false,
};
int err;
page = f2fs_grab_cache_page(mapping, index, true);
if (!page)
return -ENOMEM;
if (f2fs_lookup_extent_cache(inode, index, &ei)) {
dn.data_blkaddr = ei.blk + index - ei.fofs;
goto got_it;
}
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
if (err)
goto put_page;
f2fs_put_dnode(&dn);
if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
DATA_GENERIC))) {
err = -EFAULT;
goto put_page;
}
got_it:
/* read page */
fio.page = page;
fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(sbi),
dn.data_blkaddr,
FGP_LOCK | FGP_CREAT, GFP_NOFS);
if (!fio.encrypted_page) {
err = -ENOMEM;
goto put_page;
}
err = f2fs_submit_page_bio(&fio);
if (err)
goto put_encrypted_page;
f2fs_put_page(fio.encrypted_page, 0);
f2fs_put_page(page, 1);
return 0;
put_encrypted_page:
f2fs_put_page(fio.encrypted_page, 1);
put_page:
f2fs_put_page(page, 1);
return err;
}
/*
* Move data block via META_MAPPING while keeping locked data page.
* This can be used to move blocks, aka LBAs, directly on disk.
*/
static void move_data_block(struct inode *inode, block_t bidx,
int gc_type, unsigned int segno, int off)
{
struct f2fs_io_info fio = {
.sbi = F2FS_I_SB(inode),
.ino = inode->i_ino,
.type = DATA,
.temp = COLD,
.op = REQ_OP_READ,
.op_flags = 0,
.encrypted_page = NULL,
.in_list = false,
.retry = false,
};
struct dnode_of_data dn;
struct f2fs_summary sum;
struct node_info ni;
struct page *page, *mpage;
block_t newaddr;
int err;
bool lfs_mode = test_opt(fio.sbi, LFS);
/* do not read out */
page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
if (!page)
return;
if (!check_valid_map(F2FS_I_SB(inode), segno, off))
goto out;
if (f2fs_is_atomic_file(inode)) {
F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
goto out;
}
if (f2fs_is_pinned_file(inode)) {
f2fs_pin_file_control(inode, true);
goto out;
}
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
if (err)
goto out;
if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
ClearPageUptodate(page);
goto put_out;
}
/*
* don't cache encrypted data into meta inode until previous dirty
* data were writebacked to avoid racing between GC and flush.
*/
f2fs_wait_on_page_writeback(page, DATA, true);
err = f2fs_get_node_info(fio.sbi, dn.nid, &ni);
if (err)
goto put_out;
set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
/* read page */
fio.page = page;
fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
if (lfs_mode)
down_write(&fio.sbi->io_order_lock);
f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
&sum, CURSEG_COLD_DATA, NULL, false);
fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS);
if (!fio.encrypted_page) {
err = -ENOMEM;
goto recover_block;
}
mpage = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
fio.old_blkaddr, FGP_LOCK, GFP_NOFS);
if (mpage) {
bool updated = false;
if (PageUptodate(mpage)) {
memcpy(page_address(fio.encrypted_page),
page_address(mpage), PAGE_SIZE);
updated = true;
}
f2fs_put_page(mpage, 1);
invalidate_mapping_pages(META_MAPPING(fio.sbi),
fio.old_blkaddr, fio.old_blkaddr);
if (updated)
goto write_page;
}
err = f2fs_submit_page_bio(&fio);
if (err)
goto put_page_out;
/* write page */
lock_page(fio.encrypted_page);
if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) {
err = -EIO;
goto put_page_out;
}
if (unlikely(!PageUptodate(fio.encrypted_page))) {
err = -EIO;
goto put_page_out;
}
write_page:
set_page_dirty(fio.encrypted_page);
f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true);
if (clear_page_dirty_for_io(fio.encrypted_page))
dec_page_count(fio.sbi, F2FS_DIRTY_META);
set_page_writeback(fio.encrypted_page);
ClearPageError(page);
/* allocate block address */
f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
fio.op = REQ_OP_WRITE;
fio.op_flags = REQ_SYNC;
fio.new_blkaddr = newaddr;
f2fs_submit_page_write(&fio);
if (fio.retry) {
if (PageWriteback(fio.encrypted_page))
end_page_writeback(fio.encrypted_page);
goto put_page_out;
}
f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE);
f2fs_update_data_blkaddr(&dn, newaddr);
set_inode_flag(inode, FI_APPEND_WRITE);
if (page->index == 0)
set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
put_page_out:
f2fs_put_page(fio.encrypted_page, 1);
recover_block:
if (lfs_mode)
up_write(&fio.sbi->io_order_lock);
if (err)
f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr,
true, true);
put_out:
f2fs_put_dnode(&dn);
out:
f2fs_put_page(page, 1);
}
static void move_data_page(struct inode *inode, block_t bidx, int gc_type,
unsigned int segno, int off)
{
struct page *page;
page = f2fs_get_lock_data_page(inode, bidx, true);
if (IS_ERR(page))
return;
if (!check_valid_map(F2FS_I_SB(inode), segno, off))
goto out;
if (f2fs_is_atomic_file(inode)) {
F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
goto out;
}
if (f2fs_is_pinned_file(inode)) {
if (gc_type == FG_GC)
f2fs_pin_file_control(inode, true);
goto out;
}
if (gc_type == BG_GC) {
if (PageWriteback(page))
goto out;
set_page_dirty(page);
set_cold_data(page);
} else {
struct f2fs_io_info fio = {
.sbi = F2FS_I_SB(inode),
.ino = inode->i_ino,
.type = DATA,
.temp = COLD,
.op = REQ_OP_WRITE,
.op_flags = REQ_SYNC,
.old_blkaddr = NULL_ADDR,
.page = page,
.encrypted_page = NULL,
.need_lock = LOCK_REQ,
.io_type = FS_GC_DATA_IO,
};
bool is_dirty = PageDirty(page);
int err;
retry:
set_page_dirty(page);
f2fs_wait_on_page_writeback(page, DATA, true);
if (clear_page_dirty_for_io(page)) {
inode_dec_dirty_pages(inode);
f2fs_remove_dirty_inode(inode);
}
set_cold_data(page);
err = f2fs_do_write_data_page(&fio);
if (err) {
clear_cold_data(page);
if (err == -ENOMEM) {
congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry;
}
if (is_dirty)
set_page_dirty(page);
}
}
out:
f2fs_put_page(page, 1);
}
/*
* This function tries to get parent node of victim data block, and identifies
* data block validity. If the block is valid, copy that with cold status and
* modify parent node.
* If the parent node is not valid or the data block address is different,
* the victim data block is ignored.
*/
static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
struct gc_inode_list *gc_list, unsigned int segno, int gc_type)
{
struct super_block *sb = sbi->sb;
struct f2fs_summary *entry;
block_t start_addr;
int off;
int phase = 0;
start_addr = START_BLOCK(sbi, segno);
next_step:
entry = sum;
for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
struct page *data_page;
struct inode *inode;
struct node_info dni; /* dnode info for the data */
unsigned int ofs_in_node, nofs;
block_t start_bidx;
nid_t nid = le32_to_cpu(entry->nid);
/* stop BG_GC if there is not enough free sections. */
if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
return;
if (check_valid_map(sbi, segno, off) == 0)
continue;
if (phase == 0) {
f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
META_NAT, true);
continue;
}
if (phase == 1) {
f2fs_ra_node_page(sbi, nid);
continue;
}
/* Get an inode by ino with checking validity */
if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs))
continue;
if (phase == 2) {
f2fs_ra_node_page(sbi, dni.ino);
continue;
}
ofs_in_node = le16_to_cpu(entry->ofs_in_node);
if (phase == 3) {
inode = f2fs_iget(sb, dni.ino);
if (IS_ERR(inode) || is_bad_inode(inode))
continue;
if (!down_write_trylock(
&F2FS_I(inode)->i_gc_rwsem[WRITE])) {
iput(inode);
sbi->skipped_gc_rwsem++;
continue;
}
start_bidx = f2fs_start_bidx_of_node(nofs, inode) +
ofs_in_node;
if (f2fs_post_read_required(inode)) {
int err = ra_data_block(inode, start_bidx);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (err) {
iput(inode);
continue;
}
add_gc_inode(gc_list, inode);
continue;
}
data_page = f2fs_get_read_data_page(inode,
start_bidx, REQ_RAHEAD, true);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (IS_ERR(data_page)) {
iput(inode);
continue;
}
f2fs_put_page(data_page, 0);
add_gc_inode(gc_list, inode);
continue;
}
/* phase 4 */
inode = find_gc_inode(gc_list, dni.ino);
if (inode) {
struct f2fs_inode_info *fi = F2FS_I(inode);
bool locked = false;
if (S_ISREG(inode->i_mode)) {
if (!down_write_trylock(&fi->i_gc_rwsem[READ]))
continue;
if (!down_write_trylock(
&fi->i_gc_rwsem[WRITE])) {
sbi->skipped_gc_rwsem++;
up_write(&fi->i_gc_rwsem[READ]);
continue;
}
locked = true;
/* wait for all inflight aio data */
inode_dio_wait(inode);
}
start_bidx = f2fs_start_bidx_of_node(nofs, inode)
+ ofs_in_node;
if (f2fs_post_read_required(inode))
move_data_block(inode, start_bidx, gc_type,
segno, off);
else
move_data_page(inode, start_bidx, gc_type,
segno, off);
if (locked) {
up_write(&fi->i_gc_rwsem[WRITE]);
up_write(&fi->i_gc_rwsem[READ]);
}
stat_inc_data_blk_count(sbi, 1, gc_type);
}
}
if (++phase < 5)
goto next_step;
}
static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
int gc_type)
{
struct sit_info *sit_i = SIT_I(sbi);
int ret;
down_write(&sit_i->sentry_lock);
ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type,
NO_CHECK_TYPE, LFS);
up_write(&sit_i->sentry_lock);
return ret;
}
static int do_garbage_collect(struct f2fs_sb_info *sbi,
unsigned int start_segno,
struct gc_inode_list *gc_list, int gc_type)
{
struct page *sum_page;
struct f2fs_summary_block *sum;
struct blk_plug plug;
unsigned int segno = start_segno;
unsigned int end_segno = start_segno + sbi->segs_per_sec;
int seg_freed = 0;
unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
SUM_TYPE_DATA : SUM_TYPE_NODE;
/* readahead multi ssa blocks those have contiguous address */
if (sbi->segs_per_sec > 1)
f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
sbi->segs_per_sec, META_SSA, true);
/* reference all summary page */
while (segno < end_segno) {
sum_page = f2fs_get_sum_page(sbi, segno++);
unlock_page(sum_page);
}
blk_start_plug(&plug);
for (segno = start_segno; segno < end_segno; segno++) {
/* find segment summary of victim */
sum_page = find_get_page(META_MAPPING(sbi),
GET_SUM_BLOCK(sbi, segno));
f2fs_put_page(sum_page, 0);
if (get_valid_blocks(sbi, segno, false) == 0 ||
!PageUptodate(sum_page) ||
unlikely(f2fs_cp_error(sbi)))
goto next;
sum = page_address(sum_page);
if (type != GET_SUM_TYPE((&sum->footer))) {
f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent segment (%u) "
"type [%d, %d] in SSA and SIT",
segno, type, GET_SUM_TYPE((&sum->footer)));
set_sbi_flag(sbi, SBI_NEED_FSCK);
goto next;
}
/*
* this is to avoid deadlock:
* - lock_page(sum_page) - f2fs_replace_block
* - check_valid_map() - down_write(sentry_lock)
* - down_read(sentry_lock) - change_curseg()
* - lock_page(sum_page)
*/
if (type == SUM_TYPE_NODE)
gc_node_segment(sbi, sum->entries, segno, gc_type);
else
gc_data_segment(sbi, sum->entries, gc_list, segno,
gc_type);
stat_inc_seg_count(sbi, type, gc_type);
if (gc_type == FG_GC &&
get_valid_blocks(sbi, segno, false) == 0)
seg_freed++;
next:
f2fs_put_page(sum_page, 0);
}
if (gc_type == FG_GC)
f2fs_submit_merged_write(sbi,
(type == SUM_TYPE_NODE) ? NODE : DATA);
blk_finish_plug(&plug);
stat_inc_call_count(sbi->stat_info);
return seg_freed;
}
int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
bool background, unsigned int segno)
{
int gc_type = sync ? FG_GC : BG_GC;
int sec_freed = 0, seg_freed = 0, total_freed = 0;
int ret = 0;
struct cp_control cpc;
unsigned int init_segno = segno;
struct gc_inode_list gc_list = {
.ilist = LIST_HEAD_INIT(gc_list.ilist),
.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
};
unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC];
unsigned long long first_skipped;
unsigned int skipped_round = 0, round = 0;
trace_f2fs_gc_begin(sbi->sb, sync, background,
get_pages(sbi, F2FS_DIRTY_NODES),
get_pages(sbi, F2FS_DIRTY_DENTS),
get_pages(sbi, F2FS_DIRTY_IMETA),
free_sections(sbi),
free_segments(sbi),
reserved_segments(sbi),
prefree_segments(sbi));
cpc.reason = __get_cp_reason(sbi);
sbi->skipped_gc_rwsem = 0;
first_skipped = last_skipped;
gc_more:
if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) {
ret = -EINVAL;
goto stop;
}
if (unlikely(f2fs_cp_error(sbi))) {
ret = -EIO;
goto stop;
}
if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) {
/*
* For example, if there are many prefree_segments below given
* threshold, we can make them free by checkpoint. Then, we
* secure free segments which doesn't need fggc any more.
*/
if (prefree_segments(sbi)) {
ret = f2fs_write_checkpoint(sbi, &cpc);
if (ret)
goto stop;
}
if (has_not_enough_free_secs(sbi, 0, 0))
gc_type = FG_GC;
}
/* f2fs_balance_fs doesn't need to do BG_GC in critical path. */
if (gc_type == BG_GC && !background) {
ret = -EINVAL;
goto stop;
}
if (!__get_victim(sbi, &segno, gc_type)) {
ret = -ENODATA;
goto stop;
}
seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type);
if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec)
sec_freed++;
total_freed += seg_freed;
if (gc_type == FG_GC) {
if (sbi->skipped_atomic_files[FG_GC] > last_skipped ||
sbi->skipped_gc_rwsem)
skipped_round++;
last_skipped = sbi->skipped_atomic_files[FG_GC];
round++;
}
if (gc_type == FG_GC)
sbi->cur_victim_sec = NULL_SEGNO;
if (sync)
goto stop;
if (has_not_enough_free_secs(sbi, sec_freed, 0)) {
if (skipped_round <= MAX_SKIP_GC_COUNT ||
skipped_round * 2 < round) {
segno = NULL_SEGNO;
goto gc_more;
}
if (first_skipped < last_skipped &&
(last_skipped - first_skipped) >
sbi->skipped_gc_rwsem) {
f2fs_drop_inmem_pages_all(sbi, true);
segno = NULL_SEGNO;
goto gc_more;
}
if (gc_type == FG_GC)
ret = f2fs_write_checkpoint(sbi, &cpc);
}
stop:
SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno;
trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed,
get_pages(sbi, F2FS_DIRTY_NODES),
get_pages(sbi, F2FS_DIRTY_DENTS),
get_pages(sbi, F2FS_DIRTY_IMETA),
free_sections(sbi),
free_segments(sbi),
reserved_segments(sbi),
prefree_segments(sbi));
mutex_unlock(&sbi->gc_mutex);
put_gc_inode(&gc_list);
if (sync)
ret = sec_freed ? 0 : -EAGAIN;
return ret;
}
void f2fs_build_gc_manager(struct f2fs_sb_info *sbi)
{
DIRTY_I(sbi)->v_ops = &default_v_ops;
sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES;
/* give warm/cold data area from slower device */
if (sbi->s_ndevs && sbi->segs_per_sec == 1)
SIT_I(sbi)->last_victim[ALLOC_NEXT] =
GET_SEGNO(sbi, FDEV(0).end_blk) + 1;
}