linux/fs/ocfs2/suballoc.h
Heming Zhao 4eb7b93e03 ocfs2: improve write IO performance when fragmentation is high
The group_search function ocfs2_cluster_group_search() should
bypass groups with insufficient space to avoid unnecessary
searches.

This patch is particularly useful when ocfs2 is handling huge
number small files, and volume fragmentation is very high.
In this case, ocfs2 is busy with looking up available la window
from //global_bitmap.

This patch introduces a new member in the Group Description (gd)
struct called 'bg_contig_free_bits', representing the max
contigous free bits in this gd. When ocfs2 allocates a new
la window from //global_bitmap, 'bg_contig_free_bits' helps
expedite the search process.

Let's image below path.

1. la state (->local_alloc_state) is set THROTTLED or DISABLED.

2. when user delete a large file and trigger
   ocfs2_local_alloc_seen_free_bits set osb->local_alloc_state
   unconditionally.

3. a write IOs thread run and trigger the worst performance path

```
ocfs2_reserve_clusters_with_limit
 ocfs2_reserve_local_alloc_bits
  ocfs2_local_alloc_slide_window //[1]
   + ocfs2_local_alloc_reserve_for_window //[2]
   + ocfs2_local_alloc_new_window //[3]
      ocfs2_recalc_la_window
```

[1]:
will be called when la window bits used up.

[2]:
under la state is ENABLED, and this func only check global_bitmap
free bits, it will succeed in general.

[3]:
will use the default la window size to search clusters then fail.
ocfs2_recalc_la_window attempts other la window sizes.
the timing complexity is O(n^4), resulting in a significant time
cost for scanning global bitmap. This leads to a dramatic slowdown
in write I/Os (e.g., user space 'dd').

i.e.
an ocfs2 partition size: 1.45TB, cluster size: 4KB,
la window default size: 106MB.
The partition is fragmentation by creating & deleting huge mount of
small files.

before this patch, the timing of [3] should be
(the number got from real world):
- la window size change order (size: MB):
  106, 53, 26.5, 13, 6.5, 3.25, 1.6, 0.8
  only 0.8MB succeed, 0.8MB also triggers la window to disable.
  ocfs2_local_alloc_new_window retries 8 times, first 7 times totally
  runs in worst case.
- group chain number: 242
  ocfs2_claim_suballoc_bits calls for-loop 242 times
- each chain has 49 block group
  ocfs2_search_chain calls while-loop 49 times
- each bg has 32256 blocks
  ocfs2_block_group_find_clear_bits calls while-loop for 32256 bits.
  for ocfs2_find_next_zero_bit uses ffz() to find zero bit, let's use
  (32256/64) (this is not worst value) for timing calucation.

the loop times: 7*242*49*(32256/64) = 41835024 (~42 million times)

In the worst case, user space writes 1MB data will trigger 42M scanning
times.

under this patch, the timing is '7*242*49 = 83006', reduced by three
orders of magnitude.

Link: https://lkml.kernel.org/r/20240328125203.20892-2-heming.zhao@suse.com
Signed-off-by: Heming Zhao <heming.zhao@suse.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Gang He <ghe@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Jun Piao <piaojun@huawei.com>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Mark Fasheh <mark@fasheh.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-04-25 21:07:03 -07:00

226 lines
6.9 KiB
C

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* suballoc.h
*
* Defines sub allocator api
*
* Copyright (C) 2003, 2004 Oracle. All rights reserved.
*/
#ifndef _CHAINALLOC_H_
#define _CHAINALLOC_H_
struct ocfs2_suballoc_result;
typedef int (group_search_t)(struct inode *,
struct buffer_head *,
u32, /* bits_wanted */
u32, /* min_bits */
u64, /* max_block */
struct ocfs2_suballoc_result *);
/* found bits */
struct ocfs2_alloc_context {
struct inode *ac_inode; /* which bitmap are we allocating from? */
struct buffer_head *ac_bh; /* file entry bh */
u32 ac_alloc_slot; /* which slot are we allocating from? */
u32 ac_bits_wanted;
u32 ac_bits_given;
#define OCFS2_AC_USE_LOCAL 1
#define OCFS2_AC_USE_MAIN 2
#define OCFS2_AC_USE_INODE 3
#define OCFS2_AC_USE_META 4
u32 ac_which;
/* these are used by the chain search */
u16 ac_chain;
int ac_disable_chain_relink;
group_search_t *ac_group_search;
u64 ac_last_group;
u64 ac_max_block; /* Highest block number to allocate. 0 is
the same as ~0 - unlimited */
int ac_find_loc_only; /* hack for reflink operation ordering */
struct ocfs2_suballoc_result *ac_find_loc_priv; /* */
struct ocfs2_alloc_reservation *ac_resv;
};
void ocfs2_init_steal_slots(struct ocfs2_super *osb);
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
{
return ac->ac_bits_wanted - ac->ac_bits_given;
}
/*
* Please note that the caller must make sure that root_el is the root
* of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
* the result may be wrong.
*/
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
struct ocfs2_extent_list *root_el,
struct ocfs2_alloc_context **ac);
int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
int blocks,
struct ocfs2_alloc_context **ac);
int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
struct ocfs2_alloc_context **ac);
int ocfs2_reserve_clusters(struct ocfs2_super *osb,
u32 bits_wanted,
struct ocfs2_alloc_context **ac);
int ocfs2_alloc_dinode_update_counts(struct inode *inode,
handle_t *handle,
struct buffer_head *di_bh,
u32 num_bits,
u16 chain);
void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
struct buffer_head *di_bh,
u32 num_bits,
u16 chain);
u16 ocfs2_find_max_contig_free_bits(void *bitmap,
u16 total_bits, u16 start);
int ocfs2_block_group_set_bits(handle_t *handle,
struct inode *alloc_inode,
struct ocfs2_group_desc *bg,
struct buffer_head *group_bh,
unsigned int bit_off,
unsigned int num_bits,
unsigned int max_contig_bits,
int fastpath);
int ocfs2_claim_metadata(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 bits_wanted,
u64 *suballoc_loc,
u16 *suballoc_bit_start,
u32 *num_bits,
u64 *blkno_start);
int ocfs2_claim_new_inode(handle_t *handle,
struct inode *dir,
struct buffer_head *parent_fe_bh,
struct ocfs2_alloc_context *ac,
u64 *suballoc_loc,
u16 *suballoc_bit,
u64 *fe_blkno);
int ocfs2_claim_clusters(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 min_clusters,
u32 *cluster_start,
u32 *num_clusters);
/*
* Use this variant of ocfs2_claim_clusters to specify a maximum
* number of clusters smaller than the allocation reserved.
*/
int __ocfs2_claim_clusters(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 min_clusters,
u32 max_clusters,
u32 *cluster_start,
u32 *num_clusters);
int ocfs2_free_suballoc_bits(handle_t *handle,
struct inode *alloc_inode,
struct buffer_head *alloc_bh,
unsigned int start_bit,
u64 bg_blkno,
unsigned int count);
int ocfs2_free_dinode(handle_t *handle,
struct inode *inode_alloc_inode,
struct buffer_head *inode_alloc_bh,
struct ocfs2_dinode *di);
int ocfs2_free_clusters(handle_t *handle,
struct inode *bitmap_inode,
struct buffer_head *bitmap_bh,
u64 start_blk,
unsigned int num_clusters);
int ocfs2_release_clusters(handle_t *handle,
struct inode *bitmap_inode,
struct buffer_head *bitmap_bh,
u64 start_blk,
unsigned int num_clusters);
static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
{
u64 group = block - (u64) bit;
return group;
}
static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb,
u64 bg_blkno)
{
/* This should work for all block group descriptors as only
* the 1st group descriptor of the cluster bitmap is
* different. */
if (bg_blkno == osb->first_cluster_group_blkno)
return 0;
/* the rest of the block groups are located at the beginning
* of their 1st cluster, so a direct translation just
* works. */
return ocfs2_blocks_to_clusters(osb->sb, bg_blkno);
}
static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
return osb->bitmap_blkno == OCFS2_I(inode)->ip_blkno;
}
/* This is for local alloc ONLY. Others should use the task-specific
* apis above. */
int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
struct ocfs2_alloc_context *ac);
void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
/* given a cluster offset, calculate which block group it belongs to
* and return that block offset. */
u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
/*
* By default, ocfs2_read_group_descriptor() calls ocfs2_error() when it
* finds a problem. A caller that wants to check a group descriptor
* without going readonly should read the block with ocfs2_read_block[s]()
* and then checking it with this function. This is only resize, really.
* Everyone else should be using ocfs2_read_group_descriptor().
*/
int ocfs2_check_group_descriptor(struct super_block *sb,
struct ocfs2_dinode *di,
struct buffer_head *bh);
/*
* Read a group descriptor block into *bh. If *bh is NULL, a bh will be
* allocated. This is a cached read. The descriptor will be validated with
* ocfs2_validate_group_descriptor().
*/
int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
u64 gd_blkno, struct buffer_head **bh);
int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
u32 clusters_to_add, u32 extents_to_split,
struct ocfs2_alloc_context **data_ac,
struct ocfs2_alloc_context **meta_ac);
int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
/*
* The following two interfaces are for ocfs2_create_inode_in_orphan().
*/
int ocfs2_find_new_inode_loc(struct inode *dir,
struct buffer_head *parent_fe_bh,
struct ocfs2_alloc_context *ac,
u64 *fe_blkno);
int ocfs2_claim_new_inode_at_loc(handle_t *handle,
struct inode *dir,
struct ocfs2_alloc_context *ac,
u64 *suballoc_loc,
u16 *suballoc_bit,
u64 di_blkno);
#endif /* _CHAINALLOC_H_ */