summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/f2fs.rst11
-rw-r--r--fs/f2fs/checkpoint.c177
-rw-r--r--fs/f2fs/debug.c12
-rw-r--r--fs/f2fs/f2fs.h27
-rw-r--r--fs/f2fs/super.c58
5 files changed, 277 insertions, 8 deletions
diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst
index 5eff4009e77e..f75ec244762f 100644
--- a/Documentation/filesystems/f2fs.rst
+++ b/Documentation/filesystems/f2fs.rst
@@ -247,6 +247,17 @@ checkpoint=%s[:%u[%]] Set to "disable" to turn off checkpointing. Set to "enabl
hide up to all remaining free space. The actual space that
would be unusable can be viewed at /sys/fs/f2fs/<disk>/unusable
This space is reclaimed once checkpoint=enable.
+checkpoint_merge When checkpoint is enabled, this can be used to create a kernel
+ daemon and make it to merge concurrent checkpoint requests as
+ much as possible to eliminate redundant checkpoint issues. Plus,
+ we can eliminate the sluggish issue caused by slow checkpoint
+ operation when the checkpoint is done in a process context in
+ a cgroup having low i/o budget and cpu shares. To make this
+ do better, we set the default i/o priority of the kernel daemon
+ to "3", to give one higher priority than other kernel threads.
+ This is the same way to give a I/O priority to the jbd2
+ journaling thread of ext4 filesystem.
+nocheckpoint_merge Disable checkpoint merge feature.
compress_algorithm=%s Control compress algorithm, currently f2fs supports "lzo",
"lz4", "zstd" and "lzo-rle" algorithm.
compress_algorithm=%s:%d Control compress algorithm and its compress level, now, only
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 8c79ba0566b1..6f6ecba8f920 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -13,12 +13,15 @@
#include <linux/f2fs_fs.h>
#include <linux/pagevec.h>
#include <linux/swap.h>
+#include <linux/kthread.h>
#include "f2fs.h"
#include "node.h"
#include "segment.h"
#include <trace/events/f2fs.h>
+#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+
static struct kmem_cache *ino_entry_slab;
struct kmem_cache *f2fs_inode_entry_slab;
@@ -1704,3 +1707,177 @@ void f2fs_destroy_checkpoint_caches(void)
kmem_cache_destroy(ino_entry_slab);
kmem_cache_destroy(f2fs_inode_entry_slab);
}
+
+static int __write_checkpoint_sync(struct f2fs_sb_info *sbi)
+{
+ struct cp_control cpc = { .reason = CP_SYNC, };
+ int err;
+
+ down_write(&sbi->gc_lock);
+ err = f2fs_write_checkpoint(sbi, &cpc);
+ up_write(&sbi->gc_lock);
+
+ return err;
+}
+
+static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi)
+{
+ struct ckpt_req_control *cprc = &sbi->cprc_info;
+ struct ckpt_req *req, *next;
+ struct llist_node *dispatch_list;
+ u64 sum_diff = 0, diff, count = 0;
+ int ret;
+
+ dispatch_list = llist_del_all(&cprc->issue_list);
+ if (!dispatch_list)
+ return;
+ dispatch_list = llist_reverse_order(dispatch_list);
+
+ ret = __write_checkpoint_sync(sbi);
+ atomic_inc(&cprc->issued_ckpt);
+
+ llist_for_each_entry_safe(req, next, dispatch_list, llnode) {
+ diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time);
+ req->ret = ret;
+ complete(&req->wait);
+
+ sum_diff += diff;
+ count++;
+ }
+ atomic_sub(count, &cprc->queued_ckpt);
+ atomic_add(count, &cprc->total_ckpt);
+
+ spin_lock(&cprc->stat_lock);
+ cprc->cur_time = (unsigned int)div64_u64(sum_diff, count);
+ if (cprc->peak_time < cprc->cur_time)
+ cprc->peak_time = cprc->cur_time;
+ spin_unlock(&cprc->stat_lock);
+}
+
+static int issue_checkpoint_thread(void *data)
+{
+ struct f2fs_sb_info *sbi = data;
+ struct ckpt_req_control *cprc = &sbi->cprc_info;
+ wait_queue_head_t *q = &cprc->ckpt_wait_queue;
+repeat:
+ if (kthread_should_stop())
+ return 0;
+
+ sb_start_intwrite(sbi->sb);
+
+ if (!llist_empty(&cprc->issue_list))
+ __checkpoint_and_complete_reqs(sbi);
+
+ sb_end_intwrite(sbi->sb);
+
+ wait_event_interruptible(*q,
+ kthread_should_stop() || !llist_empty(&cprc->issue_list));
+ goto repeat;
+}
+
+static void flush_remained_ckpt_reqs(struct f2fs_sb_info *sbi,
+ struct ckpt_req *wait_req)
+{
+ struct ckpt_req_control *cprc = &sbi->cprc_info;
+
+ if (!llist_empty(&cprc->issue_list)) {
+ __checkpoint_and_complete_reqs(sbi);
+ } else {
+ /* already dispatched by issue_checkpoint_thread */
+ if (wait_req)
+ wait_for_completion(&wait_req->wait);
+ }
+}
+
+static void init_ckpt_req(struct ckpt_req *req)
+{
+ memset(req, 0, sizeof(struct ckpt_req));
+
+ init_completion(&req->wait);
+ req->queue_time = ktime_get();
+}
+
+int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi)
+{
+ struct ckpt_req_control *cprc = &sbi->cprc_info;
+ struct ckpt_req req;
+ struct cp_control cpc;
+
+ cpc.reason = __get_cp_reason(sbi);
+ if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) {
+ int ret;
+
+ down_write(&sbi->gc_lock);
+ ret = f2fs_write_checkpoint(sbi, &cpc);
+ up_write(&sbi->gc_lock);
+
+ return ret;
+ }
+
+ if (!cprc->f2fs_issue_ckpt)
+ return __write_checkpoint_sync(sbi);
+
+ init_ckpt_req(&req);
+
+ llist_add(&req.llnode, &cprc->issue_list);
+ atomic_inc(&cprc->queued_ckpt);
+
+ /* update issue_list before we wake up issue_checkpoint thread */
+ smp_mb();
+
+ if (waitqueue_active(&cprc->ckpt_wait_queue))
+ wake_up(&cprc->ckpt_wait_queue);
+
+ if (cprc->f2fs_issue_ckpt)
+ wait_for_completion(&req.wait);
+ else
+ flush_remained_ckpt_reqs(sbi, &req);
+
+ return req.ret;
+}
+
+int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi)
+{
+ dev_t dev = sbi->sb->s_bdev->bd_dev;
+ struct ckpt_req_control *cprc = &sbi->cprc_info;
+
+ if (cprc->f2fs_issue_ckpt)
+ return 0;
+
+ cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi,
+ "f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev));
+ if (IS_ERR(cprc->f2fs_issue_ckpt)) {
+ cprc->f2fs_issue_ckpt = NULL;
+ return -ENOMEM;
+ }
+
+ set_task_ioprio(cprc->f2fs_issue_ckpt, DEFAULT_CHECKPOINT_IOPRIO);
+
+ return 0;
+}
+
+void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi)
+{
+ struct ckpt_req_control *cprc = &sbi->cprc_info;
+
+ if (cprc->f2fs_issue_ckpt) {
+ struct task_struct *ckpt_task = cprc->f2fs_issue_ckpt;
+
+ cprc->f2fs_issue_ckpt = NULL;
+ kthread_stop(ckpt_task);
+
+ flush_remained_ckpt_reqs(sbi, NULL);
+ }
+}
+
+void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi)
+{
+ struct ckpt_req_control *cprc = &sbi->cprc_info;
+
+ atomic_set(&cprc->issued_ckpt, 0);
+ atomic_set(&cprc->total_ckpt, 0);
+ atomic_set(&cprc->queued_ckpt, 0);
+ init_waitqueue_head(&cprc->ckpt_wait_queue);
+ init_llist_head(&cprc->issue_list);
+ spin_lock_init(&cprc->stat_lock);
+}
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 197c914119da..91855d5721cd 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -120,6 +120,13 @@ static void update_general_status(struct f2fs_sb_info *sbi)
atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt);
si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks;
}
+ si->nr_issued_ckpt = atomic_read(&sbi->cprc_info.issued_ckpt);
+ si->nr_total_ckpt = atomic_read(&sbi->cprc_info.total_ckpt);
+ si->nr_queued_ckpt = atomic_read(&sbi->cprc_info.queued_ckpt);
+ spin_lock(&sbi->cprc_info.stat_lock);
+ si->cur_ckpt_time = sbi->cprc_info.cur_time;
+ si->peak_ckpt_time = sbi->cprc_info.peak_time;
+ spin_unlock(&sbi->cprc_info.stat_lock);
si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
si->rsvd_segs = reserved_segments(sbi);
si->overp_segs = overprovision_segments(sbi);
@@ -417,6 +424,11 @@ static int stat_show(struct seq_file *s, void *v)
si->meta_count[META_NAT]);
seq_printf(s, " - ssa blocks : %u\n",
si->meta_count[META_SSA]);
+ seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, "
+ "Cur time: %4d(ms), Peak time: %4d(ms))\n",
+ si->nr_queued_ckpt, si->nr_issued_ckpt,
+ si->nr_total_ckpt, si->cur_ckpt_time,
+ si->peak_ckpt_time);
seq_printf(s, "GC calls: %d (BG: %d)\n",
si->call_count, si->bg_gc);
seq_printf(s, " - data segments : %d (%d)\n",
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index a2e520a13630..f7536aca8a31 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -97,6 +97,7 @@ extern const char *f2fs_fault_name[FAULT_MAX];
#define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000
#define F2FS_MOUNT_NORECOVERY 0x04000000
#define F2FS_MOUNT_ATGC 0x08000000
+#define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000
#define F2FS_OPTION(sbi) ((sbi)->mount_opt)
#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@@ -267,6 +268,25 @@ struct fsync_node_entry {
unsigned int seq_id; /* sequence id */
};
+struct ckpt_req {
+ struct completion wait; /* completion for checkpoint done */
+ struct llist_node llnode; /* llist_node to be linked in wait queue */
+ int ret; /* return code of checkpoint */
+ ktime_t queue_time; /* request queued time */
+};
+
+struct ckpt_req_control {
+ struct task_struct *f2fs_issue_ckpt; /* checkpoint task */
+ wait_queue_head_t ckpt_wait_queue; /* waiting queue for wake-up */
+ atomic_t issued_ckpt; /* # of actually issued ckpts */
+ atomic_t total_ckpt; /* # of total ckpts */
+ atomic_t queued_ckpt; /* # of queued ckpts */
+ struct llist_head issue_list; /* list for command issue */
+ spinlock_t stat_lock; /* lock for below checkpoint time stats */
+ unsigned int cur_time; /* cur wait time in msec for currently issued checkpoint */
+ unsigned int peak_time; /* peak wait time in msec until now */
+};
+
/* for the bitmap indicate blocks to be discarded */
struct discard_entry {
struct list_head list; /* list head */
@@ -1433,6 +1453,7 @@ struct f2fs_sb_info {
wait_queue_head_t cp_wait;
unsigned long last_time[MAX_TIME]; /* to store time in jiffies */
long interval_time[MAX_TIME]; /* to store thresholds */
+ struct ckpt_req_control cprc_info; /* for checkpoint request control */
struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */
@@ -3450,6 +3471,10 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc);
void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi);
int __init f2fs_create_checkpoint_caches(void);
void f2fs_destroy_checkpoint_caches(void);
+int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi);
+int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi);
+void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi);
+void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi);
/*
* data.c
@@ -3562,6 +3587,8 @@ struct f2fs_stat_info {
int nr_discarding, nr_discarded;
int nr_discard_cmd;
unsigned int undiscard_blks;
+ int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt;
+ unsigned int cur_ckpt_time, peak_ckpt_time;
int inline_xattr, inline_inode, inline_dir, append, update, orphans;
int compr_inode;
unsigned long long compr_blocks;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 429bc00af440..1000d21120ca 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -144,6 +144,8 @@ enum {
Opt_checkpoint_disable_cap,
Opt_checkpoint_disable_cap_perc,
Opt_checkpoint_enable,
+ Opt_checkpoint_merge,
+ Opt_nocheckpoint_merge,
Opt_compress_algorithm,
Opt_compress_log_size,
Opt_compress_extension,
@@ -214,6 +216,8 @@ static match_table_t f2fs_tokens = {
{Opt_checkpoint_disable_cap, "checkpoint=disable:%u"},
{Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"},
{Opt_checkpoint_enable, "checkpoint=enable"},
+ {Opt_checkpoint_merge, "checkpoint_merge"},
+ {Opt_nocheckpoint_merge, "nocheckpoint_merge"},
{Opt_compress_algorithm, "compress_algorithm=%s"},
{Opt_compress_log_size, "compress_log_size=%u"},
{Opt_compress_extension, "compress_extension=%s"},
@@ -941,6 +945,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
case Opt_checkpoint_enable:
clear_opt(sbi, DISABLE_CHECKPOINT);
break;
+ case Opt_checkpoint_merge:
+ set_opt(sbi, MERGE_CHECKPOINT);
+ break;
+ case Opt_nocheckpoint_merge:
+ clear_opt(sbi, MERGE_CHECKPOINT);
+ break;
#ifdef CONFIG_F2FS_FS_COMPRESSION
case Opt_compress_algorithm:
if (!f2fs_sb_has_compression(sbi)) {
@@ -1341,6 +1351,12 @@ static void f2fs_put_super(struct super_block *sb)
mutex_lock(&sbi->umount_mutex);
/*
+ * flush all issued checkpoints and stop checkpoint issue thread.
+ * after then, all checkpoints should be done by each process context.
+ */
+ f2fs_stop_ckpt_thread(sbi);
+
+ /*
* We don't need to do checkpoint when superblock is clean.
* But, the previous checkpoint was not done by umount, it needs to do
* clean checkpoint again.
@@ -1438,15 +1454,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
return -EAGAIN;
- if (sync) {
- struct cp_control cpc;
-
- cpc.reason = __get_cp_reason(sbi);
+ if (sync)
+ err = f2fs_issue_checkpoint(sbi);
- down_write(&sbi->gc_lock);
- err = f2fs_write_checkpoint(sbi, &cpc);
- up_write(&sbi->gc_lock);
- }
return err;
}
@@ -1770,6 +1780,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
if (test_opt(sbi, DISABLE_CHECKPOINT))
seq_printf(seq, ",checkpoint=disable:%u",
F2FS_OPTION(sbi).unusable_cap);
+ if (test_opt(sbi, MERGE_CHECKPOINT))
+ seq_puts(seq, ",checkpoint_merge");
+ else
+ seq_puts(seq, ",nocheckpoint_merge");
if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_POSIX)
seq_printf(seq, ",fsync_mode=%s", "posix");
else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT)
@@ -2053,6 +2067,19 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
}
}
+ if (!test_opt(sbi, DISABLE_CHECKPOINT) &&
+ test_opt(sbi, MERGE_CHECKPOINT)) {
+ err = f2fs_start_ckpt_thread(sbi);
+ if (err) {
+ f2fs_err(sbi,
+ "Failed to start F2FS issue_checkpoint_thread (%d)",
+ err);
+ goto restore_gc;
+ }
+ } else {
+ f2fs_stop_ckpt_thread(sbi);
+ }
+
/*
* We stop issue flush thread if FS is mounted as RO
* or if flush_merge is not passed in mount option.
@@ -3804,6 +3831,19 @@ try_onemore:
f2fs_init_fsync_node_info(sbi);
+ /* setup checkpoint request control and start checkpoint issue thread */
+ f2fs_init_ckpt_req_control(sbi);
+ if (!test_opt(sbi, DISABLE_CHECKPOINT) &&
+ test_opt(sbi, MERGE_CHECKPOINT)) {
+ err = f2fs_start_ckpt_thread(sbi);
+ if (err) {
+ f2fs_err(sbi,
+ "Failed to start F2FS issue_checkpoint_thread (%d)",
+ err);
+ goto stop_ckpt_thread;
+ }
+ }
+
/* setup f2fs internal modules */
err = f2fs_build_segment_manager(sbi);
if (err) {
@@ -4013,6 +4053,8 @@ free_nm:
free_sm:
f2fs_destroy_segment_manager(sbi);
f2fs_destroy_post_read_wq(sbi);
+stop_ckpt_thread:
+ f2fs_stop_ckpt_thread(sbi);
free_devices:
destroy_device_list(sbi);
kvfree(sbi->ckpt);