From 8e1a4857cd92e32e642b3e7184c7f6bf85c96e2e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 6 Jan 2009 14:53:06 -0500 Subject: Update Documentation/filesystems/ext4.txt Fix paragraph with recommendations on how to tune ext4 for benchmarks. Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 42 +++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 10 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 174eaff7ded9..f75ab101c00a 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -58,13 +58,22 @@ Note: More extensive information for getting started with ext4 can be # mount -t ext4 /dev/hda1 /wherever - - When comparing performance with other filesystems, remember that - ext3/4 by default offers higher data integrity guarantees than most. - So when comparing with a metadata-only journalling filesystem, such - as ext3, use `mount -o data=writeback'. And you might as well use - `mount -o nobh' too along with it. Making the journal larger than - the mke2fs default often helps performance with metadata-intensive - workloads. + - When comparing performance with other filesystems, it's always + important to try multiple workloads; very often a subtle change in a + workload parameter can completely change the ranking of which + filesystems do well compared to others. When comparing versus ext3, + note that ext4 enables write barriers by default, while ext3 does + not enable write barriers by default. So it is useful to use + explicitly specify whether barriers are enabled or not when via the + '-o barriers=[0|1]' mount option for both ext3 and ext4 filesystems + for a fair comparison. When tuning ext3 for best benchmark numbers, + it is often worthwhile to try changing the data journaling mode; '-o + data=writeback,nobh' can be faster for some workloads. (Note + however that running mounted with data=writeback can potentially + leave stale data exposed in recently written files in case of an + unclean shutdown, which could be a security exposure in some + situations.) Configuring the filesystem with a large journal can + also be helpful for metadata-intensive workloads. 2. Features =========== @@ -74,7 +83,7 @@ Note: More extensive information for getting started with ext4 can be * ability to use filesystems > 16TB (e2fsprogs support not available yet) * extent format reduces metadata overhead (RAM, IO for access, transactions) * extent format more robust in face of on-disk corruption due to magics, -* internal redunancy in tree +* internal redundancy in tree * improved file allocation (multi-block alloc) * fix 32000 subdirectory limit * nsec timestamps for mtime, atime, ctime, create time @@ -116,6 +125,12 @@ grouping of bitmaps and inode tables. Some test results available here: When mounting an ext4 filesystem, the following option are accepted: (*) == default +ro Mount filesystem read only. Note that ext4 will + replay the journal (and thus write to the + partition) even when mounted "read only". The + mount options "ro,noload" can be used to prevent + writes to the filesystem. + extents (*) ext4 will use extents to address file data. The file system will no longer be mountable by ext3. @@ -144,7 +159,11 @@ journal_dev=devnum When the external journal device's major/minor numbers identified through its new major/minor numbers encoded in devnum. -noload Don't load the journal on mounting. +noload Don't load the journal on mounting. Note that + if the filesystem was not unmounted cleanly, + skipping the journal replay will lead to the + filesystem containing inconsistencies that can + lead to any number of problems. data=journal All data are committed into the journal prior to being written into the main file system. @@ -219,9 +238,12 @@ minixdf Make 'df' act like Minix. debug Extra debugging information is sent to syslog. -errors=remount-ro(*) Remount the filesystem read-only on an error. +errors=remount-ro Remount the filesystem read-only on an error. errors=continue Keep going on a filesystem error. errors=panic Panic and halt the machine if an error occurs. + (These mount options override the errors behavior + specified in the superblock, which can be configured + using tune2fs) data_err=ignore(*) Just print an error message if an error occurs in a file data buffer in ordered mode. -- cgit v1.2.3 From 30773840c19cea60dcef39545960d541b1ac1cf8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 3 Jan 2009 20:27:38 -0500 Subject: ext4: add fsync batch tuning knobs Add new mount options, min_batch_time and max_batch_time, which controls how long the jbd2 layer should wait for additional filesystem operations to get batched with a synchronous write transaction. Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 29 +++++++++++++++++++++++ fs/ext4/ext4.h | 7 ++++++ fs/ext4/ext4_sb.h | 2 ++ fs/ext4/super.c | 47 ++++++++++++++++++++++++++++++++------ fs/jbd2/journal.c | 2 ++ fs/jbd2/transaction.c | 4 +++- include/linux/jbd2.h | 8 +++++++ 7 files changed, 91 insertions(+), 8 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index f75ab101c00a..e3fcbea3ec8c 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -283,6 +283,35 @@ delalloc (*) Deferring block allocation until write-out time. nodelalloc Disable delayed allocation. Blocks are allocation when data is copied from user to page cache. +max_batch_time=usec Maximum amount of time ext4 should wait for + additional filesystem operations to be batch + together with a synchronous write operation. + Since a synchronous write operation is going to + force a commit and then a wait for the I/O + complete, it doesn't cost much, and can be a + huge throughput win, we wait for a small amount + of time to see if any other transactions can + piggyback on the synchronous write. The + algorithm used is designed to automatically tune + for the speed of the disk, by measuring the + amount of time (on average) that it takes to + finish committing a transaction. Call this time + the "commit time". If the time that the + transactoin has been running is less than the + commit time, ext4 will try sleeping for the + commit time to see if other operations will join + the transaction. The commit time is capped by + the max_batch_time, which defaults to 15000us + (15ms). This optimization can be turned off + entirely by setting max_batch_time to 0. + +min_batch_time=usec This parameter sets the commit time (as + described above) to be at least min_batch_time. + It defaults to zero microseconds. Increasing + this parameter may improve the throughput of + multi-threaded, synchronous workloads on very + fast disks, at the cost of increasing latency. + Data Mode ========= There are 3 different data modes: diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ac8551e0b70a..9ba9fd6d14da 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -328,6 +328,7 @@ struct ext4_mount_options { uid_t s_resuid; gid_t s_resgid; unsigned long s_commit_interval; + u32 s_min_batch_time, s_max_batch_time; #ifdef CONFIG_QUOTA int s_jquota_fmt; char *s_qf_names[MAXQUOTAS]; @@ -805,6 +806,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) #define EXT4_DEFM_JMODE_ORDERED 0x0040 #define EXT4_DEFM_JMODE_WBACK 0x0060 +/* + * Default journal batch times + */ +#define EXT4_DEF_MIN_BATCH_TIME 0 +#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ + /* * Structure of a directory entry */ diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 3db800f399a6..039b6ea1a042 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -74,6 +74,8 @@ struct ext4_sb_info { struct journal_s *s_journal; struct list_head s_orphan; unsigned long s_commit_interval; + u32 s_max_batch_time; + u32 s_min_batch_time; struct block_device *journal_bdev; #ifdef CONFIG_JBD2_DEBUG struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dc27d4c613c0..da377f9521bb 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -705,10 +705,19 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) #endif if (!test_opt(sb, RESERVATION)) seq_puts(seq, ",noreservation"); - if (sbi->s_commit_interval) { + if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { seq_printf(seq, ",commit=%u", (unsigned) (sbi->s_commit_interval / HZ)); } + if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) { + seq_printf(seq, ",min_batch_time=%u", + (unsigned) sbi->s_min_batch_time); + } + if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) { + seq_printf(seq, ",max_batch_time=%u", + (unsigned) sbi->s_min_batch_time); + } + /* * We're changing the default of barrier mount option, so * let's always display its mount state so it's clear what its @@ -874,7 +883,8 @@ enum { Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, - Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_commit, Opt_min_batch_time, Opt_max_batch_time, + Opt_journal_update, Opt_journal_inum, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, @@ -913,6 +923,8 @@ static const match_table_t tokens = { {Opt_nobh, "nobh"}, {Opt_bh, "bh"}, {Opt_commit, "commit=%u"}, + {Opt_min_batch_time, "min_batch_time=%u"}, + {Opt_max_batch_time, "max_batch_time=%u"}, {Opt_journal_update, "journal=update"}, {Opt_journal_inum, "journal=%u"}, {Opt_journal_dev, "journal_dev=%u"}, @@ -1131,6 +1143,22 @@ static int parse_options(char *options, struct super_block *sb, option = JBD2_DEFAULT_MAX_COMMIT_AGE; sbi->s_commit_interval = HZ * option; break; + case Opt_max_batch_time: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + if (option == 0) + option = EXT4_DEF_MAX_BATCH_TIME; + sbi->s_max_batch_time = option; + break; + case Opt_min_batch_time: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + sbi->s_min_batch_time = option; + break; case Opt_data_journal: data_opt = EXT4_MOUNT_JOURNAL_DATA; goto datacheck; @@ -1979,6 +2007,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_resuid = le16_to_cpu(es->s_def_resuid); sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; + sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; + sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; set_opt(sbi->s_mount_opt, RESERVATION); set_opt(sbi->s_mount_opt, BARRIER); @@ -2524,11 +2555,9 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) { struct ext4_sb_info *sbi = EXT4_SB(sb); - if (sbi->s_commit_interval) - journal->j_commit_interval = sbi->s_commit_interval; - /* We could also set up an ext4-specific default for the commit - * interval here, but for now we'll just fall back to the jbd - * default. */ + journal->j_commit_interval = sbi->s_commit_interval; + journal->j_min_batch_time = sbi->s_min_batch_time; + journal->j_max_batch_time = sbi->s_max_batch_time; spin_lock(&journal->j_state_lock); if (test_opt(sb, BARRIER)) @@ -3042,6 +3071,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) old_opts.s_resuid = sbi->s_resuid; old_opts.s_resgid = sbi->s_resgid; old_opts.s_commit_interval = sbi->s_commit_interval; + old_opts.s_min_batch_time = sbi->s_min_batch_time; + old_opts.s_max_batch_time = sbi->s_max_batch_time; #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) @@ -3178,6 +3209,8 @@ restore_opts: sbi->s_resuid = old_opts.s_resuid; sbi->s_resgid = old_opts.s_resgid; sbi->s_commit_interval = old_opts.s_commit_interval; + sbi->s_min_batch_time = old_opts.s_min_batch_time; + sbi->s_max_batch_time = old_opts.s_max_batch_time; #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 74d87290381c..fd1d7557a098 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -964,6 +964,8 @@ static journal_t * journal_init_common (void) spin_lock_init(&journal->j_state_lock); journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); + journal->j_min_batch_time = 0; + journal->j_max_batch_time = 15000; /* 15ms */ /* The journal is marked for error until we succeed with recovery! */ journal->j_flags = JBD2_ABORT; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 13dcbc990f41..48c21bac5a56 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1255,8 +1255,10 @@ int jbd2_journal_stop(handle_t *handle) trans_time = ktime_to_ns(ktime_sub(ktime_get(), transaction->t_start_time)); + commit_time = max_t(u64, commit_time, + 1000*journal->j_min_batch_time); commit_time = min_t(u64, commit_time, - 1000*jiffies_to_usecs(1)); + 1000*journal->j_max_batch_time); if (trans_time < commit_time) { ktime_t expires = ktime_add_ns(ktime_get(), diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index ab8cef130c28..a3cd647ea1bc 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -956,6 +956,14 @@ struct journal_s */ u64 j_average_commit_time; + /* + * minimum and maximum times that we should wait for + * additional filesystem operations to get batched into a + * synchronous handle in microseconds + */ + u32 j_min_batch_time; + u32 j_max_batch_time; + /* This function is called when a transaction is closed */ void (*j_commit_callback)(journal_t *, transaction_t *); -- cgit v1.2.3 From c31910672376dfb8d020e32afa7249763bcd924a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 6 Jan 2009 11:14:25 -0500 Subject: ext4: Remove code to create the journal inode This code has been obsolete in quite some time, since the supported method for adding a journal inode is to use tune2fs (or to creating new filesystem with a journal via mke2fs or mkfs.ext4). Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 4 --- fs/ext4/super.c | 68 +++-------------------------------- fs/jbd2/journal.c | 72 -------------------------------------- include/linux/jbd2.h | 1 - 4 files changed, 4 insertions(+), 141 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index e3fcbea3ec8c..9ec29d86ff8b 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -149,10 +149,6 @@ journal_async_commit Commit block can be written to disk without waiting journal=update Update the ext4 file system's journal to the current format. -journal=inum When a journal already exists, this option is ignored. - Otherwise, it specifies the number of the inode which - will represent the ext4 file system's journal file. - journal_dev=devnum When the external journal device's major/minor numbers have changed, this option allows the user to specify the new journal location. The journal device is diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e5ab520724da..8036392b2121 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -51,8 +51,6 @@ struct proc_dir_entry *ext4_proc_root; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); -static int ext4_create_journal(struct super_block *, struct ext4_super_block *, - unsigned int); static void ext4_commit_super(struct super_block *sb, struct ext4_super_block *es, int sync); static void ext4_mark_recovery_complete(struct super_block *sb, @@ -1006,7 +1004,7 @@ enum { Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_min_batch_time, Opt_max_batch_time, - Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_journal_update, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, @@ -1048,7 +1046,6 @@ static const match_table_t tokens = { {Opt_min_batch_time, "min_batch_time=%u"}, {Opt_max_batch_time, "max_batch_time=%u"}, {Opt_journal_update, "journal=update"}, - {Opt_journal_inum, "journal=%u"}, {Opt_journal_dev, "journal_dev=%u"}, {Opt_journal_checksum, "journal_checksum"}, {Opt_journal_async_commit, "journal_async_commit"}, @@ -1102,7 +1099,7 @@ static ext4_fsblk_t get_sb_block(void **data) } static int parse_options(char *options, struct super_block *sb, - unsigned int *inum, unsigned long *journal_devnum, + unsigned long *journal_devnum, ext4_fsblk_t *n_blocks_count, int is_remount) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1226,16 +1223,6 @@ static int parse_options(char *options, struct super_block *sb, } set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); break; - case Opt_journal_inum: - if (is_remount) { - printk(KERN_ERR "EXT4-fs: cannot specify " - "journal on remount\n"); - return 0; - } - if (match_int(&args[0], &option)) - return 0; - *inum = option; - break; case Opt_journal_dev: if (is_remount) { printk(KERN_ERR "EXT4-fs: cannot specify " @@ -2035,7 +2022,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_fsblk_t sb_block = get_sb_block(&data); ext4_fsblk_t logical_sb_block; unsigned long offset = 0; - unsigned int journal_inum = 0; unsigned long journal_devnum = 0; unsigned long def_mount_opts; struct inode *root; @@ -2155,8 +2141,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, DELALLOC); - if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum, - NULL, 0)) + if (!parse_options((char *) data, sb, &journal_devnum, NULL, 0)) goto failed_mount; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | @@ -2460,9 +2445,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount4; } } - } else if (journal_inum) { - if (ext4_create_journal(sb, es, journal_inum)) - goto failed_mount3; } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { printk(KERN_ERR "EXT4-fs: required journal recovery " @@ -2926,48 +2908,6 @@ static int ext4_load_journal(struct super_block *sb, return 0; } -static int ext4_create_journal(struct super_block *sb, - struct ext4_super_block *es, - unsigned int journal_inum) -{ - journal_t *journal; - int err; - - if (sb->s_flags & MS_RDONLY) { - printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to " - "create journal.\n"); - return -EROFS; - } - - journal = ext4_get_journal(sb, journal_inum); - if (!journal) - return -EINVAL; - - printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n", - journal_inum); - - err = jbd2_journal_create(journal); - if (err) { - printk(KERN_ERR "EXT4-fs: error creating journal.\n"); - jbd2_journal_destroy(journal); - return -EIO; - } - - EXT4_SB(sb)->s_journal = journal; - - ext4_update_dynamic_rev(sb); - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL); - - es->s_journal_inum = cpu_to_le32(journal_inum); - sb->s_dirt = 1; - - /* Make sure we flush the recovery flag to disk. */ - ext4_commit_super(sb, es, 1); - - return 0; -} - static void ext4_commit_super(struct super_block *sb, struct ext4_super_block *es, int sync) { @@ -3209,7 +3149,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) /* * Allow the "check" option to be passed as a remount option. */ - if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { + if (!parse_options(data, sb, NULL, &n_blocks_count, 1)) { err = -EINVAL; goto restore_opts; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 34ef98057202..b10d7283ba5b 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -66,7 +66,6 @@ EXPORT_SYMBOL(jbd2_journal_update_format); EXPORT_SYMBOL(jbd2_journal_check_used_features); EXPORT_SYMBOL(jbd2_journal_check_available_features); EXPORT_SYMBOL(jbd2_journal_set_features); -EXPORT_SYMBOL(jbd2_journal_create); EXPORT_SYMBOL(jbd2_journal_load); EXPORT_SYMBOL(jbd2_journal_destroy); EXPORT_SYMBOL(jbd2_journal_abort); @@ -1162,77 +1161,6 @@ static int journal_reset(journal_t *journal) return jbd2_journal_start_thread(journal); } -/** - * int jbd2_journal_create() - Initialise the new journal file - * @journal: Journal to create. This structure must have been initialised - * - * Given a journal_t structure which tells us which disk blocks we can - * use, create a new journal superblock and initialise all of the - * journal fields from scratch. - **/ -int jbd2_journal_create(journal_t *journal) -{ - unsigned long long blocknr; - struct buffer_head *bh; - journal_superblock_t *sb; - int i, err; - - if (journal->j_maxlen < JBD2_MIN_JOURNAL_BLOCKS) { - printk (KERN_ERR "Journal length (%d blocks) too short.\n", - journal->j_maxlen); - journal_fail_superblock(journal); - return -EINVAL; - } - - if (journal->j_inode == NULL) { - /* - * We don't know what block to start at! - */ - printk(KERN_EMERG - "%s: creation of journal on external device!\n", - __func__); - BUG(); - } - - /* Zero out the entire journal on disk. We cannot afford to - have any blocks on disk beginning with JBD2_MAGIC_NUMBER. */ - jbd_debug(1, "JBD: Zeroing out journal blocks...\n"); - for (i = 0; i < journal->j_maxlen; i++) { - err = jbd2_journal_bmap(journal, i, &blocknr); - if (err) - return err; - bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - lock_buffer(bh); - memset (bh->b_data, 0, journal->j_blocksize); - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - BUFFER_TRACE(bh, "marking uptodate"); - set_buffer_uptodate(bh); - unlock_buffer(bh); - __brelse(bh); - } - - sync_blockdev(journal->j_dev); - jbd_debug(1, "JBD: journal cleared.\n"); - - /* OK, fill in the initial static fields in the new superblock */ - sb = journal->j_superblock; - - sb->s_header.h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); - sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2); - - sb->s_blocksize = cpu_to_be32(journal->j_blocksize); - sb->s_maxlen = cpu_to_be32(journal->j_maxlen); - sb->s_first = cpu_to_be32(1); - - journal->j_transaction_sequence = 1; - - journal->j_flags &= ~JBD2_ABORT; - journal->j_format_version = 2; - - return journal_reset(journal); -} - /** * void jbd2_journal_update_superblock() - Update journal sb on disk. * @journal: The journal to update. diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 9d82084a1605..adef1c9940d3 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1104,7 +1104,6 @@ extern int jbd2_journal_set_features (journal_t *, unsigned long, unsigned long, unsigned long); extern void jbd2_journal_clear_features (journal_t *, unsigned long, unsigned long, unsigned long); -extern int jbd2_journal_create (journal_t *); extern int jbd2_journal_load (journal_t *journal); extern int jbd2_journal_destroy (journal_t *); extern int jbd2_journal_recover (journal_t *journal); -- cgit v1.2.3 From b3881f74b31b7d47d0f1c4d89ac3e7f0b9c05e3e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 5 Jan 2009 22:46:26 -0500 Subject: ext4: Add mount option to set kjournald's I/O priority Signed-off-by: "Theodore Ts'o" Cc: Jens Axboe --- Documentation/filesystems/ext4.txt | 7 +++++++ fs/ext4/super.c | 29 +++++++++++++++++++++++++---- fs/ioprio.c | 3 ++- include/linux/ioprio.h | 2 ++ 4 files changed, 36 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 9ec29d86ff8b..8938949b201e 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -308,6 +308,13 @@ min_batch_time=usec This parameter sets the commit time (as multi-threaded, synchronous workloads on very fast disks, at the cost of increasing latency. +journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the + highest priorty) which should be used for I/O + operations submitted by kjournald2 during a + commit operation. This defaults to 3, which is + a slightly higher priority than the default I/O + priority. + Data Mode ========= There are 3 different data modes: diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8036392b2121..8ff8709828fd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1013,7 +1013,7 @@ enum { Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, - Opt_inode_readahead_blks + Opt_inode_readahead_blks, Opt_journal_ioprio }; static const match_table_t tokens = { @@ -1074,6 +1074,7 @@ static const match_table_t tokens = { {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, + {Opt_journal_ioprio, "journal_ioprio=%u"}, {Opt_err, NULL}, }; @@ -1098,8 +1099,11 @@ static ext4_fsblk_t get_sb_block(void **data) return sb_block; } +#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) + static int parse_options(char *options, struct super_block *sb, unsigned long *journal_devnum, + unsigned int *journal_ioprio, ext4_fsblk_t *n_blocks_count, int is_remount) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1492,6 +1496,14 @@ set_qf_format: return 0; sbi->s_inode_readahead_blks = option; break; + case Opt_journal_ioprio: + if (match_int(&args[0], &option)) + return 0; + if (option < 0 || option > 7) + break; + *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, + option); + break; default: printk(KERN_ERR "EXT4-fs: Unrecognized mount option \"%s\" " @@ -2035,6 +2047,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) int features; __u64 blocks_count; int err; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) @@ -2141,7 +2154,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, DELALLOC); - if (!parse_options((char *) data, sb, &journal_devnum, NULL, 0)) + if (!parse_options((char *) data, sb, &journal_devnum, + &journal_ioprio, NULL, 0)) goto failed_mount; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | @@ -2506,6 +2520,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) default: break; } + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); no_journal: @@ -3127,6 +3142,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) unsigned long old_sb_flags; struct ext4_mount_options old_opts; ext4_group_t g; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; int err; #ifdef CONFIG_QUOTA int i; @@ -3145,11 +3161,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) for (i = 0; i < MAXQUOTAS; i++) old_opts.s_qf_names[i] = sbi->s_qf_names[i]; #endif + if (sbi->s_journal && sbi->s_journal->j_task->io_context) + journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; /* * Allow the "check" option to be passed as a remount option. */ - if (!parse_options(data, sb, NULL, &n_blocks_count, 1)) { + if (!parse_options(data, sb, NULL, &journal_ioprio, + &n_blocks_count, 1)) { err = -EINVAL; goto restore_opts; } @@ -3162,8 +3181,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) es = sbi->s_es; - if (sbi->s_journal) + if (sbi->s_journal) { ext4_init_journal_params(sb, sbi->s_journal); + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + } if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || n_blocks_count > ext4_blocks_count(es)) { diff --git a/fs/ioprio.c b/fs/ioprio.c index 3569e0ad86a2..1a39ac370942 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -27,7 +27,7 @@ #include #include -static int set_task_ioprio(struct task_struct *task, int ioprio) +int set_task_ioprio(struct task_struct *task, int ioprio) { int err; struct io_context *ioc; @@ -70,6 +70,7 @@ static int set_task_ioprio(struct task_struct *task, int ioprio) task_unlock(task); return err; } +EXPORT_SYMBOL_GPL(set_task_ioprio); asmlinkage long sys_ioprio_set(int which, int who, int ioprio) { diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index f98a656b17e5..76dad4808847 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -86,4 +86,6 @@ static inline int task_nice_ioclass(struct task_struct *task) */ extern int ioprio_best(unsigned short aprio, unsigned short bprio); +extern int set_task_ioprio(struct task_struct *task, int ioprio); + #endif -- cgit v1.2.3 From 83982b6f47201c4c7767210d24d7d8c99567a0b3 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 6 Jan 2009 14:53:16 -0500 Subject: ext4: Remove "extents" mount option This mount option is largely superfluous, and in fact the way it was implemented was buggy; if a filesystem which did not have the extents feature flag was mounted -o extents, the filesystem would attempt to create and use extents-based file even though the extents feature flag was not eabled. The simplest thing to do is to nuke the mount option entirely. It's not all that useful to force the non-creation of new extent-based files if the filesystem can support it. Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 5 ---- fs/ext4/ext4.h | 1 - fs/ext4/ext4_jbd2.h | 4 ++-- fs/ext4/extents.c | 4 ++-- fs/ext4/ialloc.c | 2 +- fs/ext4/migrate.c | 14 +++++------ fs/ext4/super.c | 48 ++------------------------------------ 7 files changed, 14 insertions(+), 64 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 8938949b201e..cec829bc7291 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -131,11 +131,6 @@ ro Mount filesystem read only. Note that ext4 will mount options "ro,noload" can be used to prevent writes to the filesystem. -extents (*) ext4 will use extents to address file data. The - file system will no longer be mountable by ext3. - -noextents ext4 will not use extents for newly created files - journal_checksum Enable checksumming of the journal transactions. This will allow the recovery code in e2fsck and the kernel to detect corruption in the kernel. It is a diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 695b45cc34e7..db1718833f58 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -536,7 +536,6 @@ do { \ #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ -#define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 663197adae56..be2f426f6805 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -32,8 +32,8 @@ * 5 levels of tree + root which are stored in the inode. */ #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ - (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ - || test_opt(sb, EXTENTS) ? 27U : 8U) + (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ + ? 27U : 8U) /* Extended attribute operations touch at most two data buffers, * two bitmap buffers, and two group summaries, in addition to the inode diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c64080e49493..240cf0daad4b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2247,7 +2247,7 @@ void ext4_ext_init(struct super_block *sb) * possible initialization would be here */ - if (test_opt(sb, EXTENTS)) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { printk(KERN_INFO "EXT4-fs: file extents enabled"); #ifdef AGGRESSIVE_TEST printk(", aggressive tests"); @@ -2272,7 +2272,7 @@ void ext4_ext_init(struct super_block *sb) */ void ext4_ext_release(struct super_block *sb) { - if (!test_opt(sb, EXTENTS)) + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) return; #ifdef EXTENTS_STATS diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 369c34c64292..4fb86a0061d0 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -917,7 +917,7 @@ got: if (err) goto fail_free_drop; - if (test_opt(sb, EXTENTS)) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { /* set extent flag only for directory, file and normal symlink*/ if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index e7cd488da4bb..734abca25e35 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -459,13 +459,13 @@ int ext4_ext_migrate(struct inode *inode) struct list_blocks_struct lb; unsigned long max_entries; - if (!test_opt(inode->i_sb, EXTENTS)) - /* - * if mounted with noextents we don't allow the migrate - */ - return -EINVAL; - - if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + /* + * If the filesystem does not support extents, or the inode + * already is extent-based, error out. + */ + if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EXTENTS) || + (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) return -EINVAL; if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b69d09203865..acb69c00fd42 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -829,8 +829,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",journal_async_commit"); if (test_opt(sb, NOBH)) seq_puts(seq, ",nobh"); - if (!test_opt(sb, EXTENTS)) - seq_puts(seq, ",noextents"); if (test_opt(sb, I_VERSION)) seq_puts(seq, ",i_version"); if (!test_opt(sb, DELALLOC)) @@ -1011,7 +1009,7 @@ enum { Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, - Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, + Opt_grpquota, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_inode_readahead_blks, Opt_journal_ioprio }; @@ -1066,8 +1064,6 @@ static const match_table_t tokens = { {Opt_quota, "quota"}, {Opt_usrquota, "usrquota"}, {Opt_barrier, "barrier=%u"}, - {Opt_extents, "extents"}, - {Opt_noextents, "noextents"}, {Opt_i_version, "i_version"}, {Opt_stripe, "stripe=%u"}, {Opt_resize, "resize"}, @@ -1115,7 +1111,6 @@ static int parse_options(char *options, struct super_block *sb, int qtype, qfmt; char *qname; #endif - ext4_fsblk_t last_block; if (!options) return 1; @@ -1445,33 +1440,6 @@ set_qf_format: case Opt_bh: clear_opt(sbi->s_mount_opt, NOBH); break; - case Opt_extents: - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_EXTENTS)) { - ext4_warning(sb, __func__, - "extents feature not enabled " - "on this filesystem, use tune2fs"); - return 0; - } - set_opt(sbi->s_mount_opt, EXTENTS); - break; - case Opt_noextents: - /* - * When e2fsprogs support resizing an already existing - * ext3 file system to greater than 2**32 we need to - * add support to block allocator to handle growing - * already existing block mapped inode so that blocks - * allocated for them fall within 2**32 - */ - last_block = ext4_blocks_count(sbi->s_es) - 1; - if (last_block > 0xffffffffULL) { - printk(KERN_ERR "EXT4-fs: Filesystem too " - "large to mount with " - "-o noextents options\n"); - return 0; - } - clear_opt(sbi->s_mount_opt, EXTENTS); - break; case Opt_i_version: set_opt(sbi->s_mount_opt, I_VERSION); sb->s_flags |= MS_I_VERSION; @@ -2135,18 +2103,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, RESERVATION); set_opt(sbi->s_mount_opt, BARRIER); - /* - * turn on extents feature by default in ext4 filesystem - * only if feature flag already set by mkfs or tune2fs. - * Use -o noextents to turn it off - */ - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) - set_opt(sbi->s_mount_opt, EXTENTS); - else - ext4_warning(sb, __func__, - "extents feature not enabled on this filesystem, " - "use tune2fs."); - /* * enable delayed allocation by default * Use -o nodelalloc to turn it off @@ -3825,7 +3781,7 @@ static void __exit exit_ext4_fs(void) } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); -MODULE_DESCRIPTION("Fourth Extended Filesystem with extents"); +MODULE_DESCRIPTION("Fourth Extended Filesystem"); MODULE_LICENSE("GPL"); module_init(init_ext4_fs) module_exit(exit_ext4_fs) -- cgit v1.2.3