From 67638e4043083cdc6f10386a75fef87ba46eecb3 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sat, 1 Aug 2009 21:30:31 +0900 Subject: fat/nls: Fix handling of utf8 invalid char With utf8 option, vfat allowed the duplicated filenames. Normal nls returns -EINVAL for invalid char. But utf8s_to_utf16s() skipped the invalid char historically. So, this changes the utf8s_to_utf16s() directly to return -EINVAL for invalid char, because vfat is only user of it. mkdir /mnt/fatfs FILENAME=`echo -ne "invalidutf8char_\\0341_endofchar"` echo "Using filename: $FILENAME" dd if=/dev/zero of=fatfs bs=512 count=128 mkdosfs -F 32 fatfs mount -o loop,utf8 fatfs /mnt/fatfs touch "/mnt/fatfs/$FILENAME" umount /mnt/fatfs mount -o loop,utf8 fatfs /mnt/fatfs touch "/mnt/fatfs/$FILENAME" ls -l /mnt/fatfs umount /mnt/fatfs ---- And the output is: Using filename: invalidutf8char_\0341_endofchar 128+0 records in 128+0 records out 65536 bytes (66 kB) copied, 0.000388118 s, 169 MB/s mkdosfs 2.11 (12 Mar 2005) total 0 -rwxr-xr-x 1 root root 0 Jun 28 19:46 invalidutf8char__endofchar -rwxr-xr-x 1 root root 0 Jun 28 19:46 invalidutf8char__endofchar Tested-by: Marton Balint Signed-off-by: OGAWA Hirofumi --- fs/fat/namei_vfat.c | 15 ++++----------- fs/nls/nls_base.c | 8 ++++---- 2 files changed, 8 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index cb6e83557112..f565f24019b5 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -499,17 +499,10 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, int charlen; if (utf8) { - int name_len = strlen(name); - - *outlen = utf8s_to_utf16s(name, PATH_MAX, (wchar_t *) outname); - - /* - * We stripped '.'s before and set len appropriately, - * but utf8s_to_utf16s doesn't care about len - */ - *outlen -= (name_len - len); - - if (*outlen > 255) + *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname); + if (*outlen < 0) + return *outlen; + else if (*outlen > 255) return -ENAMETOOLONG; op = &outname[*outlen * sizeof(wchar_t)]; diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 477d37d83b31..b25c218671b3 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -124,10 +124,10 @@ int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs) while (*s && len > 0) { if (*s & 0x80) { size = utf8_to_utf32(s, len, &u); - if (size < 0) { - /* Ignore character and move on */ - size = 1; - } else if (u >= PLANE_SIZE) { + if (size < 0) + return -EINVAL; + + if (u >= PLANE_SIZE) { u -= PLANE_SIZE; *op++ = (wchar_t) (SURROGATE_PAIR | ((u >> 10) & SURROGATE_BITS)); -- cgit v1.2.3 From 955234755ce4a2c33cfc558912aa8f2148cc1fc6 Mon Sep 17 00:00:00 2001 From: Paul Wise Date: Sat, 1 Aug 2009 21:30:31 +0900 Subject: vfat: change the default from shortname=lower to shortname=mixed Because, with "shortname=lower", copying one FAT filesystem tree to another FAT filesystem tree using Linux results in semantically different filesystems. (E.g.: Filenames which were once "all uppercase" are now "all lowercase"). So, this changes the default of "shortname=lower" to "shortname=mixed". Signed-off-by: Paul Wise [change fat_show_options()] Signed-off-by: OGAWA Hirofumi --- Documentation/filesystems/vfat.txt | 2 +- fs/fat/inode.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt index b58b84b50fa2..eed520fd0c8e 100644 --- a/Documentation/filesystems/vfat.txt +++ b/Documentation/filesystems/vfat.txt @@ -102,7 +102,7 @@ shortname=lower|win95|winnt|mixed winnt: emulate the Windows NT rule for display/create. mixed: emulate the Windows NT rule for display, emulate the Windows 95 rule for create. - Default setting is `lower'. + Default setting is `mixed'. tz=UTC -- Interpret timestamps as UTC rather than local time. This option disables the conversion of timestamps diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 8970d8c49bb0..63a5c1a4ee60 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -820,7 +820,7 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt) seq_puts(m, ",shortname=mixed"); break; case VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95: - /* seq_puts(m, ",shortname=lower"); */ + seq_puts(m, ",shortname=lower"); break; default: seq_puts(m, ",shortname=unknown"); @@ -971,7 +971,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, opts->codepage = fat_default_codepage; opts->iocharset = fat_default_iocharset; if (is_vfat) { - opts->shortname = VFAT_SFN_DISPLAY_LOWER|VFAT_SFN_CREATE_WIN95; + opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95; opts->rodir = 0; } else { opts->shortname = 0; -- cgit v1.2.3 From ed248b290da7297c9b9a3ff180f5eee4db016224 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 20 Sep 2009 01:31:58 +0900 Subject: fat: Check s_dirt in fat_sync_fs() If we didn't check sb->s_dirt, it will update the FSINFO unconditionally. It will reduce the filetime of flash base device. So, this checks sb->s_dirt. sb->s_dirt is racy, however FSINFO is just hint. So even if there is race, and we hit it, it would not become big problem. And this also is as workaround of suspend problem. Signed-off-by: OGAWA Hirofumi --- fs/fat/fat.h | 2 +- fs/fat/inode.c | 14 +++++++++----- fs/fat/misc.c | 8 +++++--- 3 files changed, 15 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/fat/fat.h b/fs/fat/fat.h index adb0e72a176d..7db0979c6b72 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -323,7 +323,7 @@ extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, /* fat/misc.c */ extern void fat_fs_error(struct super_block *s, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))) __cold; -extern void fat_clusters_flush(struct super_block *sb); +extern int fat_clusters_flush(struct super_block *sb); extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, __le16 __time, __le16 __date, u8 time_cs); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 63a5c1a4ee60..a8a3afec8758 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -451,12 +451,16 @@ static void fat_write_super(struct super_block *sb) static int fat_sync_fs(struct super_block *sb, int wait) { - lock_super(sb); - fat_clusters_flush(sb); - sb->s_dirt = 0; - unlock_super(sb); + int err = 0; - return 0; + if (sb->s_dirt) { + lock_super(sb); + sb->s_dirt = 0; + err = fat_clusters_flush(sb); + unlock_super(sb); + } + + return err; } static void fat_put_super(struct super_block *sb) diff --git a/fs/fat/misc.c b/fs/fat/misc.c index a6c20473dfd7..63785a150290 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -43,19 +43,19 @@ EXPORT_SYMBOL_GPL(fat_fs_error); /* Flushes the number of free clusters on FAT32 */ /* XXX: Need to write one per FSINFO block. Currently only writes 1 */ -void fat_clusters_flush(struct super_block *sb) +int fat_clusters_flush(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh; struct fat_boot_fsinfo *fsinfo; if (sbi->fat_bits != 32) - return; + return 0; bh = sb_bread(sb, sbi->fsinfo_sector); if (bh == NULL) { printk(KERN_ERR "FAT: bread failed in fat_clusters_flush\n"); - return; + return -EIO; } fsinfo = (struct fat_boot_fsinfo *)bh->b_data; @@ -74,6 +74,8 @@ void fat_clusters_flush(struct super_block *sb) mark_buffer_dirty(bh); } brelse(bh); + + return 0; } /* -- cgit v1.2.3 From 1693918e0b6988cf5eb93b7da34f30e94360a379 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 26 Sep 2009 17:43:59 -0400 Subject: ext4: Use ext4_msg() for ext4_da_writepage() errors This allows the user to see what filesystem was involved with a particular ext4_da_writepage() error. Also, use KERN_CRIT which is more appropriate than KERN_EMERG. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 50 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 064746fad581..5fb72a98ccbe 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2092,18 +2092,18 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, static void ext4_print_free_blocks(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - printk(KERN_EMERG "Total free blocks count %lld\n", - ext4_count_free_blocks(inode->i_sb)); - printk(KERN_EMERG "Free/Dirty block details\n"); - printk(KERN_EMERG "free_blocks=%lld\n", - (long long)percpu_counter_sum(&sbi->s_freeblocks_counter)); - printk(KERN_EMERG "dirty_blocks=%lld\n", - (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter)); - printk(KERN_EMERG "Block reservation details\n"); - printk(KERN_EMERG "i_reserved_data_blocks=%u\n", - EXT4_I(inode)->i_reserved_data_blocks); - printk(KERN_EMERG "i_reserved_meta_blocks=%u\n", - EXT4_I(inode)->i_reserved_meta_blocks); + printk(KERN_CRIT "Total free blocks count %lld\n", + ext4_count_free_blocks(inode->i_sb)); + printk(KERN_CRIT "Free/Dirty block details\n"); + printk(KERN_CRIT "free_blocks=%lld\n", + (long long) percpu_counter_sum(&sbi->s_freeblocks_counter)); + printk(KERN_CRIT "dirty_blocks=%lld\n", + (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); + printk(KERN_CRIT "Block reservation details\n"); + printk(KERN_CRIT "i_reserved_data_blocks=%u\n", + EXT4_I(inode)->i_reserved_data_blocks); + printk(KERN_CRIT "i_reserved_meta_blocks=%u\n", + EXT4_I(inode)->i_reserved_meta_blocks); return; } @@ -2189,14 +2189,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * writepage and writepages will again try to write * the same. */ - printk(KERN_EMERG "%s block allocation failed for inode %lu " - "at logical offset %llu with max blocks " - "%zd with error %d\n", - __func__, mpd->inode->i_ino, - (unsigned long long)next, - mpd->b_size >> mpd->inode->i_blkbits, err); - printk(KERN_EMERG "This should not happen.!! " - "Data will be lost\n"); + ext4_msg(mpd->inode->i_sb, KERN_CRIT, + "delayed block allocation failed for inode %lu at " + "logical offset %llu with max blocks %zd with " + "error %d\n", mpd->inode->i_ino, + (unsigned long long) next, + mpd->b_size >> mpd->inode->i_blkbits, err); + printk(KERN_CRIT "This should not happen!! " + "Data will be lost\n"); if (err == -ENOSPC) { ext4_print_free_blocks(mpd->inode); } @@ -2822,10 +2822,9 @@ retry: handle = ext4_journal_start(inode, needed_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - printk(KERN_CRIT "%s: jbd2_start: " + ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " "%ld pages, ino %lu; err %d\n", __func__, wbc->nr_to_write, inode->i_ino, ret); - dump_stack(); goto out_writepages; } @@ -2897,9 +2896,10 @@ retry: goto retry; } if (pages_skipped != wbc->pages_skipped) - printk(KERN_EMERG "This should not happen leaving %s " - "with nr_to_write = %ld ret = %d\n", - __func__, wbc->nr_to_write, ret); + ext4_msg(inode->i_sb, KERN_CRIT, + "This should not happen leaving %s " + "with nr_to_write = %ld ret = %d\n", + __func__, wbc->nr_to_write, ret); /* Update index */ index += pages_written; -- cgit v1.2.3 From 71780577306fd1e76c7a92e3b308db624d03adb9 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 28 Sep 2009 00:06:20 -0400 Subject: ext4: Fix hueristic which avoids group preallocation for closed files The hueristic was designed to avoid using locality group preallocation when writing the last segment of a closed file. Fix it by move setting size to the maximum of size and isize until after we check whether size == isize. Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e9c61896d605..c73d43995b13 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4189,7 +4189,6 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; - size = max(size, isize); if ((size == isize) && !ext4_fs_is_busy(sbi) && @@ -4199,6 +4198,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) } /* don't use group allocation for large files */ + size = max(size, isize); if (size >= sbi->s_mb_stream_request) { ac->ac_flags |= EXT4_MB_STREAM_ALLOC; return; -- cgit v1.2.3 From 55138e0bc29c0751e2152df9ad35deea542f29b3 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 29 Sep 2009 13:31:31 -0400 Subject: ext4: Adjust ext4_da_writepages() to write out larger contiguous chunks Work around problems in the writeback code to force out writebacks in larger chunks than just 4mb, which is just too small. This also works around limitations in the ext4 block allocator, which can't allocate more than 2048 blocks at a time. So we need to defeat the round-robin characteristics of the writeback code and try to write out as many blocks in one inode before allowing the writeback code to move on to another inode. We add a a new per-filesystem tunable, max_writeback_mb_bump, which caps this to a default of 128mb per inode. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 1 + fs/ext4/inode.c | 105 +++++++++++++++++++++++++++++++++++++++----- fs/ext4/super.c | 3 ++ include/trace/events/ext4.h | 14 ++++-- 4 files changed, 107 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e227eea23f05..a58438e18d0b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -942,6 +942,7 @@ struct ext4_sb_info { unsigned int s_mb_stats; unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; + unsigned int s_max_writeback_mb_bump; /* where last allocation was done - for stream allocation */ unsigned long s_mb_last_group; unsigned long s_mb_last_start; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5fb72a98ccbe..20e2d704dc2e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1144,6 +1144,64 @@ static int check_block_validity(struct inode *inode, const char *msg, return 0; } +/* + * Return the number of dirty pages in the given inode starting at + * page frame idx. + */ +static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, + unsigned int max_pages) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t index; + struct pagevec pvec; + pgoff_t num = 0; + int i, nr_pages, done = 0; + + if (max_pages == 0) + return 0; + pagevec_init(&pvec, 0); + while (!done) { + index = idx; + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + (pgoff_t)PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + lock_page(page); + if (unlikely(page->mapping != mapping) || + !PageDirty(page) || + PageWriteback(page) || + page->index != idx) { + done = 1; + unlock_page(page); + break; + } + head = page_buffers(page); + bh = head; + do { + if (!buffer_delay(bh) && + !buffer_unwritten(bh)) { + done = 1; + break; + } + } while ((bh = bh->b_this_page) != head); + unlock_page(page); + if (done) + break; + idx++; + num++; + if (num >= max_pages) + break; + } + pagevec_release(&pvec); + } + return num; +} + /* * The ext4_get_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -2743,8 +2801,10 @@ static int ext4_da_writepages(struct address_space *mapping, int no_nrwrite_index_update; int pages_written = 0; long pages_skipped; + unsigned int max_pages; int range_cyclic, cycled = 1, io_done = 0; - int needed_blocks, ret = 0, nr_to_writebump = 0; + int needed_blocks, ret = 0; + long desired_nr_to_write, nr_to_writebump = 0; loff_t range_start = wbc->range_start; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); @@ -2771,16 +2831,6 @@ static int ext4_da_writepages(struct address_space *mapping, if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) return -EROFS; - /* - * Make sure nr_to_write is >= sbi->s_mb_stream_request - * This make sure small files blocks are allocated in - * single attempt. This ensure that small files - * get less fragmented. - */ - if (wbc->nr_to_write < sbi->s_mb_stream_request) { - nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; - wbc->nr_to_write = sbi->s_mb_stream_request; - } if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; @@ -2795,6 +2845,36 @@ static int ext4_da_writepages(struct address_space *mapping, } else index = wbc->range_start >> PAGE_CACHE_SHIFT; + /* + * This works around two forms of stupidity. The first is in + * the writeback code, which caps the maximum number of pages + * written to be 1024 pages. This is wrong on multiple + * levels; different architectues have a different page size, + * which changes the maximum amount of data which gets + * written. Secondly, 4 megabytes is way too small. XFS + * forces this value to be 16 megabytes by multiplying + * nr_to_write parameter by four, and then relies on its + * allocator to allocate larger extents to make them + * contiguous. Unfortunately this brings us to the second + * stupidity, which is that ext4's mballoc code only allocates + * at most 2048 blocks. So we force contiguous writes up to + * the number of dirty blocks in the inode, or + * sbi->max_writeback_mb_bump whichever is smaller. + */ + max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); + if (!range_cyclic && range_whole) + desired_nr_to_write = wbc->nr_to_write * 8; + else + desired_nr_to_write = ext4_num_dirty_pages(inode, index, + max_pages); + if (desired_nr_to_write > max_pages) + desired_nr_to_write = max_pages; + + if (wbc->nr_to_write < desired_nr_to_write) { + nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; + wbc->nr_to_write = desired_nr_to_write; + } + mpd.wbc = wbc; mpd.inode = mapping->host; @@ -2914,7 +2994,8 @@ retry: out_writepages: if (!no_nrwrite_index_update) wbc->no_nrwrite_index_update = 0; - wbc->nr_to_write -= nr_to_writebump; + if (wbc->nr_to_write > nr_to_writebump) + wbc->nr_to_write -= nr_to_writebump; wbc->range_start = range_start; trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); return ret; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index df539ba27779..16817737ba52 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2197,6 +2197,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); +EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); static struct attribute *ext4_attrs[] = { ATTR_LIST(delayed_allocation_blocks), @@ -2210,6 +2211,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_order2_req), ATTR_LIST(mb_stream_req), ATTR_LIST(mb_group_prealloc), + ATTR_LIST(max_writeback_mb_bump), NULL, }; @@ -2679,6 +2681,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sbi->s_stripe = ext4_get_stripe_size(sbi); + sbi->s_max_writeback_mb_bump = 128; /* * set up enough so that it can read an inode diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index c1bd8f1e8b94..7c6bbb7198a3 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -236,6 +236,7 @@ TRACE_EVENT(ext4_da_writepages, __field( char, for_kupdate ) __field( char, for_reclaim ) __field( char, range_cyclic ) + __field( pgoff_t, writeback_index ) ), TP_fast_assign( @@ -249,15 +250,17 @@ TRACE_EVENT(ext4_da_writepages, __entry->for_kupdate = wbc->for_kupdate; __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; + __entry->writeback_index = inode->i_mapping->writeback_index; ), - TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d", + TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d writeback_index %lu", jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, __entry->range_end, __entry->nonblocking, __entry->for_kupdate, __entry->for_reclaim, - __entry->range_cyclic) + __entry->range_cyclic, + (unsigned long) __entry->writeback_index) ); TRACE_EVENT(ext4_da_write_pages, @@ -309,6 +312,7 @@ TRACE_EVENT(ext4_da_writepages_result, __field( char, encountered_congestion ) __field( char, more_io ) __field( char, no_nrwrite_index_update ) + __field( pgoff_t, writeback_index ) ), TP_fast_assign( @@ -320,14 +324,16 @@ TRACE_EVENT(ext4_da_writepages_result, __entry->encountered_congestion = wbc->encountered_congestion; __entry->more_io = wbc->more_io; __entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update; + __entry->writeback_index = inode->i_mapping->writeback_index; ), - TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d", + TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d writeback_index %lu", jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, __entry->ret, __entry->pages_written, __entry->pages_skipped, __entry->encountered_congestion, __entry->more_io, - __entry->no_nrwrite_index_update) + __entry->no_nrwrite_index_update, + (unsigned long) __entry->writeback_index) ); TRACE_EVENT(ext4_da_write_begin, -- cgit v1.2.3 From 9f0ccfd8e07d61b413e6536ffa02fbf60d2e20d8 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Mon, 28 Sep 2009 15:49:52 -0400 Subject: ext4: release reserved quota when block reservation for delalloc retry ext4_da_reserve_space() can reserve quota blocks multiple times if ext4_claim_free_blocks() fail and we retry the allocation. We should release the quota reservation before restarting. Bug found by Jan Kara. Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 20e2d704dc2e..219067ce09d9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1854,11 +1854,11 @@ repeat: if (ext4_claim_free_blocks(sbi, total)) { spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + vfs_dq_release_reservation_block(inode, total); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); goto repeat; } - vfs_dq_release_reservation_block(inode, total); return -ENOSPC; } EXT4_I(inode)->i_reserved_data_blocks += nrblocks; -- cgit v1.2.3 From 0031462b5b392f90d17f1d75abb795883c44e969 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Mon, 28 Sep 2009 15:49:08 -0400 Subject: ext4: Split uninitialized extents for direct I/O When writing into an unitialized extent via direct I/O, and the direct I/O doesn't exactly cover the unitialized extent, split the extent into uninitialized and initialized extents before submitting the I/O. This avoids needing to deal with an ENOSPC error in the end_io callback that gets used for direct I/O. When the IO is complete, the written extent will be marked as initialized. Singed-Off-By: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 22 ++- fs/ext4/ext4_extents.h | 7 +- fs/ext4/extents.c | 423 ++++++++++++++++++++++++++++++++++++++++++++----- fs/ext4/inode.c | 3 + fs/ext4/migrate.c | 2 +- fs/ext4/move_extent.c | 4 +- 6 files changed, 419 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a58438e18d0b..2b4293aac162 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -128,6 +128,15 @@ struct mpage_da_data { int retval; }; +typedef struct ext4_io_end { + struct inode *inode; /* file being written to */ + unsigned int flag; /* sync IO or AIO */ + int error; /* I/O error code */ + ext4_lblk_t offset; /* offset in the file */ + size_t size; /* size of the extent */ + struct work_struct work; /* data work queue */ +} ext4_io_end_t; + /* * Special inodes numbers */ @@ -347,7 +356,16 @@ struct ext4_new_group_data { /* Call ext4_da_update_reserve_space() after successfully allocating the blocks */ #define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008 - + /* caller is from the direct IO path, request to creation of an + unitialized extents if not allocated, split the uninitialized + extent if blocks has been preallocated already*/ +#define EXT4_GET_BLOCKS_DIO 0x0010 +#define EXT4_GET_BLOCKS_CONVERT 0x0020 +#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\ + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) + /* Convert extent to initialized after direct IO complete */ +#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ + EXT4_GET_BLOCKS_DIO_CREATE_EXT) /* * ioctl commands @@ -1700,6 +1718,8 @@ extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len); +extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, + loff_t len); extern int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, unsigned int max_blocks, struct buffer_head *bh, int flags); diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 61652f1d15e6..2ca686454e87 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -220,6 +220,11 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); } +static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) +{ + ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); +} + extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); @@ -235,7 +240,7 @@ extern int ext4_ext_try_to_merge(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *); extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); -extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t, ext_prepare_callback, void *); extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7a3832577923..a38e651c004e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -723,7 +723,7 @@ err: * insert new index [@logical;@ptr] into the block at @curp; * check where to insert: before @curp or after @curp */ -static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, +int ext4_ext_insert_index(handle_t *handle, struct inode *inode, struct ext4_ext_path *curp, int logical, ext4_fsblk_t ptr) { @@ -1586,7 +1586,7 @@ out: */ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent *newext) + struct ext4_extent *newext, int flag) { struct ext4_extent_header *eh; struct ext4_extent *ex, *fex; @@ -1602,7 +1602,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, BUG_ON(path[depth].p_hdr == NULL); /* try to insert block into found extent and return */ - if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { + if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) + && ext4_can_extents_be_merged(inode, ex, newext)) { ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), @@ -1722,7 +1723,8 @@ has_space: merge: /* try to merge extents to the right */ - ext4_ext_try_to_merge(inode, path, nearex); + if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) + ext4_ext_try_to_merge(inode, path, nearex); /* try to merge extents to the left */ @@ -2490,7 +2492,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) } #define EXT4_EXT_ZERO_LEN 7 - /* * This function is called by ext4_ext_get_blocks() if someone tries to write * to an uninitialized extent. It may result in splitting the uninitialized @@ -2583,7 +2584,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex3->ee_block = cpu_to_le32(iblock); ext4_ext_store_pblock(ex3, newblock); ex3->ee_len = cpu_to_le16(allocated); - err = ext4_ext_insert_extent(handle, inode, path, ex3); + err = ext4_ext_insert_extent(handle, inode, path, + ex3, 0); if (err == -ENOSPC) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) @@ -2639,7 +2641,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ext4_ext_store_pblock(ex3, newblock + max_blocks); ex3->ee_len = cpu_to_le16(allocated - max_blocks); ext4_ext_mark_uninitialized(ex3); - err = ext4_ext_insert_extent(handle, inode, path, ex3); + err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); if (err == -ENOSPC) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) @@ -2757,7 +2759,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = ext4_ext_dirty(handle, inode, path + depth); goto out; insert: - err = ext4_ext_insert_extent(handle, inode, path, &newex); + err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); if (err == -ENOSPC) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) @@ -2784,6 +2786,320 @@ fix_extent_len: return err; } +/* + * This function is called by ext4_ext_get_blocks() from + * ext4_get_blocks_dio_write() when DIO to write + * to an uninitialized extent. + * + * Writing to an uninitized extent may result in splitting the uninitialized + * extent into multiple /intialized unintialized extents (up to three) + * There are three possibilities: + * a> There is no split required: Entire extent should be uninitialized + * b> Splits in two extents: Write is happening at either end of the extent + * c> Splits in three extents: Somone is writing in middle of the extent + * + * One of more index blocks maybe needed if the extent tree grow after + * the unintialized extent split. To prevent ENOSPC occur at the IO + * complete, we need to split the uninitialized extent before DIO submit + * the IO. The uninitilized extent called at this time will be split + * into three uninitialized extent(at most). After IO complete, the part + * being filled will be convert to initialized by the end_io callback function + * via ext4_convert_unwritten_extents(). + */ +static int ext4_split_unwritten_extents(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t iblock, + unsigned int max_blocks, + int flags) +{ + struct ext4_extent *ex, newex, orig_ex; + struct ext4_extent *ex1 = NULL; + struct ext4_extent *ex2 = NULL; + struct ext4_extent *ex3 = NULL; + struct ext4_extent_header *eh; + ext4_lblk_t ee_block; + unsigned int allocated, ee_len, depth; + ext4_fsblk_t newblock; + int err = 0; + int ret = 0; + + ext_debug("ext4_split_unwritten_extents: inode %lu," + "iblock %llu, max_blocks %u\n", inode->i_ino, + (unsigned long long)iblock, max_blocks); + depth = ext_depth(inode); + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + allocated = ee_len - (iblock - ee_block); + newblock = iblock - ee_block + ext_pblock(ex); + ex2 = ex; + orig_ex.ee_block = ex->ee_block; + orig_ex.ee_len = cpu_to_le16(ee_len); + ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); + + /* + * if the entire unintialized extent length less than + * the size of extent to write, there is no need to split + * uninitialized extent + */ + if (allocated <= max_blocks) + return ret; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* ex1: ee_block to iblock - 1 : uninitialized */ + if (iblock > ee_block) { + ex1 = ex; + ex1->ee_len = cpu_to_le16(iblock - ee_block); + ext4_ext_mark_uninitialized(ex1); + ex2 = &newex; + } + /* + * for sanity, update the length of the ex2 extent before + * we insert ex3, if ex1 is NULL. This is to avoid temporary + * overlap of blocks. + */ + if (!ex1 && allocated > max_blocks) + ex2->ee_len = cpu_to_le16(max_blocks); + /* ex3: to ee_block + ee_len : uninitialised */ + if (allocated > max_blocks) { + unsigned int newdepth; + ex3 = &newex; + ex3->ee_block = cpu_to_le32(iblock + max_blocks); + ext4_ext_store_pblock(ex3, newblock + max_blocks); + ex3->ee_len = cpu_to_le16(allocated - max_blocks); + ext4_ext_mark_uninitialized(ex3); + err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); + if (err == -ENOSPC) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zeroed the full extent */ + /* blocks available from iblock */ + return allocated; + + } else if (err) + goto fix_extent_len; + /* + * The depth, and hence eh & ex might change + * as part of the insert above. + */ + newdepth = ext_depth(inode); + /* + * update the extent length after successful insert of the + * split extent + */ + orig_ex.ee_len = cpu_to_le16(ee_len - + ext4_ext_get_actual_len(ex3)); + depth = newdepth; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, iblock, path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + if (ex2 != &newex) + ex2 = ex; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + allocated = max_blocks; + } + /* + * If there was a change of depth as part of the + * insertion of ex3 above, we need to update the length + * of the ex1 extent again here + */ + if (ex1 && ex1 != ex) { + ex1 = ex; + ex1->ee_len = cpu_to_le16(iblock - ee_block); + ext4_ext_mark_uninitialized(ex1); + ex2 = &newex; + } + /* + * ex2: iblock to iblock + maxblocks-1 : to be direct IO written, + * uninitialised still. + */ + ex2->ee_block = cpu_to_le32(iblock); + ext4_ext_store_pblock(ex2, newblock); + ex2->ee_len = cpu_to_le16(allocated); + ext4_ext_mark_uninitialized(ex2); + if (ex2 != ex) + goto insert; + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + depth); + ext_debug("out here\n"); + goto out; +insert: + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (err == -ENOSPC) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zero out the first half */ + return allocated; + } else if (err) + goto fix_extent_len; +out: + ext4_ext_show_leaf(inode, path); + return err ? err : allocated; + +fix_extent_len: + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_mark_uninitialized(ex); + ext4_ext_dirty(handle, inode, path + depth); + return err; +} +static int ext4_convert_unwritten_extents_dio(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path) +{ + struct ext4_extent *ex; + struct ext4_extent_header *eh; + int depth; + int err = 0; + int ret = 0; + + depth = ext_depth(inode); + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* first mark the extent as initialized */ + ext4_ext_mark_initialized(ex); + + /* + * We have to see if it can be merged with the extent + * on the left. + */ + if (ex > EXT_FIRST_EXTENT(eh)) { + /* + * To merge left, pass "ex - 1" to try_to_merge(), + * since it merges towards right _only_. + */ + ret = ext4_ext_try_to_merge(inode, path, ex - 1); + if (ret) { + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto out; + depth = ext_depth(inode); + ex--; + } + } + /* + * Try to Merge towards right. + */ + ret = ext4_ext_try_to_merge(inode, path, ex); + if (ret) { + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto out; + depth = ext_depth(inode); + } + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + depth); +out: + ext4_ext_show_leaf(inode, path); + return err; +} + +static int +ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, unsigned int max_blocks, + struct ext4_ext_path *path, int flags, + unsigned int allocated, struct buffer_head *bh_result, + ext4_fsblk_t newblock) +{ + int ret = 0; + int err = 0; + + ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" + "block %llu, max_blocks %u, flags %d, allocated %u", + inode->i_ino, (unsigned long long)iblock, max_blocks, + flags, allocated); + ext4_ext_show_leaf(inode, path); + + /* DIO get_block() before submit the IO, split the extent */ + if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { + ret = ext4_split_unwritten_extents(handle, + inode, path, iblock, + max_blocks, flags); + goto out; + } + /* DIO end_io complete, convert the filled extent to written */ + if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { + ret = ext4_convert_unwritten_extents_dio(handle, inode, + path); + goto out2; + } + /* buffered IO case */ + /* + * repeat fallocate creation request + * we already have an unwritten extent + */ + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) + goto map_out; + + /* buffered READ or buffered write_begin() lookup */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + /* + * We have blocks reserved already. We + * return allocated blocks so that delalloc + * won't do block reservation for us. But + * the buffer head will be unmapped so that + * a read from the block returns 0s. + */ + set_buffer_unwritten(bh_result); + goto out1; + } + + /* buffered write, writepage time, convert*/ + ret = ext4_ext_convert_to_initialized(handle, inode, + path, iblock, + max_blocks); +out: + if (ret <= 0) { + err = ret; + goto out2; + } else + allocated = ret; + set_buffer_new(bh_result); +map_out: + set_buffer_mapped(bh_result); +out1: + if (allocated > max_blocks) + allocated = max_blocks; + ext4_ext_show_leaf(inode, path); + bh_result->b_bdev = inode->i_sb->s_bdev; + bh_result->b_blocknr = newblock; +out2: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + return err ? err : allocated; +} /* * Block allocation/map/preallocation routine for extents based files * @@ -2889,33 +3205,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, EXT4_EXT_CACHE_EXTENT); goto out; } - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) - goto out; - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { - if (allocated > max_blocks) - allocated = max_blocks; - /* - * We have blocks reserved already. We - * return allocated blocks so that delalloc - * won't do block reservation for us. But - * the buffer head will be unmapped so that - * a read from the block returns 0s. - */ - set_buffer_unwritten(bh_result); - bh_result->b_bdev = inode->i_sb->s_bdev; - bh_result->b_blocknr = newblock; - goto out2; - } - - ret = ext4_ext_convert_to_initialized(handle, inode, - path, iblock, - max_blocks); - if (ret <= 0) { - err = ret; - goto out2; - } else - allocated = ret; - goto outnew; + ret = ext4_ext_handle_uninitialized_extents(handle, + inode, iblock, max_blocks, path, + flags, allocated, bh_result, newblock); + return ret; } } @@ -2988,7 +3281,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, newex.ee_len = cpu_to_le16(ar.len); if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */ ext4_ext_mark_uninitialized(&newex); - err = ext4_ext_insert_extent(handle, inode, path, &newex); + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err) { /* free data blocks we just allocated */ /* not a good idea to call discard here directly, @@ -3002,7 +3295,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* previous routine could use block we allocated */ newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); -outnew: set_buffer_new(bh_result); /* Cache only when it is _not_ an uninitialized extent */ @@ -3200,6 +3492,63 @@ retry: return ret > 0 ? ret2 : ret; } +/* + * This function convert a range of blocks to written extents + * The caller of this function will pass the start offset and the size. + * all unwritten extents within this range will be converted to + * written extents. + * + * This function is called from the direct IO end io call back + * function, to convert the fallocated extents after IO is completed. + */ +int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, + loff_t len) +{ + handle_t *handle; + ext4_lblk_t block; + unsigned int max_blocks; + int ret = 0; + int ret2 = 0; + struct buffer_head map_bh; + unsigned int credits, blkbits = inode->i_blkbits; + + block = offset >> blkbits; + /* + * We can't just convert len to max_blocks because + * If blocksize = 4096 offset = 3072 and len = 2048 + */ + max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) + - block; + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, max_blocks); + while (ret >= 0 && ret < max_blocks) { + block = block + ret; + max_blocks = max_blocks - ret; + handle = ext4_journal_start(inode, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + map_bh.b_state = 0; + ret = ext4_get_blocks(handle, inode, block, + max_blocks, &map_bh, + EXT4_GET_BLOCKS_DIO_CONVERT_EXT); + if (ret <= 0) { + WARN_ON(ret <= 0); + printk(KERN_ERR "%s: ext4_ext_get_blocks " + "returned error inode#%lu, block=%u, " + "max_blocks=%u", __func__, + inode->i_ino, block, max_blocks); + } + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + if (ret <= 0 || ret2 ) + break; + } + return ret > 0 ? ret2 : ret; +} /* * Callback function called for each extent to gather FIEMAP information. */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 219067ce09d9..da4f2ecb5447 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1233,6 +1233,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, clear_buffer_mapped(bh); clear_buffer_unwritten(bh); + ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u," + "logical block %lu\n", inode->i_ino, flags, max_blocks, + (unsigned long)block); /* * Try to see if we can get the block without requesting a new * file system block. diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index bf519f239ae6..a93d5b80f3e2 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -75,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode, goto err_out; } } - retval = ext4_ext_insert_extent(handle, inode, path, &newext); + retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0); err_out: if (path) { ext4_ext_drop_refs(path); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index c07a2915e40b..5332fd4c4028 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -322,7 +322,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, goto out; if (ext4_ext_insert_extent(handle, orig_inode, - orig_path, new_ext)) + orig_path, new_ext, 0)) goto out; } @@ -333,7 +333,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, goto out; if (ext4_ext_insert_extent(handle, orig_inode, - orig_path, end_ext)) + orig_path, end_ext, 0)) goto out; } out: -- cgit v1.2.3 From 4c0425ff68b1b87b802ffeda7b6a46ff7da7241c Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Mon, 28 Sep 2009 15:48:41 -0400 Subject: ext4: Use end_io callback to avoid direct I/O fallback to buffered I/O Currently the DIO VFS code passes create = 0 when writing to the middle of file. It does this to avoid block allocation for holes, so as not to expose stale data out when there is a parallel buffered read (which does not hold the i_mutex lock). Direct I/O writes into holes falls back to buffered IO for this reason. Since preallocated extents are treated as holes when doing a get_block() look up (buffer is not mapped), direct IO over fallocate also falls back to buffered IO. Thus ext4 actually silently falls back to buffered IO in above two cases, which is undesirable. To fix this, this patch creates unitialized extents when a direct I/O write into holes in sparse files, and registering an end_io callback which converts the uninitialized extent to an initialized extent after the I/O is completed. Singed-Off-By: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 + fs/ext4/inode.c | 197 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- fs/ext4/super.c | 11 ++++ 3 files changed, 210 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2b4293aac162..ccb4dbf359c4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -999,6 +999,9 @@ struct ext4_sb_info { unsigned int s_log_groups_per_flex; struct flex_groups *s_flex_groups; + + /* workqueue for dio unwritten */ + struct workqueue_struct *dio_unwritten_wq; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index da4f2ecb5447..5633af6a7045 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "ext4_jbd2.h" #include "xattr.h" @@ -3356,6 +3357,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait) } /* + * O_DIRECT for ext3 (or indirect map) based files + * * If the O_DIRECT write will extend the file then add this inode to the * orphan list. So recovery will truncate it back to the original size * if the machine crashes during the write. @@ -3364,7 +3367,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait) * crashes then stale disk data _may_ be exposed inside the file. But current * VFS code falls back into buffered path in that case so we are safe. */ -static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, +static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { @@ -3438,6 +3441,198 @@ out: return ret; } +/* Maximum number of blocks we map for direct IO at once. */ + +static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + handle_t *handle = NULL; + int ret = 0; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + int dio_credits; + + /* + * DIO VFS code passes create = 0 flag for write to + * the middle of file. It does this to avoid block + * allocation for holes, to prevent expose stale data + * out when there is parallel buffered read (which does + * not hold the i_mutex lock) while direct IO write has + * not completed. DIO request on holes finally falls back + * to buffered IO for this reason. + * + * For ext4 extent based file, since we support fallocate, + * new allocated extent as uninitialized, for holes, we + * could fallocate blocks for holes, thus parallel + * buffered IO read will zero out the page when read on + * a hole while parallel DIO write to the hole has not completed. + * + * when we come here, we know it's a direct IO write to + * to the middle of file ( DIO_MAX_BLOCKS) + max_blocks = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); + handle = ext4_journal_start(inode, dio_credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, + create); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + ext4_journal_stop(handle); +out: + return ret; +} + +#define DIO_AIO 0x1 + +static void ext4_free_io_end(ext4_io_end_t *io) +{ + kfree(io); +} + +/* + * IO write completion for unwritten extents. + * + * check a range of space and convert unwritten extents to written. + */ +static void ext4_end_dio_unwritten(struct work_struct *work) +{ + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); + struct inode *inode = io->inode; + loff_t offset = io->offset; + size_t size = io->size; + int ret = 0; + int aio = io->flag & DIO_AIO; + + if (aio) + mutex_lock(&inode->i_mutex); + if (offset + size <= i_size_read(inode)) + ret = ext4_convert_unwritten_extents(inode, offset, size); + + if (ret < 0) + printk(KERN_EMERG "%s: failed to convert unwritten" + "extents to written extents, error is %d\n", + __func__, ret); + + ext4_free_io_end(io); + if (aio) + mutex_unlock(&inode->i_mutex); +} + +static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int flag) +{ + ext4_io_end_t *io = NULL; + + io = kmalloc(sizeof(*io), GFP_NOFS); + + if (io) { + io->inode = inode; + io->flag = flag; + io->offset = 0; + io->size = 0; + io->error = 0; + INIT_WORK(&io->work, ext4_end_dio_unwritten); + } + + return io; +} + +static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, + ssize_t size, void *private) +{ + ext4_io_end_t *io_end = iocb->private; + struct workqueue_struct *wq; + + /* if not hole or unwritten extents, just simple return */ + if (!io_end || !size || !iocb->private) + return; + io_end->offset = offset; + io_end->size = size; + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; + + /* We need to convert unwritten extents to written */ + queue_work(wq, &io_end->work); + + if (is_sync_kiocb(iocb)) + flush_workqueue(wq); + + iocb->private = NULL; +} +/* + * For ext4 extent files, ext4 will do direct-io write to holes, + * preallocated extents, and those write extend the file, no need to + * fall back to buffered IO. + * + * For holes, we fallocate those blocks, mark them as unintialized + * If those blocks were preallocated, we mark sure they are splited, but + * still keep the range to write as unintialized. + * + * When end_io call back function called at the last IO complete time, + * those extents will be converted to written extents. + * + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size + * if the machine crashes during the write. + * + */ +static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + size_t count = iov_length(iov, nr_segs); + + loff_t final_size = offset + count; + if (rw == WRITE && final_size <= inode->i_size) { + /* + * For DIO we fallocate blocks for holes, we fallocate blocks + * The fallocated extent for hole is marked as uninitialized + * to prevent paralel buffered read to expose the stale data + * before DIO complete the data IO. + * as for previously fallocated extents, ext4 get_block + * will just simply mark the buffer mapped but still + * keep the extents uninitialized. + * + * At the end of IO, the ext4 end_io callback function + * will convert those unwritten extents to written, + * + */ + iocb->private = ext4_init_io_end(inode, !is_sync_kiocb(iocb)); + if (!iocb->private) + return -ENOMEM; + ret = blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block_dio_write, + ext4_end_io_dio); + return ret; + } + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); +} + +static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) + return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); + + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); +} + /* * Pages can be marked dirty completely asynchronously from ext4's journalling * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 16817737ba52..1a03ea98fdd1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -580,6 +580,9 @@ static void ext4_put_super(struct super_block *sb) struct ext4_super_block *es = sbi->s_es; int i, err; + flush_workqueue(sbi->dio_unwritten_wq); + destroy_workqueue(sbi->dio_unwritten_wq); + lock_super(sb); lock_kernel(); if (sb->s_dirt) @@ -2801,6 +2804,12 @@ no_journal: clear_opt(sbi->s_mount_opt, NOBH); } } + EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); + if (!EXT4_SB(sb)->dio_unwritten_wq) { + printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); + goto failed_mount_wq; + } + /* * The jbd2_journal_load will have done any necessary log recovery, * so we can safely mount the rest of the filesystem now. @@ -2913,6 +2922,8 @@ cantfind_ext4: failed_mount4: ext4_msg(sb, KERN_ERR, "mount failed"); + destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); +failed_mount_wq: ext4_release_system_zone(sb); if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); -- cgit v1.2.3 From 8d5d02e6b176565c77ff03604908b1453a22044d Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Mon, 28 Sep 2009 15:48:29 -0400 Subject: ext4: async direct IO for holes and fallocate support For async direct IO that covers holes or fallocate, the end_io callback function now queued the convertion work on workqueue but don't flush the work rightaway as it might take too long to afford. But when fsync is called after all the data is completed, user expects the metadata also being updated before fsync returns. Thus we need to flush the conversion work when fsync() is called. This patch keep track of a listed of completed async direct io that has a work queued on workqueue. When fsync() is called, it will go through the list and do the conversion. Signed-off-by: Mingming Cao --- fs/ext4/ext4.h | 12 ++- fs/ext4/extents.c | 19 ++++- fs/ext4/fsync.c | 5 ++ fs/ext4/inode.c | 231 +++++++++++++++++++++++++++++++++++++++++++++--------- fs/ext4/super.c | 8 +- 5 files changed, 234 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ccb4dbf359c4..b491576e11c3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -127,10 +127,11 @@ struct mpage_da_data { int pages_written; int retval; }; - +#define DIO_AIO_UNWRITTEN 0x1 typedef struct ext4_io_end { + struct list_head list; /* per-file finished AIO list */ struct inode *inode; /* file being written to */ - unsigned int flag; /* sync IO or AIO */ + unsigned int flag; /* unwritten or not */ int error; /* I/O error code */ ext4_lblk_t offset; /* offset in the file */ size_t size; /* size of the extent */ @@ -690,6 +691,11 @@ struct ext4_inode_info { __u16 i_extra_isize; spinlock_t i_block_reservation_lock; + + /* completed async DIOs that might need unwritten extents handling */ + struct list_head i_aio_dio_complete_list; + /* current io_end structure for async DIO write*/ + ext4_io_end_t *cur_aio_dio; }; /* @@ -1419,7 +1425,7 @@ extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t ext4_get_reserved_space(struct inode *inode); - +extern int flush_aio_dio_completed_IO(struct inode *inode); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a38e651c004e..10a63096a95a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3033,6 +3033,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, { int ret = 0; int err = 0; + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" "block %llu, max_blocks %u, flags %d, allocated %u", @@ -3045,6 +3046,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ret = ext4_split_unwritten_extents(handle, inode, path, iblock, max_blocks, flags); + /* flag the io_end struct that we need convert when IO done */ + if (io) + io->flag = DIO_AIO_UNWRITTEN; goto out; } /* DIO end_io complete, convert the filled extent to written */ @@ -3130,6 +3134,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, int err = 0, depth, ret, cache_type; unsigned int allocated = 0; struct ext4_allocation_request ar; + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; __clear_bit(BH_New, &bh_result->b_state); ext_debug("blocks %u/%u requested for inode %lu\n", @@ -3279,8 +3284,20 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* try to insert new extent into found leaf and return */ ext4_ext_store_pblock(&newex, newblock); newex.ee_len = cpu_to_le16(ar.len); - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */ + /* Mark uninitialized */ + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ ext4_ext_mark_uninitialized(&newex); + /* + * io_end structure was created for every async + * direct IO write to the middle of the file. + * To avoid unecessary convertion for every aio dio rewrite + * to the mid of file, here we flag the IO that is really + * need the convertion. + * + */ + if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) + io->flag = DIO_AIO_UNWRITTEN; + } err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err) { /* free data blocks we just allocated */ diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 07475740b512..2b1531266ee2 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -44,6 +44,8 @@ * * What we do is just kick off a commit and wait on it. This will snapshot the * inode to disk. + * + * i_mutex lock is held when entering and exiting this function */ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) @@ -56,6 +58,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) trace_ext4_sync_file(file, dentry, datasync); + ret = flush_aio_dio_completed_IO(inode); + if (ret < 0) + goto out; /* * data=writeback: * The caller's filemap_fdatawrite()/wait will sync the data. diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5633af6a7045..118e16ca91d7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3451,6 +3451,8 @@ static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; int dio_credits; + ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n", + inode->i_ino, create); /* * DIO VFS code passes create = 0 flag for write to * the middle of file. It does this to avoid block @@ -3491,55 +3493,152 @@ out: return ret; } -#define DIO_AIO 0x1 - static void ext4_free_io_end(ext4_io_end_t *io) { + BUG_ON(!io); + iput(io->inode); kfree(io); } +static void dump_aio_dio_list(struct inode * inode) +{ +#ifdef EXT4_DEBUG + struct list_head *cur, *before, *after; + ext4_io_end_t *io, *io0, *io1; + + if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ + ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino); + return; + } + + ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino); + list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){ + cur = &io->list; + before = cur->prev; + io0 = container_of(before, ext4_io_end_t, list); + after = cur->next; + io1 = container_of(after, ext4_io_end_t, list); + + ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", + io, inode->i_ino, io0, io1); + } +#endif +} /* - * IO write completion for unwritten extents. - * * check a range of space and convert unwritten extents to written. */ -static void ext4_end_dio_unwritten(struct work_struct *work) +static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) { - ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); struct inode *inode = io->inode; loff_t offset = io->offset; size_t size = io->size; int ret = 0; - int aio = io->flag & DIO_AIO; - if (aio) - mutex_lock(&inode->i_mutex); + ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p," + "list->prev 0x%p\n", + io, inode->i_ino, io->list.next, io->list.prev); + + if (list_empty(&io->list)) + return ret; + + if (io->flag != DIO_AIO_UNWRITTEN) + return ret; + if (offset + size <= i_size_read(inode)) ret = ext4_convert_unwritten_extents(inode, offset, size); - if (ret < 0) + if (ret < 0) { printk(KERN_EMERG "%s: failed to convert unwritten" - "extents to written extents, error is %d\n", - __func__, ret); + "extents to written extents, error is %d" + " io is still on inode %lu aio dio list\n", + __func__, ret, inode->i_ino); + return ret; + } + + /* clear the DIO AIO unwritten flag */ + io->flag = 0; + return ret; +} +/* + * work on completed aio dio IO, to convert unwritten extents to extents + */ +static void ext4_end_aio_dio_work(struct work_struct *work) +{ + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); + struct inode *inode = io->inode; + int ret = 0; + + mutex_lock(&inode->i_mutex); + ret = ext4_end_aio_dio_nolock(io); + if (ret >= 0) { + if (!list_empty(&io->list)) + list_del_init(&io->list); + ext4_free_io_end(io); + } + mutex_unlock(&inode->i_mutex); +} +/* + * This function is called from ext4_sync_file(). + * + * When AIO DIO IO is completed, the work to convert unwritten + * extents to written is queued on workqueue but may not get immediately + * scheduled. When fsync is called, we need to ensure the + * conversion is complete before fsync returns. + * The inode keeps track of a list of completed AIO from DIO path + * that might needs to do the conversion. This function walks through + * the list and convert the related unwritten extents to written. + */ +int flush_aio_dio_completed_IO(struct inode *inode) +{ + ext4_io_end_t *io; + int ret = 0; + int ret2 = 0; - ext4_free_io_end(io); - if (aio) - mutex_unlock(&inode->i_mutex); + if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) + return ret; + + dump_aio_dio_list(inode); + while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ + io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next, + ext4_io_end_t, list); + /* + * Calling ext4_end_aio_dio_nolock() to convert completed + * IO to written. + * + * When ext4_sync_file() is called, run_queue() may already + * about to flush the work corresponding to this io structure. + * It will be upset if it founds the io structure related + * to the work-to-be schedule is freed. + * + * Thus we need to keep the io structure still valid here after + * convertion finished. The io structure has a flag to + * avoid double converting from both fsync and background work + * queue work. + */ + ret = ext4_end_aio_dio_nolock(io); + if (ret < 0) + ret2 = ret; + else + list_del_init(&io->list); + } + return (ret2 < 0) ? ret2 : 0; } -static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int flag) +static ext4_io_end_t *ext4_init_io_end (struct inode *inode) { ext4_io_end_t *io = NULL; io = kmalloc(sizeof(*io), GFP_NOFS); if (io) { + igrab(inode); io->inode = inode; - io->flag = flag; + io->flag = 0; io->offset = 0; io->size = 0; io->error = 0; - INIT_WORK(&io->work, ext4_end_dio_unwritten); + INIT_WORK(&io->work, ext4_end_aio_dio_work); + INIT_LIST_HEAD(&io->list); } return io; @@ -3551,19 +3650,31 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ext4_io_end_t *io_end = iocb->private; struct workqueue_struct *wq; - /* if not hole or unwritten extents, just simple return */ - if (!io_end || !size || !iocb->private) + ext_debug("ext4_end_io_dio(): io_end 0x%p" + "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", + iocb->private, io_end->inode->i_ino, iocb, offset, + size); + /* if not async direct IO or dio with 0 bytes write, just return */ + if (!io_end || !size) return; + + /* if not aio dio with unwritten extents, just free io and return */ + if (io_end->flag != DIO_AIO_UNWRITTEN){ + ext4_free_io_end(io_end); + iocb->private = NULL; + return; + } + io_end->offset = offset; io_end->size = size; wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; - /* We need to convert unwritten extents to written */ + /* queue the work to convert unwritten extents to written */ queue_work(wq, &io_end->work); - if (is_sync_kiocb(iocb)) - flush_workqueue(wq); - + /* Add the io_end to per-inode completed aio dio list*/ + list_add_tail(&io_end->list, + &EXT4_I(io_end->inode)->i_aio_dio_complete_list); iocb->private = NULL; } /* @@ -3575,8 +3686,10 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, * If those blocks were preallocated, we mark sure they are splited, but * still keep the range to write as unintialized. * - * When end_io call back function called at the last IO complete time, - * those extents will be converted to written extents. + * The unwrritten extents will be converted to written when DIO is completed. + * For async direct IO, since the IO may still pending when return, we + * set up an end_io call back function, which will do the convertion + * when async direct IO completed. * * If the O_DIRECT write will extend the file then add this inode to the * orphan list. So recovery will truncate it back to the original size @@ -3595,28 +3708,76 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, loff_t final_size = offset + count; if (rw == WRITE && final_size <= inode->i_size) { /* - * For DIO we fallocate blocks for holes, we fallocate blocks - * The fallocated extent for hole is marked as uninitialized + * We could direct write to holes and fallocate. + * + * Allocated blocks to fill the hole are marked as uninitialized * to prevent paralel buffered read to expose the stale data * before DIO complete the data IO. - * as for previously fallocated extents, ext4 get_block + * + * As to previously fallocated extents, ext4 get_block * will just simply mark the buffer mapped but still * keep the extents uninitialized. * - * At the end of IO, the ext4 end_io callback function - * will convert those unwritten extents to written, - * + * for non AIO case, we will convert those unwritten extents + * to written after return back from blockdev_direct_IO. + * + * for async DIO, the conversion needs to be defered when + * the IO is completed. The ext4 end_io callback function + * will be called to take care of the conversion work. + * Here for async case, we allocate an io_end structure to + * hook to the iocb. */ - iocb->private = ext4_init_io_end(inode, !is_sync_kiocb(iocb)); - if (!iocb->private) - return -ENOMEM; + iocb->private = NULL; + EXT4_I(inode)->cur_aio_dio = NULL; + if (!is_sync_kiocb(iocb)) { + iocb->private = ext4_init_io_end(inode); + if (!iocb->private) + return -ENOMEM; + /* + * we save the io structure for current async + * direct IO, so that later ext4_get_blocks() + * could flag the io structure whether there + * is a unwritten extents needs to be converted + * when IO is completed. + */ + EXT4_I(inode)->cur_aio_dio = iocb->private; + } + ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ext4_get_block_dio_write, ext4_end_io_dio); + if (iocb->private) + EXT4_I(inode)->cur_aio_dio = NULL; + /* + * The io_end structure takes a reference to the inode, + * that structure needs to be destroyed and the + * reference to the inode need to be dropped, when IO is + * complete, even with 0 byte write, or failed. + * + * In the successful AIO DIO case, the io_end structure will be + * desctroyed and the reference to the inode will be dropped + * after the end_io call back function is called. + * + * In the case there is 0 byte write, or error case, since + * VFS direct IO won't invoke the end_io call back function, + * we need to free the end_io structure here. + */ + if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { + ext4_free_io_end(iocb->private); + iocb->private = NULL; + } else if (ret > 0) + /* + * for non AIO case, since the IO is already + * completed, we could do the convertion right here + */ + ret = ext4_convert_unwritten_extents(inode, + offset, ret); return ret; } + + /* for write the the end of file case, we fall back to old way */ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1a03ea98fdd1..f095c60b569e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -687,6 +687,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_allocated_meta_blocks = 0; ei->i_delalloc_reserved_flag = 0; spin_lock_init(&(ei->i_block_reservation_lock)); + INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); + ei->cur_aio_dio = NULL; return &ei->vfs_inode; } @@ -3375,11 +3377,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait) { int ret = 0; tid_t target; + struct ext4_sb_info *sbi = EXT4_SB(sb); trace_ext4_sync_fs(sb, wait); - if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { + flush_workqueue(sbi->dio_unwritten_wq); + if (jbd2_journal_start_commit(sbi->s_journal, &target)) { if (wait) - jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); + jbd2_log_wait_commit(sbi->s_journal, target); } return ret; } -- cgit v1.2.3 From f3ce8064b388ccf420012c5a4907aae4f13fe9d0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 28 Sep 2009 15:58:29 -0400 Subject: ext4: EXT4_IOC_MOVE_EXT: Check for different original and donor inodes first Move the check to make sure the original and donor inodes are different earlier, to avoid a potential deadlock by trying to lock the same inode twice. Signed-off-by: "Theodore Ts'o" --- fs/ext4/move_extent.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 5332fd4c4028..25b6b1457360 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -1001,14 +1001,6 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } - /* orig and donor should be different file */ - if (orig_inode->i_ino == donor_inode->i_ino) { - ext4_debug("ext4 move extent: The argument files should not " - "be same file [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - /* Ext4 move extent supports only extent based file */ if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { ext4_debug("ext4 move extent: orig file is not extents " @@ -1232,6 +1224,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, int block_len_in_page; int uninit; + /* orig and donor should be different file */ + if (orig_inode->i_ino == donor_inode->i_ino) { + ext4_debug("ext4 move extent: The argument files should not " + "be same file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + /* protect orig and donor against a truncate */ ret1 = mext_inode_double_lock(orig_inode, donor_inode); if (ret1 < 0) -- cgit v1.2.3 From 9ed74f2dba6ebf9f30b80554290bfc73cc3ef083 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 11 Sep 2009 16:12:44 -0400 Subject: Btrfs: proper -ENOSPC handling At the start of a transaction we do a btrfs_reserve_metadata_space() and specify how many items we plan on modifying. Then once we've done our modifications and such, just call btrfs_unreserve_metadata_space() for the same number of items we reserved. For keeping track of metadata needed for data I've had to add an extent_io op for when we merge extents. This lets us track space properly when we are doing sequential writes, so we don't end up reserving way more metadata space than what we need. The only place where the metadata space accounting is not done is in the relocation code. This is because Yan is going to be reworking that code in the near future, so running btrfs-vol -b could still possibly result in a ENOSPC related panic. This patch also turns off the metadata_ratio stuff in order to allow users to more efficiently use their disk space. This patch makes it so we track how much metadata we need for an inode's delayed allocation extents by tracking how many extents are currently waiting for allocation. It introduces two new callbacks for the extent_io tree's, merge_extent_hook and split_extent_hook. These help us keep track of when we merge delalloc extents together and split them up. Reservations are handled prior to any actually dirty'ing occurs, and then we unreserve after we dirty. btrfs_unreserve_metadata_for_delalloc() will make the appropriate unreservations as needed based on the number of reservations we currently have and the number of extents we currently have. Doing the reservation outside of doing any of the actual dirty'ing lets us do things like filemap_flush() the inode to try and force delalloc to happen, or as a last resort actually start allocation on all delalloc inodes in the fs. This has survived dbench, fs_mark and an fsx torture test. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 8 + fs/btrfs/ctree.h | 23 ++- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent-tree.c | 389 +++++++++++++++++++++++++++++++++++++++++++------ fs/btrfs/extent_io.c | 92 +++++++++--- fs/btrfs/extent_io.h | 13 +- fs/btrfs/file.c | 11 +- fs/btrfs/inode.c | 224 ++++++++++++++++++++++++---- fs/btrfs/ioctl.c | 21 ++- fs/btrfs/transaction.c | 10 ++ 10 files changed, 678 insertions(+), 115 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 82ee56bba299..a54d354cefcb 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -127,6 +127,14 @@ struct btrfs_inode { */ u64 last_unlink_trans; + /* + * These two counters are for delalloc metadata reservations. We keep + * track of how many extents we've accounted for vs how many extents we + * have. + */ + int delalloc_reserved_extents; + int delalloc_extents; + /* * ordered_data_close is set by truncate when a file that used * to have good data has been truncated to zero. When it is set diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 80599b4e42bd..b3959a150c3b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -675,18 +675,19 @@ struct btrfs_space_info { current allocations */ u64 bytes_readonly; /* total bytes that are read only */ u64 bytes_super; /* total bytes reserved for the super blocks */ - - /* delalloc accounting */ - u64 bytes_delalloc; /* number of bytes reserved for allocation, - this space is not necessarily reserved yet - by the allocator */ + u64 bytes_root; /* the number of bytes needed to commit a + transaction */ u64 bytes_may_use; /* number of bytes that may be used for - delalloc */ + delalloc/allocations */ + u64 bytes_delalloc; /* number of bytes currently reserved for + delayed allocation */ int full; /* indicates that we cannot allocate any more chunks for this space */ int force_alloc; /* set if we need to force a chunk alloc for this space */ + int force_delalloc; /* make people start doing filemap_flush until + we're under a threshold */ struct list_head list; @@ -695,6 +696,9 @@ struct btrfs_space_info { spinlock_t lock; struct rw_semaphore groups_sem; atomic_t caching_threads; + + int allocating_chunk; + wait_queue_head_t wait; }; /* @@ -2022,7 +2026,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); -int btrfs_check_metadata_free_space(struct btrfs_root *root); +int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items); +int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items); +int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items); +int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items); int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct btrfs_root *root, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f1e905f7e701..ece8d1e26b5e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1629,7 +1629,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->sb = sb; fs_info->max_extent = (u64)-1; fs_info->max_inline = 8192 * 1024; - fs_info->metadata_ratio = 8; + fs_info->metadata_ratio = 0; fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 90d314eeff6d..a4b2b03cd682 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -68,6 +68,8 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, struct extent_buffer **must_clean); static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); +static void dump_space_info(struct btrfs_space_info *info, u64 bytes, + int dump_block_groups); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -2764,67 +2766,346 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) alloc_target); } +static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items) +{ + u64 num_bytes; + int level; + + level = BTRFS_MAX_LEVEL - 2; + /* + * NOTE: these calculations are absolutely the worst possible case. + * This assumes that _every_ item we insert will require a new leaf, and + * that the tree has grown to its maximum level size. + */ + + /* + * for every item we insert we could insert both an extent item and a + * extent ref item. Then for ever item we insert, we will need to cow + * both the original leaf, plus the leaf to the left and right of it. + * + * Unless we are talking about the extent root, then we just want the + * number of items * 2, since we just need the extent item plus its ref. + */ + if (root == root->fs_info->extent_root) + num_bytes = num_items * 2; + else + num_bytes = (num_items + (2 * num_items)) * 3; + + /* + * num_bytes is total number of leaves we could need times the leaf + * size, and then for every leaf we could end up cow'ing 2 nodes per + * level, down to the leaf level. + */ + num_bytes = (num_bytes * root->leafsize) + + (num_bytes * (level * 2)) * root->nodesize; + + return num_bytes; +} + /* - * for now this just makes sure we have at least 5% of our metadata space free - * for use. + * Unreserve metadata space for delalloc. If we have less reserved credits than + * we have extents, this function does nothing. */ -int btrfs_check_metadata_free_space(struct btrfs_root *root) +int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items) { struct btrfs_fs_info *info = root->fs_info; struct btrfs_space_info *meta_sinfo; - u64 alloc_target, thresh; - int committed = 0, ret; + u64 num_bytes; + u64 alloc_target; + bool bug = false; /* get the space info for where the metadata will live */ alloc_target = btrfs_get_alloc_profile(root, 0); meta_sinfo = __find_space_info(info, alloc_target); - if (!meta_sinfo) - goto alloc; -again: + num_bytes = calculate_bytes_needed(root->fs_info->extent_root, + num_items); + spin_lock(&meta_sinfo->lock); - if (!meta_sinfo->full) - thresh = meta_sinfo->total_bytes * 80; - else - thresh = meta_sinfo->total_bytes * 95; + if (BTRFS_I(inode)->delalloc_reserved_extents <= + BTRFS_I(inode)->delalloc_extents) { + spin_unlock(&meta_sinfo->lock); + return 0; + } + + BTRFS_I(inode)->delalloc_reserved_extents--; + BUG_ON(BTRFS_I(inode)->delalloc_reserved_extents < 0); + + if (meta_sinfo->bytes_delalloc < num_bytes) { + bug = true; + meta_sinfo->bytes_delalloc = 0; + } else { + meta_sinfo->bytes_delalloc -= num_bytes; + } + spin_unlock(&meta_sinfo->lock); + BUG_ON(bug); + + return 0; +} + +static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) +{ + u64 thresh; + + thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + + meta_sinfo->bytes_super + meta_sinfo->bytes_root + + meta_sinfo->bytes_may_use; + + thresh = meta_sinfo->total_bytes - thresh; + thresh *= 80; do_div(thresh, 100); + if (thresh <= meta_sinfo->bytes_delalloc) + meta_sinfo->force_delalloc = 1; + else + meta_sinfo->force_delalloc = 0; +} - if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + - meta_sinfo->bytes_super > thresh) { - struct btrfs_trans_handle *trans; - if (!meta_sinfo->full) { - meta_sinfo->force_alloc = 1; +static int maybe_allocate_chunk(struct btrfs_root *root, + struct btrfs_space_info *info) +{ + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_trans_handle *trans; + bool wait = false; + int ret = 0; + u64 min_metadata; + u64 free_space; + + free_space = btrfs_super_total_bytes(disk_super); + /* + * we allow the metadata to grow to a max of either 5gb or 5% of the + * space in the volume. + */ + min_metadata = min((u64)5 * 1024 * 1024 * 1024, + div64_u64(free_space * 5, 100)); + if (info->total_bytes >= min_metadata) { + spin_unlock(&info->lock); + return 0; + } + + if (info->full) { + spin_unlock(&info->lock); + return 0; + } + + if (!info->allocating_chunk) { + info->force_alloc = 1; + info->allocating_chunk = 1; + init_waitqueue_head(&info->wait); + } else { + wait = true; + } + + spin_unlock(&info->lock); + + if (wait) { + wait_event(info->wait, + !info->allocating_chunk); + return 1; + } + + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto out; + } + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + 4096 + 2 * 1024 * 1024, + info->flags, 0); + btrfs_end_transaction(trans, root); + if (ret) + goto out; +out: + spin_lock(&info->lock); + info->allocating_chunk = 0; + spin_unlock(&info->lock); + wake_up(&info->wait); + + if (ret) + return 0; + return 1; +} + +/* + * Reserve metadata space for delalloc. + */ +int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; + u64 num_bytes; + u64 used; + u64 alloc_target; + int flushed = 0; + int force_delalloc; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + + num_bytes = calculate_bytes_needed(root->fs_info->extent_root, + num_items); +again: + spin_lock(&meta_sinfo->lock); + + force_delalloc = meta_sinfo->force_delalloc; + + if (unlikely(!meta_sinfo->bytes_root)) + meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); + + if (!flushed) + meta_sinfo->bytes_delalloc += num_bytes; + + used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + + meta_sinfo->bytes_super + meta_sinfo->bytes_root + + meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; + + if (used > meta_sinfo->total_bytes) { + flushed++; + + if (flushed == 1) { + if (maybe_allocate_chunk(root, meta_sinfo)) + goto again; + flushed++; + } else { spin_unlock(&meta_sinfo->lock); -alloc: - trans = btrfs_start_transaction(root, 1); - if (!trans) - return -ENOMEM; + } - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - 2 * 1024 * 1024, alloc_target, 0); - btrfs_end_transaction(trans, root); - if (!meta_sinfo) { - meta_sinfo = __find_space_info(info, - alloc_target); - } + if (flushed == 2) { + filemap_flush(inode->i_mapping); + goto again; + } else if (flushed == 3) { + btrfs_start_delalloc_inodes(root); + btrfs_wait_ordered_extents(root, 0); goto again; } + spin_lock(&meta_sinfo->lock); + meta_sinfo->bytes_delalloc -= num_bytes; spin_unlock(&meta_sinfo->lock); + printk(KERN_ERR "enospc, has %d, reserved %d\n", + BTRFS_I(inode)->delalloc_extents, + BTRFS_I(inode)->delalloc_reserved_extents); + dump_space_info(meta_sinfo, 0, 0); + return -ENOSPC; + } - if (!committed) { - committed = 1; - trans = btrfs_join_transaction(root, 1); - if (!trans) - return -ENOMEM; - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; + BTRFS_I(inode)->delalloc_reserved_extents++; + check_force_delalloc(meta_sinfo); + spin_unlock(&meta_sinfo->lock); + + if (!flushed && force_delalloc) + filemap_flush(inode->i_mapping); + + return 0; +} + +/* + * unreserve num_items number of items worth of metadata space. This needs to + * be paired with btrfs_reserve_metadata_space. + * + * NOTE: if you have the option, run this _AFTER_ you do a + * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref + * oprations which will result in more used metadata, so we want to make sure we + * can do that without issue. + */ +int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; + u64 num_bytes; + u64 alloc_target; + bool bug = false; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + + num_bytes = calculate_bytes_needed(root, num_items); + + spin_lock(&meta_sinfo->lock); + if (meta_sinfo->bytes_may_use < num_bytes) { + bug = true; + meta_sinfo->bytes_may_use = 0; + } else { + meta_sinfo->bytes_may_use -= num_bytes; + } + spin_unlock(&meta_sinfo->lock); + + BUG_ON(bug); + + return 0; +} + +/* + * Reserve some metadata space for use. We'll calculate the worste case number + * of bytes that would be needed to modify num_items number of items. If we + * have space, fantastic, if not, you get -ENOSPC. Please call + * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of + * items you reserved, since whatever metadata you needed should have already + * been allocated. + * + * This will commit the transaction to make more space if we don't have enough + * metadata space. THe only time we don't do this is if we're reserving space + * inside of a transaction, then we will just return -ENOSPC and it is the + * callers responsibility to handle it properly. + */ +int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; + u64 num_bytes; + u64 used; + u64 alloc_target; + int retries = 0; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + + num_bytes = calculate_bytes_needed(root, num_items); +again: + spin_lock(&meta_sinfo->lock); + + if (unlikely(!meta_sinfo->bytes_root)) + meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); + + if (!retries) + meta_sinfo->bytes_may_use += num_bytes; + + used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + + meta_sinfo->bytes_super + meta_sinfo->bytes_root + + meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; + + if (used > meta_sinfo->total_bytes) { + retries++; + if (retries == 1) { + if (maybe_allocate_chunk(root, meta_sinfo)) + goto again; + retries++; + } else { + spin_unlock(&meta_sinfo->lock); + } + + if (retries == 2) { + btrfs_start_delalloc_inodes(root); + btrfs_wait_ordered_extents(root, 0); goto again; } + spin_lock(&meta_sinfo->lock); + meta_sinfo->bytes_may_use -= num_bytes; + spin_unlock(&meta_sinfo->lock); + + dump_space_info(meta_sinfo, 0, 0); return -ENOSPC; } + + check_force_delalloc(meta_sinfo); spin_unlock(&meta_sinfo->lock); return 0; @@ -2915,7 +3196,7 @@ alloc: BTRFS_I(inode)->reserved_bytes += bytes; spin_unlock(&data_sinfo->lock); - return btrfs_check_metadata_free_space(root); + return 0; } /* @@ -3014,17 +3295,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, BUG_ON(!space_info); spin_lock(&space_info->lock); - if (space_info->force_alloc) { + if (space_info->force_alloc) force = 1; - space_info->force_alloc = 0; - } if (space_info->full) { spin_unlock(&space_info->lock); goto out; } thresh = space_info->total_bytes - space_info->bytes_readonly; - thresh = div_factor(thresh, 6); + thresh = div_factor(thresh, 8); if (!force && (space_info->bytes_used + space_info->bytes_pinned + space_info->bytes_reserved + alloc_bytes) < thresh) { @@ -3038,7 +3317,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, * we keep a reasonable number of metadata chunks allocated in the * FS as well. */ - if (flags & BTRFS_BLOCK_GROUP_DATA) { + if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { fs_info->data_chunk_allocations++; if (!(fs_info->data_chunk_allocations % fs_info->metadata_ratio)) @@ -3046,8 +3325,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, } ret = btrfs_alloc_chunk(trans, extent_root, flags); + spin_lock(&space_info->lock); if (ret) space_info->full = 1; + space_info->force_alloc = 0; + spin_unlock(&space_info->lock); out: mutex_unlock(&extent_root->fs_info->chunk_mutex); return ret; @@ -4062,21 +4344,32 @@ loop: return ret; } -static void dump_space_info(struct btrfs_space_info *info, u64 bytes) +static void dump_space_info(struct btrfs_space_info *info, u64 bytes, + int dump_block_groups) { struct btrfs_block_group_cache *cache; + spin_lock(&info->lock); printk(KERN_INFO "space_info has %llu free, is %sfull\n", (unsigned long long)(info->total_bytes - info->bytes_used - - info->bytes_pinned - info->bytes_reserved), + info->bytes_pinned - info->bytes_reserved - + info->bytes_super), (info->full) ? "" : "not "); printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," - " may_use=%llu, used=%llu\n", + " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" + "\n", (unsigned long long)info->total_bytes, (unsigned long long)info->bytes_pinned, (unsigned long long)info->bytes_delalloc, (unsigned long long)info->bytes_may_use, - (unsigned long long)info->bytes_used); + (unsigned long long)info->bytes_used, + (unsigned long long)info->bytes_root, + (unsigned long long)info->bytes_super, + (unsigned long long)info->bytes_reserved); + spin_unlock(&info->lock); + + if (!dump_block_groups) + return; down_read(&info->groups_sem); list_for_each_entry(cache, &info->block_groups, list) { @@ -4144,7 +4437,7 @@ again: printk(KERN_ERR "btrfs allocation failed flags %llu, " "wanted %llu\n", (unsigned long long)data, (unsigned long long)num_bytes); - dump_space_info(sinfo, num_bytes); + dump_space_info(sinfo, num_bytes, 1); } return ret; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0cb88f8146ea..de1793ba004a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree, return NULL; } +static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, + struct extent_state *other) +{ + if (tree->ops && tree->ops->merge_extent_hook) + tree->ops->merge_extent_hook(tree->mapping->host, new, + other); +} + /* * utility function to look for merge candidates inside a given range. * Any extents with matching state are merged together into a single @@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree, other = rb_entry(other_node, struct extent_state, rb_node); if (other->end == state->start - 1 && other->state == state->state) { + merge_cb(tree, state, other); state->start = other->start; other->tree = NULL; rb_erase(&other->rb_node, &tree->state); @@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree, other = rb_entry(other_node, struct extent_state, rb_node); if (other->start == state->end + 1 && other->state == state->state) { + merge_cb(tree, state, other); other->start = state->start; state->tree = NULL; rb_erase(&state->rb_node, &tree->state); free_extent_state(state); + state = NULL; } } + return 0; } -static void set_state_cb(struct extent_io_tree *tree, +static int set_state_cb(struct extent_io_tree *tree, struct extent_state *state, unsigned long bits) { if (tree->ops && tree->ops->set_bit_hook) { - tree->ops->set_bit_hook(tree->mapping->host, state->start, - state->end, state->state, bits); + return tree->ops->set_bit_hook(tree->mapping->host, + state->start, state->end, + state->state, bits); } + + return 0; } static void clear_state_cb(struct extent_io_tree *tree, struct extent_state *state, unsigned long bits) { - if (tree->ops && tree->ops->clear_bit_hook) { - tree->ops->clear_bit_hook(tree->mapping->host, state->start, - state->end, state->state, bits); - } + if (tree->ops && tree->ops->clear_bit_hook) + tree->ops->clear_bit_hook(tree->mapping->host, state, bits); } /* @@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree, int bits) { struct rb_node *node; + int ret; if (end < start) { printk(KERN_ERR "btrfs end < start %llu %llu\n", @@ -365,11 +379,14 @@ static int insert_state(struct extent_io_tree *tree, (unsigned long long)start); WARN_ON(1); } - if (bits & EXTENT_DIRTY) - tree->dirty_bytes += end - start + 1; state->start = start; state->end = end; - set_state_cb(tree, state, bits); + ret = set_state_cb(tree, state, bits); + if (ret) + return ret; + + if (bits & EXTENT_DIRTY) + tree->dirty_bytes += end - start + 1; state->state |= bits; node = tree_insert(&tree->state, end, &state->rb_node); if (node) { @@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree, return 0; } +static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, + u64 split) +{ + if (tree->ops && tree->ops->split_extent_hook) + return tree->ops->split_extent_hook(tree->mapping->host, + orig, split); + return 0; +} + /* * split a given extent state struct in two, inserting the preallocated * struct 'prealloc' as the newly created second half. 'split' indicates an @@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct extent_state *prealloc, u64 split) { struct rb_node *node; + + split_cb(tree, orig, split); + prealloc->start = orig->start; prealloc->end = split - 1; prealloc->state = orig->state; @@ -542,8 +571,8 @@ hit_next: if (err) goto out; if (state->end <= end) { - set |= clear_state_bit(tree, state, bits, - wake, delete); + set |= clear_state_bit(tree, state, bits, wake, + delete); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -561,12 +590,11 @@ hit_next: prealloc = alloc_extent_state(GFP_ATOMIC); err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); - if (wake) wake_up(&state->wq); - set |= clear_state_bit(tree, prealloc, bits, - wake, delete); + set |= clear_state_bit(tree, prealloc, bits, wake, delete); + prealloc = NULL; goto out; } @@ -667,16 +695,23 @@ out: return 0; } -static void set_state_bits(struct extent_io_tree *tree, +static int set_state_bits(struct extent_io_tree *tree, struct extent_state *state, int bits) { + int ret; + + ret = set_state_cb(tree, state, bits); + if (ret) + return ret; + if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; tree->dirty_bytes += range; } - set_state_cb(tree, state, bits); state->state |= bits; + + return 0; } static void cache_state(struct extent_state *state, @@ -758,7 +793,10 @@ hit_next: goto out; } - set_state_bits(tree, state, bits); + err = set_state_bits(tree, state, bits); + if (err) + goto out; + cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -805,7 +843,9 @@ hit_next: if (err) goto out; if (state->end <= end) { - set_state_bits(tree, state, bits); + err = set_state_bits(tree, state, bits); + if (err) + goto out; cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -829,11 +869,13 @@ hit_next: this_end = last_start - 1; err = insert_state(tree, prealloc, start, this_end, bits); - cache_state(prealloc, cached_state); - prealloc = NULL; BUG_ON(err == -EEXIST); - if (err) + if (err) { + prealloc = NULL; goto out; + } + cache_state(prealloc, cached_state); + prealloc = NULL; start = this_end + 1; goto search_again; } @@ -852,7 +894,11 @@ hit_next: err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); - set_state_bits(tree, prealloc, bits); + err = set_state_bits(tree, prealloc, bits); + if (err) { + prealloc = NULL; + goto out; + } cache_state(prealloc, cached_state); merge_state(tree, prealloc); prealloc = NULL; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 14ed16fd862d..4794ec891fed 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -60,8 +60,13 @@ struct extent_io_ops { struct extent_state *state, int uptodate); int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, unsigned long old, unsigned long bits); - int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end, - unsigned long old, unsigned long bits); + int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, + unsigned long bits); + int (*merge_extent_hook)(struct inode *inode, + struct extent_state *new, + struct extent_state *other); + int (*split_extent_hook)(struct inode *inode, + struct extent_state *orig, u64 split); int (*write_cache_pages_lock_hook)(struct page *page); }; @@ -79,10 +84,14 @@ struct extent_state { u64 start; u64 end; /* inclusive */ struct rb_node rb_node; + + /* ADD NEW ELEMENTS AFTER THIS */ struct extent_io_tree *tree; wait_queue_head_t wq; atomic_t refs; unsigned long state; + u64 split_start; + u64 split_end; /* for use by the FS */ u64 private; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 571ad3c13b47..1be96ba6f6bb 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -123,7 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, root->sectorsize - 1) & ~((u64)root->sectorsize - 1); end_of_last_block = start_pos + num_bytes - 1; - btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); + err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); + if (err) + return err; + for (i = 0; i < num_pages; i++) { struct page *p = pages[i]; SetPageUptodate(p); @@ -927,6 +930,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, err = file_remove_suid(file); if (err) goto out_nolock; + + err = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + if (err) + goto out_nolock; + file_update_time(file); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); @@ -1028,6 +1036,7 @@ out: mutex_unlock(&inode->i_mutex); if (ret) err = ret; + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); out_nolock: kfree(pages); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 01c5f8b5a34e..3cc5677f5440 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1159,6 +1159,83 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, return ret; } +static int btrfs_split_extent_hook(struct inode *inode, + struct extent_state *orig, u64 split) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 size; + + if (!(orig->state & EXTENT_DELALLOC)) + return 0; + + size = orig->end - orig->start + 1; + if (size > root->fs_info->max_extent) { + u64 num_extents; + u64 new_size; + + new_size = orig->end - split + 1; + num_extents = div64_u64(size + root->fs_info->max_extent - 1, + root->fs_info->max_extent); + + /* + * if we break a large extent up then leave delalloc_extents be, + * since we've already accounted for the large extent. + */ + if (div64_u64(new_size + root->fs_info->max_extent - 1, + root->fs_info->max_extent) < num_extents) + return 0; + } + + BTRFS_I(inode)->delalloc_extents++; + + return 0; +} + +/* + * extent_io.c merge_extent_hook, used to track merged delayed allocation + * extents so we can keep track of new extents that are just merged onto old + * extents, such as when we are doing sequential writes, so we can properly + * account for the metadata space we'll need. + */ +static int btrfs_merge_extent_hook(struct inode *inode, + struct extent_state *new, + struct extent_state *other) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 new_size, old_size; + u64 num_extents; + + /* not delalloc, ignore it */ + if (!(other->state & EXTENT_DELALLOC)) + return 0; + + old_size = other->end - other->start + 1; + if (new->start < other->start) + new_size = other->end - new->start + 1; + else + new_size = new->end - other->start + 1; + + /* we're not bigger than the max, unreserve the space and go */ + if (new_size <= root->fs_info->max_extent) { + BTRFS_I(inode)->delalloc_extents--; + return 0; + } + + /* + * If we grew by another max_extent, just return, we want to keep that + * reserved amount. + */ + num_extents = div64_u64(old_size + root->fs_info->max_extent - 1, + root->fs_info->max_extent); + if (div64_u64(new_size + root->fs_info->max_extent - 1, + root->fs_info->max_extent) > num_extents) + return 0; + + BTRFS_I(inode)->delalloc_extents--; + + return 0; +} + /* * extent_io.c set_bit_hook, used to track delayed allocation * bytes in this file, and to maintain the list of inodes that @@ -1167,6 +1244,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, unsigned long old, unsigned long bits) { + /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testeing for the DELALLOC @@ -1174,6 +1252,8 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, */ if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + + BTRFS_I(inode)->delalloc_extents++; btrfs_delalloc_reserve_space(root, inode, end - start + 1); spin_lock(&root->fs_info->delalloc_lock); BTRFS_I(inode)->delalloc_bytes += end - start + 1; @@ -1190,22 +1270,27 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, /* * extent_io.c clear_bit_hook, see set_bit_hook for why */ -static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, - unsigned long old, unsigned long bits) +static int btrfs_clear_bit_hook(struct inode *inode, + struct extent_state *state, unsigned long bits) { /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testeing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + BTRFS_I(inode)->delalloc_extents--; + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + spin_lock(&root->fs_info->delalloc_lock); - if (end - start + 1 > root->fs_info->delalloc_bytes) { + if (state->end - state->start + 1 > + root->fs_info->delalloc_bytes) { printk(KERN_INFO "btrfs warning: delalloc account " "%llu %llu\n", - (unsigned long long)end - start + 1, + (unsigned long long) + state->end - state->start + 1, (unsigned long long) root->fs_info->delalloc_bytes); btrfs_delalloc_free_space(root, inode, (u64)-1); @@ -1213,9 +1298,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, BTRFS_I(inode)->delalloc_bytes = 0; } else { btrfs_delalloc_free_space(root, inode, - end - start + 1); - root->fs_info->delalloc_bytes -= end - start + 1; - BTRFS_I(inode)->delalloc_bytes -= end - start + 1; + state->end - + state->start + 1); + root->fs_info->delalloc_bytes -= state->end - + state->start + 1; + BTRFS_I(inode)->delalloc_bytes -= state->end - + state->start + 1; } if (BTRFS_I(inode)->delalloc_bytes == 0 && !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { @@ -2950,7 +3038,12 @@ again: goto again; } - btrfs_set_extent_delalloc(inode, page_start, page_end); + ret = btrfs_set_extent_delalloc(inode, page_start, page_end); + if (ret) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + goto out_unlock; + } + ret = 0; if (offset != PAGE_CACHE_SIZE) { kaddr = kmap(page); @@ -2981,15 +3074,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) u64 last_byte; u64 cur_offset; u64 hole_size; - int err; + int err = 0; if (size <= hole_start) return 0; - err = btrfs_check_metadata_free_space(root); - if (err) - return err; - btrfs_truncate_page(inode->i_mapping, inode->i_size); while (1) { @@ -3024,12 +3113,18 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) cur_offset, &hint_byte, 1); if (err) break; + + err = btrfs_reserve_metadata_space(root, 1); + if (err) + break; + err = btrfs_insert_file_extent(trans, root, inode->i_ino, cur_offset, 0, 0, hole_size, 0, hole_size, 0, 0, 0); btrfs_drop_extent_cache(inode, hole_start, last_byte - 1, 0); + btrfs_unreserve_metadata_space(root, 1); } free_extent_map(em); cur_offset = last_byte; @@ -3990,11 +4085,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; - err = btrfs_check_metadata_free_space(root); + /* + * 2 for inode item and ref + * 2 for dir items + * 1 for xattr if selinux is on + */ + err = btrfs_reserve_metadata_space(root, 5); if (err) - goto fail; + return err; trans = btrfs_start_transaction(root, 1); + if (!trans) + goto fail; btrfs_set_trans_block_group(trans, dir); err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); @@ -4032,6 +4134,7 @@ out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); fail: + btrfs_unreserve_metadata_space(root, 5); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -4052,10 +4155,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, u64 objectid; u64 index = 0; - err = btrfs_check_metadata_free_space(root); + /* + * 2 for inode item and ref + * 2 for dir items + * 1 for xattr if selinux is on + */ + err = btrfs_reserve_metadata_space(root, 5); if (err) - goto fail; + return err; + trans = btrfs_start_transaction(root, 1); + if (!trans) + goto fail; btrfs_set_trans_block_group(trans, dir); err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); @@ -4096,6 +4207,7 @@ out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); fail: + btrfs_unreserve_metadata_space(root, 5); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -4118,10 +4230,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink == 0) return -ENOENT; - btrfs_inc_nlink(inode); - err = btrfs_check_metadata_free_space(root); + /* + * 1 item for inode ref + * 2 items for dir items + */ + err = btrfs_reserve_metadata_space(root, 3); if (err) - goto fail; + return err; + + btrfs_inc_nlink(inode); + err = btrfs_set_inode_index(dir, &index); if (err) goto fail; @@ -4145,6 +4263,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); fail: + btrfs_unreserve_metadata_space(root, 3); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -4164,17 +4283,21 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) u64 index = 0; unsigned long nr = 1; - err = btrfs_check_metadata_free_space(root); + /* + * 2 items for inode and ref + * 2 items for dir items + * 1 for xattr if selinux is on + */ + err = btrfs_reserve_metadata_space(root, 5); if (err) - goto out_unlock; + return err; trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, dir); - - if (IS_ERR(trans)) { - err = PTR_ERR(trans); + if (!trans) { + err = -ENOMEM; goto out_unlock; } + btrfs_set_trans_block_group(trans, dir); err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); if (err) { @@ -4223,6 +4346,7 @@ out_fail: btrfs_end_transaction_throttle(trans, root); out_unlock: + btrfs_unreserve_metadata_space(root, 5); if (drop_on_err) iput(inode); btrfs_btree_balance_dirty(root, nr); @@ -4747,6 +4871,13 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) goto out; } + ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + if (ret) { + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + ret = VM_FAULT_SIGBUS; + goto out; + } + ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ again: lock_page(page); @@ -4778,7 +4909,12 @@ again: goto again; } - btrfs_set_extent_delalloc(inode, page_start, page_end); + ret = btrfs_set_extent_delalloc(inode, page_start, page_end); + if (ret) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + ret = VM_FAULT_SIGBUS; + goto out_unlock; + } ret = 0; /* page is wholly or partially inside EOF */ @@ -4801,6 +4937,7 @@ again: unlock_extent(io_tree, page_start, page_end, GFP_NOFS); out_unlock: + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); if (!ret) return VM_FAULT_LOCKED; unlock_page(page); @@ -4917,6 +5054,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) return NULL; ei->last_trans = 0; ei->logged_trans = 0; + ei->delalloc_extents = 0; + ei->delalloc_reserved_extents = 0; btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->i_orphan); INIT_LIST_HEAD(&ei->ordered_operations); @@ -5070,7 +5209,12 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; - ret = btrfs_check_metadata_free_space(root); + /* + * 2 items for dir items + * 1 item for orphan entry + * 1 item for ref + */ + ret = btrfs_reserve_metadata_space(root, 4); if (ret) return ret; @@ -5185,6 +5329,8 @@ out_fail: if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&root->fs_info->subvol_sem); + + btrfs_unreserve_metadata_space(root, 4); return ret; } @@ -5256,11 +5402,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) return -ENAMETOOLONG; - err = btrfs_check_metadata_free_space(root); + /* + * 2 items for inode item and ref + * 2 items for dir items + * 1 item for xattr if selinux is on + */ + err = btrfs_reserve_metadata_space(root, 5); if (err) - goto out_fail; + return err; trans = btrfs_start_transaction(root, 1); + if (!trans) + goto out_fail; btrfs_set_trans_block_group(trans, dir); err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); @@ -5341,6 +5494,7 @@ out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); out_fail: + btrfs_unreserve_metadata_space(root, 5); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -5362,6 +5516,11 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, while (num_bytes > 0) { alloc_size = min(num_bytes, root->fs_info->max_extent); + + ret = btrfs_reserve_metadata_space(root, 1); + if (ret) + goto out; + ret = btrfs_reserve_extent(trans, root, alloc_size, root->sectorsize, 0, alloc_hint, (u64)-1, &ins, 1); @@ -5381,6 +5540,7 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, num_bytes -= ins.offset; cur_offset += ins.offset; alloc_hint = ins.objectid + ins.offset; + btrfs_unreserve_metadata_space(root, 1); } out: if (cur_offset > start) { @@ -5566,6 +5726,8 @@ static struct extent_io_ops btrfs_extent_io_ops = { .readpage_io_failed_hook = btrfs_io_failed_hook, .set_bit_hook = btrfs_set_bit_hook, .clear_bit_hook = btrfs_clear_bit_hook, + .merge_extent_hook = btrfs_merge_extent_hook, + .split_extent_hook = btrfs_split_extent_hook, }; /* diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a8577a7f26ab..4de7ef6f8603 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -239,7 +239,13 @@ static noinline int create_subvol(struct btrfs_root *root, u64 index = 0; unsigned long nr = 1; - ret = btrfs_check_metadata_free_space(root); + /* + * 1 - inode item + * 2 - refs + * 1 - root item + * 2 - dir items + */ + ret = btrfs_reserve_metadata_space(root, 6); if (ret) return ret; @@ -340,6 +346,9 @@ fail: err = btrfs_commit_transaction(trans, root); if (err && !ret) ret = err; + + btrfs_unreserve_metadata_space(root, 6); + btrfs_btree_balance_dirty(root, nr); return ret; } @@ -355,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, if (!root->ref_cows) return -EINVAL; - ret = btrfs_check_metadata_free_space(root); + /* + * 1 - inode item + * 2 - refs + * 1 - root item + * 2 - dir items + */ + ret = btrfs_reserve_metadata_space(root, 6); if (ret) goto fail_unlock; pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); if (!pending_snapshot) { ret = -ENOMEM; + btrfs_unreserve_metadata_space(root, 6); goto fail_unlock; } pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); if (!pending_snapshot->name) { ret = -ENOMEM; kfree(pending_snapshot); + btrfs_unreserve_metadata_space(root, 6); goto fail_unlock; } memcpy(pending_snapshot->name, name, namelen); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 88f866f85e7a..0b8f36d4400a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -186,6 +186,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, h->alloc_exclude_start = 0; h->delayed_ref_updates = 0; + if (!current->journal_info) + current->journal_info = h; + root->fs_info->running_transaction->use_count++; record_root_in_trans(h, root); mutex_unlock(&root->fs_info->trans_mutex); @@ -317,6 +320,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, wake_up(&cur_trans->writer_wait); put_transaction(cur_trans); mutex_unlock(&info->trans_mutex); + + if (current->journal_info == trans) + current->journal_info = NULL; memset(trans, 0, sizeof(*trans)); kmem_cache_free(btrfs_trans_handle_cachep, trans); @@ -743,6 +749,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, memcpy(&pending->root_key, &key, sizeof(key)); fail: kfree(new_root_item); + btrfs_unreserve_metadata_space(root, 6); return ret; } @@ -1059,6 +1066,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_unlock(&root->fs_info->trans_mutex); + if (current->journal_info == trans) + current->journal_info = NULL; + kmem_cache_free(btrfs_trans_handle_cachep, trans); return ret; } -- cgit v1.2.3 From 1f28fcd925b2b3157411bbd08f0024b55b70d8dd Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 28 Sep 2009 01:46:11 +0900 Subject: nilfs2: fix missing zero-fill initialization of btree node cache This will fix file system corruption which infrequently happens after mount. The problem was reported from users with the title "[NILFS users] Fail to mount NILFS." (Message-ID: <200908211918.34720.yuri@itinteg.net>), and so forth. I've also experienced the corruption multiple times on kernel 2.6.30 and 2.6.31. The problem turned out to be caused due to discordance between mapping->nrpages of a btree node cache and the actual number of pages hung on the cache; if the mapping->nrpages becomes zero even as it has pages, truncate_inode_pages() returns without doing anything. Usually this is harmless except it may cause page leak, but garbage collection fairly infrequently sees a stale page remained in the btree node cache of DAT (i.e. disk address translation file of nilfs), and induces the corruption. I identified a missing initialization in btree node caches was the root cause. This corrects the bug. I've tested this for kernel 2.6.30 and 2.6.31. Reported-by: Yuri Chislov Signed-off-by: Ryusuke Konishi Cc: stable --- fs/nilfs2/btnode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 6a2711f4c321..5941958f1e47 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -36,6 +36,7 @@ void nilfs_btnode_cache_init_once(struct address_space *btnc) { + memset(btnc, 0, sizeof(*btnc)); INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC); spin_lock_init(&btnc->tree_lock); INIT_LIST_HEAD(&btnc->private_list); -- cgit v1.2.3 From 3cc811bffdf35ebaf1467fbec71a49b57800fc74 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 28 Sep 2009 13:02:46 +0900 Subject: nilfs2: fix missing initialization of i_dir_start_lookup member The i_dir_start_lookup field in nilfs_inode_info objects should be cleared when the objects are allocated, but the the initialization was missing in case of reading from disk. This adds the initialization. Since the variable just gives a start page on directory lookups, the bug was nonfatal until now. Signed-off-by: Ryusuke Konishi --- fs/nilfs2/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 2d2c501deb54..5040220c3732 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -400,6 +400,7 @@ int nilfs_read_inode_common(struct inode *inode, ii->i_dir_acl = S_ISREG(inode->i_mode) ? 0 : le32_to_cpu(raw_inode->i_dir_acl); #endif + ii->i_dir_start_lookup = 0; ii->i_cno = 0; inode->i_generation = le32_to_cpu(raw_inode->i_generation); -- cgit v1.2.3 From 830156c79b0a99ddf0f62496bcf4de640f9f52cd Mon Sep 17 00:00:00 2001 From: Frank Mayhar Date: Tue, 29 Sep 2009 10:07:47 -0400 Subject: ext4: Avoid updating the inode table bh twice in no journal mode This is a cleanup of commit 91ac6f4. Since ext4_mark_inode_dirty() has already called ext4_mark_iloc_dirty(), which in turn calls ext4_do_update_inode(), it's not necessary to have ext4_write_inode() call ext4_do_update_inode() in no journal mode. Indeed, it would be duplicated work. Reviewed-by: "Aneesh Kumar K.V" Signed-off-by: Frank Mayhar Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 118e16ca91d7..22fb1a3a045c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4991,8 +4991,7 @@ static int ext4_inode_blocks_set(handle_t *handle, */ static int ext4_do_update_inode(handle_t *handle, struct inode *inode, - struct ext4_iloc *iloc, - int do_sync) + struct ext4_iloc *iloc) { struct ext4_inode *raw_inode = ext4_raw_inode(iloc); struct ext4_inode_info *ei = EXT4_I(inode); @@ -5093,22 +5092,10 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); } - /* - * If we're not using a journal and we were called from - * ext4_write_inode() to sync the inode (making do_sync true), - * we can just use sync_dirty_buffer() directly to do our dirty - * work. Testing s_journal here is a bit redundant but it's - * worth it to avoid potential future trouble. - */ - if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) { - BUFFER_TRACE(bh, "call sync_dirty_buffer"); - sync_dirty_buffer(bh); - } else { - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - rc = ext4_handle_dirty_metadata(handle, inode, bh); - if (!err) - err = rc; - } + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + rc = ext4_handle_dirty_metadata(handle, inode, bh); + if (!err) + err = rc; ei->i_state &= ~EXT4_STATE_NEW; out_brelse: @@ -5176,8 +5163,16 @@ int ext4_write_inode(struct inode *inode, int wait) err = ext4_get_inode_loc(inode, &iloc); if (err) return err; - err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE, - inode, &iloc, wait); + if (wait) + sync_dirty_buffer(iloc.bh); + if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { + ext4_error(inode->i_sb, __func__, + "IO error syncing inode, " + "inode=%lu, block=%llu", + inode->i_ino, + (unsigned long long)iloc.bh->b_blocknr); + err = -EIO; + } } return err; } @@ -5473,7 +5468,7 @@ int ext4_mark_iloc_dirty(handle_t *handle, get_bh(iloc->bh); /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ - err = ext4_do_update_inode(handle, inode, iloc, 0); + err = ext4_do_update_inode(handle, inode, iloc); put_bh(iloc->bh); return err; } -- cgit v1.2.3 From f3dc272fd5e2ae08244796bb39e7e1ce4b25d3b3 Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth Date: Tue, 29 Sep 2009 16:06:01 -0400 Subject: ext4: Make sure ext4_dirty_inode() updates the inode in no journal mode This patch a problem that ext4_dirty_inode() was not calling ext4_mark_inode_dirty() if the current_handle is not valid, which it is the case in no journal mode. It also removes a test for non-matching transaction which can never happen. Signed-off-by: Curt Wohlgemuth Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 22fb1a3a045c..ec367bce7215 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5615,24 +5615,13 @@ void ext4_dirty_inode(struct inode *inode) handle_t *current_handle = ext4_journal_current_handle(); handle_t *handle; - if (!ext4_handle_valid(current_handle)) { - ext4_mark_inode_dirty(current_handle, inode); - return; - } - handle = ext4_journal_start(inode, 2); if (IS_ERR(handle)) goto out; - if (current_handle && - current_handle->h_transaction != handle->h_transaction) { - /* This task has a transaction open against a different fs */ - printk(KERN_EMERG "%s: transactions do not match!\n", - __func__); - } else { - jbd_debug(5, "marking dirty. outer handle=%p\n", - current_handle); - ext4_mark_inode_dirty(handle, inode); - } + + jbd_debug(5, "marking dirty. outer handle=%p\n", current_handle); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); out: return; -- cgit v1.2.3 From d3d1faf6a74496ea4435fd057c6a2cad49f3e523 Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth Date: Tue, 29 Sep 2009 11:01:03 -0400 Subject: ext4: Handle nested ext4_journal_start/stop calls without a journal This patch fixes a problem with handling nested calls to ext4_journal_start/ext4_journal_stop, when there is no journal present. Signed-off-by: Curt Wohlgemuth Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_jbd2.h | 6 ++++-- fs/ext4/namei.c | 3 ++- fs/ext4/super.c | 42 ++++++++++++++++++++++++++++++++---------- 3 files changed, 38 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 139fb8cb87e4..a2865980342f 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -161,11 +161,13 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); int __ext4_journal_stop(const char *where, handle_t *handle); -#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1) +#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) +/* Note: Do not use this for NULL handles. This is only to determine if + * a properly allocated handle is using a journal or not. */ static inline int ext4_handle_valid(handle_t *handle) { - if (handle == EXT4_NOJOURNAL_HANDLE) + if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT) return 0; return 1; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 42f81d285cd5..7c8fe80bacdd 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2076,7 +2076,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) struct ext4_iloc iloc; int err = 0; - if (!ext4_handle_valid(handle)) + /* ext4_handle_valid() assumes a valid handle_t pointer */ + if (handle && !ext4_handle_valid(handle)) return 0; mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f095c60b569e..3f7e7010c098 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -189,6 +189,36 @@ void ext4_itable_unused_set(struct super_block *sb, bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); } + +/* Just increment the non-pointer handle value */ +static handle_t *ext4_get_nojournal(void) +{ + handle_t *handle = current->journal_info; + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); + + ref_cnt++; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; + return handle; +} + + +/* Decrement the non-pointer handle value */ +static void ext4_put_nojournal(handle_t *handle) +{ + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt == 0); + + ref_cnt--; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; +} + /* * Wrappers for jbd2_journal_start/end. * @@ -215,11 +245,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) } return jbd2_journal_start(journal, nblocks); } - /* - * We're not journaling, return the appropriate indication. - */ - current->journal_info = EXT4_NOJOURNAL_HANDLE; - return current->journal_info; + return ext4_get_nojournal(); } /* @@ -235,11 +261,7 @@ int __ext4_journal_stop(const char *where, handle_t *handle) int rc; if (!ext4_handle_valid(handle)) { - /* - * Do this here since we don't call jbd2_journal_stop() in - * no-journal mode. - */ - current->journal_info = NULL; + ext4_put_nojournal(handle); return 0; } sb = handle->h_transaction->t_journal->j_private; -- cgit v1.2.3 From 49cf6f4529b7945ef51b8e39f0bac630726f8c96 Mon Sep 17 00:00:00 2001 From: Chris Ball Date: Tue, 29 Sep 2009 13:51:04 -0400 Subject: Btrfs: Fix setting umask when POSIX ACLs are not enabled We currently set sb->s_flags |= MS_POSIXACL unconditionally, which is incorrect -- it tells the VFS that it shouldn't set umask because we will, yet we don't set it ourselves if we aren't using POSIX ACLs, so the umask ends up ignored. Signed-off-by: Chris Ball Signed-off-by: Chris Mason --- fs/btrfs/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0242c8babae9..e0a64328080c 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -344,7 +344,9 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_export_op = &btrfs_export_ops; sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; +#ifdef CONFIG_BTRFS_POSIX_ACL sb->s_flags |= MS_POSIXACL; +#endif tree_root = open_ctree(sb, fs_devices, (char *)data); -- cgit v1.2.3 From fd2696f399e45347c07f1f7f340e8515cace5657 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Tue, 29 Sep 2009 13:51:04 -0400 Subject: Btrfs: introduce missing kfree Error handling code following a kzalloc should free the allocated data. The semantic match that finds the problem is as follows: (http://www.emn.fr/x-info/coccinelle/) // @r exists@ local idexpression x; statement S; expression E; identifier f,f1,l; position p1,p2; expression *ptr != NULL; @@ x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...); ... if (x == NULL) S <... when != x when != if (...) { <+...x...+> } ( x->f1 = E | (x->f1 == NULL || ...) | f(...,x->f1,...) ) ...> ( return \(0\|<+...x...+>\|ptr\); | return@p2 ...; ) @script:python@ p1 << r.p1; p2 << r.p2; @@ print "* file: %s kmalloc %s return %s" % (p1[0].file,p1[0].line,p2[0].line) // Signed-off-by: Julia Lawall Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index be953afe804c..20cbd2eebd97 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -446,8 +446,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) goto error; device->name = kstrdup(orig_dev->name, GFP_NOFS); - if (!device->name) + if (!device->name) { + kfree(device); goto error; + } device->devid = orig_dev->devid; device->work.func = pending_bios_fn; -- cgit v1.2.3 From 3baf0bed0a5adab95c7599d2f27124c74692ef28 Mon Sep 17 00:00:00 2001 From: Chris Ball Date: Tue, 29 Sep 2009 13:51:05 -0400 Subject: Btrfs: Use CONFIG_BTRFS_POSIX_ACL to enable ACL code We've already defined CONFIG_BTRFS_POSIX_ACL in Kconfig, but we're currently not using it and are testing CONFIG_FS_POSIX_ACL instead. CONFIG_FS_POSIX_ACL states "Never use this symbol for ifdefs". Signed-off-by: Chris Ball Signed-off-by: Chris Mason --- fs/btrfs/acl.c | 6 +++--- fs/btrfs/ctree.h | 2 +- fs/btrfs/xattr.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index f128427b995b..69b355ae7f49 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -27,7 +27,7 @@ #include "btrfs_inode.h" #include "xattr.h" -#ifdef CONFIG_FS_POSIX_ACL +#ifdef CONFIG_BTRFS_POSIX_ACL static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) { @@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = { .set = btrfs_xattr_acl_access_set, }; -#else /* CONFIG_FS_POSIX_ACL */ +#else /* CONFIG_BTRFS_POSIX_ACL */ int btrfs_acl_chmod(struct inode *inode) { @@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) return 0; } -#endif /* CONFIG_FS_POSIX_ACL */ +#endif /* CONFIG_BTRFS_POSIX_ACL */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b3959a150c3b..8184f2feb2f3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2366,7 +2366,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); /* acl.c */ -#ifdef CONFIG_FS_POSIX_ACL +#ifdef CONFIG_BTRFS_POSIX_ACL int btrfs_check_acl(struct inode *inode, int mask); #else #define btrfs_check_acl NULL diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index a9d3bf4d2689..b0fc93f95fd0 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -260,7 +260,7 @@ err: * attributes are handled directly. */ struct xattr_handler *btrfs_xattr_handlers[] = { -#ifdef CONFIG_FS_POSIX_ACL +#ifdef CONFIG_BTRFS_POSIX_ACL &btrfs_xattr_acl_access_handler, &btrfs_xattr_acl_default_handler, #endif -- cgit v1.2.3 From 90576c0b9a0b5323fc4bd7f23f49be0d234f36d1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 29 Sep 2009 15:51:30 -0400 Subject: ext4, jbd2: Drop unneeded printks at mount and unmount time There are a number of kernel printk's which are printed when an ext4 filesystem is mounted and unmounted. Disable them to economize space in the system logs. In addition, disabling the mballoc stats by default saves a number of unneeded atomic operations for every block allocation or deallocation. Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 ++ fs/ext4/mballoc.c | 2 -- fs/ext4/mballoc.h | 2 +- fs/ext4/super.c | 17 ++++------------- fs/jbd2/journal.c | 9 +++------ 5 files changed, 10 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 10a63096a95a..10539e364283 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2380,6 +2380,7 @@ void ext4_ext_init(struct super_block *sb) */ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { +#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) printk(KERN_INFO "EXT4-fs: file extents enabled"); #ifdef AGGRESSIVE_TEST printk(", aggressive tests"); @@ -2391,6 +2392,7 @@ void ext4_ext_init(struct super_block *sb) printk(", stats"); #endif printk("\n"); +#endif #ifdef EXTENTS_STATS spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); EXT4_SB(sb)->s_ext_min = 1 << 30; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c73d43995b13..3e2320e66721 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2712,8 +2712,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) if (sbi->s_journal) sbi->s_journal->j_commit_callback = release_blocks_on_commit; - - printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); return 0; } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 188d3d709b24..14f25f253112 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -84,7 +84,7 @@ extern u8 mb_enable_debug; * with 'ext4_mb_stats' allocator will collect stats that will be * shown at umount. The collecting costs though! */ -#define MB_DEFAULT_STATS 1 +#define MB_DEFAULT_STATS 0 /* * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3f7e7010c098..e5b206a043a5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1673,13 +1673,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, EXT4_INODES_PER_GROUP(sb), sbi->s_mount_opt); - if (EXT4_SB(sb)->s_journal) { - ext4_msg(sb, KERN_INFO, "%s journal on %s", - EXT4_SB(sb)->s_journal->j_inode ? "internal" : - "external", EXT4_SB(sb)->s_journal->j_devname); - } else { - ext4_msg(sb, KERN_INFO, "no journal"); - } return res; } @@ -2885,12 +2878,12 @@ no_journal: "available"); } - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + if (test_opt(sb, DELALLOC) && + (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " "requested data journaling mode"); clear_opt(sbi->s_mount_opt, DELALLOC); - } else if (test_opt(sb, DELALLOC)) - ext4_msg(sb, KERN_INFO, "delayed allocation enabled"); + } err = ext4_setup_system_zone(sb); if (err) { @@ -3202,9 +3195,7 @@ static int ext4_load_journal(struct super_block *sb, return -EINVAL; } - if (journal->j_flags & JBD2_BARRIER) - ext4_msg(sb, KERN_INFO, "barriers enabled"); - else + if (!(journal->j_flags & JBD2_BARRIER)) ext4_msg(sb, KERN_INFO, "barriers disabled"); if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 53b86e16e5fe..977a8dafb76d 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -136,10 +136,6 @@ static int kjournald2(void *arg) journal->j_task = current; wake_up(&journal->j_wait_done_commit); - printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, " - "commit interval %ld seconds\n", current->pid, - journal->j_devname, journal->j_commit_interval / HZ); - /* * And now, wait forever for commit wakeup events. */ @@ -223,7 +219,8 @@ static int jbd2_journal_start_thread(journal_t *journal) { struct task_struct *t; - t = kthread_run(kjournald2, journal, "kjournald2"); + t = kthread_run(kjournald2, journal, "jbd2/%s", + journal->j_devname); if (IS_ERR(t)) return PTR_ERR(t); @@ -1115,7 +1112,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) while ((p = strchr(p, '/'))) *p = '!'; p = journal->j_devname + strlen(journal->j_devname); - sprintf(p, ":%lu", journal->j_inode->i_ino); + sprintf(p, "-%lu", journal->j_inode->i_ino); jbd_debug(1, "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", journal, inode->i_sb->s_id, inode->i_ino, -- cgit v1.2.3 From 1ab86aedbc7845a946b4ba4edf37762629970708 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 29 Sep 2009 18:38:44 -0400 Subject: Btrfs: fix error cases for ioctl transactions Fix leak of vfsmount write reference and open_ioctl_trans reference on ENOMEM. Clean up the error paths while we're at it. Signed-off-by: Sage Weil Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4de7ef6f8603..9a780c8d0ac8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1232,15 +1232,15 @@ static long btrfs_ioctl_trans_start(struct file *file) struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; - int ret = 0; + int ret; + ret = -EPERM; if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + goto out; - if (file->private_data) { - ret = -EINPROGRESS; + ret = -EINPROGRESS; + if (file->private_data) goto out; - } ret = mnt_want_write(file->f_path.mnt); if (ret) @@ -1250,12 +1250,19 @@ static long btrfs_ioctl_trans_start(struct file *file) root->fs_info->open_ioctl_trans++; mutex_unlock(&root->fs_info->trans_mutex); + ret = -ENOMEM; trans = btrfs_start_ioctl_transaction(root, 0); - if (trans) - file->private_data = trans; - else - ret = -ENOMEM; - /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ + if (!trans) + goto out_drop; + + file->private_data = trans; + return 0; + +out_drop: + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans--; + mutex_unlock(&root->fs_info->trans_mutex); + mnt_drop_write(file->f_path.mnt); out: return ret; } @@ -1271,24 +1278,20 @@ long btrfs_ioctl_trans_end(struct file *file) struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; - int ret = 0; trans = file->private_data; - if (!trans) { - ret = -EINVAL; - goto out; - } - btrfs_end_transaction(trans, root); + if (!trans) + return -EINVAL; file->private_data = NULL; + btrfs_end_transaction(trans, root); + mutex_lock(&root->fs_info->trans_mutex); root->fs_info->open_ioctl_trans--; mutex_unlock(&root->fs_info->trans_mutex); mnt_drop_write(file->f_path.mnt); - -out: - return ret; + return 0; } long btrfs_ioctl(struct file *file, unsigned int -- cgit v1.2.3 From dd7e0b7b02ccff73b87032e20fc5b4f2c1cfcc14 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 29 Sep 2009 18:38:44 -0400 Subject: Btrfs: fix deadlock with free space handling and user transactions If an ioctl-initiated transaction is open, we can't force a commit during the free space checks in order to free up pinned extents or else we deadlock. Just ENOSPC instead. A more satisfying solution that reserves space for the entire user transaction up front is forthcoming... Signed-off-by: Sage Weil Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a4b2b03cd682..d119c0388af1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3168,7 +3168,7 @@ alloc: spin_unlock(&data_sinfo->lock); /* commit the current transaction and try again */ - if (!committed) { + if (!committed && !root->fs_info->open_ioctl_trans) { committed = 1; trans = btrfs_join_transaction(root, 1); if (!trans) -- cgit v1.2.3 From 296c355cd6443d89fa251885a8d78778fe111dc4 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 30 Sep 2009 00:32:42 -0400 Subject: ext4: Use tracepoints for mb_history trace file The /proc/fs/ext4//mb_history was maintained manually, and had a number of problems: it required a largish amount of memory to be allocated for each ext4 filesystem, and the s_mb_history_lock introduced a CPU contention problem. By ripping out the mb_history code and replacing it with ftrace tracepoints, and we get more functionality: timestamps, event filtering, the ability to correlate mballoc history with other ext4 tracepoints, etc. Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/proc.txt | 1 - fs/ext4/ext4.h | 14 +- fs/ext4/mballoc.c | 301 ++----------------------------------- fs/ext4/mballoc.h | 33 ---- fs/ext4/super.c | 18 +-- include/trace/events/ext4.h | 163 ++++++++++++++++++++ 6 files changed, 182 insertions(+), 348 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index b5aee7838a00..2c48f945546b 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1113,7 +1113,6 @@ Table 1-12: Files in /proc/fs/ext4/ .............................................................................. File Content mb_groups details of multiblock allocator buddy cache of free blocks - mb_history multiblock allocation history .............................................................................. diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b491576e11c3..c508cf7be75c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -65,6 +65,12 @@ typedef __u32 ext4_lblk_t; /* data type for block group number */ typedef unsigned int ext4_group_t; +/* + * Flags used in mballoc's allocation_context flags field. + * + * Also used to show what's going on for debugging purposes when the + * flag field is exported via the traceport interface + */ /* prefer goal again. length */ #define EXT4_MB_HINT_MERGE 0x0001 @@ -971,14 +977,6 @@ struct ext4_sb_info { unsigned long s_mb_last_group; unsigned long s_mb_last_start; - /* history to debug policy */ - struct ext4_mb_history *s_mb_history; - int s_mb_history_cur; - int s_mb_history_max; - int s_mb_history_num; - spinlock_t s_mb_history_lock; - int s_mb_history_filter; - /* stats for buddy allocator */ spinlock_t s_mb_pa_lock; atomic_t s_bal_reqs; /* number of reqs with len > 1 */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 3e2320e66721..bba12824defa 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2096,207 +2096,6 @@ out: return err; } -#ifdef EXT4_MB_HISTORY -struct ext4_mb_proc_session { - struct ext4_mb_history *history; - struct super_block *sb; - int start; - int max; -}; - -static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s, - struct ext4_mb_history *hs, - int first) -{ - if (hs == s->history + s->max) - hs = s->history; - if (!first && hs == s->history + s->start) - return NULL; - while (hs->orig.fe_len == 0) { - hs++; - if (hs == s->history + s->max) - hs = s->history; - if (hs == s->history + s->start) - return NULL; - } - return hs; -} - -static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos) -{ - struct ext4_mb_proc_session *s = seq->private; - struct ext4_mb_history *hs; - int l = *pos; - - if (l == 0) - return SEQ_START_TOKEN; - hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1); - if (!hs) - return NULL; - while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL); - return hs; -} - -static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v, - loff_t *pos) -{ - struct ext4_mb_proc_session *s = seq->private; - struct ext4_mb_history *hs = v; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ext4_mb_history_skip_empty(s, s->history + s->start, 1); - else - return ext4_mb_history_skip_empty(s, ++hs, 0); -} - -static int ext4_mb_seq_history_show(struct seq_file *seq, void *v) -{ - char buf[25], buf2[25], buf3[25], *fmt; - struct ext4_mb_history *hs = v; - - if (v == SEQ_START_TOKEN) { - seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s " - "%-5s %-2s %-6s %-5s %-5s %-6s\n", - "pid", "inode", "original", "goal", "result", "found", - "grps", "cr", "flags", "merge", "tail", "broken"); - return 0; - } - - if (hs->op == EXT4_MB_HISTORY_ALLOC) { - fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " - "0x%04x %-5s %-5u %-6u\n"; - sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, - hs->result.fe_start, hs->result.fe_len, - hs->result.fe_logical); - sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group, - hs->orig.fe_start, hs->orig.fe_len, - hs->orig.fe_logical); - sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group, - hs->goal.fe_start, hs->goal.fe_len, - hs->goal.fe_logical); - seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2, - hs->found, hs->groups, hs->cr, hs->flags, - hs->merged ? "M" : "", hs->tail, - hs->buddy ? 1 << hs->buddy : 0); - } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) { - fmt = "%-5u %-8u %-23s %-23s %-23s\n"; - sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, - hs->result.fe_start, hs->result.fe_len, - hs->result.fe_logical); - sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group, - hs->orig.fe_start, hs->orig.fe_len, - hs->orig.fe_logical); - seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2); - } else if (hs->op == EXT4_MB_HISTORY_DISCARD) { - sprintf(buf2, "%u/%d/%u", hs->result.fe_group, - hs->result.fe_start, hs->result.fe_len); - seq_printf(seq, "%-5u %-8u %-23s discard\n", - hs->pid, hs->ino, buf2); - } else if (hs->op == EXT4_MB_HISTORY_FREE) { - sprintf(buf2, "%u/%d/%u", hs->result.fe_group, - hs->result.fe_start, hs->result.fe_len); - seq_printf(seq, "%-5u %-8u %-23s free\n", - hs->pid, hs->ino, buf2); - } - return 0; -} - -static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v) -{ -} - -static const struct seq_operations ext4_mb_seq_history_ops = { - .start = ext4_mb_seq_history_start, - .next = ext4_mb_seq_history_next, - .stop = ext4_mb_seq_history_stop, - .show = ext4_mb_seq_history_show, -}; - -static int ext4_mb_seq_history_open(struct inode *inode, struct file *file) -{ - struct super_block *sb = PDE(inode)->data; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_mb_proc_session *s; - int rc; - int size; - - if (unlikely(sbi->s_mb_history == NULL)) - return -ENOMEM; - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (s == NULL) - return -ENOMEM; - s->sb = sb; - size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max; - s->history = kmalloc(size, GFP_KERNEL); - if (s->history == NULL) { - kfree(s); - return -ENOMEM; - } - - spin_lock(&sbi->s_mb_history_lock); - memcpy(s->history, sbi->s_mb_history, size); - s->max = sbi->s_mb_history_max; - s->start = sbi->s_mb_history_cur % s->max; - spin_unlock(&sbi->s_mb_history_lock); - - rc = seq_open(file, &ext4_mb_seq_history_ops); - if (rc == 0) { - struct seq_file *m = (struct seq_file *)file->private_data; - m->private = s; - } else { - kfree(s->history); - kfree(s); - } - return rc; - -} - -static int ext4_mb_seq_history_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = (struct seq_file *)file->private_data; - struct ext4_mb_proc_session *s = seq->private; - kfree(s->history); - kfree(s); - return seq_release(inode, file); -} - -static ssize_t ext4_mb_seq_history_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *ppos) -{ - struct seq_file *seq = (struct seq_file *)file->private_data; - struct ext4_mb_proc_session *s = seq->private; - struct super_block *sb = s->sb; - char str[32]; - int value; - - if (count >= sizeof(str)) { - printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n", - "mb_history", (int)sizeof(str)); - return -EOVERFLOW; - } - - if (copy_from_user(str, buffer, count)) - return -EFAULT; - - value = simple_strtol(str, NULL, 0); - if (value < 0) - return -ERANGE; - EXT4_SB(sb)->s_mb_history_filter = value; - - return count; -} - -static const struct file_operations ext4_mb_seq_history_fops = { - .owner = THIS_MODULE, - .open = ext4_mb_seq_history_open, - .read = seq_read, - .write = ext4_mb_seq_history_write, - .llseek = seq_lseek, - .release = ext4_mb_seq_history_release, -}; - static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) { struct super_block *sb = seq->private; @@ -2396,82 +2195,6 @@ static const struct file_operations ext4_mb_seq_groups_fops = { .release = seq_release, }; -static void ext4_mb_history_release(struct super_block *sb) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (sbi->s_proc != NULL) { - remove_proc_entry("mb_groups", sbi->s_proc); - if (sbi->s_mb_history_max) - remove_proc_entry("mb_history", sbi->s_proc); - } - kfree(sbi->s_mb_history); -} - -static void ext4_mb_history_init(struct super_block *sb) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - int i; - - if (sbi->s_proc != NULL) { - if (sbi->s_mb_history_max) - proc_create_data("mb_history", S_IRUGO, sbi->s_proc, - &ext4_mb_seq_history_fops, sb); - proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, - &ext4_mb_seq_groups_fops, sb); - } - - sbi->s_mb_history_cur = 0; - spin_lock_init(&sbi->s_mb_history_lock); - i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); - sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL; - /* if we can't allocate history, then we simple won't use it */ -} - -static noinline_for_stack void -ext4_mb_store_history(struct ext4_allocation_context *ac) -{ - struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_mb_history h; - - if (sbi->s_mb_history == NULL) - return; - - if (!(ac->ac_op & sbi->s_mb_history_filter)) - return; - - h.op = ac->ac_op; - h.pid = current->pid; - h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0; - h.orig = ac->ac_o_ex; - h.result = ac->ac_b_ex; - h.flags = ac->ac_flags; - h.found = ac->ac_found; - h.groups = ac->ac_groups_scanned; - h.cr = ac->ac_criteria; - h.tail = ac->ac_tail; - h.buddy = ac->ac_buddy; - h.merged = 0; - if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) { - if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && - ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) - h.merged = 1; - h.goal = ac->ac_g_ex; - h.result = ac->ac_f_ex; - } - - spin_lock(&sbi->s_mb_history_lock); - memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); - if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) - sbi->s_mb_history_cur = 0; - spin_unlock(&sbi->s_mb_history_lock); -} - -#else -#define ext4_mb_history_release(sb) -#define ext4_mb_history_init(sb) -#endif - /* Create and initialize ext4_group_info data for the given group. */ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, @@ -2690,7 +2413,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; - sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); @@ -2708,7 +2430,9 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) spin_lock_init(&lg->lg_prealloc_lock); } - ext4_mb_history_init(sb); + if (sbi->s_proc) + proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_fops, sb); if (sbi->s_journal) sbi->s_journal->j_commit_callback = release_blocks_on_commit; @@ -2788,7 +2512,8 @@ int ext4_mb_release(struct super_block *sb) } free_percpu(sbi->s_locality_groups); - ext4_mb_history_release(sb); + if (sbi->s_proc) + remove_proc_entry("mb_groups", sbi->s_proc); return 0; } @@ -3274,7 +2999,10 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) atomic_inc(&sbi->s_bal_breaks); } - ext4_mb_store_history(ac); + if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) + trace_ext4_mballoc_alloc(ac); + else + trace_ext4_mballoc_prealloc(ac); } /* @@ -3774,7 +3502,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, if (ac) { ac->ac_sb = sb; ac->ac_inode = pa->pa_inode; - ac->ac_op = EXT4_MB_HISTORY_DISCARD; } while (bit < end) { @@ -3794,7 +3521,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, ac->ac_b_ex.fe_start = bit; ac->ac_b_ex.fe_len = next - bit; ac->ac_b_ex.fe_logical = 0; - ext4_mb_store_history(ac); + trace_ext4_mballoc_discard(ac); } trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit, @@ -3829,9 +3556,6 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, ext4_group_t group; ext4_grpblk_t bit; - if (ac) - ac->ac_op = EXT4_MB_HISTORY_DISCARD; - trace_ext4_mb_release_group_pa(ac, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); @@ -3846,7 +3570,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, ac->ac_b_ex.fe_start = bit; ac->ac_b_ex.fe_len = pa->pa_len; ac->ac_b_ex.fe_logical = 0; - ext4_mb_store_history(ac); + trace_ext4_mballoc_discard(ac); } return 0; @@ -4737,7 +4461,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); if (ac) { - ac->ac_op = EXT4_MB_HISTORY_FREE; ac->ac_inode = inode; ac->ac_sb = sb; } @@ -4804,7 +4527,7 @@ do_more: ac->ac_b_ex.fe_group = block_group; ac->ac_b_ex.fe_start = bit; ac->ac_b_ex.fe_len = count; - ext4_mb_store_history(ac); + trace_ext4_mballoc_free(ac); } err = ext4_mb_load_buddy(sb, block_group, &e4b); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 14f25f253112..0ca811061bc7 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -52,18 +52,8 @@ extern u8 mb_enable_debug; #define mb_debug(n, fmt, a...) #endif -/* - * with EXT4_MB_HISTORY mballoc stores last N allocations in memory - * and you can monitor it in /proc/fs/ext4//mb_history - */ -#define EXT4_MB_HISTORY #define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ #define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ -#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */ -#define EXT4_MB_HISTORY_FREE 8 /* free */ - -#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \ - EXT4_MB_HISTORY_PREALLOC) /* * How long mballoc can look for a best extent (in found extents) @@ -217,22 +207,6 @@ struct ext4_allocation_context { #define AC_STATUS_FOUND 2 #define AC_STATUS_BREAK 3 -struct ext4_mb_history { - struct ext4_free_extent orig; /* orig allocation */ - struct ext4_free_extent goal; /* goal allocation */ - struct ext4_free_extent result; /* result allocation */ - unsigned pid; - unsigned ino; - __u16 found; /* how many extents have been found */ - __u16 groups; /* how many groups have been scanned */ - __u16 tail; /* what tail broke some buddy */ - __u16 buddy; /* buddy the tail ^^^ broke */ - __u16 flags; - __u8 cr:3; /* which phase the result extent was found at */ - __u8 op:4; - __u8 merged:1; -}; - struct ext4_buddy { struct page *bd_buddy_page; void *bd_buddy; @@ -247,13 +221,6 @@ struct ext4_buddy { #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) -#ifndef EXT4_MB_HISTORY -static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) -{ - return; -} -#endif - #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e5b206a043a5..12e726a7073f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -50,13 +50,6 @@ #define CREATE_TRACE_POINTS #include -static int default_mb_history_length = 1000; - -module_param_named(default_mb_history_length, default_mb_history_length, - int, 0644); -MODULE_PARM_DESC(default_mb_history_length, - "Default number of entries saved for mb_history"); - struct proc_dir_entry *ext4_proc_root; static struct kset *ext4_kset; @@ -1079,7 +1072,7 @@ enum { Opt_journal_update, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length, + Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, @@ -1126,7 +1119,6 @@ static const match_table_t tokens = { {Opt_data_writeback, "data=writeback"}, {Opt_data_err_abort, "data_err=abort"}, {Opt_data_err_ignore, "data_err=ignore"}, - {Opt_mb_history_length, "mb_history_length=%u"}, {Opt_offusrjquota, "usrjquota="}, {Opt_usrjquota, "usrjquota=%s"}, {Opt_offgrpjquota, "grpjquota="}, @@ -1367,13 +1359,6 @@ static int parse_options(char *options, struct super_block *sb, case Opt_data_err_ignore: clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); break; - case Opt_mb_history_length: - if (match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - sbi->s_mb_history_max = option; - break; #ifdef CONFIG_QUOTA case Opt_usrjquota: qtype = USRQUOTA; @@ -2435,7 +2420,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; - sbi->s_mb_history_max = default_mb_history_length; set_opt(sbi->s_mount_opt, BARRIER); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 7c6bbb7198a3..b8320256dc5d 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -743,6 +743,169 @@ TRACE_EVENT(ext4_alloc_da_blocks, __entry->data_blocks, __entry->meta_blocks) ); +TRACE_EVENT(ext4_mballoc_alloc, + TP_PROTO(struct ext4_allocation_context *ac), + + TP_ARGS(ac), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( __u16, found ) + __field( __u16, groups ) + __field( __u16, buddy ) + __field( __u16, flags ) + __field( __u16, tail ) + __field( __u8, cr ) + __field( __u32, orig_logical ) + __field( int, orig_start ) + __field( __u32, orig_group ) + __field( int, orig_len ) + __field( __u32, goal_logical ) + __field( int, goal_start ) + __field( __u32, goal_group ) + __field( int, goal_len ) + __field( __u32, result_logical ) + __field( int, result_start ) + __field( __u32, result_group ) + __field( int, result_len ) + ), + + TP_fast_assign( + __entry->dev = ac->ac_inode->i_sb->s_dev; + __entry->ino = ac->ac_inode->i_ino; + __entry->found = ac->ac_found; + __entry->flags = ac->ac_flags; + __entry->groups = ac->ac_groups_scanned; + __entry->buddy = ac->ac_buddy; + __entry->tail = ac->ac_tail; + __entry->cr = ac->ac_criteria; + __entry->orig_logical = ac->ac_o_ex.fe_logical; + __entry->orig_start = ac->ac_o_ex.fe_start; + __entry->orig_group = ac->ac_o_ex.fe_group; + __entry->orig_len = ac->ac_o_ex.fe_len; + __entry->goal_logical = ac->ac_g_ex.fe_logical; + __entry->goal_start = ac->ac_g_ex.fe_start; + __entry->goal_group = ac->ac_g_ex.fe_group; + __entry->goal_len = ac->ac_g_ex.fe_len; + __entry->result_logical = ac->ac_f_ex.fe_logical; + __entry->result_start = ac->ac_f_ex.fe_start; + __entry->result_group = ac->ac_f_ex.fe_group; + __entry->result_len = ac->ac_f_ex.fe_len; + ), + + TP_printk("dev %s inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u " + "result %u/%d/%u@%u blks %u grps %u cr %u flags 0x%04x " + "tail %u broken %u", + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->orig_group, __entry->orig_start, + __entry->orig_len, __entry->orig_logical, + __entry->goal_group, __entry->goal_start, + __entry->goal_len, __entry->goal_logical, + __entry->result_group, __entry->result_start, + __entry->result_len, __entry->result_logical, + __entry->found, __entry->groups, __entry->cr, + __entry->flags, __entry->tail, + __entry->buddy ? 1 << __entry->buddy : 0) +); + +TRACE_EVENT(ext4_mballoc_prealloc, + TP_PROTO(struct ext4_allocation_context *ac), + + TP_ARGS(ac), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( __u32, orig_logical ) + __field( int, orig_start ) + __field( __u32, orig_group ) + __field( int, orig_len ) + __field( __u32, result_logical ) + __field( int, result_start ) + __field( __u32, result_group ) + __field( int, result_len ) + ), + + TP_fast_assign( + __entry->dev = ac->ac_inode->i_sb->s_dev; + __entry->ino = ac->ac_inode->i_ino; + __entry->orig_logical = ac->ac_o_ex.fe_logical; + __entry->orig_start = ac->ac_o_ex.fe_start; + __entry->orig_group = ac->ac_o_ex.fe_group; + __entry->orig_len = ac->ac_o_ex.fe_len; + __entry->result_logical = ac->ac_b_ex.fe_logical; + __entry->result_start = ac->ac_b_ex.fe_start; + __entry->result_group = ac->ac_b_ex.fe_group; + __entry->result_len = ac->ac_b_ex.fe_len; + ), + + TP_printk("dev %s inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u", + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->orig_group, __entry->orig_start, + __entry->orig_len, __entry->orig_logical, + __entry->result_group, __entry->result_start, + __entry->result_len, __entry->result_logical) +); + +TRACE_EVENT(ext4_mballoc_discard, + TP_PROTO(struct ext4_allocation_context *ac), + + TP_ARGS(ac), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( __u32, result_logical ) + __field( int, result_start ) + __field( __u32, result_group ) + __field( int, result_len ) + ), + + TP_fast_assign( + __entry->dev = ac->ac_inode->i_sb->s_dev; + __entry->ino = ac->ac_inode->i_ino; + __entry->result_logical = ac->ac_b_ex.fe_logical; + __entry->result_start = ac->ac_b_ex.fe_start; + __entry->result_group = ac->ac_b_ex.fe_group; + __entry->result_len = ac->ac_b_ex.fe_len; + ), + + TP_printk("dev %s inode %lu extent %u/%d/%u@%u ", + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->result_group, __entry->result_start, + __entry->result_len, __entry->result_logical) +); + +TRACE_EVENT(ext4_mballoc_free, + TP_PROTO(struct ext4_allocation_context *ac), + + TP_ARGS(ac), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( __u32, result_logical ) + __field( int, result_start ) + __field( __u32, result_group ) + __field( int, result_len ) + ), + + TP_fast_assign( + __entry->dev = ac->ac_inode->i_sb->s_dev; + __entry->ino = ac->ac_inode->i_ino; + __entry->result_logical = ac->ac_b_ex.fe_logical; + __entry->result_start = ac->ac_b_ex.fe_start; + __entry->result_group = ac->ac_b_ex.fe_group; + __entry->result_len = ac->ac_b_ex.fe_len; + ), + + TP_printk("dev %s inode %lu extent %u/%d/%u@%u ", + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->result_group, __entry->result_start, + __entry->result_len, __entry->result_logical) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ -- cgit v1.2.3 From bf6993276f74d46776f35c45ddef29b981b1d1c6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 30 Sep 2009 00:32:06 -0400 Subject: jbd2: Use tracepoints for history file The /proc/fs/jbd2//history was maintained manually; by using tracepoints, we can get all of the existing functionality of the /proc file plus extra capabilities thanks to the ftrace infrastructure. We save memory as a bonus. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/checkpoint.c | 7 ++ fs/jbd2/commit.c | 59 +++++++------- fs/jbd2/journal.c | 187 +++----------------------------------------- include/linux/jbd2.h | 27 ++----- include/trace/events/jbd2.h | 78 ++++++++++++++++++ 5 files changed, 130 insertions(+), 228 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 5d70b3e6d49b..ca0f5eb62b20 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -643,6 +643,7 @@ out: int __jbd2_journal_remove_checkpoint(struct journal_head *jh) { + struct transaction_chp_stats_s *stats; transaction_t *transaction; journal_t *journal; int ret = 0; @@ -679,6 +680,12 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) /* OK, that was the last buffer for the transaction: we can now safely remove this transaction from the log */ + stats = &transaction->t_chp_stats; + if (stats->cs_chp_time) + stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time, + jiffies); + trace_jbd2_checkpoint_stats(journal->j_fs_dev->bd_dev, + transaction->t_tid, stats); __jbd2_journal_drop_transaction(journal, transaction); kfree(transaction); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 26d991ddc1e6..d4cfd6d2779e 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -410,10 +410,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) if (commit_transaction->t_synchronous_commit) write_op = WRITE_SYNC_PLUG; trace_jbd2_commit_locking(journal, commit_transaction); - stats.u.run.rs_wait = commit_transaction->t_max_wait; - stats.u.run.rs_locked = jiffies; - stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, - stats.u.run.rs_locked); + stats.run.rs_wait = commit_transaction->t_max_wait; + stats.run.rs_locked = jiffies; + stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start, + stats.run.rs_locked); spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { @@ -486,9 +486,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd2_journal_switch_revoke_table(journal); trace_jbd2_commit_flushing(journal, commit_transaction); - stats.u.run.rs_flushing = jiffies; - stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked, - stats.u.run.rs_flushing); + stats.run.rs_flushing = jiffies; + stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, + stats.run.rs_flushing); commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; @@ -523,11 +523,11 @@ void jbd2_journal_commit_transaction(journal_t *journal) spin_unlock(&journal->j_state_lock); trace_jbd2_commit_logging(journal, commit_transaction); - stats.u.run.rs_logging = jiffies; - stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing, - stats.u.run.rs_logging); - stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits; - stats.u.run.rs_blocks_logged = 0; + stats.run.rs_logging = jiffies; + stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, + stats.run.rs_logging); + stats.run.rs_blocks = commit_transaction->t_outstanding_credits; + stats.run.rs_blocks_logged = 0; J_ASSERT(commit_transaction->t_nr_buffers <= commit_transaction->t_outstanding_credits); @@ -695,7 +695,7 @@ start_journal_io: submit_bh(write_op, bh); } cond_resched(); - stats.u.run.rs_blocks_logged += bufs; + stats.run.rs_blocks_logged += bufs; /* Force a new descriptor to be generated next time round the loop. */ @@ -988,33 +988,30 @@ restart_loop: J_ASSERT(commit_transaction->t_state == T_COMMIT); commit_transaction->t_start = jiffies; - stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging, - commit_transaction->t_start); + stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging, + commit_transaction->t_start); /* - * File the transaction for history + * File the transaction statistics */ - stats.ts_type = JBD2_STATS_RUN; stats.ts_tid = commit_transaction->t_tid; - stats.u.run.rs_handle_count = commit_transaction->t_handle_count; - spin_lock(&journal->j_history_lock); - memcpy(journal->j_history + journal->j_history_cur, &stats, - sizeof(stats)); - if (++journal->j_history_cur == journal->j_history_max) - journal->j_history_cur = 0; + stats.run.rs_handle_count = commit_transaction->t_handle_count; + trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, + commit_transaction->t_tid, &stats.run); /* * Calculate overall stats */ + spin_lock(&journal->j_history_lock); journal->j_stats.ts_tid++; - journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait; - journal->j_stats.u.run.rs_running += stats.u.run.rs_running; - journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked; - journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing; - journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging; - journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count; - journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks; - journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged; + journal->j_stats.run.rs_wait += stats.run.rs_wait; + journal->j_stats.run.rs_running += stats.run.rs_running; + journal->j_stats.run.rs_locked += stats.run.rs_locked; + journal->j_stats.run.rs_flushing += stats.run.rs_flushing; + journal->j_stats.run.rs_logging += stats.run.rs_logging; + journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; + journal->j_stats.run.rs_blocks += stats.run.rs_blocks; + journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; spin_unlock(&journal->j_history_lock); commit_transaction->t_state = T_FINISHED; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 977a8dafb76d..761af77491f5 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -676,153 +676,6 @@ struct jbd2_stats_proc_session { int max; }; -static void *jbd2_history_skip_empty(struct jbd2_stats_proc_session *s, - struct transaction_stats_s *ts, - int first) -{ - if (ts == s->stats + s->max) - ts = s->stats; - if (!first && ts == s->stats + s->start) - return NULL; - while (ts->ts_type == 0) { - ts++; - if (ts == s->stats + s->max) - ts = s->stats; - if (ts == s->stats + s->start) - return NULL; - } - return ts; - -} - -static void *jbd2_seq_history_start(struct seq_file *seq, loff_t *pos) -{ - struct jbd2_stats_proc_session *s = seq->private; - struct transaction_stats_s *ts; - int l = *pos; - - if (l == 0) - return SEQ_START_TOKEN; - ts = jbd2_history_skip_empty(s, s->stats + s->start, 1); - if (!ts) - return NULL; - l--; - while (l) { - ts = jbd2_history_skip_empty(s, ++ts, 0); - if (!ts) - break; - l--; - } - return ts; -} - -static void *jbd2_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct jbd2_stats_proc_session *s = seq->private; - struct transaction_stats_s *ts = v; - - ++*pos; - if (v == SEQ_START_TOKEN) - return jbd2_history_skip_empty(s, s->stats + s->start, 1); - else - return jbd2_history_skip_empty(s, ++ts, 0); -} - -static int jbd2_seq_history_show(struct seq_file *seq, void *v) -{ - struct transaction_stats_s *ts = v; - if (v == SEQ_START_TOKEN) { - seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s " - "%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid", - "wait", "run", "lock", "flush", "log", "hndls", - "block", "inlog", "ctime", "write", "drop", - "close"); - return 0; - } - if (ts->ts_type == JBD2_STATS_RUN) - seq_printf(seq, "%-4s %-5lu %-5u %-5u %-5u %-5u %-5u " - "%-6lu %-5lu %-5lu\n", "R", ts->ts_tid, - jiffies_to_msecs(ts->u.run.rs_wait), - jiffies_to_msecs(ts->u.run.rs_running), - jiffies_to_msecs(ts->u.run.rs_locked), - jiffies_to_msecs(ts->u.run.rs_flushing), - jiffies_to_msecs(ts->u.run.rs_logging), - ts->u.run.rs_handle_count, - ts->u.run.rs_blocks, - ts->u.run.rs_blocks_logged); - else if (ts->ts_type == JBD2_STATS_CHECKPOINT) - seq_printf(seq, "%-4s %-5lu %48s %-5u %-5lu %-5lu %-5lu\n", - "C", ts->ts_tid, " ", - jiffies_to_msecs(ts->u.chp.cs_chp_time), - ts->u.chp.cs_written, ts->u.chp.cs_dropped, - ts->u.chp.cs_forced_to_close); - else - J_ASSERT(0); - return 0; -} - -static void jbd2_seq_history_stop(struct seq_file *seq, void *v) -{ -} - -static const struct seq_operations jbd2_seq_history_ops = { - .start = jbd2_seq_history_start, - .next = jbd2_seq_history_next, - .stop = jbd2_seq_history_stop, - .show = jbd2_seq_history_show, -}; - -static int jbd2_seq_history_open(struct inode *inode, struct file *file) -{ - journal_t *journal = PDE(inode)->data; - struct jbd2_stats_proc_session *s; - int rc, size; - - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (s == NULL) - return -ENOMEM; - size = sizeof(struct transaction_stats_s) * journal->j_history_max; - s->stats = kmalloc(size, GFP_KERNEL); - if (s->stats == NULL) { - kfree(s); - return -ENOMEM; - } - spin_lock(&journal->j_history_lock); - memcpy(s->stats, journal->j_history, size); - s->max = journal->j_history_max; - s->start = journal->j_history_cur % s->max; - spin_unlock(&journal->j_history_lock); - - rc = seq_open(file, &jbd2_seq_history_ops); - if (rc == 0) { - struct seq_file *m = file->private_data; - m->private = s; - } else { - kfree(s->stats); - kfree(s); - } - return rc; - -} - -static int jbd2_seq_history_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = file->private_data; - struct jbd2_stats_proc_session *s = seq->private; - - kfree(s->stats); - kfree(s); - return seq_release(inode, file); -} - -static struct file_operations jbd2_seq_history_fops = { - .owner = THIS_MODULE, - .open = jbd2_seq_history_open, - .read = seq_read, - .llseek = seq_lseek, - .release = jbd2_seq_history_release, -}; - static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos) { return *pos ? NULL : SEQ_START_TOKEN; @@ -839,29 +692,29 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v) if (v != SEQ_START_TOKEN) return 0; - seq_printf(seq, "%lu transaction, each upto %u blocks\n", + seq_printf(seq, "%lu transaction, each up to %u blocks\n", s->stats->ts_tid, s->journal->j_max_transaction_buffers); if (s->stats->ts_tid == 0) return 0; seq_printf(seq, "average: \n %ums waiting for transaction\n", - jiffies_to_msecs(s->stats->u.run.rs_wait / s->stats->ts_tid)); + jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid)); seq_printf(seq, " %ums running transaction\n", - jiffies_to_msecs(s->stats->u.run.rs_running / s->stats->ts_tid)); + jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid)); seq_printf(seq, " %ums transaction was being locked\n", - jiffies_to_msecs(s->stats->u.run.rs_locked / s->stats->ts_tid)); + jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid)); seq_printf(seq, " %ums flushing data (in ordered mode)\n", - jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); + jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid)); seq_printf(seq, " %ums logging transaction\n", - jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); + jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid)); seq_printf(seq, " %lluus average transaction commit time\n", div_u64(s->journal->j_average_commit_time, 1000)); seq_printf(seq, " %lu handles per transaction\n", - s->stats->u.run.rs_handle_count / s->stats->ts_tid); + s->stats->run.rs_handle_count / s->stats->ts_tid); seq_printf(seq, " %lu blocks per transaction\n", - s->stats->u.run.rs_blocks / s->stats->ts_tid); + s->stats->run.rs_blocks / s->stats->ts_tid); seq_printf(seq, " %lu logged blocks per transaction\n", - s->stats->u.run.rs_blocks_logged / s->stats->ts_tid); + s->stats->run.rs_blocks_logged / s->stats->ts_tid); return 0; } @@ -931,8 +784,6 @@ static void jbd2_stats_proc_init(journal_t *journal) { journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats); if (journal->j_proc_entry) { - proc_create_data("history", S_IRUGO, journal->j_proc_entry, - &jbd2_seq_history_fops, journal); proc_create_data("info", S_IRUGO, journal->j_proc_entry, &jbd2_seq_info_fops, journal); } @@ -941,27 +792,9 @@ static void jbd2_stats_proc_init(journal_t *journal) static void jbd2_stats_proc_exit(journal_t *journal) { remove_proc_entry("info", journal->j_proc_entry); - remove_proc_entry("history", journal->j_proc_entry); remove_proc_entry(journal->j_devname, proc_jbd2_stats); } -static void journal_init_stats(journal_t *journal) -{ - int size; - - if (!proc_jbd2_stats) - return; - - journal->j_history_max = 100; - size = sizeof(struct transaction_stats_s) * journal->j_history_max; - journal->j_history = kzalloc(size, GFP_KERNEL); - if (!journal->j_history) { - journal->j_history_max = 0; - return; - } - spin_lock_init(&journal->j_history_lock); -} - /* * Management for journal control blocks: functions to create and * destroy journal_t structures, and to initialise and read existing @@ -1006,7 +839,7 @@ static journal_t * journal_init_common (void) goto fail; } - journal_init_stats(journal); + spin_lock_init(&journal->j_history_lock); return journal; fail: diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 52695d3dfd0b..f1011f7f3d41 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -464,9 +464,9 @@ struct handle_s */ struct transaction_chp_stats_s { unsigned long cs_chp_time; - unsigned long cs_forced_to_close; - unsigned long cs_written; - unsigned long cs_dropped; + __u32 cs_forced_to_close; + __u32 cs_written; + __u32 cs_dropped; }; /* The transaction_t type is the guts of the journaling mechanism. It @@ -668,23 +668,16 @@ struct transaction_run_stats_s { unsigned long rs_flushing; unsigned long rs_logging; - unsigned long rs_handle_count; - unsigned long rs_blocks; - unsigned long rs_blocks_logged; + __u32 rs_handle_count; + __u32 rs_blocks; + __u32 rs_blocks_logged; }; struct transaction_stats_s { - int ts_type; unsigned long ts_tid; - union { - struct transaction_run_stats_s run; - struct transaction_chp_stats_s chp; - } u; + struct transaction_run_stats_s run; }; -#define JBD2_STATS_RUN 1 -#define JBD2_STATS_CHECKPOINT 2 - static inline unsigned long jbd2_time_diff(unsigned long start, unsigned long end) { @@ -988,12 +981,6 @@ struct journal_s /* * Journal statistics */ - struct transaction_stats_s *j_history; - int j_history_max; - int j_history_cur; - /* - * Protect the transactions statistics history - */ spinlock_t j_history_lock; struct proc_dir_entry *j_proc_entry; struct transaction_stats_s j_stats; diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index b851f0b4701c..3c60b75adb9e 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -7,6 +7,9 @@ #include #include +struct transaction_chp_stats_s; +struct transaction_run_stats_s; + TRACE_EVENT(jbd2_checkpoint, TP_PROTO(journal_t *journal, int result), @@ -162,6 +165,81 @@ TRACE_EVENT(jbd2_submit_inode_data, jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino) ); +TRACE_EVENT(jbd2_run_stats, + TP_PROTO(dev_t dev, unsigned long tid, + struct transaction_run_stats_s *stats), + + TP_ARGS(dev, tid, stats), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( unsigned long, tid ) + __field( unsigned long, wait ) + __field( unsigned long, running ) + __field( unsigned long, locked ) + __field( unsigned long, flushing ) + __field( unsigned long, logging ) + __field( __u32, handle_count ) + __field( __u32, blocks ) + __field( __u32, blocks_logged ) + ), + + TP_fast_assign( + __entry->dev = dev; + __entry->tid = tid; + __entry->wait = stats->rs_wait; + __entry->running = stats->rs_running; + __entry->locked = stats->rs_locked; + __entry->flushing = stats->rs_flushing; + __entry->logging = stats->rs_logging; + __entry->handle_count = stats->rs_handle_count; + __entry->blocks = stats->rs_blocks; + __entry->blocks_logged = stats->rs_blocks_logged; + ), + + TP_printk("dev %s tid %lu wait %u running %u locked %u flushing %u " + "logging %u handle_count %u blocks %u blocks_logged %u", + jbd2_dev_to_name(__entry->dev), __entry->tid, + jiffies_to_msecs(__entry->wait), + jiffies_to_msecs(__entry->running), + jiffies_to_msecs(__entry->locked), + jiffies_to_msecs(__entry->flushing), + jiffies_to_msecs(__entry->logging), + __entry->handle_count, __entry->blocks, + __entry->blocks_logged) +); + +TRACE_EVENT(jbd2_checkpoint_stats, + TP_PROTO(dev_t dev, unsigned long tid, + struct transaction_chp_stats_s *stats), + + TP_ARGS(dev, tid, stats), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( unsigned long, tid ) + __field( unsigned long, chp_time ) + __field( __u32, forced_to_close ) + __field( __u32, written ) + __field( __u32, dropped ) + ), + + TP_fast_assign( + __entry->dev = dev; + __entry->tid = tid; + __entry->chp_time = stats->cs_chp_time; + __entry->forced_to_close= stats->cs_forced_to_close; + __entry->written = stats->cs_written; + __entry->dropped = stats->cs_dropped; + ), + + TP_printk("dev %s tid %lu chp_time %u forced_to_close %u " + "written %u dropped %u", + jbd2_dev_to_name(__entry->dev), __entry->tid, + jiffies_to_msecs(__entry->chp_time), + __entry->forced_to_close, __entry->written, __entry->dropped) +); + #endif /* _TRACE_JBD2_H */ /* This part must be outside protection */ -- cgit v1.2.3 From c1fccc0696bcaff6008c11865091f5ec4b0937ab Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 30 Sep 2009 01:13:55 -0400 Subject: ext4: Fix time encoding with extra epoch bits "Looking at ext4.h, I think the setting of extra time fields forgets to mask the epoch bits so the epoch part overwrites nsec part. The second change is only for coherency (2 -> EXT4_EPOCH_BITS)." Thanks to Damien Guibouret for pointing out this problem. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c508cf7be75c..984ca0cb38c3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -525,8 +525,8 @@ struct move_extent { static inline __le32 ext4_encode_extra_time(struct timespec *time) { return cpu_to_le32((sizeof(time->tv_sec) > 4 ? - time->tv_sec >> 32 : 0) | - ((time->tv_nsec << 2) & EXT4_NSEC_MASK)); + (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) | + ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK)); } static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) @@ -534,7 +534,7 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) if (sizeof(time->tv_sec) > 4) time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; - time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2; + time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; } #define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ -- cgit v1.2.3 From 04bedd79a7037ee7af816b06c60c738144475c4a Mon Sep 17 00:00:00 2001 From: David Teigland Date: Fri, 18 Sep 2009 14:31:47 -0500 Subject: dlm: fix lowcomms_connect_node for sctp The recently added dlm_lowcomms_connect_node() from 391fbdc5d527149578490db2f1619951d91f3561 does not work when using SCTP instead of TCP. The sctp connection code has nothing to do without data to send. Check for no data in the sctp connection code and do nothing instead of triggering a BUG. Also have connect_node() do nothing when the protocol is sctp. Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 240cef14fe58..a3350e4c8184 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -316,6 +316,10 @@ int dlm_lowcomms_connect_node(int nodeid) { struct connection *con; + /* with sctp there's no connecting without sending */ + if (dlm_config.ci_protocol != 0) + return 0; + if (nodeid == dlm_our_nodeid()) return 0; @@ -855,11 +859,14 @@ static void sctp_init_assoc(struct connection *con) outmessage.msg_flags = MSG_EOR; spin_lock(&con->writequeue_lock); - e = list_entry(con->writequeue.next, struct writequeue_entry, - list); - BUG_ON((struct list_head *) e == &con->writequeue); + if (list_empty(&con->writequeue)) { + spin_unlock(&con->writequeue_lock); + log_print("writequeue empty for nodeid %d", con->nodeid); + return; + } + e = list_first_entry(&con->writequeue, struct writequeue_entry, list); len = e->len; offset = e->offset; spin_unlock(&con->writequeue_lock); -- cgit v1.2.3 From 6861f350785bf476c2d4e3b9cb69ee36b78df2fc Mon Sep 17 00:00:00 2001 From: David Teigland Date: Thu, 24 Sep 2009 15:58:23 -0500 Subject: dlm: fix socket fd translation The code to set up sctp sockets was not using the sockfd_lookup() and sockfd_put() routines to translate an fd to a socket. The direct fget and fput calls were resulting in error messages from alloc_fd(). Also clean up two log messages and remove a third, related to setting up sctp associations. Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index a3350e4c8184..70736eb4b516 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -459,9 +459,9 @@ static void process_sctp_notification(struct connection *con, int prim_len, ret; int addr_len; struct connection *new_con; - struct file *file; sctp_peeloff_arg_t parg; int parglen = sizeof(parg); + int err; /* * We get this before any data for an association. @@ -516,19 +516,22 @@ static void process_sctp_notification(struct connection *con, ret = kernel_getsockopt(con->sock, IPPROTO_SCTP, SCTP_SOCKOPT_PEELOFF, (void *)&parg, &parglen); - if (ret) { + if (ret < 0) { log_print("Can't peel off a socket for " - "connection %d to node %d: err=%d\n", + "connection %d to node %d: err=%d", parg.associd, nodeid, ret); + return; + } + new_con->sock = sockfd_lookup(parg.sd, &err); + if (!new_con->sock) { + log_print("sockfd_lookup error %d", err); + return; } - file = fget(parg.sd); - new_con->sock = SOCKET_I(file->f_dentry->d_inode); add_sock(new_con->sock, new_con); - fput(file); - put_unused_fd(parg.sd); + sockfd_put(new_con->sock); - log_print("got new/restarted association %d nodeid %d", - (int)sn->sn_assoc_change.sac_assoc_id, nodeid); + log_print("connecting to %d sctp association %d", + nodeid, (int)sn->sn_assoc_change.sac_assoc_id); /* Send any pending writes */ clear_bit(CF_CONNECT_PENDING, &new_con->flags); @@ -841,8 +844,6 @@ static void sctp_init_assoc(struct connection *con) if (con->retries++ > MAX_CONNECT_RETRIES) return; - log_print("Initiating association with node %d", con->nodeid); - if (nodeid_to_addr(con->nodeid, (struct sockaddr *)&rem_addr)) { log_print("no address for nodeid %d", con->nodeid); return; -- cgit v1.2.3 From 1f94533d9cd75f6d2826018d54a971b9cc085992 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 30 Sep 2009 22:57:41 -0400 Subject: ext4: fix a BUG_ON crash by checking that page has buffers attached to it In ext4_num_dirty_pages() we were calling page_buffers() before checking to see if the page actually had pages attached to it; this would cause a BUG check crash in the inline function page_buffers(). Thanks to Markus Trippelsdorf for reporting this bug. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ec367bce7215..6e65d0e25ed3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1146,8 +1146,8 @@ static int check_block_validity(struct inode *inode, const char *msg, } /* - * Return the number of dirty pages in the given inode starting at - * page frame idx. + * Return the number of contiguous dirty pages in a given inode + * starting at page frame idx. */ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, unsigned int max_pages) @@ -1181,15 +1181,15 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, unlock_page(page); break; } - head = page_buffers(page); - bh = head; - do { - if (!buffer_delay(bh) && - !buffer_unwritten(bh)) { - done = 1; - break; - } - } while ((bh = bh->b_this_page) != head); + if (page_has_buffers(page)) { + bh = head = page_buffers(page); + do { + if (!buffer_delay(bh) && + !buffer_unwritten(bh)) + done = 1; + bh = bh->b_this_page; + } while (!done && (bh != head)); + } unlock_page(page); if (done) break; -- cgit v1.2.3 From f0e2dfa7f3e1361ca8fc91c25e67fc4e92613cc9 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 1 Oct 2009 02:21:07 -0400 Subject: ext4: drop ext4dev compat Kconfig & super.c promised it'd be gone by 2.6.31, so it's about time to drop it. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/Kconfig | 14 -------------- fs/ext4/super.c | 31 ------------------------------- 2 files changed, 45 deletions(-) (limited to 'fs') diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index d5c0ea2e8f2d..9f2d45d75b1a 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -26,20 +26,6 @@ config EXT4_FS If unsure, say N. -config EXT4DEV_COMPAT - bool "Enable ext4dev compatibility" - depends on EXT4_FS - help - Starting with 2.6.28, the name of the ext4 filesystem was - renamed from ext4dev to ext4. Unfortunately there are some - legacy userspace programs (such as klibc's fstype) have - "ext4dev" hardcoded. - - To enable backwards compatibility so that systems that are - still expecting to mount ext4 filesystems using ext4dev, - choose Y here. This feature will go away by 2.6.31, so - please arrange to get your userspace programs fixed! - config EXT4_FS_XATTR bool "Ext4 extended attributes" depends on EXT4_FS diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 12e726a7073f..312211ee05af 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3966,27 +3966,6 @@ static struct file_system_type ext4_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; -#ifdef CONFIG_EXT4DEV_COMPAT -static int ext4dev_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data,struct vfsmount *mnt) -{ - printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs " - "to mount using ext4\n", dev_name); - printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility " - "will go away by 2.6.31\n", dev_name); - return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); -} - -static struct file_system_type ext4dev_fs_type = { - .owner = THIS_MODULE, - .name = "ext4dev", - .get_sb = ext4dev_get_sb, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; -MODULE_ALIAS("ext4dev"); -#endif - static int __init init_ext4_fs(void) { int err; @@ -4011,13 +3990,6 @@ static int __init init_ext4_fs(void) err = register_filesystem(&ext4_fs_type); if (err) goto out; -#ifdef CONFIG_EXT4DEV_COMPAT - err = register_filesystem(&ext4dev_fs_type); - if (err) { - unregister_filesystem(&ext4_fs_type); - goto out; - } -#endif return 0; out: destroy_inodecache(); @@ -4036,9 +4008,6 @@ out4: static void __exit exit_ext4_fs(void) { unregister_filesystem(&ext4_fs_type); -#ifdef CONFIG_EXT4DEV_COMPAT - unregister_filesystem(&ext4dev_fs_type); -#endif destroy_inodecache(); exit_ext4_xattr(); exit_ext4_mballoc(); -- cgit v1.2.3 From 35d62a942db5ae03104929fe7397835b572c4bc4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Sep 2009 16:47:08 -0400 Subject: Btrfs: fix arguments to btrfs_wait_on_page_writeback_range wait_on_page_writeback_range/btrfs_wait_on_page_writeback_range takes a pagecache offset, not a byte offset into the file. Shift the arguments around to wait for the correct range Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ece8d1e26b5e..69dce50aabd2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -829,7 +829,9 @@ int btrfs_write_tree_block(struct extent_buffer *buf) int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) { return btrfs_wait_on_page_writeback_range(buf->first_page->mapping, - buf->start, buf->start + buf->len - 1); + buf->start >> PAGE_CACHE_SHIFT, + (buf->start + buf->len - 1) >> + PAGE_CACHE_SHIFT); } struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, -- cgit v1.2.3 From ab93dbecfba72bbc04b7036343d180aaff1b61a3 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 1 Oct 2009 12:29:10 -0400 Subject: Btrfs: take i_mutex before generic_write_checks btrfs_file_write was incorrectly calling generic_write_checks without taking i_mutex. This lead to problems with racing around i_size when doing O_APPEND writes. The fix here is to move i_mutex higher. Signed-off-by: Chris Mason --- fs/btrfs/file.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1be96ba6f6bb..f155179877a6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -920,26 +920,35 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, start_pos = pos; vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + /* do the reserve before the mutex lock in case we have to do some + * flushing. We wouldn't deadlock, but this is more polite. + */ + err = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + if (err) + goto out_nolock; + + mutex_lock(&inode->i_mutex); + current->backing_dev_info = inode->i_mapping->backing_dev_info; err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) - goto out_nolock; + goto out; + if (count == 0) - goto out_nolock; + goto out; err = file_remove_suid(file); if (err) - goto out_nolock; - - err = btrfs_reserve_metadata_for_delalloc(root, inode, 1); - if (err) - goto out_nolock; + goto out; file_update_time(file); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); - mutex_lock(&inode->i_mutex); + /* generic_write_checks can change our pos */ + start_pos = pos; + BTRFS_I(inode)->sequence++; first_index = pos >> PAGE_CACHE_SHIFT; last_index = (pos + count) >> PAGE_CACHE_SHIFT; -- cgit v1.2.3 From 8aa38c31b7659e338fee4d9af4c3805acbd9806f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Oct 2009 12:58:30 -0400 Subject: Btrfs: remove duplicates of filemap_ helpers Use filemap_fdatawrite_range and filemap_fdatawait_range instead of local copies of the functions. For filemap_fdatawait_range that also means replacing the awkward old wait_on_page_writeback_range calling convention with the regular filemap byte offsets. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 10 +++--- fs/btrfs/file.c | 5 ++- fs/btrfs/ordered-data.c | 93 +++---------------------------------------------- fs/btrfs/ordered-data.h | 4 --- 4 files changed, 10 insertions(+), 102 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d20dc05208fe..af0435f79fa6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -822,16 +822,14 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, int btrfs_write_tree_block(struct extent_buffer *buf) { - return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start, - buf->start + buf->len - 1, WB_SYNC_ALL); + return filemap_fdatawrite_range(buf->first_page->mapping, buf->start, + buf->start + buf->len - 1); } int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) { - return btrfs_wait_on_page_writeback_range(buf->first_page->mapping, - buf->start >> PAGE_CACHE_SHIFT, - (buf->start + buf->len - 1) >> - PAGE_CACHE_SHIFT); + return filemap_fdatawait_range(buf->first_page->mapping, + buf->start, buf->start + buf->len - 1); } struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7351bdbca26f..ca784a7fbeba 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1022,9 +1022,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, } if (will_write) { - btrfs_fdatawrite_range(inode->i_mapping, pos, - pos + write_bytes - 1, - WB_SYNC_ALL); + filemap_fdatawrite_range(inode->i_mapping, pos, + pos + write_bytes - 1); } else { balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index b5d6d24726b0..897fba835f89 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -458,7 +458,7 @@ void btrfs_start_ordered_extent(struct inode *inode, * start IO on any dirty ones so the wait doesn't stall waiting * for pdflush to find them */ - btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL); + filemap_fdatawrite_range(inode->i_mapping, start, end); if (wait) { wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); @@ -488,17 +488,15 @@ again: /* start IO across the range first to instantiate any delalloc * extents */ - btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); + filemap_fdatawrite_range(inode->i_mapping, start, orig_end); /* The compression code will leave pages locked but return from * writepage without setting the page writeback. Starting again * with WB_SYNC_ALL will end up waiting for the IO to actually start. */ - btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); + filemap_fdatawrite_range(inode->i_mapping, start, orig_end); - btrfs_wait_on_page_writeback_range(inode->i_mapping, - start >> PAGE_CACHE_SHIFT, - orig_end >> PAGE_CACHE_SHIFT); + filemap_fdatawait_range(inode->i_mapping, start, orig_end); end = orig_end; found = 0; @@ -716,89 +714,6 @@ out: } -/** - * taken from mm/filemap.c because it isn't exported - * - * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range - * @mapping: address space structure to write - * @start: offset in bytes where the range starts - * @end: offset in bytes where the range ends (inclusive) - * @sync_mode: enable synchronous operation - * - * Start writeback against all of a mapping's dirty pages that lie - * within the byte offsets inclusive. - * - * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as - * opposed to a regular memory cleansing writeback. The difference between - * these two operations is that if a dirty page/buffer is encountered, it must - * be waited upon, and not just skipped over. - */ -int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, - loff_t end, int sync_mode) -{ - struct writeback_control wbc = { - .sync_mode = sync_mode, - .nr_to_write = mapping->nrpages * 2, - .range_start = start, - .range_end = end, - }; - return btrfs_writepages(mapping, &wbc); -} - -/** - * taken from mm/filemap.c because it isn't exported - * - * wait_on_page_writeback_range - wait for writeback to complete - * @mapping: target address_space - * @start: beginning page index - * @end: ending page index - * - * Wait for writeback to complete against pages indexed by start->end - * inclusive - */ -int btrfs_wait_on_page_writeback_range(struct address_space *mapping, - pgoff_t start, pgoff_t end) -{ - struct pagevec pvec; - int nr_pages; - int ret = 0; - pgoff_t index; - - if (end < start) - return 0; - - pagevec_init(&pvec, 0); - index = start; - while ((index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { - unsigned i; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - /* until radix tree lookup accepts end_index */ - if (page->index > end) - continue; - - wait_on_page_writeback(page); - if (PageError(page)) - ret = -EIO; - } - pagevec_release(&pvec); - cond_resched(); - } - - /* Check for outstanding write errors */ - if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) - ret = -ENOSPC; - if (test_and_clear_bit(AS_EIO, &mapping->flags)) - ret = -EIO; - - return ret; -} - /* * add a given inode to the list of inodes that must be fully on * disk before a transaction commit finishes. diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 993a7ea45c70..f82e87488ca8 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -153,10 +153,6 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); int btrfs_ordered_update_i_size(struct inode *inode, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); -int btrfs_wait_on_page_writeback_range(struct address_space *mapping, - pgoff_t start, pgoff_t end); -int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, - loff_t end, int sync_mode); int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, -- cgit v1.2.3 From a112a71d45b5e40c3cf07371d20a4a5079a72610 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Sat, 26 Sep 2009 16:19:21 +0200 Subject: fs/bio.c: move EXPORT* macros to line after function As mentioned in Documentation/CodingStyle, move EXPORT* macro's to the line immediately after the closing function brace line. Signed-off-by: H Hartley Sweeten Signed-off-by: Jens Axboe --- fs/bio.c | 49 +++++++++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 76738005c8e8..402cb84a92a1 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -249,6 +249,7 @@ void bio_free(struct bio *bio, struct bio_set *bs) mempool_free(p, bs->bio_pool); } +EXPORT_SYMBOL(bio_free); void bio_init(struct bio *bio) { @@ -257,6 +258,7 @@ void bio_init(struct bio *bio) bio->bi_comp_cpu = -1; atomic_set(&bio->bi_cnt, 1); } +EXPORT_SYMBOL(bio_init); /** * bio_alloc_bioset - allocate a bio for I/O @@ -311,6 +313,7 @@ err_free: mempool_free(p, bs->bio_pool); return NULL; } +EXPORT_SYMBOL(bio_alloc_bioset); static void bio_fs_destructor(struct bio *bio) { @@ -337,6 +340,7 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) return bio; } +EXPORT_SYMBOL(bio_alloc); static void bio_kmalloc_destructor(struct bio *bio) { @@ -380,6 +384,7 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) return bio; } +EXPORT_SYMBOL(bio_kmalloc); void zero_fill_bio(struct bio *bio) { @@ -416,6 +421,7 @@ void bio_put(struct bio *bio) bio->bi_destructor(bio); } } +EXPORT_SYMBOL(bio_put); inline int bio_phys_segments(struct request_queue *q, struct bio *bio) { @@ -424,6 +430,7 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio) return bio->bi_phys_segments; } +EXPORT_SYMBOL(bio_phys_segments); /** * __bio_clone - clone a bio @@ -451,6 +458,7 @@ void __bio_clone(struct bio *bio, struct bio *bio_src) bio->bi_size = bio_src->bi_size; bio->bi_idx = bio_src->bi_idx; } +EXPORT_SYMBOL(__bio_clone); /** * bio_clone - clone a bio @@ -482,6 +490,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) return b; } +EXPORT_SYMBOL(bio_clone); /** * bio_get_nr_vecs - return approx number of vecs @@ -505,6 +514,7 @@ int bio_get_nr_vecs(struct block_device *bdev) return nr_pages; } +EXPORT_SYMBOL(bio_get_nr_vecs); static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, @@ -635,6 +645,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, return __bio_add_page(q, bio, page, len, offset, queue_max_hw_sectors(q)); } +EXPORT_SYMBOL(bio_add_pc_page); /** * bio_add_page - attempt to add page to bio @@ -655,6 +666,7 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, struct request_queue *q = bdev_get_queue(bio->bi_bdev); return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q)); } +EXPORT_SYMBOL(bio_add_page); struct bio_map_data { struct bio_vec *iovecs; @@ -776,6 +788,7 @@ int bio_uncopy_user(struct bio *bio) bio_put(bio); return ret; } +EXPORT_SYMBOL(bio_uncopy_user); /** * bio_copy_user_iov - copy user data to bio @@ -920,6 +933,7 @@ struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data, return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask); } +EXPORT_SYMBOL(bio_copy_user); static struct bio *__bio_map_user_iov(struct request_queue *q, struct block_device *bdev, @@ -1050,6 +1064,7 @@ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev, return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask); } +EXPORT_SYMBOL(bio_map_user); /** * bio_map_user_iov - map user sg_iovec table into bio @@ -1117,13 +1132,13 @@ void bio_unmap_user(struct bio *bio) __bio_unmap_user(bio); bio_put(bio); } +EXPORT_SYMBOL(bio_unmap_user); static void bio_map_kern_endio(struct bio *bio, int err) { bio_put(bio); } - static struct bio *__bio_map_kern(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask) { @@ -1189,6 +1204,7 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, bio_put(bio); return ERR_PTR(-EINVAL); } +EXPORT_SYMBOL(bio_map_kern); static void bio_copy_kern_endio(struct bio *bio, int err) { @@ -1250,6 +1266,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, return bio; } +EXPORT_SYMBOL(bio_copy_kern); /* * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions @@ -1400,6 +1417,7 @@ void bio_endio(struct bio *bio, int error) if (bio->bi_end_io) bio->bi_end_io(bio, error); } +EXPORT_SYMBOL(bio_endio); void bio_pair_release(struct bio_pair *bp) { @@ -1410,6 +1428,7 @@ void bio_pair_release(struct bio_pair *bp) mempool_free(bp, bp->bio2.bi_private); } } +EXPORT_SYMBOL(bio_pair_release); static void bio_pair_end_1(struct bio *bi, int err) { @@ -1477,6 +1496,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) return bp; } +EXPORT_SYMBOL(bio_split); /** * bio_sector_offset - Find hardware sector offset in bio @@ -1547,6 +1567,7 @@ void bioset_free(struct bio_set *bs) kfree(bs); } +EXPORT_SYMBOL(bioset_free); /** * bioset_create - Create a bio_set @@ -1592,6 +1613,7 @@ bad: bioset_free(bs); return NULL; } +EXPORT_SYMBOL(bioset_create); static void __init biovec_init_slabs(void) { @@ -1636,29 +1658,4 @@ static int __init init_bio(void) return 0; } - subsys_initcall(init_bio); - -EXPORT_SYMBOL(bio_alloc); -EXPORT_SYMBOL(bio_kmalloc); -EXPORT_SYMBOL(bio_put); -EXPORT_SYMBOL(bio_free); -EXPORT_SYMBOL(bio_endio); -EXPORT_SYMBOL(bio_init); -EXPORT_SYMBOL(__bio_clone); -EXPORT_SYMBOL(bio_clone); -EXPORT_SYMBOL(bio_phys_segments); -EXPORT_SYMBOL(bio_add_page); -EXPORT_SYMBOL(bio_add_pc_page); -EXPORT_SYMBOL(bio_get_nr_vecs); -EXPORT_SYMBOL(bio_map_user); -EXPORT_SYMBOL(bio_unmap_user); -EXPORT_SYMBOL(bio_map_kern); -EXPORT_SYMBOL(bio_copy_kern); -EXPORT_SYMBOL(bio_pair_release); -EXPORT_SYMBOL(bio_split); -EXPORT_SYMBOL(bio_copy_user); -EXPORT_SYMBOL(bio_uncopy_user); -EXPORT_SYMBOL(bioset_create); -EXPORT_SYMBOL(bioset_free); -EXPORT_SYMBOL(bio_alloc_bioset); -- cgit v1.2.3 From fbf190874407f23d2891b53ffdf7d3c6be8d47ff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 1 Oct 2009 17:10:23 -0400 Subject: Btrfs: fix data space leak fix There is a problem where page_mkwrite can be called on a dirtied page that already has a delalloc range associated with it. The fix is to clear any delalloc bits for the range we are dirtying so the space accounting gets handled properly. This is the same thing we do in the normal write case, so we are consistent across the board. With this patch we no longer leak reserved space. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3cc5677f5440..3a6f953337b5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4909,10 +4909,21 @@ again: goto again; } + /* + * XXX - page_mkwrite gets called every time the page is dirtied, even + * if it was already dirty, so for space accounting reasons we need to + * clear any delalloc bits for the range we are fixing to save. There + * is probably a better way to do this, but for now keep consistent with + * prepare_pages in the normal write path. + */ + clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC, GFP_NOFS); + ret = btrfs_set_extent_delalloc(inode, page_start, page_end); if (ret) { unlock_extent(io_tree, page_start, page_end, GFP_NOFS); ret = VM_FAULT_SIGBUS; + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); goto out_unlock; } ret = 0; -- cgit v1.2.3 From 828c09509b9695271bcbdc53e9fc9a6a737148d2 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 1 Oct 2009 15:43:56 -0700 Subject: const: constify remaining file_operations [akpm@linux-foundation.org: fix KVM] Signed-off-by: Alexey Dobriyan Acked-by: Mike Frysinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mach-ns9xxx/clock.c | 2 +- arch/blackfin/mach-bf561/coreb.c | 2 +- arch/cris/arch-v10/drivers/sync_serial.c | 2 +- arch/cris/arch-v32/drivers/mach-fs/gpio.c | 2 +- arch/powerpc/kvm/timing.c | 2 +- arch/powerpc/platforms/cell/spufs/file.c | 2 +- arch/powerpc/platforms/pseries/dtl.c | 2 +- arch/x86/xen/debugfs.c | 2 +- drivers/acpi/video.c | 2 +- drivers/block/cciss.c | 2 +- drivers/char/apm-emulation.c | 2 +- drivers/char/bfin-otp.c | 2 +- drivers/char/xilinx_hwicap/xilinx_hwicap.c | 2 +- drivers/gpio/gpiolib.c | 2 +- drivers/hwmon/fschmd.c | 2 +- drivers/lguest/lguest_user.c | 2 +- drivers/media/dvb/dvb-core/dmxdev.c | 2 +- drivers/media/dvb/firewire/firedtv-ci.c | 2 +- drivers/misc/phantom.c | 2 +- drivers/misc/sgi-gru/grufile.c | 3 +-- drivers/mmc/core/debugfs.c | 2 +- drivers/s390/cio/qdio_debug.c | 2 +- drivers/s390/cio/qdio_perf.c | 2 +- drivers/scsi/sg.c | 43 +++++++++++++++++++++--------- drivers/spi/spidev.c | 2 +- drivers/usb/class/usbtmc.c | 2 +- drivers/usb/gadget/printer.c | 2 +- drivers/usb/host/whci/debug.c | 6 ++--- drivers/usb/misc/rio500.c | 3 +-- drivers/uwb/uwb-debug.c | 6 ++--- fs/btrfs/ctree.h | 2 +- fs/btrfs/file.c | 2 +- fs/btrfs/inode.c | 4 +-- fs/jbd2/journal.c | 2 +- fs/nfsd/nfsctl.c | 2 +- fs/nilfs2/dir.c | 2 +- fs/nilfs2/file.c | 2 +- fs/nilfs2/mdt.c | 2 +- fs/nilfs2/nilfs.h | 4 +-- fs/ocfs2/cluster/heartbeat.c | 2 +- fs/ocfs2/cluster/netdebug.c | 4 +-- fs/ocfs2/dlm/dlmdebug.c | 8 +++--- fs/ocfs2/super.c | 2 +- fs/omfs/dir.c | 2 +- fs/omfs/file.c | 2 +- fs/omfs/omfs.h | 4 +-- include/linux/cgroup.h | 2 +- include/linux/fs.h | 2 +- kernel/cgroup.c | 10 +++---- kernel/kprobes.c | 4 +-- kernel/rcutree_trace.c | 10 +++---- kernel/sched.c | 2 +- kernel/time/timer_list.c | 2 +- kernel/time/timer_stats.c | 2 +- samples/tracepoints/tracepoint-sample.c | 2 +- security/integrity/ima/ima_fs.c | 10 +++---- virt/kvm/kvm_main.c | 2 +- 57 files changed, 110 insertions(+), 95 deletions(-) (limited to 'fs') diff --git a/arch/arm/mach-ns9xxx/clock.c b/arch/arm/mach-ns9xxx/clock.c index 44ed20d4a388..cf81cbc57544 100644 --- a/arch/arm/mach-ns9xxx/clock.c +++ b/arch/arm/mach-ns9xxx/clock.c @@ -195,7 +195,7 @@ static int clk_debugfs_open(struct inode *inode, struct file *file) return single_open(file, clk_debugfs_show, NULL); } -static struct file_operations clk_debugfs_operations = { +static const struct file_operations clk_debugfs_operations = { .open = clk_debugfs_open, .read = seq_read, .llseek = seq_lseek, diff --git a/arch/blackfin/mach-bf561/coreb.c b/arch/blackfin/mach-bf561/coreb.c index 93635a766f9c..1e60a92dd602 100644 --- a/arch/blackfin/mach-bf561/coreb.c +++ b/arch/blackfin/mach-bf561/coreb.c @@ -48,7 +48,7 @@ coreb_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned l return ret; } -static struct file_operations coreb_fops = { +static const struct file_operations coreb_fops = { .owner = THIS_MODULE, .ioctl = coreb_ioctl, }; diff --git a/arch/cris/arch-v10/drivers/sync_serial.c b/arch/cris/arch-v10/drivers/sync_serial.c index 6cc1a0319a5d..562b9a7feae7 100644 --- a/arch/cris/arch-v10/drivers/sync_serial.c +++ b/arch/cris/arch-v10/drivers/sync_serial.c @@ -244,7 +244,7 @@ static unsigned sync_serial_prescale_shadow; #define NUMBER_OF_PORTS 2 -static struct file_operations sync_serial_fops = { +static const struct file_operations sync_serial_fops = { .owner = THIS_MODULE, .write = sync_serial_write, .read = sync_serial_read, diff --git a/arch/cris/arch-v32/drivers/mach-fs/gpio.c b/arch/cris/arch-v32/drivers/mach-fs/gpio.c index fe1fde893887..d89ab80498ed 100644 --- a/arch/cris/arch-v32/drivers/mach-fs/gpio.c +++ b/arch/cris/arch-v32/drivers/mach-fs/gpio.c @@ -855,7 +855,7 @@ gpio_leds_ioctl(unsigned int cmd, unsigned long arg) return 0; } -struct file_operations gpio_fops = { +static const struct file_operations gpio_fops = { .owner = THIS_MODULE, .poll = gpio_poll, .ioctl = gpio_ioctl, diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c index 47ee603f558e..2aa371e30079 100644 --- a/arch/powerpc/kvm/timing.c +++ b/arch/powerpc/kvm/timing.c @@ -201,7 +201,7 @@ static int kvmppc_exit_timing_open(struct inode *inode, struct file *file) return single_open(file, kvmppc_exit_timing_show, inode->i_private); } -static struct file_operations kvmppc_exit_timing_fops = { +static const struct file_operations kvmppc_exit_timing_fops = { .owner = THIS_MODULE, .open = kvmppc_exit_timing_open, .read = seq_read, diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 961309446170..884e8bcec499 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -147,7 +147,7 @@ static int __fops ## _open(struct inode *inode, struct file *file) \ __simple_attr_check_format(__fmt, 0ull); \ return spufs_attr_open(inode, file, __get, __set, __fmt); \ } \ -static struct file_operations __fops = { \ +static const struct file_operations __fops = { \ .owner = THIS_MODULE, \ .open = __fops ## _open, \ .release = spufs_attr_release, \ diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index ab69925d579b..937a544a236d 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -209,7 +209,7 @@ static ssize_t dtl_file_read(struct file *filp, char __user *buf, size_t len, return n_read * sizeof(struct dtl_entry); } -static struct file_operations dtl_fops = { +static const struct file_operations dtl_fops = { .open = dtl_file_open, .release = dtl_file_release, .read = dtl_file_read, diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c index b53225d2cac3..e133ce25e290 100644 --- a/arch/x86/xen/debugfs.c +++ b/arch/x86/xen/debugfs.c @@ -100,7 +100,7 @@ static int xen_array_release(struct inode *inode, struct file *file) return 0; } -static struct file_operations u32_array_fops = { +static const struct file_operations u32_array_fops = { .owner = THIS_MODULE, .open = u32_array_open, .release= xen_array_release, diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index a4fddb24476f..f6e54bf8dd96 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -285,7 +285,7 @@ static int acpi_video_device_brightness_open_fs(struct inode *inode, struct file *file); static ssize_t acpi_video_device_write_brightness(struct file *file, const char __user *buffer, size_t count, loff_t *data); -static struct file_operations acpi_video_device_brightness_fops = { +static const struct file_operations acpi_video_device_brightness_fops = { .owner = THIS_MODULE, .open = acpi_video_device_brightness_open_fs, .read = seq_read, diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 24c3e21ab263..1ece0b47b581 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -426,7 +426,7 @@ out: return err; } -static struct file_operations cciss_proc_fops = { +static const struct file_operations cciss_proc_fops = { .owner = THIS_MODULE, .open = cciss_seq_open, .read = seq_read, diff --git a/drivers/char/apm-emulation.c b/drivers/char/apm-emulation.c index aaca40283be9..4f568cb9af3f 100644 --- a/drivers/char/apm-emulation.c +++ b/drivers/char/apm-emulation.c @@ -393,7 +393,7 @@ static int apm_open(struct inode * inode, struct file * filp) return as ? 0 : -ENOMEM; } -static struct file_operations apm_bios_fops = { +static const struct file_operations apm_bios_fops = { .owner = THIS_MODULE, .read = apm_read, .poll = apm_poll, diff --git a/drivers/char/bfin-otp.c b/drivers/char/bfin-otp.c index e3dd24bff514..836d4f0a876f 100644 --- a/drivers/char/bfin-otp.c +++ b/drivers/char/bfin-otp.c @@ -217,7 +217,7 @@ static long bfin_otp_ioctl(struct file *filp, unsigned cmd, unsigned long arg) # define bfin_otp_ioctl NULL #endif -static struct file_operations bfin_otp_fops = { +static const struct file_operations bfin_otp_fops = { .owner = THIS_MODULE, .unlocked_ioctl = bfin_otp_ioctl, .read = bfin_otp_read, diff --git a/drivers/char/xilinx_hwicap/xilinx_hwicap.c b/drivers/char/xilinx_hwicap/xilinx_hwicap.c index f40ab699860f..4846d50199f3 100644 --- a/drivers/char/xilinx_hwicap/xilinx_hwicap.c +++ b/drivers/char/xilinx_hwicap/xilinx_hwicap.c @@ -559,7 +559,7 @@ static int hwicap_release(struct inode *inode, struct file *file) return status; } -static struct file_operations hwicap_fops = { +static const struct file_operations hwicap_fops = { .owner = THIS_MODULE, .write = hwicap_write, .read = hwicap_read, diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index bb11a429394a..662ed923d9eb 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1487,7 +1487,7 @@ static int gpiolib_open(struct inode *inode, struct file *file) return single_open(file, gpiolib_show, NULL); } -static struct file_operations gpiolib_operations = { +static const struct file_operations gpiolib_operations = { .open = gpiolib_open, .read = seq_read, .llseek = seq_lseek, diff --git a/drivers/hwmon/fschmd.c b/drivers/hwmon/fschmd.c index ea955edde87e..2a7a85a6dc36 100644 --- a/drivers/hwmon/fschmd.c +++ b/drivers/hwmon/fschmd.c @@ -915,7 +915,7 @@ static int watchdog_ioctl(struct inode *inode, struct file *filp, return ret; } -static struct file_operations watchdog_fops = { +static const struct file_operations watchdog_fops = { .owner = THIS_MODULE, .llseek = no_llseek, .open = watchdog_open, diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index b4d3f7ca554f..bd1632388e4a 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -508,7 +508,7 @@ static int close(struct inode *inode, struct file *file) * uses: reading and writing a character device called /dev/lguest. All the * work happens in the read(), write() and close() routines: */ -static struct file_operations lguest_fops = { +static const struct file_operations lguest_fops = { .owner = THIS_MODULE, .release = close, .write = write, diff --git a/drivers/media/dvb/dvb-core/dmxdev.c b/drivers/media/dvb/dvb-core/dmxdev.c index 3750ff48cba1..516414983593 100644 --- a/drivers/media/dvb/dvb-core/dmxdev.c +++ b/drivers/media/dvb/dvb-core/dmxdev.c @@ -1203,7 +1203,7 @@ static unsigned int dvb_dvr_poll(struct file *file, poll_table *wait) return mask; } -static struct file_operations dvb_dvr_fops = { +static const struct file_operations dvb_dvr_fops = { .owner = THIS_MODULE, .read = dvb_dvr_read, .write = dvb_dvr_write, diff --git a/drivers/media/dvb/firewire/firedtv-ci.c b/drivers/media/dvb/firewire/firedtv-ci.c index eeb80d0ea3ff..853e04b7cb36 100644 --- a/drivers/media/dvb/firewire/firedtv-ci.c +++ b/drivers/media/dvb/firewire/firedtv-ci.c @@ -215,7 +215,7 @@ static unsigned int fdtv_ca_io_poll(struct file *file, poll_table *wait) return POLLIN; } -static struct file_operations fdtv_ca_fops = { +static const struct file_operations fdtv_ca_fops = { .owner = THIS_MODULE, .ioctl = dvb_generic_ioctl, .open = dvb_generic_open, diff --git a/drivers/misc/phantom.c b/drivers/misc/phantom.c index fa57b67593ae..90a95ce8dc34 100644 --- a/drivers/misc/phantom.c +++ b/drivers/misc/phantom.c @@ -271,7 +271,7 @@ static unsigned int phantom_poll(struct file *file, poll_table *wait) return mask; } -static struct file_operations phantom_file_ops = { +static const struct file_operations phantom_file_ops = { .open = phantom_open, .release = phantom_release, .unlocked_ioctl = phantom_ioctl, diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c index 300e7ba391a0..41c8fe2a928c 100644 --- a/drivers/misc/sgi-gru/grufile.c +++ b/drivers/misc/sgi-gru/grufile.c @@ -53,7 +53,6 @@ struct gru_stats_s gru_stats; /* Guaranteed user available resources on each node */ static int max_user_cbrs, max_user_dsr_bytes; -static struct file_operations gru_fops; static struct miscdevice gru_miscdev; @@ -426,7 +425,7 @@ static void __exit gru_exit(void) gru_proc_exit(); } -static struct file_operations gru_fops = { +static const struct file_operations gru_fops = { .owner = THIS_MODULE, .unlocked_ioctl = gru_file_unlocked_ioctl, .mmap = gru_file_mmap, diff --git a/drivers/mmc/core/debugfs.c b/drivers/mmc/core/debugfs.c index 610dbd1fcc82..96d10f40fb23 100644 --- a/drivers/mmc/core/debugfs.c +++ b/drivers/mmc/core/debugfs.c @@ -240,7 +240,7 @@ static int mmc_ext_csd_release(struct inode *inode, struct file *file) return 0; } -static struct file_operations mmc_dbg_ext_csd_fops = { +static const struct file_operations mmc_dbg_ext_csd_fops = { .open = mmc_ext_csd_open, .read = mmc_ext_csd_read, .release = mmc_ext_csd_release, diff --git a/drivers/s390/cio/qdio_debug.c b/drivers/s390/cio/qdio_debug.c index 1b78f639ead3..76769978285f 100644 --- a/drivers/s390/cio/qdio_debug.c +++ b/drivers/s390/cio/qdio_debug.c @@ -125,7 +125,7 @@ static int qstat_seq_open(struct inode *inode, struct file *filp) filp->f_path.dentry->d_inode->i_private); } -static struct file_operations debugfs_fops = { +static const struct file_operations debugfs_fops = { .owner = THIS_MODULE, .open = qstat_seq_open, .read = seq_read, diff --git a/drivers/s390/cio/qdio_perf.c b/drivers/s390/cio/qdio_perf.c index eff943923c6f..968e3c7c2632 100644 --- a/drivers/s390/cio/qdio_perf.c +++ b/drivers/s390/cio/qdio_perf.c @@ -84,7 +84,7 @@ static int qdio_perf_seq_open(struct inode *inode, struct file *filp) return single_open(filp, qdio_perf_proc_show, NULL); } -static struct file_operations qdio_perf_proc_fops = { +static const struct file_operations qdio_perf_proc_fops = { .owner = THIS_MODULE, .open = qdio_perf_seq_open, .read = seq_read, diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 0cb049f5cc56..747a5e5c1276 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1317,7 +1317,7 @@ static void sg_rq_end_io(struct request *rq, int uptodate) } } -static struct file_operations sg_fops = { +static const struct file_operations sg_fops = { .owner = THIS_MODULE, .read = sg_read, .write = sg_write, @@ -2194,9 +2194,11 @@ static int sg_proc_seq_show_int(struct seq_file *s, void *v); static int sg_proc_single_open_adio(struct inode *inode, struct file *file); static ssize_t sg_proc_write_adio(struct file *filp, const char __user *buffer, size_t count, loff_t *off); -static struct file_operations adio_fops = { - /* .owner, .read and .llseek added in sg_proc_init() */ +static const struct file_operations adio_fops = { + .owner = THIS_MODULE, .open = sg_proc_single_open_adio, + .read = seq_read, + .llseek = seq_lseek, .write = sg_proc_write_adio, .release = single_release, }; @@ -2204,23 +2206,32 @@ static struct file_operations adio_fops = { static int sg_proc_single_open_dressz(struct inode *inode, struct file *file); static ssize_t sg_proc_write_dressz(struct file *filp, const char __user *buffer, size_t count, loff_t *off); -static struct file_operations dressz_fops = { +static const struct file_operations dressz_fops = { + .owner = THIS_MODULE, .open = sg_proc_single_open_dressz, + .read = seq_read, + .llseek = seq_lseek, .write = sg_proc_write_dressz, .release = single_release, }; static int sg_proc_seq_show_version(struct seq_file *s, void *v); static int sg_proc_single_open_version(struct inode *inode, struct file *file); -static struct file_operations version_fops = { +static const struct file_operations version_fops = { + .owner = THIS_MODULE, .open = sg_proc_single_open_version, + .read = seq_read, + .llseek = seq_lseek, .release = single_release, }; static int sg_proc_seq_show_devhdr(struct seq_file *s, void *v); static int sg_proc_single_open_devhdr(struct inode *inode, struct file *file); -static struct file_operations devhdr_fops = { +static const struct file_operations devhdr_fops = { + .owner = THIS_MODULE, .open = sg_proc_single_open_devhdr, + .read = seq_read, + .llseek = seq_lseek, .release = single_release, }; @@ -2229,8 +2240,11 @@ static int sg_proc_open_dev(struct inode *inode, struct file *file); static void * dev_seq_start(struct seq_file *s, loff_t *pos); static void * dev_seq_next(struct seq_file *s, void *v, loff_t *pos); static void dev_seq_stop(struct seq_file *s, void *v); -static struct file_operations dev_fops = { +static const struct file_operations dev_fops = { + .owner = THIS_MODULE, .open = sg_proc_open_dev, + .read = seq_read, + .llseek = seq_lseek, .release = seq_release, }; static const struct seq_operations dev_seq_ops = { @@ -2242,8 +2256,11 @@ static const struct seq_operations dev_seq_ops = { static int sg_proc_seq_show_devstrs(struct seq_file *s, void *v); static int sg_proc_open_devstrs(struct inode *inode, struct file *file); -static struct file_operations devstrs_fops = { +static const struct file_operations devstrs_fops = { + .owner = THIS_MODULE, .open = sg_proc_open_devstrs, + .read = seq_read, + .llseek = seq_lseek, .release = seq_release, }; static const struct seq_operations devstrs_seq_ops = { @@ -2255,8 +2272,11 @@ static const struct seq_operations devstrs_seq_ops = { static int sg_proc_seq_show_debug(struct seq_file *s, void *v); static int sg_proc_open_debug(struct inode *inode, struct file *file); -static struct file_operations debug_fops = { +static const struct file_operations debug_fops = { + .owner = THIS_MODULE, .open = sg_proc_open_debug, + .read = seq_read, + .llseek = seq_lseek, .release = seq_release, }; static const struct seq_operations debug_seq_ops = { @@ -2269,7 +2289,7 @@ static const struct seq_operations debug_seq_ops = { struct sg_proc_leaf { const char * name; - struct file_operations * fops; + const struct file_operations * fops; }; static struct sg_proc_leaf sg_proc_leaf_arr[] = { @@ -2295,9 +2315,6 @@ sg_proc_init(void) for (k = 0; k < num_leaves; ++k) { leaf = &sg_proc_leaf_arr[k]; mask = leaf->fops->write ? S_IRUGO | S_IWUSR : S_IRUGO; - leaf->fops->owner = THIS_MODULE; - leaf->fops->read = seq_read; - leaf->fops->llseek = seq_lseek; proc_create(leaf->name, mask, sg_proc_sgp, leaf->fops); } return 0; diff --git a/drivers/spi/spidev.c b/drivers/spi/spidev.c index f921bd1109e1..5d23983f02fc 100644 --- a/drivers/spi/spidev.c +++ b/drivers/spi/spidev.c @@ -537,7 +537,7 @@ static int spidev_release(struct inode *inode, struct file *filp) return status; } -static struct file_operations spidev_fops = { +static const struct file_operations spidev_fops = { .owner = THIS_MODULE, /* REVISIT switch to aio primitives, so that userspace * gets more complete API coverage. It'll simplify things diff --git a/drivers/usb/class/usbtmc.c b/drivers/usb/class/usbtmc.c index 333ee02e7b2b..864f0ba6a344 100644 --- a/drivers/usb/class/usbtmc.c +++ b/drivers/usb/class/usbtmc.c @@ -993,7 +993,7 @@ skip_io_on_zombie: return retval; } -static struct file_operations fops = { +static const struct file_operations fops = { .owner = THIS_MODULE, .read = usbtmc_read, .write = usbtmc_write, diff --git a/drivers/usb/gadget/printer.c b/drivers/usb/gadget/printer.c index 29500154d00c..2d867fd22413 100644 --- a/drivers/usb/gadget/printer.c +++ b/drivers/usb/gadget/printer.c @@ -875,7 +875,7 @@ printer_ioctl(struct file *fd, unsigned int code, unsigned long arg) } /* used after endpoint configuration */ -static struct file_operations printer_io_operations = { +static const struct file_operations printer_io_operations = { .owner = THIS_MODULE, .open = printer_open, .read = printer_read, diff --git a/drivers/usb/host/whci/debug.c b/drivers/usb/host/whci/debug.c index cf2d45946c57..2273c815941f 100644 --- a/drivers/usb/host/whci/debug.c +++ b/drivers/usb/host/whci/debug.c @@ -134,7 +134,7 @@ static int pzl_open(struct inode *inode, struct file *file) return single_open(file, pzl_print, inode->i_private); } -static struct file_operations di_fops = { +static const struct file_operations di_fops = { .open = di_open, .read = seq_read, .llseek = seq_lseek, @@ -142,7 +142,7 @@ static struct file_operations di_fops = { .owner = THIS_MODULE, }; -static struct file_operations asl_fops = { +static const struct file_operations asl_fops = { .open = asl_open, .read = seq_read, .llseek = seq_lseek, @@ -150,7 +150,7 @@ static struct file_operations asl_fops = { .owner = THIS_MODULE, }; -static struct file_operations pzl_fops = { +static const struct file_operations pzl_fops = { .open = pzl_open, .read = seq_read, .llseek = seq_lseek, diff --git a/drivers/usb/misc/rio500.c b/drivers/usb/misc/rio500.c index d645f3899fe1..32d0199d0c32 100644 --- a/drivers/usb/misc/rio500.c +++ b/drivers/usb/misc/rio500.c @@ -429,8 +429,7 @@ read_rio(struct file *file, char __user *buffer, size_t count, loff_t * ppos) return read_count; } -static struct -file_operations usb_rio_fops = { +static const struct file_operations usb_rio_fops = { .owner = THIS_MODULE, .read = read_rio, .write = write_rio, diff --git a/drivers/uwb/uwb-debug.c b/drivers/uwb/uwb-debug.c index 4a42993700c1..2eecec0c13c9 100644 --- a/drivers/uwb/uwb-debug.c +++ b/drivers/uwb/uwb-debug.c @@ -205,7 +205,7 @@ static ssize_t command_write(struct file *file, const char __user *buf, return ret < 0 ? ret : len; } -static struct file_operations command_fops = { +static const struct file_operations command_fops = { .open = command_open, .write = command_write, .read = NULL, @@ -255,7 +255,7 @@ static int reservations_open(struct inode *inode, struct file *file) return single_open(file, reservations_print, inode->i_private); } -static struct file_operations reservations_fops = { +static const struct file_operations reservations_fops = { .open = reservations_open, .read = seq_read, .llseek = seq_lseek, @@ -283,7 +283,7 @@ static int drp_avail_open(struct inode *inode, struct file *file) return single_open(file, drp_avail_print, inode->i_private); } -static struct file_operations drp_avail_fops = { +static const struct file_operations drp_avail_fops = { .open = drp_avail_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 80599b4e42bd..4484eb3408af 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2326,7 +2326,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int skip_pinned); int btrfs_check_file(struct btrfs_root *root, struct inode *inode); -extern struct file_operations btrfs_file_operations; +extern const struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, u64 locked_end, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a3492a3ad96b..9ed17dbe5c6e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1196,7 +1196,7 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) return 0; } -struct file_operations btrfs_file_operations = { +const struct file_operations btrfs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, .aio_read = generic_file_aio_read, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e9b76bcd1c12..b9fe06d751c0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -62,7 +62,7 @@ static const struct inode_operations btrfs_special_inode_operations; static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; static const struct address_space_operations btrfs_symlink_aops; -static struct file_operations btrfs_dir_file_operations; +static const struct file_operations btrfs_dir_file_operations; static struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; @@ -5544,7 +5544,7 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = { .permission = btrfs_permission, }; -static struct file_operations btrfs_dir_file_operations = { +static const struct file_operations btrfs_dir_file_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = btrfs_real_readdir, diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 761af77491f5..b0ab5219becb 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -770,7 +770,7 @@ static int jbd2_seq_info_release(struct inode *inode, struct file *file) return seq_release(inode, file); } -static struct file_operations jbd2_seq_info_fops = { +static const struct file_operations jbd2_seq_info_fops = { .owner = THIS_MODULE, .open = jbd2_seq_info_open, .read = seq_read, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 00388d2a3c99..5c01fc148ce8 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -176,7 +176,7 @@ static const struct file_operations exports_operations = { extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); -static struct file_operations pool_stats_operations = { +static const struct file_operations pool_stats_operations = { .open = nfsd_pool_stats_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 1a4fa04cf071..e097099bfc8f 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -697,7 +697,7 @@ not_empty: return 0; } -struct file_operations nilfs_dir_operations = { +const struct file_operations nilfs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = nilfs_readdir, diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 7d7b4983dee3..30292df443ce 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -134,7 +134,7 @@ static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) * We have mostly NULL's here: the current defaults are ok for * the nilfs filesystem. */ -struct file_operations nilfs_file_operations = { +const struct file_operations nilfs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, .write = do_sync_write, diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index b18c4998f8d0..f6326112d647 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -433,7 +433,7 @@ static const struct address_space_operations def_mdt_aops = { }; static const struct inode_operations def_mdt_iops; -static struct file_operations def_mdt_fops; +static const struct file_operations def_mdt_fops; /* * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile, diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index bad7368782d0..4da6f67e9a91 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -294,9 +294,9 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *); /* * Inodes and files operations */ -extern struct file_operations nilfs_dir_operations; +extern const struct file_operations nilfs_dir_operations; extern const struct inode_operations nilfs_file_inode_operations; -extern struct file_operations nilfs_file_operations; +extern const struct file_operations nilfs_file_operations; extern const struct address_space_operations nilfs_aops; extern const struct inode_operations nilfs_dir_inode_operations; extern const struct inode_operations nilfs_special_inode_operations; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 09cc25d04611..c452d116b892 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -966,7 +966,7 @@ static ssize_t o2hb_debug_read(struct file *file, char __user *buf, } #endif /* CONFIG_DEBUG_FS */ -static struct file_operations o2hb_debug_fops = { +static const struct file_operations o2hb_debug_fops = { .open = o2hb_debug_open, .release = o2hb_debug_release, .read = o2hb_debug_read, diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index cfb2be708abe..da794bc07a6c 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -207,7 +207,7 @@ static int nst_fop_release(struct inode *inode, struct file *file) return seq_release_private(inode, file); } -static struct file_operations nst_seq_fops = { +static const struct file_operations nst_seq_fops = { .open = nst_fop_open, .read = seq_read, .llseek = seq_lseek, @@ -388,7 +388,7 @@ static int sc_fop_release(struct inode *inode, struct file *file) return seq_release_private(inode, file); } -static struct file_operations sc_seq_fops = { +static const struct file_operations sc_seq_fops = { .open = sc_fop_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index ca46002ec10e..42b0bad7a612 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -478,7 +478,7 @@ bail: return -ENOMEM; } -static struct file_operations debug_purgelist_fops = { +static const struct file_operations debug_purgelist_fops = { .open = debug_purgelist_open, .release = debug_buffer_release, .read = debug_buffer_read, @@ -538,7 +538,7 @@ bail: return -ENOMEM; } -static struct file_operations debug_mle_fops = { +static const struct file_operations debug_mle_fops = { .open = debug_mle_open, .release = debug_buffer_release, .read = debug_buffer_read, @@ -741,7 +741,7 @@ static int debug_lockres_release(struct inode *inode, struct file *file) return seq_release_private(inode, file); } -static struct file_operations debug_lockres_fops = { +static const struct file_operations debug_lockres_fops = { .open = debug_lockres_open, .release = debug_lockres_release, .read = seq_read, @@ -925,7 +925,7 @@ bail: return -ENOMEM; } -static struct file_operations debug_state_fops = { +static const struct file_operations debug_state_fops = { .open = debug_state_open, .release = debug_buffer_release, .read = debug_buffer_read, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 4cc3c890a2cd..c0e48aeebb1c 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -373,7 +373,7 @@ static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, } #endif /* CONFIG_DEBUG_FS */ -static struct file_operations ocfs2_osb_debug_fops = { +static const struct file_operations ocfs2_osb_debug_fops = { .open = ocfs2_osb_debug_open, .release = ocfs2_debug_release, .read = ocfs2_debug_read, diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index 3680bae335b5..b42d62419034 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -498,7 +498,7 @@ const struct inode_operations omfs_dir_inops = { .rmdir = omfs_rmdir, }; -struct file_operations omfs_dir_operations = { +const struct file_operations omfs_dir_operations = { .read = generic_read_dir, .readdir = omfs_readdir, .llseek = generic_file_llseek, diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 4845fbb18e6e..399487c09364 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -322,7 +322,7 @@ static sector_t omfs_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, omfs_get_block); } -struct file_operations omfs_file_operations = { +const struct file_operations omfs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, .write = do_sync_write, diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h index df71039945ac..ebe2fdbe535e 100644 --- a/fs/omfs/omfs.h +++ b/fs/omfs/omfs.h @@ -44,14 +44,14 @@ extern int omfs_allocate_range(struct super_block *sb, int min_request, extern int omfs_clear_range(struct super_block *sb, u64 block, int count); /* dir.c */ -extern struct file_operations omfs_dir_operations; +extern const struct file_operations omfs_dir_operations; extern const struct inode_operations omfs_dir_inops; extern int omfs_make_empty(struct inode *inode, struct super_block *sb); extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, u64 fsblock); /* file.c */ -extern struct file_operations omfs_file_operations; +extern const struct file_operations omfs_file_operations; extern const struct inode_operations omfs_file_inops; extern const struct address_space_operations omfs_aops; extern void omfs_make_empty_table(struct buffer_head *bh, int offset); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b62bb9294d0c..0008dee66514 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -37,7 +37,7 @@ extern void cgroup_exit(struct task_struct *p, int run_callbacks); extern int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); -extern struct file_operations proc_cgroup_operations; +extern const struct file_operations proc_cgroup_operations; /* Define the enumeration of all cgroup subsystems */ #define SUBSYS(_x) _x ## _subsys_id, diff --git a/include/linux/fs.h b/include/linux/fs.h index 2adaa2529f18..a1e6899d4b6c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2446,7 +2446,7 @@ static int __fops ## _open(struct inode *inode, struct file *file) \ __simple_attr_check_format(__fmt, 0ull); \ return simple_attr_open(inode, file, __get, __set, __fmt); \ } \ -static struct file_operations __fops = { \ +static const struct file_operations __fops = { \ .owner = THIS_MODULE, \ .open = __fops ## _open, \ .release = simple_attr_release, \ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7ccba4bc5e3b..d2b88596efde 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -703,7 +703,7 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); static const struct inode_operations cgroup_dir_inode_operations; -static struct file_operations proc_cgroupstats_operations; +static const struct file_operations proc_cgroupstats_operations; static struct backing_dev_info cgroup_backing_dev_info = { .name = "cgroup", @@ -1863,7 +1863,7 @@ static int cgroup_seqfile_release(struct inode *inode, struct file *file) return single_release(inode, file); } -static struct file_operations cgroup_seqfile_operations = { +static const struct file_operations cgroup_seqfile_operations = { .read = seq_read, .write = cgroup_file_write, .llseek = seq_lseek, @@ -1922,7 +1922,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, return simple_rename(old_dir, old_dentry, new_dir, new_dentry); } -static struct file_operations cgroup_file_operations = { +static const struct file_operations cgroup_file_operations = { .read = cgroup_file_read, .write = cgroup_file_write, .llseek = generic_file_llseek, @@ -3369,7 +3369,7 @@ static int cgroup_open(struct inode *inode, struct file *file) return single_open(file, proc_cgroup_show, pid); } -struct file_operations proc_cgroup_operations = { +const struct file_operations proc_cgroup_operations = { .open = cgroup_open, .read = seq_read, .llseek = seq_lseek, @@ -3398,7 +3398,7 @@ static int cgroupstats_open(struct inode *inode, struct file *file) return single_open(file, proc_cgroupstats_show, NULL); } -static struct file_operations proc_cgroupstats_operations = { +static const struct file_operations proc_cgroupstats_operations = { .open = cgroupstats_open, .read = seq_read, .llseek = seq_lseek, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index cfadc1291d0b..5240d75f4c60 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1333,7 +1333,7 @@ static int __kprobes kprobes_open(struct inode *inode, struct file *filp) return seq_open(filp, &kprobes_seq_ops); } -static struct file_operations debugfs_kprobes_operations = { +static const struct file_operations debugfs_kprobes_operations = { .open = kprobes_open, .read = seq_read, .llseek = seq_lseek, @@ -1515,7 +1515,7 @@ static ssize_t write_enabled_file_bool(struct file *file, return count; } -static struct file_operations fops_kp = { +static const struct file_operations fops_kp = { .read = read_enabled_file_bool, .write = write_enabled_file_bool, }; diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index c89f5e9fd173..179e6ad80dc0 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -93,7 +93,7 @@ static int rcudata_open(struct inode *inode, struct file *file) return single_open(file, show_rcudata, NULL); } -static struct file_operations rcudata_fops = { +static const struct file_operations rcudata_fops = { .owner = THIS_MODULE, .open = rcudata_open, .read = seq_read, @@ -145,7 +145,7 @@ static int rcudata_csv_open(struct inode *inode, struct file *file) return single_open(file, show_rcudata_csv, NULL); } -static struct file_operations rcudata_csv_fops = { +static const struct file_operations rcudata_csv_fops = { .owner = THIS_MODULE, .open = rcudata_csv_open, .read = seq_read, @@ -196,7 +196,7 @@ static int rcuhier_open(struct inode *inode, struct file *file) return single_open(file, show_rcuhier, NULL); } -static struct file_operations rcuhier_fops = { +static const struct file_operations rcuhier_fops = { .owner = THIS_MODULE, .open = rcuhier_open, .read = seq_read, @@ -222,7 +222,7 @@ static int rcugp_open(struct inode *inode, struct file *file) return single_open(file, show_rcugp, NULL); } -static struct file_operations rcugp_fops = { +static const struct file_operations rcugp_fops = { .owner = THIS_MODULE, .open = rcugp_open, .read = seq_read, @@ -276,7 +276,7 @@ static int rcu_pending_open(struct inode *inode, struct file *file) return single_open(file, show_rcu_pending, NULL); } -static struct file_operations rcu_pending_fops = { +static const struct file_operations rcu_pending_fops = { .owner = THIS_MODULE, .open = rcu_pending_open, .read = seq_read, diff --git a/kernel/sched.c b/kernel/sched.c index ee61f454a98b..1535f3884b88 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -780,7 +780,7 @@ static int sched_feat_open(struct inode *inode, struct file *filp) return single_open(filp, sched_feat_show, NULL); } -static struct file_operations sched_feat_fops = { +static const struct file_operations sched_feat_fops = { .open = sched_feat_open, .write = sched_feat_write, .read = seq_read, diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index fddd69d16e03..1b5b7aa2fdfd 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -275,7 +275,7 @@ static int timer_list_open(struct inode *inode, struct file *filp) return single_open(filp, timer_list_show, NULL); } -static struct file_operations timer_list_fops = { +static const struct file_operations timer_list_fops = { .open = timer_list_open, .read = seq_read, .llseek = seq_lseek, diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 4cde8b9c716f..ee5681f8d7ec 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -395,7 +395,7 @@ static int tstats_open(struct inode *inode, struct file *filp) return single_open(filp, tstats_show, NULL); } -static struct file_operations tstats_fops = { +static const struct file_operations tstats_fops = { .open = tstats_open, .read = seq_read, .write = tstats_write, diff --git a/samples/tracepoints/tracepoint-sample.c b/samples/tracepoints/tracepoint-sample.c index 9cf80a11e8b6..26fab33ffa8c 100644 --- a/samples/tracepoints/tracepoint-sample.c +++ b/samples/tracepoints/tracepoint-sample.c @@ -28,7 +28,7 @@ static int my_open(struct inode *inode, struct file *file) return -EPERM; } -static struct file_operations mark_ops = { +static const struct file_operations mark_ops = { .open = my_open, }; diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c index 8e9777b76405..0c72c9c38956 100644 --- a/security/integrity/ima/ima_fs.c +++ b/security/integrity/ima/ima_fs.c @@ -43,7 +43,7 @@ static ssize_t ima_show_htable_violations(struct file *filp, return ima_show_htable_value(buf, count, ppos, &ima_htable.violations); } -static struct file_operations ima_htable_violations_ops = { +static const struct file_operations ima_htable_violations_ops = { .read = ima_show_htable_violations }; @@ -55,7 +55,7 @@ static ssize_t ima_show_measurements_count(struct file *filp, } -static struct file_operations ima_measurements_count_ops = { +static const struct file_operations ima_measurements_count_ops = { .read = ima_show_measurements_count }; @@ -158,7 +158,7 @@ static int ima_measurements_open(struct inode *inode, struct file *file) return seq_open(file, &ima_measurments_seqops); } -static struct file_operations ima_measurements_ops = { +static const struct file_operations ima_measurements_ops = { .open = ima_measurements_open, .read = seq_read, .llseek = seq_lseek, @@ -233,7 +233,7 @@ static int ima_ascii_measurements_open(struct inode *inode, struct file *file) return seq_open(file, &ima_ascii_measurements_seqops); } -static struct file_operations ima_ascii_measurements_ops = { +static const struct file_operations ima_ascii_measurements_ops = { .open = ima_ascii_measurements_open, .read = seq_read, .llseek = seq_lseek, @@ -313,7 +313,7 @@ static int ima_release_policy(struct inode *inode, struct file *file) return 0; } -static struct file_operations ima_measure_policy_ops = { +static const struct file_operations ima_measure_policy_ops = { .open = ima_open_policy, .write = ima_write_policy, .release = ima_release_policy diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b5e7e3f1183f..e79c54034bcd 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2625,7 +2625,7 @@ static int vcpu_stat_get(void *_offset, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n"); -static struct file_operations *stat_fops[] = { +static const struct file_operations *stat_fops[] = { [KVM_STAT_VCPU] = &vcpu_stat_fops, [KVM_STAT_VM] = &vm_stat_fops, }; -- cgit v1.2.3 From 80e50be4220e1244fcf6d5f75b997f8586ae1300 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Oct 2009 15:44:27 -0700 Subject: afs: remove cache.h It's just a wrapper for , so remove it. Signed-off-by: Christoph Hellwig Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/afs/cache.h | 12 ------------ fs/afs/internal.h | 2 +- 2 files changed, 1 insertion(+), 13 deletions(-) delete mode 100644 fs/afs/cache.h (limited to 'fs') diff --git a/fs/afs/cache.h b/fs/afs/cache.h deleted file mode 100644 index 5c4f6b499e90..000000000000 --- a/fs/afs/cache.h +++ /dev/null @@ -1,12 +0,0 @@ -/* AFS local cache management interface - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 106be66dafd2..6ece2a13bf71 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -18,10 +18,10 @@ #include #include #include +#include #include "afs.h" #include "afs_vl.h" -#include "cache.h" #define AFS_CELL_MAX_ADDRS 15 -- cgit v1.2.3 From 74072d0a63553720dd3c70a8b8e9407eb2027dbe Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth Date: Fri, 2 Oct 2009 21:08:32 -0400 Subject: ext4: Fix build warning in ext4_dirty_inode() This fixes the following warning: fs/ext4/inode.c: In function 'ext4_dirty_inode': fs/ext4/inode.c:5615: warning: unused variable 'current_handle' We remove the jbd_debug() statement which does use current_handle, as it's not terribly important in the grand scheme of things. Thanks to Stephen Rothwell for pointing this out. Signed-off-by: Curt Wohlgemuth Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6e65d0e25ed3..635f8ec6ebc6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5612,14 +5612,12 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) */ void ext4_dirty_inode(struct inode *inode) { - handle_t *current_handle = ext4_journal_current_handle(); handle_t *handle; handle = ext4_journal_start(inode, 2); if (IS_ERR(handle)) goto out; - jbd_debug(5, "marking dirty. outer handle=%p\n", current_handle); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); -- cgit v1.2.3 From fbbf69456619de5d251cb9f1df609069178c62d5 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 2 Oct 2009 21:20:55 -0400 Subject: [PATCH] ext4: retry failed direct IO allocations On a 256M filesystem, doing this in a loop: xfs_io -F -f -d -c 'pwrite 0 64m' test rm -f test eventually leads to ENOSPC. (the xfs_io command does a 64m direct IO write to the file "test") As with other block allocation callers, it looks like we need to potentially retry the allocations on the initial ENOSPC. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 635f8ec6ebc6..5c5bc5dafff8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3378,6 +3378,7 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, ssize_t ret; int orphan = 0; size_t count = iov_length(iov, nr_segs); + int retries = 0; if (rw == WRITE) { loff_t final_size = offset + count; @@ -3400,9 +3401,12 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, } } +retry: ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ext4_get_block, NULL); + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; if (orphan) { int err; -- cgit v1.2.3 From 0f78ab9899e9d6acb09d5465def618704255963b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 4 Oct 2009 21:04:38 +0200 Subject: Revert "Seperate read and write statistics of in_flight requests" This reverts commit a9327cac440be4d8333bba975cbbf76045096275. Corrado Zoccolo reports: "with 2.6.32-rc1 I started getting the following strange output from "iostat -kx 2": Linux 2.6.31bisect (et2) 04/10/2009 _i686_ (2 CPU) avg-cpu: %user %nice %system %iowait %steal %idle 10,70 0,00 3,16 15,75 0,00 70,38 Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await svctm %util sda 18,22 0,00 0,67 0,01 14,77 0,02 43,94 0,01 10,53 39043915,03 2629219,87 sdb 60,89 9,68 50,79 3,04 1724,43 50,52 65,95 0,70 13,06 488437,47 2629219,87 avg-cpu: %user %nice %system %iowait %steal %idle 2,72 0,00 0,74 0,00 0,00 96,53 Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await svctm %util sda 0,00 0,00 0,00 0,00 0,00 0,00 0,00 0,00 0,00 0,00 100,00 sdb 0,00 0,00 0,00 0,00 0,00 0,00 0,00 0,00 0,00 0,00 100,00 avg-cpu: %user %nice %system %iowait %steal %idle 6,68 0,00 0,99 0,00 0,00 92,33 Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await svctm %util sda 0,00 0,00 0,00 0,00 0,00 0,00 0,00 0,00 0,00 0,00 100,00 sdb 0,00 0,00 0,00 0,00 0,00 0,00 0,00 0,00 0,00 0,00 100,00 avg-cpu: %user %nice %system %iowait %steal %idle 4,40 0,00 0,73 1,47 0,00 93,40 Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await svctm %util sda 0,00 0,00 0,00 0,00 0,00 0,00 0,00 0,00 0,00 0,00 100,00 sdb 0,00 4,00 0,00 3,00 0,00 28,00 18,67 0,06 19,50 333,33 100,00 Global values for service time and utilization are garbage. For interval values, utilization is always 100%, and service time is higher than normal. I bisected it down to: [a9327cac440be4d8333bba975cbbf76045096275] Seperate read and write statistics of in_flight requests and verified that reverting just that commit indeed solves the issue on 2.6.32-rc1." So until this is debugged, revert the bad commit. Signed-off-by: Jens Axboe --- block/blk-core.c | 6 +++--- block/blk-merge.c | 2 +- block/genhd.c | 4 +--- drivers/md/dm.c | 16 ++++++---------- fs/partitions/check.c | 12 +----------- include/linux/genhd.h | 21 +++++++-------------- 6 files changed, 19 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/block/blk-core.c b/block/blk-core.c index a8c7fbe52e24..81f34311659a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -70,7 +70,7 @@ static void drive_stat_acct(struct request *rq, int new_io) part_stat_inc(cpu, part, merges[rw]); else { part_round_stats(cpu, part); - part_inc_in_flight(part, rw); + part_inc_in_flight(part); } part_stat_unlock(); @@ -1032,7 +1032,7 @@ static void part_round_stats_single(int cpu, struct hd_struct *part, if (part->in_flight) { __part_stat_add(cpu, part, time_in_queue, - part_in_flight(part) * (now - part->stamp)); + part->in_flight * (now - part->stamp)); __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); } part->stamp = now; @@ -1739,7 +1739,7 @@ static void blk_account_io_done(struct request *req) part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); part_round_stats(cpu, part); - part_dec_in_flight(part, rw); + part_dec_in_flight(part); part_stat_unlock(); } diff --git a/block/blk-merge.c b/block/blk-merge.c index 99cb5cf1f447..b0de8574fdc8 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -351,7 +351,7 @@ static void blk_account_io_merge(struct request *req) part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); part_round_stats(cpu, part); - part_dec_in_flight(part, rq_data_dir(req)); + part_dec_in_flight(part); part_stat_unlock(); } diff --git a/block/genhd.c b/block/genhd.c index 517e4332cb37..5a0861da324d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -869,7 +869,6 @@ static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); -static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -889,7 +888,6 @@ static struct attribute *disk_attrs[] = { &dev_attr_alignment_offset.attr, &dev_attr_capability.attr, &dev_attr_stat.attr, - &dev_attr_inflight.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -1055,7 +1053,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) part_stat_read(hd, merges[1]), (unsigned long long)part_stat_read(hd, sectors[1]), jiffies_to_msecs(part_stat_read(hd, ticks[1])), - part_in_flight(hd), + hd->in_flight, jiffies_to_msecs(part_stat_read(hd, io_ticks)), jiffies_to_msecs(part_stat_read(hd, time_in_queue)) ); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 376f1ab48a24..23e76fe0d359 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -130,7 +130,7 @@ struct mapped_device { /* * A list of ios that arrived while we were suspended. */ - atomic_t pending[2]; + atomic_t pending; wait_queue_head_t wait; struct work_struct work; struct bio_list deferred; @@ -453,14 +453,13 @@ static void start_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; int cpu; - int rw = bio_data_dir(io->bio); io->start_time = jiffies; cpu = part_stat_lock(); part_round_stats(cpu, &dm_disk(md)->part0); part_stat_unlock(); - dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]); + dm_disk(md)->part0.in_flight = atomic_inc_return(&md->pending); } static void end_io_acct(struct dm_io *io) @@ -480,9 +479,8 @@ static void end_io_acct(struct dm_io *io) * After this is decremented the bio must not be touched if it is * a barrier. */ - dm_disk(md)->part0.in_flight[rw] = pending = - atomic_dec_return(&md->pending[rw]); - pending += atomic_read(&md->pending[rw^0x1]); + dm_disk(md)->part0.in_flight = pending = + atomic_dec_return(&md->pending); /* nudge anyone waiting on suspend queue */ if (!pending) @@ -1787,8 +1785,7 @@ static struct mapped_device *alloc_dev(int minor) if (!md->disk) goto bad_disk; - atomic_set(&md->pending[0], 0); - atomic_set(&md->pending[1], 0); + atomic_set(&md->pending, 0); init_waitqueue_head(&md->wait); INIT_WORK(&md->work, dm_wq_work); init_waitqueue_head(&md->eventq); @@ -2091,8 +2088,7 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) break; } spin_unlock_irqrestore(q->queue_lock, flags); - } else if (!atomic_read(&md->pending[0]) && - !atomic_read(&md->pending[1])) + } else if (!atomic_read(&md->pending)) break; if (interruptible == TASK_INTERRUPTIBLE && diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 7b685e10cbad..f38fee0311a7 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -248,19 +248,11 @@ ssize_t part_stat_show(struct device *dev, part_stat_read(p, merges[WRITE]), (unsigned long long)part_stat_read(p, sectors[WRITE]), jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), - part_in_flight(p), + p->in_flight, jiffies_to_msecs(part_stat_read(p, io_ticks)), jiffies_to_msecs(part_stat_read(p, time_in_queue))); } -ssize_t part_inflight_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]); -} - #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -289,7 +281,6 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); -static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -301,7 +292,6 @@ static struct attribute *part_attrs[] = { &dev_attr_size.attr, &dev_attr_alignment_offset.attr, &dev_attr_stat.attr, - &dev_attr_inflight.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 297df45ffd0a..7beaa21b3880 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -98,7 +98,7 @@ struct hd_struct { int make_it_fail; #endif unsigned long stamp; - int in_flight[2]; + int in_flight; #ifdef CONFIG_SMP struct disk_stats *dkstats; #else @@ -322,23 +322,18 @@ static inline void free_part_stats(struct hd_struct *part) #define part_stat_sub(cpu, gendiskp, field, subnd) \ part_stat_add(cpu, gendiskp, field, -subnd) -static inline void part_inc_in_flight(struct hd_struct *part, int rw) +static inline void part_inc_in_flight(struct hd_struct *part) { - part->in_flight[rw]++; + part->in_flight++; if (part->partno) - part_to_disk(part)->part0.in_flight[rw]++; + part_to_disk(part)->part0.in_flight++; } -static inline void part_dec_in_flight(struct hd_struct *part, int rw) +static inline void part_dec_in_flight(struct hd_struct *part) { - part->in_flight[rw]--; + part->in_flight--; if (part->partno) - part_to_disk(part)->part0.in_flight[rw]--; -} - -static inline int part_in_flight(struct hd_struct *part) -{ - return part->in_flight[0] + part->in_flight[1]; + part_to_disk(part)->part0.in_flight--; } /* block/blk-core.c */ @@ -551,8 +546,6 @@ extern ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf); -extern ssize_t part_inflight_show(struct device *dev, - struct device_attribute *attr, char *buf); #ifdef CONFIG_FAIL_MAKE_REQUEST extern ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf); -- cgit v1.2.3 From a99bbaf5ee6bad1aca0c88ea65ec6e5373e86184 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 4 Oct 2009 16:11:37 +0400 Subject: headers: remove sched.h from poll.h Signed-off-by: Alexey Dobriyan Signed-off-by: Linus Torvalds --- drivers/char/dtlk.c | 1 + drivers/char/ipmi/ipmi_devintf.c | 1 + drivers/char/ipmi/ipmi_msghandler.c | 1 + drivers/firewire/core-cdev.c | 1 + drivers/hid/hidraw.c | 1 + drivers/infiniband/core/ucm.c | 1 + drivers/infiniband/core/user_mad.c | 1 + drivers/infiniband/core/uverbs_main.c | 1 + drivers/input/evdev.c | 1 + drivers/input/input.c | 1 + drivers/input/joydev.c | 1 + drivers/input/misc/uinput.c | 1 + drivers/input/mousedev.c | 1 + drivers/isdn/divert/divert_procfs.c | 1 + drivers/media/dvb/dvb-core/dmxdev.c | 1 + drivers/media/dvb/dvb-core/dvb_demux.c | 1 + drivers/media/radio/radio-cadet.c | 1 + drivers/media/video/cpia.c | 1 + drivers/mfd/ucb1400_core.c | 1 + drivers/usb/gadget/inode.c | 1 + drivers/xen/xenfs/xenbus.c | 1 + fs/anon_inodes.c | 2 ++ fs/coda/psdev.c | 1 + fs/select.c | 1 + include/linux/poll.h | 2 +- net/rfkill/core.c | 1 + 26 files changed, 27 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/drivers/char/dtlk.c b/drivers/char/dtlk.c index 52e06589821d..045c930e6320 100644 --- a/drivers/char/dtlk.c +++ b/drivers/char/dtlk.c @@ -56,6 +56,7 @@ #include /* for -EBUSY */ #include /* for request_region */ #include /* for loops_per_jiffy */ +#include #include /* cycle_kernel_lock() */ #include /* for inb_p, outb_p, inb, outb, etc. */ #include /* for get_user, etc. */ diff --git a/drivers/char/ipmi/ipmi_devintf.c b/drivers/char/ipmi/ipmi_devintf.c index 41fc11dc921c..65545de3dbf4 100644 --- a/drivers/char/ipmi/ipmi_devintf.c +++ b/drivers/char/ipmi/ipmi_devintf.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 09050797c76a..ec5e3f8df648 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index ced186d7e9a9..5089331544ed 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index 0c6639ea03dd..ba05275e5104 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 51bd9669cb1f..f504c9b00c1b 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 8c46f2257098..7de02969ed7d 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index d3fff9e008a3..aec0fbdfe7f0 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index 1148140d08a1..dee6706038aa 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -13,6 +13,7 @@ #define EVDEV_BUFFER_SIZE 64 #include +#include #include #include #include diff --git a/drivers/input/input.c b/drivers/input/input.c index 16ec33f27c5d..c6f88ebb40c7 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/input/joydev.c b/drivers/input/joydev.c index 901b2525993e..b1bd6dd32286 100644 --- a/drivers/input/joydev.c +++ b/drivers/input/joydev.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/input/misc/uinput.c b/drivers/input/misc/uinput.c index c5a49aba418f..d3f57245420a 100644 --- a/drivers/input/misc/uinput.c +++ b/drivers/input/misc/uinput.c @@ -30,6 +30,7 @@ * - first public version */ #include +#include #include #include #include diff --git a/drivers/input/mousedev.c b/drivers/input/mousedev.c index 966b8868f792..a13d80f7da17 100644 --- a/drivers/input/mousedev.c +++ b/drivers/input/mousedev.c @@ -13,6 +13,7 @@ #define MOUSEDEV_MINORS 32 #define MOUSEDEV_MIX 31 +#include #include #include #include diff --git a/drivers/isdn/divert/divert_procfs.c b/drivers/isdn/divert/divert_procfs.c index 8b256a617c8a..3697c409bec6 100644 --- a/drivers/isdn/divert/divert_procfs.c +++ b/drivers/isdn/divert/divert_procfs.c @@ -16,6 +16,7 @@ #else #include #endif +#include #include #include #include "isdn_divert.h" diff --git a/drivers/media/dvb/dvb-core/dmxdev.c b/drivers/media/dvb/dvb-core/dmxdev.c index 516414983593..c37790ad92d0 100644 --- a/drivers/media/dvb/dvb-core/dmxdev.c +++ b/drivers/media/dvb/dvb-core/dmxdev.c @@ -20,6 +20,7 @@ * */ +#include #include #include #include diff --git a/drivers/media/dvb/dvb-core/dvb_demux.c b/drivers/media/dvb/dvb-core/dvb_demux.c index eef6d3616626..91c537bca8ad 100644 --- a/drivers/media/dvb/dvb-core/dvb_demux.c +++ b/drivers/media/dvb/dvb-core/dvb_demux.c @@ -21,6 +21,7 @@ * */ +#include #include #include #include diff --git a/drivers/media/radio/radio-cadet.c b/drivers/media/radio/radio-cadet.c index 8b1440136c45..482d0f3be5ff 100644 --- a/drivers/media/radio/radio-cadet.c +++ b/drivers/media/radio/radio-cadet.c @@ -38,6 +38,7 @@ #include /* V4L2 API defs */ #include #include +#include #include /* outb, outb_p */ #include #include diff --git a/drivers/media/video/cpia.c b/drivers/media/video/cpia.c index 43ab0adf3b61..2377313c041a 100644 --- a/drivers/media/video/cpia.c +++ b/drivers/media/video/cpia.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/mfd/ucb1400_core.c b/drivers/mfd/ucb1400_core.c index 2afc08006e6d..fa294b6d600a 100644 --- a/drivers/mfd/ucb1400_core.c +++ b/drivers/mfd/ucb1400_core.c @@ -21,6 +21,7 @@ */ #include +#include #include unsigned int ucb1400_adc_read(struct snd_ac97 *ac97, u16 adc_channel, diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c index c44367fea185..bf0f6520c6df 100644 --- a/drivers/usb/gadget/inode.c +++ b/drivers/usb/gadget/inode.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/xen/xenfs/xenbus.c b/drivers/xen/xenfs/xenbus.c index a9592d981b10..6c4269b836b7 100644 --- a/drivers/xen/xenfs/xenbus.c +++ b/drivers/xen/xenfs/xenbus.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index d11c51fc2a3f..2ca7a7cafdbf 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -8,8 +8,10 @@ * */ +#include #include #include +#include #include #include #include diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 0376ac66c44a..be4392ca2098 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/select.c b/fs/select.c index a201fc370223..fd38ce2e32e3 100644 --- a/fs/select.c +++ b/fs/select.c @@ -15,6 +15,7 @@ */ #include +#include #include #include #include diff --git a/include/linux/poll.h b/include/linux/poll.h index fa287f25138d..6673743946f7 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -6,10 +6,10 @@ #ifdef __KERNEL__ #include +#include #include #include #include -#include #include /* ~832 bytes of stack space used max in sys_select/sys_poll before allocating diff --git a/net/rfkill/core.c b/net/rfkill/core.c index dbeaf2983822..ba2efb960c60 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 61d92c328c16419fc96dc50dd16f8b8c695409ec Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 2 Oct 2009 19:11:56 -0400 Subject: Btrfs: fix deadlock on async thread startup The btrfs async worker threads are used for a wide variety of things, including processing bio end_io functions. This means that when the endio threads aren't running, the rest of the FS isn't able to do the final processing required to clear PageWriteback. The endio threads also try to exit as they become idle and start more as the work piles up. The problem is that starting more threads means kthreadd may need to allocate ram, and that allocation may wait until the global number of writeback pages on the system is below a certain limit. The result of that throttling is that end IO threads wait on kthreadd, who is waiting on IO to end, which will never happen. This commit fixes the deadlock by handing off thread startup to a dedicated thread. It also fixes a bug where the on-demand thread creation was creating far too many threads because it didn't take into account threads being started by other procs. Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 81 ++++++++++++++++++++++++++++++++++++++++++++----- fs/btrfs/async-thread.h | 10 ++++-- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 42 +++++++++++++------------ fs/btrfs/relocation.c | 4 +-- 5 files changed, 106 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 282ca085c2fb..c0861e781cdb 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -63,6 +63,51 @@ struct btrfs_worker_thread { int idle; }; +/* + * btrfs_start_workers uses kthread_run, which can block waiting for memory + * for a very long time. It will actually throttle on page writeback, + * and so it may not make progress until after our btrfs worker threads + * process all of the pending work structs in their queue + * + * This means we can't use btrfs_start_workers from inside a btrfs worker + * thread that is used as part of cleaning dirty memory, which pretty much + * involves all of the worker threads. + * + * Instead we have a helper queue who never has more than one thread + * where we scheduler thread start operations. This worker_start struct + * is used to contain the work and hold a pointer to the queue that needs + * another worker. + */ +struct worker_start { + struct btrfs_work work; + struct btrfs_workers *queue; +}; + +static void start_new_worker_func(struct btrfs_work *work) +{ + struct worker_start *start; + start = container_of(work, struct worker_start, work); + btrfs_start_workers(start->queue, 1); + kfree(start); +} + +static int start_new_worker(struct btrfs_workers *queue) +{ + struct worker_start *start; + int ret; + + start = kzalloc(sizeof(*start), GFP_NOFS); + if (!start) + return -ENOMEM; + + start->work.func = start_new_worker_func; + start->queue = queue; + ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work); + if (ret) + kfree(start); + return ret; +} + /* * helper function to move a thread onto the idle list after it * has finished some requests. @@ -118,11 +163,13 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker) goto out; workers->atomic_start_pending = 0; - if (workers->num_workers >= workers->max_workers) + if (workers->num_workers + workers->num_workers_starting >= + workers->max_workers) goto out; + workers->num_workers_starting += 1; spin_unlock_irqrestore(&workers->lock, flags); - btrfs_start_workers(workers, 1); + start_new_worker(workers); return; out: @@ -390,9 +437,11 @@ int btrfs_stop_workers(struct btrfs_workers *workers) /* * simple init on struct btrfs_workers */ -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, + struct btrfs_workers *async_helper) { workers->num_workers = 0; + workers->num_workers_starting = 0; INIT_LIST_HEAD(&workers->worker_list); INIT_LIST_HEAD(&workers->idle_list); INIT_LIST_HEAD(&workers->order_list); @@ -404,14 +453,15 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) workers->name = name; workers->ordered = 0; workers->atomic_start_pending = 0; - workers->atomic_worker_start = 0; + workers->atomic_worker_start = async_helper; } /* * starts new worker threads. This does not enforce the max worker * count in case you need to temporarily go past it. */ -int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +static int __btrfs_start_workers(struct btrfs_workers *workers, + int num_workers) { struct btrfs_worker_thread *worker; int ret = 0; @@ -444,6 +494,8 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) list_add_tail(&worker->worker_list, &workers->idle_list); worker->idle = 1; workers->num_workers++; + workers->num_workers_starting--; + WARN_ON(workers->num_workers_starting < 0); spin_unlock_irq(&workers->lock); } return 0; @@ -452,6 +504,14 @@ fail: return ret; } +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +{ + spin_lock_irq(&workers->lock); + workers->num_workers_starting += num_workers; + spin_unlock_irq(&workers->lock); + return __btrfs_start_workers(workers, num_workers); +} + /* * run through the list and find a worker thread that doesn't have a lot * to do right now. This can return null if we aren't yet at the thread @@ -461,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) { struct btrfs_worker_thread *worker; struct list_head *next; - int enforce_min = workers->num_workers < workers->max_workers; + int enforce_min; + + enforce_min = (workers->num_workers + workers->num_workers_starting) < + workers->max_workers; /* * if we find an idle thread, don't move it to the end of the @@ -509,15 +572,17 @@ again: worker = next_worker(workers); if (!worker) { - if (workers->num_workers >= workers->max_workers) { + if (workers->num_workers + workers->num_workers_starting >= + workers->max_workers) { goto fallback; } else if (workers->atomic_worker_start) { workers->atomic_start_pending = 1; goto fallback; } else { + workers->num_workers_starting++; spin_unlock_irqrestore(&workers->lock, flags); /* we're below the limit, start another worker */ - btrfs_start_workers(workers, 1); + __btrfs_start_workers(workers, 1); goto again; } } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index fc089b95ec14..5077746cf85e 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -64,6 +64,8 @@ struct btrfs_workers { /* current number of running workers */ int num_workers; + int num_workers_starting; + /* max number of workers allowed. changed by btrfs_start_workers */ int max_workers; @@ -78,9 +80,10 @@ struct btrfs_workers { /* * are we allowed to sleep while starting workers or are we required - * to start them at a later time? + * to start them at a later time? If we can't sleep, this indicates + * which queue we need to use to schedule thread creation. */ - int atomic_worker_start; + struct btrfs_workers *atomic_worker_start; /* list with all the work threads. The workers on the idle thread * may be actively servicing jobs, but they haven't yet hit the @@ -109,7 +112,8 @@ struct btrfs_workers { int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); int btrfs_stop_workers(struct btrfs_workers *workers); -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, + struct btrfs_workers *async_starter); int btrfs_requeue_work(struct btrfs_work *work); void btrfs_set_work_high_prio(struct btrfs_work *work); #endif diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8184f2feb2f3..1b920ffc6a59 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -907,6 +907,7 @@ struct btrfs_fs_info { * A third pool does submit_bio to avoid deadlocking with the other * two */ + struct btrfs_workers generic_worker; struct btrfs_workers workers; struct btrfs_workers delalloc_workers; struct btrfs_workers endio_workers; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 69dce50aabd2..9903f042765d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1746,21 +1746,22 @@ struct btrfs_root *open_ctree(struct super_block *sb, err = -EINVAL; goto fail_iput; } -printk("thread pool is %d\n", fs_info->thread_pool_size); - /* - * we need to start all the end_io workers up front because the - * queue work function gets called at interrupt time, and so it - * cannot dynamically grow. - */ + + btrfs_init_workers(&fs_info->generic_worker, + "genwork", 1, NULL); + btrfs_init_workers(&fs_info->workers, "worker", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->submit_workers, "submit", min_t(u64, fs_devices->num_devices, - fs_info->thread_pool_size)); + fs_info->thread_pool_size), + &fs_info->generic_worker); /* a higher idle thresh on the submit workers makes it much more * likely that bios will be send down in a sane order to the @@ -1774,15 +1775,20 @@ printk("thread pool is %d\n", fs_info->thread_pool_size); fs_info->delalloc_workers.idle_thresh = 2; fs_info->delalloc_workers.ordered = 1; - btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); + btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_workers, "endio", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_meta_write_workers, - "endio-meta-write", fs_info->thread_pool_size); + "endio-meta-write", fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); /* * endios are largely parallel and should have a very @@ -1794,12 +1800,8 @@ printk("thread pool is %d\n", fs_info->thread_pool_size); fs_info->endio_write_workers.idle_thresh = 2; fs_info->endio_meta_write_workers.idle_thresh = 2; - fs_info->endio_workers.atomic_worker_start = 1; - fs_info->endio_meta_workers.atomic_worker_start = 1; - fs_info->endio_write_workers.atomic_worker_start = 1; - fs_info->endio_meta_write_workers.atomic_worker_start = 1; - btrfs_start_workers(&fs_info->workers, 1); + btrfs_start_workers(&fs_info->generic_worker, 1); btrfs_start_workers(&fs_info->submit_workers, 1); btrfs_start_workers(&fs_info->delalloc_workers, 1); btrfs_start_workers(&fs_info->fixup_workers, 1); @@ -2012,6 +2014,7 @@ fail_chunk_root: free_extent_buffer(chunk_root->node); free_extent_buffer(chunk_root->commit_root); fail_sb_buffer: + btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); btrfs_stop_workers(&fs_info->workers); @@ -2437,6 +2440,7 @@ int close_ctree(struct btrfs_root *root) iput(fs_info->btree_inode); + btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); btrfs_stop_workers(&fs_info->workers); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 361ad323faac..cfcc93c93a7b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3518,7 +3518,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) BUG_ON(!rc->block_group); btrfs_init_workers(&rc->workers, "relocate", - fs_info->thread_pool_size); + fs_info->thread_pool_size, NULL); rc->extent_root = extent_root; btrfs_prepare_block_group_relocation(extent_root, rc->block_group); @@ -3701,7 +3701,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) mapping_tree_init(&rc->reloc_root_tree); INIT_LIST_HEAD(&rc->reloc_roots); btrfs_init_workers(&rc->workers, "relocate", - root->fs_info->thread_pool_size); + root->fs_info->thread_pool_size, NULL); rc->extent_root = root->fs_info->extent_root; set_reloc_control(rc); -- cgit v1.2.3 From 1cdda9b81ac0e6ee986f034fa02f221679e1c11a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 6 Oct 2009 10:04:28 -0400 Subject: Btrfs: fix possible softlockup in the allocator Like the cluster allocating stuff, we can lockup the box with the normal allocation path. This happens when we 1) Start to cache a block group that is severely fragmented, but has a decent amount of free space. 2) Start to commit a transaction 3) Have the commit try and empty out some of the delalloc inodes with extents that are relatively large. The inodes will not be able to make the allocations because they will ask for allocations larger than a contiguous area in the free space cache. So we will wait for more progress to be made on the block group, but since we're in a commit the caching kthread won't make any more progress and it already has enough free space that wait_block_group_cache_progress will just return. So, if we wait and fail to make the allocation the next time around, just loop and go to the next block group. This keeps us from getting stuck in a softlockup. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d119c0388af1..2f82fabd7011 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4028,6 +4028,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, int loop = 0; bool found_uncached_bg = false; bool failed_cluster_refill = false; + bool failed_alloc = false; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); @@ -4232,14 +4233,23 @@ refill_cluster: offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); - if (!offset && (cached || (!cached && - loop == LOOP_CACHING_NOWAIT))) { - goto loop; - } else if (!offset && (!cached && - loop > LOOP_CACHING_NOWAIT)) { + /* + * If we didn't find a chunk, and we haven't failed on this + * block group before, and this block group is in the middle of + * caching and we are ok with waiting, then go ahead and wait + * for progress to be made, and set failed_alloc to true. + * + * If failed_alloc is true then we've already waited on this + * block group once and should move on to the next block group. + */ + if (!offset && !failed_alloc && !cached && + loop > LOOP_CACHING_NOWAIT) { wait_block_group_cache_progress(block_group, - num_bytes + empty_size); + num_bytes + empty_size); + failed_alloc = true; goto have_block_group; + } else if (!offset) { + goto loop; } checks: search_start = stripe_align(root, offset); @@ -4287,6 +4297,7 @@ checks: break; loop: failed_cluster_refill = false; + failed_alloc = false; btrfs_put_block_group(block_group); } up_read(&space_info->groups_sem); -- cgit v1.2.3 From 316d315bffa4026f28085f6b24ebcebede370ac7 Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Tue, 6 Oct 2009 20:16:55 +0200 Subject: block: Seperate read and write statistics of in_flight requests v2 Commit a9327cac440be4d8333bba975cbbf76045096275 added seperate read and write statistics of in_flight requests. And exported the number of read and write requests in progress seperately through sysfs. But Corrado Zoccolo reported getting strange output from "iostat -kx 2". Global values for service time and utilization were garbage. For interval values, utilization was always 100%, and service time is higher than normal. So this was reverted by commit 0f78ab9899e9d6acb09d5465def618704255963b The problem was in part_round_stats_single(), I missed the following: if (now == part->stamp) return; - if (part->in_flight) { + if (part_in_flight(part)) { __part_stat_add(cpu, part, time_in_queue, part_in_flight(part) * (now - part->stamp)); __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); With this chunk included, the reported regression gets fixed. Signed-off-by: Nikanth Karthikesan -- Signed-off-by: Jens Axboe --- block/blk-core.c | 8 ++++---- block/blk-merge.c | 2 +- block/genhd.c | 4 +++- drivers/md/dm.c | 16 ++++++++++------ fs/partitions/check.c | 12 +++++++++++- include/linux/genhd.h | 21 ++++++++++++++------- 6 files changed, 43 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/block/blk-core.c b/block/blk-core.c index 73ecbed02605..ac0fa10f8fa5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -70,7 +70,7 @@ static void drive_stat_acct(struct request *rq, int new_io) part_stat_inc(cpu, part, merges[rw]); else { part_round_stats(cpu, part); - part_inc_in_flight(part); + part_inc_in_flight(part, rw); } part_stat_unlock(); @@ -1030,9 +1030,9 @@ static void part_round_stats_single(int cpu, struct hd_struct *part, if (now == part->stamp) return; - if (part->in_flight) { + if (part_in_flight(part)) { __part_stat_add(cpu, part, time_in_queue, - part->in_flight * (now - part->stamp)); + part_in_flight(part) * (now - part->stamp)); __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); } part->stamp = now; @@ -1739,7 +1739,7 @@ static void blk_account_io_done(struct request *req) part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); part_round_stats(cpu, part); - part_dec_in_flight(part); + part_dec_in_flight(part, rw); part_stat_unlock(); } diff --git a/block/blk-merge.c b/block/blk-merge.c index b0de8574fdc8..99cb5cf1f447 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -351,7 +351,7 @@ static void blk_account_io_merge(struct request *req) part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); part_round_stats(cpu, part); - part_dec_in_flight(part); + part_dec_in_flight(part, rq_data_dir(req)); part_stat_unlock(); } diff --git a/block/genhd.c b/block/genhd.c index 5a0861da324d..517e4332cb37 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -869,6 +869,7 @@ static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); +static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -888,6 +889,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_alignment_offset.attr, &dev_attr_capability.attr, &dev_attr_stat.attr, + &dev_attr_inflight.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -1053,7 +1055,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) part_stat_read(hd, merges[1]), (unsigned long long)part_stat_read(hd, sectors[1]), jiffies_to_msecs(part_stat_read(hd, ticks[1])), - hd->in_flight, + part_in_flight(hd), jiffies_to_msecs(part_stat_read(hd, io_ticks)), jiffies_to_msecs(part_stat_read(hd, time_in_queue)) ); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 23e76fe0d359..376f1ab48a24 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -130,7 +130,7 @@ struct mapped_device { /* * A list of ios that arrived while we were suspended. */ - atomic_t pending; + atomic_t pending[2]; wait_queue_head_t wait; struct work_struct work; struct bio_list deferred; @@ -453,13 +453,14 @@ static void start_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; int cpu; + int rw = bio_data_dir(io->bio); io->start_time = jiffies; cpu = part_stat_lock(); part_round_stats(cpu, &dm_disk(md)->part0); part_stat_unlock(); - dm_disk(md)->part0.in_flight = atomic_inc_return(&md->pending); + dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]); } static void end_io_acct(struct dm_io *io) @@ -479,8 +480,9 @@ static void end_io_acct(struct dm_io *io) * After this is decremented the bio must not be touched if it is * a barrier. */ - dm_disk(md)->part0.in_flight = pending = - atomic_dec_return(&md->pending); + dm_disk(md)->part0.in_flight[rw] = pending = + atomic_dec_return(&md->pending[rw]); + pending += atomic_read(&md->pending[rw^0x1]); /* nudge anyone waiting on suspend queue */ if (!pending) @@ -1785,7 +1787,8 @@ static struct mapped_device *alloc_dev(int minor) if (!md->disk) goto bad_disk; - atomic_set(&md->pending, 0); + atomic_set(&md->pending[0], 0); + atomic_set(&md->pending[1], 0); init_waitqueue_head(&md->wait); INIT_WORK(&md->work, dm_wq_work); init_waitqueue_head(&md->eventq); @@ -2088,7 +2091,8 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) break; } spin_unlock_irqrestore(q->queue_lock, flags); - } else if (!atomic_read(&md->pending)) + } else if (!atomic_read(&md->pending[0]) && + !atomic_read(&md->pending[1])) break; if (interruptible == TASK_INTERRUPTIBLE && diff --git a/fs/partitions/check.c b/fs/partitions/check.c index f38fee0311a7..7b685e10cbad 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -248,11 +248,19 @@ ssize_t part_stat_show(struct device *dev, part_stat_read(p, merges[WRITE]), (unsigned long long)part_stat_read(p, sectors[WRITE]), jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), - p->in_flight, + part_in_flight(p), jiffies_to_msecs(part_stat_read(p, io_ticks)), jiffies_to_msecs(part_stat_read(p, time_in_queue))); } +ssize_t part_inflight_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]); +} + #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -281,6 +289,7 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); +static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -292,6 +301,7 @@ static struct attribute *part_attrs[] = { &dev_attr_size.attr, &dev_attr_alignment_offset.attr, &dev_attr_stat.attr, + &dev_attr_inflight.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 7beaa21b3880..297df45ffd0a 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -98,7 +98,7 @@ struct hd_struct { int make_it_fail; #endif unsigned long stamp; - int in_flight; + int in_flight[2]; #ifdef CONFIG_SMP struct disk_stats *dkstats; #else @@ -322,18 +322,23 @@ static inline void free_part_stats(struct hd_struct *part) #define part_stat_sub(cpu, gendiskp, field, subnd) \ part_stat_add(cpu, gendiskp, field, -subnd) -static inline void part_inc_in_flight(struct hd_struct *part) +static inline void part_inc_in_flight(struct hd_struct *part, int rw) { - part->in_flight++; + part->in_flight[rw]++; if (part->partno) - part_to_disk(part)->part0.in_flight++; + part_to_disk(part)->part0.in_flight[rw]++; } -static inline void part_dec_in_flight(struct hd_struct *part) +static inline void part_dec_in_flight(struct hd_struct *part, int rw) { - part->in_flight--; + part->in_flight[rw]--; if (part->partno) - part_to_disk(part)->part0.in_flight--; + part_to_disk(part)->part0.in_flight[rw]--; +} + +static inline int part_in_flight(struct hd_struct *part) +{ + return part->in_flight[0] + part->in_flight[1]; } /* block/blk-core.c */ @@ -546,6 +551,8 @@ extern ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t part_inflight_show(struct device *dev, + struct device_attribute *attr, char *buf); #ifdef CONFIG_FAIL_MAKE_REQUEST extern ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf); -- cgit v1.2.3 From c5811dbdd26284d63c19fca618bd740dd10ad53d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 6 Oct 2009 15:40:15 -0400 Subject: NFS: Fix a default mount regression... With the recent spate of changes, the nfs protocol version will now default to 2 instead of 3, while the mount protocol version defaults to 3. The following patch should ensure the defaults are consistent with the previous defaults of vers=3,proto=tcp,mountvers=3,mountproto=tcp. This fixes the bug http://bugzilla.kernel.org/show_bug.cgi?id=14259 Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 29786d3b9326..0343ebcfbbdc 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -728,22 +728,24 @@ static void nfs_umount_begin(struct super_block *sb) unlock_kernel(); } -static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(int flags) +static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version) { struct nfs_parsed_mount_data *data; data = kzalloc(sizeof(*data), GFP_KERNEL); if (data) { - data->flags = flags; data->rsize = NFS_MAX_FILE_IO_SIZE; data->wsize = NFS_MAX_FILE_IO_SIZE; data->acregmin = NFS_DEF_ACREGMIN; data->acregmax = NFS_DEF_ACREGMAX; data->acdirmin = NFS_DEF_ACDIRMIN; data->acdirmax = NFS_DEF_ACDIRMAX; + data->mount_server.port = NFS_UNSPEC_PORT; data->nfs_server.port = NFS_UNSPEC_PORT; + data->nfs_server.protocol = XPRT_TRANSPORT_TCP; data->auth_flavors[0] = RPC_AUTH_UNIX; data->auth_flavor_len = 1; + data->version = version; data->minorversion = 0; } return data; @@ -1711,8 +1713,6 @@ static int nfs_validate_mount_data(void *options, if (!(data->flags & NFS_MOUNT_TCP)) args->nfs_server.protocol = XPRT_TRANSPORT_UDP; - else - args->nfs_server.protocol = XPRT_TRANSPORT_TCP; /* N.B. caller will free nfs_server.hostname in all cases */ args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); args->namlen = data->namlen; @@ -2106,7 +2106,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, }; int error = -ENOMEM; - data = nfs_alloc_parsed_mount_data(NFS_MOUNT_VER3 | NFS_MOUNT_TCP); + data = nfs_alloc_parsed_mount_data(3); mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); if (data == NULL || mntfh == NULL) goto out_free_fh; @@ -2376,7 +2376,6 @@ static int nfs4_validate_mount_data(void *options, if (data == NULL) goto out_no_data; - args->version = 4; switch (data->version) { case 1: if (data->host_addrlen > sizeof(args->nfs_server.address)) @@ -2660,7 +2659,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type, struct nfs_parsed_mount_data *data; int error = -ENOMEM; - data = nfs_alloc_parsed_mount_data(0); + data = nfs_alloc_parsed_mount_data(4); if (data == NULL) goto out_free_data; -- cgit v1.2.3 From f5855fecda65c1965c894915ace1e086d4925154 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 6 Oct 2009 15:40:37 -0400 Subject: NFS: Fix port and mountport display in /proc/self/mountinfo Currently, the port and mount port will both display as 65535 if you do not specify a port number. That would be wrong... Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0343ebcfbbdc..0d14704c539e 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -778,15 +778,13 @@ static int nfs_verify_server_address(struct sockaddr *addr) * Select between a default port value and a user-specified port value. * If a zero value is set, then autobind will be used. */ -static void nfs_set_default_port(struct sockaddr *sap, const int parsed_port, +static void nfs_set_port(struct sockaddr *sap, int *port, const unsigned short default_port) { - unsigned short port = default_port; + if (*port == NFS_UNSPEC_PORT) + *port = default_port; - if (parsed_port != NFS_UNSPEC_PORT) - port = parsed_port; - - rpc_set_port(sap, port); + rpc_set_port(sap, *port); } /* @@ -1477,7 +1475,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, args->mount_server.addrlen = args->nfs_server.addrlen; } request.salen = args->mount_server.addrlen; - nfs_set_default_port(request.sap, args->mount_server.port, 0); + nfs_set_port(request.sap, &args->mount_server.port, 0); /* * Now ask the mount server to map our export path @@ -1767,7 +1765,7 @@ static int nfs_validate_mount_data(void *options, goto out_v4_not_compiled; #endif - nfs_set_default_port(sap, args->nfs_server.port, 0); + nfs_set_port(sap, &args->nfs_server.port, 0); nfs_set_mount_transport_protocol(args); @@ -2331,7 +2329,7 @@ static int nfs4_validate_text_mount_data(void *options, { struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; - nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT); + nfs_set_port(sap, &args->nfs_server.port, NFS_PORT); nfs_validate_transport_protocol(args); -- cgit v1.2.3 From bcd2ea17da6a329a7276cde7286d802f009af332 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 6 Oct 2009 15:41:22 -0400 Subject: NFS: Fix port initialisation in nfs_remount() The recent changeset 53a0b9c4c99ab0085a06421f71592722e5b3fd5f (NFS: Replace nfs_parse_ip_address() with rpc_pton()) broke nfs_remount, since the call to rpc_pton() will zero out the port number in data->nfs_server.address. This is actually due to a bug in nfs_remount: it should be looking at the port number in nfs_server.port instead... This fixes bug http://bugzilla.kernel.org/show_bug.cgi?id=14276 Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0d14704c539e..fb3b280cacfe 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1846,9 +1846,10 @@ nfs_compare_remount_data(struct nfs_server *nfss, data->acdirmin != nfss->acdirmin / HZ || data->acdirmax != nfss->acdirmax / HZ || data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) || + data->nfs_server.port != nfss->port || data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen || - memcmp(&data->nfs_server.address, &nfss->nfs_client->cl_addr, - data->nfs_server.addrlen) != 0) + !rpc_cmp_addr(&data->nfs_server.address, + &nfss->nfs_client->cl_addr)) return -EINVAL; return 0; @@ -1891,6 +1892,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) data->acdirmin = nfss->acdirmin / HZ; data->acdirmax = nfss->acdirmax / HZ; data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ; + data->nfs_server.port = nfss->port; data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr, data->nfs_server.addrlen); -- cgit v1.2.3 From f4373bf9e67e4a653c8854acd7b02dac9714c98a Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 6 Oct 2009 15:42:18 -0400 Subject: nfs: Avoid overrun when copying client IP address string As seen in , nfs4_init_client() can overrun the source string when copying the client IP address from nfs_parsed_mount_data::client_address to nfs_client::cl_ipaddr. Since these are both treated as null-terminated strings elsewhere, the copy should be done with strlcpy() not memcpy(). Signed-off-by: Ben Hutchings Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 63976c0ccc25..99ea196f071f 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1180,7 +1180,7 @@ static int nfs4_init_client(struct nfs_client *clp, 1, flags & NFS_MOUNT_NORESVPORT); if (error < 0) goto error; - memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); + strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); error = nfs_idmap_new(clp); if (error < 0) { -- cgit v1.2.3 From 517be09def6cd7bc231222ee756fde8ea245a6fe Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 6 Oct 2009 15:42:20 -0400 Subject: NFSv4: Fix the referral mount code Fix a typo which causes try_location() to use the wrong length argument when calling nfs_parse_server_name(). This again, causes the initialisation of the mount's sockaddr structure to fail. Also ensure that if nfs4_pathname_string() returns an error, then we pass that error back up the stack instead of ENOENT. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4namespace.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 2636c26d56fa..fa3408f20112 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -121,7 +121,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); if (IS_ERR(mnt_path)) - return mnt; + return ERR_CAST(mnt_path); mountdata->mnt_path = mnt_path; maxbuflen = mnt_path - 1 - page2; @@ -132,15 +132,15 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, if (buf->len <= 0 || buf->len >= maxbuflen) continue; - mountdata->addr = (struct sockaddr *)&addr; - if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) continue; - mountdata->addrlen = nfs_parse_server_name(buf->data, - buf->len, - mountdata->addr, mountdata->addrlen); + + mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, + (struct sockaddr *)&addr, sizeof(addr)); if (mountdata->addrlen == 0) continue; + + mountdata->addr = (struct sockaddr *)&addr; rpc_set_port(mountdata->addr, NFS_PORT); memcpy(page2, buf->data, buf->len); -- cgit v1.2.3 From 4055e97318809638a57fbe1746b93bc7a90ef0d3 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 7 Oct 2009 16:32:24 -0700 Subject: fs: includecheck fix: proc, kcore.c fix the following 'make includecheck' warning: fs/proc/kcore.c: linux/mm.h is included more than once. Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 56013371f9f3..a44a7897fd4d 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From 253fb02d62571e5455eedc9e39b9d660e86a40f0 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 7 Oct 2009 16:32:27 -0700 Subject: pagemap: export KPF_HWPOISON This flag indicates a hardware detected memory corruption on the page. Any future access of the page data may bring down the machine. Signed-off-by: Wu Fengguang Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page-types.c | 2 ++ Documentation/vm/pagemap.txt | 4 ++++ fs/proc/page.c | 5 +++++ 3 files changed, 11 insertions(+) (limited to 'fs') diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c index fa1a30d9e9d5..87f57228f56e 100644 --- a/Documentation/vm/page-types.c +++ b/Documentation/vm/page-types.c @@ -69,6 +69,7 @@ #define KPF_COMPOUND_TAIL 16 #define KPF_HUGE 17 #define KPF_UNEVICTABLE 18 +#define KPF_HWPOISON 19 #define KPF_NOPAGE 20 /* [32-] kernel hacking assistances */ @@ -116,6 +117,7 @@ static char *page_flag_names[] = { [KPF_COMPOUND_TAIL] = "T:compound_tail", [KPF_HUGE] = "G:huge", [KPF_UNEVICTABLE] = "u:unevictable", + [KPF_HWPOISON] = "X:hwpoison", [KPF_NOPAGE] = "n:nopage", [KPF_RESERVED] = "r:reserved", diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt index 600a304a828c..2fdd84a19109 100644 --- a/Documentation/vm/pagemap.txt +++ b/Documentation/vm/pagemap.txt @@ -57,6 +57,7 @@ There are three components to pagemap: 16. COMPOUND_TAIL 16. HUGE 18. UNEVICTABLE + 19. HWPOISON 20. NOPAGE Short descriptions to the page flags: @@ -86,6 +87,9 @@ Short descriptions to the page flags: 17. HUGE this is an integral part of a HugeTLB page +19. HWPOISON + hardware detected memory corruption on this page: don't touch the data! + 20. NOPAGE no page frame exists at the requested address diff --git a/fs/proc/page.c b/fs/proc/page.c index 2281c2cbfe2b..5033ce0d254b 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -94,6 +94,7 @@ static const struct file_operations proc_kpagecount_operations = { #define KPF_COMPOUND_TAIL 16 #define KPF_HUGE 17 #define KPF_UNEVICTABLE 18 +#define KPF_HWPOISON 19 #define KPF_NOPAGE 20 #define KPF_KSM 21 @@ -180,6 +181,10 @@ static u64 get_uflags(struct page *page) u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); +#ifdef CONFIG_MEMORY_FAILURE + u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); +#endif + #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); #endif -- cgit v1.2.3 From 3050141bae57984dd660e6861632ccf9b8bca77e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 8 Oct 2009 11:50:55 -0400 Subject: NFSv4: Kill nfs4_renewd_prepare_shutdown() The NFSv4 renew daemon is shared between all active super blocks that refer to a particular NFS server, so it is wrong to be shutting it down in nfs4_kill_super every time a super block is destroyed. This patch therefore kills nfs4_renewd_prepare_shutdown altogether, and leaves it up to nfs4_shutdown_client() to also shut down the renew daemon by means of the existing call to nfs4_kill_renewd(). Signed-off-by: Trond Myklebust --- fs/nfs/nfs4renewd.c | 6 ------ fs/nfs/super.c | 1 - 2 files changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index e27c6cef18f2..0156c01c212c 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -126,12 +126,6 @@ nfs4_schedule_state_renewal(struct nfs_client *clp) spin_unlock(&clp->cl_lock); } -void -nfs4_renewd_prepare_shutdown(struct nfs_server *server) -{ - cancel_delayed_work(&server->nfs_client->cl_renewd); -} - void nfs4_kill_renewd(struct nfs_client *clp) { diff --git a/fs/nfs/super.c b/fs/nfs/super.c index fb3b280cacfe..6dabf6feec94 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2689,7 +2689,6 @@ static void nfs4_kill_super(struct super_block *sb) dprintk("--> %s\n", __func__); nfs_super_return_all_delegations(sb); kill_anon_super(sb); - nfs4_renewd_prepare_shutdown(server); nfs_fscache_release_super_cookie(sb); nfs_free_server(server); dprintk("<-- %s\n", __func__); -- cgit v1.2.3 From 664fc5a4e7d0d7f3487e5c856b79f7dac79567fd Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 28 Sep 2009 13:34:20 -0700 Subject: ecryptfs: depends on CRYPTO ecryptfs uses crypto APIs so it should depend on CRYPTO. Otherwise many build errors occur. [63 lines not pasted] Signed-off-by: Randy Dunlap Cc: Andrew Morton Cc: ecryptfs-devel@lists.launchpad.net Signed-off-by: Tyler Hicks --- fs/ecryptfs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index 8aadb99b7634..89ebed3a4869 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -1,6 +1,6 @@ config ECRYPT_FS tristate "eCrypt filesystem layer support (EXPERIMENTAL)" - depends on EXPERIMENTAL && KEYS && NET + depends on EXPERIMENTAL && KEYS && NET && CRYPTO select CRYPTO_ECB select CRYPTO_CBC help -- cgit v1.2.3 From ed1f21857e76a92a006e0f890a3d7f72953b1469 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Tue, 29 Sep 2009 02:33:59 -0500 Subject: eCryptfs: Remove Kconfig NET dependency and select MD5 eCryptfs no longer uses a netlink interface to communicate with ecryptfsd, so NET is not a valid dependency anymore. MD5 is required and must be built for eCryptfs to be of any use. Signed-off-by: Tyler Hicks --- fs/ecryptfs/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index 89ebed3a4869..1cd6d9d3e29a 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -1,8 +1,9 @@ config ECRYPT_FS tristate "eCrypt filesystem layer support (EXPERIMENTAL)" - depends on EXPERIMENTAL && KEYS && NET && CRYPTO + depends on EXPERIMENTAL && KEYS && CRYPTO select CRYPTO_ECB select CRYPTO_CBC + select CRYPTO_MD5 help Encrypted filesystem that operates on the VFS layer. See to learn more about -- cgit v1.2.3 From 36520be8e32b49bd85a63b7b8b40cd07c3da59a5 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Mon, 5 Oct 2009 14:25:44 -0400 Subject: ima: ecryptfs fix imbalance message The unencrypted files are being measured. Update the counters to get rid of the ecryptfs imbalance message. (http://bugzilla.redhat.com/519737) Reported-by: Sachin Garg Cc: Eric Paris Cc: Dustin Kirkland Cc: James Morris Cc: David Safford Cc: stable@kernel.org Signed-off-by: Mimi Zohar Signed-off-by: Tyler Hicks --- fs/ecryptfs/main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 101fe4c7b1ee..c6ac85d6c701 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "ecryptfs_kernel.h" /** @@ -118,6 +119,7 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) const struct cred *cred = current_cred(); struct ecryptfs_inode_info *inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); + int opened_lower_file = 0; int rc = 0; mutex_lock(&inode_info->lower_file_mutex); @@ -134,9 +136,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) "for lower_dentry [0x%p] and lower_mnt [0x%p]; " "rc = [%d]\n", lower_dentry, lower_mnt, rc); inode_info->lower_file = NULL; - } + } else + opened_lower_file = 1; } mutex_unlock(&inode_info->lower_file_mutex); + if (opened_lower_file) + ima_counts_get(inode_info->lower_file); return rc; } -- cgit v1.2.3 From f9581b1443abac50c90168301d40a7734b13a5dc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Oct 2009 20:29:26 +0000 Subject: xfs: implement ->dirty_inode to fix timestamp handling This is picking up on Felix's repost of Dave's patch to implement a .dirty_inode method. We really need this notification because the VFS keeps writing directly into the inode structure instead of going through methods to update this state. In addition to the long-known atime issue we now also have a caller in VM code that updates c/mtime that way for shared writeable mmaps. And I found another one that no one has noticed in practice in the FIFO code. So implement ->dirty_inode to set i_update_core whenever the inode gets externally dirtied, and switch the c/mtime handling to the same scheme we already use for atime (always picking up the value from the Linux inode). Note that this patch also removes the xfs_synchronize_atime call in xfs_reclaim it was superflous as we already synchronize the time when writing the inode via the log (xfs_inode_item_format) or the normal buffers (xfs_iflush_int). In addition also remove the I_CLEAR check before copying the Linux timestamps - now that we always have the Linux inode available we can always use the timestamps in it. Also switch to just using file_update_time for regular reads/writes - that will get us all optimization done to it for free and make sure we notice early when it breaks. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_aops.c | 1 - fs/xfs/linux-2.6/xfs_iops.c | 41 +++++++++++++++-------------------------- fs/xfs/linux-2.6/xfs_lrw.c | 2 +- fs/xfs/linux-2.6/xfs_super.c | 23 +++++++++++++++++++++++ fs/xfs/xfs_dfrag.c | 8 ++++---- fs/xfs/xfs_inode.c | 4 ++-- fs/xfs/xfs_inode.h | 2 +- fs/xfs/xfs_inode_item.c | 18 +++++++++++------- fs/xfs/xfs_itable.c | 21 +++++++++++++-------- fs/xfs/xfs_vnodeops.c | 6 ------ 10 files changed, 70 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index d5e5559e31db..40d2226060d1 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -215,7 +215,6 @@ xfs_setfilesize( if (ip->i_d.di_size < isize) { ip->i_d.di_size = isize; - ip->i_update_core = 1; xfs_mark_inode_dirty_sync(ip); } diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 626b474b2cdd..cdf7114d41fc 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -57,19 +57,22 @@ #include /* - * Bring the atime in the XFS inode uptodate. - * Used before logging the inode to disk or when the Linux inode goes away. + * Bring the timestamps in the XFS inode uptodate. + * + * Used before writing the inode to disk. */ void -xfs_synchronize_atime( +xfs_synchronize_times( xfs_inode_t *ip) { struct inode *inode = VFS_I(ip); - if (!(inode->i_state & I_CLEAR)) { - ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; - ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; - } + ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; + ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; + ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec; + ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec; } /* @@ -106,32 +109,20 @@ xfs_ichgtime( if ((flags & XFS_ICHGTIME_MOD) && !timespec_equal(&inode->i_mtime, &tv)) { inode->i_mtime = tv; - ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; - ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; sync_it = 1; } if ((flags & XFS_ICHGTIME_CHG) && !timespec_equal(&inode->i_ctime, &tv)) { inode->i_ctime = tv; - ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec; - ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec; sync_it = 1; } /* - * We update the i_update_core field _after_ changing - * the timestamps in order to coordinate properly with - * xfs_iflush() so that we don't lose timestamp updates. - * This keeps us from having to hold the inode lock - * while doing this. We use the SYNCHRONIZE macro to - * ensure that the compiler does not reorder the update - * of i_update_core above the timestamp updates above. + * Update complete - now make sure everyone knows that the inode + * is dirty. */ - if (sync_it) { - SYNCHRONIZE(); - ip->i_update_core = 1; + if (sync_it) xfs_mark_inode_dirty_sync(ip); - } } /* @@ -514,10 +505,8 @@ xfs_vn_getattr( stat->gid = ip->i_d.di_gid; stat->ino = ip->i_ino; stat->atime = inode->i_atime; - stat->mtime.tv_sec = ip->i_d.di_mtime.t_sec; - stat->mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; - stat->ctime.tv_sec = ip->i_d.di_ctime.t_sec; - stat->ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; stat->blocks = XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 49e4a6aea73c..072050f8d346 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -667,7 +667,7 @@ start: xip->i_new_size = new_size; if (likely(!(ioflags & IO_INVIS))) - xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + file_update_time(file); /* * If the offset is beyond the size of the file, we have a couple diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 5d7c60ac77b4..1ea65b6e5ab6 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -976,6 +976,28 @@ xfs_fs_inode_init_once( mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); } +/* + * Dirty the XFS inode when mark_inode_dirty_sync() is called so that + * we catch unlogged VFS level updates to the inode. Care must be taken + * here - the transaction code calls mark_inode_dirty_sync() to mark the + * VFS inode dirty in a transaction and clears the i_update_core field; + * it must clear the field after calling mark_inode_dirty_sync() to + * correctly indicate that the dirty state has been propagated into the + * inode log item. + * + * We need the barrier() to maintain correct ordering between unlogged + * updates and the transaction commit code that clears the i_update_core + * field. This requires all updates to be completed before marking the + * inode dirty. + */ +STATIC void +xfs_fs_dirty_inode( + struct inode *inode) +{ + barrier(); + XFS_I(inode)->i_update_core = 1; +} + /* * Attempt to flush the inode, this will actually fail * if the inode is pinned, but we dirty the inode again @@ -1539,6 +1561,7 @@ xfs_fs_get_sb( static struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, + .dirty_inode = xfs_fs_dirty_inode, .write_inode = xfs_fs_write_inode, .clear_inode = xfs_fs_clear_inode, .put_super = xfs_fs_put_super, diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 7465f9ee125f..ab89a7e94a0f 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -206,10 +206,10 @@ xfs_swap_extents( * process that the file was not changed out from * under it. */ - if ((sbp->bs_ctime.tv_sec != ip->i_d.di_ctime.t_sec) || - (sbp->bs_ctime.tv_nsec != ip->i_d.di_ctime.t_nsec) || - (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) || - (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) { + if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || + (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || + (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || + (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { error = XFS_ERROR(EBUSY); goto out_unlock; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c1dc7ef5a1d8..b92a4fa2a0a1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3068,9 +3068,9 @@ xfs_iflush_int( SYNCHRONIZE(); /* - * Make sure to get the latest atime from the Linux inode. + * Make sure to get the latest timestamps from the Linux inode. */ - xfs_synchronize_atime(ip); + xfs_synchronize_times(ip); if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 0b38b9a869ec..41555de1d1db 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -504,7 +504,7 @@ void xfs_ichgtime(xfs_inode_t *, int); void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); -void xfs_synchronize_atime(xfs_inode_t *); +void xfs_synchronize_times(xfs_inode_t *); void xfs_mark_inode_dirty_sync(xfs_inode_t *); #if defined(XFS_INODE_TRACE) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 47d5b663c37e..9794b876d6ff 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -231,6 +231,15 @@ xfs_inode_item_format( vecp++; nvecs = 1; + /* + * Make sure the linux inode is dirty. We do this before + * clearing i_update_core as the VFS will call back into + * XFS here and set i_update_core, so we need to dirty the + * inode first so that the ordering of i_update_core and + * unlogged modifications still works as described below. + */ + xfs_mark_inode_dirty_sync(ip); + /* * Clear i_update_core if the timestamps (or any other * non-transactional modification) need flushing/logging @@ -263,14 +272,9 @@ xfs_inode_item_format( } /* - * Make sure to get the latest atime from the Linux inode. + * Make sure to get the latest timestamps from the Linux inode. */ - xfs_synchronize_atime(ip); - - /* - * make sure the linux inode is dirty - */ - xfs_mark_inode_dirty_sync(ip); + xfs_synchronize_times(ip); vecp->i_addr = (xfs_caddr_t)&ip->i_d; vecp->i_len = sizeof(struct xfs_icdinode); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index b68f9107e26c..62efab2f3839 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -59,6 +59,7 @@ xfs_bulkstat_one_iget( { xfs_icdinode_t *dic; /* dinode core info pointer */ xfs_inode_t *ip; /* incore inode pointer */ + struct inode *inode; int error; error = xfs_iget(mp, NULL, ino, @@ -72,6 +73,7 @@ xfs_bulkstat_one_iget( ASSERT(ip->i_imap.im_blkno != 0); dic = &ip->i_d; + inode = VFS_I(ip); /* xfs_iget returns the following without needing * further change. @@ -83,16 +85,19 @@ xfs_bulkstat_one_iget( buf->bs_uid = dic->di_uid; buf->bs_gid = dic->di_gid; buf->bs_size = dic->di_size; + /* - * We are reading the atime from the Linux inode because the - * dinode might not be uptodate. + * We need to read the timestamps from the Linux inode because + * the VFS keeps writing directly into the inode structure instead + * of telling us about the updates. */ - buf->bs_atime.tv_sec = VFS_I(ip)->i_atime.tv_sec; - buf->bs_atime.tv_nsec = VFS_I(ip)->i_atime.tv_nsec; - buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; - buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; - buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; - buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec; + buf->bs_atime.tv_sec = inode->i_atime.tv_sec; + buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec; + buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec; + buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec; + buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec; + buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec; + buf->bs_xflags = xfs_ip2xflags(ip); buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog; buf->bs_extents = dic->di_nextents; diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index a434f287962d..b572f7e840e0 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2475,12 +2475,6 @@ xfs_reclaim( ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); - /* - * Make sure the atime in the XFS inode is correct before freeing the - * Linux inode. - */ - xfs_synchronize_atime(ip); - /* * If we have nothing to flush with this inode then complete the * teardown now, otherwise break the link between the xfs inode and the -- cgit v1.2.3 From c90b07e8dd9f263d2e2af1d9648ba269f4d0d8fd Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 6 Oct 2009 20:29:27 +0000 Subject: xfs: fix xfs_quiesce_data We need to do a synchronous xfs_sync_fsdata to make sure the superblock actually is on disk when we return. Also remove SYNC_BDFLUSH flag to xfs_sync_inodes because that particular flag is never checked. Move xfs_filestream_flush call later to only release inodes after they have been written out. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_sync.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 320be6aea492..7647b8db3ca3 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -419,14 +419,16 @@ xfs_quiesce_data( /* push non-blocking */ xfs_sync_data(mp, 0); xfs_qm_sync(mp, SYNC_TRYLOCK); - xfs_filestream_flush(mp); - /* push and block */ + /* push and block till complete */ xfs_sync_data(mp, SYNC_WAIT); xfs_qm_sync(mp, SYNC_WAIT); + /* drop inode references pinned by filestreams */ + xfs_filestream_flush(mp); + /* write superblock and hoover up shutdown errors */ - error = xfs_sync_fsdata(mp, 0); + error = xfs_sync_fsdata(mp, SYNC_WAIT); /* flush data-only devices */ if (mp->m_rtdev_targp) -- cgit v1.2.3 From 69961a26b82931601e07c9e3656bd90c598f2395 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Oct 2009 20:29:28 +0000 Subject: xfs: cleanup ->sync_fs Sort out ->sync_fs to not perform a superblock writeback for the wait = 0 case as that is just an optional first pass and the superblock will be written back properly in the next call with wait = 1. Instead perform an opportunistic quota writeback to have less work later. Also remove the freeze special case as we do a proper wait = 1 call in the freeze code anyway. Also rename the function to xfs_fs_sync_fs to match the normal naming convention, update comments and avoid calling into the laptop_mode logic on an error. Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_super.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 1ea65b6e5ab6..eefea0d9d03d 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1148,7 +1148,7 @@ xfs_fs_put_super( } STATIC int -xfs_fs_sync_super( +xfs_fs_sync_fs( struct super_block *sb, int wait) { @@ -1156,23 +1156,23 @@ xfs_fs_sync_super( int error; /* - * Treat a sync operation like a freeze. This is to work - * around a race in sync_inodes() which works in two phases - * - an asynchronous flush, which can write out an inode - * without waiting for file size updates to complete, and a - * synchronous flush, which wont do anything because the - * async flush removed the inode's dirty flag. Also - * sync_inodes() will not see any files that just have - * outstanding transactions to be flushed because we don't - * dirty the Linux inode until after the transaction I/O - * completes. + * Not much we can do for the first async pass. Writing out the + * superblock would be counter-productive as we are going to redirty + * when writing out other data and metadata (and writing out a single + * block is quite fast anyway). + * + * Try to asynchronously kick off quota syncing at least. */ - if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE)) - error = xfs_quiesce_data(mp); - else - error = xfs_sync_fsdata(mp, 0); + if (!wait) { + xfs_qm_sync(mp, SYNC_TRYLOCK); + return 0; + } + + error = xfs_quiesce_data(mp); + if (error) + return -error; - if (unlikely(laptop_mode)) { + if (laptop_mode) { int prev_sync_seq = mp->m_sync_seq; /* @@ -1191,7 +1191,7 @@ xfs_fs_sync_super( mp->m_sync_seq != prev_sync_seq); } - return -error; + return 0; } STATIC int @@ -1565,7 +1565,7 @@ static struct super_operations xfs_super_operations = { .write_inode = xfs_fs_write_inode, .clear_inode = xfs_fs_clear_inode, .put_super = xfs_fs_put_super, - .sync_fs = xfs_fs_sync_super, + .sync_fs = xfs_fs_sync_fs, .freeze_fs = xfs_fs_freeze, .statfs = xfs_fs_statfs, .remount_fs = xfs_fs_remount, -- cgit v1.2.3 From 932640e8adaff3d0c28ee32d4863c7a7a74df055 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 6 Oct 2009 20:29:29 +0000 Subject: xfs: mark inodes dirty before issuing I/O To make sure they get properly waited on in sync when I/O is in flight and we latter need to update the inode size. Requires a new helper to check if an ioend structure is beyond the current EOF. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_aops.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 40d2226060d1..6a2910ee8a28 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -185,6 +185,24 @@ xfs_destroy_ioend( mempool_free(ioend, xfs_ioend_pool); } +/* + * If the end of the current ioend is beyond the current EOF, + * return the new EOF value, otherwise zero. + */ +STATIC xfs_fsize_t +xfs_ioend_new_eof( + xfs_ioend_t *ioend) +{ + xfs_inode_t *ip = XFS_I(ioend->io_inode); + xfs_fsize_t isize; + xfs_fsize_t bsize; + + bsize = ioend->io_offset + ioend->io_size; + isize = MAX(ip->i_size, ip->i_new_size); + isize = MIN(isize, bsize); + return isize > ip->i_d.di_size ? isize : 0; +} + /* * Update on-disk file size now that data has been written to disk. * The current in-memory file size is i_size. If a write is beyond @@ -192,13 +210,13 @@ xfs_destroy_ioend( * updated. If this write does not extend all the way to the valid * file size then restrict this update to the end of the write. */ + STATIC void xfs_setfilesize( xfs_ioend_t *ioend) { xfs_inode_t *ip = XFS_I(ioend->io_inode); xfs_fsize_t isize; - xfs_fsize_t bsize; ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); ASSERT(ioend->io_type != IOMAP_READ); @@ -206,14 +224,9 @@ xfs_setfilesize( if (unlikely(ioend->io_error)) return; - bsize = ioend->io_offset + ioend->io_size; - xfs_ilock(ip, XFS_ILOCK_EXCL); - - isize = MAX(ip->i_size, ip->i_new_size); - isize = MIN(isize, bsize); - - if (ip->i_d.di_size < isize) { + isize = xfs_ioend_new_eof(ioend); + if (isize) { ip->i_d.di_size = isize; xfs_mark_inode_dirty_sync(ip); } @@ -403,10 +416,16 @@ xfs_submit_ioend_bio( struct bio *bio) { atomic_inc(&ioend->io_remaining); - bio->bi_private = ioend; bio->bi_end_io = xfs_end_bio; + /* + * If the I/O is beyond EOF we mark the inode dirty immediately + * but don't update the inode size until I/O completion. + */ + if (xfs_ioend_new_eof(ioend)) + xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode)); + submit_bio(WRITE, bio); ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP)); bio_put(bio); -- cgit v1.2.3 From dce5065a57c158e0ca5db8e63c50118b2ecf4ac5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 6 Oct 2009 20:29:30 +0000 Subject: xfs: make sure xfs_sync_fsdata covers the log We want to always cover the log after writing out the superblock, and in case of a synchronous writeout make sure we actually wait for the log to be covered. That way a filesystem that has been sync()ed can be considered clean by log recovery. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_sync.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 7647b8db3ca3..961df0a22c78 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -309,11 +309,15 @@ xfs_sync_attr( STATIC int xfs_commit_dummy_trans( struct xfs_mount *mp, - uint log_flags) + uint flags) { struct xfs_inode *ip = mp->m_rootip; struct xfs_trans *tp; int error; + int log_flags = XFS_LOG_FORCE; + + if (flags & SYNC_WAIT) + log_flags |= XFS_LOG_SYNC; /* * Put a dummy transaction in the log to tell recovery @@ -331,13 +335,12 @@ xfs_commit_dummy_trans( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - /* XXX(hch): ignoring the error here.. */ error = xfs_trans_commit(tp, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* the log force ensures this transaction is pushed to disk */ xfs_log_force(mp, 0, log_flags); - return 0; + return error; } int @@ -385,7 +388,20 @@ xfs_sync_fsdata( else XFS_BUF_ASYNC(bp); - return xfs_bwrite(mp, bp); + error = xfs_bwrite(mp, bp); + if (error) + return error; + + /* + * If this is a data integrity sync make sure all pending buffers + * are flushed out for the log coverage check below. + */ + if (flags & SYNC_WAIT) + xfs_flush_buftarg(mp->m_ddev_targp, 1); + + if (xfs_log_need_covered(mp)) + error = xfs_commit_dummy_trans(mp, flags); + return error; out_brelse: xfs_buf_relse(bp); @@ -572,8 +588,6 @@ xfs_sync_worker( /* dgc: errors ignored here */ error = xfs_qm_sync(mp, SYNC_TRYLOCK); error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); - if (xfs_log_need_covered(mp)) - error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE); } mp->m_sync_seq++; wake_up(&mp->m_wait_single_sync_task); -- cgit v1.2.3 From 8e69ce147127a0235e8d1f2b75ea214be78c61b3 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 25 Sep 2009 19:42:26 +0000 Subject: fix readahead calculations in xfs_dir2_leaf_getdents() This is for bug #850, http://oss.sgi.com/bugzilla/show_bug.cgi?id=850 XFS file system segfaults , repeatedly and 100% reproducable in 2.6.30 , 2.6.31 The above only showed up on a CONFIG_XFS_DEBUG=y kernel, because xfs_bmapi() ASSERTs that it has been asked for at least one map, and it was getting 0. The root cause is that our guesstimated "bufsize" from xfs_file_readdir was fairly small, and the bufsize -= length; in the loop was going negative - except bufsize is a size_t, so it was wrapping to a very large number. Then when we did ra_want = howmany(bufsize + mp->m_dirblksize, mp->m_sb.sb_blocksize) - 1; with that very large number, the (int) ra_want was coming out negative, and a subsequent compare: if (1 + ra_want > map_blocks ... was coming out -true- (negative int compare w/ uint) and we went back to xfs_bmapi() for more, even though we did not need more, and asked for 0 maps, and hit the ASSERT. We have kind of a type mess here, but just keeping bufsize from going negative is probably sufficient to avoid the problem. Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/xfs_dir2_leaf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index fa913e459442..41ad537c49e9 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -854,6 +854,7 @@ xfs_dir2_leaf_getdents( */ ra_want = howmany(bufsize + mp->m_dirblksize, mp->m_sb.sb_blocksize) - 1; + ASSERT(ra_want >= 0); /* * If we don't have as many as we want, and we haven't @@ -1088,7 +1089,8 @@ xfs_dir2_leaf_getdents( */ ptr += length; curoff += length; - bufsize -= length; + /* bufsize may have just been a guess; don't go negative */ + bufsize = bufsize > length ? bufsize - length : 0; } /* -- cgit v1.2.3 From d0800703febc04827b8fa91921aa4e254d01e8d1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 26 Sep 2009 19:55:04 +0000 Subject: xfs: stop calling filemap_fdatawait inside ->fsync Now that the VFS actually waits for the data I/O to complete before calling into ->fsync we can stop doing it ourselves. Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_file.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 988d8f87bc0f..6d65baf15a14 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -176,14 +176,7 @@ xfs_file_fsync( struct dentry *dentry, int datasync) { - struct inode *inode = dentry->d_inode; - struct xfs_inode *ip = XFS_I(inode); - int error; - - /* capture size updates in I/O completion before writing the inode. */ - error = filemap_fdatawait(inode->i_mapping); - if (error) - return error; + struct xfs_inode *ip = XFS_I(dentry->d_inode); xfs_iflags_clear(ip, XFS_ITRUNCATED); return -xfs_fsync(ip); -- cgit v1.2.3 From a791e35e12ff672e8a0e140abeeaf900c3b2ea77 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 8 Oct 2009 11:27:10 -0400 Subject: Btrfs: cleanup extent_clear_unlock_delalloc flags extent_clear_unlock_delalloc has a growing set of ugly parameters that is very difficult to read and maintain. This switches to a flag field and well named flag defines. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 27 +++++++++++---------------- fs/btrfs/extent_io.h | 16 ++++++++++------ fs/btrfs/inode.c | 45 ++++++++++++++++++++++++++++++--------------- 3 files changed, 51 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index de1793ba004a..f9708bd01669 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1401,12 +1401,7 @@ out_failed: int extent_clear_unlock_delalloc(struct inode *inode, struct extent_io_tree *tree, u64 start, u64 end, struct page *locked_page, - int unlock_pages, - int clear_unlock, - int clear_delalloc, int clear_dirty, - int set_writeback, - int end_writeback, - int set_private2) + unsigned long op) { int ret; struct page *pages[16]; @@ -1416,17 +1411,17 @@ int extent_clear_unlock_delalloc(struct inode *inode, int i; int clear_bits = 0; - if (clear_unlock) + if (op & EXTENT_CLEAR_UNLOCK) clear_bits |= EXTENT_LOCKED; - if (clear_dirty) + if (op & EXTENT_CLEAR_DIRTY) clear_bits |= EXTENT_DIRTY; - if (clear_delalloc) + if (op & EXTENT_CLEAR_DELALLOC) clear_bits |= EXTENT_DELALLOC; clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); - if (!(unlock_pages || clear_dirty || set_writeback || end_writeback || - set_private2)) + if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK | EXTENT_SET_PRIVATE2))) return 0; while (nr_pages > 0) { @@ -1435,20 +1430,20 @@ int extent_clear_unlock_delalloc(struct inode *inode, nr_pages, ARRAY_SIZE(pages)), pages); for (i = 0; i < ret; i++) { - if (set_private2) + if (op & EXTENT_SET_PRIVATE2) SetPagePrivate2(pages[i]); if (pages[i] == locked_page) { page_cache_release(pages[i]); continue; } - if (clear_dirty) + if (op & EXTENT_CLEAR_DIRTY) clear_page_dirty_for_io(pages[i]); - if (set_writeback) + if (op & EXTENT_SET_WRITEBACK) set_page_writeback(pages[i]); - if (end_writeback) + if (op & EXTENT_END_WRITEBACK) end_page_writeback(pages[i]); - if (unlock_pages) + if (op & EXTENT_CLEAR_UNLOCK_PAGE) unlock_page(pages[i]); page_cache_release(pages[i]); } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4794ec891fed..41d2a47ecf38 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -25,6 +25,15 @@ #define EXTENT_BUFFER_BLOCKING 1 #define EXTENT_BUFFER_DIRTY 2 +/* these are flags for extent_clear_unlock_delalloc */ +#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 +#define EXTENT_CLEAR_UNLOCK 0x2 +#define EXTENT_CLEAR_DELALLOC 0x4 +#define EXTENT_CLEAR_DIRTY 0x8 +#define EXTENT_SET_WRITEBACK 0x10 +#define EXTENT_END_WRITEBACK 0x20 +#define EXTENT_SET_PRIVATE2 0x40 + /* * page->private values. Every page that is controlled by the extent * map has page->private set to one. @@ -288,10 +297,5 @@ int extent_range_uptodate(struct extent_io_tree *tree, int extent_clear_unlock_delalloc(struct inode *inode, struct extent_io_tree *tree, u64 start, u64 end, struct page *locked_page, - int unlock_page, - int clear_unlock, - int clear_delalloc, int clear_dirty, - int set_writeback, - int end_writeback, - int set_private2); + unsigned long op); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3a6f953337b5..a7058fca3aab 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -424,9 +424,10 @@ again: * and free up our temp pages. */ extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, NULL, 1, 0, - 0, 1, 1, 1, 0); + &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); ret = 0; goto free_pages_out; } @@ -637,11 +638,13 @@ static noinline int submit_compressed_extents(struct inode *inode, * clear dirty, set writeback and unlock the pages. */ extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - async_extent->start, - async_extent->start + - async_extent->ram_size - 1, - NULL, 1, 1, 0, 1, 1, 0, 0); + &BTRFS_I(inode)->io_tree, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + NULL, EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK); ret = btrfs_submit_compressed_write(inode, async_extent->start, @@ -712,9 +715,14 @@ static noinline int cow_file_range(struct inode *inode, start, end, 0, NULL); if (ret == 0) { extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, NULL, 1, 1, - 1, 1, 1, 1, 0); + &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); *nr_written = *nr_written + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; *page_started = 1; @@ -738,6 +746,8 @@ static noinline int cow_file_range(struct inode *inode, btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); while (disk_num_bytes > 0) { + unsigned long op; + cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); ret = btrfs_reserve_extent(trans, root, cur_alloc_size, root->sectorsize, 0, alloc_hint, @@ -789,10 +799,13 @@ static noinline int cow_file_range(struct inode *inode, * Do set the Private2 bit so we know this page was properly * setup for writepage */ + op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0; + op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | + EXTENT_SET_PRIVATE2; + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, start, start + ram_size - 1, - locked_page, unlock, 1, - 1, 0, 0, 0, 1); + locked_page, op); disk_num_bytes -= cur_alloc_size; num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; @@ -1112,8 +1125,10 @@ out_check: BUG_ON(ret); extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - cur_offset, cur_offset + num_bytes - 1, - locked_page, 1, 1, 1, 0, 0, 0, 1); + cur_offset, cur_offset + num_bytes - 1, + locked_page, EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | + EXTENT_SET_PRIVATE2); cur_offset = extent_end; if (cur_offset > end) break; -- cgit v1.2.3 From a3429ab70b04363c6190964e82f04f44f3e34cf0 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 8 Oct 2009 12:30:20 -0400 Subject: Btrfs: delay clearing EXTENT_DELALLOC for compressed extents When compression is on, the cow_file_range code is farmed off to worker threads. This allows us to do significant CPU work in parallel on SMP machines. But it is a delicate balance around when we clear flags and how. In the past we cleared the delalloc flag immediately, which was safe because the pages stayed locked. But this is causing problems with the newest ENOSPC code, and with the recent extent state cleanups we can now clear the delalloc bit at the same time the uncompressed code does. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a7058fca3aab..401dfb2a94e8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -427,6 +427,7 @@ again: &BTRFS_I(inode)->io_tree, start, end, NULL, EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | + EXTENT_CLEAR_DELALLOC | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); ret = 0; goto free_pages_out; @@ -644,6 +645,7 @@ static noinline int submit_compressed_extents(struct inode *inode, async_extent->ram_size - 1, NULL, EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK); ret = btrfs_submit_compressed_write(inode, @@ -877,8 +879,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, u64 cur_end; int limit = 10 * 1024 * 1042; - clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | - EXTENT_DELALLOC, 1, 0, NULL, GFP_NOFS); + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, + 1, 0, NULL, GFP_NOFS); while (start < end) { async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); async_cow->inode = inode; -- cgit v1.2.3 From 32c00aff718bb54a214b39146bdd9ac01511cd25 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 8 Oct 2009 13:34:05 -0400 Subject: Btrfs: release delalloc reservations on extent item insertion This patch fixes an issue with the delalloc metadata space reservation code. The problem is we used to free the reservation as soon as we allocated the delalloc region. The problem with this is if we are not inserting an inline extent, we don't actually insert the extent item until after the ordered extent is written out. This patch does 3 things, 1) It moves the reservation clearing stuff into the ordered code, so when we remove the ordered extent we remove the reservation. 2) It adds a EXTENT_DO_ACCOUNTING flag that gets passed when we clear delalloc bits in the cases where we want to clear the metadata reservation when we clear the delalloc extent, in the case that we do an inline extent or we invalidate the page. 3) It adds another waitqueue to the space info so that when we start a fs wide delalloc flush, anybody else who also hits that area will simply wait for the flush to finish and then try to make their allocation. This has been tested thoroughly to make sure we did not regress on performance. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 12 ++++++----- fs/btrfs/ctree.h | 3 +++ fs/btrfs/extent-tree.c | 54 +++++++++++++++++++++++++++++++++++++++---------- fs/btrfs/extent_io.c | 19 +++++++++++------ fs/btrfs/extent_io.h | 2 ++ fs/btrfs/file.c | 3 ++- fs/btrfs/inode.c | 45 ++++++++++++++++++++++++++++------------- fs/btrfs/ordered-data.c | 6 ++++++ 8 files changed, 107 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index a54d354cefcb..c71abec0ab90 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -128,12 +128,14 @@ struct btrfs_inode { u64 last_unlink_trans; /* - * These two counters are for delalloc metadata reservations. We keep - * track of how many extents we've accounted for vs how many extents we - * have. + * Counters to keep track of the number of extent item's we may use due + * to delalloc and such. outstanding_extents is the number of extent + * items we think we'll end up using, and reserved_extents is the number + * of extent items we've reserved metadata for. */ - int delalloc_reserved_extents; - int delalloc_extents; + spinlock_t accounting_lock; + int reserved_extents; + int outstanding_extents; /* * ordered_data_close is set by truncate when a file that used diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1b920ffc6a59..dbdada569507 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -699,6 +699,9 @@ struct btrfs_space_info { int allocating_chunk; wait_queue_head_t wait; + + int flushing; + wait_queue_head_t flush_wait; }; /* diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2f82fabd7011..3d1be0b77f8f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2823,14 +2823,17 @@ int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, num_items); spin_lock(&meta_sinfo->lock); - if (BTRFS_I(inode)->delalloc_reserved_extents <= - BTRFS_I(inode)->delalloc_extents) { + spin_lock(&BTRFS_I(inode)->accounting_lock); + if (BTRFS_I(inode)->reserved_extents <= + BTRFS_I(inode)->outstanding_extents) { + spin_unlock(&BTRFS_I(inode)->accounting_lock); spin_unlock(&meta_sinfo->lock); return 0; } + spin_unlock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->delalloc_reserved_extents--; - BUG_ON(BTRFS_I(inode)->delalloc_reserved_extents < 0); + BTRFS_I(inode)->reserved_extents--; + BUG_ON(BTRFS_I(inode)->reserved_extents < 0); if (meta_sinfo->bytes_delalloc < num_bytes) { bug = true; @@ -2863,6 +2866,37 @@ static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) meta_sinfo->force_delalloc = 0; } +static void flush_delalloc(struct btrfs_root *root, + struct btrfs_space_info *info) +{ + bool wait = false; + + spin_lock(&info->lock); + + if (!info->flushing) { + info->flushing = 1; + init_waitqueue_head(&info->flush_wait); + } else { + wait = true; + } + + spin_unlock(&info->lock); + + if (wait) { + wait_event(info->flush_wait, + !info->flushing); + return; + } + + btrfs_start_delalloc_inodes(root); + btrfs_wait_ordered_extents(root, 0); + + spin_lock(&info->lock); + info->flushing = 0; + spin_unlock(&info->lock); + wake_up(&info->flush_wait); +} + static int maybe_allocate_chunk(struct btrfs_root *root, struct btrfs_space_info *info) { @@ -2980,21 +3014,20 @@ again: filemap_flush(inode->i_mapping); goto again; } else if (flushed == 3) { - btrfs_start_delalloc_inodes(root); - btrfs_wait_ordered_extents(root, 0); + flush_delalloc(root, meta_sinfo); goto again; } spin_lock(&meta_sinfo->lock); meta_sinfo->bytes_delalloc -= num_bytes; spin_unlock(&meta_sinfo->lock); printk(KERN_ERR "enospc, has %d, reserved %d\n", - BTRFS_I(inode)->delalloc_extents, - BTRFS_I(inode)->delalloc_reserved_extents); + BTRFS_I(inode)->outstanding_extents, + BTRFS_I(inode)->reserved_extents); dump_space_info(meta_sinfo, 0, 0); return -ENOSPC; } - BTRFS_I(inode)->delalloc_reserved_extents++; + BTRFS_I(inode)->reserved_extents++; check_force_delalloc(meta_sinfo); spin_unlock(&meta_sinfo->lock); @@ -3093,8 +3126,7 @@ again: } if (retries == 2) { - btrfs_start_delalloc_inodes(root); - btrfs_wait_ordered_extents(root, 0); + flush_delalloc(root, meta_sinfo); goto again; } spin_lock(&meta_sinfo->lock); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f9708bd01669..96577e8bf9fd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -460,7 +460,8 @@ static int clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, int bits, int wake, int delete) { - int ret = state->state & bits; + int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; + int ret = state->state & bits_to_clear; if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; @@ -468,7 +469,7 @@ static int clear_state_bit(struct extent_io_tree *tree, tree->dirty_bytes -= range; } clear_state_cb(tree, state, bits); - state->state &= ~bits; + state->state &= ~bits_to_clear; if (wake) wake_up(&state->wq); if (delete || state->state == 0) { @@ -956,7 +957,8 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); } @@ -1419,9 +1421,13 @@ int extent_clear_unlock_delalloc(struct inode *inode, if (op & EXTENT_CLEAR_DELALLOC) clear_bits |= EXTENT_DELALLOC; + if (op & EXTENT_CLEAR_ACCOUNTING) + clear_bits |= EXTENT_DO_ACCOUNTING; + clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); - if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK | EXTENT_SET_PRIVATE2))) + if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | + EXTENT_SET_PRIVATE2))) return 0; while (nr_pages > 0) { @@ -2709,7 +2715,8 @@ int extent_invalidatepage(struct extent_io_tree *tree, lock_extent(tree, start, end, GFP_NOFS); wait_on_page_writeback(page); clear_extent_bit(tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS); return 0; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 41d2a47ecf38..36de250a7b2b 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -15,6 +15,7 @@ #define EXTENT_BUFFER_FILLED (1 << 8) #define EXTENT_BOUNDARY (1 << 9) #define EXTENT_NODATASUM (1 << 10) +#define EXTENT_DO_ACCOUNTING (1 << 11) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) /* flags for bio submission */ @@ -33,6 +34,7 @@ #define EXTENT_SET_WRITEBACK 0x10 #define EXTENT_END_WRITEBACK 0x20 #define EXTENT_SET_PRIVATE2 0x40 +#define EXTENT_CLEAR_ACCOUNTING 0x80 /* * page->private values. Every page that is controlled by the extent diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f155179877a6..53fb1c997f0e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -878,7 +878,8 @@ again: btrfs_put_ordered_extent(ordered); clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, - last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC, + last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, GFP_NOFS); unlock_extent(&BTRFS_I(inode)->io_tree, start_pos, last_pos - 1, GFP_NOFS); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 401dfb2a94e8..ccc4f1121210 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -428,6 +428,7 @@ again: start, end, NULL, EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_ACCOUNTING | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); ret = 0; goto free_pages_out; @@ -722,6 +723,7 @@ static noinline int cow_file_range(struct inode *inode, EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_ACCOUNTING | EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); @@ -1195,15 +1197,17 @@ static int btrfs_split_extent_hook(struct inode *inode, root->fs_info->max_extent); /* - * if we break a large extent up then leave delalloc_extents be, - * since we've already accounted for the large extent. + * if we break a large extent up then leave oustanding_extents + * be, since we've already accounted for the large extent. */ if (div64_u64(new_size + root->fs_info->max_extent - 1, root->fs_info->max_extent) < num_extents) return 0; } - BTRFS_I(inode)->delalloc_extents++; + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->accounting_lock); return 0; } @@ -1234,7 +1238,9 @@ static int btrfs_merge_extent_hook(struct inode *inode, /* we're not bigger than the max, unreserve the space and go */ if (new_size <= root->fs_info->max_extent) { - BTRFS_I(inode)->delalloc_extents--; + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); return 0; } @@ -1248,7 +1254,9 @@ static int btrfs_merge_extent_hook(struct inode *inode, root->fs_info->max_extent) > num_extents) return 0; - BTRFS_I(inode)->delalloc_extents--; + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); return 0; } @@ -1270,7 +1278,9 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; - BTRFS_I(inode)->delalloc_extents++; + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->accounting_lock); btrfs_delalloc_reserve_space(root, inode, end - start + 1); spin_lock(&root->fs_info->delalloc_lock); BTRFS_I(inode)->delalloc_bytes += end - start + 1; @@ -1298,8 +1308,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; - BTRFS_I(inode)->delalloc_extents--; - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + if (bits & EXTENT_DO_ACCOUNTING) { + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + } spin_lock(&root->fs_info->delalloc_lock); if (state->end - state->start + 1 > @@ -4825,7 +4839,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) */ clear_extent_bit(tree, page_start, page_end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, + NULL, GFP_NOFS); /* * whoever cleared the private bit is responsible * for the finish_ordered_io @@ -4838,8 +4853,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) lock_extent(tree, page_start, page_end, GFP_NOFS); } clear_extent_bit(tree, page_start, page_end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, - 1, 1, NULL, GFP_NOFS); + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS); __btrfs_releasepage(page, GFP_NOFS); ClearPageChecked(page); @@ -4934,7 +4949,8 @@ again: * prepare_pages in the normal write path. */ clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, - EXTENT_DIRTY | EXTENT_DELALLOC, GFP_NOFS); + EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, + GFP_NOFS); ret = btrfs_set_extent_delalloc(inode, page_start, page_end); if (ret) { @@ -5082,8 +5098,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) return NULL; ei->last_trans = 0; ei->logged_trans = 0; - ei->delalloc_extents = 0; - ei->delalloc_reserved_extents = 0; + ei->outstanding_extents = 0; + ei->reserved_extents = 0; + spin_lock_init(&ei->accounting_lock); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->i_orphan); INIT_LIST_HEAD(&ei->ordered_operations); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 4a9c8c4cec25..ab21c29f2247 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -306,6 +306,12 @@ int btrfs_remove_ordered_extent(struct inode *inode, tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root, + inode, 1); + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); list_del_init(&entry->root_extent_list); -- cgit v1.2.3 From e3ccfa989752c083ceb23c823a84f7ce3a081e61 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 7 Oct 2009 20:44:34 -0400 Subject: Btrfs: async delalloc flushing under space pressure This patch moves the delalloc flushing that occurs when we are under space pressure off to a async thread pool. This helps since we only free up metadata space when we actually insert the extent item, which means it takes quite a while for space to be free'ed up if we wait on all ordered extents. However, if space is freed up due to inline extents being inserted, we can wake people who are waiting up early, and they can finish their work. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 13 ++++---- fs/btrfs/disk-io.c | 6 ++++ fs/btrfs/extent-tree.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 88 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index dbdada569507..a362dd617e97 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -691,17 +691,17 @@ struct btrfs_space_info { struct list_head list; + /* for controlling how we free up space for allocations */ + wait_queue_head_t allocate_wait; + wait_queue_head_t flush_wait; + int allocating_chunk; + int flushing; + /* for block groups in our same type */ struct list_head block_groups; spinlock_t lock; struct rw_semaphore groups_sem; atomic_t caching_threads; - - int allocating_chunk; - wait_queue_head_t wait; - - int flushing; - wait_queue_head_t flush_wait; }; /* @@ -918,6 +918,7 @@ struct btrfs_fs_info { struct btrfs_workers endio_meta_write_workers; struct btrfs_workers endio_write_workers; struct btrfs_workers submit_workers; + struct btrfs_workers enospc_workers; /* * fixup workers take dirty pages that didn't properly go through * the cow mechanism and make them safe to write. It happens diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9903f042765d..ac8927bdc33d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1762,6 +1762,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, min_t(u64, fs_devices->num_devices, fs_info->thread_pool_size), &fs_info->generic_worker); + btrfs_init_workers(&fs_info->enospc_workers, "enospc", + fs_info->thread_pool_size, + &fs_info->generic_worker); /* a higher idle thresh on the submit workers makes it much more * likely that bios will be send down in a sane order to the @@ -1809,6 +1812,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_start_workers(&fs_info->endio_meta_workers, 1); btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); btrfs_start_workers(&fs_info->endio_write_workers, 1); + btrfs_start_workers(&fs_info->enospc_workers, 1); fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, @@ -2023,6 +2027,7 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); + btrfs_stop_workers(&fs_info->enospc_workers); fail_iput: invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); @@ -2449,6 +2454,7 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); + btrfs_stop_workers(&fs_info->enospc_workers); btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3d1be0b77f8f..53026806ae9e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2866,9 +2866,66 @@ static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) meta_sinfo->force_delalloc = 0; } +struct async_flush { + struct btrfs_root *root; + struct btrfs_space_info *info; + struct btrfs_work work; +}; + +static noinline void flush_delalloc_async(struct btrfs_work *work) +{ + struct async_flush *async; + struct btrfs_root *root; + struct btrfs_space_info *info; + + async = container_of(work, struct async_flush, work); + root = async->root; + info = async->info; + + btrfs_start_delalloc_inodes(root); + wake_up(&info->flush_wait); + btrfs_wait_ordered_extents(root, 0); + + spin_lock(&info->lock); + info->flushing = 0; + spin_unlock(&info->lock); + wake_up(&info->flush_wait); + + kfree(async); +} + +static void wait_on_flush(struct btrfs_space_info *info) +{ + DEFINE_WAIT(wait); + u64 used; + + while (1) { + prepare_to_wait(&info->flush_wait, &wait, + TASK_UNINTERRUPTIBLE); + spin_lock(&info->lock); + if (!info->flushing) { + spin_unlock(&info->lock); + break; + } + + used = info->bytes_used + info->bytes_reserved + + info->bytes_pinned + info->bytes_readonly + + info->bytes_super + info->bytes_root + + info->bytes_may_use + info->bytes_delalloc; + if (used < info->total_bytes) { + spin_unlock(&info->lock); + break; + } + spin_unlock(&info->lock); + schedule(); + } + finish_wait(&info->flush_wait, &wait); +} + static void flush_delalloc(struct btrfs_root *root, struct btrfs_space_info *info) { + struct async_flush *async; bool wait = false; spin_lock(&info->lock); @@ -2883,11 +2940,24 @@ static void flush_delalloc(struct btrfs_root *root, spin_unlock(&info->lock); if (wait) { - wait_event(info->flush_wait, - !info->flushing); + wait_on_flush(info); return; } + async = kzalloc(sizeof(*async), GFP_NOFS); + if (!async) + goto flush; + + async->root = root; + async->info = info; + async->work.func = flush_delalloc_async; + + btrfs_queue_worker(&root->fs_info->enospc_workers, + &async->work); + wait_on_flush(info); + return; + +flush: btrfs_start_delalloc_inodes(root); btrfs_wait_ordered_extents(root, 0); @@ -2927,7 +2997,7 @@ static int maybe_allocate_chunk(struct btrfs_root *root, if (!info->allocating_chunk) { info->force_alloc = 1; info->allocating_chunk = 1; - init_waitqueue_head(&info->wait); + init_waitqueue_head(&info->allocate_wait); } else { wait = true; } @@ -2935,7 +3005,7 @@ static int maybe_allocate_chunk(struct btrfs_root *root, spin_unlock(&info->lock); if (wait) { - wait_event(info->wait, + wait_event(info->allocate_wait, !info->allocating_chunk); return 1; } @@ -2956,7 +3026,7 @@ out: spin_lock(&info->lock); info->allocating_chunk = 0; spin_unlock(&info->lock); - wake_up(&info->wait); + wake_up(&info->allocate_wait); if (ret) return 0; -- cgit v1.2.3 From ff782e0a131c7f669445c07fe5c7ba91e043b7ed Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 8 Oct 2009 15:30:04 -0400 Subject: Btrfs: optimize fsync for the single writer case This patch optimizes the tree logging stuff so it doesn't always wait 1 jiffie for new people to join the logging transaction if there is only ever 1 writer. This helps a little bit with latency where we have something like RPM where it will fdatasync every file it writes, and so waiting the 1 jiffie for every fdatasync really starts to add up. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/tree-log.c | 12 +++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a362dd617e97..16ddb19dda86 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1010,6 +1010,8 @@ struct btrfs_root { atomic_t log_commit[2]; unsigned long log_transid; unsigned long log_batch; + pid_t log_start_pid; + bool log_multiple_pids; u64 objectid; u64 last_trans; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4d7d9abef42f..78f6254ac2d9 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -137,11 +137,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); if (root->log_root) { + if (!root->log_start_pid) { + root->log_start_pid = current->pid; + root->log_multiple_pids = false; + } else if (root->log_start_pid != current->pid) { + root->log_multiple_pids = true; + } + root->log_batch++; atomic_inc(&root->log_writers); mutex_unlock(&root->log_mutex); return 0; } + root->log_multiple_pids = false; + root->log_start_pid = current->pid; mutex_lock(&root->fs_info->tree_log_mutex); if (!root->fs_info->log_root_tree) { ret = btrfs_init_log_root_tree(trans, root->fs_info); @@ -1985,7 +1994,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (atomic_read(&root->log_commit[(index1 + 1) % 2])) wait_log_commit(trans, root, root->log_transid - 1); - while (1) { + while (root->log_multiple_pids) { unsigned long batch = root->log_batch; mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); @@ -2011,6 +2020,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, root->log_batch = 0; root->log_transid++; log->log_transid = root->log_transid; + root->log_start_pid = 0; smp_mb(); /* * log tree has been flushed to disk, new modifications of -- cgit v1.2.3 From efefb1438be269897585934fc6c05deb4dfa01ce Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 9 Oct 2009 09:25:16 -0400 Subject: Btrfs: remove negative dentry when deleting subvolumne The use of btrfs_dentry_delete is removing dentries from the dcache when deleting subvolumne. btrfs_dentry_delete ignores negative dentries. This is incorrect since if we don't remove the negative dentry, its parent dentry can't be removed. Signed-off-by: Yan Zheng Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 12 +++++++----- fs/btrfs/ioctl.c | 1 + 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ccc4f1121210..d3585cb6483c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3629,12 +3629,14 @@ static int btrfs_dentry_delete(struct dentry *dentry) { struct btrfs_root *root; - if (!dentry->d_inode) - return 0; + if (!dentry->d_inode && !IS_ROOT(dentry)) + dentry = dentry->d_parent; - root = BTRFS_I(dentry->d_inode)->root; - if (btrfs_root_refs(&root->root_item) == 0) - return 1; + if (dentry->d_inode) { + root = BTRFS_I(dentry->d_inode)->root; + if (btrfs_root_refs(&root->root_item) == 0) + return 1; + } return 0; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9a780c8d0ac8..e8795becad4c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -830,6 +830,7 @@ out_up_write: out_unlock: mutex_unlock(&inode->i_mutex); if (!err) { + shrink_dcache_sb(root->fs_info->sb); btrfs_invalidate_inodes(dest); d_delete(dentry); } -- cgit v1.2.3 From 94fcca9f8999e7828d5f4dc181daa39cad2af38a Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 9 Oct 2009 09:25:16 -0400 Subject: Btrfs: optimize back reference update during btrfs_drop_snapshot This patch reading level 0 tree blocks that already use full backrefs. Signed-off-by: Yan Zheng Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 82 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 53026806ae9e..4aedbff36b8f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4911,6 +4911,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, u64 bytenr; u64 generation; u64 refs; + u64 flags; u64 last = 0; u32 nritems; u32 blocksize; @@ -4948,15 +4949,19 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, generation <= root->root_key.offset) continue; + /* We don't lock the tree block, it's OK to be racy here */ + ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + &refs, &flags); + BUG_ON(ret); + BUG_ON(refs == 0); + if (wc->stage == DROP_REFERENCE) { - ret = btrfs_lookup_extent_info(trans, root, - bytenr, blocksize, - &refs, NULL); - BUG_ON(ret); - BUG_ON(refs == 0); if (refs == 1) goto reada; + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + continue; if (!wc->update_ref || generation <= root->root_key.offset) continue; @@ -4965,6 +4970,10 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, &wc->update_progress); if (ret < 0) continue; + } else { + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + continue; } reada: ret = readahead_tree_block(root, bytenr, blocksize, @@ -4988,7 +4997,7 @@ reada: static noinline int walk_down_proc(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct walk_control *wc) + struct walk_control *wc, int lookup_info) { int level = wc->level; struct extent_buffer *eb = path->nodes[level]; @@ -5003,8 +5012,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, * when reference count of tree block is 1, it won't increase * again. once full backref flag is set, we never clear it. */ - if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || - (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) { + if (lookup_info && + ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || + (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { BUG_ON(!path->locks[level]); ret = btrfs_lookup_extent_info(trans, root, eb->start, eb->len, @@ -5065,7 +5075,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, static noinline int do_walk_down(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct walk_control *wc) + struct walk_control *wc, int *lookup_info) { u64 bytenr; u64 generation; @@ -5085,8 +5095,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, * for the subtree */ if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) + generation <= root->root_key.offset) { + *lookup_info = 1; return 1; + } bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); blocksize = btrfs_level_size(root, level - 1); @@ -5099,14 +5111,19 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - if (wc->stage == DROP_REFERENCE) { - ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, - &wc->refs[level - 1], - &wc->flags[level - 1]); - BUG_ON(ret); - BUG_ON(wc->refs[level - 1] == 0); + ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + &wc->refs[level - 1], + &wc->flags[level - 1]); + BUG_ON(ret); + BUG_ON(wc->refs[level - 1] == 0); + *lookup_info = 0; + if (wc->stage == DROP_REFERENCE) { if (wc->refs[level - 1] > 1) { + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; + if (!wc->update_ref || generation <= root->root_key.offset) goto skip; @@ -5120,12 +5137,17 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, wc->stage = UPDATE_BACKREF; wc->shared_level = level - 1; } + } else { + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; } if (!btrfs_buffer_uptodate(next, generation)) { btrfs_tree_unlock(next); free_extent_buffer(next); next = NULL; + *lookup_info = 1; } if (!next) { @@ -5148,21 +5170,22 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, skip: wc->refs[level - 1] = 0; wc->flags[level - 1] = 0; + if (wc->stage == DROP_REFERENCE) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + parent = path->nodes[level]->start; + } else { + BUG_ON(root->root_key.objectid != + btrfs_header_owner(path->nodes[level])); + parent = 0; + } - if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { - parent = path->nodes[level]->start; - } else { - BUG_ON(root->root_key.objectid != - btrfs_header_owner(path->nodes[level])); - parent = 0; + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, + root->root_key.objectid, level - 1, 0); + BUG_ON(ret); } - - ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, - root->root_key.objectid, level - 1, 0); - BUG_ON(ret); - btrfs_tree_unlock(next); free_extent_buffer(next); + *lookup_info = 1; return 1; } @@ -5276,6 +5299,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, struct walk_control *wc) { int level = wc->level; + int lookup_info = 1; int ret; while (level >= 0) { @@ -5283,14 +5307,14 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, btrfs_header_nritems(path->nodes[level])) break; - ret = walk_down_proc(trans, root, path, wc); + ret = walk_down_proc(trans, root, path, wc, lookup_info); if (ret > 0) break; if (level == 0) break; - ret = do_walk_down(trans, root, path, wc); + ret = do_walk_down(trans, root, path, wc, &lookup_info); if (ret > 0) { path->slots[level]++; continue; -- cgit v1.2.3 From 82d339d9b3a6395f17d3253887653250b693b74b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 9 Oct 2009 09:54:36 -0400 Subject: Btrfs: constify dentry_operations Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 16ddb19dda86..36a19cd43e03 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2330,7 +2330,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_cleanup(struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t size); int btrfs_invalidate_inodes(struct btrfs_root *root); -extern struct dentry_operations btrfs_dentry_operations; +extern const struct dentry_operations btrfs_dentry_operations; /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d3585cb6483c..d960492d3d88 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5840,6 +5840,6 @@ static struct inode_operations btrfs_symlink_inode_operations = { .removexattr = btrfs_removexattr, }; -struct dentry_operations btrfs_dentry_operations = { +const struct dentry_operations btrfs_dentry_operations = { .d_delete = btrfs_dentry_delete, }; -- cgit v1.2.3 From e9061e214810c9534381a705a1b46533e09f2676 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 9 Oct 2009 09:57:45 -0400 Subject: Btrfs: fix uninit compiler warning in cow_file_range_nocow The extent_type variable was exposed uninit via a goto. It should be impossible to trigger because it is protected by a check on another variable, but this makes sure. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d960492d3d88..ef399a7794ff 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1023,6 +1023,7 @@ next_slot: if (found_key.offset > cur_offset) { extent_end = found_key.offset; + extent_type = 0; goto out_check; } -- cgit v1.2.3 From ac6889cbb254be1ffea376bea4a96ce9be0e0ed0 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 9 Oct 2009 11:29:53 -0400 Subject: Btrfs: fix file clone ioctl for bookend extents The file clone ioctl was incorrectly taking the offset into the extent on disk into account when calculating the length of the cloned extent. The length never changes based on the offset into the physical extent. Test case: fallocate -l 1g image mke2fs image bcp image image2 e2fsck -f image2 (errors on image2) The math bug ends up wrapping the length of the extent, and things go wrong from there. Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e8795becad4c..cdbb054102b9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1123,8 +1123,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, datao += off - key.offset; datal -= off - key.offset; } - if (key.offset + datao + datal > off + len) - datal = off + len - key.offset - datao; + + if (key.offset + datal > off + len) + datal = off + len - key.offset; + /* disko == 0 means it's a hole */ if (!disko) datao = 0; -- cgit v1.2.3 From d43c36dc6b357fa1806800f18aa30123c747a6d1 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 7 Oct 2009 17:09:06 +0400 Subject: headers: remove sched.h from interrupt.h After m68k's task_thread_info() doesn't refer to current, it's possible to remove sched.h from interrupt.h and not break m68k! Many thanks to Heiko Carstens for allowing this. Signed-off-by: Alexey Dobriyan --- arch/arm/kernel/time.c | 1 + arch/arm/mach-integrator/pci_v3.c | 1 + arch/arm/plat-s3c24xx/adc.c | 1 + arch/blackfin/kernel/time.c | 1 + arch/m32r/kernel/smp.c | 1 + arch/um/drivers/line.c | 1 + arch/um/drivers/port_kern.c | 1 + arch/um/kernel/irq.c | 1 + arch/x86/kernel/cpu/mcheck/mce_intel.c | 1 + arch/x86/kernel/pci-gart_64.c | 1 + arch/x86/kernel/reboot.c | 1 + arch/xtensa/kernel/time.c | 1 + crypto/aead.c | 1 + drivers/char/applicom.c | 1 + drivers/char/epca.c | 1 + drivers/char/generic_serial.c | 1 + drivers/char/istallion.c | 1 + drivers/char/nozomi.c | 1 + drivers/char/pty.c | 1 + drivers/char/rio/riocmd.c | 1 + drivers/char/rio/rioctrl.c | 1 + drivers/char/rio/riotty.c | 1 + drivers/char/ser_a2232.c | 1 + drivers/char/stallion.c | 1 + drivers/char/tlclk.c | 1 + drivers/hwmon/sht15.c | 1 + drivers/ieee1394/raw1394.c | 1 + drivers/ieee1394/video1394.c | 1 + drivers/infiniband/core/iwcm.c | 1 + drivers/infiniband/core/ucma.c | 1 + drivers/infiniband/hw/cxgb3/iwch_provider.c | 1 + drivers/infiniband/hw/cxgb3/iwch_qp.c | 1 + drivers/infiniband/hw/ipath/ipath_driver.c | 1 + drivers/infiniband/hw/ipath/ipath_iba7220.c | 1 + drivers/infiniband/hw/ipath/ipath_intr.c | 1 + drivers/infiniband/hw/ipath/ipath_qp.c | 1 + drivers/infiniband/hw/ipath/ipath_ruc.c | 1 + drivers/infiniband/hw/ipath/ipath_ud.c | 1 + drivers/infiniband/hw/ipath/ipath_user_pages.c | 1 + drivers/infiniband/hw/ipath/ipath_user_sdma.c | 1 + drivers/infiniband/hw/ipath/ipath_verbs_mcast.c | 1 + drivers/input/keyboard/hilkbd.c | 1 + drivers/input/keyboard/sunkbd.c | 1 + drivers/input/serio/libps2.c | 1 + drivers/input/serio/serio_raw.c | 1 + drivers/input/serio/serport.c | 1 + drivers/isdn/capi/kcapi.c | 1 + drivers/isdn/hisax/arcofi.c | 1 + drivers/isdn/hisax/hfc_2bds0.c | 1 + drivers/isdn/hisax/hfc_pci.c | 1 + drivers/isdn/hysdn/hysdn_procconf.c | 1 + drivers/isdn/hysdn/hysdn_proclog.c | 1 + drivers/isdn/pcbit/drv.c | 1 + drivers/isdn/pcbit/layer2.c | 1 + drivers/isdn/sc/init.c | 1 + drivers/lguest/interrupts_and_traps.c | 1 + drivers/media/dvb/dvb-core/dvb_net.c | 1 + drivers/media/video/meye.c | 1 + drivers/media/video/videobuf-core.c | 1 + drivers/media/video/videobuf-dma-sg.c | 1 + drivers/message/fusion/mptlan.c | 1 + drivers/mfd/ucb1x00-core.c | 1 + drivers/misc/hpilo.c | 1 + drivers/misc/ibmasm/command.c | 1 + drivers/misc/ibmasm/event.c | 1 + drivers/misc/ibmasm/r_heartbeat.c | 1 + drivers/misc/phantom.c | 1 + drivers/mtd/devices/m25p80.c | 1 + drivers/mtd/devices/sst25l.c | 1 + drivers/net/bonding/bond_sysfs.c | 1 + drivers/net/depca.c | 1 + drivers/net/e100.c | 1 + drivers/net/eql.c | 1 + drivers/net/ethoc.c | 1 + drivers/net/ewrk3.c | 1 + drivers/net/forcedeth.c | 1 + drivers/net/hamachi.c | 1 + drivers/net/hamradio/baycom_epp.c | 1 + drivers/net/hamradio/baycom_ser_fdx.c | 1 + drivers/net/hamradio/baycom_ser_hdx.c | 1 + drivers/net/hamradio/hdlcdrv.c | 1 + drivers/net/hp100.c | 1 + drivers/net/igb/igb_ethtool.c | 1 + drivers/net/irda/toim3232-sir.c | 1 + drivers/net/ns83820.c | 1 + drivers/net/pcnet32.c | 1 + drivers/net/sb1000.c | 1 + drivers/net/sis900.c | 1 + drivers/net/skfp/skfddi.c | 1 + drivers/net/skge.c | 1 + drivers/net/slip.c | 1 + drivers/net/sungem.c | 1 + drivers/net/tokenring/ibmtr.c | 1 + drivers/net/typhoon.c | 1 + drivers/net/wan/cosa.c | 1 + drivers/net/wan/cycx_x25.c | 1 + drivers/net/wan/dscc4.c | 1 + drivers/net/wan/farsync.c | 1 + drivers/net/wireless/b43/pio.c | 1 + drivers/net/wireless/b43legacy/main.c | 1 + drivers/net/wireless/b43legacy/phy.c | 1 + drivers/net/wireless/hostap/hostap_info.c | 1 + drivers/net/wireless/hostap/hostap_ioctl.c | 1 + drivers/net/wireless/ipw2x00/ipw2200.c | 1 + drivers/net/wireless/iwlwifi/iwl-3945.c | 1 + drivers/net/wireless/iwlwifi/iwl-4965.c | 1 + drivers/net/wireless/iwlwifi/iwl-5000.c | 1 + drivers/net/wireless/iwlwifi/iwl-agn.c | 1 + drivers/net/wireless/iwlwifi/iwl-core.c | 1 + drivers/net/wireless/iwlwifi/iwl-hcmd.c | 1 + drivers/net/wireless/iwlwifi/iwl-tx.c | 1 + drivers/net/wireless/iwlwifi/iwl3945-base.c | 1 + drivers/net/wireless/iwmc3200wifi/cfg80211.c | 1 + drivers/net/wireless/iwmc3200wifi/commands.c | 1 + drivers/net/wireless/iwmc3200wifi/main.c | 1 + drivers/net/wireless/iwmc3200wifi/rx.c | 1 + drivers/net/wireless/libertas/cmd.c | 1 + drivers/net/wireless/libertas/tx.c | 1 + drivers/net/wireless/prism54/isl_ioctl.c | 1 + drivers/net/wireless/prism54/islpci_dev.c | 1 + drivers/net/wireless/prism54/islpci_mgt.c | 1 + drivers/net/wireless/rt2x00/rt2x00debug.c | 1 + drivers/pci/pcie/aer/aerdrv.c | 1 + drivers/rtc/interface.c | 1 + drivers/rtc/rtc-dev.c | 1 + drivers/uio/uio.c | 1 + drivers/uwb/whc-rc.c | 1 + fs/file.c | 1 + include/linux/interrupt.h | 2 +- include/linux/mmc/host.h | 1 + kernel/irq/handle.c | 1 + kernel/mutex-debug.c | 1 + kernel/time/timekeeping.c | 1 + lib/debugobjects.c | 1 + lib/fault-inject.c | 1 + mm/vmalloc.c | 1 + net/irda/ircomm/ircomm_tty_attach.c | 1 + net/irda/irlan/irlan_common.c | 1 + net/irda/irlan/irlan_eth.c | 1 + net/irda/irnet/irnet_irda.c | 1 + net/irda/irnet/irnet_ppp.c | 1 + net/mac80211/rc80211_pid_debugfs.c | 1 + net/netfilter/nf_conntrack_core.c | 1 + net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 + net/wireless/core.c | 1 + 145 files changed, 145 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index 4cdc4a0bd02d..d38cdf2c8276 100644 --- a/arch/arm/kernel/time.c +++ b/arch/arm/kernel/time.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm/mach-integrator/pci_v3.c b/arch/arm/mach-integrator/pci_v3.c index 901cc205015e..148d25fc636f 100644 --- a/arch/arm/mach-integrator/pci_v3.c +++ b/arch/arm/mach-integrator/pci_v3.c @@ -31,6 +31,7 @@ #include #include +#include #include #include #include diff --git a/arch/arm/plat-s3c24xx/adc.c b/arch/arm/plat-s3c24xx/adc.c index 11117a7ba911..4d36b784fb8b 100644 --- a/arch/arm/plat-s3c24xx/adc.c +++ b/arch/arm/plat-s3c24xx/adc.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/blackfin/kernel/time.c b/arch/blackfin/kernel/time.c index e5069fe6861e..bd3b53da295e 100644 --- a/arch/blackfin/kernel/time.c +++ b/arch/blackfin/kernel/time.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/arch/m32r/kernel/smp.c b/arch/m32r/kernel/smp.c index 1b7598e6f6e8..8a88f1f0a3e2 100644 --- a/arch/m32r/kernel/smp.c +++ b/arch/m32r/kernel/smp.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c index 14a102e877d6..cf8a97f34518 100644 --- a/arch/um/drivers/line.c +++ b/arch/um/drivers/line.c @@ -5,6 +5,7 @@ #include "linux/irqreturn.h" #include "linux/kd.h" +#include "linux/sched.h" #include "chan_kern.h" #include "irq_kern.h" #include "irq_user.h" diff --git a/arch/um/drivers/port_kern.c b/arch/um/drivers/port_kern.c index 19930081d3d8..4ebc8a34738f 100644 --- a/arch/um/drivers/port_kern.c +++ b/arch/um/drivers/port_kern.c @@ -7,6 +7,7 @@ #include "linux/interrupt.h" #include "linux/list.h" #include "linux/mutex.h" +#include "linux/workqueue.h" #include "asm/atomic.h" #include "init.h" #include "irq_kern.h" diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 454cdb43e351..039270b9b73b 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -10,6 +10,7 @@ #include "linux/interrupt.h" #include "linux/kernel_stat.h" #include "linux/module.h" +#include "linux/sched.h" #include "linux/seq_file.h" #include "as-layout.h" #include "kern_util.h" diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 889f665fe93d..7c785634af2b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 98a827ee9ed7..a7f1b64f86e0 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 27349f92a6d7..a1a3cdda06e1 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c index 19085ff0484a..19f7df30937f 100644 --- a/arch/xtensa/kernel/time.c +++ b/arch/xtensa/kernel/time.c @@ -13,6 +13,7 @@ */ #include +#include #include #include #include diff --git a/crypto/aead.c b/crypto/aead.c index d9aa733db164..0a55da70845e 100644 --- a/crypto/aead.c +++ b/crypto/aead.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/drivers/char/applicom.c b/drivers/char/applicom.c index 73a0765344b6..fe2cb2f5db17 100644 --- a/drivers/char/applicom.c +++ b/drivers/char/applicom.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/epca.c b/drivers/char/epca.c index 9d589e3144de..dde5134713e2 100644 --- a/drivers/char/epca.c +++ b/drivers/char/epca.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/generic_serial.c b/drivers/char/generic_serial.c index 9e4e569dc00d..d400cbd280f2 100644 --- a/drivers/char/generic_serial.c +++ b/drivers/char/generic_serial.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/istallion.c b/drivers/char/istallion.c index ab2f3349c5c4..402838f4083e 100644 --- a/drivers/char/istallion.c +++ b/drivers/char/istallion.c @@ -19,6 +19,7 @@ /*****************************************************************************/ #include +#include #include #include #include diff --git a/drivers/char/nozomi.c b/drivers/char/nozomi.c index ec58d8c387ff..d3400b20444f 100644 --- a/drivers/char/nozomi.c +++ b/drivers/char/nozomi.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/pty.c b/drivers/char/pty.c index e066c4fdf81b..62f282e67638 100644 --- a/drivers/char/pty.c +++ b/drivers/char/pty.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/rio/riocmd.c b/drivers/char/rio/riocmd.c index 01f2654d5a2e..f121357e5af0 100644 --- a/drivers/char/rio/riocmd.c +++ b/drivers/char/rio/riocmd.c @@ -32,6 +32,7 @@ */ #include +#include #include #include #include diff --git a/drivers/char/rio/rioctrl.c b/drivers/char/rio/rioctrl.c index 74339559f0b9..780506326a73 100644 --- a/drivers/char/rio/rioctrl.c +++ b/drivers/char/rio/rioctrl.c @@ -31,6 +31,7 @@ */ #include +#include #include #include #include diff --git a/drivers/char/rio/riotty.c b/drivers/char/rio/riotty.c index 2fb49e89b324..47fab7c33073 100644 --- a/drivers/char/rio/riotty.c +++ b/drivers/char/rio/riotty.c @@ -33,6 +33,7 @@ #define __EXPLICIT_DEF_H__ #include +#include #include #include #include diff --git a/drivers/char/ser_a2232.c b/drivers/char/ser_a2232.c index 33a2b531802e..9610861d1f5f 100644 --- a/drivers/char/ser_a2232.c +++ b/drivers/char/ser_a2232.c @@ -89,6 +89,7 @@ #include #include #include +#include #include #include diff --git a/drivers/char/stallion.c b/drivers/char/stallion.c index 53e504f41b20..db6dcfa35ba0 100644 --- a/drivers/char/stallion.c +++ b/drivers/char/stallion.c @@ -27,6 +27,7 @@ /*****************************************************************************/ #include +#include #include #include #include diff --git a/drivers/char/tlclk.c b/drivers/char/tlclk.c index 8f2284be68e1..80ea6bcfffdc 100644 --- a/drivers/char/tlclk.c +++ b/drivers/char/tlclk.c @@ -32,6 +32,7 @@ #include /* printk() */ #include /* everything... */ #include /* error codes */ +#include #include #include #include diff --git a/drivers/hwmon/sht15.c b/drivers/hwmon/sht15.c index 303c02694c3c..2da6fb2c325e 100644 --- a/drivers/hwmon/sht15.c +++ b/drivers/hwmon/sht15.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/ieee1394/raw1394.c b/drivers/ieee1394/raw1394.c index 0bc3d78ce7b1..8aa56ac07e29 100644 --- a/drivers/ieee1394/raw1394.c +++ b/drivers/ieee1394/raw1394.c @@ -29,6 +29,7 @@ #include #include +#include #include #include #include diff --git a/drivers/ieee1394/video1394.c b/drivers/ieee1394/video1394.c index d287ba79821d..949064a05675 100644 --- a/drivers/ieee1394/video1394.c +++ b/drivers/ieee1394/video1394.c @@ -30,6 +30,7 @@ */ #include #include +#include #include #include #include diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 625fec5a741c..0f89909abce9 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 4346a24568fb..bb96d3c4b0f4 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 03cfaecc3bb7..ed7175549ebd 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c index 6e8653471941..1cecf98829ac 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_qp.c +++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c @@ -29,6 +29,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#include #include "iwch_provider.h" #include "iwch.h" #include "iwch_cm.h" diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index 04e88b600558..013d1380e77c 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -31,6 +31,7 @@ * SOFTWARE. */ +#include #include #include #include diff --git a/drivers/infiniband/hw/ipath/ipath_iba7220.c b/drivers/infiniband/hw/ipath/ipath_iba7220.c index b2a9d4c155d1..a805402dd4ae 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba7220.c +++ b/drivers/infiniband/hw/ipath/ipath_iba7220.c @@ -37,6 +37,7 @@ #include #include +#include #include #include #include diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c index 6c21b4b5ec71..c0a03ac03ee7 100644 --- a/drivers/infiniband/hw/ipath/ipath_intr.c +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -33,6 +33,7 @@ #include #include +#include #include "ipath_kernel.h" #include "ipath_verbs.h" diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c index 3a5a89b609c4..cb2d3ef2ae12 100644 --- a/drivers/infiniband/hw/ipath/ipath_qp.c +++ b/drivers/infiniband/hw/ipath/ipath_qp.c @@ -32,6 +32,7 @@ */ #include +#include #include #include "ipath_verbs.h" diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c index 2296832f94da..1f95bbaf7602 100644 --- a/drivers/infiniband/hw/ipath/ipath_ruc.c +++ b/drivers/infiniband/hw/ipath/ipath_ruc.c @@ -31,6 +31,7 @@ * SOFTWARE. */ +#include #include #include "ipath_verbs.h" diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c index 6076cb61bf6a..7420715256a9 100644 --- a/drivers/infiniband/hw/ipath/ipath_ud.c +++ b/drivers/infiniband/hw/ipath/ipath_ud.c @@ -31,6 +31,7 @@ * SOFTWARE. */ +#include #include #include "ipath_verbs.h" diff --git a/drivers/infiniband/hw/ipath/ipath_user_pages.c b/drivers/infiniband/hw/ipath/ipath_user_pages.c index 855911e7396d..82878e348627 100644 --- a/drivers/infiniband/hw/ipath/ipath_user_pages.c +++ b/drivers/infiniband/hw/ipath/ipath_user_pages.c @@ -33,6 +33,7 @@ #include #include +#include #include "ipath_kernel.h" diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.c b/drivers/infiniband/hw/ipath/ipath_user_sdma.c index 7bff4b9baa0a..be78f6643c06 100644 --- a/drivers/infiniband/hw/ipath/ipath_user_sdma.c +++ b/drivers/infiniband/hw/ipath/ipath_user_sdma.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c index d73e32232879..6923e1d986da 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c @@ -32,6 +32,7 @@ */ #include +#include #include "ipath_verbs.h" diff --git a/drivers/input/keyboard/hilkbd.c b/drivers/input/keyboard/hilkbd.c index e9d639ec283d..5f72440b50c8 100644 --- a/drivers/input/keyboard/hilkbd.c +++ b/drivers/input/keyboard/hilkbd.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #ifdef CONFIG_HP300 diff --git a/drivers/input/keyboard/sunkbd.c b/drivers/input/keyboard/sunkbd.c index 472b56639cdb..a99a04b03ee4 100644 --- a/drivers/input/keyboard/sunkbd.c +++ b/drivers/input/keyboard/sunkbd.c @@ -27,6 +27,7 @@ */ #include +#include #include #include #include diff --git a/drivers/input/serio/libps2.c b/drivers/input/serio/libps2.c index 769ba65a585a..f3876acc3e83 100644 --- a/drivers/input/serio/libps2.c +++ b/drivers/input/serio/libps2.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include diff --git a/drivers/input/serio/serio_raw.c b/drivers/input/serio/serio_raw.c index b03009bb7468..27fdaaffbb40 100644 --- a/drivers/input/serio/serio_raw.c +++ b/drivers/input/serio/serio_raw.c @@ -9,6 +9,7 @@ * the Free Software Foundation. */ +#include #include #include #include diff --git a/drivers/input/serio/serport.c b/drivers/input/serio/serport.c index b9694b6445d0..6d345112bcb7 100644 --- a/drivers/input/serio/serport.c +++ b/drivers/input/serio/serport.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c index 57d26360f64e..dc506ab99cac 100644 --- a/drivers/isdn/capi/kcapi.c +++ b/drivers/isdn/capi/kcapi.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/isdn/hisax/arcofi.c b/drivers/isdn/hisax/arcofi.c index d30ce5b978c2..85a8fd8dd0b7 100644 --- a/drivers/isdn/hisax/arcofi.c +++ b/drivers/isdn/hisax/arcofi.c @@ -10,6 +10,7 @@ * */ +#include #include "hisax.h" #include "isdnl1.h" #include "isac.h" diff --git a/drivers/isdn/hisax/hfc_2bds0.c b/drivers/isdn/hisax/hfc_2bds0.c index 5c46a7130e06..8d22f50760eb 100644 --- a/drivers/isdn/hisax/hfc_2bds0.c +++ b/drivers/isdn/hisax/hfc_2bds0.c @@ -11,6 +11,7 @@ */ #include +#include #include "hisax.h" #include "hfc_2bds0.h" #include "isdnl1.h" diff --git a/drivers/isdn/hisax/hfc_pci.c b/drivers/isdn/hisax/hfc_pci.c index d110a77940a4..10914731b304 100644 --- a/drivers/isdn/hisax/hfc_pci.c +++ b/drivers/isdn/hisax/hfc_pci.c @@ -20,6 +20,7 @@ #include "hfc_pci.h" #include "isdnl1.h" #include +#include #include static const char *hfcpci_revision = "$Revision: 1.48.2.4 $"; diff --git a/drivers/isdn/hysdn/hysdn_procconf.c b/drivers/isdn/hysdn/hysdn_procconf.c index 8f9f4912de32..90b35e1a4b7e 100644 --- a/drivers/isdn/hysdn/hysdn_procconf.c +++ b/drivers/isdn/hysdn/hysdn_procconf.c @@ -11,6 +11,7 @@ * */ +#include #include #include #include diff --git a/drivers/isdn/hysdn/hysdn_proclog.c b/drivers/isdn/hysdn/hysdn_proclog.c index 8991d2c8ee4a..8bcae28c4409 100644 --- a/drivers/isdn/hysdn/hysdn_proclog.c +++ b/drivers/isdn/hysdn/hysdn_proclog.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "hysdn_defs.h" diff --git a/drivers/isdn/pcbit/drv.c b/drivers/isdn/pcbit/drv.c index 8c66bcb953a1..123c1d6c43b4 100644 --- a/drivers/isdn/pcbit/drv.c +++ b/drivers/isdn/pcbit/drv.c @@ -23,6 +23,7 @@ #include #include +#include #include #include #include diff --git a/drivers/isdn/pcbit/layer2.c b/drivers/isdn/pcbit/layer2.c index e075e8d2fce0..30f0f45e3139 100644 --- a/drivers/isdn/pcbit/layer2.c +++ b/drivers/isdn/pcbit/layer2.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/isdn/sc/init.c b/drivers/isdn/sc/init.c index dd0acd06750b..5a0774880d56 100644 --- a/drivers/isdn/sc/init.c +++ b/drivers/isdn/sc/init.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "includes.h" #include "hardware.h" #include "card.h" diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 18648180db02..daaf86631647 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "lg.h" /* Allow Guests to use a non-128 (ie. non-Linux) syscall trap. */ diff --git a/drivers/media/dvb/dvb-core/dvb_net.c b/drivers/media/dvb/dvb-core/dvb_net.c index 8c9ae0a3a272..0241a7c5c34a 100644 --- a/drivers/media/dvb/dvb-core/dvb_net.c +++ b/drivers/media/dvb/dvb-core/dvb_net.c @@ -63,6 +63,7 @@ #include #include #include +#include #include "dvb_demux.h" #include "dvb_net.h" diff --git a/drivers/media/video/meye.c b/drivers/media/video/meye.c index 4b1bc05a462c..01e1eefcf1eb 100644 --- a/drivers/media/video/meye.c +++ b/drivers/media/video/meye.c @@ -28,6 +28,7 @@ */ #include #include +#include #include #include #include diff --git a/drivers/media/video/videobuf-core.c b/drivers/media/video/videobuf-core.c index f1ccf98c0a6f..8e93c6f25c83 100644 --- a/drivers/media/video/videobuf-core.c +++ b/drivers/media/video/videobuf-core.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/drivers/media/video/videobuf-dma-sg.c b/drivers/media/video/videobuf-dma-sg.c index 53cdd67cebe1..032ebae0134a 100644 --- a/drivers/media/video/videobuf-dma-sg.c +++ b/drivers/media/video/videobuf-dma-sg.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/drivers/message/fusion/mptlan.c b/drivers/message/fusion/mptlan.c index bc2ec2182c00..34f3f36f819b 100644 --- a/drivers/message/fusion/mptlan.c +++ b/drivers/message/fusion/mptlan.c @@ -56,6 +56,7 @@ #include #include #include +#include #define my_VERSION MPT_LINUX_VERSION_COMMON #define MYNAM "mptlan" diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c index fea9085fe52c..60c3988f3cf3 100644 --- a/drivers/mfd/ucb1x00-core.c +++ b/drivers/mfd/ucb1x00-core.c @@ -18,6 +18,7 @@ */ #include #include +#include #include #include #include diff --git a/drivers/misc/hpilo.c b/drivers/misc/hpilo.c index 1ad27c6abcca..a92a3a742b43 100644 --- a/drivers/misc/hpilo.c +++ b/drivers/misc/hpilo.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/misc/ibmasm/command.c b/drivers/misc/ibmasm/command.c index 276d3fb68094..e2031739aa29 100644 --- a/drivers/misc/ibmasm/command.c +++ b/drivers/misc/ibmasm/command.c @@ -22,6 +22,7 @@ * */ +#include #include "ibmasm.h" #include "lowlevel.h" diff --git a/drivers/misc/ibmasm/event.c b/drivers/misc/ibmasm/event.c index 68a0a5b94795..572d41ffc186 100644 --- a/drivers/misc/ibmasm/event.c +++ b/drivers/misc/ibmasm/event.c @@ -22,6 +22,7 @@ * */ +#include #include "ibmasm.h" #include "lowlevel.h" diff --git a/drivers/misc/ibmasm/r_heartbeat.c b/drivers/misc/ibmasm/r_heartbeat.c index bec9e2c44bef..2de487ac788c 100644 --- a/drivers/misc/ibmasm/r_heartbeat.c +++ b/drivers/misc/ibmasm/r_heartbeat.c @@ -20,6 +20,7 @@ * */ +#include #include "ibmasm.h" #include "dot_command.h" diff --git a/drivers/misc/phantom.c b/drivers/misc/phantom.c index 90a95ce8dc34..04c27266f567 100644 --- a/drivers/misc/phantom.c +++ b/drivers/misc/phantom.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c index 379c316f329e..4c19269de91a 100644 --- a/drivers/mtd/devices/m25p80.c +++ b/drivers/mtd/devices/m25p80.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/drivers/mtd/devices/sst25l.c b/drivers/mtd/devices/sst25l.c index c2baf3353f84..0a11721f146e 100644 --- a/drivers/mtd/devices/sst25l.c +++ b/drivers/mtd/devices/sst25l.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c index ff449de6f3c0..8762a27a2a18 100644 --- a/drivers/net/bonding/bond_sysfs.c +++ b/drivers/net/bonding/bond_sysfs.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/depca.c b/drivers/net/depca.c index 9686c1fa28f1..7a3bdac84abe 100644 --- a/drivers/net/depca.c +++ b/drivers/net/depca.c @@ -237,6 +237,7 @@ #include #include +#include #include #include #include diff --git a/drivers/net/e100.c b/drivers/net/e100.c index 679965c2bb86..5d2f48f02251 100644 --- a/drivers/net/e100.c +++ b/drivers/net/e100.c @@ -151,6 +151,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/eql.c b/drivers/net/eql.c index d4d9a3eda695..f5b96cadeb25 100644 --- a/drivers/net/eql.c +++ b/drivers/net/eql.c @@ -111,6 +111,7 @@ * Sorry, I had to rewrite most of this for 2.5.x -DaveM */ +#include #include #include #include diff --git a/drivers/net/ethoc.c b/drivers/net/ethoc.c index 34d0c69e67f7..96f5b2a2d2c5 100644 --- a/drivers/net/ethoc.c +++ b/drivers/net/ethoc.c @@ -17,6 +17,7 @@ #include #include #include +#include #include static int buffer_size = 0x8000; /* 32 KBytes */ diff --git a/drivers/net/ewrk3.c b/drivers/net/ewrk3.c index b2a5ec8f3721..dd4ba01fd92d 100644 --- a/drivers/net/ewrk3.c +++ b/drivers/net/ewrk3.c @@ -145,6 +145,7 @@ #include #include +#include #include #include #include diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c index 0a1c2bb27d4d..e1da4666f204 100644 --- a/drivers/net/forcedeth.c +++ b/drivers/net/forcedeth.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/hamachi.c b/drivers/net/hamachi.c index 1d5064a09aca..f7519a594945 100644 --- a/drivers/net/hamachi.c +++ b/drivers/net/hamachi.c @@ -145,6 +145,7 @@ static int tx_params[MAX_UNITS] = {-1, -1, -1, -1, -1, -1, -1, -1}; /* Time in jiffies before concluding the transmitter is hung. */ #define TX_TIMEOUT (5*HZ) +#include #include #include #include diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c index 7bcaf7c66243..e344c84c0ef9 100644 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/hamradio/baycom_ser_fdx.c b/drivers/net/hamradio/baycom_ser_fdx.c index aa4488e871b2..ed60fd664273 100644 --- a/drivers/net/hamradio/baycom_ser_fdx.c +++ b/drivers/net/hamradio/baycom_ser_fdx.c @@ -71,6 +71,7 @@ /*****************************************************************************/ +#include #include #include #include diff --git a/drivers/net/hamradio/baycom_ser_hdx.c b/drivers/net/hamradio/baycom_ser_hdx.c index 88c593596020..1686f6dcbbce 100644 --- a/drivers/net/hamradio/baycom_ser_hdx.c +++ b/drivers/net/hamradio/baycom_ser_hdx.c @@ -61,6 +61,7 @@ /*****************************************************************************/ +#include #include #include #include diff --git a/drivers/net/hamradio/hdlcdrv.c b/drivers/net/hamradio/hdlcdrv.c index 0013c409782c..91c5790c9581 100644 --- a/drivers/net/hamradio/hdlcdrv.c +++ b/drivers/net/hamradio/hdlcdrv.c @@ -42,6 +42,7 @@ /*****************************************************************************/ +#include #include #include #include diff --git a/drivers/net/hp100.c b/drivers/net/hp100.c index a9a1a99f02dd..dd8665138062 100644 --- a/drivers/net/hp100.c +++ b/drivers/net/hp100.c @@ -98,6 +98,7 @@ #include #include +#include #include #include #include diff --git a/drivers/net/igb/igb_ethtool.c b/drivers/net/igb/igb_ethtool.c index d004c359244c..deaea8fa1032 100644 --- a/drivers/net/igb/igb_ethtool.c +++ b/drivers/net/igb/igb_ethtool.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "igb.h" diff --git a/drivers/net/irda/toim3232-sir.c b/drivers/net/irda/toim3232-sir.c index fcf287b749db..99e1ec02a011 100644 --- a/drivers/net/irda/toim3232-sir.c +++ b/drivers/net/irda/toim3232-sir.c @@ -120,6 +120,7 @@ #include #include #include +#include #include diff --git a/drivers/net/ns83820.c b/drivers/net/ns83820.c index c594e1946476..57fd483dbb1f 100644 --- a/drivers/net/ns83820.c +++ b/drivers/net/ns83820.c @@ -111,6 +111,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/pcnet32.c b/drivers/net/pcnet32.c index 6d28b18e7e28..c1b3f09f452c 100644 --- a/drivers/net/pcnet32.c +++ b/drivers/net/pcnet32.c @@ -31,6 +31,7 @@ static const char *const version = #include #include +#include #include #include #include diff --git a/drivers/net/sb1000.c b/drivers/net/sb1000.c index ee366c5a8fa3..c9c70ab0cce0 100644 --- a/drivers/net/sb1000.c +++ b/drivers/net/sb1000.c @@ -36,6 +36,7 @@ static char version[] = "sb1000.c:v1.1.2 6/01/98 (fventuri@mediaone.net)\n"; #include #include +#include #include #include #include diff --git a/drivers/net/sis900.c b/drivers/net/sis900.c index 97949d0a699b..c072f7f36acf 100644 --- a/drivers/net/sis900.c +++ b/drivers/net/sis900.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/skfp/skfddi.c b/drivers/net/skfp/skfddi.c index 38a508b4aad9..b27156eaf267 100644 --- a/drivers/net/skfp/skfddi.c +++ b/drivers/net/skfp/skfddi.c @@ -73,6 +73,7 @@ static const char * const boot_msg = /* Include files */ +#include #include #include #include diff --git a/drivers/net/skge.c b/drivers/net/skge.c index 01f6811f1324..8f5414348e86 100644 --- a/drivers/net/skge.c +++ b/drivers/net/skge.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/slip.c b/drivers/net/slip.c index e17c535a577e..fe3cebb984de 100644 --- a/drivers/net/slip.c +++ b/drivers/net/slip.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/sungem.c b/drivers/net/sungem.c index 305ec3d783db..7019a0d1a82b 100644 --- a/drivers/net/sungem.c +++ b/drivers/net/sungem.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/tokenring/ibmtr.c b/drivers/net/tokenring/ibmtr.c index 525bbc5b9c9d..36cb2423bcf1 100644 --- a/drivers/net/tokenring/ibmtr.c +++ b/drivers/net/tokenring/ibmtr.c @@ -108,6 +108,7 @@ in the event that chatty debug messages are desired - jjs 12/30/98 */ #define IBMTR_DEBUG_MESSAGES 0 #include +#include #ifdef PCMCIA /* required for ibmtr_cs.c to build */ #undef MODULE /* yes, really */ diff --git a/drivers/net/typhoon.c b/drivers/net/typhoon.c index d6d345229fe9..5921f5bdd764 100644 --- a/drivers/net/typhoon.c +++ b/drivers/net/typhoon.c @@ -108,6 +108,7 @@ static const int multicast_filter_limit = 32; #include #include +#include #include #include #include diff --git a/drivers/net/wan/cosa.c b/drivers/net/wan/cosa.c index 66360a2a14c2..e2c33c06190b 100644 --- a/drivers/net/wan/cosa.c +++ b/drivers/net/wan/cosa.c @@ -76,6 +76,7 @@ #include #include +#include #include #include #include diff --git a/drivers/net/wan/cycx_x25.c b/drivers/net/wan/cycx_x25.c index 2573c18b6aa5..cd8cb95c5bd7 100644 --- a/drivers/net/wan/cycx_x25.c +++ b/drivers/net/wan/cycx_x25.c @@ -84,6 +84,7 @@ #include /* printk(), and other useful stuff */ #include #include /* inline memset(), etc. */ +#include #include /* kmalloc(), kfree() */ #include /* offsetof(), etc. */ #include /* WAN router definitions */ diff --git a/drivers/net/wan/dscc4.c b/drivers/net/wan/dscc4.c index 81c8aec9df92..07d00b4cf48a 100644 --- a/drivers/net/wan/dscc4.c +++ b/drivers/net/wan/dscc4.c @@ -81,6 +81,7 @@ */ #include +#include #include #include #include diff --git a/drivers/net/wan/farsync.c b/drivers/net/wan/farsync.c index 3e90eb816181..beda387f2fc7 100644 --- a/drivers/net/wan/farsync.c +++ b/drivers/net/wan/farsync.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/wireless/b43/pio.c b/drivers/net/wireless/b43/pio.c index 9c1397996e0a..5e87650b07fb 100644 --- a/drivers/net/wireless/b43/pio.c +++ b/drivers/net/wireless/b43/pio.c @@ -30,6 +30,7 @@ #include "xmit.h" #include +#include static u16 generate_cookie(struct b43_pio_txqueue *q, diff --git a/drivers/net/wireless/b43legacy/main.c b/drivers/net/wireless/b43legacy/main.c index 1d9223b3d4c4..4b60148a5e61 100644 --- a/drivers/net/wireless/b43legacy/main.c +++ b/drivers/net/wireless/b43legacy/main.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/wireless/b43legacy/phy.c b/drivers/net/wireless/b43legacy/phy.c index 11319ec2d64a..aaf227203a98 100644 --- a/drivers/net/wireless/b43legacy/phy.c +++ b/drivers/net/wireless/b43legacy/phy.c @@ -31,6 +31,7 @@ #include #include +#include #include #include "b43legacy.h" diff --git a/drivers/net/wireless/hostap/hostap_info.c b/drivers/net/wireless/hostap/hostap_info.c index 6fa14a4e4b53..4dfb40a84c96 100644 --- a/drivers/net/wireless/hostap/hostap_info.c +++ b/drivers/net/wireless/hostap/hostap_info.c @@ -1,6 +1,7 @@ /* Host AP driver Info Frame processing (part of hostap.o module) */ #include +#include #include "hostap_wlan.h" #include "hostap.h" #include "hostap_ap.h" diff --git a/drivers/net/wireless/hostap/hostap_ioctl.c b/drivers/net/wireless/hostap/hostap_ioctl.c index 3f2bda881a4f..9419cebca8a5 100644 --- a/drivers/net/wireless/hostap/hostap_ioctl.c +++ b/drivers/net/wireless/hostap/hostap_ioctl.c @@ -1,6 +1,7 @@ /* ioctl() (mostly Linux Wireless Extensions) routines for Host AP driver */ #include +#include #include #include #include diff --git a/drivers/net/wireless/ipw2x00/ipw2200.c b/drivers/net/wireless/ipw2x00/ipw2200.c index 8d58e6ed4e7d..827824d45de9 100644 --- a/drivers/net/wireless/ipw2x00/ipw2200.c +++ b/drivers/net/wireless/ipw2x00/ipw2200.c @@ -30,6 +30,7 @@ ******************************************************************************/ +#include #include "ipw2200.h" diff --git a/drivers/net/wireless/iwlwifi/iwl-3945.c b/drivers/net/wireless/iwlwifi/iwl-3945.c index e70c5b0af364..68136172b823 100644 --- a/drivers/net/wireless/iwlwifi/iwl-3945.c +++ b/drivers/net/wireless/iwlwifi/iwl-3945.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/wireless/iwlwifi/iwl-4965.c b/drivers/net/wireless/iwlwifi/iwl-4965.c index a22a0501c190..6f703a041847 100644 --- a/drivers/net/wireless/iwlwifi/iwl-4965.c +++ b/drivers/net/wireless/iwlwifi/iwl-4965.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/wireless/iwlwifi/iwl-5000.c b/drivers/net/wireless/iwlwifi/iwl-5000.c index eb08f4411000..d6bc0e051043 100644 --- a/drivers/net/wireless/iwlwifi/iwl-5000.c +++ b/drivers/net/wireless/iwlwifi/iwl-5000.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/wireless/iwlwifi/iwl-agn.c b/drivers/net/wireless/iwlwifi/iwl-agn.c index cdc07c477457..313d3e5ee84b 100644 --- a/drivers/net/wireless/iwlwifi/iwl-agn.c +++ b/drivers/net/wireless/iwlwifi/iwl-agn.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/wireless/iwlwifi/iwl-core.c b/drivers/net/wireless/iwlwifi/iwl-core.c index 484d5c1a7312..2dc928755454 100644 --- a/drivers/net/wireless/iwlwifi/iwl-core.c +++ b/drivers/net/wireless/iwlwifi/iwl-core.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include "iwl-eeprom.h" diff --git a/drivers/net/wireless/iwlwifi/iwl-hcmd.c b/drivers/net/wireless/iwlwifi/iwl-hcmd.c index 532c8d6cd8da..a6856daf14cb 100644 --- a/drivers/net/wireless/iwlwifi/iwl-hcmd.c +++ b/drivers/net/wireless/iwlwifi/iwl-hcmd.c @@ -28,6 +28,7 @@ #include #include +#include #include #include "iwl-dev.h" /* FIXME: remove */ diff --git a/drivers/net/wireless/iwlwifi/iwl-tx.c b/drivers/net/wireless/iwlwifi/iwl-tx.c index c18907544701..fb9bcfa6d947 100644 --- a/drivers/net/wireless/iwlwifi/iwl-tx.c +++ b/drivers/net/wireless/iwlwifi/iwl-tx.c @@ -28,6 +28,7 @@ *****************************************************************************/ #include +#include #include #include "iwl-eeprom.h" #include "iwl-dev.h" diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c index c390dbd877e4..aa49230422f3 100644 --- a/drivers/net/wireless/iwlwifi/iwl3945-base.c +++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/wireless/iwmc3200wifi/cfg80211.c b/drivers/net/wireless/iwmc3200wifi/cfg80211.c index a56a2b0ac99a..f3c55658225b 100644 --- a/drivers/net/wireless/iwmc3200wifi/cfg80211.c +++ b/drivers/net/wireless/iwmc3200wifi/cfg80211.c @@ -23,6 +23,7 @@ #include #include +#include #include #include #include diff --git a/drivers/net/wireless/iwmc3200wifi/commands.c b/drivers/net/wireless/iwmc3200wifi/commands.c index 23b52fa2605f..84158b6d35d8 100644 --- a/drivers/net/wireless/iwmc3200wifi/commands.c +++ b/drivers/net/wireless/iwmc3200wifi/commands.c @@ -40,6 +40,7 @@ #include #include #include +#include #include "iwm.h" #include "bus.h" diff --git a/drivers/net/wireless/iwmc3200wifi/main.c b/drivers/net/wireless/iwmc3200wifi/main.c index d668e4756324..222eb2cf1b30 100644 --- a/drivers/net/wireless/iwmc3200wifi/main.c +++ b/drivers/net/wireless/iwmc3200wifi/main.c @@ -38,6 +38,7 @@ #include #include +#include #include #include diff --git a/drivers/net/wireless/iwmc3200wifi/rx.c b/drivers/net/wireless/iwmc3200wifi/rx.c index 40dbcbc16593..771a301003c9 100644 --- a/drivers/net/wireless/iwmc3200wifi/rx.c +++ b/drivers/net/wireless/iwmc3200wifi/rx.c @@ -38,6 +38,7 @@ #include #include +#include #include #include #include diff --git a/drivers/net/wireless/libertas/cmd.c b/drivers/net/wireless/libertas/cmd.c index 685098148e10..0a324dcd264c 100644 --- a/drivers/net/wireless/libertas/cmd.c +++ b/drivers/net/wireless/libertas/cmd.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "host.h" #include "hostcmd.h" #include "decl.h" diff --git a/drivers/net/wireless/libertas/tx.c b/drivers/net/wireless/libertas/tx.c index 4c018f7a0a8d..8c3766a6e8e7 100644 --- a/drivers/net/wireless/libertas/tx.c +++ b/drivers/net/wireless/libertas/tx.c @@ -3,6 +3,7 @@ */ #include #include +#include #include "hostcmd.h" #include "radiotap.h" diff --git a/drivers/net/wireless/prism54/isl_ioctl.c b/drivers/net/wireless/prism54/isl_ioctl.c index 4c97c6ad6f5d..bc08464d8323 100644 --- a/drivers/net/wireless/prism54/isl_ioctl.c +++ b/drivers/net/wireless/prism54/isl_ioctl.c @@ -19,6 +19,7 @@ * */ +#include #include #include #include diff --git a/drivers/net/wireless/prism54/islpci_dev.c b/drivers/net/wireless/prism54/islpci_dev.c index e26d7b3ceab5..2505be56ae39 100644 --- a/drivers/net/wireless/prism54/islpci_dev.c +++ b/drivers/net/wireless/prism54/islpci_dev.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/wireless/prism54/islpci_mgt.c b/drivers/net/wireless/prism54/islpci_mgt.c index f7c677e2094d..69d2f882fd06 100644 --- a/drivers/net/wireless/prism54/islpci_mgt.c +++ b/drivers/net/wireless/prism54/islpci_mgt.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/drivers/net/wireless/rt2x00/rt2x00debug.c b/drivers/net/wireless/rt2x00/rt2x00debug.c index 7b3ee8c2eaef..68bc9bb1dbf9 100644 --- a/drivers/net/wireless/rt2x00/rt2x00debug.c +++ b/drivers/net/wireless/rt2x00/rt2x00debug.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "rt2x00.h" diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c index 2ce8f9ccc66e..d49ecc94bd49 100644 --- a/drivers/pci/pcie/aer/aerdrv.c +++ b/drivers/pci/pcie/aer/aerdrv.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c index 4cdb31a362ca..a0c816238aa9 100644 --- a/drivers/rtc/interface.c +++ b/drivers/rtc/interface.c @@ -12,6 +12,7 @@ */ #include +#include #include int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm) diff --git a/drivers/rtc/rtc-dev.c b/drivers/rtc/rtc-dev.c index 8a11de9552cd..62227cd52410 100644 --- a/drivers/rtc/rtc-dev.c +++ b/drivers/rtc/rtc-dev.c @@ -13,6 +13,7 @@ #include #include +#include #include "rtc-core.h" static dev_t rtc_devt; diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index a9d707047202..e941367dd28f 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/uwb/whc-rc.c b/drivers/uwb/whc-rc.c index 1d9a6f54658e..01950c62dc8d 100644 --- a/drivers/uwb/whc-rc.c +++ b/drivers/uwb/whc-rc.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/file.c b/fs/file.c index f313314f996f..87e129030ab1 100644 --- a/fs/file.c +++ b/fs/file.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index b78cf8194957..7ca72b74eec7 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -610,6 +609,7 @@ extern void debug_poll_all_shared_irqs(void); static inline void debug_poll_all_shared_irqs(void) { } #endif +struct seq_file; int show_interrupts(struct seq_file *p, void *v); struct irq_desc; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 81bb42358595..eaf36364b7d4 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -11,6 +11,7 @@ #define LINUX_MMC_HOST_H #include +#include #include diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a81cf80554db..17c71bb565c6 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -11,6 +11,7 @@ */ #include +#include #include #include #include diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 50d022e5a560..ec815a960b5d 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fb0f46fa1ecd..c3a4e2907eaa 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 2755a3bd16a1..eae56fddfa3b 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -9,6 +9,7 @@ */ #include #include +#include #include #include #include diff --git a/lib/fault-inject.c b/lib/fault-inject.c index f97af55bdd96..7e65af70635e 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5e7aed0802bf..0f551a4a44cd 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/net/irda/ircomm/ircomm_tty_attach.c b/net/irda/ircomm/ircomm_tty_attach.c index eafc010907c2..3c1754023022 100644 --- a/net/irda/ircomm/ircomm_tty_attach.c +++ b/net/irda/ircomm/ircomm_tty_attach.c @@ -30,6 +30,7 @@ ********************************************************************/ #include +#include #include #include diff --git a/net/irda/irlan/irlan_common.c b/net/irda/irlan/irlan_common.c index 62116829b817..315ead3cb926 100644 --- a/net/irda/irlan/irlan_common.c +++ b/net/irda/irlan/irlan_common.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c index 7b6b631f647f..d340110f5c0c 100644 --- a/net/irda/irlan/irlan_eth.c +++ b/net/irda/irlan/irlan_eth.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include diff --git a/net/irda/irnet/irnet_irda.c b/net/irda/irnet/irnet_irda.c index cf9a4b531a98..cccc2e93234f 100644 --- a/net/irda/irnet/irnet_irda.c +++ b/net/irda/irnet/irnet_irda.c @@ -9,6 +9,7 @@ */ #include "irnet_irda.h" /* Private header */ +#include #include #include diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c index 68cbcb19cbd8..7dea882dbb75 100644 --- a/net/irda/irnet/irnet_ppp.c +++ b/net/irda/irnet/irnet_ppp.c @@ -13,6 +13,7 @@ * 2) as a control channel (write commands, read events) */ +#include #include #include "irnet_ppp.h" /* Private header */ /* Please put other headers in irnet.h - Thanks */ diff --git a/net/mac80211/rc80211_pid_debugfs.c b/net/mac80211/rc80211_pid_debugfs.c index a59043fbb0ff..45667054a5f3 100644 --- a/net/mac80211/rc80211_pid_debugfs.c +++ b/net/mac80211/rc80211_pid_debugfs.c @@ -6,6 +6,7 @@ * published by the Free Software Foundation. */ +#include #include #include #include diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 7c9ec3dee96e..ca6e68dcd8a8 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 0cf5e8c27a10..3fa5751af0ec 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include diff --git a/net/wireless/core.c b/net/wireless/core.c index 45b2be3274db..a595f712b5bf 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include "nl80211.h" -- cgit v1.2.3 From ef1f7a7e878e4ae37b3a78ebdeef9f911bae59df Mon Sep 17 00:00:00 2001 From: Bernd Schmidt Date: Tue, 6 Oct 2009 09:55:26 +0100 Subject: ROMFS: fix length used with romfs_dev_strnlen() function An interestingly corrupted romfs file system exposed a problem with the romfs_dev_strnlen function: it's passing the wrong value to its helpers. Rather than limit the string to the length passed in by the callers, it uses the size of the device as the limit. Signed-off-by: Bernd Schmidt Signed-off-by: Mike Frysinger Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/romfs/storage.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c index b3208adf8e71..71e2b4d50a0a 100644 --- a/fs/romfs/storage.c +++ b/fs/romfs/storage.c @@ -253,11 +253,11 @@ ssize_t romfs_dev_strnlen(struct super_block *sb, #ifdef CONFIG_ROMFS_ON_MTD if (sb->s_mtd) - return romfs_mtd_strnlen(sb, pos, limit); + return romfs_mtd_strnlen(sb, pos, maxlen); #endif #ifdef CONFIG_ROMFS_ON_BLOCK if (sb->s_bdev) - return romfs_blk_strnlen(sb, pos, limit); + return romfs_blk_strnlen(sb, pos, maxlen); #endif return -EIO; } -- cgit v1.2.3 From a1be9eee2996fd9969625e7b5e2f2bc2032fd3cb Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Mon, 12 Oct 2009 11:26:12 -0400 Subject: NFS: suppress a build warning struct sockaddr_storage * can safely be used as struct sockaddr *. Suppress an "incompatible pointer type" warning. Signed-off-by: Stefan Richter Signed-off-by: Trond Myklebust Signed-off-by: Linus Torvalds --- fs/nfs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 6dabf6feec94..a2c18acb8568 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1848,8 +1848,8 @@ nfs_compare_remount_data(struct nfs_server *nfss, data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) || data->nfs_server.port != nfss->port || data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen || - !rpc_cmp_addr(&data->nfs_server.address, - &nfss->nfs_client->cl_addr)) + !rpc_cmp_addr((struct sockaddr *)&data->nfs_server.address, + (struct sockaddr *)&nfss->nfs_client->cl_addr)) return -EINVAL; return 0; -- cgit v1.2.3 From 96ec2e0a719fd61791dd2b0dd01325c5d20e1233 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 16 Sep 2009 11:21:13 -0400 Subject: ext3: Don't update superblock write time when filesystem is read-only This avoids updating the superblock write time when we are mounting the root file system read/only but we need to replay the journal; at that point, for people who are east of GMT and who make their clock tick in localtime for Windows bug-for-bug compatibility, and this will cause e2fsck to complain and force a full file system check. Signed-off-by: "Theodore Ts'o" Signed-off-by: Jan Kara --- fs/ext3/super.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 72743d360509..7a520a862f49 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2321,7 +2321,18 @@ static int ext3_commit_super(struct super_block *sb, if (!sbh) return error; - es->s_wtime = cpu_to_le32(get_seconds()); + /* + * If the file system is mounted read-only, don't update the + * superblock write time. This avoids updating the superblock + * write time when we are mounting the root file system + * read/only but we need to replay the journal; at that point, + * for people who are east of GMT and who make their clock + * tick in localtime for Windows bug-for-bug compatibility, + * the clock is set in the future, and this will cause e2fsck + * to complain and force a full file system check. + */ + if (!(sb->s_flags & MS_RDONLY)) + es->s_wtime = cpu_to_le32(get_seconds()); es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb)); es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); BUFFER_TRACE(sbh, "marking dirty"); -- cgit v1.2.3 From 4722607db6a78bd7748c51fa4c8d7371da797254 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 13 Oct 2009 12:55:09 -0400 Subject: Btrfs: only write one super copy during fsync During a tree-log commit for fsync, we've been writing at least two copies of the super block and forcing them to disk. The other filesystems write only one, and this change brings us on par with them. A full transaction commit will write all the super copies, so we still have redundant info written on a regular basis. Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 78f6254ac2d9..6d9ec285644d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2092,7 +2092,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * the running transaction open, so a full commit can't hop * in and cause problems either. */ - write_ctree_super(trans, root->fs_info->tree_root, 2); + write_ctree_super(trans, root->fs_info->tree_root, 1); ret = 0; out_wake_log_root: -- cgit v1.2.3 From 257c62e1bce03e5b9f3f069fd52ad73a56de71fd Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 13 Oct 2009 13:21:08 -0400 Subject: Btrfs: avoid tree log commit when there are no changes rpm has a habit of running fdatasync when the file hasn't changed. We already detect if a file hasn't been changed in the current transaction but it might have been sent to the tree-log in this transaction and not changed since the last call to fsync. In this case, we want to avoid a tree log sync, which includes a number of synchronous writes and barriers. This commit extends the existing tracking of the last transaction to change a file to also track the last sub-transaction. The end result is that rpm -ivh and -Uvh are roughly twice as fast, and on par with ext3. Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 6 ++++++ fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 2 ++ fs/btrfs/file.c | 41 ++++++++++++++++++++++++++--------------- fs/btrfs/inode.c | 6 +++++- fs/btrfs/transaction.h | 1 + fs/btrfs/tree-log.c | 27 +++++++++++++++++++++++++++ fs/btrfs/tree-log.h | 3 +++ 8 files changed, 71 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index c71abec0ab90..f6783a42f010 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -86,6 +86,12 @@ struct btrfs_inode { * transid of the trans_handle that last modified this inode */ u64 last_trans; + + /* + * log transid when this inode was last modified + */ + u64 last_sub_trans; + /* * transid that last logged this inode */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 36a19cd43e03..d0cede5ff25e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1009,6 +1009,7 @@ struct btrfs_root { atomic_t log_writers; atomic_t log_commit[2]; unsigned long log_transid; + unsigned long last_log_commit; unsigned long log_batch; pid_t log_start_pid; bool log_multiple_pids; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ac8927bdc33d..d4132aad9ea1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -919,6 +919,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(&root->log_writers, 0); root->log_batch = 0; root->log_transid = 0; + root->last_log_commit = 0; extent_io_tree_init(&root->dirty_log_pages, fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -1089,6 +1090,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root->log_root); root->log_root = log_root; root->log_transid = 0; + root->last_log_commit = 0; return 0; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 53fb1c997f0e..4599113ed72e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1087,8 +1087,10 @@ out_nolock: btrfs_end_transaction(trans, root); else btrfs_commit_transaction(trans, root); - } else { + } else if (ret != BTRFS_NO_LOG_SYNC) { btrfs_commit_transaction(trans, root); + } else { + btrfs_end_transaction(trans, root); } } if (file->f_flags & O_DIRECT) { @@ -1138,6 +1140,13 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) int ret = 0; struct btrfs_trans_handle *trans; + + /* we wait first, since the writeback may change the inode */ + root->log_batch++; + /* the VFS called filemap_fdatawrite for us */ + btrfs_wait_ordered_range(inode, 0, (u64)-1); + root->log_batch++; + /* * check the transaction that last modified this inode * and see if its already been committed @@ -1145,6 +1154,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) if (!BTRFS_I(inode)->last_trans) goto out; + /* + * if the last transaction that changed this file was before + * the current transaction, we can bail out now without any + * syncing + */ mutex_lock(&root->fs_info->trans_mutex); if (BTRFS_I(inode)->last_trans <= root->fs_info->last_trans_committed) { @@ -1154,13 +1168,6 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) } mutex_unlock(&root->fs_info->trans_mutex); - root->log_batch++; - filemap_fdatawrite(inode->i_mapping); - btrfs_wait_ordered_range(inode, 0, (u64)-1); - root->log_batch++; - - if (datasync && !(inode->i_state & I_DIRTY_PAGES)) - goto out; /* * ok we haven't committed the transaction yet, lets do a commit */ @@ -1189,14 +1196,18 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) */ mutex_unlock(&dentry->d_inode->i_mutex); - if (ret > 0) { - ret = btrfs_commit_transaction(trans, root); - } else { - ret = btrfs_sync_log(trans, root); - if (ret == 0) - ret = btrfs_end_transaction(trans, root); - else + if (ret != BTRFS_NO_LOG_SYNC) { + if (ret > 0) { ret = btrfs_commit_transaction(trans, root); + } else { + ret = btrfs_sync_log(trans, root); + if (ret == 0) + ret = btrfs_end_transaction(trans, root); + else + ret = btrfs_commit_transaction(trans, root); + } + } else { + ret = btrfs_end_transaction(trans, root); } mutex_lock(&dentry->d_inode->i_mutex); out: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ef399a7794ff..5b9567caba0a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3480,6 +3480,7 @@ static noinline void init_btrfs_i(struct inode *inode) bi->generation = 0; bi->sequence = 0; bi->last_trans = 0; + bi->last_sub_trans = 0; bi->logged_trans = 0; bi->delalloc_bytes = 0; bi->reserved_bytes = 0; @@ -4980,7 +4981,9 @@ again: set_page_dirty(page); SetPageUptodate(page); - BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; + BTRFS_I(inode)->last_trans = root->fs_info->generation; + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); out_unlock: @@ -5100,6 +5103,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) if (!ei) return NULL; ei->last_trans = 0; + ei->last_sub_trans = 0; ei->logged_trans = 0; ei->outstanding_extents = 0; ei->reserved_extents = 0; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 663c67404918..f68cbbe61e56 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -79,6 +79,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, struct inode *inode) { BTRFS_I(inode)->last_trans = trans->transaction->transid; + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; } int btrfs_end_transaction(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6d9ec285644d..0a1bde268963 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1980,6 +1980,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, int ret; struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; + u64 log_transid = 0; mutex_lock(&root->log_mutex); index1 = root->log_transid % 2; @@ -2018,6 +2019,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_root_node(&log->root_item, log->node); root->log_batch = 0; + log_transid = root->log_transid; root->log_transid++; log->log_transid = root->log_transid; root->log_start_pid = 0; @@ -2095,6 +2097,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, write_ctree_super(trans, root->fs_info->tree_root, 1); ret = 0; + mutex_lock(&root->log_mutex); + if (root->last_log_commit < log_transid) + root->last_log_commit = log_transid; + mutex_unlock(&root->log_mutex); + out_wake_log_root: atomic_set(&log_root_tree->log_commit[index2], 0); smp_mb(); @@ -2862,6 +2869,21 @@ out: return ret; } +static int inode_in_log(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + mutex_lock(&root->log_mutex); + if (BTRFS_I(inode)->logged_trans == trans->transid && + BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) + ret = 1; + mutex_unlock(&root->log_mutex); + return ret; +} + + /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. A minimal inode and backref @@ -2901,6 +2923,11 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; + if (inode_in_log(trans, inode)) { + ret = BTRFS_NO_LOG_SYNC; + goto end_no_trans; + } + start_log_trans(trans, root); ret = btrfs_log_inode(trans, root, inode, inode_only); diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index d09c7609e16b..0776eacb5083 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -19,6 +19,9 @@ #ifndef __TREE_LOG_ #define __TREE_LOG_ +/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ +#define BTRFS_NO_LOG_SYNC 256 + int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); -- cgit v1.2.3 From 690587d109ffe19d6743e4cc80c18b0906b7f9ff Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 13 Oct 2009 13:29:19 -0400 Subject: Btrfs: streamline tree-log btree block writeout Syncing the tree log is a 3 phase operation. 1) write and wait for all the tree log blocks for a given root. 2) write and wait for all the tree log blocks for the tree of tree log roots. 3) write and wait for the super blocks (barriers here) This isn't as efficient as it could be because there is no requirement to wait for the blocks from step one to hit the disk before we start writing the blocks from step two. This commit changes the sequence so that we don't start waiting until all the tree blocks from both steps one and two have been sent to disk. We do this by breaking up btrfs_write_wait_marked_extents into two functions, which is trivial because it was already broken up into two parts. Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 45 ++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/transaction.h | 4 ++++ fs/btrfs/tree-log.c | 8 +++++++- 3 files changed, 53 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0b8f36d4400a..bca82a4ca8e6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -344,10 +344,10 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, /* * when btree blocks are allocated, they have some corresponding bits set for * them in one of two extent_io trees. This is used to make sure all of - * those extents are on disk for transaction or log commit + * those extents are sent to disk but does not wait on them */ -int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages) +int btrfs_write_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages) { int ret; int err = 0; @@ -394,6 +394,29 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, page_cache_release(page); } } + if (err) + werr = err; + return werr; +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit. We wait + * on all the pages and clear them from the dirty pages state tree + */ +int btrfs_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages) +{ + int ret; + int err = 0; + int werr = 0; + struct page *page; + struct inode *btree_inode = root->fs_info->btree_inode; + u64 start = 0; + u64 end; + unsigned long index; + while (1) { ret = find_first_extent_bit(dirty_pages, 0, &start, &end, EXTENT_DIRTY); @@ -424,6 +447,22 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, return werr; } +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit + */ +int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages) +{ + int ret; + int ret2; + + ret = btrfs_write_marked_extents(root, dirty_pages); + ret2 = btrfs_wait_marked_extents(root, dirty_pages); + return ret || ret2; +} + int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index f68cbbe61e56..d4e3e7a6938c 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -108,5 +108,9 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages); +int btrfs_write_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages); +int btrfs_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 0a1bde268963..4aff766d171a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2013,7 +2013,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } - ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); + /* we start IO on all the marked extents here, but we don't actually + * wait for them until later. + */ + ret = btrfs_write_marked_extents(log, &log->dirty_log_pages); BUG_ON(ret); btrfs_set_root_node(&log->root_item, log->node); @@ -2048,6 +2051,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index2 = log_root_tree->log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { + btrfs_wait_marked_extents(log, &log->dirty_log_pages); wait_log_commit(trans, log_root_tree, log_root_tree->log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2067,6 +2071,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * check the full commit flag again */ if (root->fs_info->last_trans_log_full_commit == trans->transid) { + btrfs_wait_marked_extents(log, &log->dirty_log_pages); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out_wake_log_root; @@ -2075,6 +2080,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_marked_extents(log_root_tree, &log_root_tree->dirty_log_pages); BUG_ON(ret); + btrfs_wait_marked_extents(log, &log->dirty_log_pages); btrfs_set_super_log_root(&root->fs_info->super_for_commit, log_root_tree->node->start); -- cgit v1.2.3 From 0eda294dfc980c1cbe4f8a0564bf543f86a01ddb Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 13 Oct 2009 13:50:18 -0400 Subject: Btrfs: fix btrfs acl #ifdef checks The btrfs acl code was #ifdefing for a define that didn't exist. This correctly matches it to the values used by the Kconfig file. Signed-off-by: Chris Mason --- fs/btrfs/acl.c | 6 +++--- fs/btrfs/ctree.h | 2 +- fs/btrfs/super.c | 2 +- fs/btrfs/xattr.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 69b355ae7f49..361604244271 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -27,7 +27,7 @@ #include "btrfs_inode.h" #include "xattr.h" -#ifdef CONFIG_BTRFS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) { @@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = { .set = btrfs_xattr_acl_access_set, }; -#else /* CONFIG_BTRFS_POSIX_ACL */ +#else /* CONFIG_BTRFS_FS_POSIX_ACL */ int btrfs_acl_chmod(struct inode *inode) { @@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) return 0; } -#endif /* CONFIG_BTRFS_POSIX_ACL */ +#endif /* CONFIG_BTRFS_FS_POSIX_ACL */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index d0cede5ff25e..dfd7e6fc66d3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2374,7 +2374,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); /* acl.c */ -#ifdef CONFIG_BTRFS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL int btrfs_check_acl(struct inode *inode, int mask); #else #define btrfs_check_acl NULL diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e0a64328080c..3fbbf0761ac2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -344,7 +344,7 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_export_op = &btrfs_export_ops; sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; -#ifdef CONFIG_BTRFS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL sb->s_flags |= MS_POSIXACL; #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index b0fc93f95fd0..b6dd5967c48a 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -260,7 +260,7 @@ err: * attributes are handled directly. */ struct xattr_handler *btrfs_xattr_handlers[] = { -#ifdef CONFIG_BTRFS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL &btrfs_xattr_acl_access_handler, &btrfs_xattr_acl_default_handler, #endif -- cgit v1.2.3 From 5d5e103a70f74ae98e3965a4add1ab951d0651d1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 13 Oct 2009 16:46:49 -0400 Subject: Btrfs: fix possible ENOSPC problems with truncate There's a problem where we don't do any space reservation for truncates, which can cause you to OOPs because you will be allowed to go off in the weeds a bit since we don't account for the delalloc bytes that are created as a result of the truncate. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5b9567caba0a..78139efe41fc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3032,12 +3032,22 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) if ((offset & (blocksize - 1)) == 0) goto out; + ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); + if (ret) + goto out; + + ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + if (ret) + goto out; ret = -ENOMEM; again: page = grab_cache_page(mapping, index); - if (!page) + if (!page) { + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); goto out; + } page_start = page_offset(page); page_end = page_start + PAGE_CACHE_SIZE - 1; @@ -3070,6 +3080,10 @@ again: goto again; } + clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, + GFP_NOFS); + ret = btrfs_set_extent_delalloc(inode, page_start, page_end); if (ret) { unlock_extent(io_tree, page_start, page_end, GFP_NOFS); @@ -3088,6 +3102,9 @@ again: unlock_extent(io_tree, page_start, page_end, GFP_NOFS); out_unlock: + if (ret) + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); unlock_page(page); page_cache_release(page); out: @@ -3111,7 +3128,9 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) if (size <= hole_start) return 0; - btrfs_truncate_page(inode->i_mapping, inode->i_size); + err = btrfs_truncate_page(inode->i_mapping, inode->i_size); + if (err) + return err; while (1) { struct btrfs_ordered_extent *ordered; @@ -5008,7 +5027,9 @@ static void btrfs_truncate(struct inode *inode) if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return; - btrfs_truncate_page(inode->i_mapping, inode->i_size); + ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); + if (ret) + return; btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); trans = btrfs_start_transaction(root, 1); -- cgit v1.2.3 From 86df7eb921a009515285e7171363fa57dd2d7d31 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 14 Oct 2009 09:24:59 -0400 Subject: Btrfs: properly wait log writers during log sync A recently fsync optimization make btrfs_sync_log skip calling wait_for_writer in the single log writer case. This is incorrect since the writer count can also be increased by btrfs_pin_log. Signed-off-by: Yan Zheng Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4aff766d171a..f51bf13125c0 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1995,12 +1995,13 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (atomic_read(&root->log_commit[(index1 + 1) % 2])) wait_log_commit(trans, root, root->log_transid - 1); - while (root->log_multiple_pids) { + while (1) { unsigned long batch = root->log_batch; - mutex_unlock(&root->log_mutex); - schedule_timeout_uninterruptible(1); - mutex_lock(&root->log_mutex); - + if (root->log_multiple_pids) { + mutex_unlock(&root->log_mutex); + schedule_timeout_uninterruptible(1); + mutex_lock(&root->log_mutex); + } wait_for_writer(trans, root); if (batch == root->log_batch) break; -- cgit v1.2.3 From e244a0aeb6a599c19a7c802cda6e2d67c847b154 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Oct 2009 09:24:59 -0400 Subject: Btrfs: add -o discard option Enable discard by default is not a good idea given the the trim speed of SSD prototypes we've seen, and the carecteristics for many high-end arrays. Turn of discards by default and require the -o discard option to enable them on. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 + fs/btrfs/extent-tree.c | 3 +++ fs/btrfs/super.c | 7 ++++++- 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index dfd7e6fc66d3..e5dd628a526f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1153,6 +1153,7 @@ struct btrfs_root { #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) #define BTRFS_MOUNT_SSD_SPREAD (1 << 8) #define BTRFS_MOUNT_NOSSD (1 << 9) +#define BTRFS_MOUNT_DISCARD (1 << 10) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4aedbff36b8f..bf7782fc5939 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1584,6 +1584,9 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, u64 map_length = num_bytes; struct btrfs_multi_bio *multi = NULL; + if (!btrfs_test_opt(root, DISCARD)) + return 0; + /* Tell the block device(s) that the sectors can be discarded */ ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, bytenr, &map_length, &multi, 0); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3fbbf0761ac2..939b68f0612d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -66,7 +66,8 @@ enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, - Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err, + Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, + Opt_discard, Opt_err, }; static match_table_t tokens = { @@ -88,6 +89,7 @@ static match_table_t tokens = { {Opt_notreelog, "notreelog"}, {Opt_flushoncommit, "flushoncommit"}, {Opt_ratio, "metadata_ratio=%d"}, + {Opt_discard, "discard"}, {Opt_err, NULL}, }; @@ -257,6 +259,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->metadata_ratio); } break; + case Opt_discard: + btrfs_set_opt(info->mount_opt, DISCARD); + break; default: break; } -- cgit v1.2.3 From 0634857488ec6e28fa22920cd0bee3c2ac07ccfd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Oct 2009 09:24:59 -0400 Subject: Btrfs: enable discard support The discard support code in btrfs currently is guarded by ifdefs for BIO_RW_DISCARD, which is never defines as it's the name of an enum memeber. Just remove the useless ifdefs to actually enable the code. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index bf7782fc5939..d4a35283d99e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1568,18 +1568,15 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, return ret; } -#ifdef BIO_RW_DISCARD static void btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len) { blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); } -#endif static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, u64 num_bytes) { -#ifdef BIO_RW_DISCARD int ret; u64 map_length = num_bytes; struct btrfs_multi_bio *multi = NULL; @@ -1606,9 +1603,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, } return ret; -#else - return 0; -#endif } int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, -- cgit v1.2.3 From 444528b3e614f7f2391488d9bca8e0b872db909b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 14 Oct 2009 09:38:28 -0400 Subject: Btrfs: always pin metadata in discard mode We have an optimization in btrfs to allow blocks to be immediately freed if they were allocated in this transaction and never written. Otherwise they are pinned and freed when the transaction commits. This isn't optimal for discard mode because immediately freeing them means immediately discarding them. It is better to give the block to the pinning code and letting the (slow) discard happen later. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d4a35283d99e..c56f91639dc1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3686,6 +3686,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, if (is_data) goto pinit; + /* + * discard is sloooow, and so triggering discards on + * individual btree blocks isn't a good plan. Just + * pin everything in discard mode. + */ + if (btrfs_test_opt(root, DISCARD)) + goto pinit; + buf = btrfs_find_tree_block(root, bytenr, num_bytes); if (!buf) goto pinit; -- cgit v1.2.3 From a6a8357788d6a37f8ad0f7eb46b0a386b613abb9 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Tue, 6 Oct 2009 15:33:35 +0200 Subject: sysfs: Allow sysfs_move_dir(..., NULL) again. As device_move() and kobject_move() both handle a NULL destination, sysfs_move_dir() should do this as well (again) and fall back to sysfs_root in that case. Signed-off-by: Cornelia Huck Cc: Phil Carmody Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 0050fc40e8c9..5fad489ce5bc 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -894,7 +894,8 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) mutex_lock(&sysfs_rename_mutex); BUG_ON(!sd->s_parent); - new_parent_sd = new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root; + new_parent_sd = (new_parent_kobj && new_parent_kobj->sd) ? + new_parent_kobj->sd : &sysfs_root; error = 0; if (sd->s_parent == new_parent_sd) -- cgit v1.2.3 From 83db93f4de2d9ae441a491d1dc61c2204f0195de Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Tue, 15 Sep 2009 16:05:51 -0700 Subject: sysfs: Allow sysfs_notify_dirent to be called from interrupt context. sysfs_notify_dirent is a simple atomic operation that can be used to alert user-space that new data can be read from a sysfs attribute. Unfortunately it cannot currently be called from non-process context because of its use of spin_lock which is sometimes taken with interrupts enabled. So change all lockers of sysfs_open_dirent_lock to disable interrupts, thus making sysfs_notify_dirent safe to be called from non-process context (as drivers/md does in md_safemode_timeout). sysfs_get_open_dirent is (documented as being) only called from process context, so it uses spin_lock_irq. Other places use spin_lock_irqsave. The usage for sysfs_notify_dirent in md_safemode_timeout was introduced in 2.6.28, so this patch is suitable for that and more recent kernels. Reported-by: Joel Andres Granados Signed-off-by: NeilBrown Signed-off-by: Dan Williams Cc: stable Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 561a9c050cef..f5ea4680f15f 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -268,7 +268,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd, struct sysfs_open_dirent *od, *new_od = NULL; retry: - spin_lock(&sysfs_open_dirent_lock); + spin_lock_irq(&sysfs_open_dirent_lock); if (!sd->s_attr.open && new_od) { sd->s_attr.open = new_od; @@ -281,7 +281,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd, list_add_tail(&buffer->list, &od->buffers); } - spin_unlock(&sysfs_open_dirent_lock); + spin_unlock_irq(&sysfs_open_dirent_lock); if (od) { kfree(new_od); @@ -315,8 +315,9 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd, struct sysfs_buffer *buffer) { struct sysfs_open_dirent *od = sd->s_attr.open; + unsigned long flags; - spin_lock(&sysfs_open_dirent_lock); + spin_lock_irqsave(&sysfs_open_dirent_lock, flags); list_del(&buffer->list); if (atomic_dec_and_test(&od->refcnt)) @@ -324,7 +325,7 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd, else od = NULL; - spin_unlock(&sysfs_open_dirent_lock); + spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); kfree(od); } @@ -456,8 +457,9 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait) void sysfs_notify_dirent(struct sysfs_dirent *sd) { struct sysfs_open_dirent *od; + unsigned long flags; - spin_lock(&sysfs_open_dirent_lock); + spin_lock_irqsave(&sysfs_open_dirent_lock, flags); od = sd->s_attr.open; if (od) { @@ -465,7 +467,7 @@ void sysfs_notify_dirent(struct sysfs_dirent *sd) wake_up_interruptible(&od->poll); } - spin_unlock(&sysfs_open_dirent_lock); + spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); } EXPORT_SYMBOL_GPL(sysfs_notify_dirent); -- cgit v1.2.3