From 0378da0fda6edf5aaffda6f1248a78986bd955b5 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 12 Aug 2010 10:25:28 +0800 Subject: ocfs2: pass struct file* to ocfs2_write_begin_nolock. struct file * has file_ra_state to store the readahead state and data. So pass this to ocfs2_write_begin_nolock so that it can be used in ocfs2_refcount_cow. Signed-off-by: Tao Ma --- fs/ocfs2/aops.c | 5 +++-- fs/ocfs2/aops.h | 3 ++- fs/ocfs2/mmap.c | 7 ++++--- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 0de69c9a08be..e3efdd93773f 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1642,7 +1642,8 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, return ret; } -int ocfs2_write_begin_nolock(struct address_space *mapping, +int ocfs2_write_begin_nolock(struct file *filp, + struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page) @@ -1854,7 +1855,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping, */ down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, + ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep, fsdata, di_bh, NULL); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index c48e93ffc513..7606f663da6d 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -48,7 +48,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); -int ocfs2_write_begin_nolock(struct address_space *mapping, +int ocfs2_write_begin_nolock(struct file *filp, + struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page); diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index af2b8fe1f139..b04d6961c0d4 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -59,10 +59,11 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) return ret; } -static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, +static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, struct page *page) { int ret; + struct inode *inode = file->f_path.dentry->d_inode; struct address_space *mapping = inode->i_mapping; loff_t pos = page_offset(page); unsigned int len = PAGE_CACHE_SIZE; @@ -109,7 +110,7 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, if (page->index == last_index) len = size & ~PAGE_CACHE_MASK; - ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, + ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page, &fsdata, di_bh, page); if (ret) { if (ret != -ENOSPC) @@ -157,7 +158,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) */ down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = __ocfs2_page_mkwrite(inode, di_bh, page); + ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page); up_write(&OCFS2_I(inode)->ip_alloc_sem); -- cgit v1.2.3 From b890823635e022467b924e3d9da8c5166a4e349c Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 12 Aug 2010 10:27:14 +0800 Subject: ocfs2: pass struct file* to ocfs2_prepare_inode_for_write. struct file * has file_ra_state to store the readahead state and data. So pass this to ocfs2_prepare_inode_for_write. so that it can be used in ocfs2_refcount_cow. Signed-off-by: Tao Ma --- fs/ocfs2/file.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 81296b4e3646..5920159a421f 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2053,6 +2053,7 @@ out: } static int ocfs2_prepare_inode_for_refcount(struct inode *inode, + struct file *file, loff_t pos, size_t count, int *meta_level) { @@ -2078,7 +2079,7 @@ out: return ret; } -static int ocfs2_prepare_inode_for_write(struct dentry *dentry, +static int ocfs2_prepare_inode_for_write(struct file *file, loff_t *ppos, size_t count, int appending, @@ -2086,6 +2087,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, int *has_refcount) { int ret = 0, meta_level = 0; + struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; loff_t saved_pos, end; @@ -2141,6 +2143,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, meta_level = -1; ret = ocfs2_prepare_inode_for_refcount(inode, + file, saved_pos, count, &meta_level); @@ -2255,7 +2258,7 @@ relock: } can_do_direct = direct_io; - ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, + ret = ocfs2_prepare_inode_for_write(file, ppos, iocb->ki_left, appending, &can_do_direct, &has_refcount); if (ret < 0) { @@ -2385,7 +2388,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, { int ret; - ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, + ret = ocfs2_prepare_inode_for_write(out, &sd->pos, sd->total_len, 0, NULL, NULL); if (ret < 0) { mlog_errno(ret); -- cgit v1.2.3 From 155027121fe52f9b4f25e9d156c22f2f5012e5fe Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 12 Aug 2010 10:36:38 +0800 Subject: ocfs2: Add struct file to ocfs2_refcount_cow. Add a new parameter 'struct file *' to ocfs2_refcount_cow so that we can add readahead support later. Signed-off-by: Tao Ma --- fs/ocfs2/aops.c | 2 +- fs/ocfs2/file.c | 8 ++++---- fs/ocfs2/refcounttree.c | 4 +++- fs/ocfs2/refcounttree.h | 3 ++- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index e3efdd93773f..7155c5a919d7 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1693,7 +1693,7 @@ int ocfs2_write_begin_nolock(struct file *filp, mlog_errno(ret); goto out; } else if (ret == 1) { - ret = ocfs2_refcount_cow(inode, di_bh, + ret = ocfs2_refcount_cow(inode, filp, di_bh, wc->w_cpos, wc->w_clen, UINT_MAX); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 5920159a421f..4331f57e9fde 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -361,7 +361,7 @@ static int ocfs2_cow_file_pos(struct inode *inode, if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) goto out; - return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1); + return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1); out: return status; @@ -904,8 +904,8 @@ static int ocfs2_zero_extend_get_range(struct inode *inode, zero_clusters = last_cpos - zero_cpos; if (needs_cow) { - rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters, - UINT_MAX); + rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos, + zero_clusters, UINT_MAX); if (rc) { mlog_errno(rc); goto out; @@ -2071,7 +2071,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode, *meta_level = 1; - ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); + ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX); if (ret) mlog_errno(ret); out: diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 3ac5aa733e9c..66dab3c0d495 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3404,6 +3404,7 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) * unrefcounted extent. */ static int ocfs2_refcount_cow_hunk(struct inode *inode, + struct file *file, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos) { @@ -3481,6 +3482,7 @@ out: * clusters between cpos and cpos+write_len are safe to modify. */ int ocfs2_refcount_cow(struct inode *inode, + struct file *file, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos) { @@ -3500,7 +3502,7 @@ int ocfs2_refcount_cow(struct inode *inode, num_clusters = write_len; if (ext_flags & OCFS2_EXT_REFCOUNTED) { - ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos, + ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos, num_clusters, max_cpos); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 9983ba1570e2..29cba0eaa927 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -52,7 +52,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, u32 clusters, int *credits, int *ref_blocks); -int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, +int ocfs2_refcount_cow(struct inode *inode, + struct file *filep, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos); typedef int (ocfs2_post_refcount_func)(struct inode *inode, -- cgit v1.2.3 From 7b61cf54a2445ad21df9dd44f0c8bb90154ddea8 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 17 Jun 2010 14:14:36 +0800 Subject: ocfs2: Add readahead support for CoW. Add a new function ocfs2_readahead_for_cow so that we start readahead before we start our CoW. Signed-off-by: Tao Ma --- fs/ocfs2/refcounttree.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 66dab3c0d495..ad08c1134baa 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3398,6 +3398,28 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) return ret; } +static void ocfs2_readahead_for_cow(struct inode *inode, + struct file *file, + u32 start, u32 len) +{ + struct address_space *mapping; + pgoff_t index; + unsigned long num_pages; + int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits; + + if (!file) + return; + + mapping = file->f_mapping; + num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT; + if (!num_pages) + num_pages = 1; + + index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT; + page_cache_sync_readahead(mapping, &file->f_ra, file, + index, num_pages); +} + /* * Starting at cpos, try to CoW write_len clusters. Don't CoW * past max_cpos. This will stop when it runs into a hole or an @@ -3433,6 +3455,8 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, BUG_ON(cow_len == 0); + ocfs2_readahead_for_cow(inode, file, cow_start, cow_len); + context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); if (!context) { ret = -ENOMEM; -- cgit v1.2.3 From 6ea4843f53282465f2bdbe5eedde7d8c3081dfdf Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 12 Aug 2010 10:31:34 +0800 Subject: ocfs2: Add readhead during CoW. In CoW, when we meet with a readahead page, we know it is time to move the readahead window. So carry out a new readahead. Signed-off-by: Tao Ma --- fs/ocfs2/refcounttree.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index ad08c1134baa..47549f64224c 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -49,6 +49,7 @@ struct ocfs2_cow_context { struct inode *inode; + struct file *file; u32 cow_start; u32 cow_len; struct ocfs2_extent_tree data_et; @@ -2922,13 +2923,16 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); struct page *page; pgoff_t page_index; - unsigned int from, to; + unsigned int from, to, readahead_pages; loff_t offset, end, map_end; struct address_space *mapping = context->inode->i_mapping; mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster, new_cluster, new_len, cpos); + readahead_pages = + (ocfs2_cow_contig_clusters(sb) << + OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT; offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); /* @@ -2959,6 +2963,14 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) BUG_ON(PageDirty(page)); + if (PageReadahead(page) && context->file) { + page_cache_async_readahead(mapping, + &context->file->f_ra, + context->file, + page, page_index, + readahead_pages); + } + if (!PageUptodate(page)) { ret = block_read_full_page(page, ocfs2_get_block); if (ret) { @@ -3478,6 +3490,7 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, context->ref_root_bh = ref_root_bh; context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; context->get_clusters = ocfs2_di_get_clusters; + context->file = file; ocfs2_init_dinode_extent_tree(&context->data_et, INODE_CACHE(inode), di_bh); -- cgit v1.2.3 From ddee5cdb70e6f87de2fc696b87bd7bd184a51eb8 Mon Sep 17 00:00:00 2001 From: Tristan Ye Date: Sat, 22 May 2010 16:26:33 +0800 Subject: Ocfs2: Add new OCFS2_IOC_INFO ioctl for ocfs2 v8. The reason why we need this ioctl is to offer the none-privileged end-user a possibility to get filesys info gathering. We use OCFS2_IOC_INFO to manipulate the new ioctl, userspace passes a structure to kernel containing an array of request pointers and request count, such as, * From userspace: struct ocfs2_info_blocksize oib = { .ib_req = { .ir_magic = OCFS2_INFO_MAGIC, .ir_code = OCFS2_INFO_BLOCKSIZE, ... } ... } struct ocfs2_info_clustersize oic = { ... } uint64_t reqs[2] = {(unsigned long)&oib, (unsigned long)&oic}; struct ocfs2_info info = { .oi_requests = reqs, .oi_count = 2, } ret = ioctl(fd, OCFS2_IOC_INFO, &info); * In kernel: Get the request pointers from *info*, then handle each request one bye one. Idea here is to make the spearated request small enough to guarantee a better backward&forward compatibility since a small piece of request would be less likely to be broken if filesys on raw disk get changed. Currently, the following 7 requests are supported per the requirement from userspace tool o2info, and I believe it will grow over time:-) OCFS2_INFO_CLUSTERSIZE OCFS2_INFO_BLOCKSIZE OCFS2_INFO_MAXSLOTS OCFS2_INFO_LABEL OCFS2_INFO_UUID OCFS2_INFO_FS_FEATURES OCFS2_INFO_JOURNAL_SIZE This ioctl is only specific to OCFS2. Signed-off-by: Tristan Ye Signed-off-by: Joel Becker --- fs/ocfs2/ioctl.c | 356 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/ocfs2_ioctl.h | 95 +++++++++++++ 2 files changed, 451 insertions(+) diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 7d9d9c132cef..7a4868196152 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -26,6 +26,26 @@ #include +#define o2info_from_user(a, b) \ + copy_from_user(&(a), (b), sizeof(a)) +#define o2info_to_user(a, b) \ + copy_to_user((typeof(a) __user *)b, &(a), sizeof(a)) + +/* + * This call is void because we are already reporting an error that may + * be -EFAULT. The error will be returned from the ioctl(2) call. It's + * just a best-effort to tell userspace that this request caused the error. + */ +static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq, + struct ocfs2_info_request __user *req) +{ + kreq->ir_flags |= OCFS2_INFO_FL_ERROR; + (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags)); +} + +#define o2info_set_request_error(a, b) \ + __o2info_set_request_error((struct ocfs2_info_request *)&(a), b) + static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) { int status; @@ -109,6 +129,328 @@ bail: return status; } +int ocfs2_info_handle_blocksize(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_blocksize oib; + + if (o2info_from_user(oib, req)) + goto bail; + + oib.ib_blocksize = inode->i_sb->s_blocksize; + oib.ib_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oib, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oib, req); + + return status; +} + +int ocfs2_info_handle_clustersize(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_clustersize oic; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oic, req)) + goto bail; + + oic.ic_clustersize = osb->s_clustersize; + oic.ic_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oic, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oic, req); + + return status; +} + +int ocfs2_info_handle_maxslots(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_maxslots oim; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oim, req)) + goto bail; + + oim.im_max_slots = osb->max_slots; + oim.im_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oim, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oim, req); + + return status; +} + +int ocfs2_info_handle_label(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_label oil; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oil, req)) + goto bail; + + memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); + oil.il_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oil, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oil, req); + + return status; +} + +int ocfs2_info_handle_uuid(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_uuid oiu; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oiu, req)) + goto bail; + + memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); + oiu.iu_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oiu, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oiu, req); + + return status; +} + +int ocfs2_info_handle_fs_features(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_fs_features oif; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oif, req)) + goto bail; + + oif.if_compat_features = osb->s_feature_compat; + oif.if_incompat_features = osb->s_feature_incompat; + oif.if_ro_compat_features = osb->s_feature_ro_compat; + oif.if_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oif, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oif, req); + + return status; +} + +int ocfs2_info_handle_journal_size(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_journal_size oij; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oij, req)) + goto bail; + + oij.ij_journal_size = osb->journal->j_inode->i_size; + + oij.ij_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oij, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oij, req); + + return status; +} + +int ocfs2_info_handle_unknown(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_request oir; + + if (o2info_from_user(oir, req)) + goto bail; + + oir.ir_flags &= ~OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oir, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oir, req); + + return status; +} + +/* + * Validate and distinguish OCFS2_IOC_INFO requests. + * + * - validate the magic number. + * - distinguish different requests. + * - validate size of different requests. + */ +int ocfs2_info_handle_request(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_request oir; + + if (o2info_from_user(oir, req)) + goto bail; + + status = -EINVAL; + if (oir.ir_magic != OCFS2_INFO_MAGIC) + goto bail; + + switch (oir.ir_code) { + case OCFS2_INFO_BLOCKSIZE: + if (oir.ir_size == sizeof(struct ocfs2_info_blocksize)) + status = ocfs2_info_handle_blocksize(inode, req); + break; + case OCFS2_INFO_CLUSTERSIZE: + if (oir.ir_size == sizeof(struct ocfs2_info_clustersize)) + status = ocfs2_info_handle_clustersize(inode, req); + break; + case OCFS2_INFO_MAXSLOTS: + if (oir.ir_size == sizeof(struct ocfs2_info_maxslots)) + status = ocfs2_info_handle_maxslots(inode, req); + break; + case OCFS2_INFO_LABEL: + if (oir.ir_size == sizeof(struct ocfs2_info_label)) + status = ocfs2_info_handle_label(inode, req); + break; + case OCFS2_INFO_UUID: + if (oir.ir_size == sizeof(struct ocfs2_info_uuid)) + status = ocfs2_info_handle_uuid(inode, req); + break; + case OCFS2_INFO_FS_FEATURES: + if (oir.ir_size == sizeof(struct ocfs2_info_fs_features)) + status = ocfs2_info_handle_fs_features(inode, req); + break; + case OCFS2_INFO_JOURNAL_SIZE: + if (oir.ir_size == sizeof(struct ocfs2_info_journal_size)) + status = ocfs2_info_handle_journal_size(inode, req); + break; + default: + status = ocfs2_info_handle_unknown(inode, req); + break; + } + +bail: + return status; +} + +int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx, + u64 *req_addr, int compat_flag) +{ + int status = -EFAULT; + u64 __user *bp = NULL; + + if (compat_flag) { +#ifdef CONFIG_COMPAT + /* + * pointer bp stores the base address of a pointers array, + * which collects all addresses of separate request. + */ + bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests); +#else + BUG(); +#endif + } else + bp = (u64 __user *)(unsigned long)(info->oi_requests); + + if (o2info_from_user(*req_addr, bp + idx)) + goto bail; + + status = 0; +bail: + return status; +} + +/* + * OCFS2_IOC_INFO handles an array of requests passed from userspace. + * + * ocfs2_info_handle() recevies a large info aggregation, grab and + * validate the request count from header, then break it into small + * pieces, later specific handlers can handle them one by one. + * + * Idea here is to make each separate request small enough to ensure + * a better backward&forward compatibility, since a small piece of + * request will be less likely to be broken if disk layout get changed. + */ +int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, + int compat_flag) +{ + int i, status = 0; + u64 req_addr; + struct ocfs2_info_request __user *reqp; + + if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) || + (!info->oi_requests)) { + status = -EINVAL; + goto bail; + } + + for (i = 0; i < info->oi_count; i++) { + + status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag); + if (status) + break; + + reqp = (struct ocfs2_info_request *)(unsigned long)req_addr; + if (!reqp) { + status = -EINVAL; + goto bail; + } + + status = ocfs2_info_handle_request(inode, reqp); + if (status) + break; + } + +bail: + return status; +} + long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = filp->f_path.dentry->d_inode; @@ -120,6 +462,7 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) struct reflink_arguments args; const char *old_path, *new_path; bool preserve; + struct ocfs2_info info; switch (cmd) { case OCFS2_IOC_GETFLAGS: @@ -174,6 +517,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) preserve = (args.preserve != 0); return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); + case OCFS2_IOC_INFO: + if (copy_from_user(&info, (struct ocfs2_info __user *)arg, + sizeof(struct ocfs2_info))) + return -EFAULT; + + return ocfs2_info_handle(inode, &info, 0); default: return -ENOTTY; } @@ -185,6 +534,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) bool preserve; struct reflink_arguments args; struct inode *inode = file->f_path.dentry->d_inode; + struct ocfs2_info info; switch (cmd) { case OCFS2_IOC32_GETFLAGS: @@ -209,6 +559,12 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path), compat_ptr(args.new_path), preserve); + case OCFS2_IOC_INFO: + if (copy_from_user(&info, (struct ocfs2_info __user *)arg, + sizeof(struct ocfs2_info))) + return -EFAULT; + + return ocfs2_info_handle(inode, &info, 1); default: return -ENOIOCTLCMD; } diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h index 2d3420af1a83..9bc535499868 100644 --- a/fs/ocfs2/ocfs2_ioctl.h +++ b/fs/ocfs2/ocfs2_ioctl.h @@ -76,4 +76,99 @@ struct reflink_arguments { }; #define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments) +/* Following definitions dedicated for ocfs2_info_request ioctls. */ +#define OCFS2_INFO_MAX_REQUEST (50) +#define OCFS2_TEXT_UUID_LEN (OCFS2_VOL_UUID_LEN * 2) + +/* Magic number of all requests */ +#define OCFS2_INFO_MAGIC (0x4F32494E) + +/* + * Always try to separate info request into small pieces to + * guarantee the backward&forward compatibility. + */ +struct ocfs2_info { + __u64 oi_requests; /* Array of __u64 pointers to requests */ + __u32 oi_count; /* Number of requests in info_requests */ + __u32 oi_pad; +}; + +struct ocfs2_info_request { +/*00*/ __u32 ir_magic; /* Magic number */ + __u32 ir_code; /* Info request code */ + __u32 ir_size; /* Size of request */ + __u32 ir_flags; /* Request flags */ +/*10*/ /* Request specific fields */ +}; + +struct ocfs2_info_clustersize { + struct ocfs2_info_request ic_req; + __u32 ic_clustersize; + __u32 ic_pad; +}; + +struct ocfs2_info_blocksize { + struct ocfs2_info_request ib_req; + __u32 ib_blocksize; + __u32 ib_pad; +}; + +struct ocfs2_info_maxslots { + struct ocfs2_info_request im_req; + __u32 im_max_slots; + __u32 im_pad; +}; + +struct ocfs2_info_label { + struct ocfs2_info_request il_req; + __u8 il_label[OCFS2_MAX_VOL_LABEL_LEN]; +} __attribute__ ((packed)); + +struct ocfs2_info_uuid { + struct ocfs2_info_request iu_req; + __u8 iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1]; +} __attribute__ ((packed)); + +struct ocfs2_info_fs_features { + struct ocfs2_info_request if_req; + __u32 if_compat_features; + __u32 if_incompat_features; + __u32 if_ro_compat_features; + __u32 if_pad; +}; + +struct ocfs2_info_journal_size { + struct ocfs2_info_request ij_req; + __u64 ij_journal_size; +}; + +/* Codes for ocfs2_info_request */ +enum ocfs2_info_type { + OCFS2_INFO_CLUSTERSIZE = 1, + OCFS2_INFO_BLOCKSIZE, + OCFS2_INFO_MAXSLOTS, + OCFS2_INFO_LABEL, + OCFS2_INFO_UUID, + OCFS2_INFO_FS_FEATURES, + OCFS2_INFO_JOURNAL_SIZE, + OCFS2_INFO_NUM_TYPES +}; + +/* Flags for struct ocfs2_info_request */ +/* Filled by the caller */ +#define OCFS2_INFO_FL_NON_COHERENT (0x00000001) /* Cluster coherency not + required. This is a hint. + It is up to ocfs2 whether + the request can be fulfilled + without locking. */ +/* Filled by ocfs2 */ +#define OCFS2_INFO_FL_FILLED (0x40000000) /* Filesystem understood + this request and + filled in the answer */ + +#define OCFS2_INFO_FL_ERROR (0x80000000) /* Error happened during + request handling. */ + +#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info) + #endif /* OCFS2_IOCTL_H */ -- cgit v1.2.3 From 3c3f20c9813391ba4004764072989744395cf405 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 1 Jun 2010 13:58:13 +0800 Subject: ocfs2: Add some trace log for orphan scan. Now orphan scan worker has no trace log, so it is very hard to tell whether it is finished or blocked. So add 2 mlog trace log so that we can tell whether the current orphan scan worker is blocked or not. It does help when I analyzed a orphan scan bug. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/journal.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 9b57c0350ff9..04d41dfeab9a 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1888,6 +1888,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) os = &osb->osb_orphan_scan; + mlog(0, "Begin orphan scan\n"); + if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) goto out; @@ -1920,6 +1922,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) unlock: ocfs2_orphan_scan_unlock(osb, seqno); out: + mlog(0, "Orphan scan completed\n"); return; } -- cgit v1.2.3 From 95fa859a268fd7d9bae6f2d4d095e3f61100ac1b Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Wed, 9 Jun 2010 16:48:59 +0800 Subject: ocfs2: Remove obscure error handling in direct_write. In ocfs2, actually we don't allow any direct write pass i_size, see the function ocfs2_prepare_inode_for_write. So we don't need the bogus simple_setsize. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/file.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 9a03c151b5ce..4f9133f6368e 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2312,17 +2312,6 @@ relock: written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, ppos, count, ocount); if (written < 0) { - /* - * direct write may have instantiated a few - * blocks outside i_size. Trim these off again. - * Don't need i_size_read because we hold i_mutex. - * - * XXX(truncate): this looks buggy because ocfs2 did not - * actually implement ->truncate. Take a look at - * the new truncate sequence and update this accordingly - */ - if (*ppos + count > inode->i_size) - truncate_setsize(inode, inode->i_size); ret = written; goto out_dio; } -- cgit v1.2.3 From 83fd9c7f65634ac440a6b9b7a63ba562f213ac60 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 10 Jun 2010 17:21:36 -0500 Subject: Reorganize data elements to reduce struct sizes Thanks for the comments. I have incorportated them all. CONFIG_OCFS2_FS_STATS is enabled and CONFIG_DEBUG_LOCK_ALLOC is disabled. Statistics now look like - ocfs2_write_ctxt: 2144 - 2136 = 8 ocfs2_inode_info: 1960 - 1848 = 112 ocfs2_journal: 168 - 160 = 8 ocfs2_lock_res: 336 - 304 = 32 ocfs2_refcount_tree: 512 - 472 = 40 Signed-off-by: Goldwyn Rodrigues Signed-off-by: Joel Becker --- fs/ocfs2/aops.c | 2 +- fs/ocfs2/inode.h | 11 ++++------- fs/ocfs2/journal.h | 3 ++- fs/ocfs2/ocfs2.h | 23 +++++++++++++++-------- fs/ocfs2/refcounttree.h | 4 ++-- 5 files changed, 24 insertions(+), 19 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 0de69c9a08be..f477f18b35d5 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -883,8 +883,8 @@ struct ocfs2_write_ctxt { * out in so that future reads from that region will get * zero's. */ - struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; unsigned int w_num_pages; + struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; struct page *w_target_page; /* diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 6de5a869db30..0bc477a3aeb8 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -46,27 +46,24 @@ struct ocfs2_inode_info /* These fields are protected by ip_lock */ spinlock_t ip_lock; u32 ip_open_count; - u32 ip_clusters; struct list_head ip_io_markers; + u32 ip_clusters; + u16 ip_dyn_features; struct mutex ip_io_mutex; - u32 ip_flags; /* see below */ u32 ip_attr; /* inode attributes */ - u16 ip_dyn_features; /* protected by recovery_lock. */ struct inode *ip_next_orphan; - u32 ip_dir_start_lookup; - struct ocfs2_caching_info ip_metadata_cache; - struct ocfs2_extent_map ip_extent_map; - struct inode vfs_inode; struct jbd2_inode ip_jinode; + u32 ip_dir_start_lookup; + /* Only valid if the inode is the dir. */ u32 ip_last_used_slot; u64 ip_last_used_group; diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index b5baaa8e710f..43e56b97f9c0 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -67,11 +67,12 @@ struct ocfs2_journal { struct buffer_head *j_bh; /* Journal disk inode block */ atomic_t j_num_trans; /* Number of transactions * currently in the system. */ + spinlock_t j_lock; unsigned long j_trans_id; struct rw_semaphore j_trans_barrier; wait_queue_head_t j_checkpointed; - spinlock_t j_lock; + /* both fields protected by j_lock*/ struct list_head j_la_cleanups; struct work_struct j_recovery_work; }; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index c67003b6b5a2..65739b3b3276 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -150,26 +150,33 @@ typedef void (*ocfs2_lock_callback)(int status, unsigned long data); struct ocfs2_lock_res { void *l_priv; struct ocfs2_lock_res_ops *l_ops; - spinlock_t l_lock; + struct list_head l_blocked_list; struct list_head l_mask_waiters; - enum ocfs2_lock_type l_type; unsigned long l_flags; char l_name[OCFS2_LOCK_ID_MAX_LEN]; - int l_level; unsigned int l_ro_holders; unsigned int l_ex_holders; - struct ocfs2_dlm_lksb l_lksb; + unsigned char l_level; + + /* Data packed - type enum ocfs2_lock_type */ + unsigned char l_type; /* used from AST/BAST funcs. */ - enum ocfs2_ast_action l_action; - enum ocfs2_unlock_action l_unlock_action; - int l_requested; - int l_blocking; + /* Data packed - enum type ocfs2_ast_action */ + unsigned char l_action; + /* Data packed - enum type ocfs2_unlock_action */ + unsigned char l_unlock_action; + unsigned char l_requested; + unsigned char l_blocking; unsigned int l_pending_gen; + spinlock_t l_lock; + + struct ocfs2_dlm_lksb l_lksb; + wait_queue_head_t l_event; struct list_head l_debug_list; diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 9983ba1570e2..f04892d6175d 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -21,14 +21,14 @@ struct ocfs2_refcount_tree { struct rb_node rf_node; u64 rf_blkno; u32 rf_generation; + struct kref rf_getcnt; struct rw_semaphore rf_sem; struct ocfs2_lock_res rf_lockres; - struct kref rf_getcnt; int rf_removed; /* the following 4 fields are used by caching_info. */ - struct ocfs2_caching_info rf_ci; spinlock_t rf_lock; + struct ocfs2_caching_info rf_ci; struct mutex rf_io_mutex; struct super_block *rf_sb; }; -- cgit v1.2.3 From 4c38881f87c21ada5609a4065cb10e3e74da0d6e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 5 Aug 2010 20:32:46 +0200 Subject: ocfs2: Remove ocfs2_sync_inode() ocfs2_sync_inode() is used only from ocfs2_sync_file(). But all data has already been written before calling ocfs2_sync_file() and ocfs2 doesn't use inode's private_list for tracking metadata buffers thus sync_mapping_buffers() is superfluous as well. Signed-off-by: Jan Kara Acked-by: Mark Fasheh Signed-off-by: Joel Becker --- fs/ocfs2/file.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 4f9133f6368e..b03f6601fd71 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -64,12 +64,6 @@ #include "buffer_head_io.h" -static int ocfs2_sync_inode(struct inode *inode) -{ - filemap_fdatawrite(inode->i_mapping); - return sync_mapping_buffers(inode->i_mapping); -} - static int ocfs2_init_file_private(struct inode *inode, struct file *file) { struct ocfs2_file_private *fp; @@ -187,10 +181,6 @@ static int ocfs2_sync_file(struct file *file, int datasync) mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, dentry->d_name.len, dentry->d_name.name); - err = ocfs2_sync_inode(dentry->d_inode); - if (err) - goto bail; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { /* * We still have to flush drive's caches to get data to the -- cgit v1.2.3 From f9c57ada32ea3f2e12600cf274035fff063b2e0f Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 2 Aug 2010 11:02:13 +0800 Subject: ocfs2: Remove unused old_id in ocfs2_commit_cache. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/journal.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 04d41dfeab9a..5787802c2d6a 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -301,7 +301,6 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb) { int status = 0; unsigned int flushed; - unsigned long old_id; struct ocfs2_journal *journal = NULL; mlog_entry_void(); @@ -326,7 +325,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb) goto finally; } - old_id = ocfs2_inc_trans_id(journal); + ocfs2_inc_trans_id(journal); flushed = atomic_read(&journal->j_num_trans); atomic_set(&journal->j_num_trans, 0); -- cgit v1.2.3 From 17ae521158d6fe89cfdd00a9990aa40d02e81391 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 2 Aug 2010 11:02:14 +0800 Subject: ocfs2: Remove obsolete comments before ocfs2_start_trans. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/journal.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 5787802c2d6a..faa2303dbf0a 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -341,9 +341,6 @@ finally: return status; } -/* pass it NULL and it will allocate a new handle object for you. If - * you pass it a handle however, it may still return error, in which - * case it has free'd the passed handle for you. */ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) { journal_t *journal = osb->journal->j_journal; -- cgit v1.2.3 From 30ca22c70e3ef0a96ff84de69cd7e8561b416cb2 Mon Sep 17 00:00:00 2001 From: "Patrick J. LoPresti" Date: Thu, 22 Jul 2010 15:03:41 -0700 Subject: ext3/ext4: Factor out disk addressability check As part of adding support for OCFS2 to mount huge volumes, we need to check that the sector_t and page cache of the system are capable of addressing the entire volume. An identical check already appears in ext3 and ext4. This patch moves the addressability check into its own function in fs/libfs.c and modifies ext3 and ext4 to invoke it. [Edited to -EINVAL instead of BUG_ON() for bad blocksize_bits -- Joel] Signed-off-by: Patrick LoPresti Cc: linux-ext4@vger.kernel.org Acked-by: Andreas Dilger Signed-off-by: Joel Becker --- fs/ext3/super.c | 4 ++-- fs/ext4/super.c | 8 +++----- fs/libfs.c | 29 +++++++++++++++++++++++++++++ include/linux/fs.h | 2 ++ 4 files changed, 36 insertions(+), 7 deletions(-) diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 5dbf4dba03c4..a367dd044280 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1849,8 +1849,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount; } - if (le32_to_cpu(es->s_blocks_count) > - (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { + if (generic_check_addressable(sb->s_blocksize_bits, + le32_to_cpu(es->s_blocks_count))) { ext3_msg(sb, KERN_ERR, "error: filesystem is too large to mount safely"); if (sizeof(sector_t) < 8) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 26147746c272..7f47c366bf15 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2831,15 +2831,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * Test whether we have more sectors than will fit in sector_t, * and whether the max offset is addressable by the page cache. */ - if ((ext4_blocks_count(es) > - (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) || - (ext4_blocks_count(es) > - (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) { + ret = generic_check_addressable(sb->s_blocksize_bits, + ext4_blocks_count(es)); + if (ret) { ext4_msg(sb, KERN_ERR, "filesystem" " too large to mount safely on this system"); if (sizeof(sector_t) < 8) ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); - ret = -EFBIG; goto failed_mount; } diff --git a/fs/libfs.c b/fs/libfs.c index 0a9da95317f7..8debe7b33769 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -913,6 +913,35 @@ int generic_file_fsync(struct file *file, int datasync) } EXPORT_SYMBOL(generic_file_fsync); +/** + * generic_check_addressable - Check addressability of file system + * @blocksize_bits: log of file system block size + * @num_blocks: number of blocks in file system + * + * Determine whether a file system with @num_blocks blocks (and a + * block size of 2**@blocksize_bits) is addressable by the sector_t + * and page cache of the system. Return 0 if so and -EFBIG otherwise. + */ +int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) +{ + u64 last_fs_block = num_blocks - 1; + + if (unlikely(num_blocks == 0)) + return 0; + + if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT)) + return -EINVAL; + + if ((last_fs_block > + (sector_t)(~0ULL) >> (blocksize_bits - 9)) || + (last_fs_block > + (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - blocksize_bits))) { + return -EFBIG; + } + return 0; +} +EXPORT_SYMBOL(generic_check_addressable); + /* * No-op implementation of ->fsync for in-memory filesystems. */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 76041b614758..1a759f40ab9e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2374,6 +2374,8 @@ extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, extern int generic_file_fsync(struct file *, int); +extern int generic_check_addressable(unsigned, u64); + #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, struct page *, struct page *); -- cgit v1.2.3 From 1113e1b504f6e8d4364c0b73c9097828067d4617 Mon Sep 17 00:00:00 2001 From: "Patrick J. LoPresti" Date: Thu, 22 Jul 2010 15:04:16 -0700 Subject: JBD2: Allow feature checks before journal recovery Before we start accessing a huge (> 16 TiB) OCFS2 volume, we need to confirm that its journal supports 64-bit offsets. In particular, we need to check the journal's feature bits before recovering the journal. This is not possible with JBD2 at present, because the journal superblock (where the feature bits reside) is not loaded from disk until the journal is recovered. This patch loads the journal superblock in jbd2_journal_check_used_features() if it has not already been loaded, allowing us to check the feature bits before journal recovery. Signed-off-by: Patrick LoPresti Cc: linux-ext4@vger.kernel.org Acked-by: "Theodore Ts'o" Signed-off-by: Joel Becker --- fs/jbd2/journal.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0e8014ea6b94..262419f83d80 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1371,6 +1371,10 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat, if (!compat && !ro && !incompat) return 1; + /* Load journal superblock if it is not loaded yet. */ + if (journal->j_format_version == 0 && + journal_get_superblock(journal) != 0) + return 0; if (journal->j_format_version == 1) return 0; -- cgit v1.2.3 From 3bdb8efd94a73bb137e3315cd831cbc874052b4b Mon Sep 17 00:00:00 2001 From: "Patrick J. LoPresti" Date: Thu, 22 Jul 2010 15:05:57 -0700 Subject: OCFS2: Allow huge (> 16 TiB) volumes to mount The OCFS2 developers have already done all of the hard work to allow volumes larger than 16 TiB. But there is still a "sanity check" in fs/ocfs2/super.c that prevents the mounting of such volumes, even when the cluster size and journal options would allow it. This patch replaces that sanity check with a more sophisticated one to mount a huge volume provided that (a) it is addressable by the raw word/address size of the system (borrowing a test from ext4); (b) the volume is using JBD2; and (c) the JBD2_FEATURE_INCOMPAT_64BIT flag is set on the journal. I factored out the sanity check into its own function. I also moved it from ocfs2_initialize_super() down to ocfs2_check_volume(); any earlier, and the journal will not have been initialized yet. This patch is one of a pair, and it depends on the other ("JBD2: Allow feature checks before journal recovery"). I have tested this patch on small volumes, huge volumes, and huge volumes without 64-bit block support in the journal. All of them appear to work or to fail gracefully, as appropriate. Signed-off-by: Patrick LoPresti Signed-off-by: Joel Becker --- fs/ocfs2/super.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index fa1be1b304d1..47415398d56a 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1990,6 +1990,36 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu return 0; } +/* Make sure entire volume is addressable by our journal. Requires + osb_clusters_at_boot to be valid and for the journal to have been + initialized by ocfs2_journal_init(). */ +static int ocfs2_journal_addressable(struct ocfs2_super *osb) +{ + int status = 0; + u64 max_block = + ocfs2_clusters_to_blocks(osb->sb, + osb->osb_clusters_at_boot) - 1; + + /* 32-bit block number is always OK. */ + if (max_block <= (u32)~0ULL) + goto out; + + /* Volume is "huge", so see if our journal is new enough to + support it. */ + if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb, + OCFS2_FEATURE_COMPAT_JBD2_SB) && + jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_64BIT))) { + mlog(ML_ERROR, "The journal cannot address the entire volume. " + "Enable the 'block64' journal option with tunefs.ocfs2"); + status = -EFBIG; + goto out; + } + + out: + return status; +} + static int ocfs2_initialize_super(struct super_block *sb, struct buffer_head *bh, int sector_size, @@ -2002,6 +2032,7 @@ static int ocfs2_initialize_super(struct super_block *sb, struct ocfs2_journal *journal; __le32 uuid_net_key; struct ocfs2_super *osb; + u64 total_blocks; mlog_entry_void(); @@ -2214,11 +2245,15 @@ static int ocfs2_initialize_super(struct super_block *sb, goto bail; } - if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1) - > (u32)~0UL) { - mlog(ML_ERROR, "Volume might try to write to blocks beyond " - "what jbd can address in 32 bits.\n"); - status = -EINVAL; + total_blocks = ocfs2_clusters_to_blocks(osb->sb, + le32_to_cpu(di->i_clusters)); + + status = generic_check_addressable(osb->sb->s_blocksize_bits, + total_blocks); + if (status) { + mlog(ML_ERROR, "Volume too large " + "to mount safely on this system"); + status = -EFBIG; goto bail; } @@ -2380,6 +2415,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) goto finally; } + /* Now that journal has been initialized, check to make sure + entire volume is addressable. */ + status = ocfs2_journal_addressable(osb); + if (status) + goto finally; + /* If the journal was unmounted cleanly then we don't want to * recover anything. Otherwise, journal_load will do that * dirty work for us :) */ -- cgit v1.2.3 From a33f13efe05192e7a805018a2ce2b2afddd04057 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Mon, 16 Aug 2010 12:10:17 -0700 Subject: libfs: Fix shift bug in generic_check_addressable() generic_check_addressable() erroneously shifts pages down by a block factor when it should be shifting up. To prevent overflow, we shift blocks down to pages. Signed-off-by: Joel Becker --- fs/libfs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index 8debe7b33769..62baa0387d6e 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -925,6 +925,8 @@ EXPORT_SYMBOL(generic_file_fsync); int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) { u64 last_fs_block = num_blocks - 1; + u64 last_fs_page = + last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits); if (unlikely(num_blocks == 0)) return 0; @@ -932,10 +934,8 @@ int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT)) return -EINVAL; - if ((last_fs_block > - (sector_t)(~0ULL) >> (blocksize_bits - 9)) || - (last_fs_block > - (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - blocksize_bits))) { + if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) || + (last_fs_page > (pgoff_t)(~0ULL))) { return -EFBIG; } return 0; -- cgit v1.2.3 From b4d693fcc5fe99ed211addb5c6a0f8398f0b266e Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 16 Aug 2010 16:58:21 +0800 Subject: ocfs2: Cache system inodes of other slots. Durring orphan scan, if we are slot 0, and we are replaying orphan_dir:0001, the general process is that for every file in this dir: 1. we will iget orphan_dir:0001, since there is no inode for it. we will have to create an inode and read it from the disk. 2. do the normal work, such as delete_inode and remove it from the dir if it is allowed. 3. call iput orphan_dir:0001 when we are done. In this case, since we have no dcache for this inode, i_count will reach 0, and VFS will have to call clear_inode and in ocfs2_clear_inode we will checkpoint the inode which will let ocfs2_cmt and journald begin to work. 4. We loop back to 1 for the next file. So you see, actually for every deleted file, we have to read the orphan dir from the disk and checkpoint the journal. It is very time consuming and cause a lot of journal checkpoint I/O. A better solution is that we can have another reference for these inodes in ocfs2_super. So if there is no other race among nodes(which will let dlmglue to checkpoint the inode), for step 3, clear_inode won't be called and for step 1, we may only need to read the inode for the 1st time. This is a big win for us. So this patch will try to cache system inodes of other slots so that we will have one more reference for these inodes and avoid the extra inode read and journal checkpoint. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/ocfs2.h | 3 ++- fs/ocfs2/ocfs2_fs.h | 5 +++++ fs/ocfs2/super.c | 20 +++++++++++++++--- fs/ocfs2/sysfile.c | 60 +++++++++++++++++++++++++++++++++++++++++++---------- 4 files changed, 73 insertions(+), 15 deletions(-) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 65739b3b3276..687e291d73f2 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -284,7 +284,8 @@ struct ocfs2_super struct super_block *sb; struct inode *root_inode; struct inode *sys_root_inode; - struct inode *system_inodes[NUM_SYSTEM_INODES]; + struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES]; + struct inode **local_system_inodes; struct ocfs2_slot_info *slot_info; diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 33f1c9a8258d..723b20dac414 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -309,6 +309,7 @@ enum { USER_QUOTA_SYSTEM_INODE, GROUP_QUOTA_SYSTEM_INODE, #define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE +#define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE, EXTENT_ALLOC_SYSTEM_INODE, INODE_ALLOC_SYSTEM_INODE, @@ -317,8 +318,12 @@ enum { TRUNCATE_LOG_SYSTEM_INODE, LOCAL_USER_QUOTA_SYSTEM_INODE, LOCAL_GROUP_QUOTA_SYSTEM_INODE, +#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE NUM_SYSTEM_INODES }; +#define NUM_GLOBAL_SYSTEM_INODES OCFS2_LAST_GLOBAL_SYSTEM_INODE +#define NUM_LOCAL_SYSTEM_INODES \ + (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE) static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { /* Global system inodes (single copy) */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 47415398d56a..350e8b5a9396 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -514,11 +514,11 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb) mlog_entry_void(); - for (i = 0; i < NUM_SYSTEM_INODES; i++) { - inode = osb->system_inodes[i]; + for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) { + inode = osb->global_system_inodes[i]; if (inode) { iput(inode); - osb->system_inodes[i] = NULL; + osb->global_system_inodes[i] = NULL; } } @@ -534,6 +534,20 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb) osb->root_inode = NULL; } + if (!osb->local_system_inodes) + goto out; + + for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) { + if (osb->local_system_inodes[i]) { + iput(osb->local_system_inodes[i]); + osb->local_system_inodes[i] = NULL; + } + } + + kfree(osb->local_system_inodes); + osb->local_system_inodes = NULL; + +out: mlog_exit(0); } diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index bfe7190cdbf1..902efb23b6a6 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -44,11 +44,6 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, int type, u32 slot); -static inline int is_global_system_inode(int type); -static inline int is_in_system_inode_array(struct ocfs2_super *osb, - int type, - u32 slot); - #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES]; #endif @@ -59,11 +54,52 @@ static inline int is_global_system_inode(int type) type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; } -static inline int is_in_system_inode_array(struct ocfs2_super *osb, - int type, - u32 slot) +static struct inode **get_local_system_inode(struct ocfs2_super *osb, + int type, + u32 slot) { - return slot == osb->slot_num || is_global_system_inode(type); + int index; + struct inode **local_system_inodes, **free = NULL; + + BUG_ON(slot == OCFS2_INVALID_SLOT); + BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE || + type > OCFS2_LAST_LOCAL_SYSTEM_INODE); + + spin_lock(&osb->osb_lock); + local_system_inodes = osb->local_system_inodes; + spin_unlock(&osb->osb_lock); + + if (unlikely(!local_system_inodes)) { + local_system_inodes = kzalloc(sizeof(struct inode *) * + NUM_LOCAL_SYSTEM_INODES * + osb->max_slots, + GFP_NOFS); + if (!local_system_inodes) { + mlog_errno(-ENOMEM); + /* + * return NULL here so that ocfs2_get_sytem_file_inodes + * will try to create an inode and use it. We will try + * to initialize local_system_inodes next time. + */ + return NULL; + } + + spin_lock(&osb->osb_lock); + if (osb->local_system_inodes) { + /* Someone has initialized it for us. */ + free = local_system_inodes; + local_system_inodes = osb->local_system_inodes; + } else + osb->local_system_inodes = local_system_inodes; + spin_unlock(&osb->osb_lock); + if (unlikely(free)) + kfree(free); + } + + index = (slot * NUM_LOCAL_SYSTEM_INODES) + + (type - OCFS2_FIRST_LOCAL_SYSTEM_INODE); + + return &local_system_inodes[index]; } struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, @@ -74,8 +110,10 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, struct inode **arr = NULL; /* avoid the lookup if cached in local system file array */ - if (is_in_system_inode_array(osb, type, slot)) - arr = &(osb->system_inodes[type]); + if (is_global_system_inode(type)) { + arr = &(osb->global_system_inodes[type]); + } else + arr = get_local_system_inode(osb, type, slot); if (arr && ((inode = *arr) != NULL)) { /* get a ref in addition to the array ref */ -- cgit v1.2.3 From 5e98d492406818e6a94c0ba54c61f59d40cefa4a Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Mon, 28 Jun 2010 10:04:32 -0500 Subject: Track negative entries v3 Track negative dentries by recording the generation number of the parent directory in d_fsdata. The generation number for the parent directory is recorded in the inode_info, which increments every time the lock on the directory is dropped. If the generation number of the parent directory and the negative dentry matches, there is no need to perform the revalidate, else a revalidate is forced. This improves performance in situations where nodes look for the same non-existent file multiple times. Thanks Mark for explaining the DLM sequence. Signed-off-by: Goldwyn Rodrigues Signed-off-by: Joel Becker --- fs/ocfs2/dcache.c | 33 +++++++++++++++++++++++++++++---- fs/ocfs2/dcache.h | 1 + fs/ocfs2/dlmglue.c | 8 ++++++++ fs/ocfs2/inode.c | 1 + fs/ocfs2/inode.h | 1 + fs/ocfs2/namei.c | 3 ++- 6 files changed, 42 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index b4957c7d9fe2..edaded48e7e9 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -40,6 +40,14 @@ #include "inode.h" #include "super.h" +void ocfs2_dentry_attach_gen(struct dentry *dentry) +{ + unsigned long gen = + OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen; + BUG_ON(dentry->d_inode); + dentry->d_fsdata = (void *)gen; +} + static int ocfs2_dentry_revalidate(struct dentry *dentry, struct nameidata *nd) @@ -51,11 +59,20 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, mlog_entry("(0x%p, '%.*s')\n", dentry, dentry->d_name.len, dentry->d_name.name); - /* Never trust a negative dentry - force a new lookup. */ + /* For a negative dentry - + * check the generation number of the parent and compare with the + * one stored in the inode. + */ if (inode == NULL) { - mlog(0, "negative dentry: %.*s\n", dentry->d_name.len, - dentry->d_name.name); - goto bail; + unsigned long gen = (unsigned long) dentry->d_fsdata; + unsigned long pgen = + OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen; + mlog(0, "negative dentry: %.*s parent gen: %lu " + "dentry gen: %lu\n", + dentry->d_name.len, dentry->d_name.name, pgen, gen); + if (gen != pgen) + goto bail; + goto valid; } BUG_ON(!osb); @@ -96,6 +113,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, goto bail; } +valid: ret = 1; bail: @@ -227,6 +245,12 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, if (!inode) return 0; + if (!dentry->d_inode && dentry->d_fsdata) { + /* Converting a negative dentry to positive + Clear dentry->d_fsdata */ + dentry->d_fsdata = dl = NULL; + } + if (dl) { mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno, " \"%.*s\": old parent: %llu, new: %llu\n", @@ -452,6 +476,7 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode) out: iput(inode); + ocfs2_dentry_attach_gen(dentry); } /* diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h index f5dd1789acf1..b79eff709958 100644 --- a/fs/ocfs2/dcache.h +++ b/fs/ocfs2/dcache.h @@ -64,5 +64,6 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target, struct inode *old_dir, struct inode *new_dir); extern spinlock_t dentry_attach_lock; +void ocfs2_dentry_attach_gen(struct dentry *dentry); #endif /* OCFS2_DCACHE_H */ diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 5e02a893f46e..e8d94d722ecb 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3635,10 +3635,18 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, { struct inode *inode; struct address_space *mapping; + struct ocfs2_inode_info *oi; inode = ocfs2_lock_res_inode(lockres); mapping = inode->i_mapping; + if (S_ISDIR(inode->i_mode)) { + oi = OCFS2_I(inode); + oi->ip_dir_lock_gen++; + mlog(0, "generation: %u\n", oi->ip_dir_lock_gen); + goto out; + } + if (!S_ISREG(inode->i_mode)) goto out; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index eece3e05d9d0..f935fd6600dd 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -335,6 +335,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, else inode->i_fop = &ocfs2_dops_no_plocks; i_size_write(inode, le64_to_cpu(fe->i_size)); + OCFS2_I(inode)->ip_dir_lock_gen = 1; break; case S_IFLNK: if (ocfs2_inode_is_fast_symlink(inode)) diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 0bc477a3aeb8..1c508b149b3a 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -67,6 +67,7 @@ struct ocfs2_inode_info /* Only valid if the inode is the dir. */ u32 ip_last_used_slot; u64 ip_last_used_group; + u32 ip_dir_lock_gen; struct ocfs2_alloc_reservation ip_la_data_resv; }; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index a00dda2e4f16..e7bde21149ae 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -171,7 +171,8 @@ bail_add: ret = ERR_PTR(status); goto bail_unlock; } - } + } else + ocfs2_dentry_attach_gen(dentry); bail_unlock: /* Don't drop the cluster lock until *after* the d_add -- -- cgit v1.2.3 From c0e1a3e80dc030cd54f06416efbf9e439bf6ecb7 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Wed, 15 Sep 2010 16:56:54 -0700 Subject: ocfs2: Silence unused warning. When CONFIG_OCFS2_DEBUG_MASKLOG is undefined, we don't use the dentry variable in ocfs2_sync_file(). Let's just move all access to the dentry inside the logging call. Signed-off-by: Joel Becker --- fs/ocfs2/file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 9a74542e1a05..13af9937bdda 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -174,12 +174,12 @@ static int ocfs2_sync_file(struct file *file, int datasync) { int err = 0; journal_t *journal; - struct dentry *dentry = file->f_path.dentry; struct inode *inode = file->f_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, - dentry->d_name.len, dentry->d_name.name); + mlog_entry("(0x%p, %d, 0x%p, '%.*s')\n", file, datasync, + file->f_path.dentry, file->f_path.dentry->d_name.len, + file->f_path.dentry->d_name.name); if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { /* -- cgit v1.2.3 From 93f3b86fb1bd0ad7b4a5eb1ad1fdae2b290633b7 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Wed, 15 Sep 2010 17:00:58 -0700 Subject: ocfs2: Initialize the bktcnt variable properly, and call it bucket_count Signed-off-by: Joel Becker --- fs/ocfs2/dlm/dlmdebug.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 5efdd37dfe48..26ff2e185b1e 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -493,7 +493,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) struct hlist_head *bucket; struct hlist_node *list; int i, out = 0; - unsigned long total = 0, longest = 0, bktcnt; + unsigned long total = 0, longest = 0, bucket_count = 0; out += snprintf(db->buf + out, db->len - out, "Dumping MLEs for Domain: %s\n", dlm->name); @@ -505,13 +505,13 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) mle = hlist_entry(list, struct dlm_master_list_entry, master_hash_node); ++total; - ++bktcnt; + ++bucket_count; if (db->len - out < 200) continue; out += dump_mle(mle, db->buf + out, db->len - out); } - longest = max(longest, bktcnt); - bktcnt = 0; + longest = max(longest, bucket_count); + bucket_count = 0; } spin_unlock(&dlm->master_lock); -- cgit v1.2.3 From 54b5187b5a1ad6573ade8b18e065dda92501fc52 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 7 Oct 2010 15:26:08 -0700 Subject: ocfs2/cluster: Add heartbeat mode configfs parameter Add heartbeat mode parameter to the configfs tree. This will be used to set/show the heartbeat mode. The user is free to toggle the mode between local and global as long as there is no active heartbeat region. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 72 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 41d5f1f92d56..4d36459a8343 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -77,7 +77,19 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type); #define O2HB_DEFAULT_BLOCK_BITS 9 +enum o2hb_heartbeat_modes { + O2HB_HEARTBEAT_LOCAL = 0, + O2HB_HEARTBEAT_GLOBAL, + O2HB_HEARTBEAT_NUM_MODES, +}; + +char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = { + "local", /* O2HB_HEARTBEAT_LOCAL */ + "global", /* O2HB_HEARTBEAT_GLOBAL */ +}; + unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; +unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; /* Only sets a new threshold if there are no active regions. * @@ -94,6 +106,22 @@ static void o2hb_dead_threshold_set(unsigned int threshold) } } +static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode) +{ + int ret = -1; + + if (hb_mode < O2HB_HEARTBEAT_NUM_MODES) { + spin_lock(&o2hb_live_lock); + if (list_empty(&o2hb_all_regions)) { + o2hb_heartbeat_mode = hb_mode; + ret = 0; + } + spin_unlock(&o2hb_live_lock); + } + + return ret; +} + struct o2hb_node_event { struct list_head hn_item; enum o2hb_callback_type hn_event_type; @@ -1688,6 +1716,41 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group return count; } +static +ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group, + char *page) +{ + return sprintf(page, "%s\n", + o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]); +} + +static +ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group, + const char *page, size_t count) +{ + unsigned int i; + int ret; + size_t len; + + len = (page[count - 1] == '\n') ? count - 1 : count; + if (!len) + return -EINVAL; + + for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) { + if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len)) + continue; + + ret = o2hb_global_hearbeat_mode_set(i); + if (!ret) + printk(KERN_NOTICE "ocfs2: Heartbeat mode set to %s\n", + o2hb_heartbeat_mode_desc[i]); + return count; + } + + return -EINVAL; + +} + static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = { .attr = { .ca_owner = THIS_MODULE, .ca_name = "dead_threshold", @@ -1696,8 +1759,17 @@ static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold .store = o2hb_heartbeat_group_threshold_store, }; +static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "mode", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_heartbeat_group_mode_show, + .store = o2hb_heartbeat_group_mode_store, +}; + static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { &o2hb_heartbeat_group_attr_threshold.attr, + &o2hb_heartbeat_group_attr_mode.attr, NULL, }; -- cgit v1.2.3 From 98f486f23bc5b6a6fa90e1a0707b7e9fe0e7f3e4 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Sat, 9 Oct 2010 10:24:46 -0700 Subject: ocfs2: Add an incompat feature flag OCFS2_FEATURE_INCOMPAT_CLUSTERINFO OCFS2_FEATURE_INCOMPAT_CLUSTERINFO allows us to use sb->s_cluster_info for both userspace and o2cb cluster stacks. It also allows us to extend cluster info to include stack flags. This patch also adds stackflags to sb->s_clusterinfo. It also introduces a clusterinfo flag OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT to denote the enabled global heartbeat mode. This incompat flag can be set/cleared using tunefs.ocfs2 --fs-features. The clusterinfo flag is set/cleared using tunefs.ocfs2 --update-cluster-stack. Signed-off-by: Sunil Mushran --- fs/ocfs2/ocfs2.h | 31 +++++++++++++++++++++++++++++-- fs/ocfs2/ocfs2_fs.h | 40 ++++++++++++++++++++++++++++++++++------ fs/ocfs2/super.c | 4 +++- 3 files changed, 66 insertions(+), 9 deletions(-) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index c67003b6b5a2..d5496a792bdb 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -368,6 +368,8 @@ struct ocfs2_super struct ocfs2_alloc_stats alloc_stats; char dev_str[20]; /* "major,minor" of the device */ + u8 osb_stackflags; + char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; struct ocfs2_cluster_connection *cconn; struct ocfs2_lock_res osb_super_lockres; @@ -601,10 +603,35 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb) return ret; } -static inline int ocfs2_userspace_stack(struct ocfs2_super *osb) +static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb) { return (osb->s_feature_incompat & - OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK); + (OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK | + OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)); +} + +static inline int ocfs2_userspace_stack(struct ocfs2_super *osb) +{ + if (ocfs2_clusterinfo_valid(osb) && + memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK, + OCFS2_STACK_LABEL_LEN)) + return 1; + return 0; +} + +static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb) +{ + if (ocfs2_clusterinfo_valid(osb) && + !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK, + OCFS2_STACK_LABEL_LEN)) + return 1; + return 0; +} + +static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb) +{ + return ocfs2_o2cb_stack(osb) && + (osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT); } static inline int ocfs2_mount_local(struct ocfs2_super *osb) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index fa31d05e41b7..d5b1d99abc3c 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -101,7 +101,8 @@ | OCFS2_FEATURE_INCOMPAT_META_ECC \ | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ - | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG) + | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \ + | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO) #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) @@ -169,6 +170,13 @@ /* Discontigous block groups */ #define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 +/* + * Incompat bit to indicate useable clusterinfo with stackflags for all + * cluster stacks (userspace adnd o2cb). If this bit is set, + * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set. + */ +#define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000 + /* * backup superblock flag is used to indicate that this volume * has backup superblocks. @@ -292,10 +300,13 @@ #define OCFS2_VOL_UUID_LEN 16 #define OCFS2_MAX_VOL_LABEL_LEN 64 -/* The alternate, userspace stack fields */ +/* The cluster stack fields */ #define OCFS2_STACK_LABEL_LEN 4 #define OCFS2_CLUSTER_NAME_LEN 16 +/* Classic (historically speaking) cluster stack */ +#define OCFS2_CLASSIC_CLUSTER_STACK "o2cb" + /* Journal limits (in bytes) */ #define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) @@ -305,6 +316,11 @@ */ #define OCFS2_MIN_XATTR_INLINE_SIZE 256 +/* + * Cluster info flags (ocfs2_cluster_info.ci_stackflags) + */ +#define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT (0x01) + struct ocfs2_system_inode_info { char *si_name; int si_iflags; @@ -566,9 +582,21 @@ struct ocfs2_slot_map_extended { */ }; +/* + * ci_stackflags is only valid if the incompat bit + * OCFS2_FEATURE_INCOMPAT_CLUSTERINFO is set. + */ struct ocfs2_cluster_info { /*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN]; - __le32 ci_reserved; + union { + __le32 ci_reserved; + struct { + __u8 ci_stackflags; + __u8 ci_reserved1; + __u8 ci_reserved2; + __u8 ci_reserved3; + }; + }; /*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN]; /*18*/ }; @@ -605,9 +633,9 @@ struct ocfs2_super_block { * group header */ /*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ /*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ -/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace - stack. Only valid - with INCOMPAT flag. */ +/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Only valid if either + userspace or clusterinfo + INCOMPAT flag set. */ /*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size for this fs*/ __le16 s_reserved0; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index fa1be1b304d1..755431739396 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2149,7 +2149,9 @@ static int ocfs2_initialize_super(struct super_block *sb, goto bail; } - if (ocfs2_userspace_stack(osb)) { + if (ocfs2_clusterinfo_valid(osb)) { + osb->osb_stackflags = + OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; memcpy(osb->osb_cluster_stack, OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, OCFS2_STACK_LABEL_LEN); -- cgit v1.2.3 From 2c442719e90a44a6982c033d69df4aae4b167cfa Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 7 Oct 2010 15:23:50 -0700 Subject: ocfs2: Add support for heartbeat=global mount option Adds support for heartbeat=global mount option. It ensures that the heartbeat mode passed matches the one enabled on disk. Signed-off-by: Sunil Mushran --- fs/ocfs2/ocfs2.h | 4 +++- fs/ocfs2/ocfs2_fs.h | 1 + fs/ocfs2/super.c | 55 +++++++++++++++++++++++++++++++++++++++-------------- 3 files changed, 45 insertions(+), 15 deletions(-) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index d5496a792bdb..481387b90b21 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -243,7 +243,7 @@ enum ocfs2_local_alloc_state enum ocfs2_mount_options { - OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */ + OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */ OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */ OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ @@ -256,6 +256,8 @@ enum ocfs2_mount_options control lists */ OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ + OCFS2_MOUNT_HB_NONE = 1 << 12, /* No heartbeat */ + OCFS2_MOUNT_HB_GLOBAL = 1 << 13, /* Global heartbeat */ }; #define OCFS2_OSB_SOFT_RO 0x0001 diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index d5b1d99abc3c..28ff536b4f8d 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -376,6 +376,7 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { /* Parameter passed from mount.ocfs2 to module */ #define OCFS2_HB_NONE "heartbeat=none" #define OCFS2_HB_LOCAL "heartbeat=local" +#define OCFS2_HB_GLOBAL "heartbeat=global" /* * OCFS2 directory file types. Only the low 3 bits are used. The diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 755431739396..4e009ad303a1 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -162,6 +162,7 @@ enum { Opt_nointr, Opt_hb_none, Opt_hb_local, + Opt_hb_global, Opt_data_ordered, Opt_data_writeback, Opt_atime_quantum, @@ -190,6 +191,7 @@ static const match_table_t tokens = { {Opt_nointr, "nointr"}, {Opt_hb_none, OCFS2_HB_NONE}, {Opt_hb_local, OCFS2_HB_LOCAL}, + {Opt_hb_global, OCFS2_HB_GLOBAL}, {Opt_data_ordered, "data=ordered"}, {Opt_data_writeback, "data=writeback"}, {Opt_atime_quantum, "atime_quantum=%u"}, @@ -608,6 +610,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) int ret = 0; struct mount_options parsed_options; struct ocfs2_super *osb = OCFS2_SB(sb); + u32 tmp; lock_kernel(); @@ -617,8 +620,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto out; } - if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) != - (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL)) { + tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | + OCFS2_MOUNT_HB_NONE; + if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { ret = -EINVAL; mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); goto out; @@ -809,23 +813,29 @@ bail: static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) { - if (ocfs2_mount_local(osb)) { - if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { + u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL; + + if (osb->s_mount_opt & hb_enabled) { + if (ocfs2_mount_local(osb)) { mlog(ML_ERROR, "Cannot heartbeat on a locally " "mounted device.\n"); return -EINVAL; } - } - - if (ocfs2_userspace_stack(osb)) { - if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { + if (ocfs2_userspace_stack(osb)) { mlog(ML_ERROR, "Userspace stack expected, but " "o2cb heartbeat arguments passed to mount\n"); return -EINVAL; } + if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) && + !ocfs2_cluster_o2cb_global_heartbeat(osb)) || + ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) && + ocfs2_cluster_o2cb_global_heartbeat(osb))) { + mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n"); + return -EINVAL; + } } - if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { + if (!(osb->s_mount_opt & hb_enabled)) { if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) && !ocfs2_userspace_stack(osb)) { mlog(ML_ERROR, "Heartbeat has to be started to mount " @@ -1291,6 +1301,7 @@ static int ocfs2_parse_options(struct super_block *sb, { int status; char *p; + u32 tmp; mlog_entry("remount: %d, options: \"%s\"\n", is_remount, options ? options : "(none)"); @@ -1322,7 +1333,10 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL; break; case Opt_hb_none: - mopt->mount_opt &= ~OCFS2_MOUNT_HB_LOCAL; + mopt->mount_opt |= OCFS2_MOUNT_HB_NONE; + break; + case Opt_hb_global: + mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL; break; case Opt_barrier: if (match_int(&args[0], &option)) { @@ -1477,6 +1491,15 @@ static int ocfs2_parse_options(struct super_block *sb, } } + /* Ensure only one heartbeat mode */ + tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | + OCFS2_MOUNT_HB_NONE); + if (hweight32(tmp) != 1) { + mlog(ML_ERROR, "Invalid heartbeat mount options\n"); + status = 0; + goto bail; + } + status = 1; bail: @@ -1490,10 +1513,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) unsigned long opts = osb->s_mount_opt; unsigned int local_alloc_megs; - if (opts & OCFS2_MOUNT_HB_LOCAL) - seq_printf(s, ",_netdev,heartbeat=local"); - else - seq_printf(s, ",heartbeat=none"); + if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) { + seq_printf(s, ",_netdev"); + if (opts & OCFS2_MOUNT_HB_LOCAL) + seq_printf(s, ",%s", OCFS2_HB_LOCAL); + else + seq_printf(s, ",%s", OCFS2_HB_GLOBAL); + } else + seq_printf(s, ",%s", OCFS2_HB_NONE); if (opts & OCFS2_MOUNT_NOINTR) seq_printf(s, ",nointr"); -- cgit v1.2.3 From b1365d0bd14b912cceb424cbeed9fe939a9038e3 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 6 Oct 2010 17:55:34 -0700 Subject: ocfs2/dlm: Expose dlm_protocol in dlm_state Add dlm_protocol to the list of info shown by the debugfs file, dlm_state. Signed-off-by: Sunil Mushran --- fs/ocfs2/dlm/dlmdebug.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 901ca52bf86b..f693ab812f3e 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -782,7 +782,9 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ out += snprintf(db->buf + out, db->len - out, - "Domain: %s Key: 0x%08x\n", dlm->name, dlm->key); + "Domain: %s Key: 0x%08x Protocol: %d.%d\n", + dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor); /* Thread Pid: xxx Node: xxx State: xxxxx */ out += snprintf(db->buf + out, db->len - out, -- cgit v1.2.3 From b3c85c4cdf77154acc940dd0f14d1fb99cbbaf75 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 7 Oct 2010 14:31:06 -0700 Subject: ocfs2/cluster: Get all heartbeat regions Export function in o2hb to get a list of heartbeat regions. It also adds an upper limit to the length of the heartbeat region name. o2hb_global_heartbeat_active() currently disables global heartbeat. It will be enabled in a later patch after all the code is added. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 34 ++++++++++++++++++++++++++++++++++ fs/ocfs2/cluster/heartbeat.h | 4 ++++ 2 files changed, 38 insertions(+) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 4d36459a8343..3415e58ff77b 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1623,6 +1623,9 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g if (reg == NULL) return ERR_PTR(-ENOMEM); + if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + config_item_init_type_name(®->hr_item, name, &o2hb_region_type); spin_lock(&o2hb_live_lock); @@ -2035,3 +2038,34 @@ void o2hb_stop_all_regions(void) spin_unlock(&o2hb_live_lock); } EXPORT_SYMBOL_GPL(o2hb_stop_all_regions); + +int o2hb_get_all_regions(char *region_uuids, u8 max_regions) +{ + struct o2hb_region *reg; + int numregs = 0; + char *p; + + spin_lock(&o2hb_live_lock); + + p = region_uuids; + list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + mlog(0, "Region: %s\n", config_item_name(®->hr_item)); + if (numregs < max_regions) { + memcpy(p, config_item_name(®->hr_item), + O2HB_MAX_REGION_NAME_LEN); + p += O2HB_MAX_REGION_NAME_LEN; + } + numregs++; + } + + spin_unlock(&o2hb_live_lock); + + return numregs; +} +EXPORT_SYMBOL_GPL(o2hb_get_all_regions); + +int o2hb_global_heartbeat_active(void) +{ + return 0; +} +EXPORT_SYMBOL(o2hb_global_heartbeat_active); diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index 2f1649253b49..00ad8e8fea51 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -31,6 +31,8 @@ #define O2HB_REGION_TIMEOUT_MS 2000 +#define O2HB_MAX_REGION_NAME_LEN 32 + /* number of changes to be seen as live */ #define O2HB_LIVE_THRESHOLD 2 /* number of equal samples to be seen as dead */ @@ -81,5 +83,7 @@ int o2hb_check_node_heartbeating(u8 node_num); int o2hb_check_node_heartbeating_from_callback(u8 node_num); int o2hb_check_local_node_heartbeating(void); void o2hb_stop_all_regions(void); +int o2hb_get_all_regions(char *region_uuids, u8 numregions); +int o2hb_global_heartbeat_active(void); #endif /* O2CLUSTER_HEARTBEAT_H */ -- cgit v1.2.3 From ea2034416b54700e30371f2ad6517cbb94674083 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Sat, 9 Oct 2010 10:26:23 -0700 Subject: ocfs2/dlm: Add message DLM_QUERY_REGION Adds new dlm message DLM_QUERY_REGION that sends the names of all active heartbeat regions. This message is only sent in the global heartbeat mode. If the regions in the joining node do not fully match the ones in the active nodes, the join domain request is rejected. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/ocfs2_nodemanager.h | 6 + fs/ocfs2/dlm/dlmcommon.h | 12 +- fs/ocfs2/dlm/dlmdomain.c | 218 +++++++++++++++++++++++++++++++++++ 3 files changed, 235 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h index 5b9854bad571..49b594325bec 100644 --- a/fs/ocfs2/cluster/ocfs2_nodemanager.h +++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h @@ -36,4 +36,10 @@ /* host name, group name, cluster name all 64 bytes */ #define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN +/* + * Maximum number of global heartbeat regions allowed. + * **CAUTION** Changing this number will break dlm compatibility. + */ +#define O2NM_MAX_REGIONS 32 + #endif /* _OCFS2_NODEMANAGER_H */ diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 765298908f1d..aa506d3e2ae6 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -445,7 +445,8 @@ enum { DLM_LOCK_REQUEST_MSG, /* 515 */ DLM_RECO_DATA_DONE_MSG, /* 516 */ DLM_BEGIN_RECO_MSG, /* 517 */ - DLM_FINALIZE_RECO_MSG /* 518 */ + DLM_FINALIZE_RECO_MSG, /* 518 */ + DLM_QUERY_REGION, /* 519 */ }; struct dlm_reco_node_data @@ -727,6 +728,15 @@ struct dlm_cancel_join u8 domain[O2NM_MAX_NAME_LEN]; }; +struct dlm_query_region { + u8 qr_node; + u8 qr_numregions; + u8 qr_namelen; + u8 pad1; + u8 qr_domain[O2NM_MAX_NAME_LEN]; + u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS]; +}; + struct dlm_exit_domain { u8 node_idx; diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 11a5c87fd7f7..49650756dfef 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -128,6 +128,9 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); * will have a negotiated version with the same major number and a minor * number equal or smaller. The dlm_ctxt->dlm_locking_proto field should * be used to determine what a running domain is actually using. + * + * New in version 1.1: + * - Message DLM_QUERY_REGION added to support global heartbeat */ static const struct dlm_protocol_version dlm_protocol = { .pv_major = 1, @@ -142,6 +145,8 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, void **ret_data); static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, void **ret_data); +static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, + void *data, void **ret_data); static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, void **ret_data); static int dlm_protocol_compare(struct dlm_protocol_version *existing, @@ -921,6 +926,203 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, return 0; } +static int dlm_match_regions(struct dlm_ctxt *dlm, + struct dlm_query_region *qr) +{ + char *local = NULL, *remote = qr->qr_regions; + char *l, *r; + int localnr, i, j, foundit; + int status = 0; + + if (!o2hb_global_heartbeat_active()) { + if (qr->qr_numregions) { + mlog(ML_ERROR, "Domain %s: Joining node %d has global " + "heartbeat enabled but local node %d does not\n", + qr->qr_domain, qr->qr_node, dlm->node_num); + status = -EINVAL; + } + goto bail; + } + + if (o2hb_global_heartbeat_active() && !qr->qr_numregions) { + mlog(ML_ERROR, "Domain %s: Local node %d has global " + "heartbeat enabled but joining node %d does not\n", + qr->qr_domain, dlm->node_num, qr->qr_node); + status = -EINVAL; + goto bail; + } + + r = remote; + for (i = 0; i < qr->qr_numregions; ++i) { + mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, r); + r += O2HB_MAX_REGION_NAME_LEN; + } + + local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL); + if (!local) { + status = -ENOMEM; + goto bail; + } + + localnr = o2hb_get_all_regions(local, O2NM_MAX_REGIONS); + + /* compare local regions with remote */ + l = local; + for (i = 0; i < localnr; ++i) { + foundit = 0; + r = remote; + for (j = 0; j <= qr->qr_numregions; ++j) { + if (!memcmp(l, r, O2HB_MAX_REGION_NAME_LEN)) { + foundit = 1; + break; + } + r += O2HB_MAX_REGION_NAME_LEN; + } + if (!foundit) { + status = -EINVAL; + mlog(ML_ERROR, "Domain %s: Region '%.*s' registered " + "in local node %d but not in joining node %d\n", + qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, l, + dlm->node_num, qr->qr_node); + goto bail; + } + l += O2HB_MAX_REGION_NAME_LEN; + } + + /* compare remote with local regions */ + r = remote; + for (i = 0; i < qr->qr_numregions; ++i) { + foundit = 0; + l = local; + for (j = 0; j < localnr; ++j) { + if (!memcmp(r, l, O2HB_MAX_REGION_NAME_LEN)) { + foundit = 1; + break; + } + l += O2HB_MAX_REGION_NAME_LEN; + } + if (!foundit) { + status = -EINVAL; + mlog(ML_ERROR, "Domain %s: Region '%.*s' registered " + "in joining node %d but not in local node %d\n", + qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, r, + qr->qr_node, dlm->node_num); + goto bail; + } + r += O2HB_MAX_REGION_NAME_LEN; + } + +bail: + kfree(local); + + return status; +} + +static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map) +{ + struct dlm_query_region *qr = NULL; + int status, ret = 0, i; + char *p; + + if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES) + goto bail; + + qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL); + if (!qr) { + ret = -ENOMEM; + mlog_errno(ret); + goto bail; + } + + qr->qr_node = dlm->node_num; + qr->qr_namelen = strlen(dlm->name); + memcpy(qr->qr_domain, dlm->name, qr->qr_namelen); + /* if local hb, the numregions will be zero */ + if (o2hb_global_heartbeat_active()) + qr->qr_numregions = o2hb_get_all_regions(qr->qr_regions, + O2NM_MAX_REGIONS); + + p = qr->qr_regions; + for (i = 0; i < qr->qr_numregions; ++i, p += O2HB_MAX_REGION_NAME_LEN) + mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, p); + + i = -1; + while ((i = find_next_bit(node_map, O2NM_MAX_NODES, + i + 1)) < O2NM_MAX_NODES) { + if (i == dlm->node_num) + continue; + + mlog(0, "Sending regions to node %d\n", i); + + ret = o2net_send_message(DLM_QUERY_REGION, DLM_MOD_KEY, qr, + sizeof(struct dlm_query_region), + i, &status); + if (ret >= 0) + ret = status; + if (ret) { + mlog(ML_ERROR, "Region mismatch %d, node %d\n", + ret, i); + break; + } + } + +bail: + kfree(qr); + return ret; +} + +static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, + void *data, void **ret_data) +{ + struct dlm_query_region *qr; + struct dlm_ctxt *dlm = NULL; + int status = 0; + int locked = 0; + + qr = (struct dlm_query_region *) msg->buf; + + mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node, + qr->qr_domain); + + status = -EINVAL; + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(qr->qr_domain, qr->qr_namelen); + if (!dlm) { + mlog(ML_ERROR, "Node %d queried hb regions on domain %s " + "before join domain\n", qr->qr_node, qr->qr_domain); + goto bail; + } + + spin_lock(&dlm->spinlock); + locked = 1; + if (dlm->joining_node != qr->qr_node) { + mlog(ML_ERROR, "Node %d queried hb regions on domain %s " + "but joining node is %d\n", qr->qr_node, qr->qr_domain, + dlm->joining_node); + goto bail; + } + + /* Support for global heartbeat was added in 1.1 */ + if (dlm->dlm_locking_proto.pv_major == 1 && + dlm->dlm_locking_proto.pv_minor == 0) { + mlog(ML_ERROR, "Node %d queried hb regions on domain %s " + "but active dlm protocol is %d.%d\n", qr->qr_node, + qr->qr_domain, dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor); + goto bail; + } + + status = dlm_match_regions(dlm, qr); + +bail: + if (locked) + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); + + return status; +} + static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, void **ret_data) { @@ -1241,6 +1443,15 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) set_bit(dlm->node_num, dlm->domain_map); spin_unlock(&dlm->spinlock); + /* Support for global heartbeat was added in 1.1 */ + if (dlm_protocol.pv_major > 1 || dlm_protocol.pv_minor > 0) { + status = dlm_send_regions(dlm, ctxt->yes_resp_map); + if (status) { + mlog_errno(status); + goto bail; + } + } + dlm_send_join_asserts(dlm, ctxt->yes_resp_map); /* Joined state *must* be set before the joining node @@ -1807,6 +2018,13 @@ static int dlm_register_net_handlers(void) sizeof(struct dlm_cancel_join), dlm_cancel_join_handler, NULL, NULL, &dlm_join_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_QUERY_REGION, DLM_MOD_KEY, + sizeof(struct dlm_query_region), + dlm_query_region_handler, + NULL, NULL, &dlm_join_handlers); bail: if (status < 0) -- cgit v1.2.3 From 5f3c6d9c615770708df464c170316f83cf437242 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 6 Oct 2010 17:55:29 -0700 Subject: ocfs2: Print message if user mounts without starting global heartbeat In global heartbeat mode, the heartbeat is started by the user. This patch prints an error if the user attempts to mount a volume without starting the heartbeat. Signed-off-by: Sunil Mushran --- fs/ocfs2/stack_o2cb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 0d3049f696c5..19965b00c43c 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -283,6 +283,8 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn) /* for now we only have one cluster/node, make sure we see it * in the heartbeat universe */ if (!o2hb_check_local_node_heartbeating()) { + if (o2hb_global_heartbeat_active()) + mlog(ML_ERROR, "Global heartbeat not started\n"); rc = -EINVAL; goto out; } -- cgit v1.2.3 From 18cfdf1b1a8e83b09e4185c02396257ce7e7bee3 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 7 Oct 2010 16:47:03 -0700 Subject: ocfs2/dlm: Add message DLM_QUERY_NODEINFO Adds new dlm message DLM_QUERY_NODEINFO that sends the attributes of all registered nodes. This message is sent if the negotiated dlm protocol is 1.1 or higher. If the information of the joining node does not match that of any existing nodes, the join domain request is rejected. Signed-off-by: Sunil Mushran --- fs/ocfs2/dlm/dlmcommon.h | 17 +++++ fs/ocfs2/dlm/dlmdomain.c | 182 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 198 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index aa506d3e2ae6..b36d0bf77a5a 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -447,6 +447,7 @@ enum { DLM_BEGIN_RECO_MSG, /* 517 */ DLM_FINALIZE_RECO_MSG, /* 518 */ DLM_QUERY_REGION, /* 519 */ + DLM_QUERY_NODEINFO, /* 520 */ }; struct dlm_reco_node_data @@ -737,6 +738,22 @@ struct dlm_query_region { u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS]; }; +struct dlm_node_info { + u8 ni_nodenum; + u8 pad1; + u16 ni_ipv4_port; + u32 ni_ipv4_address; +}; + +struct dlm_query_nodeinfo { + u8 qn_nodenum; + u8 qn_numnodes; + u8 qn_namelen; + u8 pad1; + u8 qn_domain[O2NM_MAX_NAME_LEN]; + struct dlm_node_info qn_nodes[O2NM_MAX_NODES]; +}; + struct dlm_exit_domain { u8 node_idx; diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 49650756dfef..78d428f5e10e 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -131,6 +131,7 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); * * New in version 1.1: * - Message DLM_QUERY_REGION added to support global heartbeat + * - Message DLM_QUERY_NODEINFO added to allow online node removes */ static const struct dlm_protocol_version dlm_protocol = { .pv_major = 1, @@ -1123,6 +1124,173 @@ bail: return status; } +static int dlm_match_nodes(struct dlm_ctxt *dlm, struct dlm_query_nodeinfo *qn) +{ + struct o2nm_node *local; + struct dlm_node_info *remote; + int i, j; + int status = 0; + + for (j = 0; j < qn->qn_numnodes; ++j) + mlog(0, "Node %3d, %pI4:%u\n", qn->qn_nodes[j].ni_nodenum, + &(qn->qn_nodes[j].ni_ipv4_address), + ntohs(qn->qn_nodes[j].ni_ipv4_port)); + + for (i = 0; i < O2NM_MAX_NODES && !status; ++i) { + local = o2nm_get_node_by_num(i); + remote = NULL; + for (j = 0; j < qn->qn_numnodes; ++j) { + if (qn->qn_nodes[j].ni_nodenum == i) { + remote = &(qn->qn_nodes[j]); + break; + } + } + + if (!local && !remote) + continue; + + if ((local && !remote) || (!local && remote)) + status = -EINVAL; + + if (!status && + ((remote->ni_nodenum != local->nd_num) || + (remote->ni_ipv4_port != local->nd_ipv4_port) || + (remote->ni_ipv4_address != local->nd_ipv4_address))) + status = -EINVAL; + + if (status) { + if (remote && !local) + mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) " + "registered in joining node %d but not in " + "local node %d\n", qn->qn_domain, + remote->ni_nodenum, + &(remote->ni_ipv4_address), + ntohs(remote->ni_ipv4_port), + qn->qn_nodenum, dlm->node_num); + if (local && !remote) + mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) " + "registered in local node %d but not in " + "joining node %d\n", qn->qn_domain, + local->nd_num, &(local->nd_ipv4_address), + ntohs(local->nd_ipv4_port), + dlm->node_num, qn->qn_nodenum); + BUG_ON((!local && !remote)); + } + + if (local) + o2nm_node_put(local); + } + + return status; +} + +static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map) +{ + struct dlm_query_nodeinfo *qn = NULL; + struct o2nm_node *node; + int ret = 0, status, count, i; + + if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES) + goto bail; + + qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL); + if (!qn) { + ret = -ENOMEM; + mlog_errno(ret); + goto bail; + } + + for (i = 0, count = 0; i < O2NM_MAX_NODES; ++i) { + node = o2nm_get_node_by_num(i); + if (!node) + continue; + qn->qn_nodes[count].ni_nodenum = node->nd_num; + qn->qn_nodes[count].ni_ipv4_port = node->nd_ipv4_port; + qn->qn_nodes[count].ni_ipv4_address = node->nd_ipv4_address; + mlog(0, "Node %3d, %pI4:%u\n", node->nd_num, + &(node->nd_ipv4_address), ntohs(node->nd_ipv4_port)); + ++count; + o2nm_node_put(node); + } + + qn->qn_nodenum = dlm->node_num; + qn->qn_numnodes = count; + qn->qn_namelen = strlen(dlm->name); + memcpy(qn->qn_domain, dlm->name, qn->qn_namelen); + + i = -1; + while ((i = find_next_bit(node_map, O2NM_MAX_NODES, + i + 1)) < O2NM_MAX_NODES) { + if (i == dlm->node_num) + continue; + + mlog(0, "Sending nodeinfo to node %d\n", i); + + ret = o2net_send_message(DLM_QUERY_NODEINFO, DLM_MOD_KEY, + qn, sizeof(struct dlm_query_nodeinfo), + i, &status); + if (ret >= 0) + ret = status; + if (ret) { + mlog(ML_ERROR, "node mismatch %d, node %d\n", ret, i); + break; + } + } + +bail: + kfree(qn); + return ret; +} + +static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len, + void *data, void **ret_data) +{ + struct dlm_query_nodeinfo *qn; + struct dlm_ctxt *dlm = NULL; + int locked = 0, status = -EINVAL; + + qn = (struct dlm_query_nodeinfo *) msg->buf; + + mlog(0, "Node %u queries nodes on domain %s\n", qn->qn_nodenum, + qn->qn_domain); + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(qn->qn_domain, qn->qn_namelen); + if (!dlm) { + mlog(ML_ERROR, "Node %d queried nodes on domain %s before " + "join domain\n", qn->qn_nodenum, qn->qn_domain); + goto bail; + } + + spin_lock(&dlm->spinlock); + locked = 1; + if (dlm->joining_node != qn->qn_nodenum) { + mlog(ML_ERROR, "Node %d queried nodes on domain %s but " + "joining node is %d\n", qn->qn_nodenum, qn->qn_domain, + dlm->joining_node); + goto bail; + } + + /* Support for node query was added in 1.1 */ + if (dlm->dlm_locking_proto.pv_major == 1 && + dlm->dlm_locking_proto.pv_minor == 0) { + mlog(ML_ERROR, "Node %d queried nodes on domain %s " + "but active dlm protocol is %d.%d\n", qn->qn_nodenum, + qn->qn_domain, dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor); + goto bail; + } + + status = dlm_match_nodes(dlm, qn); + +bail: + if (locked) + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); + + return status; +} + static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, void **ret_data) { @@ -1443,8 +1611,13 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) set_bit(dlm->node_num, dlm->domain_map); spin_unlock(&dlm->spinlock); - /* Support for global heartbeat was added in 1.1 */ + /* Support for global heartbeat and node info was added in 1.1 */ if (dlm_protocol.pv_major > 1 || dlm_protocol.pv_minor > 0) { + status = dlm_send_nodeinfo(dlm, ctxt->yes_resp_map); + if (status) { + mlog_errno(status); + goto bail; + } status = dlm_send_regions(dlm, ctxt->yes_resp_map); if (status) { mlog_errno(status); @@ -2026,6 +2199,13 @@ static int dlm_register_net_handlers(void) dlm_query_region_handler, NULL, NULL, &dlm_join_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_QUERY_NODEINFO, DLM_MOD_KEY, + sizeof(struct dlm_query_nodeinfo), + dlm_query_nodeinfo_handler, + NULL, NULL, &dlm_join_handlers); bail: if (status < 0) dlm_unregister_net_handlers(); -- cgit v1.2.3 From 18c50cb0d3c293eabd6c2ef89c43f2a968e709ed Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 6 Oct 2010 18:26:59 -0700 Subject: ocfs2/cluster: Print messages when adding/removing heartbeat regions Prints messages when the user adds or removes heartbeat regions in global heartbeat mode. These messages are useful when debugging cluster related issues. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 3415e58ff77b..12bb12ba8640 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1476,6 +1476,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, else ret = -EIO; + if (hb_task && o2hb_global_heartbeat_active()) + printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n", + config_item_name(®->hr_item)); + out: if (filp) fput(filp); @@ -1659,6 +1663,9 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, wake_up(&o2hb_steady_queue); } + if (o2hb_global_heartbeat_active()) + printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n", + config_item_name(®->hr_item)); config_item_put(item); } @@ -1745,7 +1752,7 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group, ret = o2hb_global_hearbeat_mode_set(i); if (!ret) - printk(KERN_NOTICE "ocfs2: Heartbeat mode set to %s\n", + printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n", o2hb_heartbeat_mode_desc[i]); return count; } -- cgit v1.2.3 From 39a298563e0619b1b6e2e0974e58801de780621c Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 7 Oct 2010 17:30:17 -0700 Subject: ocfs2/cluster: Print messages when adding/removing nodes Prints messages when the user adds or removes nodes. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/masklog.h | 3 ++- fs/ocfs2/cluster/nodemanager.c | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index fd96e2a2fa56..ea2ed9f56c94 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -119,7 +119,8 @@ #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ #define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ #define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ -#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */ +#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */ +#define ML_CLUSTER 0x0000001000000000ULL /* cluster stack */ #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) #define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index ed0c9f367fed..bb240647ca5f 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -711,6 +711,8 @@ static struct config_item *o2nm_node_group_make_item(struct config_group *group, config_item_init_type_name(&node->nd_item, name, &o2nm_node_type); spin_lock_init(&node->nd_lock); + mlog(ML_CLUSTER, "o2nm: Registering node %s\n", name); + return &node->nd_item; } @@ -744,6 +746,9 @@ static void o2nm_node_group_drop_item(struct config_group *group, } write_unlock(&cluster->cl_nodes_lock); + mlog(ML_CLUSTER, "o2nm: Unregistered node %s\n", + config_item_name(&node->nd_item)); + config_item_put(item); } -- cgit v1.2.3 From 0e105d37c2adb19cb777aa6701a866f211764a30 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 7 Oct 2010 17:00:16 -0700 Subject: ocfs2/cluster: Check slots for unconfigured live nodes o2hb currently checks slots for configured nodes only. This patch makes it check the slots for the live nodes too to take care of a race in which a node is removed from the configuration but not from the live map. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 38 +++++++++++++++++++++++++++++++------- fs/ocfs2/cluster/tcp.c | 5 +++++ 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 12bb12ba8640..a8f10649674d 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -541,6 +541,8 @@ static void o2hb_queue_node_event(struct o2hb_node_event *event, { assert_spin_locked(&o2hb_live_lock); + BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB)); + event->hn_event_type = type; event->hn_node = node; event->hn_node_num = node_num; @@ -593,14 +595,22 @@ static int o2hb_check_slot(struct o2hb_region *reg, u64 cputime; unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; unsigned int slot_dead_ms; + int tmp; memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); - /* Is this correct? Do we assume that the node doesn't exist - * if we're not configured for him? */ + /* + * If a node is no longer configured but is still in the livemap, we + * may need to clear that bit from the livemap. + */ node = o2nm_get_node_by_num(slot->ds_node_num); - if (!node) - return 0; + if (!node) { + spin_lock(&o2hb_live_lock); + tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap); + spin_unlock(&o2hb_live_lock); + if (!tmp) + return 0; + } if (!o2hb_verify_crc(reg, hb_block)) { /* all paths from here will drop o2hb_live_lock for @@ -717,8 +727,9 @@ fire_callbacks: if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); - o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, - slot->ds_node_num); + /* node can be null */ + o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, + node, slot->ds_node_num); changed = 1; } @@ -738,7 +749,8 @@ out: o2hb_run_event_list(&event); - o2nm_node_put(node); + if (node) + o2nm_node_put(node); return changed; } @@ -765,6 +777,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) { int i, ret, highest_node, change = 0; unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; struct o2hb_bio_wait_ctxt write_wc; ret = o2nm_configured_node_map(configured_nodes, @@ -774,6 +787,17 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) return ret; } + /* + * If a node is not configured but is in the livemap, we still need + * to read the slot so as to be able to remove it from the livemap. + */ + o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); + i = -1; + while ((i = find_next_bit(live_node_bitmap, + O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { + set_bit(i, configured_nodes); + } + highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); if (highest_node >= O2NM_MAX_NODES) { mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index cbe2f057cc28..9aa426e42123 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1696,6 +1696,9 @@ static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num, { o2quo_hb_down(node_num); + if (!node) + return; + if (node_num != o2nm_this_node()) o2net_disconnect_node(node); @@ -1709,6 +1712,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, o2quo_hb_up(node_num); + BUG_ON(!node); + /* ensure an immediate connect attempt */ nn->nn_last_connect_attempt = jiffies - (msecs_to_jiffies(o2net_reconnect_delay()) + 1); -- cgit v1.2.3 From 8ca8b0bbd841b6bcd8ac05e51b0143aa61cfeff3 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 7 Oct 2010 17:01:27 -0700 Subject: ocfs2/cluster: Reorganize o2hb debugfs init o2hb debugfs handling is reorganized to allow for easy expansion. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 101 +++++++++++++++++++++++++++++++++---------- 1 file changed, 78 insertions(+), 23 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index a8f10649674d..16e49765c853 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -62,8 +62,19 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; static LIST_HEAD(o2hb_node_events); static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); +#define O2HB_DB_TYPE_LIVENODES 0 +struct o2hb_debug_buf { + int db_type; + int db_size; + int db_len; + void *db_data; +}; + +static struct o2hb_debug_buf *o2hb_db_livenodes; + #define O2HB_DEBUG_DIR "o2hb" #define O2HB_DEBUG_LIVENODES "livenodes" + static struct dentry *o2hb_debug_dir; static struct dentry *o2hb_debug_livenodes; @@ -969,21 +980,35 @@ static int o2hb_thread(void *data) #ifdef CONFIG_DEBUG_FS static int o2hb_debug_open(struct inode *inode, struct file *file) { + struct o2hb_debug_buf *db = inode->i_private; unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; char *buf = NULL; int i = -1; int out = 0; + /* max_nodes should be the largest bitmap we pass here */ + BUG_ON(sizeof(map) < db->db_size); + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buf) goto bail; - o2hb_fill_node_map(map, sizeof(map)); + switch (db->db_type) { + case O2HB_DB_TYPE_LIVENODES: + spin_lock(&o2hb_live_lock); + memcpy(map, db->db_data, db->db_size); + spin_unlock(&o2hb_live_lock); + break; + + default: + goto done; + } - while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) + while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len) out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); out += snprintf(buf + out, PAGE_SIZE - out, "\n"); +done: i_size_write(inode, out); file->private_data = buf; @@ -1030,10 +1055,56 @@ static const struct file_operations o2hb_debug_fops = { void o2hb_exit(void) { - if (o2hb_debug_livenodes) - debugfs_remove(o2hb_debug_livenodes); - if (o2hb_debug_dir) - debugfs_remove(o2hb_debug_dir); + kfree(o2hb_db_livenodes); + debugfs_remove(o2hb_debug_livenodes); + debugfs_remove(o2hb_debug_dir); +} + +static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir, + struct o2hb_debug_buf **db, int db_len, + int type, int size, int len, void *data) +{ + *db = kmalloc(db_len, GFP_KERNEL); + if (!*db) + return NULL; + + (*db)->db_type = type; + (*db)->db_size = size; + (*db)->db_len = len; + (*db)->db_data = data; + + return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, + &o2hb_debug_fops); +} + +static int o2hb_debug_init(void) +{ + int ret = -ENOMEM; + + o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); + if (!o2hb_debug_dir) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES, + o2hb_debug_dir, + &o2hb_db_livenodes, + sizeof(*o2hb_db_livenodes), + O2HB_DB_TYPE_LIVENODES, + sizeof(o2hb_live_node_bitmap), + O2NM_MAX_NODES, + o2hb_live_node_bitmap); + if (!o2hb_debug_livenodes) { + mlog_errno(ret); + goto bail; + } + ret = 0; +bail: + if (ret) + o2hb_exit(); + + return ret; } int o2hb_init(void) @@ -1050,23 +1121,7 @@ int o2hb_init(void) memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); - o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); - if (!o2hb_debug_dir) { - mlog_errno(-ENOMEM); - return -ENOMEM; - } - - o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES, - S_IFREG|S_IRUSR, - o2hb_debug_dir, NULL, - &o2hb_debug_fops); - if (!o2hb_debug_livenodes) { - mlog_errno(-ENOMEM); - debugfs_remove(o2hb_debug_dir); - return -ENOMEM; - } - - return 0; + return o2hb_debug_init(); } /* if we're already in a callback then we're already serialized by the sem */ -- cgit v1.2.3 From 823a637ae933fde8fdb280612dd3ff9912e301e3 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 6 Oct 2010 17:55:21 -0700 Subject: ocfs2/cluster: Maintain live node bitmap per heartbeat region Currently we track a global livenode bitmap that keeps track of all nodes that are heartbeating in all regions. This patch adds the ability to track the livenode bitmap on a per region basis. We will use this facility in a later patch to allow us to withstand the loss of a minority number of regions. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 16e49765c853..188f50269b89 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -174,6 +174,9 @@ struct o2hb_region { struct block_device *hr_bdev; struct o2hb_disk_slot *hr_slots; + /* live node map of this region */ + unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + /* let the person setting up hb wait for it to return until it * has reached a 'steady' state. This will be fixed when we have * a more complete api that doesn't lead to this sort of fragility. */ @@ -688,6 +691,8 @@ fire_callbacks: mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n", slot->ds_node_num, (long long)slot->ds_last_generation); + set_bit(slot->ds_node_num, reg->hr_live_node_bitmap); + /* first on the list generates a callback */ if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { set_bit(slot->ds_node_num, o2hb_live_node_bitmap); @@ -733,6 +738,8 @@ fire_callbacks: mlog(ML_HEARTBEAT, "Node %d left my region\n", slot->ds_node_num); + clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap); + /* last off the live_slot generates a callback */ list_del_init(&slot->ds_live_item); if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { -- cgit v1.2.3 From 536f0741f324f116d8b059295999945a2dac56bc Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 7 Oct 2010 17:03:07 -0700 Subject: ocfs2/cluster: Track number of global heartbeat regions In global heartbeat mode, we have a upper limit for the number of active regions. This patch adds the facility to track the number of active global heartbeat regions and fails to start heartbeat if the number exceeds the maximum. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 188f50269b89..d66b17c000d4 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -62,6 +62,12 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; static LIST_HEAD(o2hb_node_events); static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); +/* + * In global heartbeat, we maintain a series of region bitmaps. + * - o2hb_region_bitmap allows us to limit the region number to max region. + */ +static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; + #define O2HB_DB_TYPE_LIVENODES 0 struct o2hb_debug_buf { int db_type; @@ -176,6 +182,7 @@ struct o2hb_region { /* live node map of this region */ unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned int hr_region_num; /* let the person setting up hb wait for it to return until it * has reached a 'steady' state. This will be fixed when we have @@ -1127,6 +1134,7 @@ int o2hb_init(void) INIT_LIST_HEAD(&o2hb_node_events); memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); + memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); return o2hb_debug_init(); } @@ -1716,12 +1724,22 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - config_item_init_type_name(®->hr_item, name, &o2hb_region_type); - spin_lock(&o2hb_live_lock); + reg->hr_region_num = 0; + if (o2hb_global_heartbeat_active()) { + reg->hr_region_num = find_first_zero_bit(o2hb_region_bitmap, + O2NM_MAX_REGIONS); + if (reg->hr_region_num >= O2NM_MAX_REGIONS) { + spin_unlock(&o2hb_live_lock); + return ERR_PTR(-EFBIG); + } + set_bit(reg->hr_region_num, o2hb_region_bitmap); + } list_add_tail(®->hr_all_item, &o2hb_all_regions); spin_unlock(&o2hb_live_lock); + config_item_init_type_name(®->hr_item, name, &o2hb_region_type); + return ®->hr_item; } @@ -1733,6 +1751,8 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, /* stop the thread when the user removes the region dir */ spin_lock(&o2hb_live_lock); + if (o2hb_global_heartbeat_active()) + clear_bit(reg->hr_region_num, o2hb_region_bitmap); hb_task = reg->hr_task; reg->hr_task = NULL; spin_unlock(&o2hb_live_lock); -- cgit v1.2.3 From e7d656baf6607a0775f4ca85464a4ead306741e5 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 6 Oct 2010 17:55:18 -0700 Subject: ocfs2/cluster: Track bitmap of live heartbeat regions A heartbeat region becomes live (or active) after a fixed number of (steady) iterations. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index d66b17c000d4..2a7cd17e96f0 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -65,8 +65,10 @@ static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); /* * In global heartbeat, we maintain a series of region bitmaps. * - o2hb_region_bitmap allows us to limit the region number to max region. + * - o2hb_live_region_bitmap tracks live regions (seen steady iterations). */ static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; +static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; #define O2HB_DB_TYPE_LIVENODES 0 struct o2hb_debug_buf { @@ -1135,6 +1137,7 @@ int o2hb_init(void) memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); + memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); return o2hb_debug_init(); } @@ -1563,6 +1566,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, /* Ok, we were woken. Make sure it wasn't by drop_item() */ spin_lock(&o2hb_live_lock); hb_task = reg->hr_task; + if (o2hb_global_heartbeat_active()) + set_bit(reg->hr_region_num, o2hb_live_region_bitmap); spin_unlock(&o2hb_live_lock); if (hb_task) @@ -1751,8 +1756,10 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, /* stop the thread when the user removes the region dir */ spin_lock(&o2hb_live_lock); - if (o2hb_global_heartbeat_active()) + if (o2hb_global_heartbeat_active()) { clear_bit(reg->hr_region_num, o2hb_region_bitmap); + clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); + } hb_task = reg->hr_task; reg->hr_task = NULL; spin_unlock(&o2hb_live_lock); -- cgit v1.2.3 From 43182d2a799865872041b6e4d8387131e9462f56 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 6 Oct 2010 17:55:16 -0700 Subject: ocfs2/cluster: Maintain bitmap of quorum regions o2hb allows online adding of regions. However, a newly added region is not used in quorum calculations unless it has been added on all nodes. This patch tracks a bitmap of such quorum regions. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 2a7cd17e96f0..62a8af271344 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -66,9 +66,12 @@ static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); * In global heartbeat, we maintain a series of region bitmaps. * - o2hb_region_bitmap allows us to limit the region number to max region. * - o2hb_live_region_bitmap tracks live regions (seen steady iterations). + * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes + * heartbeat on it. */ static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; +static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; #define O2HB_DB_TYPE_LIVENODES 0 struct o2hb_debug_buf { @@ -607,6 +610,35 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) o2nm_node_put(node); } +static void o2hb_set_quorum_device(struct o2hb_region *reg, + struct o2hb_disk_slot *slot) +{ + assert_spin_locked(&o2hb_live_lock); + + if (!o2hb_global_heartbeat_active()) + return; + + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + return; + + /* + * A region can be added to the quorum only when it sees all + * live nodes heartbeat on it. In other words, the region has been + * added to all nodes. + */ + if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, + sizeof(o2hb_live_node_bitmap))) + return; + + if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD) + return; + + printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n", + config_item_name(®->hr_item)); + + set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); +} + static int o2hb_check_slot(struct o2hb_region *reg, struct o2hb_disk_slot *slot) { @@ -772,6 +804,8 @@ fire_callbacks: slot->ds_equal_samples = 0; } out: + o2hb_set_quorum_device(reg, slot); + spin_unlock(&o2hb_live_lock); o2hb_run_event_list(&event); @@ -1138,6 +1172,7 @@ int o2hb_init(void) memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); + memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); return o2hb_debug_init(); } -- cgit v1.2.3 From b1c5ebfbe398b3360614a4788c02061cd153e60a Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 7 Oct 2010 17:05:52 -0700 Subject: ocfs2/cluster: Maintain bitmap of failed regions In global heartbeat mode, we track the bitmap of regions that have seen heartbeat timeouts. We fence if the number of such regions is greater than or equal to half the number of quorum regions. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 62a8af271344..f890656127fa 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -68,10 +68,12 @@ static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); * - o2hb_live_region_bitmap tracks live regions (seen steady iterations). * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes * heartbeat on it. + * - o2hb_failed_region_bitmap tracks the regions that have seen io timeouts. */ static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; +static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; #define O2HB_DB_TYPE_LIVENODES 0 struct o2hb_debug_buf { @@ -217,8 +219,19 @@ struct o2hb_bio_wait_ctxt { int wc_error; }; +static int o2hb_pop_count(void *map, int count) +{ + int i = -1, pop = 0; + + while ((i = find_next_bit(map, count, i + 1)) < count) + pop++; + return pop; +} + static void o2hb_write_timeout(struct work_struct *work) { + int failed, quorum; + unsigned long flags; struct o2hb_region *reg = container_of(work, struct o2hb_region, hr_write_timeout_work.work); @@ -226,6 +239,28 @@ static void o2hb_write_timeout(struct work_struct *work) mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " "milliseconds\n", reg->hr_dev_name, jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); + + if (o2hb_global_heartbeat_active()) { + spin_lock_irqsave(&o2hb_live_lock, flags); + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + set_bit(reg->hr_region_num, o2hb_failed_region_bitmap); + failed = o2hb_pop_count(&o2hb_failed_region_bitmap, + O2NM_MAX_REGIONS); + quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS); + spin_unlock_irqrestore(&o2hb_live_lock, flags); + + mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n", + quorum, failed); + + /* + * Fence if the number of failed regions >= half the number + * of quorum regions + */ + if ((failed << 1) < quorum) + return; + } + o2quo_disk_timeout(); } @@ -234,6 +269,11 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg) mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS); + if (o2hb_global_heartbeat_active()) { + spin_lock(&o2hb_live_lock); + clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap); + spin_unlock(&o2hb_live_lock); + } cancel_delayed_work(®->hr_write_timeout_work); reg->hr_last_timeout_start = jiffies; schedule_delayed_work(®->hr_write_timeout_work, @@ -1173,6 +1213,7 @@ int o2hb_init(void) memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); + memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap)); return o2hb_debug_init(); } -- cgit v1.2.3 From a6de013654b4839c8609e26241ebd9eb1ecc52e6 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 6 Oct 2010 17:55:13 -0700 Subject: ocfs2/cluster: Create debugfs files for live, quorum and failed region bitmaps This patch prints the bitmaps of live, quorum and failed regions. This information will be useful in debugging cluster issues. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 63 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f890656127fa..b06b9e52fba8 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -76,6 +76,9 @@ static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; #define O2HB_DB_TYPE_LIVENODES 0 +#define O2HB_DB_TYPE_LIVEREGIONS 1 +#define O2HB_DB_TYPE_QUORUMREGIONS 2 +#define O2HB_DB_TYPE_FAILEDREGIONS 3 struct o2hb_debug_buf { int db_type; int db_size; @@ -84,12 +87,21 @@ struct o2hb_debug_buf { }; static struct o2hb_debug_buf *o2hb_db_livenodes; +static struct o2hb_debug_buf *o2hb_db_liveregions; +static struct o2hb_debug_buf *o2hb_db_quorumregions; +static struct o2hb_debug_buf *o2hb_db_failedregions; #define O2HB_DEBUG_DIR "o2hb" #define O2HB_DEBUG_LIVENODES "livenodes" +#define O2HB_DEBUG_LIVEREGIONS "live_regions" +#define O2HB_DEBUG_QUORUMREGIONS "quorum_regions" +#define O2HB_DEBUG_FAILEDREGIONS "failed_regions" static struct dentry *o2hb_debug_dir; static struct dentry *o2hb_debug_livenodes; +static struct dentry *o2hb_debug_liveregions; +static struct dentry *o2hb_debug_quorumregions; +static struct dentry *o2hb_debug_failedregions; static LIST_HEAD(o2hb_all_regions); @@ -1085,6 +1097,9 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) switch (db->db_type) { case O2HB_DB_TYPE_LIVENODES: + case O2HB_DB_TYPE_LIVEREGIONS: + case O2HB_DB_TYPE_QUORUMREGIONS: + case O2HB_DB_TYPE_FAILEDREGIONS: spin_lock(&o2hb_live_lock); memcpy(map, db->db_data, db->db_size); spin_unlock(&o2hb_live_lock); @@ -1146,6 +1161,12 @@ static const struct file_operations o2hb_debug_fops = { void o2hb_exit(void) { kfree(o2hb_db_livenodes); + kfree(o2hb_db_liveregions); + kfree(o2hb_db_quorumregions); + kfree(o2hb_db_failedregions); + debugfs_remove(o2hb_debug_failedregions); + debugfs_remove(o2hb_debug_quorumregions); + debugfs_remove(o2hb_debug_liveregions); debugfs_remove(o2hb_debug_livenodes); debugfs_remove(o2hb_debug_dir); } @@ -1189,6 +1210,48 @@ static int o2hb_debug_init(void) mlog_errno(ret); goto bail; } + + o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS, + o2hb_debug_dir, + &o2hb_db_liveregions, + sizeof(*o2hb_db_liveregions), + O2HB_DB_TYPE_LIVEREGIONS, + sizeof(o2hb_live_region_bitmap), + O2NM_MAX_REGIONS, + o2hb_live_region_bitmap); + if (!o2hb_debug_liveregions) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_quorumregions = + o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS, + o2hb_debug_dir, + &o2hb_db_quorumregions, + sizeof(*o2hb_db_quorumregions), + O2HB_DB_TYPE_QUORUMREGIONS, + sizeof(o2hb_quorum_region_bitmap), + O2NM_MAX_REGIONS, + o2hb_quorum_region_bitmap); + if (!o2hb_debug_quorumregions) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_failedregions = + o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS, + o2hb_debug_dir, + &o2hb_db_failedregions, + sizeof(*o2hb_db_failedregions), + O2HB_DB_TYPE_FAILEDREGIONS, + sizeof(o2hb_failed_region_bitmap), + O2NM_MAX_REGIONS, + o2hb_failed_region_bitmap); + if (!o2hb_debug_failedregions) { + mlog_errno(ret); + goto bail; + } + ret = 0; bail: if (ret) -- cgit v1.2.3 From 1f28530537f106f83e5cf7ef0193075667b6d520 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 6 Oct 2010 17:55:12 -0700 Subject: ocfs2/cluster: Create debugfs dir/files for each region This patch creates debugfs directory for each o2hb region and creates files to expose the region number and the per region live node bitmap. This information will be useful in debugging cluster issues. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 77 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index b06b9e52fba8..f28de4b09c6b 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -79,6 +79,8 @@ static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; #define O2HB_DB_TYPE_LIVEREGIONS 1 #define O2HB_DB_TYPE_QUORUMREGIONS 2 #define O2HB_DB_TYPE_FAILEDREGIONS 3 +#define O2HB_DB_TYPE_REGION_LIVENODES 4 +#define O2HB_DB_TYPE_REGION_NUMBER 5 struct o2hb_debug_buf { int db_type; int db_size; @@ -96,6 +98,7 @@ static struct o2hb_debug_buf *o2hb_db_failedregions; #define O2HB_DEBUG_LIVEREGIONS "live_regions" #define O2HB_DEBUG_QUORUMREGIONS "quorum_regions" #define O2HB_DEBUG_FAILEDREGIONS "failed_regions" +#define O2HB_DEBUG_REGION_NUMBER "num" static struct dentry *o2hb_debug_dir; static struct dentry *o2hb_debug_livenodes; @@ -203,6 +206,12 @@ struct o2hb_region { unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned int hr_region_num; + struct dentry *hr_debug_dir; + struct dentry *hr_debug_livenodes; + struct dentry *hr_debug_regnum; + struct o2hb_debug_buf *hr_db_livenodes; + struct o2hb_debug_buf *hr_db_regnum; + /* let the person setting up hb wait for it to return until it * has reached a 'steady' state. This will be fixed when we have * a more complete api that doesn't lead to this sort of fragility. */ @@ -1083,6 +1092,7 @@ static int o2hb_thread(void *data) static int o2hb_debug_open(struct inode *inode, struct file *file) { struct o2hb_debug_buf *db = inode->i_private; + struct o2hb_region *reg; unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; char *buf = NULL; int i = -1; @@ -1105,6 +1115,19 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) spin_unlock(&o2hb_live_lock); break; + case O2HB_DB_TYPE_REGION_LIVENODES: + spin_lock(&o2hb_live_lock); + reg = (struct o2hb_region *)db->db_data; + memcpy(map, reg->hr_live_node_bitmap, db->db_size); + spin_unlock(&o2hb_live_lock); + break; + + case O2HB_DB_TYPE_REGION_NUMBER: + reg = (struct o2hb_region *)db->db_data; + out += snprintf(buf + out, PAGE_SIZE - out, "%d\n", + reg->hr_region_num); + goto done; + default: goto done; } @@ -1342,6 +1365,12 @@ static void o2hb_region_release(struct config_item *item) if (reg->hr_slots) kfree(reg->hr_slots); + kfree(reg->hr_db_regnum); + kfree(reg->hr_db_livenodes); + debugfs_remove(reg->hr_debug_livenodes); + debugfs_remove(reg->hr_debug_regnum); + debugfs_remove(reg->hr_debug_dir); + spin_lock(&o2hb_live_lock); list_del(®->hr_all_item); spin_unlock(&o2hb_live_lock); @@ -1856,10 +1885,52 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group : NULL; } +static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) +{ + int ret = -ENOMEM; + + reg->hr_debug_dir = + debugfs_create_dir(config_item_name(®->hr_item), dir); + if (!reg->hr_debug_dir) { + mlog_errno(ret); + goto bail; + } + + reg->hr_debug_livenodes = + o2hb_debug_create(O2HB_DEBUG_LIVENODES, + reg->hr_debug_dir, + &(reg->hr_db_livenodes), + sizeof(*(reg->hr_db_livenodes)), + O2HB_DB_TYPE_REGION_LIVENODES, + sizeof(reg->hr_live_node_bitmap), + O2NM_MAX_NODES, reg); + if (!reg->hr_debug_livenodes) { + mlog_errno(ret); + goto bail; + } + + reg->hr_debug_regnum = + o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, + reg->hr_debug_dir, + &(reg->hr_db_regnum), + sizeof(*(reg->hr_db_regnum)), + O2HB_DB_TYPE_REGION_NUMBER, + 0, O2NM_MAX_NODES, reg); + if (!reg->hr_debug_regnum) { + mlog_errno(ret); + goto bail; + } + + ret = 0; +bail: + return ret; +} + static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, const char *name) { struct o2hb_region *reg = NULL; + int ret; reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL); if (reg == NULL) @@ -1884,6 +1955,12 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g config_item_init_type_name(®->hr_item, name, &o2hb_region_type); + ret = o2hb_debug_region_init(reg, o2hb_debug_dir); + if (ret) { + config_item_put(®->hr_item); + return ERR_PTR(ret); + } + return ®->hr_item; } -- cgit v1.2.3 From d6aa1c7c9e4b48081c2302e14b0f857017461efd Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 6 Oct 2010 18:50:50 -0700 Subject: ocfs2/cluster: Add mlogs for heartbeat up/down events This patch adds mlogs for o2hb up and down events. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f28de4b09c6b..e8676accf902 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -797,6 +797,8 @@ fire_callbacks: /* first on the list generates a callback */ if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { + mlog(ML_HEARTBEAT, "o2hb: Add node %d to live nodes " + "bitmap\n", slot->ds_node_num); set_bit(slot->ds_node_num, o2hb_live_node_bitmap); o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node, @@ -845,6 +847,8 @@ fire_callbacks: /* last off the live_slot generates a callback */ list_del_init(&slot->ds_live_item); if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { + mlog(ML_HEARTBEAT, "o2hb: Remove node %d from live " + "nodes bitmap\n", slot->ds_node_num); clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); /* node can be null */ -- cgit v1.2.3 From 43695d095dfaf266a8a940d9b07eed7f46076b49 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 6 Oct 2010 17:55:09 -0700 Subject: ocfs2/cluster: Show per region heartbeat elapsed time This patch adds a per region debugfs file that shows the elapsed time since the time the o2hb timer was last armed. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index e8676accf902..29aee2128edb 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -81,6 +81,7 @@ static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; #define O2HB_DB_TYPE_FAILEDREGIONS 3 #define O2HB_DB_TYPE_REGION_LIVENODES 4 #define O2HB_DB_TYPE_REGION_NUMBER 5 +#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6 struct o2hb_debug_buf { int db_type; int db_size; @@ -99,6 +100,7 @@ static struct o2hb_debug_buf *o2hb_db_failedregions; #define O2HB_DEBUG_QUORUMREGIONS "quorum_regions" #define O2HB_DEBUG_FAILEDREGIONS "failed_regions" #define O2HB_DEBUG_REGION_NUMBER "num" +#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms" static struct dentry *o2hb_debug_dir; static struct dentry *o2hb_debug_livenodes; @@ -209,8 +211,10 @@ struct o2hb_region { struct dentry *hr_debug_dir; struct dentry *hr_debug_livenodes; struct dentry *hr_debug_regnum; + struct dentry *hr_debug_elapsed_time; struct o2hb_debug_buf *hr_db_livenodes; struct o2hb_debug_buf *hr_db_regnum; + struct o2hb_debug_buf *hr_db_elapsed_time; /* let the person setting up hb wait for it to return until it * has reached a 'steady' state. This will be fixed when we have @@ -1132,6 +1136,13 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) reg->hr_region_num); goto done; + case O2HB_DB_TYPE_REGION_ELAPSED_TIME: + reg = (struct o2hb_region *)db->db_data; + out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", + jiffies_to_msecs(jiffies - + reg->hr_last_timeout_start)); + goto done; + default: goto done; } @@ -1925,6 +1936,18 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) goto bail; } + reg->hr_debug_elapsed_time = + o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, + reg->hr_debug_dir, + &(reg->hr_db_elapsed_time), + sizeof(*(reg->hr_db_elapsed_time)), + O2HB_DB_TYPE_REGION_ELAPSED_TIME, + 0, 0, reg); + if (!reg->hr_debug_elapsed_time) { + mlog_errno(ret); + goto bail; + } + ret = 0; bail: return ret; -- cgit v1.2.3 From 4d94aa1b1d437f9513ddc89974d8bd214b8304f6 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Sat, 9 Oct 2010 10:27:04 -0700 Subject: ocfs2/cluster: Bump up dlm protocol to version 1.1 dlm protocol 1.1. activates messages DLM_QUERY_REGION and DLM_QUERY_NODEINFO that are a must for global heartbeat. It also activates o2hb_global_heartbeat_active(). Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 2 +- fs/ocfs2/dlm/dlmdomain.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 29aee2128edb..6a1280a013ea 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -2429,6 +2429,6 @@ EXPORT_SYMBOL_GPL(o2hb_get_all_regions); int o2hb_global_heartbeat_active(void) { - return 0; + return (o2hb_heartbeat_mode == O2HB_HEARTBEAT_GLOBAL); } EXPORT_SYMBOL(o2hb_global_heartbeat_active); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 78d428f5e10e..58a93b953735 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -135,7 +135,7 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); */ static const struct dlm_protocol_version dlm_protocol = { .pv_major = 1, - .pv_minor = 0, + .pv_minor = 1, }; #define DLM_DOMAIN_BACKOFF_MS 200 -- cgit v1.2.3 From 9b5cd10e4c14a1a642076ace6a73be3d33c91fb6 Mon Sep 17 00:00:00 2001 From: Srinivas Eeda Date: Tue, 5 Oct 2010 15:53:06 -0700 Subject: ocfs2: validate bg_free_bits_count after update This patch adds a safe check to ensure bg_free_bits_count doesn't exceed bg_bits in a group descriptor. This is to avoid on disk corruption that was seen recently. debugfs: group <52803072> Group Chain: 179 Parent Inode: 11 Generation: 2959379682 CRC32: 00000000 ECC: 0000 ## Block# Total Used Free Contig Size 0 52803072 32256 4294965350 34202 18207 4032 ...... Signed-off-by: Srinivas Eeda Signed-off-by: Joel Becker --- fs/ocfs2/suballoc.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 8a286f54dca1..64f2c50a1c37 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1380,6 +1380,14 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, } le16_add_cpu(&bg->bg_free_bits_count, -num_bits); + if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { + ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" + " count %u but claims %u are freed. num_bits %d", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), num_bits); + return -EROFS; + } while(num_bits--) ocfs2_set_bit(bit_off++, bitmap); @@ -2419,6 +2427,14 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, (unsigned long *) undo_bg->bg_bitmap); } le16_add_cpu(&bg->bg_free_bits_count, num_bits); + if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { + ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" + " count %u but claims %u are freed. num_bits %d", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), num_bits); + return -EROFS; + } if (undo_fn) jbd_unlock_bh_state(group_bh); -- cgit v1.2.3 From f30d44f3e54a94e037da7a71d714b585dab28d9e Mon Sep 17 00:00:00 2001 From: Poyo VL Date: Mon, 11 Oct 2010 13:45:52 -0700 Subject: When I tried to compile I got the following warning: fs/ocfs2/slot_map.c: In function ‘ocfs2_init_slot_info’: fs/ocfs2/slot_map.c:360: warning: ‘bytes’ may be used uninitialized in this function fs/ocfs2/slot_map.c:360: note: ‘bytes’ was declared here Compiler: gcc version 4.4.3 (GCC) on Mandriva I'm not sure why this warning occurs, I think compiler don't know that variable "bytes" is initialized when it is sent by reference to ocfs2_slot_map_physical_size and it throws that ugly warning. However, a simple initialization of "bytes" variable with 0 will fix it. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ionut Gabriel Popescu Signed-off-by: Joel Becker --- fs/ocfs2/slot_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index bfbd7e9e949f..ab4e0172cc1d 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -357,7 +357,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, { int status = 0; u64 blkno; - unsigned long long blocks, bytes; + unsigned long long blocks, bytes = 0; unsigned int i; struct buffer_head *bh; -- cgit v1.2.3 From 75d9bbc73804285020aa4d99bd2a9600edea8945 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Mon, 11 Oct 2010 12:57:09 -0500 Subject: Initialize max_slots early Functions such as ocfs2_recovery_init() make use of osb->max_slots. Initialize osb->max_slots early so the functions may use the correct value. Signed-off-by: Goldwyn Rodrigues Signed-off-by: Joel Becker --- fs/ocfs2/super.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 350e8b5a9396..b578644b6637 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2105,6 +2105,15 @@ static int ocfs2_initialize_super(struct super_block *sb, snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots); + if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) { + mlog(ML_ERROR, "Invalid number of node slots (%u)\n", + osb->max_slots); + status = -EINVAL; + goto bail; + } + mlog(0, "max_slots for this device: %u\n", osb->max_slots); + ocfs2_orphan_scan_init(osb); status = ocfs2_recovery_init(osb); @@ -2143,15 +2152,6 @@ static int ocfs2_initialize_super(struct super_block *sb, goto bail; } - osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots); - if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) { - mlog(ML_ERROR, "Invalid number of node slots (%u)\n", - osb->max_slots); - status = -EINVAL; - goto bail; - } - mlog(0, "max_slots for this device: %u\n", osb->max_slots); - osb->slot_recovery_generations = kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations), GFP_KERNEL); -- cgit v1.2.3 From 7bdb0d18bfd381cc5491eb95973ec5604b356c7e Mon Sep 17 00:00:00 2001 From: Tristan Ye Date: Mon, 11 Oct 2010 16:46:39 +0800 Subject: ocfs2: Add a mount option "coherency=*" to handle cluster coherency for O_DIRECT writes. Currently, the default behavior of O_DIRECT writes was allowing concurrent writing among nodes to the same file, with no cluster coherency guaranteed (no EX lock held). This can leave stale data in the cache for buffered reads on other nodes. The new mount option introduce a chance to choose two different behaviors for O_DIRECT writes: * coherency=full, as the default value, will disallow concurrent O_DIRECT writes by taking EX locks. * coherency=buffered, allow concurrent O_DIRECT writes without EX lock among nodes, which gains high performance at risk of getting stale data on other nodes. Signed-off-by: Tristan Ye Signed-off-by: Joel Becker --- Documentation/filesystems/ocfs2.txt | 7 +++++++ fs/ocfs2/file.c | 29 +++++++++++++++++++++++++++-- fs/ocfs2/ocfs2.h | 3 +++ fs/ocfs2/super.c | 15 +++++++++++++++ 4 files changed, 52 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt index 1f7ae144f6d8..5393e6611691 100644 --- a/Documentation/filesystems/ocfs2.txt +++ b/Documentation/filesystems/ocfs2.txt @@ -87,3 +87,10 @@ dir_resv_level= (*) By default, directory reservations will scale with file reservations - users should rarely need to change this value. If allocation reservations are turned off, this option will have no effect. +coherency=full (*) Disallow concurrent O_DIRECT writes, cluster inode + lock will be taken to force other nodes drop cache, + therefore full cluster coherency is guaranteed even + for O_DIRECT writes. +coherency=buffered Allow concurrent O_DIRECT writes without EX lock among + nodes, which gains high performance at risk of getting + stale data on other nodes. diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 13af9937bdda..9e8cc4346b76 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2225,6 +2225,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, struct file *file = iocb->ki_filp; struct inode *inode = file->f_path.dentry->d_inode; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int full_coherency = !(osb->s_mount_opt & + OCFS2_MOUNT_COHERENCY_BUFFERED); mlog_entry("(0x%p, %u, '%.*s')\n", file, (unsigned int)nr_segs, @@ -2248,14 +2250,37 @@ relock: have_alloc_sem = 1; } - /* concurrent O_DIRECT writes are allowed */ - rw_level = !direct_io; + /* + * Concurrent O_DIRECT writes are allowed with + * mount_option "coherency=buffered". + */ + rw_level = (!direct_io || full_coherency); + ret = ocfs2_rw_lock(inode, rw_level); if (ret < 0) { mlog_errno(ret); goto out_sems; } + /* + * O_DIRECT writes with "coherency=full" need to take EX cluster + * inode_lock to guarantee coherency. + */ + if (direct_io && full_coherency) { + /* + * We need to take and drop the inode lock to force + * other nodes to drop their caches. Buffered I/O + * already does this in write_begin(). + */ + ret = ocfs2_inode_lock(inode, NULL, 1); + if (ret < 0) { + mlog_errno(ret); + goto out_sems; + } + + ocfs2_inode_unlock(inode, 1); + } + can_do_direct = direct_io; ret = ocfs2_prepare_inode_for_write(file, ppos, iocb->ki_left, appending, diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 687e291d73f2..3064feef1430 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -263,6 +263,9 @@ enum ocfs2_mount_options control lists */ OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ + + OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12 /* Allow concurrent O_DIRECT + writes */ }; #define OCFS2_OSB_SOFT_RO 0x0001 diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index b578644b6637..9122d59f8127 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -177,6 +177,8 @@ enum { Opt_noacl, Opt_usrquota, Opt_grpquota, + Opt_coherency_buffered, + Opt_coherency_full, Opt_resv_level, Opt_dir_resv_level, Opt_err, @@ -205,6 +207,8 @@ static const match_table_t tokens = { {Opt_noacl, "noacl"}, {Opt_usrquota, "usrquota"}, {Opt_grpquota, "grpquota"}, + {Opt_coherency_buffered, "coherency=buffered"}, + {Opt_coherency_full, "coherency=full"}, {Opt_resv_level, "resv_level=%u"}, {Opt_dir_resv_level, "dir_resv_level=%u"}, {Opt_err, NULL} @@ -1452,6 +1456,12 @@ static int ocfs2_parse_options(struct super_block *sb, case Opt_grpquota: mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; break; + case Opt_coherency_buffered: + mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED; + break; + case Opt_coherency_full: + mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED; + break; case Opt_acl: mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; @@ -1550,6 +1560,11 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) if (opts & OCFS2_MOUNT_GRPQUOTA) seq_printf(s, ",grpquota"); + if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED) + seq_printf(s, ",coherency=buffered"); + else + seq_printf(s, ",coherency=full"); + if (opts & OCFS2_MOUNT_NOUSERXATTR) seq_printf(s, ",nouser_xattr"); else -- cgit v1.2.3 From d4396eafe402b710a8535137b3bf2abe6c059a15 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Fri, 15 Oct 2010 11:57:21 -0700 Subject: ocfs2/cluster: Release debugfs file elapsed_time_in_ms An earlier commit forgot to remove a debugfs file, elapsed_time_in_ms. Signed-off-by: Sunil Mushran --- fs/ocfs2/cluster/heartbeat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 6a1280a013ea..52c7557f3e25 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1384,6 +1384,7 @@ static void o2hb_region_release(struct config_item *item) kfree(reg->hr_db_livenodes); debugfs_remove(reg->hr_debug_livenodes); debugfs_remove(reg->hr_debug_regnum); + debugfs_remove(reg->hr_debug_elapsed_time); debugfs_remove(reg->hr_debug_dir); spin_lock(&o2hb_live_lock); -- cgit v1.2.3 From 2decd65a2630633cee04d0b83fdcee46ad2989a1 Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Tue, 12 Oct 2010 11:18:18 +0800 Subject: ocfs2: Avoid to evaluate xattr block flags again. It was evaludated to indexed before, check it is ok i think. Signed-off-by: Jeff Liu Signed-off-by: Joel Becker --- fs/ocfs2/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 06fa5e77c40e..67cd43914641 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7081,7 +7081,7 @@ static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args, goto out; } - if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) + if (!indexed) ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh); else ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh); -- cgit v1.2.3