From 2d7c86a8f9cdce1408c4f3c69d94d007eff2f179 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 14 Jul 2021 15:35:50 +0530 Subject: libceph: generalize addr/ip parsing based on delimiter ... and remove hardcoded function name in ceph_parse_ips(). [ idryomov: delim parameter, drop CEPH_ADDR_PARSE_DEFAULT_DELIM ] Signed-off-by: Venky Shankar Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ceph/super.c') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index bab61232dc5a..c444371ebc38 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -272,7 +272,7 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc) dout("server path '%s'\n", fsopt->server_path); ret = ceph_parse_mon_ips(param->string, dev_name_end - dev_name, - pctx->copts, fc->log.log); + pctx->copts, fc->log.log, ','); if (ret) return ret; -- cgit v1.2.3 From 7b19b4db5add8d9f50e854907a82a10ba4d27c42 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 14 Jul 2021 15:35:52 +0530 Subject: ceph: new device mount syntax Old mount device syntax (source) has the following problems: - mounts to the same cluster but with different fsnames and/or creds have identical device string which can confuse xfstests. - Userspace mount helper tool resolves monitor addresses and fill in mon addrs automatically, but that means the device shown in /proc/mounts is different than what was used for mounting. New device syntax is as follows: cephuser@fsid.mycephfs2=/path Note, there is no "monitor address" in the device string. That gets passed in as mount option. This keeps the device string same when monitor addresses change (on remounts). Also note that the userspace mount helper tool is backward compatible. I.e., the mount helper will fallback to using old syntax after trying to mount with the new syntax. [ idryomov: drop CEPH_MON_ADDR_MNTOPT_DELIM ] Signed-off-by: Venky Shankar Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- fs/ceph/super.h | 3 ++ 2 files changed, 134 insertions(+), 10 deletions(-) (limited to 'fs/ceph/super.c') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c444371ebc38..e2c40d055711 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -146,6 +146,7 @@ enum { Opt_mds_namespace, Opt_recover_session, Opt_source, + Opt_mon_addr, /* string args above */ Opt_dirstat, Opt_rbytes, @@ -197,6 +198,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = { fsparam_u32 ("rsize", Opt_rsize), fsparam_string ("snapdirname", Opt_snapdirname), fsparam_string ("source", Opt_source), + fsparam_string ("mon_addr", Opt_mon_addr), fsparam_u32 ("wsize", Opt_wsize), fsparam_flag_no ("wsync", Opt_wsync), {} @@ -228,9 +230,92 @@ static void canonicalize_path(char *path) } /* - * Parse the source parameter. Distinguish the server list from the path. + * Check if the mds namespace in ceph_mount_options matches + * the passed in namespace string. First time match (when + * ->mds_namespace is NULL) is treated specially, since + * ->mds_namespace needs to be initialized by the caller. + */ +static int namespace_equals(struct ceph_mount_options *fsopt, + const char *namespace, size_t len) +{ + return !(fsopt->mds_namespace && + (strlen(fsopt->mds_namespace) != len || + strncmp(fsopt->mds_namespace, namespace, len))); +} + +static int ceph_parse_old_source(const char *dev_name, const char *dev_name_end, + struct fs_context *fc) +{ + int r; + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + + if (*dev_name_end != ':') + return invalfc(fc, "separator ':' missing in source"); + + r = ceph_parse_mon_ips(dev_name, dev_name_end - dev_name, + pctx->copts, fc->log.log, ','); + if (r) + return r; + + fsopt->new_dev_syntax = false; + return 0; +} + +static int ceph_parse_new_source(const char *dev_name, const char *dev_name_end, + struct fs_context *fc) +{ + size_t len; + struct ceph_fsid fsid; + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + char *fsid_start, *fs_name_start; + + if (*dev_name_end != '=') { + dout("separator '=' missing in source"); + return -EINVAL; + } + + fsid_start = strchr(dev_name, '@'); + if (!fsid_start) + return invalfc(fc, "missing cluster fsid"); + ++fsid_start; /* start of cluster fsid */ + + fs_name_start = strchr(fsid_start, '.'); + if (!fs_name_start) + return invalfc(fc, "missing file system name"); + + if (ceph_parse_fsid(fsid_start, &fsid)) + return invalfc(fc, "Invalid FSID"); + + ++fs_name_start; /* start of file system name */ + len = dev_name_end - fs_name_start; + + if (!namespace_equals(fsopt, fs_name_start, len)) + return invalfc(fc, "Mismatching mds_namespace"); + kfree(fsopt->mds_namespace); + fsopt->mds_namespace = kstrndup(fs_name_start, len, GFP_KERNEL); + if (!fsopt->mds_namespace) + return -ENOMEM; + dout("file system (mds namespace) '%s'\n", fsopt->mds_namespace); + + fsopt->new_dev_syntax = true; + return 0; +} + +/* + * Parse the source parameter for new device format. Distinguish the device + * spec from the path. Try parsing new device format and fallback to old + * format if needed. + * + * New device syntax will looks like: + * =/ + * where + * is name@fsid.fsname + * is optional, but if present must begin with '/' + * (monitor addresses are passed via mount option) * - * The source will look like: + * Old device syntax is: * [,...]:[] * where * is [:] @@ -263,24 +348,44 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc) dev_name_end = dev_name + strlen(dev_name); } - dev_name_end--; /* back up to ':' separator */ - if (dev_name_end < dev_name || *dev_name_end != ':') - return invalfc(fc, "No path or : separator in source"); + dev_name_end--; /* back up to separator */ + if (dev_name_end < dev_name) + return invalfc(fc, "Path missing in source"); dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); if (fsopt->server_path) dout("server path '%s'\n", fsopt->server_path); - ret = ceph_parse_mon_ips(param->string, dev_name_end - dev_name, - pctx->copts, fc->log.log, ','); - if (ret) - return ret; + dout("trying new device syntax"); + ret = ceph_parse_new_source(dev_name, dev_name_end, fc); + if (ret) { + if (ret != -EINVAL) + return ret; + dout("trying old device syntax"); + ret = ceph_parse_old_source(dev_name, dev_name_end, fc); + if (ret) + return ret; + } fc->source = param->string; param->string = NULL; return 0; } +static int ceph_parse_mon_addr(struct fs_parameter *param, + struct fs_context *fc) +{ + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + + kfree(fsopt->mon_addr); + fsopt->mon_addr = param->string; + param->string = NULL; + + return ceph_parse_mon_ips(fsopt->mon_addr, strlen(fsopt->mon_addr), + pctx->copts, fc->log.log, '/'); +} + static int ceph_parse_mount_param(struct fs_context *fc, struct fs_parameter *param) { @@ -306,6 +411,8 @@ static int ceph_parse_mount_param(struct fs_context *fc, param->string = NULL; break; case Opt_mds_namespace: + if (!namespace_equals(fsopt, param->string, strlen(param->string))) + return invalfc(fc, "Mismatching mds_namespace"); kfree(fsopt->mds_namespace); fsopt->mds_namespace = param->string; param->string = NULL; @@ -323,6 +430,8 @@ static int ceph_parse_mount_param(struct fs_context *fc, if (fc->source) return invalfc(fc, "Multiple sources specified"); return ceph_parse_source(param, fc); + case Opt_mon_addr: + return ceph_parse_mon_addr(param, fc); case Opt_wsize: if (result.uint_32 < PAGE_SIZE || result.uint_32 > CEPH_MAX_WRITE_SIZE) @@ -474,6 +583,7 @@ static void destroy_mount_options(struct ceph_mount_options *args) kfree(args->mds_namespace); kfree(args->server_path); kfree(args->fscache_uniq); + kfree(args->mon_addr); kfree(args); } @@ -517,6 +627,10 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, if (ret) return ret; + ret = strcmp_null(fsopt1->mon_addr, fsopt2->mon_addr); + if (ret) + return ret; + return ceph_compare_options(new_opt, fsc->client); } @@ -572,9 +686,13 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if ((fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) == 0) seq_puts(m, ",copyfrom"); - if (fsopt->mds_namespace) + /* dump mds_namespace when old device syntax is in use */ + if (fsopt->mds_namespace && !fsopt->new_dev_syntax) seq_show_option(m, "mds_namespace", fsopt->mds_namespace); + if (fsopt->mon_addr) + seq_printf(m, ",mon_addr=%s", fsopt->mon_addr); + if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER) seq_show_option(m, "recover_session", "clean"); @@ -1060,6 +1178,7 @@ static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc) static int ceph_get_tree(struct fs_context *fc) { struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; struct super_block *sb; struct ceph_fs_client *fsc; struct dentry *res; @@ -1071,6 +1190,8 @@ static int ceph_get_tree(struct fs_context *fc) if (!fc->source) return invalfc(fc, "No source"); + if (fsopt->new_dev_syntax && !fsopt->mon_addr) + return invalfc(fc, "No monitor address"); /* create client (which we may/may not use) */ fsc = create_fs_client(pctx->opts, pctx->copts); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index ac331aa07cfa..ec6b221e5b62 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -89,6 +89,8 @@ struct ceph_mount_options { unsigned int max_readdir; /* max readdir result (entries) */ unsigned int max_readdir_bytes; /* max readdir result (bytes) */ + bool new_dev_syntax; + /* * everything above this point can be memcmp'd; everything below * is handled in compare_mount_options() @@ -98,6 +100,7 @@ struct ceph_mount_options { char *mds_namespace; /* default NULL */ char *server_path; /* default NULL (means "/") */ char *fscache_uniq; /* default NULL */ + char *mon_addr; }; struct ceph_fs_client { -- cgit v1.2.3 From 2167f2cc686a97911a0b06ba9c97cec304b7c432 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 14 Jul 2021 15:35:53 +0530 Subject: ceph: record updated mon_addr on remount Note that the new monitors are just shown in /proc/mounts. Ceph does not (re)connect to new monitors yet. [ jlayton: s/printk\(KERN_NOTICE/pr_notice(/ s/strcmp/strcmp_null/ ] Signed-off-by: Venky Shankar Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/ceph/super.c') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index e2c40d055711..31b5786cd98f 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1277,6 +1277,13 @@ static int ceph_reconfigure_fc(struct fs_context *fc) else ceph_clear_mount_opt(fsc, ASYNC_DIROPS); + if (strcmp_null(fsc->mount_options->mon_addr, fsopt->mon_addr)) { + kfree(fsc->mount_options->mon_addr); + fsc->mount_options->mon_addr = fsopt->mon_addr; + fsopt->mon_addr = NULL; + pr_notice("ceph: monitor addresses recorded, but not used for reconnection"); + } + sync_filesystem(fc->root->d_sb); return 0; } -- cgit v1.2.3 From adbed05ed62d1f3b6f6c5cb88ec52c1ffafc0fd9 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 3 Nov 2021 10:30:39 +0530 Subject: ceph: mount syntax module parameter Add read-only module parameters for supported mount syntaxes. Primary user is the user-space mount helper for catching v2 syntax bugs during testing by cross verifying if the kernel supports v2 syntax on mount failure. Signed-off-by: Venky Shankar Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs/ceph/super.c') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 31b5786cd98f..2166fc2c1625 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1461,6 +1461,14 @@ bool disable_send_metrics = false; module_param_cb(disable_send_metrics, ¶m_ops_metrics, &disable_send_metrics, 0644); MODULE_PARM_DESC(disable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)"); +/* for both v1 and v2 syntax */ +static bool mount_support = true; +static const struct kernel_param_ops param_ops_mount_syntax = { + .get = param_get_bool, +}; +module_param_cb(mount_syntax_v1, ¶m_ops_mount_syntax, &mount_support, 0444); +module_param_cb(mount_syntax_v2, ¶m_ops_mount_syntax, &mount_support, 0444); + module_init(init_ceph); module_exit(exit_ceph); -- cgit v1.2.3 From 94cc0877cad0bc6ca84686c4fa874bf530eb8b88 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 30 Nov 2021 14:12:13 -0500 Subject: ceph: add new "nopagecache" option CephFS is a bit unlike most other filesystems in that it only conditionally does buffered I/O based on the caps that it gets from the MDS. In most cases, unless there is contended access for an inode the MDS does give Fbc caps to the client, so the unbuffered codepaths are only infrequently traveled and are difficult to test. At one time, the "-o sync" mount option would give you this behavior, but that was removed in commit 7ab9b3807097 ("ceph: Don't use ceph-sync-mode for synchronous-fs."). Add a new mount option to tell the client to ignore Fbc caps when doing I/O, and to use the synchronous codepaths exclusively, even on non-O_DIRECT file descriptors. We already have an ioctl that forces this behavior on a per-file basis, so we can just always set the CEPH_F_SYNC flag in the file description on such mounts. Additionally, this patch also changes the client to not request Fbc when doing direct I/O. We aren't using the cache with O_DIRECT so we don't have any need for those caps. Signed-off-by: Jeff Layton Acked-by: Greg Farnum Reviewed-by: Venky Shankar Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 24 +++++++++++++++--------- fs/ceph/super.c | 11 +++++++++++ fs/ceph/super.h | 1 + 3 files changed, 27 insertions(+), 9 deletions(-) (limited to 'fs/ceph/super.c') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index c138e8126286..7de5db51c3d0 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -204,6 +204,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, int fmode, bool isdir) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mount_options *opt = + ceph_inode_to_client(&ci->vfs_inode)->mount_options; struct ceph_file_info *fi; dout("%s %p %p 0%o (%s)\n", __func__, inode, file, @@ -225,6 +227,9 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, if (!fi) return -ENOMEM; + if (opt->flags & CEPH_MOUNT_OPT_NOPAGECACHE) + fi->flags |= CEPH_F_SYNC; + file->private_data = fi; } @@ -1536,7 +1541,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) struct ceph_inode_info *ci = ceph_inode(inode); bool direct_lock = iocb->ki_flags & IOCB_DIRECT; ssize_t ret; - int want, got = 0; + int want = 0, got = 0; int retry_op = 0, read = 0; again: @@ -1551,13 +1556,14 @@ again: else ceph_start_io_read(inode); + if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) + want |= CEPH_CAP_FILE_CACHE; if (fi->fmode & CEPH_FILE_MODE_LAZY) - want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; - else - want = CEPH_CAP_FILE_CACHE; + want |= CEPH_CAP_FILE_LAZYIO; + ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got); if (ret < 0) { - if (iocb->ki_flags & IOCB_DIRECT) + if (direct_lock) ceph_end_io_direct(inode); else ceph_end_io_read(inode); @@ -1691,7 +1697,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_cap_flush *prealloc_cf; ssize_t count, written = 0; - int err, want, got; + int err, want = 0, got; bool direct_lock = false; u32 map_flags; u64 pool_flags; @@ -1766,10 +1772,10 @@ retry_snap: dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", inode, ceph_vinop(inode), pos, count, i_size_read(inode)); + if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) + want |= CEPH_CAP_FILE_BUFFER; if (fi->fmode & CEPH_FILE_MODE_LAZY) - want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; - else - want = CEPH_CAP_FILE_BUFFER; + want |= CEPH_CAP_FILE_LAZYIO; got = 0; err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got); if (err < 0) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 2166fc2c1625..eb255d24e321 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -160,6 +160,7 @@ enum { Opt_quotadf, Opt_copyfrom, Opt_wsync, + Opt_pagecache, }; enum ceph_recover_session_mode { @@ -201,6 +202,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = { fsparam_string ("mon_addr", Opt_mon_addr), fsparam_u32 ("wsize", Opt_wsize), fsparam_flag_no ("wsync", Opt_wsync), + fsparam_flag_no ("pagecache", Opt_pagecache), {} }; @@ -564,6 +566,12 @@ static int ceph_parse_mount_param(struct fs_context *fc, else fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS; break; + case Opt_pagecache: + if (result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_NOPAGECACHE; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE; + break; default: BUG(); } @@ -699,6 +707,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)) seq_puts(m, ",wsync"); + if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE) + seq_puts(m, ",nopagecache"); + if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) seq_printf(m, ",wsize=%u", fsopt->wsize); if (fsopt->rsize != CEPH_MAX_READ_SIZE) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 5ec465a1350a..0d489a973d5e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -46,6 +46,7 @@ #define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */ #define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */ #define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */ +#define CEPH_MOUNT_OPT_NOPAGECACHE (1<<16) /* bypass pagecache altogether */ #define CEPH_MOUNT_OPT_DEFAULT \ (CEPH_MOUNT_OPT_DCACHE | \ -- cgit v1.2.3 From a0b3a15eab6bc2e90008460b646d53e7d9dcdbbb Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 10 Jan 2022 18:28:33 -0500 Subject: ceph: move CEPH_SUPER_MAGIC definition to magic.h The uapi headers are missing the ceph definition. Move it there so userland apps can ID cephfs. Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 2 ++ fs/ceph/super.h | 3 --- include/uapi/linux/magic.h | 1 + 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/ceph/super.c') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index eb255d24e321..dbcf9b743c88 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -27,6 +27,8 @@ #include #include +#include + static DEFINE_SPINLOCK(ceph_fsc_lock); static LIST_HEAD(ceph_fsc_list); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 0d489a973d5e..dc9fe7ed3fa8 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -25,9 +25,6 @@ #include #endif -/* f_type in struct statfs */ -#define CEPH_SUPER_MAGIC 0x00c36400 - /* large granularity for statfs utilization stats to facilitate * large volume sizes on 32-bit machines. */ #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 35687dcb1a42..53a3c20394cf 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -6,6 +6,7 @@ #define AFFS_SUPER_MAGIC 0xadff #define AFS_SUPER_MAGIC 0x5346414F #define AUTOFS_SUPER_MAGIC 0x0187 +#define CEPH_SUPER_MAGIC 0x00c36400 #define CODA_SUPER_MAGIC 0x73757245 #define CRAMFS_MAGIC 0x28cd3d45 /* some random number */ #define CRAMFS_MAGIC_WEND 0x453dcd28 /* magic number with the wrong endianess */ -- cgit v1.2.3