From 8654df4e2ac9704905198d63845554c2ddf6a93f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 9 Jun 2016 16:06:06 -0500 Subject: mnt: Refactor fs_fully_visible into mount_too_revealing Replace the call of fs_fully_visible in do_new_mount from before the new superblock is allocated with a call of mount_too_revealing after the superblock is allocated. This winds up being a much better location for maintainability of the code. The first change this enables is the replacement of FS_USERNS_VISIBLE with SB_I_USERNS_VISIBLE. Moving the flag from struct filesystem_type to sb_iflags on the superblock. Unfortunately mount_too_revealing fundamentally needs to touch mnt_flags adding several MNT_LOCKED_XXX flags at the appropriate times. If the mnt_flags did not need to be touched the code could be easily moved into the filesystem specific mount code. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- include/linux/fs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index dd288148a6b1..71988dd3af95 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1328,6 +1328,9 @@ struct mm_struct; #define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ #define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ +/* sb->s_iflags to limit user namespace mounts */ +#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ + /* Possible states of 'frozen' field */ enum { SB_UNFROZEN = 0, /* FS is unfrozen */ @@ -2011,7 +2014,6 @@ struct file_system_type { #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */ -#define FS_USERNS_VISIBLE 32 /* FS must already be visible */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. 
*/ struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); -- cgit v1.2.3 From d91ee87d8d85a0808c01787e8b4a6b48f2ba487b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 23 May 2016 14:51:59 -0500 Subject: vfs: Pass data, ns, and ns->userns to mount_ns Today what is normally called data (the mount options) is not passed to fill_super through mount_ns. Pass the mount options and the namespace separately to mount_ns so that filesystems such as proc that have mount options, can use mount_ns. Pass the user namespace to mount_ns so that the standard permission check that verifies the mounter has permissions over the namespace can be performed in mount_ns instead of in each filesystem's .mount method. Thus removing the duplication between mqueuefs and proc in terms of permission checks. The extra permission check does not currently affect the rpc_pipefs filesystem and the nfsd filesystem as those filesystems do not currently allow unprivileged mounts. Without unprivileged mounts it is guaranteed that the caller has already passed capable(CAP_SYS_ADMIN) which guarantees the extra permission check will pass. Update rpc_pipefs and the nfsd filesystem to ensure that the network namespace reference is always taken in fill_super and always put in kill_sb so that the logic is simpler and so that errors originating inside of fill_super do not cause a network namespace leak. Acked-by: Seth Forshee Signed-off-by: "Eric W.
Biederman" --- fs/nfsd/nfsctl.c | 13 ++++--------- fs/super.c | 13 ++++++++++--- include/linux/fs.h | 5 +++-- ipc/mqueue.c | 19 ++++++++----------- net/sunrpc/rpc_pipe.c | 8 ++++---- 5 files changed, 29 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 9690cb4dd588..5a6ae2522266 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1154,20 +1154,15 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) #endif /* last one */ {""} }; - struct net *net = data; - int ret; - - ret = simple_fill_super(sb, 0x6e667364, nfsd_files); - if (ret) - return ret; - sb->s_fs_info = get_net(net); - return 0; + get_net(sb->s_fs_info); + return simple_fill_super(sb, 0x6e667364, nfsd_files); } static struct dentry *nfsd_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_ns(fs_type, flags, current->nsproxy->net_ns, nfsd_fill_super); + struct net *net = current->nsproxy->net_ns; + return mount_ns(fs_type, flags, data, net, net->user_ns, nfsd_fill_super); } static void nfsd_umount(struct super_block *sb) diff --git a/fs/super.c b/fs/super.c index d78b9847e6cb..fd65667832e5 100644 --- a/fs/super.c +++ b/fs/super.c @@ -918,12 +918,19 @@ static int ns_set_super(struct super_block *sb, void *data) return set_anon_super(sb, NULL); } -struct dentry *mount_ns(struct file_system_type *fs_type, int flags, - void *data, int (*fill_super)(struct super_block *, void *, int)) +struct dentry *mount_ns(struct file_system_type *fs_type, + int flags, void *data, void *ns, struct user_namespace *user_ns, + int (*fill_super)(struct super_block *, void *, int)) { struct super_block *sb; - sb = sget(fs_type, ns_test_super, ns_set_super, flags, data); + /* Don't allow mounting unless the caller has CAP_SYS_ADMIN + * over the namespace. 
+ */ + if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + sb = sget(fs_type, ns_test_super, ns_set_super, flags, ns); if (IS_ERR(sb)) return ERR_CAST(sb); diff --git a/include/linux/fs.h b/include/linux/fs.h index 71988dd3af95..1ce006a24f49 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2034,8 +2034,9 @@ struct file_system_type { #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME) -extern struct dentry *mount_ns(struct file_system_type *fs_type, int flags, - void *data, int (*fill_super)(struct super_block *, void *, int)); +extern struct dentry *mount_ns(struct file_system_type *fs_type, + int flags, void *data, void *ns, struct user_namespace *user_ns, + int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index ade739f67f1d..60d97082f4dc 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -305,7 +305,7 @@ err: static int mqueue_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; - struct ipc_namespace *ns = data; + struct ipc_namespace *ns = sb->s_fs_info; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; @@ -326,17 +326,14 @@ static struct dentry *mqueue_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - if (!(flags & MS_KERNMOUNT)) { - struct ipc_namespace *ns = current->nsproxy->ipc_ns; - /* Don't allow mounting unless the caller has CAP_SYS_ADMIN - * over the ipc namespace. 
- */ - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - - data = ns; + struct ipc_namespace *ns; + if (flags & MS_KERNMOUNT) { + ns = data; + data = NULL; + } else { + ns = current->nsproxy->ipc_ns; } - return mount_ns(fs_type, flags, data, mqueue_fill_super); + return mount_ns(fs_type, flags, data, ns, ns->user_ns, mqueue_fill_super); } static void init_once(void *foo) diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index fc48eca21fd2..84f98cbe31c3 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1386,7 +1386,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; struct dentry *root, *gssd_dentry; - struct net *net = data; + struct net *net = get_net(sb->s_fs_info); struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int err; @@ -1419,7 +1419,6 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) sb); if (err) goto err_depopulate; - sb->s_fs_info = get_net(net); mutex_unlock(&sn->pipefs_sb_lock); return 0; @@ -1448,7 +1447,8 @@ static struct dentry * rpc_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_ns(fs_type, flags, current->nsproxy->net_ns, rpc_fill_super); + struct net *net = current->nsproxy->net_ns; + return mount_ns(fs_type, flags, data, net, net->user_ns, rpc_fill_super); } static void rpc_kill_sb(struct super_block *sb) @@ -1468,9 +1468,9 @@ static void rpc_kill_sb(struct super_block *sb) RPC_PIPEFS_UMOUNT, sb); mutex_unlock(&sn->pipefs_sb_lock); - put_net(net); out: kill_litter_super(sb); + put_net(net); } static struct file_system_type rpc_pipe_fs_type = { -- cgit v1.2.3 From 6e4eab577a0cae15b3da9b888cff16fe57981b3e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 24 May 2016 09:29:01 -0500 Subject: fs: Add user namespace member to struct super_block Start marking filesystems with a user namespace owner, s_user_ns. 
In this change this is only used for permission checks of who may mount a filesystem. Ultimately s_user_ns will be used for translating ids and checking capabilities for filesystems mounted from user namespaces. The default policy for setting s_user_ns is implemented in sget(), which arranges for s_user_ns to be set to current_user_ns() and to ensure that the mounter of the filesystem has CAP_SYS_ADMIN in that user_ns. The guts of sget are split out into another function sget_userns(). The function sget_userns calls alloc_super with the specified user namespace or it verifies the existing superblock that was found has the expected user namespace, and fails with EBUSY when it is not. This failing prevents users with the wrong privileges mounting a filesystem. The reason for the split of sget_userns from sget is that in some cases such as mount_ns and kernfs_mount_ns a different policy for permission checking of mounts and setting s_user_ns is necessary, and the existence of sget_userns() allows those policies to be implemented. The helper mount_ns is expected to be used for filesystems such as proc and mqueuefs which present per namespace information. The function mount_ns is modified to call sget_userns instead of sget to ensure the user namespace owner of the namespace whose information is presented by the filesystem is used on the superblock. For sysfs and cgroup the appropriate permission checks are already in place, and kernfs_mount_ns is modified to call sget_userns so that the init_user_ns is the only user namespace used. For the cgroup filesystem cgroup namespace mounts are bind mounts of a subset of the full cgroup filesystem and as such s_user_ns must be the same for all of them as there is only a single superblock. 
Mounts of sysfs that vary based on the network namespace could in principle change s_user_ns but it keeps the analysis and implementation of kernfs simpler if that is not supported, and at present there appear to be no benefits from supporting a different s_user_ns on any sysfs mount. Getting the details of setting s_user_ns correct has been a long process. Thanks to Pavel Tikhomirov who spotted a leak in sget_userns. Thanks to Seth Forshee who has kept the work alive. Thanks-to: Seth Forshee Thanks-to: Pavel Tikhomirov Acked-by: Seth Forshee Signed-off-by: Eric W. Biederman --- fs/kernfs/mount.c | 3 ++- fs/super.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++------ include/linux/fs.h | 12 ++++++++++++ 3 files changed, 60 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 63534f5f9073..d90d574c15a2 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -241,7 +241,8 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, info->root = root; info->ns = ns; - sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info); + sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags, + &init_user_ns, info); if (IS_ERR(sb) || sb->s_fs_info != info) kfree(info); if (IS_ERR(sb)) diff --git a/fs/super.c b/fs/super.c index fd65667832e5..874c7e3ebb8f 100644 --- a/fs/super.c +++ b/fs/super.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "internal.h" @@ -165,6 +166,7 @@ static void destroy_super(struct super_block *s) list_lru_destroy(&s->s_inode_lru); security_sb_free(s); WARN_ON(!list_empty(&s->s_mounts)); + put_user_ns(s->s_user_ns); kfree(s->s_subtype); kfree(s->s_options); call_rcu(&s->rcu, destroy_super_rcu); @@ -174,11 +176,13 @@ static void destroy_super(struct super_block *s) * alloc_super - create new superblock * @type: filesystem type superblock should belong to * @flags: the mount flags + * @user_ns: User namespace for the super_block * *
Allocates and initializes a new &struct super_block. alloc_super() * returns a pointer new superblock or %NULL if allocation had failed. */ -static struct super_block *alloc_super(struct file_system_type *type, int flags) +static struct super_block *alloc_super(struct file_system_type *type, int flags, + struct user_namespace *user_ns) { struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); static const struct super_operations default_op; @@ -188,6 +192,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) return NULL; INIT_LIST_HEAD(&s->s_mounts); + s->s_user_ns = get_user_ns(user_ns); if (security_sb_alloc(s)) goto fail; @@ -443,17 +448,18 @@ void generic_shutdown_super(struct super_block *sb) EXPORT_SYMBOL(generic_shutdown_super); /** - * sget - find or create a superblock + * sget_userns - find or create a superblock * @type: filesystem type superblock should belong to * @test: comparison callback * @set: setup callback * @flags: mount flags + * @user_ns: User namespace for the super_block * @data: argument to each of them */ -struct super_block *sget(struct file_system_type *type, +struct super_block *sget_userns(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), - int flags, + int flags, struct user_namespace *user_ns, void *data) { struct super_block *s = NULL; @@ -466,6 +472,14 @@ retry: hlist_for_each_entry(old, &type->fs_supers, s_instances) { if (!test(old, data)) continue; + if (user_ns != old->s_user_ns) { + spin_unlock(&sb_lock); + if (s) { + up_write(&s->s_umount); + destroy_super(s); + } + return ERR_PTR(-EBUSY); + } if (!grab_super(old)) goto retry; if (s) { @@ -478,7 +492,7 @@ retry: } if (!s) { spin_unlock(&sb_lock); - s = alloc_super(type, flags); + s = alloc_super(type, flags, user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; @@ -501,6 +515,31 @@ retry: return s; } +EXPORT_SYMBOL(sget_userns); + +/** + * sget - find or create a 
superblock + * @type: filesystem type superblock should belong to + * @test: comparison callback + * @set: setup callback + * @flags: mount flags + * @data: argument to each of them + */ +struct super_block *sget(struct file_system_type *type, + int (*test)(struct super_block *,void *), + int (*set)(struct super_block *,void *), + int flags, + void *data) +{ + struct user_namespace *user_ns = current_user_ns(); + + /* Ensure the requestor has permissions over the target filesystem */ + if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + return sget_userns(type, test, set, flags, user_ns, data); +} + EXPORT_SYMBOL(sget); void drop_super(struct super_block *sb) @@ -930,7 +969,8 @@ struct dentry *mount_ns(struct file_system_type *fs_type, if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - sb = sget(fs_type, ns_test_super, ns_set_super, flags, ns); + sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags, + user_ns, ns); if (IS_ERR(sb)) return ERR_CAST(sb); diff --git a/include/linux/fs.h b/include/linux/fs.h index 1ce006a24f49..9eef64f23a75 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1432,6 +1432,13 @@ struct super_block { struct workqueue_struct *s_dio_done_wq; struct hlist_head s_pins; + /* + * Owning user namespace and default context in which to + * interpret filesystem uids, gids, quotas, device nodes, + * xattrs and security labels. + */ + struct user_namespace *s_user_ns; + /* * Keep the lru lists last in the structure so they always sit on their * own individual cachelines. 
@@ -2056,6 +2063,11 @@ void deactivate_locked_super(struct super_block *sb); int set_anon_super(struct super_block *s, void *data); int get_anon_bdev(dev_t *); void free_anon_bdev(dev_t); +struct super_block *sget_userns(struct file_system_type *type, + int (*test)(struct super_block *,void *), + int (*set)(struct super_block *,void *), + int flags, struct user_namespace *user_ns, + void *data); struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), -- cgit v1.2.3 From a2982cc922c3068783eb9a1f77a5626a1ec36a1f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 9 Jun 2016 15:34:02 -0500 Subject: vfs: Generalize filesystem nodev handling. Introduce a function may_open_dev that tests MNT_NODEV and a new superblock flab SB_I_NODEV. Use this new function in all of the places where MNT_NODEV was previously tested. Add the new SB_I_NODEV s_iflag to proc, sysfs, and mqueuefs as those filesystems should never support device nodes, and a simple superblock flags makes that very hard to get wrong. With SB_I_NODEV set if any device nodes somehow manage to show up on on a filesystem those device nodes will be unopenable. Acked-by: Seth Forshee Signed-off-by: "Eric W. 
Biederman" --- fs/block_dev.c | 2 +- fs/kernfs/mount.c | 4 ++-- fs/namei.c | 8 +++++++- fs/proc/inode.c | 4 ++-- include/linux/fs.h | 2 ++ ipc/mqueue.c | 2 +- 6 files changed, 15 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/fs/block_dev.c b/fs/block_dev.c index 71ccab1d22c6..30b8d568203a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1857,7 +1857,7 @@ struct block_device *lookup_bdev(const char *pathname) if (!S_ISBLK(inode->i_mode)) goto fail; error = -EACCES; - if (path.mnt->mnt_flags & MNT_NODEV) + if (!may_open_dev(&path)) goto fail; error = -ENOMEM; bdev = bd_acquire(inode); diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 1443df670260..b3d73ad52b22 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -152,8 +152,8 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) struct dentry *root; info->sb = sb; - /* Userspace would break if executables appear on sysfs */ - sb->s_iflags |= SB_I_NOEXEC; + /* Userspace would break if executables or devices appear on sysfs */ + sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = magic; diff --git a/fs/namei.c b/fs/namei.c index 6a82fb7e2127..757a32725d92 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2881,6 +2881,12 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, } EXPORT_SYMBOL(vfs_create); +bool may_open_dev(const struct path *path) +{ + return !(path->mnt->mnt_flags & MNT_NODEV) && + !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); +} + static int may_open(struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; @@ -2899,7 +2905,7 @@ static int may_open(struct path *path, int acc_mode, int flag) break; case S_IFBLK: case S_IFCHR: - if (path->mnt->mnt_flags & MNT_NODEV) + if (!may_open_dev(path)) return -EACCES; /*FALLTHRU*/ case S_IFIFO: diff --git a/fs/proc/inode.c b/fs/proc/inode.c index f4817efb25a6..a5b2c33745b7 100644 --- a/fs/proc/inode.c 
+++ b/fs/proc/inode.c @@ -466,8 +466,8 @@ int proc_fill_super(struct super_block *s, void *data, int silent) if (!proc_parse_options(data, ns)) return -EINVAL; - /* User space would break if executables appear on proc */ - s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC; + /* User space would break if executables or devices appear on proc */ + s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; diff --git a/include/linux/fs.h b/include/linux/fs.h index 9eef64f23a75..e05983170d23 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1327,6 +1327,7 @@ struct mm_struct; /* sb->s_iflags */ #define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ #define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ +#define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */ /* sb->s_iflags to limit user namespace mounts */ #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ @@ -1602,6 +1603,7 @@ extern int vfs_whiteout(struct inode *, struct dentry *); */ extern void inode_init_owner(struct inode *inode, const struct inode *dir, umode_t mode); +extern bool may_open_dev(const struct path *path); /* * VFS FS_IOC_FIEMAP helper definitions. */ diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 5bdd50de7d05..0b13ace266f2 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -307,7 +307,7 @@ static int mqueue_fill_super(struct super_block *sb, void *data, int silent) struct inode *inode; struct ipc_namespace *ns = sb->s_fs_info; - sb->s_iflags |= SB_I_NOEXEC; + sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = MQUEUE_MAGIC; -- cgit v1.2.3 From cc50a07a247e17db76b1f0b0ca06652556e04fa3 Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Thu, 9 Jun 2016 15:44:48 -0500 Subject: userns: Remove the now unnecessary FS_USERNS_DEV_MOUNT flag Now that SB_I_NODEV controls the nodev behavior devpts can just clear this flag during mount. Simplifying the code and making it easier to audit how the code works. While still preserving the invariant that s_iflags is only modified during mount. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/devpts/inode.c | 3 ++- fs/super.c | 3 +-- include/linux/fs.h | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 37c134a132c7..d116453b0276 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -396,6 +396,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) { struct inode *inode; + s->s_iflags &= ~SB_I_NODEV; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = DEVPTS_SUPER_MAGIC; @@ -480,7 +481,7 @@ static struct file_system_type devpts_fs_type = { .name = "devpts", .mount = devpts_mount, .kill_sb = devpts_kill_sb, - .fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT, + .fs_flags = FS_USERNS_MOUNT, }; /* diff --git a/fs/super.c b/fs/super.c index 25cdceed2ad3..37813bf479cf 100644 --- a/fs/super.c +++ b/fs/super.c @@ -206,8 +206,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, init_waitqueue_head(&s->s_writers.wait_unfrozen); s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; - if ((s->s_user_ns != &init_user_ns) && - !(type->fs_flags & FS_USERNS_DEV_MOUNT)) + if (s->s_user_ns != &init_user_ns) s->s_iflags |= SB_I_NODEV; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); diff --git a/include/linux/fs.h b/include/linux/fs.h index e05983170d23..375e37f42cdf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2022,7 +2022,6 @@ struct file_system_type { #define FS_BINARY_MOUNTDATA 2 #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ 
-#define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); -- cgit v1.2.3 From d07b846f6200454c50d791796edb82660192513d Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Wed, 23 Sep 2015 15:16:04 -0500 Subject: fs: Limit file caps to the user namespace of the super block Capability sets attached to files must be ignored except in the user namespaces where the mounter is privileged, i.e. s_user_ns and its descendants. Otherwise a vector exists for gaining privileges in namespaces where a user is not already privileged. Add a new helper function, current_in_userns(), to test whether the current user namespace is the same as or a descendant of another namespace. Use this helper to determine whether a file's capability set should be applied to the caps constructed during exec. --EWB Replaced in_userns with the simpler current_in_userns. Acked-by: Serge Hallyn Signed-off-by: Seth Forshee Signed-off-by: Eric W.
Biederman --- include/linux/user_namespace.h | 6 ++++++ kernel/user_namespace.c | 14 ++++++++++++++ security/commoncap.c | 2 ++ 3 files changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 8297e5b341d8..9217169c64cb 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -72,6 +72,7 @@ extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *); extern int proc_setgroups_show(struct seq_file *m, void *v); extern bool userns_may_setgroups(const struct user_namespace *ns); +extern bool current_in_userns(const struct user_namespace *target_ns); #else static inline struct user_namespace *get_user_ns(struct user_namespace *ns) @@ -100,6 +101,11 @@ static inline bool userns_may_setgroups(const struct user_namespace *ns) { return true; } + +static inline bool current_in_userns(const struct user_namespace *target_ns) +{ + return true; +} #endif #endif /* _LINUX_USER_H */ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 9bafc211930c..68f594212759 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -938,6 +938,20 @@ bool userns_may_setgroups(const struct user_namespace *ns) return allowed; } +/* + * Returns true if @ns is the same namespace as or a descendant of + * @target_ns. 
+ */ +bool current_in_userns(const struct user_namespace *target_ns) +{ + struct user_namespace *ns; + for (ns = current_user_ns(); ns; ns = ns->parent) { + if (ns == target_ns) + return true; + } + return false; +} + static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); diff --git a/security/commoncap.c b/security/commoncap.c index e7fadde737f4..e109e6dac858 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -455,6 +455,8 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) return 0; + if (!current_in_userns(bprm->file->f_path.mnt->mnt_sb->s_user_ns)) + return 0; rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps); if (rc < 0) { -- cgit v1.2.3 From 380cf5ba6b0a0b307f4afb62b186ca801defb203 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 23 Jun 2016 16:41:05 -0500 Subject: fs: Treat foreign mounts as nosuid If a process gets access to a mount from a different user namespace, that process should not be able to take advantage of setuid files or selinux entrypoints from that filesystem. Prevent this by treating mounts from other mount namespaces and those not owned by current_user_ns() or an ancestor as nosuid. This will make it safer to allow more complex filesystems to be mounted in non-root user namespaces. This does not remove the need for MNT_LOCK_NOSUID. The setuid, setgid, and file capability bits can no longer be abused if code in a user namespace were to clear nosuid on an untrusted filesystem, but this patch, by itself, is insufficient to protect the system from abuse of files that, when execed, would increase MAC privilege. As a more concrete explanation, any task that can manipulate a vfsmount associated with a given user namespace already has capabilities in that namespace and all of its descendents. 
If they can cause a malicious setuid, setgid, or file-caps executable to appear in that mount, then that executable will only allow them to elevate privileges in exactly the set of namespaces in which they are already privileged. On the other hand, if they can cause a malicious executable to appear with a dangerous MAC label, running it could change the caller's security context in a way that should not have been possible, even inside the namespace in which the task is confined. As a hardening measure, this would have made CVE-2014-5207 much more difficult to exploit. Signed-off-by: Andy Lutomirski Signed-off-by: Seth Forshee Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- fs/exec.c | 2 +- fs/namespace.c | 13 +++++++++++++ include/linux/mount.h | 1 + security/commoncap.c | 8 +++++++- security/selinux/hooks.c | 2 +- 5 files changed, 23 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/fs/exec.c b/fs/exec.c index 887c1c955df8..ca239fc86d8d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1411,7 +1411,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm) bprm->cred->euid = current_euid(); bprm->cred->egid = current_egid(); - if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) + if (!mnt_may_suid(bprm->file->f_path.mnt)) return; if (task_no_new_privs(current)) diff --git a/fs/namespace.c b/fs/namespace.c index 9786a38d1681..aabe8e397fc3 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3280,6 +3280,19 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags) return !mnt_already_visible(ns, mnt, new_mnt_flags); } +bool mnt_may_suid(struct vfsmount *mnt) +{ + /* + * Foreign mounts (accessed via fchdir or through /proc + * symlinks) are always treated as if they are nosuid. This + * prevents namespaces from trusting potentially unsafe + * suid/sgid bits, file caps, or security labels that originate + * in other namespaces.
+ */ + return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) && + current_in_userns(mnt->mnt_sb->s_user_ns); +} + static struct ns_common *mntns_get(struct task_struct *task) { struct ns_common *ns = NULL; diff --git a/include/linux/mount.h b/include/linux/mount.h index f822c3c11377..54a594d49733 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -81,6 +81,7 @@ extern void mntput(struct vfsmount *mnt); extern struct vfsmount *mntget(struct vfsmount *mnt); extern struct vfsmount *mnt_clone_internal(struct path *path); extern int __mnt_is_readonly(struct vfsmount *mnt); +extern bool mnt_may_suid(struct vfsmount *mnt); struct path; extern struct vfsmount *clone_private_mount(struct path *path); diff --git a/security/commoncap.c b/security/commoncap.c index e109e6dac858..14540bd78561 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -453,8 +453,14 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c if (!file_caps_enabled) return 0; - if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) + if (!mnt_may_suid(bprm->file->f_path.mnt)) return 0; + + /* + * This check is redundant with mnt_may_suid() but is kept to make + * explicit that capability bits are limited to s_user_ns and its + * descendants. + */ if (!current_in_userns(bprm->file->f_path.mnt->mnt_sb->s_user_ns)) return 0; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index a86d537eb79b..15541756eb07 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2259,7 +2259,7 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm, const struct task_security_struct *new_tsec) { int nnp = (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS); - int nosuid = (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID); + int nosuid = !mnt_may_suid(bprm->file->f_path.mnt); int rc; if (!nnp && !nosuid) -- cgit v1.2.3 From 37b11804ed1725dc2ea97be2236150210a69e9d5 Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Mon, 27 Jun 2016 15:52:48 -0500 Subject: userns: Handle -1 in k[ug]id_has_mapping when !CONFIG_USER_NS Refuse to admit any user namespace has a mapping of the INVALID_UID and the INVALID_GID when !CONFIG_USER_NS. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- include/linux/uidgid.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h index 03835522dfcb..25e9d9216340 100644 --- a/include/linux/uidgid.h +++ b/include/linux/uidgid.h @@ -177,12 +177,12 @@ static inline gid_t from_kgid_munged(struct user_namespace *to, kgid_t kgid) static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid) { - return true; + return uid_valid(uid); } static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) { - return true; + return gid_valid(gid); } #endif /* CONFIG_USER_NS */ -- cgit v1.2.3 From 0d4d717f25834134bb6f43284f84c8ccee5bbf2a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 27 Jun 2016 16:04:06 -0500 Subject: vfs: Verify acls are valid within superblock's s_user_ns. Update posix_acl_valid to verify that an acl is within a user namespace. Update the callers of posix_acl_valid to pass in an appropriate user namespace. For posix_acl_xattr_set and v9fs_xattr_set_acl pass in inode->i_sb->s_user_ns to posix_acl_valid. For mdc_unpack_acl pass in &init_user_ns as no inode or superblock is in sight. Acked-by: Seth Forshee Signed-off-by: "Eric W.
Biederman" --- drivers/staging/lustre/lustre/mdc/mdc_request.c | 2 +- fs/9p/acl.c | 2 +- fs/posix_acl.c | 8 ++++---- include/linux/posix_acl.h | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c index 86b7445365f4..40cf57fad581 100644 --- a/drivers/staging/lustre/lustre/mdc/mdc_request.c +++ b/drivers/staging/lustre/lustre/mdc/mdc_request.c @@ -434,7 +434,7 @@ static int mdc_unpack_acl(struct ptlrpc_request *req, struct lustre_md *md) return rc; } - rc = posix_acl_valid(acl); + rc = posix_acl_valid(&init_user_ns, acl); if (rc) { CERROR("validate acl: %d\n", rc); posix_acl_release(acl); diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 0576eaeb60b9..5b6a1743ea17 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -266,7 +266,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, if (IS_ERR(acl)) return PTR_ERR(acl); else if (acl) { - retval = posix_acl_valid(acl); + retval = posix_acl_valid(inode->i_sb->s_user_ns, acl); if (retval) goto err_out; } diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 8a4a266beff3..647c28180675 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -205,7 +205,7 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags) * Check if an acl is valid. Returns 0 if it is, or -E... otherwise. 
*/ int -posix_acl_valid(const struct posix_acl *acl) +posix_acl_valid(struct user_namespace *user_ns, const struct posix_acl *acl) { const struct posix_acl_entry *pa, *pe; int state = ACL_USER_OBJ; @@ -225,7 +225,7 @@ posix_acl_valid(const struct posix_acl *acl) case ACL_USER: if (state != ACL_USER) return -EINVAL; - if (!uid_valid(pa->e_uid)) + if (!kuid_has_mapping(user_ns, pa->e_uid)) return -EINVAL; needs_mask = 1; break; @@ -240,7 +240,7 @@ posix_acl_valid(const struct posix_acl *acl) case ACL_GROUP: if (state != ACL_GROUP) return -EINVAL; - if (!gid_valid(pa->e_gid)) + if (!kgid_has_mapping(user_ns, pa->e_gid)) return -EINVAL; needs_mask = 1; break; @@ -845,7 +845,7 @@ posix_acl_xattr_set(const struct xattr_handler *handler, return PTR_ERR(acl); if (acl) { - ret = posix_acl_valid(acl); + ret = posix_acl_valid(inode->i_sb->s_user_ns, acl); if (ret) goto out; } diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 5b5a80cc5926..1af6438fde3e 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -81,7 +81,7 @@ posix_acl_release(struct posix_acl *acl) extern void posix_acl_init(struct posix_acl *, int); extern struct posix_acl *posix_acl_alloc(int, gfp_t); -extern int posix_acl_valid(const struct posix_acl *); +extern int posix_acl_valid(struct user_namespace *, const struct posix_acl *); extern int posix_acl_permission(struct inode *, const struct posix_acl *, int); extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t); extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *); -- cgit v1.2.3 From 0bd23d09b874e53bd1a2fe2296030aa2720d7b08 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 29 Jun 2016 14:54:46 -0500 Subject: vfs: Don't modify inodes with a uid or gid unknown to the vfs When a filesystem outside of init_user_ns is mounted it could have uids and gids stored in it that do not map to init_user_ns. 
The plan is to allow those filesystems to set i_uid to INVALID_UID and i_gid to INVALID_GID for unmapped uids and gids and then to handle that strange case in the vfs to ensure there is consistent robust handling of the weirdness. Upon a careful review of the vfs and filesystems about the only case where there is any possibility of confusion or trouble is when the inode is written back to disk. In that case filesystems typically read the inode->i_uid and inode->i_gid and write them to disk even when just an inode timestamp is being updated. Which leads to a rule that is very simple to implement and understand: inodes whose i_uid or i_gid is not valid may not be written. In dealing with access times this means treat those inodes as if the inode flag S_NOATIME was set. Reads of the inodes appear safe and useful, but any write or modification is disallowed. The only inode write that is allowed is a chown that sets the uid and gid on the inode to valid values. After such a chown the inode is normal and may be treated as such. Denying all writes to inodes with uids or gids unknown to the vfs also prevents several oddball cases where corruption would have occurred because the vfs does not have complete information. One problem case that is prevented is attempting to use the gid of a directory for new inodes where the directory's sgid bit is set but the directory's gid is not mapped. Another problem case avoided is attempting to update the evm hash after setxattr, removexattr, and setattr. As the evm hash includes the inode->i_uid or inode->i_gid, not knowing the uid or gid prevents a correct evm hash from being computed. evm hash verification also fails when i_uid or i_gid is unknown but that is essentially harmless as it does not cause filesystem corruption. Acked-by: Seth Forshee Signed-off-by: "Eric W.
Biederman" --- fs/attr.c | 8 ++++++++ fs/inode.c | 7 +++++++ fs/namei.c | 26 +++++++++++++++++++++----- fs/xattr.c | 7 +++++++ include/linux/fs.h | 5 +++++ 5 files changed, 48 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/fs/attr.c b/fs/attr.c index dd723578ddce..42bb42bb3c72 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -266,6 +266,14 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid)) return -EOVERFLOW; + /* Don't allow modifications of files with invalid uids or + * gids unless those uids & gids are being made valid. + */ + if (!(ia_valid & ATTR_UID) && !uid_valid(inode->i_uid)) + return -EOVERFLOW; + if (!(ia_valid & ATTR_GID) && !gid_valid(inode->i_gid)) + return -EOVERFLOW; + error = security_inode_setattr(dentry, attr); if (error) return error; diff --git a/fs/inode.c b/fs/inode.c index 4ccbc21b30ce..c0ebb97fb085 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1617,6 +1617,13 @@ bool atime_needs_update(const struct path *path, struct inode *inode) if (inode->i_flags & S_NOATIME) return false; + + /* Atime updates will likely cause i_uid and i_gid to be written + * back improperly if their true value is unknown to the vfs. + */ + if (HAS_UNMAPPED_ID(inode)) + return false; + if (IS_NOATIME(inode)) return false; if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) diff --git a/fs/namei.c b/fs/namei.c index 8701bd9a5270..840201c4c290 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -410,6 +410,14 @@ int __inode_permission(struct inode *inode, int mask) */ if (IS_IMMUTABLE(inode)) return -EACCES; + + /* + * Updating mtime will likely cause i_uid and i_gid to be + * written back improperly if their true value is unknown + * to the vfs. + */ + if (HAS_UNMAPPED_ID(inode)) + return -EACCES; } retval = do_inode_permission(inode, mask); @@ -2759,10 +2767,11 @@ EXPORT_SYMBOL(__check_sticky); * c. have CAP_FOWNER capability * 6.
If the victim is append-only or immutable we can't do antyhing with * links pointing to it. - * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. - * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. - * 9. We can't remove a root or mountpoint. - * 10. We don't allow removal of NFS sillyrenamed files; it's handled by + * 7. If the victim has an unknown uid or gid we can't change the inode. + * 8. If we were asked to remove a directory and victim isn't one - ENOTDIR. + * 9. If we were asked to remove a non-directory and victim isn't one - EISDIR. + * 10. We can't remove a root or mountpoint. + * 11. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) @@ -2784,7 +2793,7 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) return -EPERM; if (check_sticky(dir, inode) || IS_APPEND(inode) || - IS_IMMUTABLE(inode) || IS_SWAPFILE(inode)) + IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode)) return -EPERM; if (isdir) { if (!d_is_dir(victim)) @@ -4190,6 +4199,13 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; + /* + * Updating the link count will likely cause i_uid and i_gid to + * be written back improperly if their true value is unknown to + * the vfs.
+ */ + if (HAS_UNMAPPED_ID(inode)) + return -EPERM; if (!dir->i_op->link) return -EPERM; if (S_ISDIR(inode->i_mode)) diff --git a/fs/xattr.c b/fs/xattr.c index 4beafc43daa5..c243905835ab 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -38,6 +38,13 @@ xattr_permission(struct inode *inode, const char *name, int mask) if (mask & MAY_WRITE) { if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; + /* + * Updating an xattr will likely cause i_uid and i_gid + * to be written back improperly if their true value is + * unknown to the vfs. + */ + if (HAS_UNMAPPED_ID(inode)) + return -EPERM; } /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 375e37f42cdf..cb25ceb6d1ef 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1874,6 +1874,11 @@ struct super_operations { #define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ (inode)->i_rdev == WHITEOUT_DEV) +static inline bool HAS_UNMAPPED_ID(struct inode *inode) +{ + return !uid_valid(inode->i_uid) || !gid_valid(inode->i_gid); +} + /* * Inode state bits. Protected by inode->i_lock * -- cgit v1.2.3 From d49d37624a1931c2f3b5d0cbe95bd5181cbdc279 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 30 Jun 2016 16:31:01 -0500 Subject: quota: Ensure qids map to the filesystem Introduce the helper qid_has_mapping and use it to ensure that the quota system only considers qids that map to the filesystem's s_user_ns. In practice for quota supporting filesystems today this is the exact same check as qid_valid. As only 0xffffffff aka (qid_t)-1 does not map into init_user_ns. Replace the qid_valid calls with qid_has_mapping as values come in from userspace. This is harmless today and it prepares the quota system to work on filesystems with quotas but mounted by unprivileged users. Call qid_has_mapping from dqget. This ensures the passed in qid has a representation on the underlying filesystem. Previously this was unnecessary as filesystems never had qids that could not map.
With the introduction of filesystems outside of s_user_ns this will not remain true. All of this ensures the quota code never has to deal with qids that don't map to the underlying filesystem. Cc: Jan Kara Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/quota/dquot.c | 3 +++ fs/quota/quota.c | 12 ++++++------ include/linux/quota.h | 10 ++++++++++ 3 files changed, 19 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index ff21980d0119..74706b6aa747 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -841,6 +841,9 @@ struct dquot *dqget(struct super_block *sb, struct kqid qid) unsigned int hashent = hashfn(sb, qid); struct dquot *dquot, *empty = NULL; + if (!qid_has_mapping(sb->s_user_ns, qid)) + return ERR_PTR(-EINVAL); + if (!sb_has_quota_active(sb, qid.type)) return ERR_PTR(-ESRCH); we_slept: diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 0f10ee9892ce..73f6f4cf0a21 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -211,7 +211,7 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->get_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_dqblk(sb, qid, &fdq); if (ret) @@ -237,7 +237,7 @@ static int quota_getnextquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->get_nextdqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_nextdqblk(sb, &qid, &fdq); if (ret) @@ -288,7 +288,7 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->set_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; copy_from_if_dqblk(&fdq, &idq); return sb->s_qcop->set_dqblk(sb, qid, &fdq); 
@@ -581,7 +581,7 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->set_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; /* Are we actually setting timer / warning limits for all users? */ if (from_kqid(&init_user_ns, qid) == 0 && @@ -642,7 +642,7 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->get_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_dqblk(sb, qid, &qdq); if (ret) @@ -669,7 +669,7 @@ static int quota_getnextxquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->get_nextdqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_nextdqblk(sb, &qid, &qdq); if (ret) diff --git a/include/linux/quota.h b/include/linux/quota.h index 9dfb6bce8c9e..1db16ee39b31 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -179,6 +179,16 @@ static inline struct kqid make_kqid_projid(kprojid_t projid) return kqid; } +/** + * qid_has_mapping - Report if a qid maps into a user namespace. + * @ns: The user namespace to see if a value maps into. + * @qid: The kernel internal quota identifier to test. + */ +static inline bool qid_has_mapping(struct user_namespace *ns, struct kqid qid) +{ + return from_kqid(ns, qid) != (qid_t) -1; +} + extern spinlock_t dq_data_lock; -- cgit v1.2.3 From 81754357770ebd900801231e7bc8d151ddc00498 Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Thu, 11 Dec 2014 10:15:45 -0600 Subject: fs: Update i_[ug]id_(read|write) to translate relative to s_user_ns For filesystems mounted from a user namespace on-disk ids should be translated relative to s_user_ns rather than init_user_ns.
When an id in the filesystem doesn't exist in s_user_ns the associated id in the inode will be set to INVALID_[UG]ID, which turns these into de facto "nobody" ids. This actually maps pretty well into the way most code already works, and those places where it didn't were fixed in previous patches. Moving forward vfs code needs to be careful to handle instances where ids in inodes may be invalid. Signed-off-by: Seth Forshee Signed-off-by: Eric W. Biederman --- include/linux/fs.h | 50 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index cb25ceb6d1ef..8aa9b72e0bc5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -831,31 +831,6 @@ static inline void i_size_write(struct inode *inode, loff_t i_size) #endif } -/* Helper functions so that in most cases filesystems will - * not need to deal directly with kuid_t and kgid_t and can - * instead deal with the raw numeric values that are stored - * in the filesystem. - */ -static inline uid_t i_uid_read(const struct inode *inode) -{ - return from_kuid(&init_user_ns, inode->i_uid); -} - -static inline gid_t i_gid_read(const struct inode *inode) -{ - return from_kgid(&init_user_ns, inode->i_gid); -} - -static inline void i_uid_write(struct inode *inode, uid_t uid) -{ - inode->i_uid = make_kuid(&init_user_ns, uid); -} - -static inline void i_gid_write(struct inode *inode, gid_t gid) -{ - inode->i_gid = make_kgid(&init_user_ns, gid); -} - static inline unsigned iminor(const struct inode *inode) { return MINOR(inode->i_rdev); @@ -1461,6 +1436,31 @@ struct super_block { struct list_head s_inodes; /* all inodes */ }; +/* Helper functions so that in most cases filesystems will + * not need to deal directly with kuid_t and kgid_t and can + * instead deal with the raw numeric values that are stored + * in the filesystem. 
+ */ +static inline uid_t i_uid_read(const struct inode *inode) +{ + return from_kuid(inode->i_sb->s_user_ns, inode->i_uid); +} + +static inline gid_t i_gid_read(const struct inode *inode) +{ + return from_kgid(inode->i_sb->s_user_ns, inode->i_gid); +} + +static inline void i_uid_write(struct inode *inode, uid_t uid) +{ + inode->i_uid = make_kuid(inode->i_sb->s_user_ns, uid); +} + +static inline void i_gid_write(struct inode *inode, gid_t gid) +{ + inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid); +} + extern struct timespec current_fs_time(struct super_block *sb); /* -- cgit v1.2.3