summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/ext4/ext4_jbd2.c3
-rw-r--r--fs/ext4/mmp.c2
-rw-r--r--fs/ext4/namei.c2
-rw-r--r--fs/io-wq.c71
-rw-r--r--fs/namespace.c42
-rw-r--r--fs/notify/fanotify/fanotify_user.c17
-rw-r--r--fs/notify/inotify/inotify_user.c17
-rw-r--r--fs/overlayfs/export.c2
-rw-r--r--fs/overlayfs/file.c47
-rw-r--r--fs/overlayfs/readdir.c5
-rw-r--r--fs/pipe.c19
11 files changed, 166 insertions, 61 deletions
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index b96ecba91899..b60f0152ea57 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -244,9 +244,6 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
* "bh" may be NULL: a metadata block may have been freed from memory
* but there may still be a record of it in the journal, and that record
* still needs to be revoked.
- *
- * If the handle isn't valid we're not journaling, but we still need to
- * call into ext4_journal_revoke() to put the buffer head.
*/
int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
int is_metadata, struct inode *inode,
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index bc364c119af6..cebea4270817 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -138,7 +138,7 @@ static int kmmpd(void *data)
unsigned mmp_check_interval;
unsigned long last_update_time;
unsigned long diff;
- int retval;
+ int retval = 0;
mmp_block = le64_to_cpu(es->s_mmp_block);
mmp = (struct mmp_struct *)(bh->b_data);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 5fd56f616cf0..f3bbcd4efb56 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2517,7 +2517,7 @@ again:
goto journal_error;
err = ext4_handle_dirty_dx_node(handle, dir,
frame->bh);
- if (err)
+ if (restart || err)
goto journal_error;
} else {
struct dx_root *dxroot;
diff --git a/fs/io-wq.c b/fs/io-wq.c
index cf086b01c6c6..12fc19353bb0 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -130,6 +130,7 @@ struct io_cb_cancel_data {
};
static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index);
+static void io_wqe_dec_running(struct io_worker *worker);
static bool io_worker_get(struct io_worker *worker)
{
@@ -168,26 +169,21 @@ static void io_worker_exit(struct io_worker *worker)
{
struct io_wqe *wqe = worker->wqe;
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
- unsigned flags;
if (refcount_dec_and_test(&worker->ref))
complete(&worker->ref_done);
wait_for_completion(&worker->ref_done);
- preempt_disable();
- current->flags &= ~PF_IO_WORKER;
- flags = worker->flags;
- worker->flags = 0;
- if (flags & IO_WORKER_F_RUNNING)
- atomic_dec(&acct->nr_running);
- worker->flags = 0;
- preempt_enable();
-
raw_spin_lock_irq(&wqe->lock);
- if (flags & IO_WORKER_F_FREE)
+ if (worker->flags & IO_WORKER_F_FREE)
hlist_nulls_del_rcu(&worker->nulls_node);
list_del_rcu(&worker->all_list);
acct->nr_workers--;
+ preempt_disable();
+ io_wqe_dec_running(worker);
+ worker->flags = 0;
+ current->flags &= ~PF_IO_WORKER;
+ preempt_enable();
raw_spin_unlock_irq(&wqe->lock);
kfree_rcu(worker, rcu);
@@ -214,15 +210,19 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
struct hlist_nulls_node *n;
struct io_worker *worker;
- n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list));
- if (is_a_nulls(n))
- return false;
-
- worker = hlist_nulls_entry(n, struct io_worker, nulls_node);
- if (io_worker_get(worker)) {
- wake_up_process(worker->task);
+ /*
+ * Iterate free_list and see if we can find an idle worker to
+ * activate. If a given worker is on the free_list but in the process
+ * of exiting, keep trying.
+ */
+ hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) {
+ if (!io_worker_get(worker))
+ continue;
+ if (wake_up_process(worker->task)) {
+ io_worker_release(worker);
+ return true;
+ }
io_worker_release(worker);
- return true;
}
return false;
@@ -247,10 +247,19 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
ret = io_wqe_activate_free_worker(wqe);
rcu_read_unlock();
- if (!ret && acct->nr_workers < acct->max_workers) {
- atomic_inc(&acct->nr_running);
- atomic_inc(&wqe->wq->worker_refs);
- create_io_worker(wqe->wq, wqe, acct->index);
+ if (!ret) {
+ bool do_create = false;
+
+ raw_spin_lock_irq(&wqe->lock);
+ if (acct->nr_workers < acct->max_workers) {
+ atomic_inc(&acct->nr_running);
+ atomic_inc(&wqe->wq->worker_refs);
+ acct->nr_workers++;
+ do_create = true;
+ }
+ raw_spin_unlock_irq(&wqe->lock);
+ if (do_create)
+ create_io_worker(wqe->wq, wqe, acct->index);
}
}
@@ -271,9 +280,17 @@ static void create_worker_cb(struct callback_head *cb)
{
struct create_worker_data *cwd;
struct io_wq *wq;
+ struct io_wqe *wqe;
+ struct io_wqe_acct *acct;
cwd = container_of(cb, struct create_worker_data, work);
- wq = cwd->wqe->wq;
+ wqe = cwd->wqe;
+ wq = wqe->wq;
+ acct = &wqe->acct[cwd->index];
+ raw_spin_lock_irq(&wqe->lock);
+ if (acct->nr_workers < acct->max_workers)
+ acct->nr_workers++;
+ raw_spin_unlock_irq(&wqe->lock);
create_io_worker(wq, cwd->wqe, cwd->index);
kfree(cwd);
}
@@ -635,6 +652,9 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
kfree(worker);
fail:
atomic_dec(&acct->nr_running);
+ raw_spin_lock_irq(&wqe->lock);
+ acct->nr_workers--;
+ raw_spin_unlock_irq(&wqe->lock);
io_worker_ref_put(wq);
return;
}
@@ -650,9 +670,8 @@ fail:
worker->flags |= IO_WORKER_F_FREE;
if (index == IO_WQ_ACCT_BOUND)
worker->flags |= IO_WORKER_F_BOUND;
- if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND))
+ if ((acct->nr_workers == 1) && (worker->flags & IO_WORKER_F_BOUND))
worker->flags |= IO_WORKER_F_FIXED;
- acct->nr_workers++;
raw_spin_unlock_irq(&wqe->lock);
wake_up_new_task(tsk);
}
diff --git a/fs/namespace.c b/fs/namespace.c
index ab4174a3c802..f79d9471cb76 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1938,6 +1938,20 @@ void drop_collected_mounts(struct vfsmount *mnt)
namespace_unlock();
}
+static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
+{
+ struct mount *child;
+
+ list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
+ if (!is_subdir(child->mnt_mountpoint, dentry))
+ continue;
+
+ if (child->mnt.mnt_flags & MNT_LOCKED)
+ return true;
+ }
+ return false;
+}
+
/**
* clone_private_mount - create a private clone of a path
* @path: path to clone
@@ -1953,10 +1967,19 @@ struct vfsmount *clone_private_mount(const struct path *path)
struct mount *old_mnt = real_mount(path->mnt);
struct mount *new_mnt;
+ down_read(&namespace_sem);
if (IS_MNT_UNBINDABLE(old_mnt))
- return ERR_PTR(-EINVAL);
+ goto invalid;
+
+ if (!check_mnt(old_mnt))
+ goto invalid;
+
+ if (has_locked_children(old_mnt, path->dentry))
+ goto invalid;
new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
+ up_read(&namespace_sem);
+
if (IS_ERR(new_mnt))
return ERR_CAST(new_mnt);
@@ -1964,6 +1987,10 @@ struct vfsmount *clone_private_mount(const struct path *path)
new_mnt->mnt_ns = MNT_NS_INTERNAL;
return &new_mnt->mnt;
+
+invalid:
+ up_read(&namespace_sem);
+ return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(clone_private_mount);
@@ -2315,19 +2342,6 @@ static int do_change_type(struct path *path, int ms_flags)
return err;
}
-static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
-{
- struct mount *child;
- list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
- if (!is_subdir(child->mnt_mountpoint, dentry))
- continue;
-
- if (child->mnt.mnt_flags & MNT_LOCKED)
- return true;
- }
- return false;
-}
-
static struct mount *__do_loopback(struct path *old_path, int recurse)
{
struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 64864fb40b40..28b67cb9458d 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -54,22 +54,27 @@ static int fanotify_max_queued_events __read_mostly;
#include <linux/sysctl.h>
+static long ft_zero = 0;
+static long ft_int_max = INT_MAX;
+
struct ctl_table fanotify_table[] = {
{
.procname = "max_user_groups",
.data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(long),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &ft_zero,
+ .extra2 = &ft_int_max,
},
{
.procname = "max_user_marks",
.data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(long),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &ft_zero,
+ .extra2 = &ft_int_max,
},
{
.procname = "max_queued_events",
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 98f61b31745a..62051247f6d2 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -55,22 +55,27 @@ struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
#include <linux/sysctl.h>
+static long it_zero = 0;
+static long it_int_max = INT_MAX;
+
struct ctl_table inotify_table[] = {
{
.procname = "max_user_instances",
.data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(long),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &it_zero,
+ .extra2 = &it_int_max,
},
{
.procname = "max_user_watches",
.data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(long),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &it_zero,
+ .extra2 = &it_int_max,
},
{
.procname = "max_queued_events",
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 41ebf52f1bbc..ebde05c9cf62 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -392,6 +392,7 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected,
*/
take_dentry_name_snapshot(&name, real);
this = lookup_one_len(name.name.name, connected, name.name.len);
+ release_dentry_name_snapshot(&name);
err = PTR_ERR(this);
if (IS_ERR(this)) {
goto fail;
@@ -406,7 +407,6 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected,
}
out:
- release_dentry_name_snapshot(&name);
dput(parent);
inode_unlock(dir);
return this;
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 4d53d3b7e5fe..d081faa55e83 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -392,6 +392,51 @@ out_unlock:
return ret;
}
+/*
+ * Calling iter_file_splice_write() directly from overlay's f_op may deadlock
+ * due to lock order inversion between pipe->mutex in iter_file_splice_write()
+ * and file_start_write(real.file) in ovl_write_iter().
+ *
+ * So do everything ovl_write_iter() does and call iter_file_splice_write() on
+ * the real file.
+ */
+static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
+ loff_t *ppos, size_t len, unsigned int flags)
+{
+ struct fd real;
+ const struct cred *old_cred;
+ struct inode *inode = file_inode(out);
+ struct inode *realinode = ovl_inode_real(inode);
+ ssize_t ret;
+
+ inode_lock(inode);
+ /* Update mode */
+ ovl_copyattr(realinode, inode);
+ ret = file_remove_privs(out);
+ if (ret)
+ goto out_unlock;
+
+ ret = ovl_real_fdget(out, &real);
+ if (ret)
+ goto out_unlock;
+
+ old_cred = ovl_override_creds(inode->i_sb);
+ file_start_write(real.file);
+
+ ret = iter_file_splice_write(pipe, real.file, ppos, len, flags);
+
+ file_end_write(real.file);
+ /* Update size */
+ ovl_copyattr(realinode, inode);
+ revert_creds(old_cred);
+ fdput(real);
+
+out_unlock:
+ inode_unlock(inode);
+
+ return ret;
+}
+
static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct fd real;
@@ -603,7 +648,7 @@ const struct file_operations ovl_file_operations = {
.fadvise = ovl_fadvise,
.flush = ovl_flush,
.splice_read = generic_file_splice_read,
- .splice_write = iter_file_splice_write,
+ .splice_write = ovl_splice_write,
.copy_file_range = ovl_copy_file_range,
.remap_file_range = ovl_remap_file_range,
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index e8ad2c2c77dd..150fdf3bc68d 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -481,6 +481,8 @@ static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p)
}
this = lookup_one_len(p->name, dir, p->len);
if (IS_ERR_OR_NULL(this) || !this->d_inode) {
+ /* Mark a stale entry */
+ p->is_whiteout = true;
if (IS_ERR(this)) {
err = PTR_ERR(this);
this = NULL;
@@ -776,6 +778,9 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
if (err)
goto out;
}
+ }
+ /* ovl_cache_update_ino() sets is_whiteout on stale entry */
+ if (!p->is_whiteout) {
if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
break;
}
diff --git a/fs/pipe.c b/fs/pipe.c
index 9ef4231cce61..8e6ef62aeb1c 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -32,6 +32,21 @@
#include "internal.h"
/*
+ * New pipe buffers will be restricted to this size while the user is exceeding
+ * their pipe buffer quota. The general pipe use case needs at least two
+ * buffers: one for data yet to be read, and one for new data. If this is less
+ * than two, then a write to a non-empty pipe may block even if the pipe is not
+ * full. This can occur with GNU make jobserver or similar uses of pipes as
+ * semaphores: multiple processes may be waiting to write tokens back to the
+ * pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/.
+ *
+ * Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their
+ * own risk, namely: pipe writes to non-full pipes may block until the pipe is
+ * emptied.
+ */
+#define PIPE_MIN_DEF_BUFFERS 2
+
+/*
* The max size that a non-root user is allowed to grow the pipe. Can
* be set by root in /proc/sys/fs/pipe-max-size
*/
@@ -781,8 +796,8 @@ struct pipe_inode_info *alloc_pipe_info(void)
user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
- user_bufs = account_pipe_buffers(user, pipe_bufs, 1);
- pipe_bufs = 1;
+ user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS);
+ pipe_bufs = PIPE_MIN_DEF_BUFFERS;
}
if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())