summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/block_dev.c17
-rw-r--r--fs/btrfs/extent_io.c2
-rw-r--r--fs/btrfs/scrub.c2
-rw-r--r--fs/cifs/cifs_debug.c2
-rw-r--r--fs/cifs/cifsfs.c2
-rw-r--r--fs/cifs/cifsglob.h19
-rw-r--r--fs/cifs/connect.c14
-rw-r--r--fs/cifs/sess.c1
-rw-r--r--fs/cifs/smb2inode.c1
-rw-r--r--fs/cifs/smb2misc.c10
-rw-r--r--fs/cifs/smb2ops.c10
-rw-r--r--fs/cifs/smb2pdu.c6
-rw-r--r--fs/cifs/smb2proto.h3
-rw-r--r--fs/cifs/transport.c4
-rw-r--r--fs/configfs/file.c6
-rw-r--r--fs/crypto/bio.c6
-rw-r--r--fs/erofs/zdata.c2
-rw-r--r--fs/ext4/page-io.c2
-rw-r--r--fs/f2fs/checkpoint.c2
-rw-r--r--fs/f2fs/data.c4
-rw-r--r--fs/f2fs/segment.c2
-rw-r--r--fs/f2fs/segment.h4
-rw-r--r--fs/f2fs/super.c4
-rw-r--r--fs/gfs2/log.c6
-rw-r--r--fs/gfs2/lops.c2
-rw-r--r--fs/gfs2/ops_fstype.c4
-rw-r--r--fs/gfs2/super.c10
-rw-r--r--fs/gfs2/super.h2
-rw-r--r--fs/gfs2/trans.c2
-rw-r--r--fs/gfs2/util.c17
-rw-r--r--fs/io-wq.c25
-rw-r--r--fs/io-wq.h2
-rw-r--r--fs/io_uring.c837
-rw-r--r--fs/iomap/buffered-io.c4
-rw-r--r--fs/iomap/direct-io.c4
-rw-r--r--fs/mpage.c2
-rw-r--r--fs/nfs/Kconfig2
-rw-r--r--fs/nfs/dir.c58
-rw-r--r--fs/nfs/inode.c7
-rw-r--r--fs/nfs/internal.h3
-rw-r--r--fs/nfs/nfs3xdr.c3
-rw-r--r--fs/nfs/nfs42proc.c12
-rw-r--r--fs/nfs/nfs4proc.c33
-rw-r--r--fs/nfs/unlink.c6
-rw-r--r--fs/nfs/write.c8
-rw-r--r--fs/nilfs2/segbuf.c2
-rw-r--r--fs/pnode.h2
-rw-r--r--fs/squashfs/block.c2
-rw-r--r--fs/zonefs/super.c2
50 files changed, 639 insertions, 545 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 462253ae483a..a55bda4233bb 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -203,7 +203,7 @@ config TMPFS_XATTR
config TMPFS_INODE64
bool "Use 64-bit ino_t by default in tmpfs"
- depends on TMPFS && 64BIT && !(S390 || ALPHA)
+ depends on TMPFS && 64BIT
default n
help
tmpfs has historically used only inode numbers as wide as an unsigned
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4aa1f88d5bf8..92ed7d5df677 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -118,13 +118,22 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
if (!(mode & FMODE_EXCL)) {
int err = bd_prepare_to_claim(bdev, truncate_bdev_range);
if (err)
- return err;
+ goto invalidate;
}
truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
if (!(mode & FMODE_EXCL))
bd_abort_claiming(bdev, truncate_bdev_range);
return 0;
+
+invalidate:
+ /*
+ * Someone else has handle exclusively open. Try invalidating instead.
+ * The 'end' argument is inclusive so the rounding is safe.
+ */
+ return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
+ lstart >> PAGE_SHIFT,
+ lend >> PAGE_SHIFT);
}
static void set_init_blocksize(struct block_device *bdev)
@@ -423,7 +432,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
dio->size += bio->bi_iter.bi_size;
pos += bio->bi_iter.bi_size;
- nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES);
+ nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
if (!nr_pages) {
bool polled = false;
@@ -491,8 +500,8 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (!iov_iter_count(iter))
return 0;
- nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES + 1);
- if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES)
+ nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
+ if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS)
return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4671c99d468d..191e358f1322 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3059,7 +3059,7 @@ struct bio *btrfs_bio_alloc(u64 first_byte)
{
struct bio *bio;
- bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset);
+ bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &btrfs_bioset);
bio->bi_iter.bi_sector = first_byte >> 9;
btrfs_io_bio_init(btrfs_io_bio(bio));
return bio;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index c2900ebf767a..3d9088eab2fc 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1428,7 +1428,7 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
if (!first_page->dev->bdev)
goto out;
- bio = btrfs_io_bio_alloc(BIO_MAX_PAGES);
+ bio = btrfs_io_bio_alloc(BIO_MAX_VECS);
bio_set_dev(bio, first_page->dev->bdev);
for (page_num = 0; page_num < sblock->page_count; page_num++) {
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 3aedc484e440..88a7958170ee 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -207,7 +207,7 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
from_kuid(&init_user_ns, cfile->uid),
cfile->dentry);
#ifdef CONFIG_CIFS_DEBUG2
- seq_printf(m, " 0x%llx\n", cfile->fid.mid);
+ seq_printf(m, " %llu\n", cfile->fid.mid);
#else
seq_printf(m, "\n");
#endif /* CIFS_DEBUG2 */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index d43e935d2df4..099ad9f3660b 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -290,7 +290,7 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
rc = server->ops->queryfs(xid, tcon, cifs_sb, buf);
free_xid(xid);
- return 0;
+ return rc;
}
static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len)
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 3de3c5908a72..31fc8695abd6 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -257,7 +257,7 @@ struct smb_version_operations {
/* verify the message */
int (*check_message)(char *, unsigned int, struct TCP_Server_Info *);
bool (*is_oplock_break)(char *, struct TCP_Server_Info *);
- int (*handle_cancelled_mid)(char *, struct TCP_Server_Info *);
+ int (*handle_cancelled_mid)(struct mid_q_entry *, struct TCP_Server_Info *);
void (*downgrade_oplock)(struct TCP_Server_Info *server,
struct cifsInodeInfo *cinode, __u32 oplock,
unsigned int epoch, bool *purge_cache);
@@ -1705,16 +1705,17 @@ static inline bool is_retryable_error(int error)
#define CIFS_NO_RSP_BUF 0x040 /* no response buffer required */
/* Type of request operation */
-#define CIFS_ECHO_OP 0x080 /* echo request */
-#define CIFS_OBREAK_OP 0x0100 /* oplock break request */
-#define CIFS_NEG_OP 0x0200 /* negotiate request */
+#define CIFS_ECHO_OP 0x080 /* echo request */
+#define CIFS_OBREAK_OP 0x0100 /* oplock break request */
+#define CIFS_NEG_OP 0x0200 /* negotiate request */
+#define CIFS_CP_CREATE_CLOSE_OP 0x0400 /* compound create+close request */
/* Lower bitmask values are reserved by others below. */
-#define CIFS_SESS_OP 0x2000 /* session setup request */
-#define CIFS_OP_MASK 0x2380 /* mask request type */
+#define CIFS_SESS_OP 0x2000 /* session setup request */
+#define CIFS_OP_MASK 0x2780 /* mask request type */
-#define CIFS_HAS_CREDITS 0x0400 /* already has credits */
-#define CIFS_TRANSFORM_REQ 0x0800 /* transform request before sending */
-#define CIFS_NO_SRV_RSP 0x1000 /* there is no server response */
+#define CIFS_HAS_CREDITS 0x0400 /* already has credits */
+#define CIFS_TRANSFORM_REQ 0x0800 /* transform request before sending */
+#define CIFS_NO_SRV_RSP 0x1000 /* there is no server response */
/* Security Flags: indicate type of session setup needed */
#define CIFSSEC_MAY_SIGN 0x00001
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 112692300fb6..eec8a2052da2 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -741,7 +741,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
spin_lock(&GlobalMid_Lock);
list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
- cifs_dbg(FYI, "Clearing mid 0x%llx\n", mid_entry->mid);
+ cifs_dbg(FYI, "Clearing mid %llu\n", mid_entry->mid);
kref_get(&mid_entry->refcount);
mid_entry->mid_state = MID_SHUTDOWN;
list_move(&mid_entry->qhead, &dispose_list);
@@ -752,7 +752,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
/* now walk dispose list and issue callbacks */
list_for_each_safe(tmp, tmp2, &dispose_list) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
- cifs_dbg(FYI, "Callback mid 0x%llx\n", mid_entry->mid);
+ cifs_dbg(FYI, "Callback mid %llu\n", mid_entry->mid);
list_del_init(&mid_entry->qhead);
mid_entry->callback(mid_entry);
cifs_mid_q_entry_release(mid_entry);
@@ -1429,6 +1429,11 @@ smbd_connected:
tcp_ses->min_offload = ctx->min_offload;
tcp_ses->tcpStatus = CifsNeedNegotiate;
+ if ((ctx->max_credits < 20) || (ctx->max_credits > 60000))
+ tcp_ses->max_credits = SMB2_MAX_CREDITS_AVAILABLE;
+ else
+ tcp_ses->max_credits = ctx->max_credits;
+
tcp_ses->nr_targets = 1;
tcp_ses->ignore_signature = ctx->ignore_signature;
/* thread spawned, put it on the list */
@@ -2832,11 +2837,6 @@ static int mount_get_conns(struct smb3_fs_context *ctx, struct cifs_sb_info *cif
*nserver = server;
- if ((ctx->max_credits < 20) || (ctx->max_credits > 60000))
- server->max_credits = SMB2_MAX_CREDITS_AVAILABLE;
- else
- server->max_credits = ctx->max_credits;
-
/* get a reference to a SMB session */
ses = cifs_get_smb_ses(server, ctx);
if (IS_ERR(ses)) {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 183a3a868d7b..63d517b9f2ff 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -230,6 +230,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
ctx.noautotune = ses->server->noautotune;
ctx.sockopt_tcp_nodelay = ses->server->tcp_nodelay;
ctx.echo_interval = ses->server->echo_interval / HZ;
+ ctx.max_credits = ses->server->max_credits;
/*
* This will be used for encoding/decoding user/domain/pw
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 1f900b81c34a..a718dc77e604 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -358,6 +358,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
if (cfile)
goto after_close;
/* Close */
+ flags |= CIFS_CP_CREATE_CLOSE_OP;
rqst[num_rqst].rq_iov = &vars->close_iov[0];
rqst[num_rqst].rq_nvec = 1;
rc = SMB2_close_init(tcon, server,
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 60d4bd1eae2b..b50164e2c88d 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -767,7 +767,7 @@ smb2_cancelled_close_fid(struct work_struct *work)
int rc;
if (cancelled->mid)
- cifs_tcon_dbg(VFS, "Close unmatched open for MID:%llx\n",
+ cifs_tcon_dbg(VFS, "Close unmatched open for MID:%llu\n",
cancelled->mid);
else
cifs_tcon_dbg(VFS, "Close interrupted close\n");
@@ -844,14 +844,14 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid,
}
int
-smb2_handle_cancelled_mid(char *buffer, struct TCP_Server_Info *server)
+smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server)
{
- struct smb2_sync_hdr *sync_hdr = (struct smb2_sync_hdr *)buffer;
- struct smb2_create_rsp *rsp = (struct smb2_create_rsp *)buffer;
+ struct smb2_sync_hdr *sync_hdr = mid->resp_buf;
+ struct smb2_create_rsp *rsp = mid->resp_buf;
struct cifs_tcon *tcon;
int rc;
- if (sync_hdr->Command != SMB2_CREATE ||
+ if ((mid->optype & CIFS_CP_CREATE_CLOSE_OP) || sync_hdr->Command != SMB2_CREATE ||
sync_hdr->Status != STATUS_SUCCESS)
return 0;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index f5087295424c..9bae7e8deb09 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -1195,7 +1195,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
struct TCP_Server_Info *server = cifs_pick_channel(ses);
__le16 *utf16_path = NULL;
int ea_name_len = strlen(ea_name);
- int flags = 0;
+ int flags = CIFS_CP_CREATE_CLOSE_OP;
int len;
struct smb_rqst rqst[3];
int resp_buftype[3];
@@ -1573,7 +1573,7 @@ smb2_ioctl_query_info(const unsigned int xid,
struct smb_query_info qi;
struct smb_query_info __user *pqi;
int rc = 0;
- int flags = 0;
+ int flags = CIFS_CP_CREATE_CLOSE_OP;
struct smb2_query_info_rsp *qi_rsp = NULL;
struct smb2_ioctl_rsp *io_rsp = NULL;
void *buffer = NULL;
@@ -2577,7 +2577,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
{
struct cifs_ses *ses = tcon->ses;
struct TCP_Server_Info *server = cifs_pick_channel(ses);
- int flags = 0;
+ int flags = CIFS_CP_CREATE_CLOSE_OP;
struct smb_rqst rqst[3];
int resp_buftype[3];
struct kvec rsp_iov[3];
@@ -2975,7 +2975,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
unsigned int sub_offset;
unsigned int print_len;
unsigned int print_offset;
- int flags = 0;
+ int flags = CIFS_CP_CREATE_CLOSE_OP;
struct smb_rqst rqst[3];
int resp_buftype[3];
struct kvec rsp_iov[3];
@@ -3157,7 +3157,7 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_open_parms oparms;
struct cifs_fid fid;
struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses);
- int flags = 0;
+ int flags = CIFS_CP_CREATE_CLOSE_OP;
struct smb_rqst rqst[3];
int resp_buftype[3];
struct kvec rsp_iov[3];
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 4bbb6126b14d..2199a9bfae8f 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -4041,8 +4041,7 @@ smb2_async_readv(struct cifs_readdata *rdata)
if (rdata->credits.value > 0) {
shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
SMB2_MAX_BUFFER_SIZE));
- shdr->CreditRequest =
- cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1);
+ shdr->CreditRequest = cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 8);
rc = adjust_credits(server, &rdata->credits, rdata->bytes);
if (rc)
@@ -4348,8 +4347,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
if (wdata->credits.value > 0) {
shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes,
SMB2_MAX_BUFFER_SIZE));
- shdr->CreditRequest =
- cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1);
+ shdr->CreditRequest = cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 8);
rc = adjust_credits(server, &wdata->credits, wdata->bytes);
if (rc)
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 9565e27681a5..a2eb34a8d9c9 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -246,8 +246,7 @@ extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
extern int smb2_handle_cancelled_close(struct cifs_tcon *tcon,
__u64 persistent_fid,
__u64 volatile_fid);
-extern int smb2_handle_cancelled_mid(char *buffer,
- struct TCP_Server_Info *server);
+extern int smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server);
void smb2_cancelled_close_fid(struct work_struct *work);
extern int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id,
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index e90a1d1380b0..007d99437c77 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -101,7 +101,7 @@ static void _cifs_mid_q_entry_release(struct kref *refcount)
if (midEntry->resp_buf && (midEntry->mid_flags & MID_WAIT_CANCELLED) &&
midEntry->mid_state == MID_RESPONSE_RECEIVED &&
server->ops->handle_cancelled_mid)
- server->ops->handle_cancelled_mid(midEntry->resp_buf, server);
+ server->ops->handle_cancelled_mid(midEntry, server);
midEntry->mid_state = MID_FREE;
atomic_dec(&midCount);
@@ -1207,7 +1207,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
}
if (rc != 0) {
for (; i < num_rqst; i++) {
- cifs_server_dbg(VFS, "Cancelling wait for mid %llu cmd: %d\n",
+ cifs_server_dbg(FYI, "Cancelling wait for mid %llu cmd: %d\n",
midQ[i]->mid, le16_to_cpu(midQ[i]->command));
send_cancel(server, &rqst[i], midQ[i]);
spin_lock(&GlobalMid_Lock);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 1f0270229d7b..da8351d1e455 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -378,7 +378,7 @@ static int __configfs_open_file(struct inode *inode, struct file *file, int type
attr = to_attr(dentry);
if (!attr)
- goto out_put_item;
+ goto out_free_buffer;
if (type & CONFIGFS_ITEM_BIN_ATTR) {
buffer->bin_attr = to_bin_attr(dentry);
@@ -391,7 +391,7 @@ static int __configfs_open_file(struct inode *inode, struct file *file, int type
/* Grab the module reference for this attribute if we have one */
error = -ENODEV;
if (!try_module_get(buffer->owner))
- goto out_put_item;
+ goto out_free_buffer;
error = -EACCES;
if (!buffer->item->ci_type)
@@ -435,8 +435,6 @@ static int __configfs_open_file(struct inode *inode, struct file *file, int type
out_put_module:
module_put(buffer->owner);
-out_put_item:
- config_item_put(buffer->item);
out_free_buffer:
up_read(&frag->frag_sem);
kfree(buffer);
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index b048a0e38516..68a2de6b5a9b 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -52,7 +52,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode,
int num_pages = 0;
/* This always succeeds since __GFP_DIRECT_RECLAIM is set. */
- bio = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
+ bio = bio_alloc(GFP_NOFS, BIO_MAX_VECS);
while (len) {
unsigned int blocks_this_page = min(len, blocks_per_page);
@@ -74,7 +74,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode,
len -= blocks_this_page;
lblk += blocks_this_page;
pblk += blocks_this_page;
- if (num_pages == BIO_MAX_PAGES || !len ||
+ if (num_pages == BIO_MAX_VECS || !len ||
!fscrypt_mergeable_bio(bio, inode, lblk)) {
err = submit_bio_wait(bio);
if (err)
@@ -126,7 +126,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
return fscrypt_zeroout_range_inline_crypt(inode, lblk, pblk,
len);
- BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_PAGES);
+ BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_VECS);
nr_pages = min_t(unsigned int, ARRAY_SIZE(pages),
(len + blocks_per_page - 1) >> blocks_per_page_bits);
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 6cb356c4217b..3851e1a64f73 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -1235,7 +1235,7 @@ submit_bio_retry:
}
if (!bio) {
- bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
+ bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
bio->bi_end_io = z_erofs_decompressqueue_endio;
bio_set_dev(bio, sb->s_bdev);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 03a44a0de86a..f038d578d8d8 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -398,7 +398,7 @@ static void io_submit_init_bio(struct ext4_io_submit *io,
* bio_alloc will _always_ be able to allocate a bio if
* __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset().
*/
- bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
+ bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 174a0819ad96..be5415a0dbbc 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -292,7 +292,7 @@ void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
f2fs_put_page(page, 0);
if (readahead)
- f2fs_ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true);
+ f2fs_ra_meta_pages(sbi, index, BIO_MAX_VECS, META_POR, true);
}
static int __f2fs_write_meta_page(struct page *page,
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7c95818639a6..4e5257c763d0 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -857,7 +857,7 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
f2fs_submit_merged_ipu_write(fio->sbi, &bio, NULL);
alloc_new:
if (!bio) {
- bio = __bio_alloc(fio, BIO_MAX_PAGES);
+ bio = __bio_alloc(fio, BIO_MAX_VECS);
__attach_io_flag(fio);
f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host,
fio->page->index, fio, GFP_NOIO);
@@ -932,7 +932,7 @@ alloc_new:
fio->retry = true;
goto skip;
}
- io->bio = __bio_alloc(fio, BIO_MAX_PAGES);
+ io->bio = __bio_alloc(fio, BIO_MAX_VECS);
f2fs_set_bio_crypt_ctx(io->bio, fio->page->mapping->host,
bio_page->index, fio, GFP_NOIO);
io->fio = *fio;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 993004f06a77..c2866561263e 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -4381,7 +4381,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
block_t total_node_blocks = 0;
do {
- readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES,
+ readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_VECS,
META_SIT, true);
start = start_blk * sit_i->sents_per_block;
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 229814b4f4a6..e9a7a637d688 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -851,7 +851,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
else if (type == NODE)
return 8 * sbi->blocks_per_seg;
else if (type == META)
- return 8 * BIO_MAX_PAGES;
+ return 8 * BIO_MAX_VECS;
else
return 0;
}
@@ -868,7 +868,7 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type,
return 0;
nr_to_write = wbc->nr_to_write;
- desired = BIO_MAX_PAGES;
+ desired = BIO_MAX_VECS;
if (type == NODE)
desired <<= 1;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 7069793752f1..82592b19b4e0 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -753,9 +753,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
case Opt_io_size_bits:
if (args->from && match_int(args, &arg))
return -EINVAL;
- if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_PAGES)) {
+ if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_VECS)) {
f2fs_warn(sbi, "Not support %d, larger than %d",
- 1 << arg, BIO_MAX_PAGES);
+ 1 << arg, BIO_MAX_VECS);
return -EINVAL;
}
F2FS_OPTION(sbi).write_io_size_bits = arg;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 16937ebb2a3e..6410281546f9 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -998,12 +998,16 @@ static void trans_drain(struct gfs2_trans *tr)
while (!list_empty(head)) {
bd = list_first_entry(head, struct gfs2_bufdata, bd_list);
list_del_init(&bd->bd_list);
+ if (!list_empty(&bd->bd_ail_st_list))
+ gfs2_remove_from_ail(bd);
kmem_cache_free(gfs2_bufdata_cachep, bd);
}
head = &tr->tr_databuf;
while (!list_empty(head)) {
bd = list_first_entry(head, struct gfs2_bufdata, bd_list);
list_del_init(&bd->bd_list);
+ if (!list_empty(&bd->bd_ail_st_list))
+ gfs2_remove_from_ail(bd);
kmem_cache_free(gfs2_bufdata_cachep, bd);
}
}
@@ -1032,7 +1036,7 @@ repeat:
* Do this check while holding the log_flush_lock to prevent new
* buffers from being added to the ail via gfs2_pin()
*/
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawn(sdp) || !test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
goto out;
/* Log might have been flushed while we waited for the flush lock */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index dc1b93a877c6..a82f4747aa8d 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -267,7 +267,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno,
bio_end_io_t *end_io)
{
struct super_block *sb = sdp->sd_vfs;
- struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
+ struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
bio->bi_iter.bi_sector = blkno << sdp->sd_fsb2bb_shift;
bio_set_dev(bio, sb->s_bdev);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 74c7d01723b9..aa4136055a83 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1539,9 +1539,7 @@ static int gfs2_reconfigure(struct fs_context *fc)
return -EINVAL;
if (fc->sb_flags & SB_RDONLY) {
- error = gfs2_make_fs_ro(sdp);
- if (error)
- errorfc(fc, "unable to remount read-only");
+ gfs2_make_fs_ro(sdp);
} else {
error = gfs2_make_fs_rw(sdp);
if (error)
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 861ed5fe02a5..97076d3f562f 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -587,9 +587,8 @@ out:
* Returns: errno
*/
-int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
+void gfs2_make_fs_ro(struct gfs2_sbd *sdp)
{
- int error = 0;
int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
gfs2_flush_delete_work(sdp);
@@ -624,8 +623,6 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
if (!log_write_allowed)
sdp->sd_vfs->s_flags |= SB_RDONLY;
-
- return error;
}
/**
@@ -637,7 +634,6 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
static void gfs2_put_super(struct super_block *sb)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
- int error;
struct gfs2_jdesc *jd;
/* No more recovery requests */
@@ -658,9 +654,7 @@ restart:
spin_unlock(&sdp->sd_jindex_spin);
if (!sb_rdonly(sb)) {
- error = gfs2_make_fs_ro(sdp);
- if (error)
- gfs2_io_error(sdp);
+ gfs2_make_fs_ro(sdp);
}
WARN_ON(gfs2_withdrawing(sdp));
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 08e502dec7ec..ec4affb33ed5 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -34,7 +34,7 @@ extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
struct gfs2_inode **ipp);
extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
-extern int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
+extern void gfs2_make_fs_ro(struct gfs2_sbd *sdp);
extern void gfs2_online_uevent(struct gfs2_sbd *sdp);
extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index ab96cf0bf26b..63fec11ef2ce 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -169,6 +169,8 @@ static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl,
bd->bd_bh = bh;
bd->bd_gl = gl;
INIT_LIST_HEAD(&bd->bd_list);
+ INIT_LIST_HEAD(&bd->bd_ail_st_list);
+ INIT_LIST_HEAD(&bd->bd_ail_gl_list);
bh->b_private = bd;
return bd;
}
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 8d3c670c990f..4f034b87b427 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -119,17 +119,22 @@ void gfs2_freeze_unlock(struct gfs2_holder *freeze_gh)
static void signal_our_withdraw(struct gfs2_sbd *sdp)
{
struct gfs2_glock *live_gl = sdp->sd_live_gh.gh_gl;
- struct inode *inode = sdp->sd_jdesc->jd_inode;
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_glock *i_gl = ip->i_gl;
- u64 no_formal_ino = ip->i_no_formal_ino;
+ struct inode *inode;
+ struct gfs2_inode *ip;
+ struct gfs2_glock *i_gl;
+ u64 no_formal_ino;
int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
int ret = 0;
int tries;
- if (test_bit(SDF_NORECOVERY, &sdp->sd_flags))
+ if (test_bit(SDF_NORECOVERY, &sdp->sd_flags) || !sdp->sd_jdesc)
return;
+ inode = sdp->sd_jdesc->jd_inode;
+ ip = GFS2_I(inode);
+ i_gl = ip->i_gl;
+ no_formal_ino = ip->i_no_formal_ino;
+
/* Prevent any glock dq until withdraw recovery is complete */
set_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags);
/*
@@ -156,7 +161,7 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp)
ret = 0;
}
if (!ret)
- ret = gfs2_make_fs_ro(sdp);
+ gfs2_make_fs_ro(sdp);
gfs2_freeze_unlock(&freeze_gh);
}
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 28868eb4cd09..0ae9ecadf295 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -110,7 +110,6 @@ struct io_wq {
io_wq_work_fn *do_work;
struct task_struct *manager;
- struct user_struct *user;
struct io_wq_hash *hash;
@@ -592,7 +591,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
tsk->pf_io_worker = worker;
worker->task = tsk;
set_cpus_allowed_ptr(tsk, cpumask_of_node(wqe->node));
- tsk->flags |= PF_NOFREEZE | PF_NO_SETAFFINITY;
+ tsk->flags |= PF_NO_SETAFFINITY;
raw_spin_lock_irq(&wqe->lock);
hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
@@ -710,7 +709,6 @@ static int io_wq_manager(void *data)
set_current_state(TASK_INTERRUPTIBLE);
io_wq_check_workers(wq);
schedule_timeout(HZ);
- try_to_freeze();
if (fatal_signal_pending(current))
set_bit(IO_WQ_BIT_EXIT, &wq->state);
} while (!test_bit(IO_WQ_BIT_EXIT, &wq->state));
@@ -722,9 +720,9 @@ static int io_wq_manager(void *data)
io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
rcu_read_unlock();
- /* we might not ever have created any workers */
- if (atomic_read(&wq->worker_refs))
- wait_for_completion(&wq->worker_done);
+ if (atomic_dec_and_test(&wq->worker_refs))
+ complete(&wq->worker_done);
+ wait_for_completion(&wq->worker_done);
spin_lock_irq(&wq->hash->wait.lock);
for_each_node(node)
@@ -774,7 +772,10 @@ static int io_wq_fork_manager(struct io_wq *wq)
if (wq->manager)
return 0;
- reinit_completion(&wq->worker_done);
+ WARN_ON_ONCE(test_bit(IO_WQ_BIT_EXIT, &wq->state));
+
+ init_completion(&wq->worker_done);
+ atomic_set(&wq->worker_refs, 1);
tsk = create_io_thread(io_wq_manager, wq, NUMA_NO_NODE);
if (!IS_ERR(tsk)) {
wq->manager = get_task_struct(tsk);
@@ -782,6 +783,9 @@ static int io_wq_fork_manager(struct io_wq *wq)
return 0;
}
+ if (atomic_dec_and_test(&wq->worker_refs))
+ complete(&wq->worker_done);
+
return PTR_ERR(tsk);
}
@@ -794,8 +798,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
/* Can only happen if manager creation fails after exec */
if (io_wq_fork_manager(wqe->wq) ||
test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state)) {
- work->flags |= IO_WQ_WORK_CANCEL;
- wqe->wq->do_work(work);
+ io_run_cancel(work, wqe);
return;
}
@@ -1018,13 +1021,9 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
init_completion(&wq->exited);
refcount_set(&wq->refs, 1);
- init_completion(&wq->worker_done);
- atomic_set(&wq->worker_refs, 0);
-
ret = io_wq_fork_manager(wq);
if (!ret)
return wq;
-
err:
io_wq_put_hash(data->hash);
cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 5fbf7997149e..1ac2f3248088 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -79,8 +79,8 @@ static inline void wq_list_del(struct io_wq_work_list *list,
struct io_wq_work {
struct io_wq_work_node list;
+ const struct cred *creds;
unsigned flags;
- unsigned short personality;
};
static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 92c25b5f1349..a4bce17af506 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -258,12 +258,10 @@ enum {
struct io_sq_data {
refcount_t refs;
- struct mutex lock;
+ struct rw_semaphore rw_lock;
/* ctx's that are using this sqd */
struct list_head ctx_list;
- struct list_head ctx_new_list;
- struct mutex ctx_lock;
struct task_struct *thread;
struct wait_queue_head wait;
@@ -271,10 +269,9 @@ struct io_sq_data {
unsigned sq_thread_idle;
int sq_cpu;
pid_t task_pid;
+ pid_t task_tgid;
unsigned long state;
- struct completion startup;
- struct completion parked;
struct completion exited;
};
@@ -336,7 +333,6 @@ struct io_ring_ctx {
unsigned int drain_next: 1;
unsigned int eventfd_async: 1;
unsigned int restricted: 1;
- unsigned int sqo_exec: 1;
/*
* Ring buffer of indices into array of io_uring_sqe, which is
@@ -380,6 +376,7 @@ struct io_ring_ctx {
/* Only used for accounting purposes */
struct mm_struct *mm_account;
+ const struct cred *sq_creds; /* cred used for __io_sq_thread() */
struct io_sq_data *sq_data; /* if using sq thread polling */
struct wait_queue_head sqo_sq_wait;
@@ -400,7 +397,6 @@ struct io_ring_ctx {
struct user_struct *user;
struct completion ref_comp;
- struct completion sq_thread_comp;
#if defined(CONFIG_UNIX)
struct socket *ring_sock;
@@ -408,7 +404,8 @@ struct io_ring_ctx {
struct idr io_buffer_idr;
- struct idr personality_idr;
+ struct xarray personalities;
+ u32 pers_next;
struct {
unsigned cached_cq_tail;
@@ -454,6 +451,7 @@ struct io_ring_ctx {
/* Keep this last, we don't need it for the fast path */
struct work_struct exit_work;
+ struct list_head tctx_list;
};
/*
@@ -805,6 +803,12 @@ struct io_kiocb {
struct io_wq_work work;
};
+struct io_tctx_node {
+ struct list_head ctx_node;
+ struct task_struct *task;
+ struct io_ring_ctx *ctx;
+};
+
struct io_defer_entry {
struct list_head list;
struct io_kiocb *req;
@@ -979,6 +983,8 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_UNLINKAT] = {},
};
+static bool io_disarm_next(struct io_kiocb *req);
+static void io_uring_del_task_file(unsigned long index);
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
struct files_struct *files);
@@ -1129,9 +1135,8 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
init_waitqueue_head(&ctx->cq_wait);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
init_completion(&ctx->ref_comp);
- init_completion(&ctx->sq_thread_comp);
idr_init(&ctx->io_buffer_idr);
- idr_init(&ctx->personality_idr);
+ xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->wait);
spin_lock_init(&ctx->completion_lock);
@@ -1144,6 +1149,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->rsrc_ref_list);
INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
init_llist_head(&ctx->rsrc_put_llist);
+ INIT_LIST_HEAD(&ctx->tctx_list);
INIT_LIST_HEAD(&ctx->submit_state.comp.free_list);
INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list);
return ctx;
@@ -1183,6 +1189,9 @@ static void io_prep_async_work(struct io_kiocb *req)
const struct io_op_def *def = &io_op_defs[req->opcode];
struct io_ring_ctx *ctx = req->ctx;
+ if (!req->work.creds)
+ req->work.creds = get_current_cred();
+
if (req->flags & REQ_F_FORCE_ASYNC)
req->work.flags |= IO_WQ_WORK_CONCURRENT;
@@ -1514,15 +1523,14 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
__io_cqring_fill_event(req, res, 0);
}
-static inline void io_req_complete_post(struct io_kiocb *req, long res,
- unsigned int cflags)
+static void io_req_complete_post(struct io_kiocb *req, long res,
+ unsigned int cflags)
{
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
__io_cqring_fill_event(req, res, cflags);
- io_commit_cqring(ctx);
/*
* If we're the last reference to this request, add to our locked
* free_list cache.
@@ -1530,19 +1538,26 @@ static inline void io_req_complete_post(struct io_kiocb *req, long res,
if (refcount_dec_and_test(&req->refs)) {
struct io_comp_state *cs = &ctx->submit_state.comp;
+ if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
+ if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK))
+ io_disarm_next(req);
+ if (req->link) {
+ io_req_task_queue(req->link);
+ req->link = NULL;
+ }
+ }
io_dismantle_req(req);
io_put_task(req->task, 1);
list_add(&req->compl.list, &cs->locked_free_list);
cs->locked_free_nr++;
} else
req = NULL;
+ io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
-
io_cqring_ev_posted(ctx);
- if (req) {
- io_queue_next(req);
+
+ if (req)
percpu_ref_put(&ctx->refs);
- }
}
static void io_req_complete_state(struct io_kiocb *req, long res,
@@ -1648,6 +1663,10 @@ static void io_dismantle_req(struct io_kiocb *req)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
if (req->fixed_rsrc_refs)
percpu_ref_put(req->fixed_rsrc_refs);
+ if (req->work.creds) {
+ put_cred(req->work.creds);
+ req->work.creds = NULL;
+ }
if (req->flags & REQ_F_INFLIGHT) {
struct io_ring_ctx *ctx = req->ctx;
@@ -1690,15 +1709,11 @@ static inline void io_remove_next_linked(struct io_kiocb *req)
nxt->link = NULL;
}
-static void io_kill_linked_timeout(struct io_kiocb *req)
+static bool io_kill_linked_timeout(struct io_kiocb *req)
+ __must_hold(&req->ctx->completion_lock)
{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *link;
+ struct io_kiocb *link = req->link;
bool cancelled = false;
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->completion_lock, flags);
- link = req->link;
/*
* Can happen if a linked timeout fired and link had been like
@@ -1713,50 +1728,48 @@ static void io_kill_linked_timeout(struct io_kiocb *req)
ret = hrtimer_try_to_cancel(&io->timer);
if (ret != -1) {
io_cqring_fill_event(link, -ECANCELED);
- io_commit_cqring(ctx);
+ io_put_req_deferred(link, 1);
cancelled = true;
}
}
req->flags &= ~REQ_F_LINK_TIMEOUT;
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
-
- if (cancelled) {
- io_cqring_ev_posted(ctx);
- io_put_req(link);
- }
+ return cancelled;
}
-
static void io_fail_links(struct io_kiocb *req)
+ __must_hold(&req->ctx->completion_lock)
{
- struct io_kiocb *link, *nxt;
- struct io_ring_ctx *ctx = req->ctx;
- unsigned long flags;
+ struct io_kiocb *nxt, *link = req->link;
- spin_lock_irqsave(&ctx->completion_lock, flags);
- link = req->link;
req->link = NULL;
-
while (link) {
nxt = link->link;
link->link = NULL;
trace_io_uring_fail_link(req, link);
io_cqring_fill_event(link, -ECANCELED);
-
io_put_req_deferred(link, 2);
link = nxt;
}
- io_commit_cqring(ctx);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
+}
- io_cqring_ev_posted(ctx);
+static bool io_disarm_next(struct io_kiocb *req)
+ __must_hold(&req->ctx->completion_lock)
+{
+ bool posted = false;
+
+ if (likely(req->flags & REQ_F_LINK_TIMEOUT))
+ posted = io_kill_linked_timeout(req);
+ if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
+ posted |= (req->link != NULL);
+ io_fail_links(req);
+ }
+ return posted;
}
static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
{
- if (req->flags & REQ_F_LINK_TIMEOUT)
- io_kill_linked_timeout(req);
+ struct io_kiocb *nxt;
/*
* If LINK is set, we have dependent requests in this chain. If we
@@ -1764,14 +1777,22 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
* dependencies to the next request. In case of failure, fail the rest
* of the chain.
*/
- if (likely(!(req->flags & REQ_F_FAIL_LINK))) {
- struct io_kiocb *nxt = req->link;
+ if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK)) {
+ struct io_ring_ctx *ctx = req->ctx;
+ unsigned long flags;
+ bool posted;
- req->link = NULL;
- return nxt;
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ posted = io_disarm_next(req);
+ if (posted)
+ io_commit_cqring(req->ctx);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ if (posted)
+ io_cqring_ev_posted(ctx);
}
- io_fail_links(req);
- return NULL;
+ nxt = req->link;
+ req->link = NULL;
+ return nxt;
}
static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
@@ -5559,22 +5580,30 @@ add:
return 0;
}
+struct io_cancel_data {
+ struct io_ring_ctx *ctx;
+ u64 user_data;
+};
+
static bool io_cancel_cb(struct io_wq_work *work, void *data)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ struct io_cancel_data *cd = data;
- return req->user_data == (unsigned long) data;
+ return req->ctx == cd->ctx && req->user_data == cd->user_data;
}
-static int io_async_cancel_one(struct io_uring_task *tctx, void *sqe_addr)
+static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
+ struct io_ring_ctx *ctx)
{
+ struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
enum io_wq_cancel cancel_ret;
int ret = 0;
- if (!tctx->io_wq)
+ if (!tctx || !tctx->io_wq)
return -ENOENT;
- cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, sqe_addr, false);
+ cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
switch (cancel_ret) {
case IO_WQ_CANCEL_OK:
ret = 0;
@@ -5597,8 +5626,7 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
unsigned long flags;
int ret;
- ret = io_async_cancel_one(req->task->io_uring,
- (void *) (unsigned long) sqe_addr);
+ ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
if (ret != -ENOENT) {
spin_lock_irqsave(&ctx->completion_lock, flags);
goto done;
@@ -5639,8 +5667,47 @@ static int io_async_cancel_prep(struct io_kiocb *req,
static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
+ u64 sqe_addr = req->cancel.addr;
+ struct io_tctx_node *node;
+ int ret;
+
+ /* tasks should wait for their io-wq threads, so safe w/o sync */
+ ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
+ spin_lock_irq(&ctx->completion_lock);
+ if (ret != -ENOENT)
+ goto done;
+ ret = io_timeout_cancel(ctx, sqe_addr);
+ if (ret != -ENOENT)
+ goto done;
+ ret = io_poll_cancel(ctx, sqe_addr);
+ if (ret != -ENOENT)
+ goto done;
+ spin_unlock_irq(&ctx->completion_lock);
+
+ /* slow path, try all io-wq's */
+ io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ ret = -ENOENT;
+ list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
+ struct io_uring_task *tctx = node->task->io_uring;
+
+ if (!tctx || !tctx->io_wq)
+ continue;
+ ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
+ if (ret != -ENOENT)
+ break;
+ }
+ io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
- io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
+ spin_lock_irq(&ctx->completion_lock);
+done:
+ io_cqring_fill_event(req, ret);
+ io_commit_cqring(ctx);
+ spin_unlock_irq(&ctx->completion_lock);
+ io_cqring_ev_posted(ctx);
+
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_put_req(req);
return 0;
}
@@ -5916,18 +5983,8 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
const struct cred *creds = NULL;
int ret;
- if (req->work.personality) {
- const struct cred *new_creds;
-
- if (!(issue_flags & IO_URING_F_NONBLOCK))
- mutex_lock(&ctx->uring_lock);
- new_creds = idr_find(&ctx->personality_idr, req->work.personality);
- if (!(issue_flags & IO_URING_F_NONBLOCK))
- mutex_unlock(&ctx->uring_lock);
- if (!new_creds)
- return -EINVAL;
- creds = override_creds(new_creds);
- }
+ if (req->work.creds && req->work.creds != current_cred())
+ creds = override_creds(req->work.creds);
switch (req->opcode) {
case IORING_OP_NOP:
@@ -6291,7 +6348,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
{
struct io_submit_state *state;
unsigned int sqe_flags;
- int ret = 0;
+ int personality, ret = 0;
req->opcode = READ_ONCE(sqe->opcode);
/* same numerical values with corresponding REQ_F_*, safe to copy */
@@ -6306,6 +6363,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
refcount_set(&req->refs, 2);
req->task = current;
req->result = 0;
+ req->work.list.next = NULL;
+ req->work.creds = NULL;
+ req->work.flags = 0;
/* enforce forwards compatibility on users */
if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
@@ -6323,9 +6383,13 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
!io_op_defs[req->opcode].buffer_select)
return -EOPNOTSUPP;
- req->work.list.next = NULL;
- req->work.flags = 0;
- req->work.personality = READ_ONCE(sqe->personality);
+ personality = READ_ONCE(sqe->personality);
+ if (personality) {
+ req->work.creds = xa_load(&ctx->personalities, personality);
+ if (!req->work.creds)
+ return -EINVAL;
+ get_cred(req->work.creds);
+ }
state = &ctx->submit_state;
/*
@@ -6587,7 +6651,8 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
if (!list_empty(&ctx->iopoll_list))
io_do_iopoll(ctx, &nr_events, 0);
- if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)))
+ if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
+ !(ctx->flags & IORING_SETUP_R_DISABLED))
ret = io_submit_sqes(ctx, to_submit);
mutex_unlock(&ctx->uring_lock);
}
@@ -6611,58 +6676,6 @@ static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
sqd->sq_thread_idle = sq_thread_idle;
}
-static void io_sqd_init_new(struct io_sq_data *sqd)
-{
- struct io_ring_ctx *ctx;
-
- while (!list_empty(&sqd->ctx_new_list)) {
- ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list);
- list_move_tail(&ctx->sqd_list, &sqd->ctx_list);
- complete(&ctx->sq_thread_comp);
- }
-
- io_sqd_update_thread_idle(sqd);
-}
-
-static bool io_sq_thread_should_stop(struct io_sq_data *sqd)
-{
- return test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
-}
-
-static bool io_sq_thread_should_park(struct io_sq_data *sqd)
-{
- return test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
-}
-
-static void io_sq_thread_parkme(struct io_sq_data *sqd)
-{
- for (;;) {
- /*
- * TASK_PARKED is a special state; we must serialize against
- * possible pending wakeups to avoid store-store collisions on
- * task->state.
- *
- * Such a collision might possibly result in the task state
- * changin from TASK_PARKED and us failing the
- * wait_task_inactive() in kthread_park().
- */
- set_special_state(TASK_PARKED);
- if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state))
- break;
-
- /*
- * Thread is going to call schedule(), do not preempt it,
- * or the caller of kthread_park() may spend more time in
- * wait_task_inactive().
- */
- preempt_disable();
- complete(&sqd->parked);
- schedule_preempt_disabled();
- preempt_enable();
- }
- __set_current_state(TASK_RUNNING);
-}
-
static int io_sq_thread(void *data)
{
struct io_sq_data *sqd = data;
@@ -6681,31 +6694,32 @@ static int io_sq_thread(void *data)
set_cpus_allowed_ptr(current, cpu_online_mask);
current->flags |= PF_NO_SETAFFINITY;
- wait_for_completion(&sqd->startup);
+ down_read(&sqd->rw_lock);
- while (!io_sq_thread_should_stop(sqd)) {
+ while (!test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) {
int ret;
bool cap_entries, sqt_spin, needs_sched;
- /*
- * Any changes to the sqd lists are synchronized through the
- * thread parking. This synchronizes the thread vs users,
- * the users are synchronized on the sqd->ctx_lock.
- */
- if (io_sq_thread_should_park(sqd)) {
- io_sq_thread_parkme(sqd);
- continue;
- }
- if (unlikely(!list_empty(&sqd->ctx_new_list))) {
- io_sqd_init_new(sqd);
+ if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) {
+ up_read(&sqd->rw_lock);
+ cond_resched();
+ down_read(&sqd->rw_lock);
+ io_run_task_work();
timeout = jiffies + sqd->sq_thread_idle;
+ continue;
}
if (fatal_signal_pending(current))
break;
sqt_spin = false;
cap_entries = !list_is_singular(&sqd->ctx_list);
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+ const struct cred *creds = NULL;
+
+ if (ctx->sq_creds != current_cred())
+ creds = override_creds(ctx->sq_creds);
ret = __io_sq_thread(ctx, cap_entries);
+ if (creds)
+ revert_creds(creds);
if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
sqt_spin = true;
}
@@ -6732,12 +6746,13 @@ static int io_sq_thread(void *data)
}
}
- if (needs_sched && !io_sq_thread_should_park(sqd)) {
+ if (needs_sched && !test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) {
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_set_wakeup_flag(ctx);
+ up_read(&sqd->rw_lock);
schedule();
- try_to_freeze();
+ down_read(&sqd->rw_lock);
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_clear_wakeup_flag(ctx);
}
@@ -6745,32 +6760,23 @@ static int io_sq_thread(void *data)
finish_wait(&sqd->wait, &wait);
timeout = jiffies + sqd->sq_thread_idle;
}
-
- list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
- io_uring_cancel_sqpoll(ctx);
-
- io_run_task_work();
-
+ up_read(&sqd->rw_lock);
+ down_write(&sqd->rw_lock);
/*
- * Ensure that we park properly if racing with someone trying to park
- * while we're exiting. If we fail to grab the lock, check park and
- * park if necessary. The ordering with the park bit and the lock
- * ensures that we catch this reliably.
+ * someone may have parked and added a cancellation task_work, run
+ * it first because we don't want it in io_uring_cancel_sqpoll()
*/
- if (!mutex_trylock(&sqd->lock)) {
- if (io_sq_thread_should_park(sqd))
- io_sq_thread_parkme(sqd);
- mutex_lock(&sqd->lock);
- }
+ io_run_task_work();
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+ io_uring_cancel_sqpoll(ctx);
sqd->thread = NULL;
- list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
- ctx->sqo_exec = 1;
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_set_wakeup_flag(ctx);
- }
+ up_write(&sqd->rw_lock);
+ io_run_task_work();
complete(&sqd->exited);
- mutex_unlock(&sqd->lock);
do_exit(0);
}
@@ -7069,44 +7075,37 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
}
static void io_sq_thread_unpark(struct io_sq_data *sqd)
- __releases(&sqd->lock)
+ __releases(&sqd->rw_lock)
{
- if (sqd->thread == current)
- return;
+ WARN_ON_ONCE(sqd->thread == current);
+
clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
- if (sqd->thread)
- wake_up_state(sqd->thread, TASK_PARKED);
- mutex_unlock(&sqd->lock);
+ up_write(&sqd->rw_lock);
}
static void io_sq_thread_park(struct io_sq_data *sqd)
- __acquires(&sqd->lock)
+ __acquires(&sqd->rw_lock)
{
- if (sqd->thread == current)
- return;
+ WARN_ON_ONCE(sqd->thread == current);
+
set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
- mutex_lock(&sqd->lock);
- if (sqd->thread) {
+ down_write(&sqd->rw_lock);
+ /* set again for consistency, in case concurrent parks are happening */
+ set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
+ if (sqd->thread)
wake_up_process(sqd->thread);
- wait_for_completion(&sqd->parked);
- }
}
static void io_sq_thread_stop(struct io_sq_data *sqd)
{
- if (test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state))
- return;
- mutex_lock(&sqd->lock);
- if (sqd->thread) {
- set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
- WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state));
+ WARN_ON_ONCE(sqd->thread == current);
+
+ down_write(&sqd->rw_lock);
+ set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+ if (sqd->thread)
wake_up_process(sqd->thread);
- mutex_unlock(&sqd->lock);
- wait_for_completion(&sqd->exited);
- WARN_ON_ONCE(sqd->thread);
- } else {
- mutex_unlock(&sqd->lock);
- }
+ up_write(&sqd->rw_lock);
+ wait_for_completion(&sqd->exited);
}
static void io_put_sq_data(struct io_sq_data *sqd)
@@ -7122,22 +7121,15 @@ static void io_sq_thread_finish(struct io_ring_ctx *ctx)
struct io_sq_data *sqd = ctx->sq_data;
if (sqd) {
- complete(&sqd->startup);
- if (sqd->thread) {
- wait_for_completion(&ctx->sq_thread_comp);
- io_sq_thread_park(sqd);
- }
-
- mutex_lock(&sqd->ctx_lock);
- list_del(&ctx->sqd_list);
+ io_sq_thread_park(sqd);
+ list_del_init(&ctx->sqd_list);
io_sqd_update_thread_idle(sqd);
- mutex_unlock(&sqd->ctx_lock);
-
- if (sqd->thread)
- io_sq_thread_unpark(sqd);
+ io_sq_thread_unpark(sqd);
io_put_sq_data(sqd);
ctx->sq_data = NULL;
+ if (ctx->sq_creds)
+ put_cred(ctx->sq_creds);
}
}
@@ -7161,18 +7153,32 @@ static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
fdput(f);
return ERR_PTR(-EINVAL);
}
+ if (sqd->task_tgid != current->tgid) {
+ fdput(f);
+ return ERR_PTR(-EPERM);
+ }
refcount_inc(&sqd->refs);
fdput(f);
return sqd;
}
-static struct io_sq_data *io_get_sq_data(struct io_uring_params *p)
+static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
+ bool *attached)
{
struct io_sq_data *sqd;
- if (p->flags & IORING_SETUP_ATTACH_WQ)
- return io_attach_sq_data(p);
+ *attached = false;
+ if (p->flags & IORING_SETUP_ATTACH_WQ) {
+ sqd = io_attach_sq_data(p);
+ if (!IS_ERR(sqd)) {
+ *attached = true;
+ return sqd;
+ }
+ /* fall through for EPERM case, setup new sqd/task */
+ if (PTR_ERR(sqd) != -EPERM)
+ return sqd;
+ }
sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
if (!sqd)
@@ -7180,12 +7186,8 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p)
refcount_set(&sqd->refs, 1);
INIT_LIST_HEAD(&sqd->ctx_list);
- INIT_LIST_HEAD(&sqd->ctx_new_list);
- mutex_init(&sqd->ctx_lock);
- mutex_init(&sqd->lock);
+ init_rwsem(&sqd->rw_lock);
init_waitqueue_head(&sqd->wait);
- init_completion(&sqd->startup);
- init_completion(&sqd->parked);
init_completion(&sqd->exited);
return sqd;
}
@@ -7802,7 +7804,6 @@ static int io_uring_alloc_task_context(struct task_struct *task,
init_waitqueue_head(&tctx->wait);
tctx->last = NULL;
atomic_set(&tctx->in_idle, 0);
- tctx->sqpoll = false;
task->io_uring = tctx;
spin_lock_init(&tctx->task_lock);
INIT_WQ_LIST(&tctx->task_list);
@@ -7823,26 +7824,6 @@ void __io_uring_free(struct task_struct *tsk)
tsk->io_uring = NULL;
}
-static int io_sq_thread_fork(struct io_sq_data *sqd, struct io_ring_ctx *ctx)
-{
- struct task_struct *tsk;
- int ret;
-
- clear_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
- reinit_completion(&sqd->parked);
- ctx->sqo_exec = 0;
- sqd->task_pid = current->pid;
- tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
- if (IS_ERR(tsk))
- return PTR_ERR(tsk);
- ret = io_uring_alloc_task_context(tsk, ctx);
- if (ret)
- set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
- sqd->thread = tsk;
- wake_up_new_task(tsk);
- return ret;
-}
-
static int io_sq_offload_create(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
@@ -7865,39 +7846,51 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
if (ctx->flags & IORING_SETUP_SQPOLL) {
struct task_struct *tsk;
struct io_sq_data *sqd;
+ bool attached;
ret = -EPERM;
if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
goto err;
- sqd = io_get_sq_data(p);
+ sqd = io_get_sq_data(p, &attached);
if (IS_ERR(sqd)) {
ret = PTR_ERR(sqd);
goto err;
}
+ ctx->sq_creds = get_current_cred();
ctx->sq_data = sqd;
- io_sq_thread_park(sqd);
- mutex_lock(&sqd->ctx_lock);
- list_add(&ctx->sqd_list, &sqd->ctx_new_list);
- mutex_unlock(&sqd->ctx_lock);
- io_sq_thread_unpark(sqd);
-
ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
if (!ctx->sq_thread_idle)
ctx->sq_thread_idle = HZ;
- if (sqd->thread)
+ ret = 0;
+ io_sq_thread_park(sqd);
+ /* don't attach to a dying SQPOLL thread, would be racy */
+ if (attached && !sqd->thread) {
+ ret = -ENXIO;
+ } else {
+ list_add(&ctx->sqd_list, &sqd->ctx_list);
+ io_sqd_update_thread_idle(sqd);
+ }
+ io_sq_thread_unpark(sqd);
+
+ if (ret < 0) {
+ io_put_sq_data(sqd);
+ ctx->sq_data = NULL;
+ return ret;
+ } else if (attached) {
return 0;
+ }
if (p->flags & IORING_SETUP_SQ_AFF) {
int cpu = p->sq_thread_cpu;
ret = -EINVAL;
if (cpu >= nr_cpu_ids)
- goto err;
+ goto err_sqpoll;
if (!cpu_online(cpu))
- goto err;
+ goto err_sqpoll;
sqd->sq_cpu = cpu;
} else {
@@ -7905,15 +7898,15 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
}
sqd->task_pid = current->pid;
+ sqd->task_tgid = current->tgid;
tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
if (IS_ERR(tsk)) {
ret = PTR_ERR(tsk);
- goto err;
+ goto err_sqpoll;
}
- ret = io_uring_alloc_task_context(tsk, ctx);
- if (ret)
- set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+
sqd->thread = tsk;
+ ret = io_uring_alloc_task_context(tsk, ctx);
wake_up_new_task(tsk);
if (ret)
goto err;
@@ -7927,15 +7920,9 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
err:
io_sq_thread_finish(ctx);
return ret;
-}
-
-static void io_sq_offload_start(struct io_ring_ctx *ctx)
-{
- struct io_sq_data *sqd = ctx->sq_data;
-
- ctx->flags &= ~IORING_SETUP_R_DISABLED;
- if (ctx->flags & IORING_SETUP_SQPOLL)
- complete(&sqd->startup);
+err_sqpoll:
+ complete(&ctx->sq_data->exited);
+ goto err;
}
static inline void __io_unaccount_mem(struct user_struct *user,
@@ -8418,7 +8405,6 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
mutex_unlock(&ctx->uring_lock);
io_eventfd_unregister(ctx);
io_destroy_buffers(ctx);
- idr_destroy(&ctx->personality_idr);
#if defined(CONFIG_UNIX)
if (ctx->ring_sock) {
@@ -8483,7 +8469,7 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
const struct cred *creds;
- creds = idr_remove(&ctx->personality_idr, id);
+ creds = xa_erase(&ctx->personalities, id);
if (creds) {
put_cred(creds);
return 0;
@@ -8492,14 +8478,6 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
return -EINVAL;
}
-static int io_remove_personalities(int id, void *p, void *data)
-{
- struct io_ring_ctx *ctx = data;
-
- io_unregister_personality(ctx, id);
- return 0;
-}
-
static bool io_run_ctx_fallback(struct io_ring_ctx *ctx)
{
struct callback_head *work, *next;
@@ -8522,10 +8500,34 @@ static bool io_run_ctx_fallback(struct io_ring_ctx *ctx)
return executed;
}
+struct io_tctx_exit {
+ struct callback_head task_work;
+ struct completion completion;
+ struct io_ring_ctx *ctx;
+};
+
+static void io_tctx_exit_cb(struct callback_head *cb)
+{
+ struct io_uring_task *tctx = current->io_uring;
+ struct io_tctx_exit *work;
+
+ work = container_of(cb, struct io_tctx_exit, task_work);
+ /*
+ * When @in_idle, we're in cancellation and it's racy to remove the
+ * node. It'll be removed by the end of cancellation, just ignore it.
+ */
+ if (!atomic_read(&tctx->in_idle))
+ io_uring_del_task_file((unsigned long)work->ctx);
+ complete(&work->completion);
+}
+
static void io_ring_exit_work(struct work_struct *work)
{
- struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
- exit_work);
+ struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
+ unsigned long timeout = jiffies + HZ * 60 * 5;
+ struct io_tctx_exit exit;
+ struct io_tctx_node *node;
+ int ret;
/*
* If we're doing polled IO and end up having requests being
@@ -8535,19 +8537,47 @@ static void io_ring_exit_work(struct work_struct *work)
*/
do {
io_uring_try_cancel_requests(ctx, NULL, NULL);
+
+ WARN_ON_ONCE(time_after(jiffies, timeout));
} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
+
+ mutex_lock(&ctx->uring_lock);
+ while (!list_empty(&ctx->tctx_list)) {
+ WARN_ON_ONCE(time_after(jiffies, timeout));
+
+ node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
+ ctx_node);
+ exit.ctx = ctx;
+ init_completion(&exit.completion);
+ init_task_work(&exit.task_work, io_tctx_exit_cb);
+ ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
+ if (WARN_ON_ONCE(ret))
+ continue;
+ wake_up_process(node->task);
+
+ mutex_unlock(&ctx->uring_lock);
+ wait_for_completion(&exit.completion);
+ cond_resched();
+ mutex_lock(&ctx->uring_lock);
+ }
+ mutex_unlock(&ctx->uring_lock);
+
io_ring_ctx_free(ctx);
}
static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
+ unsigned long index;
+ struct creds *creds;
+
mutex_lock(&ctx->uring_lock);
percpu_ref_kill(&ctx->refs);
/* if force is set, the ring is going away. always drop after that */
ctx->cq_overflow_flushed = 1;
if (ctx->rings)
__io_cqring_overflow_flush(ctx, true, NULL, NULL);
- idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
+ xa_for_each(&ctx->personalities, index, creds)
+ io_unregister_personality(ctx, index);
mutex_unlock(&ctx->uring_lock);
io_kill_timeouts(ctx, NULL, NULL);
@@ -8600,11 +8630,11 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
return ret;
}
-static void io_cancel_defer_files(struct io_ring_ctx *ctx,
+static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
struct task_struct *task,
struct files_struct *files)
{
- struct io_defer_entry *de = NULL;
+ struct io_defer_entry *de;
LIST_HEAD(list);
spin_lock_irq(&ctx->completion_lock);
@@ -8615,6 +8645,8 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx,
}
}
spin_unlock_irq(&ctx->completion_lock);
+ if (list_empty(&list))
+ return false;
while (!list_empty(&list)) {
de = list_first_entry(&list, struct io_defer_entry, list);
@@ -8624,6 +8656,38 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx,
io_req_complete(de->req, -ECANCELED);
kfree(de);
}
+ return true;
+}
+
+static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
+{
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+
+ return req->ctx == data;
+}
+
+static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
+{
+ struct io_tctx_node *node;
+ enum io_wq_cancel cret;
+ bool ret = false;
+
+ mutex_lock(&ctx->uring_lock);
+ list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
+ struct io_uring_task *tctx = node->task->io_uring;
+
+ /*
+ * io_wq will stay alive while we hold uring_lock, because it's
+ * killed after ctx nodes, which requires to take the lock.
+ */
+ if (!tctx || !tctx->io_wq)
+ continue;
+ cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
+ ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
+ }
+ mutex_unlock(&ctx->uring_lock);
+
+ return ret;
}
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
@@ -8631,27 +8695,34 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct files_struct *files)
{
struct io_task_cancel cancel = { .task = task, .files = files, };
- struct task_struct *tctx_task = task ?: current;
- struct io_uring_task *tctx = tctx_task->io_uring;
+ struct io_uring_task *tctx = task ? task->io_uring : NULL;
while (1) {
enum io_wq_cancel cret;
bool ret = false;
- if (tctx && tctx->io_wq) {
+ if (!task) {
+ ret |= io_uring_try_cancel_iowq(ctx);
+ } else if (tctx && tctx->io_wq) {
+ /*
+ * Cancels requests of all rings, not only @ctx, but
+ * it's fine as the task is in exit/exec.
+ */
cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
&cancel, true);
ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
}
/* SQPOLL thread does its own polling */
- if (!(ctx->flags & IORING_SETUP_SQPOLL) && !files) {
+ if ((!(ctx->flags & IORING_SETUP_SQPOLL) && !files) ||
+ (ctx->sq_data && ctx->sq_data->thread == current)) {
while (!list_empty_careful(&ctx->iopoll_list)) {
io_iopoll_try_reap_events(ctx);
ret = true;
}
}
+ ret |= io_cancel_defer_files(ctx, task, files);
ret |= io_poll_remove_all(ctx, task, files);
ret |= io_kill_timeouts(ctx, task, files);
ret |= io_run_task_work();
@@ -8691,58 +8762,21 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
io_uring_try_cancel_requests(ctx, task, files);
- if (ctx->sq_data)
- io_sq_thread_unpark(ctx->sq_data);
prepare_to_wait(&task->io_uring->wait, &wait,
TASK_UNINTERRUPTIBLE);
if (inflight == io_uring_count_inflight(ctx, task, files))
schedule();
finish_wait(&task->io_uring->wait, &wait);
- if (ctx->sq_data)
- io_sq_thread_park(ctx->sq_data);
- }
-}
-
-/*
- * We need to iteratively cancel requests, in case a request has dependent
- * hard links. These persist even for failure of cancelations, hence keep
- * looping until none are found.
- */
-static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
- struct files_struct *files)
-{
- struct task_struct *task = current;
-
- if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
- /* never started, nothing to cancel */
- if (ctx->flags & IORING_SETUP_R_DISABLED) {
- io_sq_offload_start(ctx);
- return;
- }
- io_sq_thread_park(ctx->sq_data);
- task = ctx->sq_data->thread;
- if (task)
- atomic_inc(&task->io_uring->in_idle);
}
-
- io_cancel_defer_files(ctx, task, files);
-
- io_uring_cancel_files(ctx, task, files);
- if (!files)
- io_uring_try_cancel_requests(ctx, task, NULL);
-
- if (task)
- atomic_dec(&task->io_uring->in_idle);
- if (ctx->sq_data)
- io_sq_thread_unpark(ctx->sq_data);
}
/*
* Note that this task has used io_uring. We use it for cancelation purposes.
*/
-static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
+static int io_uring_add_task_file(struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx = current->io_uring;
+ struct io_tctx_node *node;
int ret;
if (unlikely(!tctx)) {
@@ -8751,102 +8785,151 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
return ret;
tctx = current->io_uring;
}
- if (tctx->last != file) {
- void *old = xa_load(&tctx->xa, (unsigned long)file);
+ if (tctx->last != ctx) {
+ void *old = xa_load(&tctx->xa, (unsigned long)ctx);
if (!old) {
- get_file(file);
- ret = xa_err(xa_store(&tctx->xa, (unsigned long)file,
- file, GFP_KERNEL));
+ node = kmalloc(sizeof(*node), GFP_KERNEL);
+ if (!node)
+ return -ENOMEM;
+ node->ctx = ctx;
+ node->task = current;
+
+ ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
+ node, GFP_KERNEL));
if (ret) {
- fput(file);
+ kfree(node);
return ret;
}
+
+ mutex_lock(&ctx->uring_lock);
+ list_add(&node->ctx_node, &ctx->tctx_list);
+ mutex_unlock(&ctx->uring_lock);
}
- tctx->last = file;
+ tctx->last = ctx;
}
-
- /*
- * This is race safe in that the task itself is doing this, hence it
- * cannot be going through the exit/cancel paths at the same time.
- * This cannot be modified while exit/cancel is running.
- */
- if (!tctx->sqpoll && (ctx->flags & IORING_SETUP_SQPOLL))
- tctx->sqpoll = true;
-
return 0;
}
/*
* Remove this io_uring_file -> task mapping.
*/
-static void io_uring_del_task_file(struct file *file)
+static void io_uring_del_task_file(unsigned long index)
{
struct io_uring_task *tctx = current->io_uring;
+ struct io_tctx_node *node;
- if (tctx->last == file)
+ if (!tctx)
+ return;
+ node = xa_erase(&tctx->xa, index);
+ if (!node)
+ return;
+
+ WARN_ON_ONCE(current != node->task);
+ WARN_ON_ONCE(list_empty(&node->ctx_node));
+
+ mutex_lock(&node->ctx->uring_lock);
+ list_del(&node->ctx_node);
+ mutex_unlock(&node->ctx->uring_lock);
+
+ if (tctx->last == node->ctx)
tctx->last = NULL;
- file = xa_erase(&tctx->xa, (unsigned long)file);
- if (file)
- fput(file);
+ kfree(node);
}
static void io_uring_clean_tctx(struct io_uring_task *tctx)
{
- struct file *file;
+ struct io_tctx_node *node;
unsigned long index;
- xa_for_each(&tctx->xa, index, file)
- io_uring_del_task_file(file);
+ xa_for_each(&tctx->xa, index, node)
+ io_uring_del_task_file(index);
if (tctx->io_wq) {
io_wq_put_and_exit(tctx->io_wq);
tctx->io_wq = NULL;
}
}
+static s64 tctx_inflight(struct io_uring_task *tctx)
+{
+ return percpu_counter_sum(&tctx->inflight);
+}
+
+static void io_sqpoll_cancel_cb(struct callback_head *cb)
+{
+ struct io_tctx_exit *work = container_of(cb, struct io_tctx_exit, task_work);
+ struct io_ring_ctx *ctx = work->ctx;
+ struct io_sq_data *sqd = ctx->sq_data;
+
+ if (sqd->thread)
+ io_uring_cancel_sqpoll(ctx);
+ complete(&work->completion);
+}
+
+static void io_sqpoll_cancel_sync(struct io_ring_ctx *ctx)
+{
+ struct io_sq_data *sqd = ctx->sq_data;
+ struct io_tctx_exit work = { .ctx = ctx, };
+ struct task_struct *task;
+
+ io_sq_thread_park(sqd);
+ list_del_init(&ctx->sqd_list);
+ io_sqd_update_thread_idle(sqd);
+ task = sqd->thread;
+ if (task) {
+ init_completion(&work.completion);
+ init_task_work(&work.task_work, io_sqpoll_cancel_cb);
+ WARN_ON_ONCE(task_work_add(task, &work.task_work, TWA_SIGNAL));
+ wake_up_process(task);
+ }
+ io_sq_thread_unpark(sqd);
+
+ if (task)
+ wait_for_completion(&work.completion);
+}
+
void __io_uring_files_cancel(struct files_struct *files)
{
struct io_uring_task *tctx = current->io_uring;
- struct file *file;
+ struct io_tctx_node *node;
unsigned long index;
/* make sure overflow events are dropped */
atomic_inc(&tctx->in_idle);
- xa_for_each(&tctx->xa, index, file)
- io_uring_cancel_task_requests(file->private_data, files);
+ xa_for_each(&tctx->xa, index, node) {
+ struct io_ring_ctx *ctx = node->ctx;
+
+ if (ctx->sq_data) {
+ io_sqpoll_cancel_sync(ctx);
+ continue;
+ }
+ io_uring_cancel_files(ctx, current, files);
+ if (!files)
+ io_uring_try_cancel_requests(ctx, current, NULL);
+ }
atomic_dec(&tctx->in_idle);
if (files)
io_uring_clean_tctx(tctx);
}
-static s64 tctx_inflight(struct io_uring_task *tctx)
-{
- return percpu_counter_sum(&tctx->inflight);
-}
-
+/* should only be called by SQPOLL task */
static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
{
struct io_sq_data *sqd = ctx->sq_data;
- struct io_uring_task *tctx;
+ struct io_uring_task *tctx = current->io_uring;
s64 inflight;
DEFINE_WAIT(wait);
- if (!sqd)
- return;
- io_sq_thread_park(sqd);
- if (!sqd->thread || !sqd->thread->io_uring) {
- io_sq_thread_unpark(sqd);
- return;
- }
- tctx = ctx->sq_data->thread->io_uring;
+ WARN_ON_ONCE(!sqd || ctx->sq_data->thread != current);
+
atomic_inc(&tctx->in_idle);
do {
/* read completions before cancelations */
inflight = tctx_inflight(tctx);
if (!inflight)
break;
- io_uring_cancel_task_requests(ctx, NULL);
+ io_uring_try_cancel_requests(ctx, current, NULL);
prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
/*
@@ -8859,7 +8942,6 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
finish_wait(&tctx->wait, &wait);
} while (1);
atomic_dec(&tctx->in_idle);
- io_sq_thread_unpark(sqd);
}
/*
@@ -8874,15 +8956,6 @@ void __io_uring_task_cancel(void)
/* make sure overflow events are dropped */
atomic_inc(&tctx->in_idle);
-
- if (tctx->sqpoll) {
- struct file *file;
- unsigned long index;
-
- xa_for_each(&tctx->xa, index, file)
- io_uring_cancel_sqpoll(file->private_data);
- }
-
do {
/* read completions before cancelations */
inflight = tctx_inflight(tctx);
@@ -8981,7 +9054,6 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
{
- int ret = 0;
DEFINE_WAIT(wait);
do {
@@ -8995,7 +9067,7 @@ static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
} while (!signal_pending(current));
finish_wait(&ctx->sqo_sq_wait, &wait);
- return ret;
+ return 0;
}
static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
@@ -9069,13 +9141,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
if (ctx->flags & IORING_SETUP_SQPOLL) {
io_cqring_overflow_flush(ctx, false, NULL, NULL);
- if (unlikely(ctx->sqo_exec)) {
- ret = io_sq_thread_fork(ctx->sq_data, ctx);
- if (ret)
- goto out;
- ctx->sqo_exec = 0;
- }
ret = -EOWNERDEAD;
+ if (unlikely(ctx->sq_data->thread == NULL)) {
+ goto out;
+ }
if (flags & IORING_ENTER_SQ_WAKEUP)
wake_up(&ctx->sq_data->wait);
if (flags & IORING_ENTER_SQ_WAIT) {
@@ -9085,7 +9154,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
}
submitted = to_submit;
} else if (to_submit) {
- ret = io_uring_add_task_file(ctx, f.file);
+ ret = io_uring_add_task_file(ctx);
if (unlikely(ret))
goto out;
mutex_lock(&ctx->uring_lock);
@@ -9127,10 +9196,9 @@ out_fput:
}
#ifdef CONFIG_PROC_FS
-static int io_uring_show_cred(int id, void *p, void *data)
+static int io_uring_show_cred(struct seq_file *m, unsigned int id,
+ const struct cred *cred)
{
- const struct cred *cred = p;
- struct seq_file *m = data;
struct user_namespace *uns = seq_user_ns(m);
struct group_info *gi;
kernel_cap_t cap;
@@ -9198,9 +9266,13 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
(unsigned int) buf->len);
}
- if (has_lock && !idr_is_empty(&ctx->personality_idr)) {
+ if (has_lock && !xa_empty(&ctx->personalities)) {
+ unsigned long index;
+ const struct cred *cred;
+
seq_printf(m, "Personalities:\n");
- idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
+ xa_for_each(&ctx->personalities, index, cred)
+ io_uring_show_cred(m, index, cred);
}
seq_printf(m, "PollList:\n");
spin_lock_irq(&ctx->completion_lock);
@@ -9294,7 +9366,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
if (fd < 0)
return fd;
- ret = io_uring_add_task_file(ctx, file);
+ ret = io_uring_add_task_file(ctx);
if (ret) {
put_unused_fd(fd);
return ret;
@@ -9402,9 +9474,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
if (ret)
goto err;
- if (!(p->flags & IORING_SETUP_R_DISABLED))
- io_sq_offload_start(ctx);
-
memset(&p->sq_off, 0, sizeof(p->sq_off));
p->sq_off.head = offsetof(struct io_rings, sq.head);
p->sq_off.tail = offsetof(struct io_rings, sq.tail);
@@ -9532,14 +9601,16 @@ out:
static int io_register_personality(struct io_ring_ctx *ctx)
{
const struct cred *creds;
+ u32 id;
int ret;
creds = get_current_cred();
- ret = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
- USHRT_MAX, GFP_KERNEL);
- if (ret < 0)
- put_cred(creds);
+ ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
+ XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
+ if (!ret)
+ return id;
+ put_cred(creds);
return ret;
}
@@ -9621,7 +9692,9 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx)
if (ctx->restrictions.registered)
ctx->restricted = 1;
- io_sq_offload_start(ctx);
+ ctx->flags &= ~IORING_SETUP_R_DISABLED;
+ if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
+ wake_up(&ctx->sq_data->wait);
return 0;
}
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 7ffcd7ef33d4..414769a6ad11 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1221,7 +1221,7 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
struct iomap_ioend *ioend;
struct bio *bio;
- bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &iomap_ioend_bioset);
+ bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &iomap_ioend_bioset);
bio_set_dev(bio, wpc->iomap.bdev);
bio->bi_iter.bi_sector = sector;
bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
@@ -1252,7 +1252,7 @@ iomap_chain_bio(struct bio *prev)
{
struct bio *new;
- new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
+ new = bio_alloc(GFP_NOFS, BIO_MAX_VECS);
bio_copy_dev(new, prev);/* also copies over blkcg information */
new->bi_iter.bi_sector = bio_end_sector(prev);
new->bi_opf = prev->bi_opf;
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index e2c4991833b8..bdd0d89bbf0a 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -296,7 +296,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
*/
bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
- nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_PAGES);
+ nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
do {
size_t n;
if (dio->error) {
@@ -338,7 +338,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
copied += n;
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
- BIO_MAX_PAGES);
+ BIO_MAX_VECS);
iomap_dio_submit_bio(dio, iomap, bio, pos);
pos += n;
} while (nr_pages);
diff --git a/fs/mpage.c b/fs/mpage.c
index 961234d68779..334e7d09aa65 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -616,7 +616,7 @@ alloc_new:
goto out;
}
bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
- BIO_MAX_PAGES, GFP_NOFS|__GFP_HIGH);
+ BIO_MAX_VECS, GFP_NOFS|__GFP_HIGH);
if (bio == NULL)
goto confused;
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index e2a488d403a6..14a72224b657 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -127,7 +127,7 @@ config PNFS_BLOCK
config PNFS_FLEXFILE_LAYOUT
tristate
depends on NFS_V4_1 && NFS_V3
- default m
+ default NFS_V4
config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
string "NFSv4.1 Implementation ID Domain"
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 19a9f434442f..fc4f490f2d78 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -81,8 +81,9 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir
spin_lock(&dir->i_lock);
if (list_empty(&nfsi->open_files) &&
(nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
- nfsi->cache_validity |= NFS_INO_INVALID_DATA |
- NFS_INO_REVAL_FORCED;
+ nfs_set_cache_invalid(dir,
+ NFS_INO_INVALID_DATA |
+ NFS_INO_REVAL_FORCED);
list_add(&ctx->list, &nfsi->open_files);
spin_unlock(&dir->i_lock);
return ctx;
@@ -1401,6 +1402,13 @@ out_force:
goto out;
}
+static void nfs_mark_dir_for_revalidate(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ nfs_set_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE);
+ spin_unlock(&inode->i_lock);
+}
+
/*
* We judge how long we want to trust negative
* dentries by looking at the parent inode mtime.
@@ -1435,19 +1443,14 @@ nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry,
__func__, dentry);
return 1;
case 0:
- nfs_mark_for_revalidate(dir);
- if (inode && S_ISDIR(inode->i_mode)) {
- /* Purge readdir caches. */
- nfs_zap_caches(inode);
- /*
- * We can't d_drop the root of a disconnected tree:
- * its d_hash is on the s_anon list and d_drop() would hide
- * it from shrink_dcache_for_unmount(), leading to busy
- * inodes on unmount and further oopses.
- */
- if (IS_ROOT(dentry))
- return 1;
- }
+ /*
+ * We can't d_drop the root of a disconnected tree:
+ * its d_hash is on the s_anon list and d_drop() would hide
+ * it from shrink_dcache_for_unmount(), leading to busy
+ * inodes on unmount and further oopses.
+ */
+ if (inode && IS_ROOT(dentry))
+ return 1;
dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n",
__func__, dentry);
return 0;
@@ -1525,6 +1528,13 @@ out:
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
nfs4_label_free(label);
+
+ /*
+ * If the lookup failed despite the dentry change attribute being
+ * a match, then we should revalidate the directory cache.
+ */
+ if (!ret && nfs_verify_change_attribute(dir, dentry->d_time))
+ nfs_mark_dir_for_revalidate(dir);
return nfs_lookup_revalidate_done(dir, dentry, inode, ret);
}
@@ -1567,7 +1577,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
error = nfs_lookup_verify_inode(inode, flags);
if (error) {
if (error == -ESTALE)
- nfs_zap_caches(dir);
+ nfs_mark_dir_for_revalidate(dir);
goto out_bad;
}
nfs_advise_use_readdirplus(dir);
@@ -1691,10 +1701,9 @@ static void nfs_drop_nlink(struct inode *inode)
if (inode->i_nlink > 0)
drop_nlink(inode);
NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter();
- NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE
- | NFS_INO_INVALID_CTIME
- | NFS_INO_INVALID_OTHER
- | NFS_INO_REVAL_FORCED;
+ nfs_set_cache_invalid(
+ inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME |
+ NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED);
spin_unlock(&inode->i_lock);
}
@@ -1706,7 +1715,7 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
{
if (S_ISDIR(inode->i_mode))
/* drop any readdir cache as it could easily be old */
- NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
nfs_complete_unlink(dentry, inode);
@@ -2064,7 +2073,6 @@ out:
dput(parent);
return d;
out_error:
- nfs_mark_for_revalidate(dir);
d = ERR_PTR(error);
goto out;
}
@@ -2473,9 +2481,9 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
if (error == 0) {
spin_lock(&old_inode->i_lock);
NFS_I(old_inode)->attr_gencount = nfs_inc_attr_generation_counter();
- NFS_I(old_inode)->cache_validity |= NFS_INO_INVALID_CHANGE
- | NFS_INO_INVALID_CTIME
- | NFS_INO_REVAL_FORCED;
+ nfs_set_cache_invalid(old_inode, NFS_INO_INVALID_CHANGE |
+ NFS_INO_INVALID_CTIME |
+ NFS_INO_REVAL_FORCED);
spin_unlock(&old_inode->i_lock);
}
out:
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 749bbea14d99..a7fb076a5f44 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -207,7 +207,7 @@ static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi)
}
#endif
-static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
+void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
{
struct nfs_inode *nfsi = NFS_I(inode);
bool have_delegation = NFS_PROTO(inode)->have_delegation(inode, FMODE_READ);
@@ -229,6 +229,7 @@ static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
if (flags & NFS_INO_INVALID_DATA)
nfs_fscache_invalidate(inode);
}
+EXPORT_SYMBOL_GPL(nfs_set_cache_invalid);
/*
* Invalidate the local caches
@@ -1067,8 +1068,8 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
spin_lock(&inode->i_lock);
if (list_empty(&nfsi->open_files) &&
(nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
- nfsi->cache_validity |= NFS_INO_INVALID_DATA |
- NFS_INO_REVAL_FORCED;
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA |
+ NFS_INO_REVAL_FORCED);
list_add_tail_rcu(&ctx->list, &nfsi->open_files);
spin_unlock(&inode->i_lock);
}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 25fb43b69e5a..7b644d6c09e4 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -411,7 +411,8 @@ extern int nfs_write_inode(struct inode *, struct writeback_control *);
extern int nfs_drop_inode(struct inode *);
extern void nfs_clear_inode(struct inode *);
extern void nfs_evict_inode(struct inode *);
-void nfs_zap_acl_cache(struct inode *inode);
+extern void nfs_zap_acl_cache(struct inode *inode);
+extern void nfs_set_cache_invalid(struct inode *inode, unsigned long flags);
extern bool nfs_check_cache_invalid(struct inode *, unsigned long);
extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
extern int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode);
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index ca10072644ff..ed1c83738c30 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -36,6 +36,7 @@
#define NFS3_pagepad_sz (1) /* Page padding */
#define NFS3_fhandle_sz (1+16)
#define NFS3_fh_sz (NFS3_fhandle_sz) /* shorthand */
+#define NFS3_post_op_fh_sz (1+NFS3_fh_sz)
#define NFS3_sattr_sz (15)
#define NFS3_filename_sz (1+(NFS3_MAXNAMLEN>>2))
#define NFS3_path_sz (1+(NFS3_MAXPATHLEN>>2))
@@ -73,7 +74,7 @@
#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1+NFS3_pagepad_sz)
#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3+NFS3_pagepad_sz)
#define NFS3_writeres_sz (1+NFS3_wcc_data_sz+4)
-#define NFS3_createres_sz (1+NFS3_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
+#define NFS3_createres_sz (1+NFS3_post_op_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
#define NFS3_renameres_sz (1+(2 * NFS3_wcc_data_sz))
#define NFS3_linkres_sz (1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2+NFS3_pagepad_sz)
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index f3fd935620fc..094024b0aca1 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -357,13 +357,15 @@ static ssize_t _nfs42_proc_copy(struct file *src,
truncate_pagecache_range(dst_inode, pos_dst,
pos_dst + res->write_res.count);
spin_lock(&dst_inode->i_lock);
- NFS_I(dst_inode)->cache_validity |= (NFS_INO_REVAL_PAGECACHE |
- NFS_INO_REVAL_FORCED | NFS_INO_INVALID_SIZE |
- NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA);
+ nfs_set_cache_invalid(
+ dst_inode, NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED |
+ NFS_INO_INVALID_SIZE | NFS_INO_INVALID_ATTR |
+ NFS_INO_INVALID_DATA);
spin_unlock(&dst_inode->i_lock);
spin_lock(&src_inode->i_lock);
- NFS_I(src_inode)->cache_validity |= (NFS_INO_REVAL_PAGECACHE |
- NFS_INO_REVAL_FORCED | NFS_INO_INVALID_ATIME);
+ nfs_set_cache_invalid(src_inode, NFS_INO_REVAL_PAGECACHE |
+ NFS_INO_REVAL_FORCED |
+ NFS_INO_INVALID_ATIME);
spin_unlock(&src_inode->i_lock);
status = res->write_res.count;
out:
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 74bc5120013d..c65c4b41e2c1 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1169,14 +1169,14 @@ int nfs4_call_sync(struct rpc_clnt *clnt,
static void
nfs4_inc_nlink_locked(struct inode *inode)
{
- NFS_I(inode)->cache_validity |= NFS_INO_INVALID_OTHER;
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
inc_nlink(inode);
}
static void
nfs4_dec_nlink_locked(struct inode *inode)
{
- NFS_I(inode)->cache_validity |= NFS_INO_INVALID_OTHER;
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
drop_nlink(inode);
}
@@ -1187,35 +1187,31 @@ nfs4_update_changeattr_locked(struct inode *inode,
{
struct nfs_inode *nfsi = NFS_I(inode);
- nfsi->cache_validity |= NFS_INO_INVALID_CTIME
- | NFS_INO_INVALID_MTIME
- | cache_validity;
+ cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME;
if (cinfo->atomic && cinfo->before == inode_peek_iversion_raw(inode)) {
nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;
nfsi->attrtimeo_timestamp = jiffies;
} else {
if (S_ISDIR(inode->i_mode)) {
- nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+ cache_validity |= NFS_INO_INVALID_DATA;
nfs_force_lookup_revalidate(inode);
} else {
if (!NFS_PROTO(inode)->have_delegation(inode,
FMODE_READ))
- nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE;
+ cache_validity |= NFS_INO_REVAL_PAGECACHE;
}
if (cinfo->before != inode_peek_iversion_raw(inode))
- nfsi->cache_validity |= NFS_INO_INVALID_ACCESS |
- NFS_INO_INVALID_ACL |
- NFS_INO_INVALID_XATTR;
+ cache_validity |= NFS_INO_INVALID_ACCESS |
+ NFS_INO_INVALID_ACL |
+ NFS_INO_INVALID_XATTR;
}
inode_set_iversion_raw(inode, cinfo->after);
nfsi->read_cache_jiffies = timestamp;
nfsi->attr_gencount = nfs_inc_attr_generation_counter();
+ nfs_set_cache_invalid(inode, cache_validity);
nfsi->cache_validity &= ~NFS_INO_INVALID_CHANGE;
-
- if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
- nfs_fscache_invalidate(inode);
}
void
@@ -5893,6 +5889,9 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE);
int ret, i;
+ /* You can't remove system.nfs4_acl: */
+ if (buflen == 0)
+ return -EINVAL;
if (!nfs4_server_supports_acls(server))
return -EOPNOTSUPP;
if (npages > ARRAY_SIZE(pages))
@@ -5915,9 +5914,9 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
* so mark the attribute cache invalid.
*/
spin_lock(&inode->i_lock);
- NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE
- | NFS_INO_INVALID_CTIME
- | NFS_INO_REVAL_FORCED;
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE |
+ NFS_INO_INVALID_CTIME |
+ NFS_INO_REVAL_FORCED);
spin_unlock(&inode->i_lock);
nfs_access_zap_cache(inode);
nfs_zap_acl_cache(inode);
@@ -5969,7 +5968,7 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf,
return ret;
if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL))
return -ENOENT;
- return 0;
+ return label.len;
}
static int nfs4_get_security_label(struct inode *inode, void *buf,
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index b27ebdccef70..5fa11e1aca4c 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -500,9 +500,9 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
spin_lock(&inode->i_lock);
NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter();
- NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE
- | NFS_INO_INVALID_CTIME
- | NFS_INO_REVAL_FORCED;
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE |
+ NFS_INO_INVALID_CTIME |
+ NFS_INO_REVAL_FORCED);
spin_unlock(&inode->i_lock);
d_move(dentry, sdentry);
break;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 82bdcb982186..f05a90338a76 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -303,9 +303,9 @@ static void nfs_set_pageerror(struct address_space *mapping)
nfs_zap_mapping(mapping->host, mapping);
/* Force file size revalidation */
spin_lock(&inode->i_lock);
- NFS_I(inode)->cache_validity |= NFS_INO_REVAL_FORCED |
- NFS_INO_REVAL_PAGECACHE |
- NFS_INO_INVALID_SIZE;
+ nfs_set_cache_invalid(inode, NFS_INO_REVAL_FORCED |
+ NFS_INO_REVAL_PAGECACHE |
+ NFS_INO_INVALID_SIZE);
spin_unlock(&inode->i_lock);
}
@@ -1604,7 +1604,7 @@ static int nfs_writeback_done(struct rpc_task *task,
/* Deal with the suid/sgid bit corner case */
if (nfs_should_remove_suid(inode)) {
spin_lock(&inode->i_lock);
- NFS_I(inode)->cache_validity |= NFS_INO_INVALID_OTHER;
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
spin_unlock(&inode->i_lock);
}
return 0;
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 1e75417bfe6e..56872e93823d 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -399,7 +399,7 @@ static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
{
wi->bio = NULL;
wi->rest_blocks = segbuf->sb_sum.nblocks;
- wi->max_pages = BIO_MAX_PAGES;
+ wi->max_pages = BIO_MAX_VECS;
wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
wi->start = wi->end = 0;
wi->blocknr = segbuf->sb_pseg_start;
diff --git a/fs/pnode.h b/fs/pnode.h
index 26f74e092bd9..988f1aa9b02a 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -12,7 +12,7 @@
#define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED)
#define IS_MNT_SLAVE(m) ((m)->mnt_master)
-#define IS_MNT_NEW(m) (!(m)->mnt_ns)
+#define IS_MNT_NEW(m) (!(m)->mnt_ns || is_anon_ns((m)->mnt_ns))
#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED)
#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 45f44425d856..b9e87ebb1060 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -87,7 +87,7 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length,
int error, i;
struct bio *bio;
- if (page_count <= BIO_MAX_PAGES)
+ if (page_count <= BIO_MAX_VECS)
bio = bio_alloc(GFP_NOIO, page_count);
else
bio = bio_kmalloc(GFP_NOIO, page_count);
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index b6ff4a21abac..0fe76f376dee 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -684,7 +684,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
iov_iter_truncate(from, max);
- nr_pages = iov_iter_npages(from, BIO_MAX_PAGES);
+ nr_pages = iov_iter_npages(from, BIO_MAX_VECS);
if (!nr_pages)
return 0;