diff options
Diffstat (limited to 'fs')
238 files changed, 10995 insertions, 4624 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 082d227fa56b..6261719f6f2a 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -276,7 +276,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, switch (handler->flags) { case ACL_TYPE_ACCESS: if (acl) { - struct iattr iattr; + struct iattr iattr = { 0 }; struct posix_acl *old_acl = acl; retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl); diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 89bac3d2f05b..619128b55837 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -61,6 +61,8 @@ enum { Opt_cache_loose, Opt_fscache, Opt_mmap, /* Access options */ Opt_access, Opt_posixacl, + /* Lock timeout option */ + Opt_locktimeout, /* Error token */ Opt_err }; @@ -80,6 +82,7 @@ static const match_table_t tokens = { {Opt_cachetag, "cachetag=%s"}, {Opt_access, "access=%s"}, {Opt_posixacl, "posixacl"}, + {Opt_locktimeout, "locktimeout=%u"}, {Opt_err, NULL} }; @@ -187,6 +190,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) #ifdef CONFIG_9P_FSCACHE v9ses->cachetag = NULL; #endif + v9ses->session_lock_timeout = P9_LOCK_TIMEOUT; if (!opts) return 0; @@ -359,6 +363,23 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) #endif break; + case Opt_locktimeout: + r = match_int(&args[0], &option); + if (r < 0) { + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } + if (option < 1) { + p9_debug(P9_DEBUG_ERROR, + "locktimeout must be a greater than zero integer.\n"); + ret = -EINVAL; + continue; + } + v9ses->session_lock_timeout = (long)option * HZ; + break; + default: continue; } diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 982e017acadb..129e5243a6bf 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -116,6 +116,7 @@ struct v9fs_session_info { struct p9_client *clnt; /* 9p client */ struct list_head slist; /* list of sessions registered with v9fs */ struct rw_semaphore rename_sem; + long session_lock_timeout; /* retry interval for blocking locks */ }; /* cache_validity flags */ diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index e1cbdfdb7c68..0bcbcc20f769 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -65,7 +65,7 @@ static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page) if (retval == 0) return retval; - iov_iter_bvec(&to, ITER_BVEC | READ, &bvec, 1, PAGE_SIZE); + iov_iter_bvec(&to, READ, &bvec, 1, PAGE_SIZE); retval = p9_client_read(fid, page_offset(page), &to, &err); if (err) { @@ -175,7 +175,7 @@ static int v9fs_vfs_writepage_locked(struct page *page) bvec.bv_page = page; bvec.bv_offset = 0; bvec.bv_len = len; - iov_iter_bvec(&from, ITER_BVEC | WRITE, &bvec, 1, len); + iov_iter_bvec(&from, WRITE, &bvec, 1, len); /* We should have writeback_fid always set */ BUG_ON(!v9inode->writeback_fid); diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index b0405d6aac85..00745147329d 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -76,15 +76,6 @@ static inline int dt_type(struct p9_wstat *mistat) return rettype; } -static void p9stat_init(struct p9_wstat *stbuf) -{ - stbuf->name = NULL; - stbuf->uid = NULL; - stbuf->gid = NULL; - stbuf->muid = NULL; - stbuf->extension = NULL; -} - /** * v9fs_alloc_rdir_buf - Allocate buffer used for read and readdir * @filp: opened file structure @@ -114,7 +105,6 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) int err = 0; struct p9_fid *fid; int buflen; - int reclen = 0; struct p9_rdir *rdir; struct kvec kvec; @@ -133,7 +123,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) if (rdir->tail == rdir->head) { struct iov_iter to; int n; - iov_iter_kvec(&to, READ | ITER_KVEC, &kvec, 1, buflen); + iov_iter_kvec(&to, READ, &kvec, 1, buflen); n = p9_client_read(file->private_data, ctx->pos, &to, &err); if (err) @@ -145,15 +135,12 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) rdir->tail = n; } while (rdir->head < rdir->tail) { - p9stat_init(&st); err = p9stat_read(fid->clnt, rdir->buf + rdir->head, rdir->tail - rdir->head, &st); - if (err) { + if (err <= 0) { p9_debug(P9_DEBUG_VFS, "returned %d\n", err); - p9stat_free(&st); return -EIO; } - reclen = st.size+2; over = !dir_emit(ctx, st.name, strlen(st.name), v9fs_qid2ino(&st.qid), dt_type(&st)); @@ -161,8 +148,8 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) if (over) return 0; - rdir->head += reclen; - ctx->pos += reclen; + rdir->head += err; + ctx->pos += err; } } } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 5f2e48d41d72..a25efa782fcc 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -154,6 +154,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) uint8_t status = P9_LOCK_ERROR; int res = 0; unsigned char fl_type; + struct v9fs_session_info *v9ses; fid = filp->private_data; BUG_ON(fid == NULL); @@ -189,6 +190,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) if (IS_SETLKW(cmd)) flock.flags = P9_LOCK_FLAGS_BLOCK; + v9ses = v9fs_inode2v9ses(file_inode(filp)); + /* * if its a blocked request and we get P9_LOCK_BLOCKED as the status * for lock request, keep on trying @@ -202,8 +205,17 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) break; if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd)) break; - if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0) + if (schedule_timeout_interruptible(v9ses->session_lock_timeout) + != 0) break; + /* + * p9_client_lock_dotl overwrites flock.client_id with the + * server message, free and reuse the client name + */ + if (flock.client_id != fid->clnt->name) { + kfree(flock.client_id); + flock.client_id = fid->clnt->name; + } } /* map 9p status to VFS status */ @@ -216,7 +228,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) break; default: WARN_ONCE(1, "unknown lock status code: %d\n", status); - /* fallthough */ + /* fall through */ case P9_LOCK_ERROR: case P9_LOCK_GRACE: res = -ENOLCK; @@ -235,6 +247,8 @@ out_unlock: locks_lock_file_wait(filp, fl); fl->fl_type = fl_type; } + if (flock.client_id != fid->clnt->name) + kfree(flock.client_id); out: return res; } @@ -269,7 +283,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) res = p9_client_getlock_dotl(fid, &glock); if (res < 0) - return res; + goto out; /* map 9p lock type to os lock type */ switch (glock.type) { case P9_LOCK_TYPE_RDLCK: @@ -290,7 +304,9 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) fl->fl_end = glock.start + glock.length - 1; fl->fl_pid = -glock.proc_id; } - kfree(glock.client_id); +out: + if (glock.client_id != fid->clnt->name) + kfree(glock.client_id); return res; } diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 352abc39e891..ac8ff8ca4c11 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -32,7 +32,7 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, struct iov_iter to; int err; - iov_iter_kvec(&to, READ | ITER_KVEC, &kvec, 1, buffer_size); + iov_iter_kvec(&to, READ, &kvec, 1, buffer_size); attr_fid = p9_client_xattrwalk(fid, name, &attr_size); if (IS_ERR(attr_fid)) { @@ -107,7 +107,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, struct iov_iter from; int retval, err; - iov_iter_kvec(&from, WRITE | ITER_KVEC, &kvec, 1, value_len); + iov_iter_kvec(&from, WRITE, &kvec, 1, value_len); p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n", name, value_len, flags); diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index ebba3b18e5da..701aaa9b1899 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -27,3 +27,15 @@ config AFS_FSCACHE help Say Y here if you want AFS data to be cached locally on disk through the generic filesystem cache manager + +config AFS_DEBUG_CURSOR + bool "AFS server cursor debugging" + depends on AFS_FS + help + Say Y here to cause the contents of a server cursor to be dumped to + the dmesg log if the server rotation algorithm fails to successfully + contact a server. + + See <file:Documentation/filesystems/afs.txt> for more information. + + If unsure, say N. diff --git a/fs/afs/Makefile b/fs/afs/Makefile index 546874057bd3..0738e2bf5193 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -17,6 +17,7 @@ kafs-y := \ file.o \ flock.o \ fsclient.o \ + fs_probe.o \ inode.o \ main.o \ misc.o \ @@ -29,9 +30,13 @@ kafs-y := \ super.o \ netdevices.o \ vlclient.o \ + vl_list.o \ + vl_probe.o \ + vl_rotate.o \ volume.o \ write.o \ - xattr.o + xattr.o \ + yfsclient.o kafs-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_AFS_FS) := kafs.o diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c index 55a756c60746..967db336d11a 100644 --- a/fs/afs/addr_list.c +++ b/fs/afs/addr_list.c @@ -64,19 +64,25 @@ struct afs_addr_list *afs_alloc_addrlist(unsigned int nr, /* * Parse a text string consisting of delimited addresses. */ -struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, - char delim, - unsigned short service, - unsigned short port) +struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net, + const char *text, size_t len, + char delim, + unsigned short service, + unsigned short port) { + struct afs_vlserver_list *vllist; struct afs_addr_list *alist; const char *p, *end = text + len; + const char *problem; unsigned int nr = 0; + int ret = -ENOMEM; _enter("%*.*s,%c", (int)len, (int)len, text, delim); - if (!len) + if (!len) { + _leave(" = -EDESTADDRREQ [empty]"); return ERR_PTR(-EDESTADDRREQ); + } if (delim == ':' && (memchr(text, ',', len) || !memchr(text, '.', len))) delim = ','; @@ -84,18 +90,24 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, /* Count the addresses */ p = text; do { - if (!*p) - return ERR_PTR(-EINVAL); + if (!*p) { + problem = "nul"; + goto inval; + } if (*p == delim) continue; nr++; if (*p == '[') { p++; - if (p == end) - return ERR_PTR(-EINVAL); + if (p == end) { + problem = "brace1"; + goto inval; + } p = memchr(p, ']', end - p); - if (!p) - return ERR_PTR(-EINVAL); + if (!p) { + problem = "brace2"; + goto inval; + } p++; if (p >= end) break; @@ -109,10 +121,19 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, _debug("%u/%u addresses", nr, AFS_MAX_ADDRESSES); - alist = afs_alloc_addrlist(nr, service, port); - if (!alist) + vllist = afs_alloc_vlserver_list(1); + if (!vllist) return ERR_PTR(-ENOMEM); + vllist->nr_servers = 1; + vllist->servers[0].server = afs_alloc_vlserver("<dummy>", 7, AFS_VL_PORT); + if (!vllist->servers[0].server) + goto error_vl; + + alist = afs_alloc_addrlist(nr, service, AFS_VL_PORT); + if (!alist) + goto error; + /* Extract the addresses */ p = text; do { @@ -135,17 +156,21 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, break; } - if (in4_pton(p, q - p, (u8 *)&x[0], -1, &stop)) + if (in4_pton(p, q - p, (u8 *)&x[0], -1, &stop)) { family = AF_INET; - else if (in6_pton(p, q - p, (u8 *)x, -1, &stop)) + } else if (in6_pton(p, q - p, (u8 *)x, -1, &stop)) { family = AF_INET6; - else + } else { + problem = "family"; goto bad_address; + } - if (stop != q) + p = q; + if (stop != p) { + problem = "nostop"; goto bad_address; + } - p = q; if (q < end && *q == ']') p++; @@ -154,18 +179,23 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, /* Port number specification "+1234" */ xport = 0; p++; - if (p >= end || !isdigit(*p)) + if (p >= end || !isdigit(*p)) { + problem = "port"; goto bad_address; + } do { xport *= 10; xport += *p - '0'; - if (xport > 65535) + if (xport > 65535) { + problem = "pval"; goto bad_address; + } p++; } while (p < end && isdigit(*p)); } else if (*p == delim) { p++; } else { + problem = "weird"; goto bad_address; } } @@ -177,12 +207,23 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, } while (p < end); + rcu_assign_pointer(vllist->servers[0].server->addresses, alist); _leave(" = [nr %u]", alist->nr_addrs); - return alist; + return vllist; -bad_address: - kfree(alist); +inval: + _leave(" = -EINVAL [%s %zu %*.*s]", + problem, p - text, (int)len, (int)len, text); return ERR_PTR(-EINVAL); +bad_address: + _leave(" = -EINVAL [%s %zu %*.*s]", + problem, p - text, (int)len, (int)len, text); + ret = -EINVAL; +error: + afs_put_addrlist(alist); +error_vl: + afs_put_vlserverlist(net, vllist); + return ERR_PTR(ret); } /* @@ -201,30 +242,34 @@ static int afs_cmp_addr_list(const struct afs_addr_list *a1, /* * Perform a DNS query for VL servers and build a up an address list. */ -struct afs_addr_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry) +struct afs_vlserver_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry) { - struct afs_addr_list *alist; - char *vllist = NULL; + struct afs_vlserver_list *vllist; + char *result = NULL; int ret; _enter("%s", cell->name); - ret = dns_query("afsdb", cell->name, cell->name_len, - "", &vllist, _expiry); - if (ret < 0) + ret = dns_query("afsdb", cell->name, cell->name_len, "srv=1", + &result, _expiry); + if (ret < 0) { + _leave(" = %d [dns]", ret); return ERR_PTR(ret); - - alist = afs_parse_text_addrs(vllist, strlen(vllist), ',', - VL_SERVICE, AFS_VL_PORT); - if (IS_ERR(alist)) { - kfree(vllist); - if (alist != ERR_PTR(-ENOMEM)) - pr_err("Failed to parse DNS data\n"); - return alist; } - kfree(vllist); - return alist; + if (*_expiry == 0) + *_expiry = ktime_get_real_seconds() + 60; + + if (ret > 1 && result[0] == 0) + vllist = afs_extract_vlserver_list(cell, result, ret); + else + vllist = afs_parse_text_addrs(cell->net, result, ret, ',', + VL_SERVICE, AFS_VL_PORT); + kfree(result); + if (IS_ERR(vllist) && vllist != ERR_PTR(-ENOMEM)) + pr_err("Failed to parse DNS data %ld\n", PTR_ERR(vllist)); + + return vllist; } /* @@ -258,6 +303,8 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); srx = &alist->addrs[i]; + srx->srx_family = AF_RXRPC; + srx->transport_type = SOCK_DGRAM; srx->transport_len = sizeof(srx->transport.sin); srx->transport.sin.sin_family = AF_INET; srx->transport.sin.sin_port = htons(port); @@ -296,6 +343,8 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port) sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); srx = &alist->addrs[i]; + srx->srx_family = AF_RXRPC; + srx->transport_type = SOCK_DGRAM; srx->transport_len = sizeof(srx->transport.sin6); srx->transport.sin6.sin6_family = AF_INET6; srx->transport.sin6.sin6_port = htons(port); @@ -308,25 +357,33 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port) */ bool afs_iterate_addresses(struct afs_addr_cursor *ac) { - _enter("%hu+%hd", ac->start, (short)ac->index); + unsigned long set, failed; + int index; if (!ac->alist) return false; - if (ac->begun) { - ac->index++; - if (ac->index == ac->alist->nr_addrs) - ac->index = 0; + set = ac->alist->responded; + failed = ac->alist->failed; + _enter("%lx-%lx-%lx,%d", set, failed, ac->tried, ac->index); - if (ac->index == ac->start) { - ac->error = -EDESTADDRREQ; - return false; - } - } + ac->nr_iterations++; + + set &= ~(failed | ac->tried); + + if (!set) + return false; - ac->begun = true; + index = READ_ONCE(ac->alist->preferred); + if (test_bit(index, &set)) + goto selected; + + index = __ffs(set); + +selected: + ac->index = index; + set_bit(index, &ac->tried); ac->responded = false; - ac->addr = &ac->alist->addrs[ac->index]; return true; } @@ -339,53 +396,13 @@ int afs_end_cursor(struct afs_addr_cursor *ac) alist = ac->alist; if (alist) { - if (ac->responded && ac->index != ac->start) - WRITE_ONCE(alist->index, ac->index); + if (ac->responded && + ac->index != alist->preferred && + test_bit(ac->alist->preferred, &ac->tried)) + WRITE_ONCE(alist->preferred, ac->index); afs_put_addrlist(alist); + ac->alist = NULL; } - ac->addr = NULL; - ac->alist = NULL; - ac->begun = false; return ac->error; } - -/* - * Set the address cursor for iterating over VL servers. - */ -int afs_set_vl_cursor(struct afs_addr_cursor *ac, struct afs_cell *cell) -{ - struct afs_addr_list *alist; - int ret; - - if (!rcu_access_pointer(cell->vl_addrs)) { - ret = wait_on_bit(&cell->flags, AFS_CELL_FL_NO_LOOKUP_YET, - TASK_INTERRUPTIBLE); - if (ret < 0) - return ret; - - if (!rcu_access_pointer(cell->vl_addrs) && - ktime_get_real_seconds() < cell->dns_expiry) - return cell->error; - } - - read_lock(&cell->vl_addrs_lock); - alist = rcu_dereference_protected(cell->vl_addrs, - lockdep_is_held(&cell->vl_addrs_lock)); - if (alist->nr_addrs > 0) - afs_get_addrlist(alist); - else - alist = NULL; - read_unlock(&cell->vl_addrs_lock); - - if (!alist) - return -EDESTADDRREQ; - - ac->alist = alist; - ac->addr = NULL; - ac->start = READ_ONCE(alist->index); - ac->index = ac->start; - ac->error = 0; - ac->begun = false; - return 0; -} diff --git a/fs/afs/afs.h b/fs/afs/afs.h index b4ff1f7ae4ab..d12ffb457e47 100644 --- a/fs/afs/afs.h +++ b/fs/afs/afs.h @@ -23,9 +23,9 @@ #define AFSPATHMAX 1024 /* Maximum length of a pathname plus NUL */ #define AFSOPAQUEMAX 1024 /* Maximum length of an opaque field */ -typedef unsigned afs_volid_t; -typedef unsigned afs_vnodeid_t; -typedef unsigned long long afs_dataversion_t; +typedef u64 afs_volid_t; +typedef u64 afs_vnodeid_t; +typedef u64 afs_dataversion_t; typedef enum { AFSVL_RWVOL, /* read/write volume */ @@ -52,8 +52,9 @@ typedef enum { */ struct afs_fid { afs_volid_t vid; /* volume ID */ - afs_vnodeid_t vnode; /* file index within volume */ - unsigned unique; /* unique ID number (file index version) */ + afs_vnodeid_t vnode; /* Lower 64-bits of file index within volume */ + u32 vnode_hi; /* Upper 32-bits of file index */ + u32 unique; /* unique ID number (file index version) */ }; /* @@ -67,14 +68,14 @@ typedef enum { } afs_callback_type_t; struct afs_callback { + time64_t expires_at; /* Time at which expires */ unsigned version; /* Callback version */ - unsigned expiry; /* Time at which expires */ afs_callback_type_t type; /* Type of callback */ }; struct afs_callback_break { struct afs_fid fid; /* File identifier */ - struct afs_callback cb; /* Callback details */ + //struct afs_callback cb; /* Callback details */ }; #define AFSCBMAX 50 /* maximum callbacks transferred per bulk op */ @@ -129,19 +130,18 @@ typedef u32 afs_access_t; struct afs_file_status { u64 size; /* file size */ afs_dataversion_t data_version; /* current data version */ - time_t mtime_client; /* last time client changed data */ - time_t mtime_server; /* last time server changed data */ - unsigned abort_code; /* Abort if bulk-fetching this failed */ - - afs_file_type_t type; /* file type */ - unsigned nlink; /* link count */ - u32 author; /* author ID */ - u32 owner; /* owner ID */ - u32 group; /* group ID */ + struct timespec64 mtime_client; /* Last time client changed data */ + struct timespec64 mtime_server; /* Last time server changed data */ + s64 author; /* author ID */ + s64 owner; /* owner ID */ + s64 group; /* group ID */ afs_access_t caller_access; /* access rights for authenticated caller */ afs_access_t anon_access; /* access rights for unauthenticated caller */ umode_t mode; /* UNIX mode */ + afs_file_type_t type; /* file type */ + u32 nlink; /* link count */ s32 lock_count; /* file lock count (0=UNLK -1=WRLCK +ve=#RDLCK */ + u32 abort_code; /* Abort if bulk-fetching this failed */ }; /* @@ -158,25 +158,27 @@ struct afs_file_status { * AFS volume synchronisation information */ struct afs_volsync { - time_t creation; /* volume creation time */ + time64_t creation; /* volume creation time */ }; /* * AFS volume status record */ struct afs_volume_status { - u32 vid; /* volume ID */ - u32 parent_id; /* parent volume ID */ + afs_volid_t vid; /* volume ID */ + afs_volid_t parent_id; /* parent volume ID */ u8 online; /* true if volume currently online and available */ u8 in_service; /* true if volume currently in service */ u8 blessed; /* same as in_service */ u8 needs_salvage; /* true if consistency checking required */ u32 type; /* volume type (afs_voltype_t) */ - u32 min_quota; /* minimum space set aside (blocks) */ - u32 max_quota; /* maximum space this volume may occupy (blocks) */ - u32 blocks_in_use; /* space this volume currently occupies (blocks) */ - u32 part_blocks_avail; /* space available in volume's partition */ - u32 part_max_blocks; /* size of volume's partition */ + u64 min_quota; /* minimum space set aside (blocks) */ + u64 max_quota; /* maximum space this volume may occupy (blocks) */ + u64 blocks_in_use; /* space this volume currently occupies (blocks) */ + u64 part_blocks_avail; /* space available in volume's partition */ + u64 part_max_blocks; /* size of volume's partition */ + s64 vol_copy_date; + s64 vol_backup_date; }; #define AFS_BLOCK_SIZE 1024 diff --git a/fs/afs/cache.c b/fs/afs/cache.c index b1c31ec4523a..f6d0a21e8052 100644 --- a/fs/afs/cache.c +++ b/fs/afs/cache.c @@ -49,7 +49,7 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, struct afs_vnode *vnode = cookie_netfs_data; struct afs_vnode_cache_aux aux; - _enter("{%x,%x,%llx},%p,%u", + _enter("{%llx,%x,%llx},%p,%u", vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, buffer, buflen); diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 5f261fbf2182..1c7955f5cdaf 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -210,12 +210,10 @@ void afs_init_callback_state(struct afs_server *server) /* * actually break a callback */ -void afs_break_callback(struct afs_vnode *vnode) +void __afs_break_callback(struct afs_vnode *vnode) { _enter(""); - write_seqlock(&vnode->cb_lock); - clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { vnode->cb_break++; @@ -230,7 +228,12 @@ void afs_break_callback(struct afs_vnode *vnode) afs_lock_may_be_available(vnode); spin_unlock(&vnode->lock); } +} +void afs_break_callback(struct afs_vnode *vnode) +{ + write_seqlock(&vnode->cb_lock); + __afs_break_callback(vnode); write_sequnlock(&vnode->cb_lock); } @@ -310,14 +313,10 @@ void afs_break_callbacks(struct afs_server *server, size_t count, /* TODO: Sort the callback break list by volume ID */ for (; count > 0; callbacks++, count--) { - _debug("- Fid { vl=%08x n=%u u=%u } CB { v=%u x=%u t=%u }", + _debug("- Fid { vl=%08llx n=%llu u=%u }", callbacks->fid.vid, callbacks->fid.vnode, - callbacks->fid.unique, - callbacks->cb.version, - callbacks->cb.expiry, - callbacks->cb.type - ); + callbacks->fid.unique); afs_break_one_callback(server, &callbacks->fid); } diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 6127f0fcd62c..cf445dbd5f2e 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -20,6 +20,8 @@ #include "internal.h" static unsigned __read_mostly afs_cell_gc_delay = 10; +static unsigned __read_mostly afs_cell_min_ttl = 10 * 60; +static unsigned __read_mostly afs_cell_max_ttl = 24 * 60 * 60; static void afs_manage_cell(struct work_struct *); @@ -119,7 +121,7 @@ struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net, */ static struct afs_cell *afs_alloc_cell(struct afs_net *net, const char *name, unsigned int namelen, - const char *vllist) + const char *addresses) { struct afs_cell *cell; int i, ret; @@ -134,7 +136,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, if (namelen == 5 && memcmp(name, "@cell", 5) == 0) return ERR_PTR(-EINVAL); - _enter("%*.*s,%s", namelen, namelen, name, vllist); + _enter("%*.*s,%s", namelen, namelen, name, addresses); cell = kzalloc(sizeof(struct afs_cell), GFP_KERNEL); if (!cell) { @@ -153,23 +155,26 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, (1 << AFS_CELL_FL_NO_LOOKUP_YET)); INIT_LIST_HEAD(&cell->proc_volumes); rwlock_init(&cell->proc_lock); - rwlock_init(&cell->vl_addrs_lock); + rwlock_init(&cell->vl_servers_lock); /* Fill in the VL server list if we were given a list of addresses to * use. */ - if (vllist) { - struct afs_addr_list *alist; - - alist = afs_parse_text_addrs(vllist, strlen(vllist), ':', - VL_SERVICE, AFS_VL_PORT); - if (IS_ERR(alist)) { - ret = PTR_ERR(alist); + if (addresses) { + struct afs_vlserver_list *vllist; + + vllist = afs_parse_text_addrs(net, + addresses, strlen(addresses), ':', + VL_SERVICE, AFS_VL_PORT); + if (IS_ERR(vllist)) { + ret = PTR_ERR(vllist); goto parse_failed; } - rcu_assign_pointer(cell->vl_addrs, alist); + rcu_assign_pointer(cell->vl_servers, vllist); cell->dns_expiry = TIME64_MAX; + } else { + cell->dns_expiry = ktime_get_real_seconds(); } _leave(" = %p", cell); @@ -356,26 +361,40 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) */ static void afs_update_cell(struct afs_cell *cell) { - struct afs_addr_list *alist, *old; - time64_t now, expiry; + struct afs_vlserver_list *vllist, *old; + unsigned int min_ttl = READ_ONCE(afs_cell_min_ttl); + unsigned int max_ttl = READ_ONCE(afs_cell_max_ttl); + time64_t now, expiry = 0; _enter("%s", cell->name); - alist = afs_dns_query(cell, &expiry); - if (IS_ERR(alist)) { - switch (PTR_ERR(alist)) { + vllist = afs_dns_query(cell, &expiry); + + now = ktime_get_real_seconds(); + if (min_ttl > max_ttl) + max_ttl = min_ttl; + if (expiry < now + min_ttl) + expiry = now + min_ttl; + else if (expiry > now + max_ttl) + expiry = now + max_ttl; + + if (IS_ERR(vllist)) { + switch (PTR_ERR(vllist)) { case -ENODATA: - /* The DNS said that the cell does not exist */ + case -EDESTADDRREQ: + /* The DNS said that the cell does not exist or there + * weren't any addresses to be had. + */ set_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags); clear_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags); - cell->dns_expiry = ktime_get_real_seconds() + 61; + cell->dns_expiry = expiry; break; case -EAGAIN: case -ECONNREFUSED: default: set_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags); - cell->dns_expiry = ktime_get_real_seconds() + 10; + cell->dns_expiry = now + 10; break; } @@ -387,12 +406,12 @@ static void afs_update_cell(struct afs_cell *cell) /* Exclusion on changing vl_addrs is achieved by a * non-reentrant work item. */ - old = rcu_dereference_protected(cell->vl_addrs, true); - rcu_assign_pointer(cell->vl_addrs, alist); + old = rcu_dereference_protected(cell->vl_servers, true); + rcu_assign_pointer(cell->vl_servers, vllist); cell->dns_expiry = expiry; if (old) - afs_put_addrlist(old); + afs_put_vlserverlist(cell->net, old); } if (test_and_clear_bit(AFS_CELL_FL_NO_LOOKUP_YET, &cell->flags)) @@ -414,7 +433,7 @@ static void afs_cell_destroy(struct rcu_head *rcu) ASSERTCMP(atomic_read(&cell->usage), ==, 0); - afs_put_addrlist(rcu_access_pointer(cell->vl_addrs)); + afs_put_vlserverlist(cell->net, rcu_access_pointer(cell->vl_servers)); key_put(cell->anonymous_key); kfree(cell); diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 9e51d6fe7e8f..8ee5972893ed 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -16,6 +16,7 @@ #include <linux/ip.h> #include "internal.h" #include "afs_cm.h" +#include "protocol_yfs.h" static int afs_deliver_cb_init_call_back_state(struct afs_call *); static int afs_deliver_cb_init_call_back_state3(struct afs_call *); @@ -30,6 +31,8 @@ static void SRXAFSCB_Probe(struct work_struct *); static void SRXAFSCB_ProbeUuid(struct work_struct *); static void SRXAFSCB_TellMeAboutYourself(struct work_struct *); +static int afs_deliver_yfs_cb_callback(struct afs_call *); + #define CM_NAME(name) \ const char afs_SRXCB##name##_name[] __tracepoint_string = \ "CB." #name @@ -101,12 +104,25 @@ static const struct afs_call_type afs_SRXCBTellMeAboutYourself = { }; /* + * YFS CB.CallBack operation type + */ +static CM_NAME(YFS_CallBack); +static const struct afs_call_type afs_SRXYFSCB_CallBack = { + .name = afs_SRXCBYFS_CallBack_name, + .deliver = afs_deliver_yfs_cb_callback, + .destructor = afs_cm_destructor, + .work = SRXAFSCB_CallBack, +}; + +/* * route an incoming cache manager call * - return T if supported, F if not */ bool afs_cm_incoming_call(struct afs_call *call) { - _enter("{CB.OP %u}", call->operation_ID); + _enter("{%u, CB.OP %u}", call->service_id, call->operation_ID); + + call->epoch = rxrpc_kernel_get_epoch(call->net->socket, call->rxcall); switch (call->operation_ID) { case CBCallBack: @@ -127,12 +143,102 @@ bool afs_cm_incoming_call(struct afs_call *call) case CBTellMeAboutYourself: call->type = &afs_SRXCBTellMeAboutYourself; return true; + case YFSCBCallBack: + if (call->service_id != YFS_CM_SERVICE) + return false; + call->type = &afs_SRXYFSCB_CallBack; + return true; default: return false; } } /* + * Record a probe to the cache manager from a server. + */ +static int afs_record_cm_probe(struct afs_call *call, struct afs_server *server) +{ + _enter(""); + + if (test_bit(AFS_SERVER_FL_HAVE_EPOCH, &server->flags) && + !test_bit(AFS_SERVER_FL_PROBING, &server->flags)) { + if (server->cm_epoch == call->epoch) + return 0; + + if (!server->probe.said_rebooted) { + pr_notice("kAFS: FS rebooted %pU\n", &server->uuid); + server->probe.said_rebooted = true; + } + } + + spin_lock(&server->probe_lock); + + if (!test_bit(AFS_SERVER_FL_HAVE_EPOCH, &server->flags)) { + server->cm_epoch = call->epoch; + server->probe.cm_epoch = call->epoch; + goto out; + } + + if (server->probe.cm_probed && + call->epoch != server->probe.cm_epoch && + !server->probe.said_inconsistent) { + pr_notice("kAFS: FS endpoints inconsistent %pU\n", + &server->uuid); + server->probe.said_inconsistent = true; + } + + if (!server->probe.cm_probed || call->epoch == server->cm_epoch) + server->probe.cm_epoch = server->cm_epoch; + +out: + server->probe.cm_probed = true; + spin_unlock(&server->probe_lock); + return 0; +} + +/* + * Find the server record by peer address and record a probe to the cache + * manager from a server. + */ +static int afs_find_cm_server_by_peer(struct afs_call *call) +{ + struct sockaddr_rxrpc srx; + struct afs_server *server; + + rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx); + + server = afs_find_server(call->net, &srx); + if (!server) { + trace_afs_cm_no_server(call, &srx); + return 0; + } + + call->cm_server = server; + return afs_record_cm_probe(call, server); +} + +/* + * Find the server record by server UUID and record a probe to the cache + * manager from a server. + */ +static int afs_find_cm_server_by_uuid(struct afs_call *call, + struct afs_uuid *uuid) +{ + struct afs_server *server; + + rcu_read_lock(); + server = afs_find_server_by_uuid(call->net, call->request); + rcu_read_unlock(); + if (!server) { + trace_afs_cm_no_server_u(call, call->request); + return 0; + } + + call->cm_server = server; + return afs_record_cm_probe(call, server); +} + +/* * Clean up a cache manager call. */ static void afs_cm_destructor(struct afs_call *call) @@ -168,7 +274,6 @@ static void SRXAFSCB_CallBack(struct work_struct *work) static int afs_deliver_cb_callback(struct afs_call *call) { struct afs_callback_break *cb; - struct sockaddr_rxrpc srx; __be32 *bp; int ret, loop; @@ -176,32 +281,32 @@ static int afs_deliver_cb_callback(struct afs_call *call) switch (call->unmarshall) { case 0: - call->offset = 0; + afs_extract_to_tmp(call); call->unmarshall++; /* extract the FID array and its count in two steps */ case 1: _debug("extract FID count"); - ret = afs_extract_data(call, &call->tmp, 4, true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; call->count = ntohl(call->tmp); _debug("FID count: %u", call->count); if (call->count > AFSCBMAX) - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_cb_fid_count); call->buffer = kmalloc(array3_size(call->count, 3, 4), GFP_KERNEL); if (!call->buffer) return -ENOMEM; - call->offset = 0; + afs_extract_to_buf(call, call->count * 3 * 4); call->unmarshall++; case 2: _debug("extract FID array"); - ret = afs_extract_data(call, call->buffer, - call->count * 3 * 4, true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; @@ -218,59 +323,46 @@ static int afs_deliver_cb_callback(struct afs_call *call) cb->fid.vid = ntohl(*bp++); cb->fid.vnode = ntohl(*bp++); cb->fid.unique = ntohl(*bp++); - cb->cb.type = AFSCM_CB_UNTYPED; } - call->offset = 0; + afs_extract_to_tmp(call); call->unmarshall++; /* extract the callback array and its count in two steps */ case 3: _debug("extract CB count"); - ret = afs_extract_data(call, &call->tmp, 4, true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; call->count2 = ntohl(call->tmp); _debug("CB count: %u", call->count2); if (call->count2 != call->count && call->count2 != 0) - return afs_protocol_error(call, -EBADMSG); - call->offset = 0; + return afs_protocol_error(call, -EBADMSG, + afs_eproto_cb_count); + call->_iter = &call->iter; + iov_iter_discard(&call->iter, READ, call->count2 * 3 * 4); call->unmarshall++; case 4: - _debug("extract CB array"); - ret = afs_extract_data(call, call->buffer, - call->count2 * 3 * 4, false); + _debug("extract discard %zu/%u", + iov_iter_count(&call->iter), call->count2 * 3 * 4); + + ret = afs_extract_data(call, false); if (ret < 0) return ret; - _debug("unmarshall CB array"); - cb = call->request; - bp = call->buffer; - for (loop = call->count2; loop > 0; loop--, cb++) { - cb->cb.version = ntohl(*bp++); - cb->cb.expiry = ntohl(*bp++); - cb->cb.type = ntohl(*bp++); - } - - call->offset = 0; call->unmarshall++; case 5: break; } if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) - return -EIO; + return afs_io_error(call, afs_io_error_cm_reply); /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx); - call->cm_server = afs_find_server(call->net, &srx); - if (!call->cm_server) - trace_afs_cm_no_server(call, &srx); - - return afs_queue_call_work(call); + return afs_find_cm_server_by_peer(call); } /* @@ -294,24 +386,18 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work) */ static int afs_deliver_cb_init_call_back_state(struct afs_call *call) { - struct sockaddr_rxrpc srx; int ret; _enter(""); - rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx); - - ret = afs_extract_data(call, NULL, 0, false); + afs_extract_discard(call, 0); + ret = afs_extract_data(call, false); if (ret < 0) return ret; /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - call->cm_server = afs_find_server(call->net, &srx); - if (!call->cm_server) - trace_afs_cm_no_server(call, &srx); - - return afs_queue_call_work(call); + return afs_find_cm_server_by_peer(call); } /* @@ -330,16 +416,15 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) switch (call->unmarshall) { case 0: - call->offset = 0; call->buffer = kmalloc_array(11, sizeof(__be32), GFP_KERNEL); if (!call->buffer) return -ENOMEM; + afs_extract_to_buf(call, 11 * sizeof(__be32)); call->unmarshall++; case 1: _debug("extract UUID"); - ret = afs_extract_data(call, call->buffer, - 11 * sizeof(__be32), false); + ret = afs_extract_data(call, false); switch (ret) { case 0: break; case -EAGAIN: return 0; @@ -362,7 +447,6 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) for (loop = 0; loop < 6; loop++) r->node[loop] = ntohl(b[loop + 5]); - call->offset = 0; call->unmarshall++; case 2: @@ -370,17 +454,11 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) } if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) - return -EIO; + return afs_io_error(call, afs_io_error_cm_reply); /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - rcu_read_lock(); - call->cm_server = afs_find_server_by_uuid(call->net, call->request); - rcu_read_unlock(); - if (!call->cm_server) - trace_afs_cm_no_server_u(call, call->request); - - return afs_queue_call_work(call); + return afs_find_cm_server_by_uuid(call, call->request); } /* @@ -405,14 +483,14 @@ static int afs_deliver_cb_probe(struct afs_call *call) _enter(""); - ret = afs_extract_data(call, NULL, 0, false); + afs_extract_discard(call, 0); + ret = afs_extract_data(call, false); if (ret < 0) return ret; if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) - return -EIO; - - return afs_queue_call_work(call); + return afs_io_error(call, afs_io_error_cm_reply); + return afs_find_cm_server_by_peer(call); } /* @@ -453,16 +531,15 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) switch (call->unmarshall) { case 0: - call->offset = 0; call->buffer = kmalloc_array(11, sizeof(__be32), GFP_KERNEL); if (!call->buffer) return -ENOMEM; + afs_extract_to_buf(call, 11 * sizeof(__be32)); call->unmarshall++; case 1: _debug("extract UUID"); - ret = afs_extract_data(call, call->buffer, - 11 * sizeof(__be32), false); + ret = afs_extract_data(call, false); switch (ret) { case 0: break; case -EAGAIN: return 0; @@ -485,7 +562,6 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) for (loop = 0; loop < 6; loop++) r->node[loop] = ntohl(b[loop + 5]); - call->offset = 0; call->unmarshall++; case 2: @@ -493,9 +569,8 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) } if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) - return -EIO; - - return afs_queue_call_work(call); + return afs_io_error(call, afs_io_error_cm_reply); + return afs_find_cm_server_by_uuid(call, call->request); } /* @@ -570,12 +645,88 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call) _enter(""); - ret = afs_extract_data(call, NULL, 0, false); + afs_extract_discard(call, 0); + ret = afs_extract_data(call, false); if (ret < 0) return ret; if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) - return -EIO; + return afs_io_error(call, afs_io_error_cm_reply); + return afs_find_cm_server_by_peer(call); +} + +/* + * deliver request data to a YFS CB.CallBack call + */ +static int afs_deliver_yfs_cb_callback(struct afs_call *call) +{ + struct afs_callback_break *cb; + struct yfs_xdr_YFSFid *bp; + size_t size; + int ret, loop; + + _enter("{%u}", call->unmarshall); + + switch (call->unmarshall) { + case 0: + afs_extract_to_tmp(call); + call->unmarshall++; + + /* extract the FID array and its count in two steps */ + case 1: + _debug("extract FID count"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + call->count = ntohl(call->tmp); + _debug("FID count: %u", call->count); + if (call->count > YFSCBMAX) + return afs_protocol_error(call, -EBADMSG, + afs_eproto_cb_fid_count); + + size = array_size(call->count, sizeof(struct yfs_xdr_YFSFid)); + call->buffer = kmalloc(size, GFP_KERNEL); + if (!call->buffer) + return -ENOMEM; + afs_extract_to_buf(call, size); + call->unmarshall++; + + case 2: + _debug("extract FID array"); + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; + + _debug("unmarshall FID array"); + call->request = kcalloc(call->count, + sizeof(struct afs_callback_break), + GFP_KERNEL); + if (!call->request) + return -ENOMEM; + + cb = call->request; + bp = call->buffer; + for (loop = call->count; loop > 0; loop--, cb++) { + cb->fid.vid = xdr_to_u64(bp->volume); + cb->fid.vnode = xdr_to_u64(bp->vnode.lo); + cb->fid.vnode_hi = ntohl(bp->vnode.hi); + cb->fid.unique = ntohl(bp->vnode.unique); + bp++; + } + + afs_extract_to_tmp(call); + call->unmarshall++; + + case 3: + break; + } + + if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) + return afs_io_error(call, afs_io_error_cm_reply); - return afs_queue_call_work(call); + /* We'll need the file server record as that tells us which set of + * vnodes to operate upon. + */ + return afs_find_cm_server_by_peer(call); } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 855bf2b79fed..43dea3b00c29 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -138,6 +138,7 @@ static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page, ntohs(dbuf->blocks[tmp].hdr.magic)); trace_afs_dir_check_failed(dvnode, off, i_size); kunmap(page); + trace_afs_file_error(dvnode, -EIO, afs_file_error_dir_bad_magic); goto error; } @@ -190,9 +191,11 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) retry: i_size = i_size_read(&dvnode->vfs_inode); if (i_size < 2048) - return ERR_PTR(-EIO); - if (i_size > 2048 * 1024) + return ERR_PTR(afs_bad(dvnode, afs_file_error_dir_small)); + if (i_size > 2048 * 1024) { + trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big); return ERR_PTR(-EFBIG); + } _enter("%llu", i_size); @@ -315,7 +318,8 @@ content_has_grown: /* * deal with one block in an AFS directory */ -static int afs_dir_iterate_block(struct dir_context *ctx, +static int afs_dir_iterate_block(struct afs_vnode *dvnode, + struct dir_context *ctx, union afs_xdr_dir_block *block, unsigned blkoff) { @@ -365,7 +369,7 @@ static int afs_dir_iterate_block(struct dir_context *ctx, " (len %u/%zu)", blkoff / sizeof(union afs_xdr_dir_block), offset, next, tmp, nlen); - return -EIO; + return afs_bad(dvnode, afs_file_error_dir_over_end); } if (!(block->hdr.bitmap[next / 8] & (1 << (next % 8)))) { @@ -373,7 +377,7 @@ static int afs_dir_iterate_block(struct dir_context *ctx, " %u unmarked extension (len %u/%zu)", blkoff / sizeof(union afs_xdr_dir_block), offset, next, tmp, nlen); - return -EIO; + return afs_bad(dvnode, afs_file_error_dir_unmarked_ext); } _debug("ENT[%zu.%u]: ext %u/%zu", @@ -442,7 +446,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, */ page = req->pages[blkoff / PAGE_SIZE]; if (!page) { - ret = -EIO; + ret = afs_bad(dvnode, afs_file_error_dir_missing_page); break; } mark_page_accessed(page); @@ -455,7 +459,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, do { dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) / sizeof(union afs_xdr_dir_block)]; - ret = afs_dir_iterate_block(ctx, dblock, blkoff); + ret = afs_dir_iterate_block(dvnode, ctx, dblock, blkoff); if (ret != 1) { kunmap(page); goto out; @@ -548,7 +552,7 @@ static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry, } *fid = cookie.fid; - _leave(" = 0 { vn=%u u=%u }", fid->vnode, fid->unique); + _leave(" = 0 { vn=%llu u=%u }", fid->vnode, fid->unique); return 0; } @@ -826,7 +830,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, struct key *key; int ret; - _enter("{%x:%u},%p{%pd},", + _enter("{%llx:%llu},%p{%pd},", dvnode->fid.vid, dvnode->fid.vnode, dentry, dentry); ASSERTCMP(d_inode(dentry), ==, NULL); @@ -896,7 +900,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) if (d_really_is_positive(dentry)) { vnode = AFS_FS_I(d_inode(dentry)); - _enter("{v={%x:%u} n=%pd fl=%lx},", + _enter("{v={%llx:%llu} n=%pd fl=%lx},", vnode->fid.vid, vnode->fid.vnode, dentry, vnode->flags); } else { @@ -965,7 +969,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) /* if the vnode ID has changed, then the dirent points to a * different file */ if (fid.vnode != vnode->fid.vnode) { - _debug("%pd: dirent changed [%u != %u]", + _debug("%pd: dirent changed [%llu != %llu]", dentry, fid.vnode, vnode->fid.vnode); goto not_found; @@ -1085,6 +1089,7 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc, vnode = AFS_FS_I(inode); set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); + afs_vnode_commit_status(fc, vnode, 0); d_add(new_dentry, inode); } @@ -1104,7 +1109,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) mode |= S_IFDIR; - _enter("{%x:%u},{%pd},%ho", + _enter("{%llx:%llu},{%pd},%ho", dvnode->fid.vid, dvnode->fid.vnode, dentry, mode); key = afs_request_key(dvnode->volume->cell); @@ -1169,12 +1174,12 @@ static void afs_dir_remove_subdir(struct dentry *dentry) static int afs_rmdir(struct inode *dir, struct dentry *dentry) { struct afs_fs_cursor fc; - struct afs_vnode *dvnode = AFS_FS_I(dir); + struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode = NULL; struct key *key; u64 data_version = dvnode->status.data_version; int ret; - _enter("{%x:%u},{%pd}", + _enter("{%llx:%llu},{%pd}", dvnode->fid.vid, dvnode->fid.vnode, dentry); key = afs_request_key(dvnode->volume->cell); @@ -1183,11 +1188,19 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) goto error; } + /* Try to make sure we have a callback promise on the victim. */ + if (d_really_is_positive(dentry)) { + vnode = AFS_FS_I(d_inode(dentry)); + ret = afs_validate(vnode, key); + if (ret < 0) + goto error_key; + } + ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { fc.cb_break = afs_calc_vnode_cb_break(dvnode); - afs_fs_remove(&fc, dentry->d_name.name, true, + afs_fs_remove(&fc, vnode, dentry->d_name.name, true, data_version); } @@ -1201,6 +1214,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) } } +error_key: key_put(key); error: return ret; @@ -1231,7 +1245,9 @@ static int afs_dir_remove_link(struct dentry *dentry, struct key *key, if (d_really_is_positive(dentry)) { struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); - if (dir_valid) { + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + /* Already done */ + } else if (dir_valid) { drop_nlink(&vnode->vfs_inode); if (vnode->vfs_inode.i_nlink == 0) { set_bit(AFS_VNODE_DELETED, &vnode->flags); @@ -1260,13 +1276,13 @@ static int afs_dir_remove_link(struct dentry *dentry, struct key *key, static int afs_unlink(struct inode *dir, struct dentry *dentry) { struct afs_fs_cursor fc; - struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode; + struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode = NULL; struct key *key; unsigned long d_version = (unsigned long)dentry->d_fsdata; u64 data_version = dvnode->status.data_version; int ret; - _enter("{%x:%u},{%pd}", + _enter("{%llx:%llu},{%pd}", dvnode->fid.vid, dvnode->fid.vnode, dentry); if (dentry->d_name.len >= AFSNAMEMAX) @@ -1290,7 +1306,18 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { fc.cb_break = afs_calc_vnode_cb_break(dvnode); - afs_fs_remove(&fc, dentry->d_name.name, false, + + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc.cbi->server->flags) && + !test_bit(AFS_SERVER_FL_NO_RM2, &fc.cbi->server->flags)) { + yfs_fs_remove_file2(&fc, vnode, dentry->d_name.name, + data_version); + if (fc.ac.error != -ECONNABORTED || + fc.ac.abort_code != RXGEN_OPCODE) + continue; + set_bit(AFS_SERVER_FL_NO_RM2, &fc.cbi->server->flags); + } + + afs_fs_remove(&fc, vnode, dentry->d_name.name, false, data_version); } @@ -1330,7 +1357,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, mode |= S_IFREG; - _enter("{%x:%u},{%pd},%ho,", + _enter("{%llx:%llu},{%pd},%ho,", dvnode->fid.vid, dvnode->fid.vnode, dentry, mode); ret = -ENAMETOOLONG; @@ -1393,7 +1420,7 @@ static int afs_link(struct dentry *from, struct inode *dir, dvnode = AFS_FS_I(dir); data_version = dvnode->status.data_version; - _enter("{%x:%u},{%x:%u},{%pd}", + _enter("{%llx:%llu},{%llx:%llu},{%pd}", vnode->fid.vid, vnode->fid.vnode, dvnode->fid.vid, dvnode->fid.vnode, dentry); @@ -1464,7 +1491,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, u64 data_version = dvnode->status.data_version; int ret; - _enter("{%x:%u},{%pd},%s", + _enter("{%llx:%llu},{%pd},%s", dvnode->fid.vid, dvnode->fid.vnode, dentry, content); @@ -1540,7 +1567,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, orig_data_version = orig_dvnode->status.data_version; new_data_version = new_dvnode->status.data_version; - _enter("{%x:%u},{%x:%u},{%x:%u},{%pd}", + _enter("{%llx:%llu},{%llx:%llu},{%llx:%llu},{%pd}", orig_dvnode->fid.vid, orig_dvnode->fid.vnode, vnode->fid.vid, vnode->fid.vnode, new_dvnode->fid.vid, new_dvnode->fid.vnode, @@ -1607,7 +1634,7 @@ static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags) { struct afs_vnode *dvnode = AFS_FS_I(page->mapping->host); - _enter("{{%x:%u}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, page->index); + _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, page->index); set_page_private(page, 0); ClearPagePrivate(page); diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index f29c6dade7f6..a9ba81ddf154 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -46,7 +46,7 @@ static int afs_probe_cell_name(struct dentry *dentry) return 0; } - ret = dns_query("afsdb", name, len, "", NULL, NULL); + ret = dns_query("afsdb", name, len, "srv=1", NULL, NULL); if (ret == -ENODATA) ret = -EDESTADDRREQ; return ret; @@ -62,7 +62,7 @@ struct inode *afs_try_auto_mntpt(struct dentry *dentry, struct inode *dir) struct inode *inode; int ret = -ENOENT; - _enter("%p{%pd}, {%x:%u}", + _enter("%p{%pd}, {%llx:%llu}", dentry, dentry, vnode->fid.vid, vnode->fid.vnode); if (!test_bit(AFS_VNODE_AUTOCELL, &vnode->flags)) diff --git a/fs/afs/file.c b/fs/afs/file.c index 7d4f26198573..d6bc3f5d784b 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -121,7 +121,7 @@ int afs_open(struct inode *inode, struct file *file) struct key *key; int ret; - _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode); + _enter("{%llx:%llu},", vnode->fid.vid, vnode->fid.vnode); key = afs_request_key(vnode->volume->cell); if (IS_ERR(key)) { @@ -170,7 +170,7 @@ int afs_release(struct inode *inode, struct file *file) struct afs_vnode *vnode = AFS_FS_I(inode); struct afs_file *af = file->private_data; - _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode); + _enter("{%llx:%llu},", vnode->fid.vid, vnode->fid.vnode); if ((file->f_mode & FMODE_WRITE)) return vfs_fsync(file, 0); @@ -228,7 +228,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de struct afs_fs_cursor fc; int ret; - _enter("%s{%x:%u.%u},%x,,,", + _enter("%s{%llx:%llu.%u},%x,,,", vnode->volume->name, vnode->fid.vid, vnode->fid.vnode, @@ -634,7 +634,7 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags) struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); unsigned long priv; - _enter("{{%x:%u}[%lu],%lx},%x", + _enter("{{%llx:%llu}[%lu],%lx},%x", vnode->fid.vid, vnode->fid.vnode, page->index, page->flags, gfp_flags); diff --git a/fs/afs/flock.c b/fs/afs/flock.c index dc62d15a964b..0568fd986821 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -29,7 +29,7 @@ static const struct file_lock_operations afs_lock_ops = { */ void afs_lock_may_be_available(struct afs_vnode *vnode) { - _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); queue_delayed_work(afs_lock_manager, &vnode->lock_work, 0); } @@ -76,7 +76,7 @@ static int afs_set_lock(struct afs_vnode *vnode, struct key *key, struct afs_fs_cursor fc; int ret; - _enter("%s{%x:%u.%u},%x,%u", + _enter("%s{%llx:%llu.%u},%x,%u", vnode->volume->name, vnode->fid.vid, vnode->fid.vnode, @@ -107,7 +107,7 @@ static int afs_extend_lock(struct afs_vnode *vnode, struct key *key) struct afs_fs_cursor fc; int ret; - _enter("%s{%x:%u.%u},%x", + _enter("%s{%llx:%llu.%u},%x", vnode->volume->name, vnode->fid.vid, vnode->fid.vnode, @@ -138,7 +138,7 @@ static int afs_release_lock(struct afs_vnode *vnode, struct key *key) struct afs_fs_cursor fc; int ret; - _enter("%s{%x:%u.%u},%x", + _enter("%s{%llx:%llu.%u},%x", vnode->volume->name, vnode->fid.vid, vnode->fid.vnode, @@ -175,7 +175,7 @@ void afs_lock_work(struct work_struct *work) struct key *key; int ret; - _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); spin_lock(&vnode->lock); @@ -192,7 +192,7 @@ again: ret = afs_release_lock(vnode, vnode->lock_key); if (ret < 0) printk(KERN_WARNING "AFS:" - " Failed to release lock on {%x:%x} error %d\n", + " Failed to release lock on {%llx:%llx} error %d\n", vnode->fid.vid, vnode->fid.vnode, ret); spin_lock(&vnode->lock); @@ -229,7 +229,7 @@ again: key_put(key); if (ret < 0) - pr_warning("AFS: Failed to extend lock on {%x:%x} error %d\n", + pr_warning("AFS: Failed to extend lock on {%llx:%llx} error %d\n", vnode->fid.vid, vnode->fid.vnode, ret); spin_lock(&vnode->lock); @@ -430,7 +430,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl) struct key *key = afs_file_key(file); int ret; - _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); + _enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); /* only whole-file locks are supported */ if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX) @@ -582,7 +582,7 @@ static int afs_do_unlk(struct file *file, struct file_lock *fl) struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); int ret; - _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); + _enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); /* Flush all pending writes before doing anything with locks. */ vfs_fsync(file, 0); @@ -639,7 +639,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl) { struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); - _enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}", + _enter("{%llx:%llu},%d,{t=%x,fl=%x,r=%Ld:%Ld}", vnode->fid.vid, vnode->fid.vnode, cmd, fl->fl_type, fl->fl_flags, (long long) fl->fl_start, (long long) fl->fl_end); @@ -662,7 +662,7 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl) { struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); - _enter("{%x:%u},%d,{t=%x,fl=%x}", + _enter("{%llx:%llu},%d,{t=%x,fl=%x}", vnode->fid.vid, vnode->fid.vnode, cmd, fl->fl_type, fl->fl_flags); diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c new file mode 100644 index 000000000000..d049cb459742 --- /dev/null +++ b/fs/afs/fs_probe.c @@ -0,0 +1,270 @@ +/* AFS fileserver probing + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include "afs_fs.h" +#include "internal.h" +#include "protocol_yfs.h" + +static bool afs_fs_probe_done(struct afs_server *server) +{ + if (!atomic_dec_and_test(&server->probe_outstanding)) + return false; + + wake_up_var(&server->probe_outstanding); + clear_bit_unlock(AFS_SERVER_FL_PROBING, &server->flags); + wake_up_bit(&server->flags, AFS_SERVER_FL_PROBING); + return true; +} + +/* + * Process the result of probing a fileserver. This is called after successful + * or failed delivery of an FS.GetCapabilities operation. + */ +void afs_fileserver_probe_result(struct afs_call *call) +{ + struct afs_addr_list *alist = call->alist; + struct afs_server *server = call->reply[0]; + unsigned int server_index = (long)call->reply[1]; + unsigned int index = call->addr_ix; + unsigned int rtt = UINT_MAX; + bool have_result = false; + u64 _rtt; + int ret = call->error; + + _enter("%pU,%u", &server->uuid, index); + + spin_lock(&server->probe_lock); + + switch (ret) { + case 0: + server->probe.error = 0; + goto responded; + case -ECONNABORTED: + if (!server->probe.responded) { + server->probe.abort_code = call->abort_code; + server->probe.error = ret; + } + goto responded; + case -ENOMEM: + case -ENONET: + server->probe.local_failure = true; + afs_io_error(call, afs_io_error_fs_probe_fail); + goto out; + case -ECONNRESET: /* Responded, but call expired. */ + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + case -ETIMEDOUT: + case -ETIME: + default: + clear_bit(index, &alist->responded); + set_bit(index, &alist->failed); + if (!server->probe.responded && + (server->probe.error == 0 || + server->probe.error == -ETIMEDOUT || + server->probe.error == -ETIME)) + server->probe.error = ret; + afs_io_error(call, afs_io_error_fs_probe_fail); + goto out; + } + +responded: + set_bit(index, &alist->responded); + clear_bit(index, &alist->failed); + + if (call->service_id == YFS_FS_SERVICE) { + server->probe.is_yfs = true; + set_bit(AFS_SERVER_FL_IS_YFS, &server->flags); + alist->addrs[index].srx_service = call->service_id; + } else { + server->probe.not_yfs = true; + if (!server->probe.is_yfs) { + clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags); + alist->addrs[index].srx_service = call->service_id; + } + } + + /* Get the RTT and scale it to fit into a 32-bit value that represents + * over a minute of time so that we can access it with one instruction + * on a 32-bit system. + */ + _rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall); + _rtt /= 64; + rtt = (_rtt > UINT_MAX) ? UINT_MAX : _rtt; + if (rtt < server->probe.rtt) { + server->probe.rtt = rtt; + alist->preferred = index; + have_result = true; + } + + smp_wmb(); /* Set rtt before responded. */ + server->probe.responded = true; + set_bit(AFS_SERVER_FL_PROBED, &server->flags); +out: + spin_unlock(&server->probe_lock); + + _debug("probe [%u][%u] %pISpc rtt=%u ret=%d", + server_index, index, &alist->addrs[index].transport, + (unsigned int)rtt, ret); + + have_result |= afs_fs_probe_done(server); + if (have_result) { + server->probe.have_result = true; + wake_up_var(&server->probe.have_result); + wake_up_all(&server->probe_wq); + } +} + +/* + * Probe all of a fileserver's addresses to find out the best route and to + * query its capabilities. + */ +static int afs_do_probe_fileserver(struct afs_net *net, + struct afs_server *server, + struct key *key, + unsigned int server_index) +{ + struct afs_addr_cursor ac = { + .index = 0, + }; + int ret; + + _enter("%pU", &server->uuid); + + read_lock(&server->fs_lock); + ac.alist = rcu_dereference_protected(server->addresses, + lockdep_is_held(&server->fs_lock)); + read_unlock(&server->fs_lock); + + atomic_set(&server->probe_outstanding, ac.alist->nr_addrs); + memset(&server->probe, 0, sizeof(server->probe)); + server->probe.rtt = UINT_MAX; + + for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) { + ret = afs_fs_get_capabilities(net, server, &ac, key, server_index, + true); + if (ret != -EINPROGRESS) { + afs_fs_probe_done(server); + return ret; + } + } + + return 0; +} + +/* + * Send off probes to all unprobed servers. + */ +int afs_probe_fileservers(struct afs_net *net, struct key *key, + struct afs_server_list *list) +{ + struct afs_server *server; + int i, ret; + + for (i = 0; i < list->nr_servers; i++) { + server = list->servers[i].server; + if (test_bit(AFS_SERVER_FL_PROBED, &server->flags)) + continue; + + if (!test_and_set_bit_lock(AFS_SERVER_FL_PROBING, &server->flags)) { + ret = afs_do_probe_fileserver(net, server, key, i); + if (ret) + return ret; + } + } + + return 0; +} + +/* + * Wait for the first as-yet untried fileserver to respond. + */ +int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried) +{ + struct wait_queue_entry *waits; + struct afs_server *server; + unsigned int rtt = UINT_MAX; + bool have_responders = false; + int pref = -1, i; + + _enter("%u,%lx", slist->nr_servers, untried); + + /* Only wait for servers that have a probe outstanding. */ + for (i = 0; i < slist->nr_servers; i++) { + if (test_bit(i, &untried)) { + server = slist->servers[i].server; + if (!test_bit(AFS_SERVER_FL_PROBING, &server->flags)) + __clear_bit(i, &untried); + if (server->probe.responded) + have_responders = true; + } + } + if (have_responders || !untried) + return 0; + + waits = kmalloc(array_size(slist->nr_servers, sizeof(*waits)), GFP_KERNEL); + if (!waits) + return -ENOMEM; + + for (i = 0; i < slist->nr_servers; i++) { + if (test_bit(i, &untried)) { + server = slist->servers[i].server; + init_waitqueue_entry(&waits[i], current); + add_wait_queue(&server->probe_wq, &waits[i]); + } + } + + for (;;) { + bool still_probing = false; + + set_current_state(TASK_INTERRUPTIBLE); + for (i = 0; i < slist->nr_servers; i++) { + if (test_bit(i, &untried)) { + server = slist->servers[i].server; + if (server->probe.responded) + goto stop; + if (test_bit(AFS_SERVER_FL_PROBING, &server->flags)) + still_probing = true; + } + } + + if (!still_probing || unlikely(signal_pending(current))) + goto stop; + schedule(); + } + +stop: + set_current_state(TASK_RUNNING); + + for (i = 0; i < slist->nr_servers; i++) { + if (test_bit(i, &untried)) { + server = slist->servers[i].server; + if (server->probe.responded && + server->probe.rtt < rtt) { + pref = i; + rtt = server->probe.rtt; + } + + remove_wait_queue(&server->probe_wq, &waits[i]); + } + } + + kfree(waits); + + if (pref == -1 && signal_pending(current)) + return -ERESTARTSYS; + + if (pref >= 0) + slist->preferred = pref; + return 0; +} diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 50929cb91732..ca08c83168f5 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -17,15 +17,10 @@ #include "internal.h" #include "afs_fs.h" #include "xdr_fs.h" +#include "protocol_yfs.h" static const struct afs_fid afs_zero_fid; -/* - * We need somewhere to discard into in case the server helpfully returns more - * than we asked for in FS.FetchData{,64}. - */ -static u8 afs_discard_buffer[64]; - static inline void afs_use_fs_server(struct afs_call *call, struct afs_cb_interest *cbi) { call->cbi = afs_get_cb_interest(cbi); @@ -75,8 +70,7 @@ void afs_update_inode_from_status(struct afs_vnode *vnode, struct timespec64 t; umode_t mode; - t.tv_sec = status->mtime_client; - t.tv_nsec = 0; + t = status->mtime_client; vnode->vfs_inode.i_ctime = t; vnode->vfs_inode.i_mtime = t; vnode->vfs_inode.i_atime = t; @@ -96,7 +90,7 @@ void afs_update_inode_from_status(struct afs_vnode *vnode, if (!(flags & AFS_VNODE_NOT_YET_SET)) { if (expected_version && *expected_version != status->data_version) { - _debug("vnode modified %llx on {%x:%u} [exp %llx]", + _debug("vnode modified %llx on {%llx:%llu} [exp %llx]", (unsigned long long) status->data_version, vnode->fid.vid, vnode->fid.vnode, (unsigned long long) *expected_version); @@ -170,7 +164,7 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call, if (type != status->type && vnode && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) { - pr_warning("Vnode %x:%x:%x changed type %u to %u\n", + pr_warning("Vnode %llx:%llx:%x changed type %u to %u\n", vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, @@ -200,8 +194,10 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call, EXTRACT_M(mode); EXTRACT_M(group); - status->mtime_client = ntohl(xdr->mtime_client); - status->mtime_server = ntohl(xdr->mtime_server); + status->mtime_client.tv_sec = ntohl(xdr->mtime_client); + status->mtime_client.tv_nsec = 0; + status->mtime_server.tv_sec = ntohl(xdr->mtime_server); + status->mtime_server.tv_nsec = 0; status->lock_count = ntohl(xdr->lock_count); size = (u64)ntohl(xdr->size_lo); @@ -233,7 +229,7 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call, bad: xdr_dump_bad(*_bp); - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status); } /* @@ -273,7 +269,7 @@ static void xdr_decode_AFSCallBack(struct afs_call *call, write_seqlock(&vnode->cb_lock); - if (call->cb_break == afs_cb_break_sum(vnode, cbi)) { + if (!afs_cb_is_broken(call->cb_break, vnode, cbi)) { vnode->cb_version = ntohl(*bp++); cb_expiry = ntohl(*bp++); vnode->cb_type = ntohl(*bp++); @@ -293,13 +289,19 @@ static void xdr_decode_AFSCallBack(struct afs_call *call, *_bp = bp; } -static void xdr_decode_AFSCallBack_raw(const __be32 **_bp, +static ktime_t xdr_decode_expiry(struct afs_call *call, u32 expiry) +{ + return ktime_add_ns(call->reply_time, expiry * NSEC_PER_SEC); +} + +static void xdr_decode_AFSCallBack_raw(struct afs_call *call, + const __be32 **_bp, struct afs_callback *cb) { const __be32 *bp = *_bp; cb->version = ntohl(*bp++); - cb->expiry = ntohl(*bp++); + cb->expires_at = xdr_decode_expiry(call, ntohl(*bp++)); cb->type = ntohl(*bp++); *_bp = bp; } @@ -311,14 +313,18 @@ static void xdr_decode_AFSVolSync(const __be32 **_bp, struct afs_volsync *volsync) { const __be32 *bp = *_bp; + u32 creation; - volsync->creation = ntohl(*bp++); + creation = ntohl(*bp++); bp++; /* spare2 */ bp++; /* spare3 */ bp++; /* spare4 */ bp++; /* spare5 */ bp++; /* spare6 */ *_bp = bp; + + if (volsync) + volsync->creation = creation; } /* @@ -379,6 +385,8 @@ static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp, vs->blocks_in_use = ntohl(*bp++); vs->part_blocks_avail = ntohl(*bp++); vs->part_max_blocks = ntohl(*bp++); + vs->vol_copy_date = 0; + vs->vol_backup_date = 0; *_bp = bp; } @@ -395,16 +403,16 @@ static int afs_deliver_fs_fetch_status_vnode(struct afs_call *call) if (ret < 0) return ret; - _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); /* unmarshall the reply once we've received all of it */ bp = call->buffer; - if (afs_decode_status(call, &bp, &vnode->status, vnode, - &call->expected_version, NULL) < 0) - return afs_protocol_error(call, -EBADMSG); + ret = afs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; xdr_decode_AFSCallBack(call, vnode, &bp); - if (call->reply[1]) - xdr_decode_AFSVolSync(&bp, call->reply[1]); + xdr_decode_AFSVolSync(&bp, call->reply[1]); _leave(" = 0 [done]"); return 0; @@ -431,7 +439,10 @@ int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsy struct afs_net *net = afs_v2net(vnode); __be32 *bp; - _enter(",%x,{%x:%u},,", + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_fetch_file_status(fc, volsync, new_inode); + + _enter(",%x,{%llx:%llu},,", key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus_vnode, @@ -445,6 +456,7 @@ int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsy call->reply[0] = vnode; call->reply[1] = volsync; call->expected_version = new_inode ? 1 : vnode->status.data_version; + call->want_reply_time = true; /* marshall the parameters */ bp = call->request; @@ -468,139 +480,117 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) struct afs_read *req = call->reply[2]; const __be32 *bp; unsigned int size; - void *buffer; int ret; - _enter("{%u,%zu/%u;%llu/%llu}", - call->unmarshall, call->offset, call->count, - req->remain, req->actual_len); + _enter("{%u,%zu/%llu}", + call->unmarshall, iov_iter_count(&call->iter), req->actual_len); switch (call->unmarshall) { case 0: req->actual_len = 0; - call->offset = 0; + req->index = 0; + req->offset = req->pos & (PAGE_SIZE - 1); call->unmarshall++; - if (call->operation_ID != FSFETCHDATA64) { - call->unmarshall++; - goto no_msw; + if (call->operation_ID == FSFETCHDATA64) { + afs_extract_to_tmp64(call); + } else { + call->tmp_u = htonl(0); + afs_extract_to_tmp(call); } - /* extract the upper part of the returned data length of an - * FSFETCHDATA64 op (which should always be 0 using this - * client) */ - case 1: - _debug("extract data length (MSW)"); - ret = afs_extract_data(call, &call->tmp, 4, true); - if (ret < 0) - return ret; - - req->actual_len = ntohl(call->tmp); - req->actual_len <<= 32; - call->offset = 0; - call->unmarshall++; - - no_msw: /* extract the returned data length */ - case 2: + case 1: _debug("extract data length"); - ret = afs_extract_data(call, &call->tmp, 4, true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; - req->actual_len |= ntohl(call->tmp); + req->actual_len = be64_to_cpu(call->tmp64); _debug("DATA length: %llu", req->actual_len); - - req->remain = req->actual_len; - call->offset = req->pos & (PAGE_SIZE - 1); - req->index = 0; - if (req->actual_len == 0) + req->remain = min(req->len, req->actual_len); + if (req->remain == 0) goto no_more_data; + call->unmarshall++; begin_page: ASSERTCMP(req->index, <, req->nr_pages); - if (req->remain > PAGE_SIZE - call->offset) - size = PAGE_SIZE - call->offset; + if (req->remain > PAGE_SIZE - req->offset) + size = PAGE_SIZE - req->offset; else size = req->remain; - call->count = call->offset + size; - ASSERTCMP(call->count, <=, PAGE_SIZE); - req->remain -= size; + call->bvec[0].bv_len = size; + call->bvec[0].bv_offset = req->offset; + call->bvec[0].bv_page = req->pages[req->index]; + iov_iter_bvec(&call->iter, READ, call->bvec, 1, size); + ASSERTCMP(size, <=, PAGE_SIZE); /* extract the returned data */ - case 3: - _debug("extract data %llu/%llu %zu/%u", - req->remain, req->actual_len, call->offset, call->count); + case 2: + _debug("extract data %zu/%llu", + iov_iter_count(&call->iter), req->remain); - buffer = kmap(req->pages[req->index]); - ret = afs_extract_data(call, buffer, call->count, true); - kunmap(req->pages[req->index]); + ret = afs_extract_data(call, true); if (ret < 0) return ret; - if (call->offset == PAGE_SIZE) { + req->remain -= call->bvec[0].bv_len; + req->offset += call->bvec[0].bv_len; + ASSERTCMP(req->offset, <=, PAGE_SIZE); + if (req->offset == PAGE_SIZE) { + req->offset = 0; if (req->page_done) req->page_done(call, req); req->index++; - if (req->remain > 0) { - call->offset = 0; - if (req->index >= req->nr_pages) { - call->unmarshall = 4; - goto begin_discard; - } + if (req->remain > 0) goto begin_page; - } } - goto no_more_data; + + ASSERTCMP(req->remain, ==, 0); + if (req->actual_len <= req->len) + goto no_more_data; /* Discard any excess data the server gave us */ - begin_discard: - case 4: - size = min_t(loff_t, sizeof(afs_discard_buffer), req->remain); - call->count = size; - _debug("extract discard %llu/%llu %zu/%u", - req->remain, req->actual_len, call->offset, call->count); - - call->offset = 0; - ret = afs_extract_data(call, afs_discard_buffer, call->count, true); - req->remain -= call->offset; + iov_iter_discard(&call->iter, READ, req->actual_len - req->len); + call->unmarshall = 3; + case 3: + _debug("extract discard %zu/%llu", + iov_iter_count(&call->iter), req->actual_len - req->len); + + ret = afs_extract_data(call, true); if (ret < 0) return ret; - if (req->remain > 0) - goto begin_discard; no_more_data: - call->offset = 0; - call->unmarshall = 5; + call->unmarshall = 4; + afs_extract_to_buf(call, (21 + 3 + 6) * 4); /* extract the metadata */ - case 5: - ret = afs_extract_data(call, call->buffer, - (21 + 3 + 6) * 4, false); + case 4: + ret = afs_extract_data(call, false); if (ret < 0) return ret; bp = call->buffer; - if (afs_decode_status(call, &bp, &vnode->status, vnode, - &vnode->status.data_version, req) < 0) - return afs_protocol_error(call, -EBADMSG); + ret = afs_decode_status(call, &bp, &vnode->status, vnode, + &vnode->status.data_version, req); + if (ret < 0) + return ret; xdr_decode_AFSCallBack(call, vnode, &bp); - if (call->reply[1]) - xdr_decode_AFSVolSync(&bp, call->reply[1]); + xdr_decode_AFSVolSync(&bp, call->reply[1]); - call->offset = 0; call->unmarshall++; - case 6: + case 5: break; } for (; req->index < req->nr_pages; req->index++) { - if (call->count < PAGE_SIZE) + if (req->offset < PAGE_SIZE) zero_user_segment(req->pages[req->index], - call->count, PAGE_SIZE); + req->offset, PAGE_SIZE); if (req->page_done) req->page_done(call, req); - call->count = 0; + req->offset = 0; } _leave(" = 0 [done]"); @@ -653,6 +643,7 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, struct afs_read *req) call->reply[1] = NULL; /* volsync */ call->reply[2] = req; call->expected_version = vnode->status.data_version; + call->want_reply_time = true; /* marshall the parameters */ bp = call->request; @@ -682,6 +673,9 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req) struct afs_net *net = afs_v2net(vnode); __be32 *bp; + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_fetch_data(fc, req); + if (upper_32_bits(req->pos) || upper_32_bits(req->len) || upper_32_bits(req->pos + req->len)) @@ -698,6 +692,7 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req) call->reply[1] = NULL; /* volsync */ call->reply[2] = req; call->expected_version = vnode->status.data_version; + call->want_reply_time = true; /* marshall the parameters */ bp = call->request; @@ -733,11 +728,14 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; xdr_decode_AFSFid(&bp, call->reply[1]); - if (afs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL) < 0 || - afs_decode_status(call, &bp, &vnode->status, vnode, - &call->expected_version, NULL) < 0) - return afs_protocol_error(call, -EBADMSG); - xdr_decode_AFSCallBack_raw(&bp, call->reply[3]); + ret = afs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL); + if (ret < 0) + return ret; + ret = afs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + xdr_decode_AFSCallBack_raw(call, &bp, call->reply[3]); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -778,6 +776,15 @@ int afs_fs_create(struct afs_fs_cursor *fc, size_t namesz, reqsz, padsz; __be32 *bp; + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)){ + if (S_ISDIR(mode)) + return yfs_fs_make_dir(fc, name, mode, current_data_version, + newfid, newstatus, newcb); + else + return yfs_fs_create_file(fc, name, mode, current_data_version, + newfid, newstatus, newcb); + } + _enter(""); namesz = strlen(name); @@ -796,6 +803,7 @@ int afs_fs_create(struct afs_fs_cursor *fc, call->reply[2] = newstatus; call->reply[3] = newcb; call->expected_version = current_data_version + 1; + call->want_reply_time = true; /* marshall the parameters */ bp = call->request; @@ -839,9 +847,10 @@ static int afs_deliver_fs_remove(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - if (afs_decode_status(call, &bp, &vnode->status, vnode, - &call->expected_version, NULL) < 0) - return afs_protocol_error(call, -EBADMSG); + ret = afs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -868,15 +877,18 @@ static const struct afs_call_type afs_RXFSRemoveDir = { /* * remove a file or directory */ -int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir, - u64 current_data_version) +int afs_fs_remove(struct afs_fs_cursor *fc, struct afs_vnode *vnode, + const char *name, bool isdir, u64 current_data_version) { - struct afs_vnode *vnode = fc->vnode; + struct afs_vnode *dvnode = fc->vnode; struct afs_call *call; - struct afs_net *net = afs_v2net(vnode); + struct afs_net *net = afs_v2net(dvnode); size_t namesz, reqsz, padsz; __be32 *bp; + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_remove(fc, vnode, name, isdir, current_data_version); + _enter(""); namesz = strlen(name); @@ -890,15 +902,16 @@ int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir, return -ENOMEM; call->key = fc->key; - call->reply[0] = vnode; + call->reply[0] = dvnode; + call->reply[1] = vnode; call->expected_version = current_data_version + 1; /* marshall the parameters */ bp = call->request; *bp++ = htonl(isdir ? FSREMOVEDIR : FSREMOVEFILE); - *bp++ = htonl(vnode->fid.vid); - *bp++ = htonl(vnode->fid.vnode); - *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(dvnode->fid.vid); + *bp++ = htonl(dvnode->fid.vnode); + *bp++ = htonl(dvnode->fid.unique); *bp++ = htonl(namesz); memcpy(bp, name, namesz); bp = (void *) bp + namesz; @@ -908,7 +921,7 @@ int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir, } afs_use_fs_server(call, fc->cbi); - trace_afs_make_fs_call(call, &vnode->fid); + trace_afs_make_fs_call(call, &dvnode->fid); return afs_make_call(&fc->ac, call, GFP_NOFS, false); } @@ -929,10 +942,13 @@ static int afs_deliver_fs_link(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - if (afs_decode_status(call, &bp, &vnode->status, vnode, NULL, NULL) < 0 || - afs_decode_status(call, &bp, &dvnode->status, dvnode, - &call->expected_version, NULL) < 0) - return afs_protocol_error(call, -EBADMSG); + ret = afs_decode_status(call, &bp, &vnode->status, vnode, NULL, NULL); + if (ret < 0) + return ret; + ret = afs_decode_status(call, &bp, &dvnode->status, dvnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -961,6 +977,9 @@ int afs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode, size_t namesz, reqsz, padsz; __be32 *bp; + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_link(fc, vnode, name, current_data_version); + _enter(""); namesz = strlen(name); @@ -1016,10 +1035,13 @@ static int afs_deliver_fs_symlink(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; xdr_decode_AFSFid(&bp, call->reply[1]); - if (afs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL) || - afs_decode_status(call, &bp, &vnode->status, vnode, - &call->expected_version, NULL) < 0) - return afs_protocol_error(call, -EBADMSG); + ret = afs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL); + if (ret < 0) + return ret; + ret = afs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -1052,6 +1074,10 @@ int afs_fs_symlink(struct afs_fs_cursor *fc, size_t namesz, reqsz, padsz, c_namesz, c_padsz; __be32 *bp; + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_symlink(fc, name, contents, current_data_version, + newfid, newstatus); + _enter(""); namesz = strlen(name); @@ -1122,13 +1148,16 @@ static int afs_deliver_fs_rename(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - if (afs_decode_status(call, &bp, &orig_dvnode->status, orig_dvnode, - &call->expected_version, NULL) < 0) - return afs_protocol_error(call, -EBADMSG); - if (new_dvnode != orig_dvnode && - afs_decode_status(call, &bp, &new_dvnode->status, new_dvnode, - &call->expected_version_2, NULL) < 0) - return afs_protocol_error(call, -EBADMSG); + ret = afs_decode_status(call, &bp, &orig_dvnode->status, orig_dvnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + if (new_dvnode != orig_dvnode) { + ret = afs_decode_status(call, &bp, &new_dvnode->status, new_dvnode, + &call->expected_version_2, NULL); + if (ret < 0) + return ret; + } /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -1161,6 +1190,12 @@ int afs_fs_rename(struct afs_fs_cursor *fc, size_t reqsz, o_namesz, o_padsz, n_namesz, n_padsz; __be32 *bp; + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_rename(fc, orig_name, + new_dvnode, new_name, + current_orig_data_version, + current_new_data_version); + _enter(""); o_namesz = strlen(orig_name); @@ -1231,9 +1266,10 @@ static int afs_deliver_fs_store_data(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - if (afs_decode_status(call, &bp, &vnode->status, vnode, - &call->expected_version, NULL) < 0) - return afs_protocol_error(call, -EBADMSG); + ret = afs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ afs_pages_written_back(vnode, call); @@ -1273,7 +1309,7 @@ static int afs_fs_store_data64(struct afs_fs_cursor *fc, struct afs_net *net = afs_v2net(vnode); __be32 *bp; - _enter(",%x,{%x:%u},,", + _enter(",%x,{%llx:%llu},,", key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); call = afs_alloc_flat_call(net, &afs_RXFSStoreData64, @@ -1330,7 +1366,10 @@ int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping, loff_t size, pos, i_size; __be32 *bp; - _enter(",%x,{%x:%u},,", + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_store_data(fc, mapping, first, last, offset, to); + + _enter(",%x,{%llx:%llu},,", key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); size = (loff_t)to - (loff_t)offset; @@ -1407,9 +1446,10 @@ static int afs_deliver_fs_store_status(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - if (afs_decode_status(call, &bp, &vnode->status, vnode, - &call->expected_version, NULL) < 0) - return afs_protocol_error(call, -EBADMSG); + ret = afs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -1451,7 +1491,7 @@ static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr) struct afs_net *net = afs_v2net(vnode); __be32 *bp; - _enter(",%x,{%x:%u},,", + _enter(",%x,{%llx:%llu},,", key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); ASSERT(attr->ia_valid & ATTR_SIZE); @@ -1498,7 +1538,7 @@ static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr) struct afs_net *net = afs_v2net(vnode); __be32 *bp; - _enter(",%x,{%x:%u},,", + _enter(",%x,{%llx:%llu},,", key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); ASSERT(attr->ia_valid & ATTR_SIZE); @@ -1544,10 +1584,13 @@ int afs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr) struct afs_net *net = afs_v2net(vnode); __be32 *bp; + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_setattr(fc, attr); + if (attr->ia_valid & ATTR_SIZE) return afs_fs_setattr_size(fc, attr); - _enter(",%x,{%x:%u},,", + _enter(",%x,{%llx:%llu},,", key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); call = afs_alloc_flat_call(net, &afs_RXFSStoreStatus, @@ -1581,164 +1624,114 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) { const __be32 *bp; char *p; + u32 size; int ret; _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: - call->offset = 0; call->unmarshall++; + afs_extract_to_buf(call, 12 * 4); /* extract the returned status record */ case 1: _debug("extract status"); - ret = afs_extract_data(call, call->buffer, - 12 * 4, true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; bp = call->buffer; xdr_decode_AFSFetchVolumeStatus(&bp, call->reply[1]); - call->offset = 0; call->unmarshall++; + afs_extract_to_tmp(call); /* extract the volume name length */ case 2: - ret = afs_extract_data(call, &call->tmp, 4, true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; call->count = ntohl(call->tmp); _debug("volname length: %u", call->count); if (call->count >= AFSNAMEMAX) - return afs_protocol_error(call, -EBADMSG); - call->offset = 0; + return afs_protocol_error(call, -EBADMSG, + afs_eproto_volname_len); + size = (call->count + 3) & ~3; /* It's padded */ + afs_extract_begin(call, call->reply[2], size); call->unmarshall++; /* extract the volume name */ case 3: _debug("extract volname"); - if (call->count > 0) { - ret = afs_extract_data(call, call->reply[2], - call->count, true); - if (ret < 0) - return ret; - } + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; p = call->reply[2]; p[call->count] = 0; _debug("volname '%s'", p); - - call->offset = 0; + afs_extract_to_tmp(call); call->unmarshall++; - /* extract the volume name padding */ - if ((call->count & 3) == 0) { - call->unmarshall++; - goto no_volname_padding; - } - call->count = 4 - (call->count & 3); - - case 4: - ret = afs_extract_data(call, call->buffer, - call->count, true); - if (ret < 0) - return ret; - - call->offset = 0; - call->unmarshall++; - no_volname_padding: - /* extract the offline message length */ - case 5: - ret = afs_extract_data(call, &call->tmp, 4, true); + case 4: + ret = afs_extract_data(call, true); if (ret < 0) return ret; call->count = ntohl(call->tmp); _debug("offline msg length: %u", call->count); if (call->count >= AFSNAMEMAX) - return afs_protocol_error(call, -EBADMSG); - call->offset = 0; + return afs_protocol_error(call, -EBADMSG, + afs_eproto_offline_msg_len); + size = (call->count + 3) & ~3; /* It's padded */ + afs_extract_begin(call, call->reply[2], size); call->unmarshall++; /* extract the offline message */ - case 6: + case 5: _debug("extract offline"); - if (call->count > 0) { - ret = afs_extract_data(call, call->reply[2], - call->count, true); - if (ret < 0) - return ret; - } + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; p = call->reply[2]; p[call->count] = 0; _debug("offline '%s'", p); - call->offset = 0; + afs_extract_to_tmp(call); call->unmarshall++; - /* extract the offline message padding */ - if ((call->count & 3) == 0) { - call->unmarshall++; - goto no_offline_padding; - } - call->count = 4 - (call->count & 3); - - case 7: - ret = afs_extract_data(call, call->buffer, - call->count, true); - if (ret < 0) - return ret; - - call->offset = 0; - call->unmarshall++; - no_offline_padding: - /* extract the message of the day length */ - case 8: - ret = afs_extract_data(call, &call->tmp, 4, true); + case 6: + ret = afs_extract_data(call, true); if (ret < 0) return ret; call->count = ntohl(call->tmp); _debug("motd length: %u", call->count); if (call->count >= AFSNAMEMAX) - return afs_protocol_error(call, -EBADMSG); - call->offset = 0; + return afs_protocol_error(call, -EBADMSG, + afs_eproto_motd_len); + size = (call->count + 3) & ~3; /* It's padded */ + afs_extract_begin(call, call->reply[2], size); call->unmarshall++; /* extract the message of the day */ - case 9: + case 7: _debug("extract motd"); - if (call->count > 0) { - ret = afs_extract_data(call, call->reply[2], - call->count, true); - if (ret < 0) - return ret; - } + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; p = call->reply[2]; p[call->count] = 0; _debug("motd '%s'", p); - call->offset = 0; call->unmarshall++; - /* extract the message of the day padding */ - call->count = (4 - (call->count & 3)) & 3; - - case 10: - ret = afs_extract_data(call, call->buffer, - call->count, false); - if (ret < 0) - return ret; - - call->offset = 0; - call->unmarshall++; - case 11: + case 8: break; } @@ -1778,6 +1771,9 @@ int afs_fs_get_volume_status(struct afs_fs_cursor *fc, __be32 *bp; void *tmpbuf; + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_get_volume_status(fc, vs); + _enter(""); tmpbuf = kmalloc(AFSOPAQUEMAX, GFP_KERNEL); @@ -1867,6 +1863,9 @@ int afs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type) struct afs_net *net = afs_v2net(vnode); __be32 *bp; + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_set_lock(fc, type); + _enter(""); call = afs_alloc_flat_call(net, &afs_RXFSSetLock, 5 * 4, 6 * 4); @@ -1899,6 +1898,9 @@ int afs_fs_extend_lock(struct afs_fs_cursor *fc) struct afs_net *net = afs_v2net(vnode); __be32 *bp; + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_extend_lock(fc); + _enter(""); call = afs_alloc_flat_call(net, &afs_RXFSExtendLock, 4 * 4, 6 * 4); @@ -1930,6 +1932,9 @@ int afs_fs_release_lock(struct afs_fs_cursor *fc) struct afs_net *net = afs_v2net(vnode); __be32 *bp; + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_release_lock(fc); + _enter(""); call = afs_alloc_flat_call(net, &afs_RXFSReleaseLock, 4 * 4, 6 * 4); @@ -2004,19 +2009,16 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call) u32 count; int ret; - _enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count); + _enter("{%u,%zu}", call->unmarshall, iov_iter_count(&call->iter)); -again: switch (call->unmarshall) { case 0: - call->offset = 0; + afs_extract_to_tmp(call); call->unmarshall++; /* Extract the capabilities word count */ case 1: - ret = afs_extract_data(call, &call->tmp, - 1 * sizeof(__be32), - true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; @@ -2024,24 +2026,17 @@ again: call->count = count; call->count2 = count; - call->offset = 0; + iov_iter_discard(&call->iter, READ, count * sizeof(__be32)); call->unmarshall++; /* Extract capabilities words */ case 2: - count = min(call->count, 16U); - ret = afs_extract_data(call, call->buffer, - count * sizeof(__be32), - call->count > 16); + ret = afs_extract_data(call, false); if (ret < 0) return ret; /* TODO: Examine capabilities */ - call->count -= count; - if (call->count > 0) - goto again; - call->offset = 0; call->unmarshall++; break; } @@ -2050,6 +2045,14 @@ again: return 0; } +static void afs_destroy_fs_get_capabilities(struct afs_call *call) +{ + struct afs_server *server = call->reply[0]; + + afs_put_server(call->net, server); + afs_flat_call_destructor(call); +} + /* * FS.GetCapabilities operation type */ @@ -2057,7 +2060,8 @@ static const struct afs_call_type afs_RXFSGetCapabilities = { .name = "FS.GetCapabilities", .op = afs_FS_GetCapabilities, .deliver = afs_deliver_fs_get_capabilities, - .destructor = afs_flat_call_destructor, + .done = afs_fileserver_probe_result, + .destructor = afs_destroy_fs_get_capabilities, }; /* @@ -2067,7 +2071,9 @@ static const struct afs_call_type afs_RXFSGetCapabilities = { int afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, struct afs_addr_cursor *ac, - struct key *key) + struct key *key, + unsigned int server_index, + bool async) { struct afs_call *call; __be32 *bp; @@ -2079,6 +2085,10 @@ int afs_fs_get_capabilities(struct afs_net *net, return -ENOMEM; call->key = key; + call->reply[0] = afs_get_server(server); + call->reply[1] = (void *)(long)server_index; + call->upgrade = true; + call->want_reply_time = true; /* marshall the parameters */ bp = call->request; @@ -2086,7 +2096,7 @@ int afs_fs_get_capabilities(struct afs_net *net, /* Can't take a ref on server */ trace_afs_make_fs_call(call, NULL); - return afs_make_call(ac, call, GFP_NOFS, false); + return afs_make_call(ac, call, GFP_NOFS, async); } /* @@ -2097,7 +2107,7 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call) struct afs_file_status *status = call->reply[1]; struct afs_callback *callback = call->reply[2]; struct afs_volsync *volsync = call->reply[3]; - struct afs_vnode *vnode = call->reply[0]; + struct afs_fid *fid = call->reply[0]; const __be32 *bp; int ret; @@ -2105,21 +2115,16 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call) if (ret < 0) return ret; - _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + _enter("{%llx:%llu}", fid->vid, fid->vnode); /* unmarshall the reply once we've received all of it */ bp = call->buffer; - afs_decode_status(call, &bp, status, vnode, - &call->expected_version, NULL); - callback[call->count].version = ntohl(bp[0]); - callback[call->count].expiry = ntohl(bp[1]); - callback[call->count].type = ntohl(bp[2]); - if (vnode) - xdr_decode_AFSCallBack(call, vnode, &bp); - else - bp += 3; - if (volsync) - xdr_decode_AFSVolSync(&bp, volsync); + ret = afs_decode_status(call, &bp, status, NULL, + &call->expected_version, NULL); + if (ret < 0) + return ret; + xdr_decode_AFSCallBack_raw(call, &bp, callback); + xdr_decode_AFSVolSync(&bp, volsync); _leave(" = 0 [done]"); return 0; @@ -2148,7 +2153,10 @@ int afs_fs_fetch_status(struct afs_fs_cursor *fc, struct afs_call *call; __be32 *bp; - _enter(",%x,{%x:%u},,", + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_fetch_status(fc, net, fid, status, callback, volsync); + + _enter(",%x,{%llx:%llu},,", key_serial(fc->key), fid->vid, fid->vnode); call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4); @@ -2158,11 +2166,12 @@ int afs_fs_fetch_status(struct afs_fs_cursor *fc, } call->key = fc->key; - call->reply[0] = NULL; /* vnode for fid[0] */ + call->reply[0] = fid; call->reply[1] = status; call->reply[2] = callback; call->reply[3] = volsync; call->expected_version = 1; /* vnode->status.data_version */ + call->want_reply_time = true; /* marshall the parameters */ bp = call->request; @@ -2193,38 +2202,40 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) switch (call->unmarshall) { case 0: - call->offset = 0; + afs_extract_to_tmp(call); call->unmarshall++; /* Extract the file status count and array in two steps */ case 1: _debug("extract status count"); - ret = afs_extract_data(call, &call->tmp, 4, true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; tmp = ntohl(call->tmp); _debug("status count: %u/%u", tmp, call->count2); if (tmp != call->count2) - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_ibulkst_count); call->count = 0; call->unmarshall++; more_counts: - call->offset = 0; + afs_extract_to_buf(call, 21 * sizeof(__be32)); case 2: _debug("extract status array %u", call->count); - ret = afs_extract_data(call, call->buffer, 21 * 4, true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; bp = call->buffer; statuses = call->reply[1]; - if (afs_decode_status(call, &bp, &statuses[call->count], - call->count == 0 ? vnode : NULL, - NULL, NULL) < 0) - return afs_protocol_error(call, -EBADMSG); + ret = afs_decode_status(call, &bp, &statuses[call->count], + call->count == 0 ? vnode : NULL, + NULL, NULL); + if (ret < 0) + return ret; call->count++; if (call->count < call->count2) @@ -2232,27 +2243,28 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) call->count = 0; call->unmarshall++; - call->offset = 0; + afs_extract_to_tmp(call); /* Extract the callback count and array in two steps */ case 3: _debug("extract CB count"); - ret = afs_extract_data(call, &call->tmp, 4, true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; tmp = ntohl(call->tmp); _debug("CB count: %u", tmp); if (tmp != call->count2) - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_ibulkst_cb_count); call->count = 0; call->unmarshall++; more_cbs: - call->offset = 0; + afs_extract_to_buf(call, 3 * sizeof(__be32)); case 4: _debug("extract CB array"); - ret = afs_extract_data(call, call->buffer, 3 * 4, true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; @@ -2260,7 +2272,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) bp = call->buffer; callbacks = call->reply[2]; callbacks[call->count].version = ntohl(bp[0]); - callbacks[call->count].expiry = ntohl(bp[1]); + callbacks[call->count].expires_at = xdr_decode_expiry(call, ntohl(bp[1])); callbacks[call->count].type = ntohl(bp[2]); statuses = call->reply[1]; if (call->count == 0 && vnode && statuses[0].abort_code == 0) @@ -2269,19 +2281,17 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) if (call->count < call->count2) goto more_cbs; - call->offset = 0; + afs_extract_to_buf(call, 6 * sizeof(__be32)); call->unmarshall++; case 5: - ret = afs_extract_data(call, call->buffer, 6 * 4, false); + ret = afs_extract_data(call, false); if (ret < 0) return ret; bp = call->buffer; - if (call->reply[3]) - xdr_decode_AFSVolSync(&bp, call->reply[3]); + xdr_decode_AFSVolSync(&bp, call->reply[3]); - call->offset = 0; call->unmarshall++; case 6: @@ -2317,7 +2327,11 @@ int afs_fs_inline_bulk_status(struct afs_fs_cursor *fc, __be32 *bp; int i; - _enter(",%x,{%x:%u},%u", + if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)) + return yfs_fs_inline_bulk_status(fc, net, fids, statuses, callbacks, + nr_fids, volsync); + + _enter(",%x,{%llx:%llu},%u", key_serial(fc->key), fids[0].vid, fids[1].vnode, nr_fids); call = afs_alloc_flat_call(net, &afs_RXFSInlineBulkStatus, @@ -2334,6 +2348,7 @@ int afs_fs_inline_bulk_status(struct afs_fs_cursor *fc, call->reply[2] = callbacks; call->reply[3] = volsync; call->count2 = nr_fids; + call->want_reply_time = true; /* marshall the parameters */ bp = call->request; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 479b7fdda124..4c6d8e1112c2 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -82,7 +82,7 @@ static int afs_inode_init_from_status(struct afs_vnode *vnode, struct key *key) default: printk("kAFS: AFS vnode with undefined type\n"); read_sequnlock_excl(&vnode->cb_lock); - return afs_protocol_error(NULL, -EBADMSG); + return afs_protocol_error(NULL, -EBADMSG, afs_eproto_file_type); } inode->i_blocks = 0; @@ -100,7 +100,7 @@ int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool new_inode) struct afs_fs_cursor fc; int ret; - _enter("%s,{%x:%u.%u,S=%lx}", + _enter("%s,{%llx:%llu.%u,S=%lx}", vnode->volume->name, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, vnode->flags); @@ -127,9 +127,9 @@ int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool new_inode) int afs_iget5_test(struct inode *inode, void *opaque) { struct afs_iget_data *data = opaque; + struct afs_vnode *vnode = AFS_FS_I(inode); - return inode->i_ino == data->fid.vnode && - inode->i_generation == data->fid.unique; + return memcmp(&vnode->fid, &data->fid, sizeof(data->fid)) == 0; } /* @@ -150,11 +150,14 @@ static int afs_iget5_set(struct inode *inode, void *opaque) struct afs_iget_data *data = opaque; struct afs_vnode *vnode = AFS_FS_I(inode); - inode->i_ino = data->fid.vnode; - inode->i_generation = data->fid.unique; vnode->fid = data->fid; vnode->volume = data->volume; + /* YFS supports 96-bit vnode IDs, but Linux only supports + * 64-bit inode numbers. + */ + inode->i_ino = data->fid.vnode; + inode->i_generation = data->fid.unique; return 0; } @@ -193,7 +196,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) return ERR_PTR(-ENOMEM); } - _debug("GOT INODE %p { ino=%lu, vl=%x, vn=%x, u=%x }", + _debug("GOT INODE %p { ino=%lu, vl=%llx, vn=%llx, u=%x }", inode, inode->i_ino, data.fid.vid, data.fid.vnode, data.fid.unique); @@ -252,8 +255,8 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) key.vnode_id = vnode->fid.vnode; key.unique = vnode->fid.unique; - key.vnode_id_ext[0] = 0; - key.vnode_id_ext[1] = 0; + key.vnode_id_ext[0] = vnode->fid.vnode >> 32; + key.vnode_id_ext[1] = vnode->fid.vnode_hi; aux.data_version = vnode->status.data_version; vnode->cache = fscache_acquire_cookie(vnode->volume->cache, @@ -277,7 +280,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, struct inode *inode; int ret; - _enter(",{%x:%u.%u},,", fid->vid, fid->vnode, fid->unique); + _enter(",{%llx:%llu.%u},,", fid->vid, fid->vnode, fid->unique); as = sb->s_fs_info; data.volume = as->volume; @@ -289,7 +292,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, return ERR_PTR(-ENOMEM); } - _debug("GOT INODE %p { vl=%x vn=%x, u=%x }", + _debug("GOT INODE %p { vl=%llx vn=%llx, u=%x }", inode, fid->vid, fid->vnode, fid->unique); vnode = AFS_FS_I(inode); @@ -314,11 +317,11 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, * didn't give us a callback) */ vnode->cb_version = 0; vnode->cb_type = 0; - vnode->cb_expires_at = 0; + vnode->cb_expires_at = ktime_get(); } else { vnode->cb_version = cb->version; vnode->cb_type = cb->type; - vnode->cb_expires_at = cb->expiry; + vnode->cb_expires_at = cb->expires_at; vnode->cb_interest = afs_get_cb_interest(cbi); set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); } @@ -352,7 +355,7 @@ bad_inode: */ void afs_zap_data(struct afs_vnode *vnode) { - _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); #ifdef CONFIG_AFS_FSCACHE fscache_invalidate(vnode->cache); @@ -382,7 +385,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) bool valid = false; int ret; - _enter("{v={%x:%u} fl=%lx},%x", + _enter("{v={%llx:%llu} fl=%lx},%x", vnode->fid.vid, vnode->fid.vnode, vnode->flags, key_serial(key)); @@ -501,7 +504,7 @@ void afs_evict_inode(struct inode *inode) vnode = AFS_FS_I(inode); - _enter("{%x:%u.%d}", + _enter("{%llx:%llu.%d}", vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); @@ -550,7 +553,7 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) struct key *key; int ret; - _enter("{%x:%u},{n=%pd},%x", + _enter("{%llx:%llu},{n=%pd},%x", vnode->fid.vid, vnode->fid.vnode, dentry, attr->ia_valid); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 72de1f157d20..5da3b09b7518 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -22,6 +22,7 @@ #include <linux/backing-dev.h> #include <linux/uuid.h> #include <linux/mm_types.h> +#include <linux/dns_resolver.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> @@ -75,10 +76,13 @@ struct afs_addr_list { u32 version; /* Version */ unsigned char max_addrs; unsigned char nr_addrs; - unsigned char index; /* Address currently in use */ + unsigned char preferred; /* Preferred address */ unsigned char nr_ipv4; /* Number of IPv4 addresses */ + enum dns_record_source source:8; + enum dns_lookup_status status:8; unsigned long probed; /* Mask of servers that have been probed */ - unsigned long yfs; /* Mask of servers that are YFS */ + unsigned long failed; /* Mask of addrs that failed locally/ICMP */ + unsigned long responded; /* Mask of addrs that responded */ struct sockaddr_rxrpc addrs[]; #define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8)) }; @@ -88,6 +92,7 @@ struct afs_addr_list { */ struct afs_call { const struct afs_call_type *type; /* type of call */ + struct afs_addr_list *alist; /* Address is alist[addr_ix] */ wait_queue_head_t waitq; /* processes awaiting completion */ struct work_struct async_work; /* async I/O processor */ struct work_struct work; /* actual work processor */ @@ -98,16 +103,22 @@ struct afs_call { struct afs_cb_interest *cbi; /* Callback interest for server used */ void *request; /* request data (first part) */ struct address_space *mapping; /* Pages being written from */ + struct iov_iter iter; /* Buffer iterator */ + struct iov_iter *_iter; /* Iterator currently in use */ + union { /* Convenience for ->iter */ + struct kvec kvec[1]; + struct bio_vec bvec[1]; + }; void *buffer; /* reply receive buffer */ void *reply[4]; /* Where to put the reply */ pgoff_t first; /* first page in mapping to deal with */ pgoff_t last; /* last page in mapping to deal with */ - size_t offset; /* offset into received data store */ atomic_t usage; enum afs_call_state state; spinlock_t state_lock; int error; /* error code */ u32 abort_code; /* Remote abort ID or 0 */ + u32 epoch; unsigned request_size; /* size of request data */ unsigned reply_max; /* maximum size of reply */ unsigned first_offset; /* offset into mapping[first] */ @@ -117,19 +128,28 @@ struct afs_call { unsigned count2; /* count used in unmarshalling */ }; unsigned char unmarshall; /* unmarshalling phase */ + unsigned char addr_ix; /* Address in ->alist */ bool incoming; /* T if incoming call */ bool send_pages; /* T if data from mapping should be sent */ bool need_attention; /* T if RxRPC poked us */ bool async; /* T if asynchronous */ bool ret_reply0; /* T if should return reply[0] on success */ bool upgrade; /* T to request service upgrade */ + bool want_reply_time; /* T if want reply_time */ u16 service_id; /* Actual service ID (after upgrade) */ unsigned int debug_id; /* Trace ID */ u32 operation_ID; /* operation ID for an incoming call */ u32 count; /* count for use in unmarshalling */ - __be32 tmp; /* place to extract temporary data */ + union { /* place to extract temporary data */ + struct { + __be32 tmp_u; + __be32 tmp; + } __attribute__((packed)); + __be64 tmp64; + }; afs_dataversion_t expected_version; /* Updated version expected from store */ afs_dataversion_t expected_version_2; /* 2nd updated version expected from store */ + ktime_t reply_time; /* Time of first reply packet */ }; struct afs_call_type { @@ -146,6 +166,9 @@ struct afs_call_type { /* Work function */ void (*work)(struct work_struct *work); + + /* Call done function (gets called immediately on success or failure) */ + void (*done)(struct afs_call *call); }; /* @@ -185,6 +208,7 @@ struct afs_read { refcount_t usage; unsigned int index; /* Which page we're reading into */ unsigned int nr_pages; + unsigned int offset; /* offset into current page */ void (*page_done)(struct afs_call *, struct afs_read *); struct page **pages; struct page *array[]; @@ -343,13 +367,70 @@ struct afs_cell { rwlock_t proc_lock; /* VL server list. */ - rwlock_t vl_addrs_lock; /* Lock on vl_addrs */ - struct afs_addr_list __rcu *vl_addrs; /* List of VL servers */ + rwlock_t vl_servers_lock; /* Lock on vl_servers */ + struct afs_vlserver_list __rcu *vl_servers; + u8 name_len; /* Length of name */ char name[64 + 1]; /* Cell name, case-flattened and NUL-padded */ }; /* + * Volume Location server record. + */ +struct afs_vlserver { + struct rcu_head rcu; + struct afs_addr_list __rcu *addresses; /* List of addresses for this VL server */ + unsigned long flags; +#define AFS_VLSERVER_FL_PROBED 0 /* The VL server has been probed */ +#define AFS_VLSERVER_FL_PROBING 1 /* VL server is being probed */ +#define AFS_VLSERVER_FL_IS_YFS 2 /* Server is YFS not AFS */ + rwlock_t lock; /* Lock on addresses */ + atomic_t usage; + + /* Probe state */ + wait_queue_head_t probe_wq; + atomic_t probe_outstanding; + spinlock_t probe_lock; + struct { + unsigned int rtt; /* RTT as ktime/64 */ + u32 abort_code; + short error; + bool have_result; + bool responded:1; + bool is_yfs:1; + bool not_yfs:1; + bool local_failure:1; + } probe; + + u16 port; + u16 name_len; /* Length of name */ + char name[]; /* Server name, case-flattened */ +}; + +/* + * Weighted list of Volume Location servers. + */ +struct afs_vlserver_entry { + u16 priority; /* Preference (as SRV) */ + u16 weight; /* Weight (as SRV) */ + enum dns_record_source source:8; + enum dns_lookup_status status:8; + struct afs_vlserver *server; +}; + +struct afs_vlserver_list { + struct rcu_head rcu; + atomic_t usage; + u8 nr_servers; + u8 index; /* Server currently in use */ + u8 preferred; /* Preferred server */ + enum dns_record_source source:8; + enum dns_lookup_status status:8; + rwlock_t lock; + struct afs_vlserver_entry servers[]; +}; + +/* * Cached VLDB entry. * * This is pointed to by cell->vldb_entries, indexed by name. @@ -403,8 +484,12 @@ struct afs_server { #define AFS_SERVER_FL_PROBING 6 /* Fileserver is being probed */ #define AFS_SERVER_FL_NO_IBULK 7 /* Fileserver doesn't support FS.InlineBulkStatus */ #define AFS_SERVER_FL_MAY_HAVE_CB 8 /* May have callbacks on this fileserver */ +#define AFS_SERVER_FL_IS_YFS 9 /* Server is YFS not AFS */ +#define AFS_SERVER_FL_NO_RM2 10 /* Fileserver doesn't support YFS.RemoveFile2 */ +#define AFS_SERVER_FL_HAVE_EPOCH 11 /* ->epoch is valid */ atomic_t usage; u32 addr_version; /* Address list version */ + u32 cm_epoch; /* Server RxRPC epoch */ /* file service access */ rwlock_t fs_lock; /* access lock */ @@ -413,6 +498,26 @@ struct afs_server { struct hlist_head cb_volumes; /* List of volume interests on this server */ unsigned cb_s_break; /* Break-everything counter. */ rwlock_t cb_break_lock; /* Volume finding lock */ + + /* Probe state */ + wait_queue_head_t probe_wq; + atomic_t probe_outstanding; + spinlock_t probe_lock; + struct { + unsigned int rtt; /* RTT as ktime/64 */ + u32 abort_code; + u32 cm_epoch; + short error; + bool have_result; + bool responded:1; + bool is_yfs:1; + bool not_yfs:1; + bool local_failure:1; + bool no_epoch:1; + bool cm_probed:1; + bool said_rebooted:1; + bool said_inconsistent:1; + } probe; }; /* @@ -447,8 +552,8 @@ struct afs_server_entry { struct afs_server_list { refcount_t usage; - unsigned short nr_servers; - unsigned short index; /* Server currently in use */ + unsigned char nr_servers; + unsigned char preferred; /* Preferred server */ unsigned short vnovol_mask; /* Servers to be skipped due to VNOVOL */ unsigned int seq; /* Set to ->servers_seq when installed */ rwlock_t lock; @@ -550,6 +655,15 @@ struct afs_vnode { afs_callback_type_t cb_type; /* type of callback */ }; +static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode) +{ +#ifdef CONFIG_AFS_FSCACHE + return vnode->cache; +#else + return NULL; +#endif +} + /* * cached security record for one user's attempt to access a vnode */ @@ -586,13 +700,31 @@ struct afs_interface { */ struct afs_addr_cursor { struct afs_addr_list *alist; /* Current address list (pins ref) */ - struct sockaddr_rxrpc *addr; + unsigned long tried; /* Tried addresses */ + signed char index; /* Current address */ + bool responded; /* T if the current address responded */ + unsigned short nr_iterations; /* Number of address iterations */ + short error; u32 abort_code; - unsigned short start; /* Starting point in alist->addrs[] */ - unsigned short index; /* Wrapping offset from start to current addr */ +}; + +/* + * Cursor for iterating over a set of volume location servers. + */ +struct afs_vl_cursor { + struct afs_addr_cursor ac; + struct afs_cell *cell; /* The cell we're querying */ + struct afs_vlserver_list *server_list; /* Current server list (pins ref) */ + struct afs_vlserver *server; /* Server on which this resides */ + struct key *key; /* Key for the server */ + unsigned long untried; /* Bitmask of untried servers */ + short index; /* Current server */ short error; - bool begun; /* T if we've begun iteration */ - bool responded; /* T if the current address responded */ + unsigned short flags; +#define AFS_VL_CURSOR_STOP 0x0001 /* Set to cease iteration */ +#define AFS_VL_CURSOR_RETRY 0x0002 /* Set to do a retry */ +#define AFS_VL_CURSOR_RETRIED 0x0004 /* Set if started a retry */ + unsigned short nr_iterations; /* Number of server iterations */ }; /* @@ -604,10 +736,11 @@ struct afs_fs_cursor { struct afs_server_list *server_list; /* Current server list (pins ref) */ struct afs_cb_interest *cbi; /* Server on which this resides (pins ref) */ struct key *key; /* Key for the server */ + unsigned long untried; /* Bitmask of untried servers */ unsigned int cb_break; /* cb_break + cb_s_break before the call */ unsigned int cb_break_2; /* cb_break + cb_s_break (2nd vnode) */ - unsigned char start; /* Initial index in server list */ - unsigned char index; /* Number of servers tried beyond start */ + short index; /* Current server */ + short error; unsigned short flags; #define AFS_FS_CURSOR_STOP 0x0001 /* Set to cease iteration */ #define AFS_FS_CURSOR_VBUSY 0x0002 /* Set if seen VBUSY */ @@ -615,6 +748,7 @@ struct afs_fs_cursor { #define AFS_FS_CURSOR_VNOVOL 0x0008 /* Set if seen VNOVOL */ #define AFS_FS_CURSOR_CUR_ONLY 0x0010 /* Set if current server only (file lock held) */ #define AFS_FS_CURSOR_NO_VSLEEP 0x0020 /* Set to prevent sleep on VBUSY, VOFFLINE, ... */ + unsigned short nr_iterations; /* Number of server iterations */ }; /* @@ -640,12 +774,12 @@ extern struct afs_addr_list *afs_alloc_addrlist(unsigned int, unsigned short, unsigned short); extern void afs_put_addrlist(struct afs_addr_list *); -extern struct afs_addr_list *afs_parse_text_addrs(const char *, size_t, char, - unsigned short, unsigned short); -extern struct afs_addr_list *afs_dns_query(struct afs_cell *, time64_t *); +extern struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *, + const char *, size_t, char, + unsigned short, unsigned short); +extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *); extern bool afs_iterate_addresses(struct afs_addr_cursor *); extern int afs_end_cursor(struct afs_addr_cursor *); -extern int afs_set_vl_cursor(struct afs_addr_cursor *, struct afs_cell *); extern void afs_merge_fs_addr4(struct afs_addr_list *, __be32, u16); extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16); @@ -668,6 +802,7 @@ extern struct fscache_cookie_def afs_vnode_cache_index_def; * callback.c */ extern void afs_init_callback_state(struct afs_server *); +extern void __afs_break_callback(struct afs_vnode *); extern void afs_break_callback(struct afs_vnode *); extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback_break*); @@ -688,10 +823,13 @@ static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode) return vnode->cb_break + vnode->cb_s_break + vnode->cb_v_break; } -static inline unsigned int afs_cb_break_sum(struct afs_vnode *vnode, - struct afs_cb_interest *cbi) +static inline bool afs_cb_is_broken(unsigned int cb_break, + const struct afs_vnode *vnode, + const struct afs_cb_interest *cbi) { - return vnode->cb_break + cbi->server->cb_s_break + vnode->volume->cb_v_break; + return !cbi || cb_break != (vnode->cb_break + + cbi->server->cb_s_break + + vnode->volume->cb_v_break); } /* @@ -781,7 +919,7 @@ extern int afs_fs_give_up_callbacks(struct afs_net *, struct afs_server *); extern int afs_fs_fetch_data(struct afs_fs_cursor *, struct afs_read *); extern int afs_fs_create(struct afs_fs_cursor *, const char *, umode_t, u64, struct afs_fid *, struct afs_file_status *, struct afs_callback *); -extern int afs_fs_remove(struct afs_fs_cursor *, const char *, bool, u64); +extern int afs_fs_remove(struct afs_fs_cursor *, struct afs_vnode *, const char *, bool, u64); extern int afs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *, u64); extern int afs_fs_symlink(struct afs_fs_cursor *, const char *, const char *, u64, struct afs_fid *, struct afs_file_status *); @@ -797,7 +935,7 @@ extern int afs_fs_release_lock(struct afs_fs_cursor *); extern int afs_fs_give_up_all_callbacks(struct afs_net *, struct afs_server *, struct afs_addr_cursor *, struct key *); extern int afs_fs_get_capabilities(struct afs_net *, struct afs_server *, - struct afs_addr_cursor *, struct key *); + struct afs_addr_cursor *, struct key *, unsigned int, bool); extern int afs_fs_inline_bulk_status(struct afs_fs_cursor *, struct afs_net *, struct afs_fid *, struct afs_file_status *, struct afs_callback *, unsigned int, @@ -807,6 +945,13 @@ extern int afs_fs_fetch_status(struct afs_fs_cursor *, struct afs_net *, struct afs_callback *, struct afs_volsync *); /* + * fs_probe.c + */ +extern void afs_fileserver_probe_result(struct afs_call *); +extern int afs_probe_fileservers(struct afs_net *, struct key *, struct afs_server_list *); +extern int afs_wait_for_fs_probes(struct afs_server_list *, unsigned long); + +/* * inode.c */ extern int afs_fetch_status(struct afs_vnode *, struct key *, bool); @@ -922,7 +1067,6 @@ extern int __net_init afs_open_socket(struct afs_net *); extern void __net_exit afs_close_socket(struct afs_net *); extern void afs_charge_preallocation(struct work_struct *); extern void afs_put_call(struct afs_call *); -extern int afs_queue_call_work(struct afs_call *); extern long afs_make_call(struct afs_addr_cursor *, struct afs_call *, gfp_t, bool); extern struct afs_call *afs_alloc_flat_call(struct afs_net *, const struct afs_call_type *, @@ -930,12 +1074,39 @@ extern struct afs_call *afs_alloc_flat_call(struct afs_net *, extern void afs_flat_call_destructor(struct afs_call *); extern void afs_send_empty_reply(struct afs_call *); extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); -extern int afs_extract_data(struct afs_call *, void *, size_t, bool); -extern int afs_protocol_error(struct afs_call *, int); +extern int afs_extract_data(struct afs_call *, bool); +extern int afs_protocol_error(struct afs_call *, int, enum afs_eproto_cause); + +static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size) +{ + call->kvec[0].iov_base = buf; + call->kvec[0].iov_len = size; + iov_iter_kvec(&call->iter, READ, call->kvec, 1, size); +} + +static inline void afs_extract_to_tmp(struct afs_call *call) +{ + afs_extract_begin(call, &call->tmp, sizeof(call->tmp)); +} + +static inline void afs_extract_to_tmp64(struct afs_call *call) +{ + afs_extract_begin(call, &call->tmp64, sizeof(call->tmp64)); +} + +static inline void afs_extract_discard(struct afs_call *call, size_t size) +{ + iov_iter_discard(&call->iter, READ, size); +} + +static inline void afs_extract_to_buf(struct afs_call *call, size_t size) +{ + afs_extract_begin(call, call->buffer, size); +} static inline int afs_transfer_reply(struct afs_call *call) { - return afs_extract_data(call, call->buffer, call->reply_max, false); + return afs_extract_data(call, false); } static inline bool afs_check_call_state(struct afs_call *call, @@ -1012,7 +1183,6 @@ extern void afs_put_server(struct afs_net *, struct afs_server *); extern void afs_manage_servers(struct work_struct *); extern void afs_servers_timer(struct timer_list *); extern void __net_exit afs_purge_servers(struct afs_net *); -extern bool afs_probe_fileserver(struct afs_fs_cursor *); extern bool afs_check_server_record(struct afs_fs_cursor *, struct afs_server *); /* @@ -1039,14 +1209,51 @@ extern void afs_fs_exit(void); /* * vlclient.c */ -extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *, - struct afs_addr_cursor *, - struct key *, const char *, int); -extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *, struct afs_addr_cursor *, - struct key *, const uuid_t *); -extern int afs_vl_get_capabilities(struct afs_net *, struct afs_addr_cursor *, struct key *); -extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *, struct afs_addr_cursor *, - struct key *, const uuid_t *); +extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *, + const char *, int); +extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *, const uuid_t *); +extern int afs_vl_get_capabilities(struct afs_net *, struct afs_addr_cursor *, struct key *, + struct afs_vlserver *, unsigned int, bool); +extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *, const uuid_t *); + +/* + * vl_probe.c + */ +extern void afs_vlserver_probe_result(struct afs_call *); +extern int afs_send_vl_probes(struct afs_net *, struct key *, struct afs_vlserver_list *); +extern int afs_wait_for_vl_probes(struct afs_vlserver_list *, unsigned long); + +/* + * vl_rotate.c + */ +extern bool afs_begin_vlserver_operation(struct afs_vl_cursor *, + struct afs_cell *, struct key *); +extern bool afs_select_vlserver(struct afs_vl_cursor *); +extern bool afs_select_current_vlserver(struct afs_vl_cursor *); +extern int afs_end_vlserver_operation(struct afs_vl_cursor *); + +/* + * vlserver_list.c + */ +static inline struct afs_vlserver *afs_get_vlserver(struct afs_vlserver *vlserver) +{ + atomic_inc(&vlserver->usage); + return vlserver; +} + +static inline struct afs_vlserver_list *afs_get_vlserverlist(struct afs_vlserver_list *vllist) +{ + if (vllist) + atomic_inc(&vllist->usage); + return vllist; +} + +extern struct afs_vlserver *afs_alloc_vlserver(const char *, size_t, unsigned short); +extern void afs_put_vlserver(struct afs_net *, struct afs_vlserver *); +extern struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int); +extern void afs_put_vlserverlist(struct afs_net *, struct afs_vlserver_list *); +extern struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *, + const void *, size_t); /* * volume.c @@ -1089,6 +1296,36 @@ extern int afs_launder_page(struct page *); extern const struct xattr_handler *afs_xattr_handlers[]; extern ssize_t afs_listxattr(struct dentry *, char *, size_t); +/* + * yfsclient.c + */ +extern int yfs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_volsync *, bool); +extern int yfs_fs_fetch_data(struct afs_fs_cursor *, struct afs_read *); +extern int yfs_fs_create_file(struct afs_fs_cursor *, const char *, umode_t, u64, + struct afs_fid *, struct afs_file_status *, struct afs_callback *); +extern int yfs_fs_make_dir(struct afs_fs_cursor *, const char *, umode_t, u64, + struct afs_fid *, struct afs_file_status *, struct afs_callback *); +extern int yfs_fs_remove_file2(struct afs_fs_cursor *, struct afs_vnode *, const char *, u64); +extern int yfs_fs_remove(struct afs_fs_cursor *, struct afs_vnode *, const char *, bool, u64); +extern int yfs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *, u64); +extern int yfs_fs_symlink(struct afs_fs_cursor *, const char *, const char *, u64, + struct afs_fid *, struct afs_file_status *); +extern int yfs_fs_rename(struct afs_fs_cursor *, const char *, + struct afs_vnode *, const char *, u64, u64); +extern int yfs_fs_store_data(struct afs_fs_cursor *, struct address_space *, + pgoff_t, pgoff_t, unsigned, unsigned); +extern int yfs_fs_setattr(struct afs_fs_cursor *, struct iattr *); +extern int yfs_fs_get_volume_status(struct afs_fs_cursor *, struct afs_volume_status *); +extern int yfs_fs_set_lock(struct afs_fs_cursor *, afs_lock_type_t); +extern int yfs_fs_extend_lock(struct afs_fs_cursor *); +extern int yfs_fs_release_lock(struct afs_fs_cursor *); +extern int yfs_fs_fetch_status(struct afs_fs_cursor *, struct afs_net *, + struct afs_fid *, struct afs_file_status *, + struct afs_callback *, struct afs_volsync *); +extern int yfs_fs_inline_bulk_status(struct afs_fs_cursor *, struct afs_net *, + struct afs_fid *, struct afs_file_status *, + struct afs_callback *, unsigned int, + struct afs_volsync *); /* * Miscellaneous inline functions. @@ -1120,6 +1357,17 @@ static inline void afs_check_for_remote_deletion(struct afs_fs_cursor *fc, } } +static inline int afs_io_error(struct afs_call *call, enum afs_io_error where) +{ + trace_afs_io_error(call->debug_id, -EIO, where); + return -EIO; +} + +static inline int afs_bad(struct afs_vnode *vnode, enum afs_file_error where) +{ + trace_afs_file_error(vnode, -EIO, where); + return -EIO; +} /*****************************************************************************/ /* diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 99fd13500a97..2e51c6994148 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -130,9 +130,10 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) goto error_no_page; } - ret = -EIO; - if (PageError(page)) + if (PageError(page)) { + ret = afs_bad(AFS_FS_I(d_inode(mntpt)), afs_file_error_mntpt); goto error; + } buf = kmap_atomic(page); memcpy(devname, buf, size); diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 9101f62707af..be2ee3bbd0a9 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -17,6 +17,11 @@ #include <linux/uaccess.h> #include "internal.h" +struct afs_vl_seq_net_private { + struct seq_net_private seq; /* Must be first */ + struct afs_vlserver_list *vllist; +}; + static inline struct afs_net *afs_seq2net(struct seq_file *m) { return afs_net(seq_file_net(m)); @@ -32,16 +37,24 @@ static inline struct afs_net *afs_seq2net_single(struct seq_file *m) */ static int afs_proc_cells_show(struct seq_file *m, void *v) { - struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link); + struct afs_vlserver_list *vllist; + struct afs_cell *cell; if (v == SEQ_START_TOKEN) { /* display header on line 1 */ - seq_puts(m, "USE NAME\n"); + seq_puts(m, "USE TTL SV NAME\n"); return 0; } + cell = list_entry(v, struct afs_cell, proc_link); + vllist = rcu_dereference(cell->vl_servers); + /* display one cell per line on subsequent lines */ - seq_printf(m, "%3u %s\n", atomic_read(&cell->usage), cell->name); + seq_printf(m, "%3u %6lld %2u %s\n", + atomic_read(&cell->usage), + cell->dns_expiry - ktime_get_real_seconds(), + vllist ? vllist->nr_servers : 0, + cell->name); return 0; } @@ -208,7 +221,7 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) return 0; } - seq_printf(m, "%3d %08x %s\n", + seq_printf(m, "%3d %08llx %s\n", atomic_read(&vol->usage), vol->vid, afs_vol_types[vol->type]); @@ -247,61 +260,102 @@ static const struct seq_operations afs_proc_cell_volumes_ops = { .show = afs_proc_cell_volumes_show, }; +static const char *const dns_record_sources[NR__dns_record_source + 1] = { + [DNS_RECORD_UNAVAILABLE] = "unav", + [DNS_RECORD_FROM_CONFIG] = "cfg", + [DNS_RECORD_FROM_DNS_A] = "A", + [DNS_RECORD_FROM_DNS_AFSDB] = "AFSDB", + [DNS_RECORD_FROM_DNS_SRV] = "SRV", + [DNS_RECORD_FROM_NSS] = "nss", + [NR__dns_record_source] = "[weird]" +}; + +static const char *const dns_lookup_statuses[NR__dns_lookup_status + 1] = { + [DNS_LOOKUP_NOT_DONE] = "no-lookup", + [DNS_LOOKUP_GOOD] = "good", + [DNS_LOOKUP_GOOD_WITH_BAD] = "good/bad", + [DNS_LOOKUP_BAD] = "bad", + [DNS_LOOKUP_GOT_NOT_FOUND] = "not-found", + [DNS_LOOKUP_GOT_LOCAL_FAILURE] = "local-failure", + [DNS_LOOKUP_GOT_TEMP_FAILURE] = "temp-failure", + [DNS_LOOKUP_GOT_NS_FAILURE] = "ns-failure", + [NR__dns_lookup_status] = "[weird]" +}; + /* * Display the list of Volume Location servers we're using for a cell. */ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v) { - struct sockaddr_rxrpc *addr = v; + const struct afs_vl_seq_net_private *priv = m->private; + const struct afs_vlserver_list *vllist = priv->vllist; + const struct afs_vlserver_entry *entry; + const struct afs_vlserver *vlserver; + const struct afs_addr_list *alist; + int i; - /* display header on line 1 */ - if (v == (void *)1) { - seq_puts(m, "ADDRESS\n"); + if (v == SEQ_START_TOKEN) { + seq_printf(m, "# source %s, status %s\n", + dns_record_sources[vllist->source], + dns_lookup_statuses[vllist->status]); return 0; } - /* display one cell per line on subsequent lines */ - seq_printf(m, "%pISp\n", &addr->transport); + entry = v; + vlserver = entry->server; + alist = rcu_dereference(vlserver->addresses); + + seq_printf(m, "%s [p=%hu w=%hu s=%s,%s]:\n", + vlserver->name, entry->priority, entry->weight, + dns_record_sources[alist ? alist->source : entry->source], + dns_lookup_statuses[alist ? alist->status : entry->status]); + if (alist) { + for (i = 0; i < alist->nr_addrs; i++) + seq_printf(m, " %c %pISpc\n", + alist->preferred == i ? '>' : '-', + &alist->addrs[i].transport); + } return 0; } static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos) __acquires(rcu) { - struct afs_addr_list *alist; + struct afs_vl_seq_net_private *priv = m->private; + struct afs_vlserver_list *vllist; struct afs_cell *cell = PDE_DATA(file_inode(m->file)); loff_t pos = *_pos; rcu_read_lock(); - alist = rcu_dereference(cell->vl_addrs); + vllist = rcu_dereference(cell->vl_servers); + priv->vllist = vllist; - /* allow for the header line */ - if (!pos) - return (void *) 1; - pos--; + if (pos < 0) + *_pos = pos = 0; + if (pos == 0) + return SEQ_START_TOKEN; - if (!alist || pos >= alist->nr_addrs) + if (!vllist || pos - 1 >= vllist->nr_servers) return NULL; - return alist->addrs + pos; + return &vllist->servers[pos - 1]; } static void *afs_proc_cell_vlservers_next(struct seq_file *m, void *v, loff_t *_pos) { - struct afs_addr_list *alist; - struct afs_cell *cell = PDE_DATA(file_inode(m->file)); + struct afs_vl_seq_net_private *priv = m->private; + struct afs_vlserver_list *vllist = priv->vllist; loff_t pos; - alist = rcu_dereference(cell->vl_addrs); - pos = *_pos; - (*_pos)++; - if (!alist || pos >= alist->nr_addrs) + pos++; + *_pos = pos; + if (!vllist || pos - 1 >= vllist->nr_servers) return NULL; - return alist->addrs + pos; + return &vllist->servers[pos - 1]; } static void afs_proc_cell_vlservers_stop(struct seq_file *m, void *v) @@ -337,11 +391,11 @@ static int afs_proc_servers_show(struct seq_file *m, void *v) &server->uuid, atomic_read(&server->usage), &alist->addrs[0].transport, - alist->index == 0 ? "*" : ""); + alist->preferred == 0 ? "*" : ""); for (i = 1; i < alist->nr_addrs; i++) seq_printf(m, " %pISpc%s\n", &alist->addrs[i].transport, - alist->index == i ? "*" : ""); + alist->preferred == i ? "*" : ""); return 0; } @@ -562,7 +616,7 @@ int afs_proc_cell_setup(struct afs_cell *cell) if (!proc_create_net_data("vlservers", 0444, dir, &afs_proc_cell_vlservers_ops, - sizeof(struct seq_net_private), + sizeof(struct afs_vl_seq_net_private), cell) || !proc_create_net_data("volumes", 0444, dir, &afs_proc_cell_volumes_ops, diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h new file mode 100644 index 000000000000..07bc10f076aa --- /dev/null +++ b/fs/afs/protocol_yfs.h @@ -0,0 +1,163 @@ +/* YFS protocol bits + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define YFS_FS_SERVICE 2500 +#define YFS_CM_SERVICE 2501 + +#define YFSCBMAX 1024 + +enum YFS_CM_Operations { + YFSCBProbe = 206, /* probe client */ + YFSCBGetLock = 207, /* get contents of CM lock table */ + YFSCBXStatsVersion = 209, /* get version of extended statistics */ + YFSCBGetXStats = 210, /* get contents of extended statistics data */ + YFSCBInitCallBackState3 = 213, /* initialise callback state, version 3 */ + YFSCBProbeUuid = 214, /* check the client hasn't rebooted */ + YFSCBGetServerPrefs = 215, + YFSCBGetCellServDV = 216, + YFSCBGetLocalCell = 217, + YFSCBGetCacheConfig = 218, + YFSCBGetCellByNum = 65537, + YFSCBTellMeAboutYourself = 65538, /* get client capabilities */ + YFSCBCallBack = 64204, +}; + +enum YFS_FS_Operations { + YFSFETCHACL = 64131, /* YFS Fetch file ACL */ + YFSFETCHSTATUS = 64132, /* YFS Fetch file status */ + YFSSTOREACL = 64134, /* YFS Store file ACL */ + YFSSTORESTATUS = 64135, /* YFS Store file status */ + YFSREMOVEFILE = 64136, /* YFS Remove a file */ + YFSCREATEFILE = 64137, /* YFS Create a file */ + YFSRENAME = 64138, /* YFS Rename or move a file or directory */ + YFSSYMLINK = 64139, /* YFS Create a symbolic link */ + YFSLINK = 64140, /* YFS Create a hard link */ + YFSMAKEDIR = 64141, /* YFS Create a directory */ + YFSREMOVEDIR = 64142, /* YFS Remove a directory */ + YFSGETVOLUMESTATUS = 64149, /* YFS Get volume status information */ + YFSSETVOLUMESTATUS = 64150, /* YFS Set volume status information */ + YFSSETLOCK = 64156, /* YFS Request a file lock */ + YFSEXTENDLOCK = 64157, /* YFS Extend a file lock */ + YFSRELEASELOCK = 64158, /* YFS Release a file lock */ + YFSLOOKUP = 64161, /* YFS lookup file in directory */ + YFSFLUSHCPS = 64165, + YFSFETCHOPAQUEACL = 64168, + YFSWHOAMI = 64170, + YFSREMOVEACL = 64171, + YFSREMOVEFILE2 = 64173, + YFSSTOREOPAQUEACL2 = 64174, + YFSINLINEBULKSTATUS = 64536, /* YFS Fetch multiple file statuses with errors */ + YFSFETCHDATA64 = 64537, /* YFS Fetch file data */ + YFSSTOREDATA64 = 64538, /* YFS Store file data */ + YFSUPDATESYMLINK = 64540, +}; + +struct yfs_xdr_u64 { + __be32 msw; + __be32 lsw; +} __packed; + +static inline u64 xdr_to_u64(const struct yfs_xdr_u64 x) +{ + return ((u64)ntohl(x.msw) << 32) | ntohl(x.lsw); +} + +static inline struct yfs_xdr_u64 u64_to_xdr(const u64 x) +{ + return (struct yfs_xdr_u64){ .msw = htonl(x >> 32), .lsw = htonl(x) }; +} + +struct yfs_xdr_vnode { + struct yfs_xdr_u64 lo; + __be32 hi; + __be32 unique; +} __packed; + +struct yfs_xdr_YFSFid { + struct yfs_xdr_u64 volume; + struct yfs_xdr_vnode vnode; +} __packed; + + +struct yfs_xdr_YFSFetchStatus { + __be32 type; + __be32 nlink; + struct yfs_xdr_u64 size; + struct yfs_xdr_u64 data_version; + struct yfs_xdr_u64 author; + struct yfs_xdr_u64 owner; + struct yfs_xdr_u64 group; + __be32 mode; + __be32 caller_access; + __be32 anon_access; + struct yfs_xdr_vnode parent; + __be32 data_access_protocol; + struct yfs_xdr_u64 mtime_client; + struct yfs_xdr_u64 mtime_server; + __be32 lock_count; + __be32 abort_code; +} __packed; + +struct yfs_xdr_YFSCallBack { + __be32 version; + struct yfs_xdr_u64 expiration_time; + __be32 type; +} __packed; + +struct yfs_xdr_YFSStoreStatus { + __be32 mask; + __be32 mode; + struct yfs_xdr_u64 mtime_client; + struct yfs_xdr_u64 owner; + struct yfs_xdr_u64 group; +} __packed; + +struct yfs_xdr_RPCFlags { + __be32 rpc_flags; +} __packed; + +struct yfs_xdr_YFSVolSync { + struct yfs_xdr_u64 vol_creation_date; + struct yfs_xdr_u64 vol_update_date; + struct yfs_xdr_u64 max_quota; + struct yfs_xdr_u64 blocks_in_use; + struct yfs_xdr_u64 blocks_avail; +} __packed; + +enum yfs_volume_type { + yfs_volume_type_ro = 0, + yfs_volume_type_rw = 1, +}; + +#define yfs_FVSOnline 0x1 +#define yfs_FVSInservice 0x2 +#define yfs_FVSBlessed 0x4 +#define yfs_FVSNeedsSalvage 0x8 + +struct yfs_xdr_YFSFetchVolumeStatus { + struct yfs_xdr_u64 vid; + struct yfs_xdr_u64 parent_id; + __be32 flags; + __be32 type; + struct yfs_xdr_u64 max_quota; + struct yfs_xdr_u64 blocks_in_use; + struct yfs_xdr_u64 part_blocks_avail; + struct yfs_xdr_u64 part_max_blocks; + struct yfs_xdr_u64 vol_copy_date; + struct yfs_xdr_u64 vol_backup_date; +} __packed; + +struct yfs_xdr_YFSStoreVolumeStatus { + __be32 mask; + struct yfs_xdr_u64 min_quota; + struct yfs_xdr_u64 max_quota; + struct yfs_xdr_u64 file_quota; +} __packed; diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index 1faef56b12bd..00504254c1c2 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -19,14 +19,6 @@ #include "afs_fs.h" /* - * Initialise a filesystem server cursor for iterating over FS servers. - */ -static void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode) -{ - memset(fc, 0, sizeof(*fc)); -} - -/* * Begin an operation on the fileserver. * * Fileserver operations are serialised on the server by vnode, so we serialise @@ -35,13 +27,14 @@ static void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode, struct key *key) { - afs_init_fs_cursor(fc, vnode); + memset(fc, 0, sizeof(*fc)); fc->vnode = vnode; fc->key = key; fc->ac.error = SHRT_MAX; + fc->error = -EDESTADDRREQ; if (mutex_lock_interruptible(&vnode->io_lock) < 0) { - fc->ac.error = -EINTR; + fc->error = -EINTR; fc->flags |= AFS_FS_CURSOR_STOP; return false; } @@ -65,12 +58,15 @@ static bool afs_start_fs_iteration(struct afs_fs_cursor *fc, fc->server_list = afs_get_serverlist(vnode->volume->servers); read_unlock(&vnode->volume->servers_lock); + fc->untried = (1UL << fc->server_list->nr_servers) - 1; + fc->index = READ_ONCE(fc->server_list->preferred); + cbi = vnode->cb_interest; if (cbi) { /* See if the vnode's preferred record is still available */ for (i = 0; i < fc->server_list->nr_servers; i++) { if (fc->server_list->servers[i].cb_interest == cbi) { - fc->start = i; + fc->index = i; goto found_interest; } } @@ -80,7 +76,7 @@ static bool afs_start_fs_iteration(struct afs_fs_cursor *fc, * and have to return an error. */ if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { - fc->ac.error = -ESTALE; + fc->error = -ESTALE; return false; } @@ -94,12 +90,9 @@ static bool afs_start_fs_iteration(struct afs_fs_cursor *fc, afs_put_cb_interest(afs_v2net(vnode), cbi); cbi = NULL; - } else { - fc->start = READ_ONCE(fc->server_list->index); } found_interest: - fc->index = fc->start; return true; } @@ -117,7 +110,7 @@ static void afs_busy(struct afs_volume *volume, u32 abort_code) default: m = "busy"; break; } - pr_notice("kAFS: Volume %u '%s' is %s\n", volume->vid, volume->name, m); + pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m); } /* @@ -127,7 +120,7 @@ static bool afs_sleep_and_retry(struct afs_fs_cursor *fc) { msleep_interruptible(1000); if (signal_pending(current)) { - fc->ac.error = -ERESTARTSYS; + fc->error = -ERESTARTSYS; return false; } @@ -143,27 +136,32 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) struct afs_addr_list *alist; struct afs_server *server; struct afs_vnode *vnode = fc->vnode; + u32 rtt, abort_code; + int error = fc->ac.error, i; - _enter("%u/%u,%u/%u,%d,%d", - fc->index, fc->start, - fc->ac.index, fc->ac.start, - fc->ac.error, fc->ac.abort_code); + _enter("%lx[%d],%lx[%d],%d,%d", + fc->untried, fc->index, + fc->ac.tried, fc->ac.index, + error, fc->ac.abort_code); if (fc->flags & AFS_FS_CURSOR_STOP) { _leave(" = f [stopped]"); return false; } + fc->nr_iterations++; + /* Evaluate the result of the previous operation, if there was one. */ - switch (fc->ac.error) { + switch (error) { case SHRT_MAX: goto start; case 0: default: /* Success or local failure. Stop. */ + fc->error = error; fc->flags |= AFS_FS_CURSOR_STOP; - _leave(" = f [okay/local %d]", fc->ac.error); + _leave(" = f [okay/local %d]", error); return false; case -ECONNABORTED: @@ -178,7 +176,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) * - May indicate that the fileserver couldn't attach to the vol. */ if (fc->flags & AFS_FS_CURSOR_VNOVOL) { - fc->ac.error = -EREMOTEIO; + fc->error = -EREMOTEIO; goto next_server; } @@ -187,12 +185,12 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) write_unlock(&vnode->volume->servers_lock); set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags); - fc->ac.error = afs_check_volume_status(vnode->volume, fc->key); - if (fc->ac.error < 0) - goto failed; + error = afs_check_volume_status(vnode->volume, fc->key); + if (error < 0) + goto failed_set_error; if (test_bit(AFS_VOLUME_DELETED, &vnode->volume->flags)) { - fc->ac.error = -ENOMEDIUM; + fc->error = -ENOMEDIUM; goto failed; } @@ -200,7 +198,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) * it's the fileserver having trouble. */ if (vnode->volume->servers == fc->server_list) { - fc->ac.error = -EREMOTEIO; + fc->error = -EREMOTEIO; goto next_server; } @@ -215,7 +213,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) case VONLINE: case VDISKFULL: case VOVERQUOTA: - fc->ac.error = afs_abort_to_error(fc->ac.abort_code); + fc->error = afs_abort_to_error(fc->ac.abort_code); goto next_server; case VOFFLINE: @@ -224,11 +222,11 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags); } if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) { - fc->ac.error = -EADV; + fc->error = -EADV; goto failed; } if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { - fc->ac.error = -ESTALE; + fc->error = -ESTALE; goto failed; } goto busy; @@ -240,7 +238,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) * have a file lock we need to maintain. */ if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) { - fc->ac.error = -EBUSY; + fc->error = -EBUSY; goto failed; } if (!test_and_set_bit(AFS_VOLUME_BUSY, &vnode->volume->flags)) { @@ -269,16 +267,16 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) * honour, just in case someone sets up a loop. */ if (fc->flags & AFS_FS_CURSOR_VMOVED) { - fc->ac.error = -EREMOTEIO; + fc->error = -EREMOTEIO; goto failed; } fc->flags |= AFS_FS_CURSOR_VMOVED; set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags); set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags); - fc->ac.error = afs_check_volume_status(vnode->volume, fc->key); - if (fc->ac.error < 0) - goto failed; + error = afs_check_volume_status(vnode->volume, fc->key); + if (error < 0) + goto failed_set_error; /* If the server list didn't change, then the VLDB is * out of sync with the fileservers. This is hopefully @@ -290,7 +288,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) * TODO: Retry a few times with sleeps. */ if (vnode->volume->servers == fc->server_list) { - fc->ac.error = -ENOMEDIUM; + fc->error = -ENOMEDIUM; goto failed; } @@ -299,20 +297,25 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) default: clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags); clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags); - fc->ac.error = afs_abort_to_error(fc->ac.abort_code); + fc->error = afs_abort_to_error(fc->ac.abort_code); goto failed; } + case -ETIMEDOUT: + case -ETIME: + if (fc->error != -EDESTADDRREQ) + goto iterate_address; + /* Fall through */ case -ENETUNREACH: case -EHOSTUNREACH: case -ECONNREFUSED: - case -ETIMEDOUT: - case -ETIME: _debug("no conn"); + fc->error = error; goto iterate_address; case -ECONNRESET: _debug("call reset"); + fc->error = error; goto failed; } @@ -328,15 +331,57 @@ start: /* See if we need to do an update of the volume record. Note that the * volume may have moved or even have been deleted. */ - fc->ac.error = afs_check_volume_status(vnode->volume, fc->key); - if (fc->ac.error < 0) - goto failed; + error = afs_check_volume_status(vnode->volume, fc->key); + if (error < 0) + goto failed_set_error; if (!afs_start_fs_iteration(fc, vnode)) goto failed; -use_server: - _debug("use"); + _debug("__ VOL %llx __", vnode->volume->vid); + error = afs_probe_fileservers(afs_v2net(vnode), fc->key, fc->server_list); + if (error < 0) + goto failed_set_error; + +pick_server: + _debug("pick [%lx]", fc->untried); + + error = afs_wait_for_fs_probes(fc->server_list, fc->untried); + if (error < 0) + goto failed_set_error; + + /* Pick the untried server with the lowest RTT. If we have outstanding + * callbacks, we stick with the server we're already using if we can. + */ + if (fc->cbi) { + _debug("cbi %u", fc->index); + if (test_bit(fc->index, &fc->untried)) + goto selected_server; + afs_put_cb_interest(afs_v2net(vnode), fc->cbi); + fc->cbi = NULL; + _debug("nocbi"); + } + + fc->index = -1; + rtt = U32_MAX; + for (i = 0; i < fc->server_list->nr_servers; i++) { + struct afs_server *s = fc->server_list->servers[i].server; + + if (!test_bit(i, &fc->untried) || !s->probe.responded) + continue; + if (s->probe.rtt < rtt) { + fc->index = i; + rtt = s->probe.rtt; + } + } + + if (fc->index == -1) + goto no_more_servers; + +selected_server: + _debug("use %d", fc->index); + __clear_bit(fc->index, &fc->untried); + /* We're starting on a different fileserver from the list. We need to * check it, create a callback intercept, find its address list and * probe its capabilities before we use it. @@ -354,10 +399,10 @@ use_server: * break request before we've finished decoding the reply and * installing the vnode. */ - fc->ac.error = afs_register_server_cb_interest(vnode, fc->server_list, - fc->index); - if (fc->ac.error < 0) - goto failed; + error = afs_register_server_cb_interest(vnode, fc->server_list, + fc->index); + if (error < 0) + goto failed_set_error; fc->cbi = afs_get_cb_interest(vnode->cb_interest); @@ -369,66 +414,88 @@ use_server: memset(&fc->ac, 0, sizeof(fc->ac)); - /* Probe the current fileserver if we haven't done so yet. */ - if (!test_bit(AFS_SERVER_FL_PROBED, &server->flags)) { - fc->ac.alist = afs_get_addrlist(alist); - - if (!afs_probe_fileserver(fc)) { - switch (fc->ac.error) { - case -ENOMEM: - case -ERESTARTSYS: - case -EINTR: - goto failed; - default: - goto next_server; - } - } - } - if (!fc->ac.alist) fc->ac.alist = alist; else afs_put_addrlist(alist); - fc->ac.start = READ_ONCE(alist->index); - fc->ac.index = fc->ac.start; + fc->ac.index = -1; iterate_address: ASSERT(fc->ac.alist); - _debug("iterate %d/%d", fc->ac.index, fc->ac.alist->nr_addrs); /* Iterate over the current server's address list to try and find an * address on which it will respond to us. */ if (!afs_iterate_addresses(&fc->ac)) goto next_server; + _debug("address [%u] %u/%u", fc->index, fc->ac.index, fc->ac.alist->nr_addrs); + _leave(" = t"); return true; next_server: _debug("next"); afs_end_cursor(&fc->ac); - afs_put_cb_interest(afs_v2net(vnode), fc->cbi); - fc->cbi = NULL; - fc->index++; - if (fc->index >= fc->server_list->nr_servers) - fc->index = 0; - if (fc->index != fc->start) - goto use_server; + goto pick_server; +no_more_servers: /* That's all the servers poked to no good effect. Try again if some * of them were busy. */ if (fc->flags & AFS_FS_CURSOR_VBUSY) goto restart_from_beginning; - fc->ac.error = -EDESTADDRREQ; - goto failed; + abort_code = 0; + error = -EDESTADDRREQ; + for (i = 0; i < fc->server_list->nr_servers; i++) { + struct afs_server *s = fc->server_list->servers[i].server; + int probe_error = READ_ONCE(s->probe.error); + + switch (probe_error) { + case 0: + continue; + default: + if (error == -ETIMEDOUT || + error == -ETIME) + continue; + case -ETIMEDOUT: + case -ETIME: + if (error == -ENOMEM || + error == -ENONET) + continue; + case -ENOMEM: + case -ENONET: + if (error == -ENETUNREACH) + continue; + case -ENETUNREACH: + if (error == -EHOSTUNREACH) + continue; + case -EHOSTUNREACH: + if (error == -ECONNREFUSED) + continue; + case -ECONNREFUSED: + if (error == -ECONNRESET) + continue; + case -ECONNRESET: /* Responded, but call expired. */ + if (error == -ECONNABORTED) + continue; + case -ECONNABORTED: + abort_code = s->probe.abort_code; + error = probe_error; + continue; + } + } + + if (error == -ECONNABORTED) + error = afs_abort_to_error(abort_code); +failed_set_error: + fc->error = error; failed: fc->flags |= AFS_FS_CURSOR_STOP; afs_end_cursor(&fc->ac); - _leave(" = f [failed %d]", fc->ac.error); + _leave(" = f [failed %d]", fc->error); return false; } @@ -442,13 +509,14 @@ bool afs_select_current_fileserver(struct afs_fs_cursor *fc) struct afs_vnode *vnode = fc->vnode; struct afs_cb_interest *cbi = vnode->cb_interest; struct afs_addr_list *alist; + int error = fc->ac.error; _enter(""); - switch (fc->ac.error) { + switch (error) { case SHRT_MAX: if (!cbi) { - fc->ac.error = -ESTALE; + fc->error = -ESTALE; fc->flags |= AFS_FS_CURSOR_STOP; return false; } @@ -461,25 +529,26 @@ bool afs_select_current_fileserver(struct afs_fs_cursor *fc) afs_get_addrlist(alist); read_unlock(&cbi->server->fs_lock); if (!alist) { - fc->ac.error = -ESTALE; + fc->error = -ESTALE; fc->flags |= AFS_FS_CURSOR_STOP; return false; } memset(&fc->ac, 0, sizeof(fc->ac)); fc->ac.alist = alist; - fc->ac.start = READ_ONCE(alist->index); - fc->ac.index = fc->ac.start; + fc->ac.index = -1; goto iterate_address; case 0: default: /* Success or local failure. Stop. */ + fc->error = error; fc->flags |= AFS_FS_CURSOR_STOP; - _leave(" = f [okay/local %d]", fc->ac.error); + _leave(" = f [okay/local %d]", error); return false; case -ECONNABORTED: + fc->error = afs_abort_to_error(fc->ac.abort_code); fc->flags |= AFS_FS_CURSOR_STOP; _leave(" = f [abort]"); return false; @@ -490,6 +559,7 @@ bool afs_select_current_fileserver(struct afs_fs_cursor *fc) case -ETIMEDOUT: case -ETIME: _debug("no conn"); + fc->error = error; goto iterate_address; } @@ -507,12 +577,65 @@ iterate_address: } /* + * Dump cursor state in the case of the error being EDESTADDRREQ. + */ +static void afs_dump_edestaddrreq(const struct afs_fs_cursor *fc) +{ + static int count; + int i; + + if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3) + return; + count++; + + rcu_read_lock(); + + pr_notice("EDESTADDR occurred\n"); + pr_notice("FC: cbb=%x cbb2=%x fl=%hx err=%hd\n", + fc->cb_break, fc->cb_break_2, fc->flags, fc->error); + pr_notice("FC: ut=%lx ix=%d ni=%u\n", + fc->untried, fc->index, fc->nr_iterations); + + if (fc->server_list) { + const struct afs_server_list *sl = fc->server_list; + pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n", + sl->nr_servers, sl->preferred, sl->vnovol_mask); + for (i = 0; i < sl->nr_servers; i++) { + const struct afs_server *s = sl->servers[i].server; + pr_notice("FC: server fl=%lx av=%u %pU\n", + s->flags, s->addr_version, &s->uuid); + if (s->addresses) { + const struct afs_addr_list *a = + rcu_dereference(s->addresses); + pr_notice("FC: - av=%u nr=%u/%u/%u pr=%u\n", + a->version, + a->nr_ipv4, a->nr_addrs, a->max_addrs, + a->preferred); + pr_notice("FC: - pr=%lx R=%lx F=%lx\n", + a->probed, a->responded, a->failed); + if (a == fc->ac.alist) + pr_notice("FC: - current\n"); + } + } + } + + pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n", + fc->ac.tried, fc->ac.index, fc->ac.abort_code, fc->ac.error, + fc->ac.responded, fc->ac.nr_iterations); + rcu_read_unlock(); +} + +/* * Tidy up a filesystem cursor and unlock the vnode. */ int afs_end_vnode_operation(struct afs_fs_cursor *fc) { struct afs_net *net = afs_v2net(fc->vnode); - int ret; + + if (fc->error == -EDESTADDRREQ || + fc->error == -ENETUNREACH || + fc->error == -EHOSTUNREACH) + afs_dump_edestaddrreq(fc); mutex_unlock(&fc->vnode->io_lock); @@ -520,9 +643,8 @@ int afs_end_vnode_operation(struct afs_fs_cursor *fc) afs_put_cb_interest(net, fc->cbi); afs_put_serverlist(net, fc->server_list); - ret = fc->ac.error; - if (ret == -ECONNABORTED) - afs_abort_to_error(fc->ac.abort_code); + if (fc->error == -ECONNABORTED) + fc->error = afs_abort_to_error(fc->ac.abort_code); - return fc->ac.error; + return fc->error; } diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 77a83790a31f..59970886690f 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -16,6 +16,7 @@ #include <net/af_rxrpc.h> #include "internal.h" #include "afs_cm.h" +#include "protocol_yfs.h" struct workqueue_struct *afs_async_calls; @@ -75,6 +76,18 @@ int afs_open_socket(struct afs_net *net) if (ret < 0) goto error_2; + srx.srx_service = YFS_CM_SERVICE; + ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); + if (ret < 0) + goto error_2; + + /* Ideally, we'd turn on service upgrade here, but we can't because + * OpenAFS is buggy and leaks the userStatus field from packet to + * packet and between FS packets and CB packets - so if we try to do an + * upgrade on an FS packet, OpenAFS will leak that into the CB packet + * it sends back to us. + */ + rxrpc_kernel_new_call_notification(socket, afs_rx_new_call, afs_rx_discard_new_call); @@ -143,6 +156,7 @@ static struct afs_call *afs_alloc_call(struct afs_net *net, INIT_WORK(&call->async_work, afs_process_async_call); init_waitqueue_head(&call->waitq); spin_lock_init(&call->state_lock); + call->_iter = &call->iter; o = atomic_inc_return(&net->nr_outstanding_calls); trace_afs_call(call, afs_call_trace_alloc, 1, o, @@ -176,6 +190,7 @@ void afs_put_call(struct afs_call *call) afs_put_server(call->net, call->cm_server); afs_put_cb_interest(call->net, call->cbi); + afs_put_addrlist(call->alist); kfree(call->request); trace_afs_call(call, afs_call_trace_free, 0, o, @@ -189,21 +204,22 @@ void afs_put_call(struct afs_call *call) } /* - * Queue the call for actual work. Returns 0 unconditionally for convenience. + * Queue the call for actual work. */ -int afs_queue_call_work(struct afs_call *call) +static void afs_queue_call_work(struct afs_call *call) { - int u = atomic_inc_return(&call->usage); + if (call->type->work) { + int u = atomic_inc_return(&call->usage); - trace_afs_call(call, afs_call_trace_work, u, - atomic_read(&call->net->nr_outstanding_calls), - __builtin_return_address(0)); + trace_afs_call(call, afs_call_trace_work, u, + atomic_read(&call->net->nr_outstanding_calls), + __builtin_return_address(0)); - INIT_WORK(&call->work, call->type->work); + INIT_WORK(&call->work, call->type->work); - if (!queue_work(afs_wq, &call->work)) - afs_put_call(call); - return 0; + if (!queue_work(afs_wq, &call->work)) + afs_put_call(call); + } } /* @@ -233,6 +249,7 @@ struct afs_call *afs_alloc_flat_call(struct afs_net *net, goto nomem_free; } + afs_extract_to_buf(call, call->reply_max); call->operation_ID = type->op; init_waitqueue_head(&call->waitq); return call; @@ -286,7 +303,7 @@ static void afs_load_bvec(struct afs_call *call, struct msghdr *msg, offset = 0; } - iov_iter_bvec(&msg->msg_iter, WRITE | ITER_BVEC, bv, nr, bytes); + iov_iter_bvec(&msg->msg_iter, WRITE, bv, nr, bytes); } /* @@ -342,7 +359,7 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg) long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp, bool async) { - struct sockaddr_rxrpc *srx = ac->addr; + struct sockaddr_rxrpc *srx = &ac->alist->addrs[ac->index]; struct rxrpc_call *rxcall; struct msghdr msg; struct kvec iov[1]; @@ -359,6 +376,8 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, atomic_read(&call->net->nr_outstanding_calls)); call->async = async; + call->addr_ix = ac->index; + call->alist = afs_get_addrlist(ac->alist); /* Work out the length we're going to transmit. This is awkward for * calls such as FS.StoreData where there's an extra injection of data @@ -390,6 +409,7 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, call->debug_id); if (IS_ERR(rxcall)) { ret = PTR_ERR(rxcall); + call->error = ret; goto error_kill_call; } @@ -401,8 +421,7 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1, - call->request_size); + iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, call->request_size); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = MSG_WAITALL | (call->send_pages ? MSG_MORE : 0); @@ -432,7 +451,7 @@ error_do_abort: rxrpc_kernel_abort_call(call->net->socket, rxcall, RX_USER_ABORT, ret, "KSD"); } else { - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, NULL, 0, 0); + iov_iter_kvec(&msg.msg_iter, READ, NULL, 0, 0); rxrpc_kernel_recv_data(call->net->socket, rxcall, &msg.msg_iter, false, &call->abort_code, &call->service_id); @@ -442,6 +461,8 @@ error_do_abort: call->error = ret; trace_afs_call_done(call); error_kill_call: + if (call->type->done) + call->type->done(call); afs_put_call(call); ac->error = ret; _leave(" = %d", ret); @@ -466,14 +487,12 @@ static void afs_deliver_to_call(struct afs_call *call) state == AFS_CALL_SV_AWAIT_ACK ) { if (state == AFS_CALL_SV_AWAIT_ACK) { - struct iov_iter iter; - - iov_iter_kvec(&iter, READ | ITER_KVEC, NULL, 0, 0); + iov_iter_kvec(&call->iter, READ, NULL, 0, 0); ret = rxrpc_kernel_recv_data(call->net->socket, - call->rxcall, &iter, false, - &remote_abort, + call->rxcall, &call->iter, + false, &remote_abort, &call->service_id); - trace_afs_recv_data(call, 0, 0, false, ret); + trace_afs_receive_data(call, &call->iter, false, ret); if (ret == -EINPROGRESS || ret == -EAGAIN) return; @@ -485,10 +504,17 @@ static void afs_deliver_to_call(struct afs_call *call) return; } + if (call->want_reply_time && + rxrpc_kernel_get_reply_time(call->net->socket, + call->rxcall, + &call->reply_time)) + call->want_reply_time = false; + ret = call->type->deliver(call); state = READ_ONCE(call->state); switch (ret) { case 0: + afs_queue_call_work(call); if (state == AFS_CALL_CL_PROC_REPLY) { if (call->cbi) set_bit(AFS_SERVER_FL_MAY_HAVE_CB, @@ -500,7 +526,6 @@ static void afs_deliver_to_call(struct afs_call *call) case -EINPROGRESS: case -EAGAIN: goto out; - case -EIO: case -ECONNABORTED: ASSERTCMP(state, ==, AFS_CALL_COMPLETE); goto done; @@ -509,6 +534,10 @@ static void afs_deliver_to_call(struct afs_call *call) rxrpc_kernel_abort_call(call->net->socket, call->rxcall, abort_code, ret, "KIV"); goto local_abort; + case -EIO: + pr_err("kAFS: Call %u in bad state %u\n", + call->debug_id, state); + /* Fall through */ case -ENODATA: case -EBADMSG: case -EMSGSIZE: @@ -517,12 +546,14 @@ static void afs_deliver_to_call(struct afs_call *call) if (state != AFS_CALL_CL_AWAIT_REPLY) abort_code = RXGEN_SS_UNMARSHAL; rxrpc_kernel_abort_call(call->net->socket, call->rxcall, - abort_code, -EBADMSG, "KUM"); + abort_code, ret, "KUM"); goto local_abort; } } done: + if (call->type->done) + call->type->done(call); if (state == AFS_CALL_COMPLETE && call->incoming) afs_put_call(call); out: @@ -728,6 +759,7 @@ void afs_charge_preallocation(struct work_struct *work) call->async = true; call->state = AFS_CALL_SV_AWAIT_OP_ID; init_waitqueue_head(&call->waitq); + afs_extract_to_tmp(call); } if (rxrpc_kernel_charge_accept(net->socket, @@ -773,18 +805,15 @@ static int afs_deliver_cm_op_id(struct afs_call *call) { int ret; - _enter("{%zu}", call->offset); - - ASSERTCMP(call->offset, <, 4); + _enter("{%zu}", iov_iter_count(call->_iter)); /* the operation ID forms the first four bytes of the request data */ - ret = afs_extract_data(call, &call->tmp, 4, true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; call->operation_ID = ntohl(call->tmp); afs_set_call_state(call, AFS_CALL_SV_AWAIT_OP_ID, AFS_CALL_SV_AWAIT_REQUEST); - call->offset = 0; /* ask the cache manager to route the call (it'll change the call type * if successful) */ @@ -825,7 +854,7 @@ void afs_send_empty_reply(struct afs_call *call) msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, NULL, 0, 0); + iov_iter_kvec(&msg.msg_iter, WRITE, NULL, 0, 0); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -864,7 +893,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) iov[0].iov_len = len; msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1, len); + iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -888,30 +917,19 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) /* * Extract a piece of data from the received data socket buffers. */ -int afs_extract_data(struct afs_call *call, void *buf, size_t count, - bool want_more) +int afs_extract_data(struct afs_call *call, bool want_more) { struct afs_net *net = call->net; - struct iov_iter iter; - struct kvec iov; + struct iov_iter *iter = call->_iter; enum afs_call_state state; u32 remote_abort = 0; int ret; - _enter("{%s,%zu},,%zu,%d", - call->type->name, call->offset, count, want_more); - - ASSERTCMP(call->offset, <=, count); - - iov.iov_base = buf + call->offset; - iov.iov_len = count - call->offset; - iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, count - call->offset); + _enter("{%s,%zu},%d", call->type->name, iov_iter_count(iter), want_more); - ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, &iter, + ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, iter, want_more, &remote_abort, &call->service_id); - call->offset += (count - call->offset) - iov_iter_count(&iter); - trace_afs_recv_data(call, count, call->offset, want_more, ret); if (ret == 0 || ret == -EAGAIN) return ret; @@ -926,7 +944,7 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count, break; case AFS_CALL_COMPLETE: kdebug("prem complete %d", call->error); - return -EIO; + return afs_io_error(call, afs_io_error_extract); default: break; } @@ -940,8 +958,9 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count, /* * Log protocol error production. */ -noinline int afs_protocol_error(struct afs_call *call, int error) +noinline int afs_protocol_error(struct afs_call *call, int error, + enum afs_eproto_cause cause) { - trace_afs_protocol_error(call, error, __builtin_return_address(0)); + trace_afs_protocol_error(call, error, cause); return error; } diff --git a/fs/afs/security.c b/fs/afs/security.c index 81dfedb7879f..5f58a9a17e69 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -126,7 +126,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, bool changed = false; int i, j; - _enter("{%x:%u},%x,%x", + _enter("{%llx:%llu},%x,%x", vnode->fid.vid, vnode->fid.vnode, key_serial(key), caller_access); rcu_read_lock(); @@ -147,7 +147,8 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, break; } - if (cb_break != afs_cb_break_sum(vnode, vnode->cb_interest)) { + if (afs_cb_is_broken(cb_break, vnode, + vnode->cb_interest)) { changed = true; break; } @@ -177,7 +178,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, } } - if (cb_break != afs_cb_break_sum(vnode, vnode->cb_interest)) + if (afs_cb_is_broken(cb_break, vnode, vnode->cb_interest)) goto someone_else_changed_it; /* We need a ref on any permits list we want to copy as we'll have to @@ -256,7 +257,7 @@ found: spin_lock(&vnode->lock); zap = rcu_access_pointer(vnode->permit_cache); - if (cb_break == afs_cb_break_sum(vnode, vnode->cb_interest) && + if (!afs_cb_is_broken(cb_break, vnode, vnode->cb_interest) && zap == permits) rcu_assign_pointer(vnode->permit_cache, replacement); else @@ -289,7 +290,7 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key, bool valid = false; int i, ret; - _enter("{%x:%u},%x", + _enter("{%llx:%llu},%x", vnode->fid.vid, vnode->fid.vnode, key_serial(key)); /* check the permits to see if we've got one yet */ @@ -349,7 +350,7 @@ int afs_permission(struct inode *inode, int mask) if (mask & MAY_NOT_BLOCK) return -ECHILD; - _enter("{{%x:%u},%lx},%x,", + _enter("{{%llx:%llu},%lx},%x,", vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask); key = afs_request_key(vnode->volume->cell); diff --git a/fs/afs/server.c b/fs/afs/server.c index 1d329e6981d5..642afa2e9783 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include "afs_fs.h" #include "internal.h" +#include "protocol_yfs.h" static unsigned afs_server_gc_delay = 10; /* Server record timeout in seconds */ static unsigned afs_server_update_delay = 30; /* Time till VLDB recheck in secs */ @@ -230,6 +231,8 @@ static struct afs_server *afs_alloc_server(struct afs_net *net, rwlock_init(&server->fs_lock); INIT_HLIST_HEAD(&server->cb_volumes); rwlock_init(&server->cb_break_lock); + init_waitqueue_head(&server->probe_wq); + spin_lock_init(&server->probe_lock); afs_inc_servers_outstanding(net); _leave(" = %p", server); @@ -246,41 +249,23 @@ enomem: static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell, struct key *key, const uuid_t *uuid) { - struct afs_addr_cursor ac; - struct afs_addr_list *alist; + struct afs_vl_cursor vc; + struct afs_addr_list *alist = NULL; int ret; - ret = afs_set_vl_cursor(&ac, cell); - if (ret < 0) - return ERR_PTR(ret); - - while (afs_iterate_addresses(&ac)) { - if (test_bit(ac.index, &ac.alist->yfs)) - alist = afs_yfsvl_get_endpoints(cell->net, &ac, key, uuid); - else - alist = afs_vl_get_addrs_u(cell->net, &ac, key, uuid); - switch (ac.error) { - case 0: - afs_end_cursor(&ac); - return alist; - case -ECONNABORTED: - ac.error = afs_abort_to_error(ac.abort_code); - goto error; - case -ENOMEM: - case -ENONET: - goto error; - case -ENETUNREACH: - case -EHOSTUNREACH: - case -ECONNREFUSED: - break; - default: - ac.error = -EIO; - goto error; + ret = -ERESTARTSYS; + if (afs_begin_vlserver_operation(&vc, cell, key)) { + while (afs_select_vlserver(&vc)) { + if (test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags)) + alist = afs_yfsvl_get_endpoints(&vc, uuid); + else + alist = afs_vl_get_addrs_u(&vc, uuid); } + + ret = afs_end_vlserver_operation(&vc); } -error: - return ERR_PTR(afs_end_cursor(&ac)); + return ret < 0 ? ERR_PTR(ret) : alist; } /* @@ -382,9 +367,7 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server) struct afs_addr_list *alist = rcu_access_pointer(server->addresses); struct afs_addr_cursor ac = { .alist = alist, - .start = alist->index, - .index = 0, - .addr = &alist->addrs[alist->index], + .index = alist->preferred, .error = 0, }; _enter("%p", server); @@ -392,6 +375,9 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server) if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags)) afs_fs_give_up_all_callbacks(net, server, &ac, NULL); + wait_var_event(&server->probe_outstanding, + atomic_read(&server->probe_outstanding) == 0); + call_rcu(&server->rcu, afs_server_rcu); afs_dec_servers_outstanding(net); } @@ -525,99 +511,6 @@ void afs_purge_servers(struct afs_net *net) } /* - * Probe a fileserver to find its capabilities. - * - * TODO: Try service upgrade. - */ -static bool afs_do_probe_fileserver(struct afs_fs_cursor *fc) -{ - _enter(""); - - fc->ac.addr = NULL; - fc->ac.start = READ_ONCE(fc->ac.alist->index); - fc->ac.index = fc->ac.start; - fc->ac.error = 0; - fc->ac.begun = false; - - while (afs_iterate_addresses(&fc->ac)) { - afs_fs_get_capabilities(afs_v2net(fc->vnode), fc->cbi->server, - &fc->ac, fc->key); - switch (fc->ac.error) { - case 0: - afs_end_cursor(&fc->ac); - set_bit(AFS_SERVER_FL_PROBED, &fc->cbi->server->flags); - return true; - case -ECONNABORTED: - fc->ac.error = afs_abort_to_error(fc->ac.abort_code); - goto error; - case -ENOMEM: - case -ENONET: - goto error; - case -ENETUNREACH: - case -EHOSTUNREACH: - case -ECONNREFUSED: - case -ETIMEDOUT: - case -ETIME: - break; - default: - fc->ac.error = -EIO; - goto error; - } - } - -error: - afs_end_cursor(&fc->ac); - return false; -} - -/* - * If we haven't already, try probing the fileserver to get its capabilities. - * We try not to instigate parallel probes, but it's possible that the parallel - * probes will fail due to authentication failure when ours would succeed. - * - * TODO: Try sending an anonymous probe if an authenticated probe fails. - */ -bool afs_probe_fileserver(struct afs_fs_cursor *fc) -{ - bool success; - int ret, retries = 0; - - _enter(""); - -retry: - if (test_bit(AFS_SERVER_FL_PROBED, &fc->cbi->server->flags)) { - _leave(" = t"); - return true; - } - - if (!test_and_set_bit_lock(AFS_SERVER_FL_PROBING, &fc->cbi->server->flags)) { - success = afs_do_probe_fileserver(fc); - clear_bit_unlock(AFS_SERVER_FL_PROBING, &fc->cbi->server->flags); - wake_up_bit(&fc->cbi->server->flags, AFS_SERVER_FL_PROBING); - _leave(" = t"); - return success; - } - - _debug("wait"); - ret = wait_on_bit(&fc->cbi->server->flags, AFS_SERVER_FL_PROBING, - TASK_INTERRUPTIBLE); - if (ret == -ERESTARTSYS) { - fc->ac.error = ret; - _leave(" = f [%d]", ret); - return false; - } - - retries++; - if (retries == 4) { - fc->ac.error = -ESTALE; - _leave(" = f [stale]"); - return false; - } - _debug("retry"); - goto retry; -} - -/* * Get an update for a server's address list. */ static noinline bool afs_update_server_record(struct afs_fs_cursor *fc, struct afs_server *server) diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index 8a5760aa5832..95d0761cdb34 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -118,11 +118,11 @@ bool afs_annotate_server_list(struct afs_server_list *new, return false; changed: - /* Maintain the same current server as before if possible. */ - cur = old->servers[old->index].server; + /* Maintain the same preferred server as before if possible. */ + cur = old->servers[old->preferred].server; for (j = 0; j < new->nr_servers; j++) { if (new->servers[j].server == cur) { - new->index = j; + new->preferred = j; break; } } diff --git a/fs/afs/super.c b/fs/afs/super.c index 4d3e274207fb..dcd07fe99871 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -406,10 +406,11 @@ static int afs_fill_super(struct super_block *sb, inode = afs_iget_pseudo_dir(sb, true); sb->s_flags |= SB_RDONLY; } else { - sprintf(sb->s_id, "%u", as->volume->vid); + sprintf(sb->s_id, "%llu", as->volume->vid); afs_activate_volume(as->volume); fid.vid = as->volume->vid; fid.vnode = 1; + fid.vnode_hi = 0; fid.unique = 1; inode = afs_iget(sb, params->key, &fid, NULL, NULL, NULL); } @@ -663,7 +664,7 @@ static void afs_destroy_inode(struct inode *inode) { struct afs_vnode *vnode = AFS_FS_I(inode); - _enter("%p{%x:%u}", inode, vnode->fid.vid, vnode->fid.vnode); + _enter("%p{%llx:%llu}", inode, vnode->fid.vid, vnode->fid.vnode); _debug("DESTROY INODE %p", inode); diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c new file mode 100644 index 000000000000..b4f1a84519b9 --- /dev/null +++ b/fs/afs/vl_list.c @@ -0,0 +1,340 @@ +/* AFS vlserver list management. + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include "internal.h" + +struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len, + unsigned short port) +{ + struct afs_vlserver *vlserver; + + vlserver = kzalloc(struct_size(vlserver, name, name_len + 1), + GFP_KERNEL); + if (vlserver) { + atomic_set(&vlserver->usage, 1); + rwlock_init(&vlserver->lock); + init_waitqueue_head(&vlserver->probe_wq); + spin_lock_init(&vlserver->probe_lock); + vlserver->name_len = name_len; + vlserver->port = port; + memcpy(vlserver->name, name, name_len); + } + return vlserver; +} + +static void afs_vlserver_rcu(struct rcu_head *rcu) +{ + struct afs_vlserver *vlserver = container_of(rcu, struct afs_vlserver, rcu); + + afs_put_addrlist(rcu_access_pointer(vlserver->addresses)); + kfree_rcu(vlserver, rcu); +} + +void afs_put_vlserver(struct afs_net *net, struct afs_vlserver *vlserver) +{ + if (vlserver) { + unsigned int u = atomic_dec_return(&vlserver->usage); + //_debug("VL PUT %p{%u}", vlserver, u); + + if (u == 0) + call_rcu(&vlserver->rcu, afs_vlserver_rcu); + } +} + +struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int nr_servers) +{ + struct afs_vlserver_list *vllist; + + vllist = kzalloc(struct_size(vllist, servers, nr_servers), GFP_KERNEL); + if (vllist) { + atomic_set(&vllist->usage, 1); + rwlock_init(&vllist->lock); + } + + return vllist; +} + +void afs_put_vlserverlist(struct afs_net *net, struct afs_vlserver_list *vllist) +{ + if (vllist) { + unsigned int u = atomic_dec_return(&vllist->usage); + + //_debug("VLLS PUT %p{%u}", vllist, u); + if (u == 0) { + int i; + + for (i = 0; i < vllist->nr_servers; i++) { + afs_put_vlserver(net, vllist->servers[i].server); + } + kfree_rcu(vllist, rcu); + } + } +} + +static u16 afs_extract_le16(const u8 **_b) +{ + u16 val; + + val = (u16)*(*_b)++ << 0; + val |= (u16)*(*_b)++ << 8; + return val; +} + +/* + * Build a VL server address list from a DNS queried server list. + */ +static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end, + u8 nr_addrs, u16 port) +{ + struct afs_addr_list *alist; + const u8 *b = *_b; + int ret = -EINVAL; + + alist = afs_alloc_addrlist(nr_addrs, VL_SERVICE, port); + if (!alist) + return ERR_PTR(-ENOMEM); + if (nr_addrs == 0) + return alist; + + for (; nr_addrs > 0 && end - b >= nr_addrs; nr_addrs--) { + struct dns_server_list_v1_address hdr; + __be32 x[4]; + + hdr.address_type = *b++; + + switch (hdr.address_type) { + case DNS_ADDRESS_IS_IPV4: + if (end - b < 4) { + _leave(" = -EINVAL [short inet]"); + goto error; + } + memcpy(x, b, 4); + afs_merge_fs_addr4(alist, x[0], port); + b += 4; + break; + + case DNS_ADDRESS_IS_IPV6: + if (end - b < 16) { + _leave(" = -EINVAL [short inet6]"); + goto error; + } + memcpy(x, b, 16); + afs_merge_fs_addr6(alist, x, port); + b += 16; + break; + + default: + _leave(" = -EADDRNOTAVAIL [unknown af %u]", + hdr.address_type); + ret = -EADDRNOTAVAIL; + goto error; + } + } + + /* Start with IPv6 if available. */ + if (alist->nr_ipv4 < alist->nr_addrs) + alist->preferred = alist->nr_ipv4; + + *_b = b; + return alist; + +error: + *_b = b; + afs_put_addrlist(alist); + return ERR_PTR(ret); +} + +/* + * Build a VL server list from a DNS queried server list. + */ +struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell, + const void *buffer, + size_t buffer_size) +{ + const struct dns_server_list_v1_header *hdr = buffer; + struct dns_server_list_v1_server bs; + struct afs_vlserver_list *vllist, *previous; + struct afs_addr_list *addrs; + struct afs_vlserver *server; + const u8 *b = buffer, *end = buffer + buffer_size; + int ret = -ENOMEM, nr_servers, i, j; + + _enter(""); + + /* Check that it's a server list, v1 */ + if (end - b < sizeof(*hdr) || + hdr->hdr.content != DNS_PAYLOAD_IS_SERVER_LIST || + hdr->hdr.version != 1) { + pr_notice("kAFS: Got DNS record [%u,%u] len %zu\n", + hdr->hdr.content, hdr->hdr.version, end - b); + ret = -EDESTADDRREQ; + goto dump; + } + + nr_servers = hdr->nr_servers; + + vllist = afs_alloc_vlserver_list(nr_servers); + if (!vllist) + return ERR_PTR(-ENOMEM); + + vllist->source = (hdr->source < NR__dns_record_source) ? + hdr->source : NR__dns_record_source; + vllist->status = (hdr->status < NR__dns_lookup_status) ? + hdr->status : NR__dns_lookup_status; + + read_lock(&cell->vl_servers_lock); + previous = afs_get_vlserverlist( + rcu_dereference_protected(cell->vl_servers, + lockdep_is_held(&cell->vl_servers_lock))); + read_unlock(&cell->vl_servers_lock); + + b += sizeof(*hdr); + while (end - b >= sizeof(bs)) { + bs.name_len = afs_extract_le16(&b); + bs.priority = afs_extract_le16(&b); + bs.weight = afs_extract_le16(&b); + bs.port = afs_extract_le16(&b); + bs.source = *b++; + bs.status = *b++; + bs.protocol = *b++; + bs.nr_addrs = *b++; + + _debug("extract %u %u %u %u %u %u %*.*s", + bs.name_len, bs.priority, bs.weight, + bs.port, bs.protocol, bs.nr_addrs, + bs.name_len, bs.name_len, b); + + if (end - b < bs.name_len) + break; + + ret = -EPROTONOSUPPORT; + if (bs.protocol == DNS_SERVER_PROTOCOL_UNSPECIFIED) { + bs.protocol = DNS_SERVER_PROTOCOL_UDP; + } else if (bs.protocol != DNS_SERVER_PROTOCOL_UDP) { + _leave(" = [proto %u]", bs.protocol); + goto error; + } + + if (bs.port == 0) + bs.port = AFS_VL_PORT; + if (bs.source > NR__dns_record_source) + bs.source = NR__dns_record_source; + if (bs.status > NR__dns_lookup_status) + bs.status = NR__dns_lookup_status; + + server = NULL; + if (previous) { + /* See if we can update an old server record */ + for (i = 0; i < previous->nr_servers; i++) { + struct afs_vlserver *p = previous->servers[i].server; + + if (p->name_len == bs.name_len && + p->port == bs.port && + strncasecmp(b, p->name, bs.name_len) == 0) { + server = afs_get_vlserver(p); + break; + } + } + } + + if (!server) { + ret = -ENOMEM; + server = afs_alloc_vlserver(b, bs.name_len, bs.port); + if (!server) + goto error; + } + + b += bs.name_len; + + /* Extract the addresses - note that we can't skip this as we + * have to advance the payload pointer. + */ + addrs = afs_extract_vl_addrs(&b, end, bs.nr_addrs, bs.port); + if (IS_ERR(addrs)) { + ret = PTR_ERR(addrs); + goto error_2; + } + + if (vllist->nr_servers >= nr_servers) { + _debug("skip %u >= %u", vllist->nr_servers, nr_servers); + afs_put_addrlist(addrs); + afs_put_vlserver(cell->net, server); + continue; + } + + addrs->source = bs.source; + addrs->status = bs.status; + + if (addrs->nr_addrs == 0) { + afs_put_addrlist(addrs); + if (!rcu_access_pointer(server->addresses)) { + afs_put_vlserver(cell->net, server); + continue; + } + } else { + struct afs_addr_list *old = addrs; + + write_lock(&server->lock); + rcu_swap_protected(server->addresses, old, + lockdep_is_held(&server->lock)); + write_unlock(&server->lock); + afs_put_addrlist(old); + } + + + /* TODO: Might want to check for duplicates */ + + /* Insertion-sort by priority and weight */ + for (j = 0; j < vllist->nr_servers; j++) { + if (bs.priority < vllist->servers[j].priority) + break; /* Lower preferable */ + if (bs.priority == vllist->servers[j].priority && + bs.weight > vllist->servers[j].weight) + break; /* Higher preferable */ + } + + if (j < vllist->nr_servers) { + memmove(vllist->servers + j + 1, + vllist->servers + j, + (vllist->nr_servers - j) * sizeof(struct afs_vlserver_entry)); + } + + clear_bit(AFS_VLSERVER_FL_PROBED, &server->flags); + + vllist->servers[j].priority = bs.priority; + vllist->servers[j].weight = bs.weight; + vllist->servers[j].server = server; + vllist->nr_servers++; + } + + if (b != end) { + _debug("parse error %zd", b - end); + goto error; + } + + afs_put_vlserverlist(cell->net, previous); + _leave(" = ok [%u]", vllist->nr_servers); + return vllist; + +error_2: + afs_put_vlserver(cell->net, server); +error: + afs_put_vlserverlist(cell->net, vllist); + afs_put_vlserverlist(cell->net, previous); +dump: + if (ret != -ENOMEM) { + printk(KERN_DEBUG "DNS: at %zu\n", (const void *)b - buffer); + print_hex_dump_bytes("DNS: ", DUMP_PREFIX_NONE, buffer, buffer_size); + } + return ERR_PTR(ret); +} diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c new file mode 100644 index 000000000000..c0f616bd70cb --- /dev/null +++ b/fs/afs/vl_probe.c @@ -0,0 +1,273 @@ +/* AFS vlserver probing + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include "afs_fs.h" +#include "internal.h" +#include "protocol_yfs.h" + +static bool afs_vl_probe_done(struct afs_vlserver *server) +{ + if (!atomic_dec_and_test(&server->probe_outstanding)) + return false; + + wake_up_var(&server->probe_outstanding); + clear_bit_unlock(AFS_VLSERVER_FL_PROBING, &server->flags); + wake_up_bit(&server->flags, AFS_VLSERVER_FL_PROBING); + return true; +} + +/* + * Process the result of probing a vlserver. This is called after successful + * or failed delivery of an VL.GetCapabilities operation. + */ +void afs_vlserver_probe_result(struct afs_call *call) +{ + struct afs_addr_list *alist = call->alist; + struct afs_vlserver *server = call->reply[0]; + unsigned int server_index = (long)call->reply[1]; + unsigned int index = call->addr_ix; + unsigned int rtt = UINT_MAX; + bool have_result = false; + u64 _rtt; + int ret = call->error; + + _enter("%s,%u,%u,%d,%d", server->name, server_index, index, ret, call->abort_code); + + spin_lock(&server->probe_lock); + + switch (ret) { + case 0: + server->probe.error = 0; + goto responded; + case -ECONNABORTED: + if (!server->probe.responded) { + server->probe.abort_code = call->abort_code; + server->probe.error = ret; + } + goto responded; + case -ENOMEM: + case -ENONET: + server->probe.local_failure = true; + afs_io_error(call, afs_io_error_vl_probe_fail); + goto out; + case -ECONNRESET: /* Responded, but call expired. */ + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + case -ETIMEDOUT: + case -ETIME: + default: + clear_bit(index, &alist->responded); + set_bit(index, &alist->failed); + if (!server->probe.responded && + (server->probe.error == 0 || + server->probe.error == -ETIMEDOUT || + server->probe.error == -ETIME)) + server->probe.error = ret; + afs_io_error(call, afs_io_error_vl_probe_fail); + goto out; + } + +responded: + set_bit(index, &alist->responded); + clear_bit(index, &alist->failed); + + if (call->service_id == YFS_VL_SERVICE) { + server->probe.is_yfs = true; + set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags); + alist->addrs[index].srx_service = call->service_id; + } else { + server->probe.not_yfs = true; + if (!server->probe.is_yfs) { + clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags); + alist->addrs[index].srx_service = call->service_id; + } + } + + /* Get the RTT and scale it to fit into a 32-bit value that represents + * over a minute of time so that we can access it with one instruction + * on a 32-bit system. + */ + _rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall); + _rtt /= 64; + rtt = (_rtt > UINT_MAX) ? UINT_MAX : _rtt; + if (rtt < server->probe.rtt) { + server->probe.rtt = rtt; + alist->preferred = index; + have_result = true; + } + + smp_wmb(); /* Set rtt before responded. */ + server->probe.responded = true; + set_bit(AFS_VLSERVER_FL_PROBED, &server->flags); +out: + spin_unlock(&server->probe_lock); + + _debug("probe [%u][%u] %pISpc rtt=%u ret=%d", + server_index, index, &alist->addrs[index].transport, + (unsigned int)rtt, ret); + + have_result |= afs_vl_probe_done(server); + if (have_result) { + server->probe.have_result = true; + wake_up_var(&server->probe.have_result); + wake_up_all(&server->probe_wq); + } +} + +/* + * Probe all of a vlserver's addresses to find out the best route and to + * query its capabilities. + */ +static int afs_do_probe_vlserver(struct afs_net *net, + struct afs_vlserver *server, + struct key *key, + unsigned int server_index) +{ + struct afs_addr_cursor ac = { + .index = 0, + }; + int ret; + + _enter("%s", server->name); + + read_lock(&server->lock); + ac.alist = rcu_dereference_protected(server->addresses, + lockdep_is_held(&server->lock)); + read_unlock(&server->lock); + + atomic_set(&server->probe_outstanding, ac.alist->nr_addrs); + memset(&server->probe, 0, sizeof(server->probe)); + server->probe.rtt = UINT_MAX; + + for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) { + ret = afs_vl_get_capabilities(net, &ac, key, server, + server_index, true); + if (ret != -EINPROGRESS) { + afs_vl_probe_done(server); + return ret; + } + } + + return 0; +} + +/* + * Send off probes to all unprobed servers. + */ +int afs_send_vl_probes(struct afs_net *net, struct key *key, + struct afs_vlserver_list *vllist) +{ + struct afs_vlserver *server; + int i, ret; + + for (i = 0; i < vllist->nr_servers; i++) { + server = vllist->servers[i].server; + if (test_bit(AFS_VLSERVER_FL_PROBED, &server->flags)) + continue; + + if (!test_and_set_bit_lock(AFS_VLSERVER_FL_PROBING, &server->flags)) { + ret = afs_do_probe_vlserver(net, server, key, i); + if (ret) + return ret; + } + } + + return 0; +} + +/* + * Wait for the first as-yet untried server to respond. + */ +int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist, + unsigned long untried) +{ + struct wait_queue_entry *waits; + struct afs_vlserver *server; + unsigned int rtt = UINT_MAX; + bool have_responders = false; + int pref = -1, i; + + _enter("%u,%lx", vllist->nr_servers, untried); + + /* Only wait for servers that have a probe outstanding. */ + for (i = 0; i < vllist->nr_servers; i++) { + if (test_bit(i, &untried)) { + server = vllist->servers[i].server; + if (!test_bit(AFS_VLSERVER_FL_PROBING, &server->flags)) + __clear_bit(i, &untried); + if (server->probe.responded) + have_responders = true; + } + } + if (have_responders || !untried) + return 0; + + waits = kmalloc(array_size(vllist->nr_servers, sizeof(*waits)), GFP_KERNEL); + if (!waits) + return -ENOMEM; + + for (i = 0; i < vllist->nr_servers; i++) { + if (test_bit(i, &untried)) { + server = vllist->servers[i].server; + init_waitqueue_entry(&waits[i], current); + add_wait_queue(&server->probe_wq, &waits[i]); + } + } + + for (;;) { + bool still_probing = false; + + set_current_state(TASK_INTERRUPTIBLE); + for (i = 0; i < vllist->nr_servers; i++) { + if (test_bit(i, &untried)) { + server = vllist->servers[i].server; + if (server->probe.responded) + goto stop; + if (test_bit(AFS_VLSERVER_FL_PROBING, &server->flags)) + still_probing = true; + } + } + + if (!still_probing || unlikely(signal_pending(current))) + goto stop; + schedule(); + } + +stop: + set_current_state(TASK_RUNNING); + + for (i = 0; i < vllist->nr_servers; i++) { + if (test_bit(i, &untried)) { + server = vllist->servers[i].server; + if (server->probe.responded && + server->probe.rtt < rtt) { + pref = i; + rtt = server->probe.rtt; + } + + remove_wait_queue(&server->probe_wq, &waits[i]); + } + } + + kfree(waits); + + if (pref == -1 && signal_pending(current)) + return -ERESTARTSYS; + + if (pref >= 0) + vllist->preferred = pref; + + _leave(" = 0 [%u]", pref); + return 0; +} diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c new file mode 100644 index 000000000000..b64a284b99d2 --- /dev/null +++ b/fs/afs/vl_rotate.c @@ -0,0 +1,355 @@ +/* Handle vlserver selection and rotation. + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/sched/signal.h> +#include "internal.h" +#include "afs_vl.h" + +/* + * Begin an operation on a volume location server. + */ +bool afs_begin_vlserver_operation(struct afs_vl_cursor *vc, struct afs_cell *cell, + struct key *key) +{ + memset(vc, 0, sizeof(*vc)); + vc->cell = cell; + vc->key = key; + vc->error = -EDESTADDRREQ; + vc->ac.error = SHRT_MAX; + + if (signal_pending(current)) { + vc->error = -EINTR; + vc->flags |= AFS_VL_CURSOR_STOP; + return false; + } + + return true; +} + +/* + * Begin iteration through a server list, starting with the last used server if + * possible, or the last recorded good server if not. + */ +static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) +{ + struct afs_cell *cell = vc->cell; + + if (wait_on_bit(&cell->flags, AFS_CELL_FL_NO_LOOKUP_YET, + TASK_INTERRUPTIBLE)) { + vc->error = -ERESTARTSYS; + return false; + } + + read_lock(&cell->vl_servers_lock); + vc->server_list = afs_get_vlserverlist( + rcu_dereference_protected(cell->vl_servers, + lockdep_is_held(&cell->vl_servers_lock))); + read_unlock(&cell->vl_servers_lock); + if (!vc->server_list || !vc->server_list->nr_servers) + return false; + + vc->untried = (1UL << vc->server_list->nr_servers) - 1; + vc->index = -1; + return true; +} + +/* + * Select the vlserver to use. May be called multiple times to rotate + * through the vlservers. + */ +bool afs_select_vlserver(struct afs_vl_cursor *vc) +{ + struct afs_addr_list *alist; + struct afs_vlserver *vlserver; + u32 rtt; + int error = vc->ac.error, abort_code, i; + + _enter("%lx[%d],%lx[%d],%d,%d", + vc->untried, vc->index, + vc->ac.tried, vc->ac.index, + error, vc->ac.abort_code); + + if (vc->flags & AFS_VL_CURSOR_STOP) { + _leave(" = f [stopped]"); + return false; + } + + vc->nr_iterations++; + + /* Evaluate the result of the previous operation, if there was one. */ + switch (error) { + case SHRT_MAX: + goto start; + + default: + case 0: + /* Success or local failure. Stop. */ + vc->error = error; + vc->flags |= AFS_VL_CURSOR_STOP; + _leave(" = f [okay/local %d]", vc->ac.error); + return false; + + case -ECONNABORTED: + /* The far side rejected the operation on some grounds. This + * might involve the server being busy or the volume having been moved. + */ + switch (vc->ac.abort_code) { + case AFSVL_IO: + case AFSVL_BADVOLOPER: + case AFSVL_NOMEM: + /* The server went weird. */ + vc->error = -EREMOTEIO; + //write_lock(&vc->cell->vl_servers_lock); + //vc->server_list->weird_mask |= 1 << vc->index; + //write_unlock(&vc->cell->vl_servers_lock); + goto next_server; + + default: + vc->error = afs_abort_to_error(vc->ac.abort_code); + goto failed; + } + + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + case -ETIMEDOUT: + case -ETIME: + _debug("no conn %d", error); + vc->error = error; + goto iterate_address; + + case -ECONNRESET: + _debug("call reset"); + vc->error = error; + vc->flags |= AFS_VL_CURSOR_RETRY; + goto next_server; + } + +restart_from_beginning: + _debug("restart"); + afs_end_cursor(&vc->ac); + afs_put_vlserverlist(vc->cell->net, vc->server_list); + vc->server_list = NULL; + if (vc->flags & AFS_VL_CURSOR_RETRIED) + goto failed; + vc->flags |= AFS_VL_CURSOR_RETRIED; +start: + _debug("start"); + + if (!afs_start_vl_iteration(vc)) + goto failed; + + error = afs_send_vl_probes(vc->cell->net, vc->key, vc->server_list); + if (error < 0) + goto failed_set_error; + +pick_server: + _debug("pick [%lx]", vc->untried); + + error = afs_wait_for_vl_probes(vc->server_list, vc->untried); + if (error < 0) + goto failed_set_error; + + /* Pick the untried server with the lowest RTT. */ + vc->index = vc->server_list->preferred; + if (test_bit(vc->index, &vc->untried)) + goto selected_server; + + vc->index = -1; + rtt = U32_MAX; + for (i = 0; i < vc->server_list->nr_servers; i++) { + struct afs_vlserver *s = vc->server_list->servers[i].server; + + if (!test_bit(i, &vc->untried) || !s->probe.responded) + continue; + if (s->probe.rtt < rtt) { + vc->index = i; + rtt = s->probe.rtt; + } + } + + if (vc->index == -1) + goto no_more_servers; + +selected_server: + _debug("use %d", vc->index); + __clear_bit(vc->index, &vc->untried); + + /* We're starting on a different vlserver from the list. We need to + * check it, find its address list and probe its capabilities before we + * use it. + */ + ASSERTCMP(vc->ac.alist, ==, NULL); + vlserver = vc->server_list->servers[vc->index].server; + vc->server = vlserver; + + _debug("USING VLSERVER: %s", vlserver->name); + + read_lock(&vlserver->lock); + alist = rcu_dereference_protected(vlserver->addresses, + lockdep_is_held(&vlserver->lock)); + afs_get_addrlist(alist); + read_unlock(&vlserver->lock); + + memset(&vc->ac, 0, sizeof(vc->ac)); + + if (!vc->ac.alist) + vc->ac.alist = alist; + else + afs_put_addrlist(alist); + + vc->ac.index = -1; + +iterate_address: + ASSERT(vc->ac.alist); + /* Iterate over the current server's address list to try and find an + * address on which it will respond to us. + */ + if (!afs_iterate_addresses(&vc->ac)) + goto next_server; + + _debug("VL address %d/%d", vc->ac.index, vc->ac.alist->nr_addrs); + + _leave(" = t %pISpc", &vc->ac.alist->addrs[vc->ac.index].transport); + return true; + +next_server: + _debug("next"); + afs_end_cursor(&vc->ac); + goto pick_server; + +no_more_servers: + /* That's all the servers poked to no good effect. Try again if some + * of them were busy. + */ + if (vc->flags & AFS_VL_CURSOR_RETRY) + goto restart_from_beginning; + + abort_code = 0; + error = -EDESTADDRREQ; + for (i = 0; i < vc->server_list->nr_servers; i++) { + struct afs_vlserver *s = vc->server_list->servers[i].server; + int probe_error = READ_ONCE(s->probe.error); + + switch (probe_error) { + case 0: + continue; + default: + if (error == -ETIMEDOUT || + error == -ETIME) + continue; + case -ETIMEDOUT: + case -ETIME: + if (error == -ENOMEM || + error == -ENONET) + continue; + case -ENOMEM: + case -ENONET: + if (error == -ENETUNREACH) + continue; + case -ENETUNREACH: + if (error == -EHOSTUNREACH) + continue; + case -EHOSTUNREACH: + if (error == -ECONNREFUSED) + continue; + case -ECONNREFUSED: + if (error == -ECONNRESET) + continue; + case -ECONNRESET: /* Responded, but call expired. */ + if (error == -ECONNABORTED) + continue; + case -ECONNABORTED: + abort_code = s->probe.abort_code; + error = probe_error; + continue; + } + } + + if (error == -ECONNABORTED) + error = afs_abort_to_error(abort_code); + +failed_set_error: + vc->error = error; +failed: + vc->flags |= AFS_VL_CURSOR_STOP; + afs_end_cursor(&vc->ac); + _leave(" = f [failed %d]", vc->error); + return false; +} + +/* + * Dump cursor state in the case of the error being EDESTADDRREQ. + */ +static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc) +{ + static int count; + int i; + + if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3) + return; + count++; + + rcu_read_lock(); + pr_notice("EDESTADDR occurred\n"); + pr_notice("VC: ut=%lx ix=%u ni=%hu fl=%hx err=%hd\n", + vc->untried, vc->index, vc->nr_iterations, vc->flags, vc->error); + + if (vc->server_list) { + const struct afs_vlserver_list *sl = vc->server_list; + pr_notice("VC: SL nr=%u ix=%u\n", + sl->nr_servers, sl->index); + for (i = 0; i < sl->nr_servers; i++) { + const struct afs_vlserver *s = sl->servers[i].server; + pr_notice("VC: server %s+%hu fl=%lx E=%hd\n", + s->name, s->port, s->flags, s->probe.error); + if (s->addresses) { + const struct afs_addr_list *a = + rcu_dereference(s->addresses); + pr_notice("VC: - nr=%u/%u/%u pf=%u\n", + a->nr_ipv4, a->nr_addrs, a->max_addrs, + a->preferred); + pr_notice("VC: - pr=%lx R=%lx F=%lx\n", + a->probed, a->responded, a->failed); + if (a == vc->ac.alist) + pr_notice("VC: - current\n"); + } + } + } + + pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n", + vc->ac.tried, vc->ac.index, vc->ac.abort_code, vc->ac.error, + vc->ac.responded, vc->ac.nr_iterations); + rcu_read_unlock(); +} + +/* + * Tidy up a volume location server cursor and unlock the vnode. + */ +int afs_end_vlserver_operation(struct afs_vl_cursor *vc) +{ + struct afs_net *net = vc->cell->net; + + if (vc->error == -EDESTADDRREQ || + vc->error == -ENETUNREACH || + vc->error == -EHOSTUNREACH) + afs_vl_dump_edestaddrreq(vc); + + afs_end_cursor(&vc->ac); + afs_put_vlserverlist(net, vc->server_list); + + if (vc->error == -ECONNABORTED) + vc->error = afs_abort_to_error(vc->ac.abort_code); + + return vc->error; +} diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index c3b740813fc7..c3d9e5a5f67e 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -128,14 +128,13 @@ static const struct afs_call_type afs_RXVLGetEntryByNameU = { * Dispatch a get volume entry by name or ID operation (uuid variant). If the * volname is a decimal number then it's a volume ID not a volume name. */ -struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *net, - struct afs_addr_cursor *ac, - struct key *key, +struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc, const char *volname, int volnamesz) { struct afs_vldb_entry *entry; struct afs_call *call; + struct afs_net *net = vc->cell->net; size_t reqsz, padsz; __be32 *bp; @@ -155,7 +154,7 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *net, return ERR_PTR(-ENOMEM); } - call->key = key; + call->key = vc->key; call->reply[0] = entry; call->ret_reply0 = true; @@ -168,7 +167,7 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *net, memset((void *)bp + volnamesz, 0, padsz); trace_afs_make_vl_call(call); - return (struct afs_vldb_entry *)afs_make_call(ac, call, GFP_KERNEL, false); + return (struct afs_vldb_entry *)afs_make_call(&vc->ac, call, GFP_KERNEL, false); } /* @@ -187,19 +186,18 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call) u32 uniquifier, nentries, count; int i, ret; - _enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count); + _enter("{%u,%zu/%u}", + call->unmarshall, iov_iter_count(call->_iter), call->count); -again: switch (call->unmarshall) { case 0: - call->offset = 0; + afs_extract_to_buf(call, + sizeof(struct afs_uuid__xdr) + 3 * sizeof(__be32)); call->unmarshall++; /* Extract the returned uuid, uniquifier, nentries and blkaddrs size */ case 1: - ret = afs_extract_data(call, call->buffer, - sizeof(struct afs_uuid__xdr) + 3 * sizeof(__be32), - true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; @@ -216,28 +214,28 @@ again: call->reply[0] = alist; call->count = count; call->count2 = nentries; - call->offset = 0; call->unmarshall++; + more_entries: + count = min(call->count, 4U); + afs_extract_to_buf(call, count * sizeof(__be32)); + /* Extract entries */ case 2: - count = min(call->count, 4U); - ret = afs_extract_data(call, call->buffer, - count * sizeof(__be32), - call->count > 4); + ret = afs_extract_data(call, call->count > 4); if (ret < 0) return ret; alist = call->reply[0]; bp = call->buffer; + count = min(call->count, 4U); for (i = 0; i < count; i++) if (alist->nr_addrs < call->count2) afs_merge_fs_addr4(alist, *bp++, AFS_FS_PORT); call->count -= count; if (call->count > 0) - goto again; - call->offset = 0; + goto more_entries; call->unmarshall++; break; } @@ -267,14 +265,13 @@ static const struct afs_call_type afs_RXVLGetAddrsU = { * Dispatch an operation to get the addresses for a server, where the server is * nominated by UUID. */ -struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *net, - struct afs_addr_cursor *ac, - struct key *key, +struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc, const uuid_t *uuid) { struct afs_ListAddrByAttributes__xdr *r; const struct afs_uuid *u = (const struct afs_uuid *)uuid; struct afs_call *call; + struct afs_net *net = vc->cell->net; __be32 *bp; int i; @@ -286,7 +283,7 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *net, if (!call) return ERR_PTR(-ENOMEM); - call->key = key; + call->key = vc->key; call->reply[0] = NULL; call->ret_reply0 = true; @@ -307,7 +304,7 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *net, r->uuid.node[i] = htonl(u->node[i]); trace_afs_make_vl_call(call); - return (struct afs_addr_list *)afs_make_call(ac, call, GFP_KERNEL, false); + return (struct afs_addr_list *)afs_make_call(&vc->ac, call, GFP_KERNEL, false); } /* @@ -318,54 +315,51 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call) u32 count; int ret; - _enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count); + _enter("{%u,%zu/%u}", + call->unmarshall, iov_iter_count(call->_iter), call->count); -again: switch (call->unmarshall) { case 0: - call->offset = 0; + afs_extract_to_tmp(call); call->unmarshall++; /* Extract the capabilities word count */ case 1: - ret = afs_extract_data(call, &call->tmp, - 1 * sizeof(__be32), - true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; count = ntohl(call->tmp); - call->count = count; call->count2 = count; - call->offset = 0; + call->unmarshall++; + afs_extract_discard(call, count * sizeof(__be32)); /* Extract capabilities words */ case 2: - count = min(call->count, 16U); - ret = afs_extract_data(call, call->buffer, - count * sizeof(__be32), - call->count > 16); + ret = afs_extract_data(call, false); if (ret < 0) return ret; /* TODO: Examine capabilities */ - call->count -= count; - if (call->count > 0) - goto again; - call->offset = 0; call->unmarshall++; break; } - call->reply[0] = (void *)(unsigned long)call->service_id; - _leave(" = 0 [done]"); return 0; } +static void afs_destroy_vl_get_capabilities(struct afs_call *call) +{ + struct afs_vlserver *server = call->reply[0]; + + afs_put_vlserver(call->net, server); + afs_flat_call_destructor(call); +} + /* * VL.GetCapabilities operation type */ @@ -373,11 +367,12 @@ static const struct afs_call_type afs_RXVLGetCapabilities = { .name = "VL.GetCapabilities", .op = afs_VL_GetCapabilities, .deliver = afs_deliver_vl_get_capabilities, - .destructor = afs_flat_call_destructor, + .done = afs_vlserver_probe_result, + .destructor = afs_destroy_vl_get_capabilities, }; /* - * Probe a fileserver for the capabilities that it supports. This can + * Probe a volume server for the capabilities that it supports. This can * return up to 196 words. * * We use this to probe for service upgrade to determine what the server at the @@ -385,7 +380,10 @@ static const struct afs_call_type afs_RXVLGetCapabilities = { */ int afs_vl_get_capabilities(struct afs_net *net, struct afs_addr_cursor *ac, - struct key *key) + struct key *key, + struct afs_vlserver *server, + unsigned int server_index, + bool async) { struct afs_call *call; __be32 *bp; @@ -397,9 +395,10 @@ int afs_vl_get_capabilities(struct afs_net *net, return -ENOMEM; call->key = key; - call->upgrade = true; /* Let's see if this is a YFS server */ - call->reply[0] = (void *)VLGETCAPABILITIES; - call->ret_reply0 = true; + call->reply[0] = afs_get_vlserver(server); + call->reply[1] = (void *)(long)server_index; + call->upgrade = true; + call->want_reply_time = true; /* marshall the parameters */ bp = call->request; @@ -407,7 +406,7 @@ int afs_vl_get_capabilities(struct afs_net *net, /* Can't take a ref on server */ trace_afs_make_vl_call(call); - return afs_make_call(ac, call, GFP_KERNEL, false); + return afs_make_call(ac, call, GFP_KERNEL, async); } /* @@ -426,22 +425,19 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) u32 uniquifier, size; int ret; - _enter("{%u,%zu/%u,%u}", call->unmarshall, call->offset, call->count, call->count2); + _enter("{%u,%zu,%u}", + call->unmarshall, iov_iter_count(call->_iter), call->count2); -again: switch (call->unmarshall) { case 0: - call->offset = 0; + afs_extract_to_buf(call, sizeof(uuid_t) + 3 * sizeof(__be32)); call->unmarshall = 1; /* Extract the returned uuid, uniquifier, fsEndpoints count and * either the first fsEndpoint type or the volEndpoints * count if there are no fsEndpoints. */ case 1: - ret = afs_extract_data(call, call->buffer, - sizeof(uuid_t) + - 3 * sizeof(__be32), - true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; @@ -451,22 +447,19 @@ again: call->count2 = ntohl(*bp); /* Type or next count */ if (call->count > YFS_MAXENDPOINTS) - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_yvl_fsendpt_num); alist = afs_alloc_addrlist(call->count, FS_SERVICE, AFS_FS_PORT); if (!alist) return -ENOMEM; alist->version = uniquifier; call->reply[0] = alist; - call->offset = 0; if (call->count == 0) goto extract_volendpoints; - call->unmarshall = 2; - - /* Extract fsEndpoints[] entries */ - case 2: + next_fsendpoint: switch (call->count2) { case YFS_ENDPOINT_IPV4: size = sizeof(__be32) * (1 + 1 + 1); @@ -475,11 +468,17 @@ again: size = sizeof(__be32) * (1 + 4 + 1); break; default: - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_yvl_fsendpt_type); } size += sizeof(__be32); - ret = afs_extract_data(call, call->buffer, size, true); + afs_extract_to_buf(call, size); + call->unmarshall = 2; + + /* Extract fsEndpoints[] entries */ + case 2: + ret = afs_extract_data(call, true); if (ret < 0) return ret; @@ -488,18 +487,21 @@ again: switch (call->count2) { case YFS_ENDPOINT_IPV4: if (ntohl(bp[0]) != sizeof(__be32) * 2) - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_yvl_fsendpt4_len); afs_merge_fs_addr4(alist, bp[1], ntohl(bp[2])); bp += 3; break; case YFS_ENDPOINT_IPV6: if (ntohl(bp[0]) != sizeof(__be32) * 5) - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_yvl_fsendpt6_len); afs_merge_fs_addr6(alist, bp + 1, ntohl(bp[5])); bp += 6; break; default: - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_yvl_fsendpt_type); } /* Got either the type of the next entry or the count of @@ -507,10 +509,9 @@ again: */ call->count2 = ntohl(*bp++); - call->offset = 0; call->count--; if (call->count > 0) - goto again; + goto next_fsendpoint; extract_volendpoints: /* Extract the list of volEndpoints. */ @@ -518,8 +519,10 @@ again: if (!call->count) goto end; if (call->count > YFS_MAXENDPOINTS) - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_yvl_vlendpt_type); + afs_extract_to_buf(call, 1 * sizeof(__be32)); call->unmarshall = 3; /* Extract the type of volEndpoints[0]. Normally we would @@ -527,17 +530,14 @@ again: * data of the current one, but this is the first... */ case 3: - ret = afs_extract_data(call, call->buffer, sizeof(__be32), true); + ret = afs_extract_data(call, true); if (ret < 0) return ret; bp = call->buffer; - call->count2 = ntohl(*bp++); - call->offset = 0; - call->unmarshall = 4; - /* Extract volEndpoints[] entries */ - case 4: + next_volendpoint: + call->count2 = ntohl(*bp++); switch (call->count2) { case YFS_ENDPOINT_IPV4: size = sizeof(__be32) * (1 + 1 + 1); @@ -546,12 +546,18 @@ again: size = sizeof(__be32) * (1 + 4 + 1); break; default: - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_yvl_vlendpt_type); } if (call->count > 1) - size += sizeof(__be32); - ret = afs_extract_data(call, call->buffer, size, true); + size += sizeof(__be32); /* Get next type too */ + afs_extract_to_buf(call, size); + call->unmarshall = 4; + + /* Extract volEndpoints[] entries */ + case 4: + ret = afs_extract_data(call, true); if (ret < 0) return ret; @@ -559,34 +565,35 @@ again: switch (call->count2) { case YFS_ENDPOINT_IPV4: if (ntohl(bp[0]) != sizeof(__be32) * 2) - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_yvl_vlendpt4_len); bp += 3; break; case YFS_ENDPOINT_IPV6: if (ntohl(bp[0]) != sizeof(__be32) * 5) - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_yvl_vlendpt6_len); bp += 6; break; default: - return afs_protocol_error(call, -EBADMSG); + return afs_protocol_error(call, -EBADMSG, + afs_eproto_yvl_vlendpt_type); } /* Got either the type of the next entry or the count of * volEndpoints if no more fsEndpoints. */ - call->offset = 0; call->count--; - if (call->count > 0) { - call->count2 = ntohl(*bp++); - goto again; - } + if (call->count > 0) + goto next_volendpoint; end: + afs_extract_discard(call, 0); call->unmarshall = 5; /* Done */ case 5: - ret = afs_extract_data(call, call->buffer, 0, false); + ret = afs_extract_data(call, false); if (ret < 0) return ret; call->unmarshall = 6; @@ -596,11 +603,6 @@ again: } alist = call->reply[0]; - - /* Start with IPv6 if available. */ - if (alist->nr_ipv4 < alist->nr_addrs) - alist->index = alist->nr_ipv4; - _leave(" = 0 [done]"); return 0; } @@ -619,12 +621,11 @@ static const struct afs_call_type afs_YFSVLGetEndpoints = { * Dispatch an operation to get the addresses for a server, where the server is * nominated by UUID. */ -struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *net, - struct afs_addr_cursor *ac, - struct key *key, +struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc, const uuid_t *uuid) { struct afs_call *call; + struct afs_net *net = vc->cell->net; __be32 *bp; _enter(""); @@ -635,7 +636,7 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *net, if (!call) return ERR_PTR(-ENOMEM); - call->key = key; + call->key = vc->key; call->reply[0] = NULL; call->ret_reply0 = true; @@ -646,5 +647,5 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *net, memcpy(bp, uuid, sizeof(*uuid)); /* Type opr_uuid */ trace_afs_make_vl_call(call); - return (struct afs_addr_list *)afs_make_call(ac, call, GFP_KERNEL, false); + return (struct afs_addr_list *)afs_make_call(&vc->ac, call, GFP_KERNEL, false); } diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 3037bd01f617..00975ed3640f 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -74,55 +74,19 @@ static struct afs_vldb_entry *afs_vl_lookup_vldb(struct afs_cell *cell, const char *volname, size_t volnamesz) { - struct afs_addr_cursor ac; - struct afs_vldb_entry *vldb; + struct afs_vldb_entry *vldb = ERR_PTR(-EDESTADDRREQ); + struct afs_vl_cursor vc; int ret; - ret = afs_set_vl_cursor(&ac, cell); - if (ret < 0) - return ERR_PTR(ret); - - while (afs_iterate_addresses(&ac)) { - if (!test_bit(ac.index, &ac.alist->probed)) { - ret = afs_vl_get_capabilities(cell->net, &ac, key); - switch (ret) { - case VL_SERVICE: - clear_bit(ac.index, &ac.alist->yfs); - set_bit(ac.index, &ac.alist->probed); - ac.addr->srx_service = ret; - break; - case YFS_VL_SERVICE: - set_bit(ac.index, &ac.alist->yfs); - set_bit(ac.index, &ac.alist->probed); - ac.addr->srx_service = ret; - break; - } - } - - vldb = afs_vl_get_entry_by_name_u(cell->net, &ac, key, - volname, volnamesz); - switch (ac.error) { - case 0: - afs_end_cursor(&ac); - return vldb; - case -ECONNABORTED: - ac.error = afs_abort_to_error(ac.abort_code); - goto error; - case -ENOMEM: - case -ENONET: - goto error; - case -ENETUNREACH: - case -EHOSTUNREACH: - case -ECONNREFUSED: - break; - default: - ac.error = -EIO; - goto error; - } + if (!afs_begin_vlserver_operation(&vc, cell, key)) + return ERR_PTR(-ERESTARTSYS); + + while (afs_select_vlserver(&vc)) { + vldb = afs_vl_get_entry_by_name_u(&vc, volname, volnamesz); } -error: - return ERR_PTR(afs_end_cursor(&ac)); + ret = afs_end_vlserver_operation(&vc); + return ret < 0 ? ERR_PTR(ret) : vldb; } /* @@ -270,7 +234,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key) /* We look up an ID by passing it as a decimal string in the * operation's name parameter. */ - idsz = sprintf(idbuf, "%u", volume->vid); + idsz = sprintf(idbuf, "%llu", volume->vid); vldb = afs_vl_lookup_vldb(volume->cell, key, idbuf, idsz); if (IS_ERR(vldb)) { diff --git a/fs/afs/write.c b/fs/afs/write.c index 19c04caf3c01..72efcfcf9f95 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -33,10 +33,21 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, loff_t pos, unsigned int len, struct page *page) { struct afs_read *req; + size_t p; + void *data; int ret; _enter(",,%llu", (unsigned long long)pos); + if (pos >= vnode->vfs_inode.i_size) { + p = pos & ~PAGE_MASK; + ASSERTCMP(p + len, <=, PAGE_SIZE); + data = kmap(page); + memset(data + p, 0, len); + kunmap(page); + return 0; + } + req = kzalloc(sizeof(struct afs_read) + sizeof(struct page *), GFP_KERNEL); if (!req) @@ -81,7 +92,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping, pgoff_t index = pos >> PAGE_SHIFT; int ret; - _enter("{%x:%u},{%lx},%u,%u", + _enter("{%llx:%llu},{%lx},%u,%u", vnode->fid.vid, vnode->fid.vnode, index, from, to); /* We want to store information about how much of a page is altered in @@ -181,7 +192,7 @@ int afs_write_end(struct file *file, struct address_space *mapping, loff_t i_size, maybe_i_size; int ret; - _enter("{%x:%u},{%lx}", + _enter("{%llx:%llu},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index); maybe_i_size = pos + copied; @@ -230,7 +241,7 @@ static void afs_kill_pages(struct address_space *mapping, struct pagevec pv; unsigned count, loop; - _enter("{%x:%u},%lx-%lx", + _enter("{%llx:%llu},%lx-%lx", vnode->fid.vid, vnode->fid.vnode, first, last); pagevec_init(&pv); @@ -272,7 +283,7 @@ static void afs_redirty_pages(struct writeback_control *wbc, struct pagevec pv; unsigned count, loop; - _enter("{%x:%u},%lx-%lx", + _enter("{%llx:%llu},%lx-%lx", vnode->fid.vid, vnode->fid.vnode, first, last); pagevec_init(&pv); @@ -314,7 +325,7 @@ static int afs_store_data(struct address_space *mapping, struct list_head *p; int ret = -ENOKEY, ret2; - _enter("%s{%x:%u.%u},%lx,%lx,%x,%x", + _enter("%s{%llx:%llu.%u},%lx,%lx,%x,%x", vnode->volume->name, vnode->fid.vid, vnode->fid.vnode, @@ -533,6 +544,7 @@ no_more: case -ENOENT: case -ENOMEDIUM: case -ENXIO: + trace_afs_file_error(vnode, ret, afs_file_error_writeback_fail); afs_kill_pages(mapping, first, last); mapping_set_error(mapping, ret); break; @@ -675,7 +687,7 @@ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) unsigned count, loop; pgoff_t first = call->first, last = call->last; - _enter("{%x:%u},{%lx-%lx}", + _enter("{%llx:%llu},{%lx-%lx}", vnode->fid.vid, vnode->fid.vnode, first, last); pagevec_init(&pv); @@ -714,7 +726,7 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from) ssize_t result; size_t count = iov_iter_count(from); - _enter("{%x.%u},{%zu},", + _enter("{%llx:%llu},{%zu},", vnode->fid.vid, vnode->fid.vnode, count); if (IS_SWAPFILE(&vnode->vfs_inode)) { @@ -742,7 +754,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) struct inode *inode = file_inode(file); struct afs_vnode *vnode = AFS_FS_I(inode); - _enter("{%x:%u},{n=%pD},%d", + _enter("{%llx:%llu},{n=%pD},%d", vnode->fid.vid, vnode->fid.vnode, file, datasync); @@ -760,7 +772,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) struct afs_vnode *vnode = AFS_FS_I(inode); unsigned long priv; - _enter("{{%x:%u}},{%lx}", + _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, vmf->page->index); sb_start_pagefault(inode->i_sb); diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c index cfcc674e64a5..a2cdf25573e2 100644 --- a/fs/afs/xattr.c +++ b/fs/afs/xattr.c @@ -72,7 +72,7 @@ static int afs_xattr_get_fid(const struct xattr_handler *handler, char text[8 + 1 + 8 + 1 + 8 + 1]; size_t len; - len = sprintf(text, "%x:%x:%x", + len = sprintf(text, "%llx:%llx:%x", vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); if (size == 0) return len; diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c new file mode 100644 index 000000000000..12658c1363ae --- /dev/null +++ b/fs/afs/yfsclient.c @@ -0,0 +1,2184 @@ +/* YFS File Server client stubs + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/circ_buf.h> +#include <linux/iversion.h> +#include "internal.h" +#include "afs_fs.h" +#include "xdr_fs.h" +#include "protocol_yfs.h" + +static const struct afs_fid afs_zero_fid; + +static inline void afs_use_fs_server(struct afs_call *call, struct afs_cb_interest *cbi) +{ + call->cbi = afs_get_cb_interest(cbi); +} + +#define xdr_size(x) (sizeof(*x) / sizeof(__be32)) + +static void xdr_decode_YFSFid(const __be32 **_bp, struct afs_fid *fid) +{ + const struct yfs_xdr_YFSFid *x = (const void *)*_bp; + + fid->vid = xdr_to_u64(x->volume); + fid->vnode = xdr_to_u64(x->vnode.lo); + fid->vnode_hi = ntohl(x->vnode.hi); + fid->unique = ntohl(x->vnode.unique); + *_bp += xdr_size(x); +} + +static __be32 *xdr_encode_u32(__be32 *bp, u32 n) +{ + *bp++ = htonl(n); + return bp; +} + +static __be32 *xdr_encode_u64(__be32 *bp, u64 n) +{ + struct yfs_xdr_u64 *x = (void *)bp; + + *x = u64_to_xdr(n); + return bp + xdr_size(x); +} + +static __be32 *xdr_encode_YFSFid(__be32 *bp, struct afs_fid *fid) +{ + struct yfs_xdr_YFSFid *x = (void *)bp; + + x->volume = u64_to_xdr(fid->vid); + x->vnode.lo = u64_to_xdr(fid->vnode); + x->vnode.hi = htonl(fid->vnode_hi); + x->vnode.unique = htonl(fid->unique); + return bp + xdr_size(x); +} + +static size_t xdr_strlen(unsigned int len) +{ + return sizeof(__be32) + round_up(len, sizeof(__be32)); +} + +static __be32 *xdr_encode_string(__be32 *bp, const char *p, unsigned int len) +{ + bp = xdr_encode_u32(bp, len); + bp = memcpy(bp, p, len); + if (len & 3) { + unsigned int pad = 4 - (len & 3); + + memset((u8 *)bp + len, 0, pad); + len += pad; + } + + return bp + len / sizeof(__be32); +} + +static s64 linux_to_yfs_time(const struct timespec64 *t) +{ + /* Convert to 100ns intervals. */ + return (u64)t->tv_sec * 10000000 + t->tv_nsec/100; +} + +static __be32 *xdr_encode_YFSStoreStatus_mode(__be32 *bp, mode_t mode) +{ + struct yfs_xdr_YFSStoreStatus *x = (void *)bp; + + x->mask = htonl(AFS_SET_MODE); + x->mode = htonl(mode & S_IALLUGO); + x->mtime_client = u64_to_xdr(0); + x->owner = u64_to_xdr(0); + x->group = u64_to_xdr(0); + return bp + xdr_size(x); +} + +static __be32 *xdr_encode_YFSStoreStatus_mtime(__be32 *bp, const struct timespec64 *t) +{ + struct yfs_xdr_YFSStoreStatus *x = (void *)bp; + s64 mtime = linux_to_yfs_time(t); + + x->mask = htonl(AFS_SET_MTIME); + x->mode = htonl(0); + x->mtime_client = u64_to_xdr(mtime); + x->owner = u64_to_xdr(0); + x->group = u64_to_xdr(0); + return bp + xdr_size(x); +} + +/* + * Convert a signed 100ns-resolution 64-bit time into a timespec. + */ +static struct timespec64 yfs_time_to_linux(s64 t) +{ + struct timespec64 ts; + u64 abs_t; + + /* + * Unfortunately can not use normal 64 bit division on 32 bit arch, but + * the alternative, do_div, does not work with negative numbers so have + * to special case them + */ + if (t < 0) { + abs_t = -t; + ts.tv_nsec = (time64_t)(do_div(abs_t, 10000000) * 100); + ts.tv_nsec = -ts.tv_nsec; + ts.tv_sec = -abs_t; + } else { + abs_t = t; + ts.tv_nsec = (time64_t)do_div(abs_t, 10000000) * 100; + ts.tv_sec = abs_t; + } + + return ts; +} + +static struct timespec64 xdr_to_time(const struct yfs_xdr_u64 xdr) +{ + s64 t = xdr_to_u64(xdr); + + return yfs_time_to_linux(t); +} + +static void yfs_check_req(struct afs_call *call, __be32 *bp) +{ + size_t len = (void *)bp - call->request; + + if (len > call->request_size) + pr_err("kAFS: %s: Request buffer overflow (%zu>%u)\n", + call->type->name, len, call->request_size); + else if (len < call->request_size) + pr_warning("kAFS: %s: Request buffer underflow (%zu<%u)\n", + call->type->name, len, call->request_size); +} + +/* + * Dump a bad file status record. + */ +static void xdr_dump_bad(const __be32 *bp) +{ + __be32 x[4]; + int i; + + pr_notice("YFS XDR: Bad status record\n"); + for (i = 0; i < 5 * 4 * 4; i += 16) { + memcpy(x, bp, 16); + bp += 4; + pr_notice("%03x: %08x %08x %08x %08x\n", + i, ntohl(x[0]), ntohl(x[1]), ntohl(x[2]), ntohl(x[3])); + } + + memcpy(x, bp, 4); + pr_notice("0x50: %08x\n", ntohl(x[0])); +} + +/* + * Decode a YFSFetchStatus block + */ +static int xdr_decode_YFSFetchStatus(struct afs_call *call, + const __be32 **_bp, + struct afs_file_status *status, + struct afs_vnode *vnode, + const afs_dataversion_t *expected_version, + struct afs_read *read_req) +{ + const struct yfs_xdr_YFSFetchStatus *xdr = (const void *)*_bp; + u32 type; + u8 flags = 0; + + status->abort_code = ntohl(xdr->abort_code); + if (status->abort_code != 0) { + if (vnode && status->abort_code == VNOVNODE) { + set_bit(AFS_VNODE_DELETED, &vnode->flags); + status->nlink = 0; + __afs_break_callback(vnode); + } + return 0; + } + + type = ntohl(xdr->type); + switch (type) { + case AFS_FTYPE_FILE: + case AFS_FTYPE_DIR: + case AFS_FTYPE_SYMLINK: + if (type != status->type && + vnode && + !test_bit(AFS_VNODE_UNSET, &vnode->flags)) { + pr_warning("Vnode %llx:%llx:%x changed type %u to %u\n", + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + status->type, type); + goto bad; + } + status->type = type; + break; + default: + goto bad; + } + +#define EXTRACT_M4(FIELD) \ + do { \ + u32 x = ntohl(xdr->FIELD); \ + if (status->FIELD != x) { \ + flags |= AFS_VNODE_META_CHANGED; \ + status->FIELD = x; \ + } \ + } while (0) + +#define EXTRACT_M8(FIELD) \ + do { \ + u64 x = xdr_to_u64(xdr->FIELD); \ + if (status->FIELD != x) { \ + flags |= AFS_VNODE_META_CHANGED; \ + status->FIELD = x; \ + } \ + } while (0) + +#define EXTRACT_D8(FIELD) \ + do { \ + u64 x = xdr_to_u64(xdr->FIELD); \ + if (status->FIELD != x) { \ + flags |= AFS_VNODE_DATA_CHANGED; \ + status->FIELD = x; \ + } \ + } while (0) + + EXTRACT_M4(nlink); + EXTRACT_D8(size); + EXTRACT_D8(data_version); + EXTRACT_M8(author); + EXTRACT_M8(owner); + EXTRACT_M8(group); + EXTRACT_M4(mode); + EXTRACT_M4(caller_access); /* call ticket dependent */ + EXTRACT_M4(anon_access); + + status->mtime_client = xdr_to_time(xdr->mtime_client); + status->mtime_server = xdr_to_time(xdr->mtime_server); + status->lock_count = ntohl(xdr->lock_count); + + if (read_req) { + read_req->data_version = status->data_version; + read_req->file_size = status->size; + } + + *_bp += xdr_size(xdr); + + if (vnode) { + if (test_bit(AFS_VNODE_UNSET, &vnode->flags)) + flags |= AFS_VNODE_NOT_YET_SET; + afs_update_inode_from_status(vnode, status, expected_version, + flags); + } + + return 0; + +bad: + xdr_dump_bad(*_bp); + return afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status); +} + +/* + * Decode the file status. We need to lock the target vnode if we're going to + * update its status so that stat() sees the attributes update atomically. + */ +static int yfs_decode_status(struct afs_call *call, + const __be32 **_bp, + struct afs_file_status *status, + struct afs_vnode *vnode, + const afs_dataversion_t *expected_version, + struct afs_read *read_req) +{ + int ret; + + if (!vnode) + return xdr_decode_YFSFetchStatus(call, _bp, status, vnode, + expected_version, read_req); + + write_seqlock(&vnode->cb_lock); + ret = xdr_decode_YFSFetchStatus(call, _bp, status, vnode, + expected_version, read_req); + write_sequnlock(&vnode->cb_lock); + return ret; +} + +/* + * Decode a YFSCallBack block + */ +static void xdr_decode_YFSCallBack(struct afs_call *call, + struct afs_vnode *vnode, + const __be32 **_bp) +{ + struct yfs_xdr_YFSCallBack *xdr = (void *)*_bp; + struct afs_cb_interest *old, *cbi = call->cbi; + u64 cb_expiry; + + write_seqlock(&vnode->cb_lock); + + if (!afs_cb_is_broken(call->cb_break, vnode, cbi)) { + cb_expiry = xdr_to_u64(xdr->expiration_time); + do_div(cb_expiry, 10 * 1000 * 1000); + vnode->cb_version = ntohl(xdr->version); + vnode->cb_type = ntohl(xdr->type); + vnode->cb_expires_at = cb_expiry + ktime_get_real_seconds(); + old = vnode->cb_interest; + if (old != call->cbi) { + vnode->cb_interest = cbi; + cbi = old; + } + set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + } + + write_sequnlock(&vnode->cb_lock); + call->cbi = cbi; + *_bp += xdr_size(xdr); +} + +static void xdr_decode_YFSCallBack_raw(const __be32 **_bp, + struct afs_callback *cb) +{ + struct yfs_xdr_YFSCallBack *x = (void *)*_bp; + u64 cb_expiry; + + cb_expiry = xdr_to_u64(x->expiration_time); + do_div(cb_expiry, 10 * 1000 * 1000); + cb->version = ntohl(x->version); + cb->type = ntohl(x->type); + cb->expires_at = cb_expiry + ktime_get_real_seconds(); + + *_bp += xdr_size(x); +} + +/* + * Decode a YFSVolSync block + */ +static void xdr_decode_YFSVolSync(const __be32 **_bp, + struct afs_volsync *volsync) +{ + struct yfs_xdr_YFSVolSync *x = (void *)*_bp; + u64 creation; + + if (volsync) { + creation = xdr_to_u64(x->vol_creation_date); + do_div(creation, 10 * 1000 * 1000); + volsync->creation = creation; + } + + *_bp += xdr_size(x); +} + +/* + * Encode the requested attributes into a YFSStoreStatus block + */ +static __be32 *xdr_encode_YFS_StoreStatus(__be32 *bp, struct iattr *attr) +{ + struct yfs_xdr_YFSStoreStatus *x = (void *)bp; + s64 mtime = 0, owner = 0, group = 0; + u32 mask = 0, mode = 0; + + mask = 0; + if (attr->ia_valid & ATTR_MTIME) { + mask |= AFS_SET_MTIME; + mtime = linux_to_yfs_time(&attr->ia_mtime); + } + + if (attr->ia_valid & ATTR_UID) { + mask |= AFS_SET_OWNER; + owner = from_kuid(&init_user_ns, attr->ia_uid); + } + + if (attr->ia_valid & ATTR_GID) { + mask |= AFS_SET_GROUP; + group = from_kgid(&init_user_ns, attr->ia_gid); + } + + if (attr->ia_valid & ATTR_MODE) { + mask |= AFS_SET_MODE; + mode = attr->ia_mode & S_IALLUGO; + } + + x->mask = htonl(mask); + x->mode = htonl(mode); + x->mtime_client = u64_to_xdr(mtime); + x->owner = u64_to_xdr(owner); + x->group = u64_to_xdr(group); + return bp + xdr_size(x); +} + +/* + * Decode a YFSFetchVolumeStatus block. + */ +static void xdr_decode_YFSFetchVolumeStatus(const __be32 **_bp, + struct afs_volume_status *vs) +{ + const struct yfs_xdr_YFSFetchVolumeStatus *x = (const void *)*_bp; + u32 flags; + + vs->vid = xdr_to_u64(x->vid); + vs->parent_id = xdr_to_u64(x->parent_id); + flags = ntohl(x->flags); + vs->online = flags & yfs_FVSOnline; + vs->in_service = flags & yfs_FVSInservice; + vs->blessed = flags & yfs_FVSBlessed; + vs->needs_salvage = flags & yfs_FVSNeedsSalvage; + vs->type = ntohl(x->type); + vs->min_quota = 0; + vs->max_quota = xdr_to_u64(x->max_quota); + vs->blocks_in_use = xdr_to_u64(x->blocks_in_use); + vs->part_blocks_avail = xdr_to_u64(x->part_blocks_avail); + vs->part_max_blocks = xdr_to_u64(x->part_max_blocks); + vs->vol_copy_date = xdr_to_u64(x->vol_copy_date); + vs->vol_backup_date = xdr_to_u64(x->vol_backup_date); + *_bp += sizeof(*x) / sizeof(__be32); +} + +/* + * deliver reply data to an FS.FetchStatus + */ +static int yfs_deliver_fs_fetch_status_vnode(struct afs_call *call) +{ + struct afs_vnode *vnode = call->reply[0]; + const __be32 *bp; + int ret; + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + ret = yfs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + xdr_decode_YFSCallBack(call, vnode, &bp); + xdr_decode_YFSVolSync(&bp, call->reply[1]); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFS.FetchStatus operation type + */ +static const struct afs_call_type yfs_RXYFSFetchStatus_vnode = { + .name = "YFS.FetchStatus(vnode)", + .op = yfs_FS_FetchStatus, + .deliver = yfs_deliver_fs_fetch_status_vnode, + .destructor = afs_flat_call_destructor, +}; + +/* + * Fetch the status information for a file. + */ +int yfs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsync, + bool new_inode) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + __be32 *bp; + + _enter(",%x,{%llx:%llu},,", + key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); + + call = afs_alloc_flat_call(net, &yfs_RXYFSFetchStatus_vnode, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSCallBack) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) { + fc->ac.error = -ENOMEM; + return -ENOMEM; + } + + call->key = fc->key; + call->reply[0] = vnode; + call->reply[1] = volsync; + call->expected_version = new_inode ? 1 : vnode->status.data_version; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSFETCHSTATUS); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vnode->fid); + yfs_check_req(call, bp); + + call->cb_break = fc->cb_break; + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to an YFS.FetchData64. + */ +static int yfs_deliver_fs_fetch_data64(struct afs_call *call) +{ + struct afs_vnode *vnode = call->reply[0]; + struct afs_read *req = call->reply[2]; + const __be32 *bp; + unsigned int size; + int ret; + + _enter("{%u,%zu/%llu}", + call->unmarshall, iov_iter_count(&call->iter), req->actual_len); + + switch (call->unmarshall) { + case 0: + req->actual_len = 0; + req->index = 0; + req->offset = req->pos & (PAGE_SIZE - 1); + afs_extract_to_tmp64(call); + call->unmarshall++; + + /* extract the returned data length */ + case 1: + _debug("extract data length"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + req->actual_len = be64_to_cpu(call->tmp64); + _debug("DATA length: %llu", req->actual_len); + req->remain = min(req->len, req->actual_len); + if (req->remain == 0) + goto no_more_data; + + call->unmarshall++; + + begin_page: + ASSERTCMP(req->index, <, req->nr_pages); + if (req->remain > PAGE_SIZE - req->offset) + size = PAGE_SIZE - req->offset; + else + size = req->remain; + call->bvec[0].bv_len = size; + call->bvec[0].bv_offset = req->offset; + call->bvec[0].bv_page = req->pages[req->index]; + iov_iter_bvec(&call->iter, READ, call->bvec, 1, size); + ASSERTCMP(size, <=, PAGE_SIZE); + + /* extract the returned data */ + case 2: + _debug("extract data %zu/%llu", + iov_iter_count(&call->iter), req->remain); + + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + req->remain -= call->bvec[0].bv_len; + req->offset += call->bvec[0].bv_len; + ASSERTCMP(req->offset, <=, PAGE_SIZE); + if (req->offset == PAGE_SIZE) { + req->offset = 0; + if (req->page_done) + req->page_done(call, req); + req->index++; + if (req->remain > 0) + goto begin_page; + } + + ASSERTCMP(req->remain, ==, 0); + if (req->actual_len <= req->len) + goto no_more_data; + + /* Discard any excess data the server gave us */ + iov_iter_discard(&call->iter, READ, req->actual_len - req->len); + call->unmarshall = 3; + case 3: + _debug("extract discard %zu/%llu", + iov_iter_count(&call->iter), req->actual_len - req->len); + + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + no_more_data: + call->unmarshall = 4; + afs_extract_to_buf(call, + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSCallBack) + + sizeof(struct yfs_xdr_YFSVolSync)); + + /* extract the metadata */ + case 4: + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; + + bp = call->buffer; + ret = yfs_decode_status(call, &bp, &vnode->status, vnode, + &vnode->status.data_version, req); + if (ret < 0) + return ret; + xdr_decode_YFSCallBack(call, vnode, &bp); + xdr_decode_YFSVolSync(&bp, call->reply[1]); + + call->unmarshall++; + + case 5: + break; + } + + for (; req->index < req->nr_pages; req->index++) { + if (req->offset < PAGE_SIZE) + zero_user_segment(req->pages[req->index], + req->offset, PAGE_SIZE); + if (req->page_done) + req->page_done(call, req); + req->offset = 0; + } + + _leave(" = 0 [done]"); + return 0; +} + +static void yfs_fetch_data_destructor(struct afs_call *call) +{ + struct afs_read *req = call->reply[2]; + + afs_put_read(req); + afs_flat_call_destructor(call); +} + +/* + * YFS.FetchData64 operation type + */ +static const struct afs_call_type yfs_RXYFSFetchData64 = { + .name = "YFS.FetchData64", + .op = yfs_FS_FetchData64, + .deliver = yfs_deliver_fs_fetch_data64, + .destructor = yfs_fetch_data_destructor, +}; + +/* + * Fetch data from a file. + */ +int yfs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + __be32 *bp; + + _enter(",%x,{%llx:%llu},%llx,%llx", + key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode, + req->pos, req->len); + + call = afs_alloc_flat_call(net, &yfs_RXYFSFetchData64, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_u64) * 2, + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSCallBack) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = vnode; + call->reply[1] = NULL; /* volsync */ + call->reply[2] = req; + call->expected_version = vnode->status.data_version; + call->want_reply_time = true; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSFETCHDATA64); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vnode->fid); + bp = xdr_encode_u64(bp, req->pos); + bp = xdr_encode_u64(bp, req->len); + yfs_check_req(call, bp); + + refcount_inc(&req->usage); + call->cb_break = fc->cb_break; + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data for YFS.CreateFile or YFS.MakeDir. + */ +static int yfs_deliver_fs_create_vnode(struct afs_call *call) +{ + struct afs_vnode *vnode = call->reply[0]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_YFSFid(&bp, call->reply[1]); + ret = yfs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL); + if (ret < 0) + return ret; + ret = yfs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + xdr_decode_YFSCallBack_raw(&bp, call->reply[3]); + xdr_decode_YFSVolSync(&bp, NULL); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.CreateFile and FS.MakeDir operation type + */ +static const struct afs_call_type afs_RXFSCreateFile = { + .name = "YFS.CreateFile", + .op = yfs_FS_CreateFile, + .deliver = yfs_deliver_fs_create_vnode, + .destructor = afs_flat_call_destructor, +}; + +/* + * Create a file. + */ +int yfs_fs_create_file(struct afs_fs_cursor *fc, + const char *name, + umode_t mode, + u64 current_data_version, + struct afs_fid *newfid, + struct afs_file_status *newstatus, + struct afs_callback *newcb) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + size_t namesz, reqsz, rplsz; + __be32 *bp; + + _enter(""); + + namesz = strlen(name); + reqsz = (sizeof(__be32) + + sizeof(__be32) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(namesz) + + sizeof(struct yfs_xdr_YFSStoreStatus) + + sizeof(__be32)); + rplsz = (sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSCallBack) + + sizeof(struct yfs_xdr_YFSVolSync)); + + call = afs_alloc_flat_call(net, &afs_RXFSCreateFile, reqsz, rplsz); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = vnode; + call->reply[1] = newfid; + call->reply[2] = newstatus; + call->reply[3] = newcb; + call->expected_version = current_data_version + 1; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSCREATEFILE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vnode->fid); + bp = xdr_encode_string(bp, name, namesz); + bp = xdr_encode_YFSStoreStatus_mode(bp, mode); + bp = xdr_encode_u32(bp, 0); /* ViceLockType */ + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +static const struct afs_call_type yfs_RXFSMakeDir = { + .name = "YFS.MakeDir", + .op = yfs_FS_MakeDir, + .deliver = yfs_deliver_fs_create_vnode, + .destructor = afs_flat_call_destructor, +}; + +/* + * Make a directory. + */ +int yfs_fs_make_dir(struct afs_fs_cursor *fc, + const char *name, + umode_t mode, + u64 current_data_version, + struct afs_fid *newfid, + struct afs_file_status *newstatus, + struct afs_callback *newcb) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + size_t namesz, reqsz, rplsz; + __be32 *bp; + + _enter(""); + + namesz = strlen(name); + reqsz = (sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(namesz) + + sizeof(struct yfs_xdr_YFSStoreStatus)); + rplsz = (sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSCallBack) + + sizeof(struct yfs_xdr_YFSVolSync)); + + call = afs_alloc_flat_call(net, &yfs_RXFSMakeDir, reqsz, rplsz); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = vnode; + call->reply[1] = newfid; + call->reply[2] = newstatus; + call->reply[3] = newcb; + call->expected_version = current_data_version + 1; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSMAKEDIR); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vnode->fid); + bp = xdr_encode_string(bp, name, namesz); + bp = xdr_encode_YFSStoreStatus_mode(bp, mode); + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to a YFS.RemoveFile2 operation. + */ +static int yfs_deliver_fs_remove_file2(struct afs_call *call) +{ + struct afs_vnode *dvnode = call->reply[0]; + struct afs_vnode *vnode = call->reply[1]; + struct afs_fid fid; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + ret = yfs_decode_status(call, &bp, &dvnode->status, dvnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + + xdr_decode_YFSFid(&bp, &fid); + ret = yfs_decode_status(call, &bp, &vnode->status, vnode, NULL, NULL); + if (ret < 0) + return ret; + /* Was deleted if vnode->status.abort_code == VNOVNODE. */ + + xdr_decode_YFSVolSync(&bp, NULL); + return 0; +} + +/* + * YFS.RemoveFile2 operation type. + */ +static const struct afs_call_type yfs_RXYFSRemoveFile2 = { + .name = "YFS.RemoveFile2", + .op = yfs_FS_RemoveFile2, + .deliver = yfs_deliver_fs_remove_file2, + .destructor = afs_flat_call_destructor, +}; + +/* + * Remove a file and retrieve new file status. + */ +int yfs_fs_remove_file2(struct afs_fs_cursor *fc, struct afs_vnode *vnode, + const char *name, u64 current_data_version) +{ + struct afs_vnode *dvnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(dvnode); + size_t namesz; + __be32 *bp; + + _enter(""); + + namesz = strlen(name); + + call = afs_alloc_flat_call(net, &yfs_RXYFSRemoveFile2, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(namesz), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = dvnode; + call->reply[1] = vnode; + call->expected_version = current_data_version + 1; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSREMOVEFILE2); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &dvnode->fid); + bp = xdr_encode_string(bp, name, namesz); + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &dvnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to a YFS.RemoveFile or YFS.RemoveDir operation. + */ +static int yfs_deliver_fs_remove(struct afs_call *call) +{ + struct afs_vnode *dvnode = call->reply[0]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + ret = yfs_decode_status(call, &bp, &dvnode->status, dvnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + + xdr_decode_YFSVolSync(&bp, NULL); + return 0; +} + +/* + * FS.RemoveDir and FS.RemoveFile operation types. + */ +static const struct afs_call_type yfs_RXYFSRemoveFile = { + .name = "YFS.RemoveFile", + .op = yfs_FS_RemoveFile, + .deliver = yfs_deliver_fs_remove, + .destructor = afs_flat_call_destructor, +}; + +static const struct afs_call_type yfs_RXYFSRemoveDir = { + .name = "YFS.RemoveDir", + .op = yfs_FS_RemoveDir, + .deliver = yfs_deliver_fs_remove, + .destructor = afs_flat_call_destructor, +}; + +/* + * remove a file or directory + */ +int yfs_fs_remove(struct afs_fs_cursor *fc, struct afs_vnode *vnode, + const char *name, bool isdir, u64 current_data_version) +{ + struct afs_vnode *dvnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(dvnode); + size_t namesz; + __be32 *bp; + + _enter(""); + + namesz = strlen(name); + call = afs_alloc_flat_call( + net, isdir ? &yfs_RXYFSRemoveDir : &yfs_RXYFSRemoveFile, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(namesz), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = dvnode; + call->reply[1] = vnode; + call->expected_version = current_data_version + 1; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, isdir ? YFSREMOVEDIR : YFSREMOVEFILE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &dvnode->fid); + bp = xdr_encode_string(bp, name, namesz); + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &dvnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to a YFS.Link operation. + */ +static int yfs_deliver_fs_link(struct afs_call *call) +{ + struct afs_vnode *dvnode = call->reply[0], *vnode = call->reply[1]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + ret = yfs_decode_status(call, &bp, &vnode->status, vnode, NULL, NULL); + if (ret < 0) + return ret; + ret = yfs_decode_status(call, &bp, &dvnode->status, dvnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + xdr_decode_YFSVolSync(&bp, NULL); + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFS.Link operation type. + */ +static const struct afs_call_type yfs_RXYFSLink = { + .name = "YFS.Link", + .op = yfs_FS_Link, + .deliver = yfs_deliver_fs_link, + .destructor = afs_flat_call_destructor, +}; + +/* + * Make a hard link. + */ +int yfs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode, + const char *name, u64 current_data_version) +{ + struct afs_vnode *dvnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + size_t namesz; + __be32 *bp; + + _enter(""); + + namesz = strlen(name); + call = afs_alloc_flat_call(net, &yfs_RXYFSLink, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(namesz) + + sizeof(struct yfs_xdr_YFSFid), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = dvnode; + call->reply[1] = vnode; + call->expected_version = current_data_version + 1; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSLINK); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &dvnode->fid); + bp = xdr_encode_string(bp, name, namesz); + bp = xdr_encode_YFSFid(bp, &vnode->fid); + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to a YFS.Symlink operation. + */ +static int yfs_deliver_fs_symlink(struct afs_call *call) +{ + struct afs_vnode *vnode = call->reply[0]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_YFSFid(&bp, call->reply[1]); + ret = yfs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL); + if (ret < 0) + return ret; + ret = yfs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + xdr_decode_YFSVolSync(&bp, NULL); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFS.Symlink operation type + */ +static const struct afs_call_type yfs_RXYFSSymlink = { + .name = "YFS.Symlink", + .op = yfs_FS_Symlink, + .deliver = yfs_deliver_fs_symlink, + .destructor = afs_flat_call_destructor, +}; + +/* + * Create a symbolic link. + */ +int yfs_fs_symlink(struct afs_fs_cursor *fc, + const char *name, + const char *contents, + u64 current_data_version, + struct afs_fid *newfid, + struct afs_file_status *newstatus) +{ + struct afs_vnode *dvnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(dvnode); + size_t namesz, contents_sz; + __be32 *bp; + + _enter(""); + + namesz = strlen(name); + contents_sz = strlen(contents); + call = afs_alloc_flat_call(net, &yfs_RXYFSSymlink, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(namesz) + + xdr_strlen(contents_sz) + + sizeof(struct yfs_xdr_YFSStoreStatus), + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = dvnode; + call->reply[1] = newfid; + call->reply[2] = newstatus; + call->expected_version = current_data_version + 1; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSSYMLINK); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &dvnode->fid); + bp = xdr_encode_string(bp, name, namesz); + bp = xdr_encode_string(bp, contents, contents_sz); + bp = xdr_encode_YFSStoreStatus_mode(bp, S_IRWXUGO); + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &dvnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to a YFS.Rename operation. + */ +static int yfs_deliver_fs_rename(struct afs_call *call) +{ + struct afs_vnode *orig_dvnode = call->reply[0]; + struct afs_vnode *new_dvnode = call->reply[1]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + ret = yfs_decode_status(call, &bp, &orig_dvnode->status, orig_dvnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + if (new_dvnode != orig_dvnode) { + ret = yfs_decode_status(call, &bp, &new_dvnode->status, new_dvnode, + &call->expected_version_2, NULL); + if (ret < 0) + return ret; + } + + xdr_decode_YFSVolSync(&bp, NULL); + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFS.Rename operation type + */ +static const struct afs_call_type yfs_RXYFSRename = { + .name = "FS.Rename", + .op = yfs_FS_Rename, + .deliver = yfs_deliver_fs_rename, + .destructor = afs_flat_call_destructor, +}; + +/* + * Rename a file or directory. + */ +int yfs_fs_rename(struct afs_fs_cursor *fc, + const char *orig_name, + struct afs_vnode *new_dvnode, + const char *new_name, + u64 current_orig_data_version, + u64 current_new_data_version) +{ + struct afs_vnode *orig_dvnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(orig_dvnode); + size_t o_namesz, n_namesz; + __be32 *bp; + + _enter(""); + + o_namesz = strlen(orig_name); + n_namesz = strlen(new_name); + call = afs_alloc_flat_call(net, &yfs_RXYFSRename, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(o_namesz) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(n_namesz), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = orig_dvnode; + call->reply[1] = new_dvnode; + call->expected_version = current_orig_data_version + 1; + call->expected_version_2 = current_new_data_version + 1; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRENAME); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &orig_dvnode->fid); + bp = xdr_encode_string(bp, orig_name, o_namesz); + bp = xdr_encode_YFSFid(bp, &new_dvnode->fid); + bp = xdr_encode_string(bp, new_name, n_namesz); + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &orig_dvnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to a YFS.StoreData64 operation. + */ +static int yfs_deliver_fs_store_data(struct afs_call *call) +{ + struct afs_vnode *vnode = call->reply[0]; + const __be32 *bp; + int ret; + + _enter(""); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + ret = yfs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + xdr_decode_YFSVolSync(&bp, NULL); + + afs_pages_written_back(vnode, call); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFS.StoreData64 operation type. + */ +static const struct afs_call_type yfs_RXYFSStoreData64 = { + .name = "YFS.StoreData64", + .op = yfs_FS_StoreData64, + .deliver = yfs_deliver_fs_store_data, + .destructor = afs_flat_call_destructor, +}; + +/* + * Store a set of pages to a large file. + */ +int yfs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping, + pgoff_t first, pgoff_t last, + unsigned offset, unsigned to) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + loff_t size, pos, i_size; + __be32 *bp; + + _enter(",%x,{%llx:%llu},,", + key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); + + size = (loff_t)to - (loff_t)offset; + if (first != last) + size += (loff_t)(last - first) << PAGE_SHIFT; + pos = (loff_t)first << PAGE_SHIFT; + pos += offset; + + i_size = i_size_read(&vnode->vfs_inode); + if (pos + size > i_size) + i_size = size + pos; + + _debug("size %llx, at %llx, i_size %llx", + (unsigned long long)size, (unsigned long long)pos, + (unsigned long long)i_size); + + call = afs_alloc_flat_call(net, &yfs_RXYFSStoreData64, + sizeof(__be32) + + sizeof(__be32) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSStoreStatus) + + sizeof(struct yfs_xdr_u64) * 3, + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->mapping = mapping; + call->reply[0] = vnode; + call->first = first; + call->last = last; + call->first_offset = offset; + call->last_to = to; + call->send_pages = true; + call->expected_version = vnode->status.data_version + 1; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSSTOREDATA64); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vnode->fid); + bp = xdr_encode_YFSStoreStatus_mtime(bp, &vnode->vfs_inode.i_mtime); + bp = xdr_encode_u64(bp, pos); + bp = xdr_encode_u64(bp, size); + bp = xdr_encode_u64(bp, i_size); + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * deliver reply data to an FS.StoreStatus + */ +static int yfs_deliver_fs_store_status(struct afs_call *call) +{ + struct afs_vnode *vnode = call->reply[0]; + const __be32 *bp; + int ret; + + _enter(""); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + ret = yfs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + xdr_decode_YFSVolSync(&bp, NULL); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFS.StoreStatus operation type + */ +static const struct afs_call_type yfs_RXYFSStoreStatus = { + .name = "YFS.StoreStatus", + .op = yfs_FS_StoreStatus, + .deliver = yfs_deliver_fs_store_status, + .destructor = afs_flat_call_destructor, +}; + +static const struct afs_call_type yfs_RXYFSStoreData64_as_Status = { + .name = "YFS.StoreData64", + .op = yfs_FS_StoreData64, + .deliver = yfs_deliver_fs_store_status, + .destructor = afs_flat_call_destructor, +}; + +/* + * Set the attributes on a file, using YFS.StoreData64 rather than + * YFS.StoreStatus so as to alter the file size also. + */ +static int yfs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + __be32 *bp; + + _enter(",%x,{%llx:%llu},,", + key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); + + call = afs_alloc_flat_call(net, &yfs_RXYFSStoreData64_as_Status, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSStoreStatus) + + sizeof(struct yfs_xdr_u64) * 3, + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = vnode; + call->expected_version = vnode->status.data_version + 1; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSSTOREDATA64); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vnode->fid); + bp = xdr_encode_YFS_StoreStatus(bp, attr); + bp = xdr_encode_u64(bp, 0); /* position of start of write */ + bp = xdr_encode_u64(bp, 0); /* size of write */ + bp = xdr_encode_u64(bp, attr->ia_size); /* new file length */ + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Set the attributes on a file, using YFS.StoreData64 if there's a change in + * file size, and YFS.StoreStatus otherwise. + */ +int yfs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + __be32 *bp; + + if (attr->ia_valid & ATTR_SIZE) + return yfs_fs_setattr_size(fc, attr); + + _enter(",%x,{%llx:%llu},,", + key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); + + call = afs_alloc_flat_call(net, &yfs_RXYFSStoreStatus, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSStoreStatus), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = vnode; + call->expected_version = vnode->status.data_version; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSSTORESTATUS); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vnode->fid); + bp = xdr_encode_YFS_StoreStatus(bp, attr); + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to a YFS.GetVolumeStatus operation. + */ +static int yfs_deliver_fs_get_volume_status(struct afs_call *call) +{ + const __be32 *bp; + char *p; + u32 size; + int ret; + + _enter("{%u}", call->unmarshall); + + switch (call->unmarshall) { + case 0: + call->unmarshall++; + afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchVolumeStatus)); + + /* extract the returned status record */ + case 1: + _debug("extract status"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + bp = call->buffer; + xdr_decode_YFSFetchVolumeStatus(&bp, call->reply[1]); + call->unmarshall++; + afs_extract_to_tmp(call); + + /* extract the volume name length */ + case 2: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + call->count = ntohl(call->tmp); + _debug("volname length: %u", call->count); + if (call->count >= AFSNAMEMAX) + return afs_protocol_error(call, -EBADMSG, + afs_eproto_volname_len); + size = (call->count + 3) & ~3; /* It's padded */ + afs_extract_begin(call, call->reply[2], size); + call->unmarshall++; + + /* extract the volume name */ + case 3: + _debug("extract volname"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + p = call->reply[2]; + p[call->count] = 0; + _debug("volname '%s'", p); + afs_extract_to_tmp(call); + call->unmarshall++; + + /* extract the offline message length */ + case 4: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + call->count = ntohl(call->tmp); + _debug("offline msg length: %u", call->count); + if (call->count >= AFSNAMEMAX) + return afs_protocol_error(call, -EBADMSG, + afs_eproto_offline_msg_len); + size = (call->count + 3) & ~3; /* It's padded */ + afs_extract_begin(call, call->reply[2], size); + call->unmarshall++; + + /* extract the offline message */ + case 5: + _debug("extract offline"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + p = call->reply[2]; + p[call->count] = 0; + _debug("offline '%s'", p); + + afs_extract_to_tmp(call); + call->unmarshall++; + + /* extract the message of the day length */ + case 6: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + call->count = ntohl(call->tmp); + _debug("motd length: %u", call->count); + if (call->count >= AFSNAMEMAX) + return afs_protocol_error(call, -EBADMSG, + afs_eproto_motd_len); + size = (call->count + 3) & ~3; /* It's padded */ + afs_extract_begin(call, call->reply[2], size); + call->unmarshall++; + + /* extract the message of the day */ + case 7: + _debug("extract motd"); + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; + + p = call->reply[2]; + p[call->count] = 0; + _debug("motd '%s'", p); + + call->unmarshall++; + + case 8: + break; + } + + _leave(" = 0 [done]"); + return 0; +} + +/* + * Destroy a YFS.GetVolumeStatus call. + */ +static void yfs_get_volume_status_call_destructor(struct afs_call *call) +{ + kfree(call->reply[2]); + call->reply[2] = NULL; + afs_flat_call_destructor(call); +} + +/* + * YFS.GetVolumeStatus operation type + */ +static const struct afs_call_type yfs_RXYFSGetVolumeStatus = { + .name = "YFS.GetVolumeStatus", + .op = yfs_FS_GetVolumeStatus, + .deliver = yfs_deliver_fs_get_volume_status, + .destructor = yfs_get_volume_status_call_destructor, +}; + +/* + * fetch the status of a volume + */ +int yfs_fs_get_volume_status(struct afs_fs_cursor *fc, + struct afs_volume_status *vs) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + __be32 *bp; + void *tmpbuf; + + _enter(""); + + tmpbuf = kmalloc(AFSOPAQUEMAX, GFP_KERNEL); + if (!tmpbuf) + return -ENOMEM; + + call = afs_alloc_flat_call(net, &yfs_RXYFSGetVolumeStatus, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_u64), + sizeof(struct yfs_xdr_YFSFetchVolumeStatus) + + sizeof(__be32)); + if (!call) { + kfree(tmpbuf); + return -ENOMEM; + } + + call->key = fc->key; + call->reply[0] = vnode; + call->reply[1] = vs; + call->reply[2] = tmpbuf; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSGETVOLUMESTATUS); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_u64(bp, vnode->fid.vid); + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to an YFS.SetLock, YFS.ExtendLock or YFS.ReleaseLock + */ +static int yfs_deliver_fs_xxxx_lock(struct afs_call *call) +{ + struct afs_vnode *vnode = call->reply[0]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + ret = yfs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + xdr_decode_YFSVolSync(&bp, NULL); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFS.SetLock operation type + */ +static const struct afs_call_type yfs_RXYFSSetLock = { + .name = "YFS.SetLock", + .op = yfs_FS_SetLock, + .deliver = yfs_deliver_fs_xxxx_lock, + .destructor = afs_flat_call_destructor, +}; + +/* + * YFS.ExtendLock operation type + */ +static const struct afs_call_type yfs_RXYFSExtendLock = { + .name = "YFS.ExtendLock", + .op = yfs_FS_ExtendLock, + .deliver = yfs_deliver_fs_xxxx_lock, + .destructor = afs_flat_call_destructor, +}; + +/* + * YFS.ReleaseLock operation type + */ +static const struct afs_call_type yfs_RXYFSReleaseLock = { + .name = "YFS.ReleaseLock", + .op = yfs_FS_ReleaseLock, + .deliver = yfs_deliver_fs_xxxx_lock, + .destructor = afs_flat_call_destructor, +}; + +/* + * Set a lock on a file + */ +int yfs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(net, &yfs_RXYFSSetLock, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(__be32), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = vnode; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSSETLOCK); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vnode->fid); + bp = xdr_encode_u32(bp, type); + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * extend a lock on a file + */ +int yfs_fs_extend_lock(struct afs_fs_cursor *fc) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(net, &yfs_RXYFSExtendLock, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = vnode; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSEXTENDLOCK); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vnode->fid); + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * release a lock on a file + */ +int yfs_fs_release_lock(struct afs_fs_cursor *fc) +{ + struct afs_vnode *vnode = fc->vnode; + struct afs_call *call; + struct afs_net *net = afs_v2net(vnode); + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(net, &yfs_RXYFSReleaseLock, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return -ENOMEM; + + call->key = fc->key; + call->reply[0] = vnode; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRELEASELOCK); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vnode->fid); + yfs_check_req(call, bp); + + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &vnode->fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to an FS.FetchStatus with no vnode. + */ +static int yfs_deliver_fs_fetch_status(struct afs_call *call) +{ + struct afs_file_status *status = call->reply[1]; + struct afs_callback *callback = call->reply[2]; + struct afs_volsync *volsync = call->reply[3]; + struct afs_vnode *vnode = call->reply[0]; + const __be32 *bp; + int ret; + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + ret = yfs_decode_status(call, &bp, status, vnode, + &call->expected_version, NULL); + if (ret < 0) + return ret; + xdr_decode_YFSCallBack_raw(&bp, callback); + xdr_decode_YFSVolSync(&bp, volsync); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFS.FetchStatus operation type + */ +static const struct afs_call_type yfs_RXYFSFetchStatus = { + .name = "YFS.FetchStatus", + .op = yfs_FS_FetchStatus, + .deliver = yfs_deliver_fs_fetch_status, + .destructor = afs_flat_call_destructor, +}; + +/* + * Fetch the status information for a fid without needing a vnode handle. + */ +int yfs_fs_fetch_status(struct afs_fs_cursor *fc, + struct afs_net *net, + struct afs_fid *fid, + struct afs_file_status *status, + struct afs_callback *callback, + struct afs_volsync *volsync) +{ + struct afs_call *call; + __be32 *bp; + + _enter(",%x,{%llx:%llu},,", + key_serial(fc->key), fid->vid, fid->vnode); + + call = afs_alloc_flat_call(net, &yfs_RXYFSFetchStatus, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSCallBack) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) { + fc->ac.error = -ENOMEM; + return -ENOMEM; + } + + call->key = fc->key; + call->reply[0] = NULL; /* vnode for fid[0] */ + call->reply[1] = status; + call->reply[2] = callback; + call->reply[3] = volsync; + call->expected_version = 1; /* vnode->status.data_version */ + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSFETCHSTATUS); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, fid); + yfs_check_req(call, bp); + + call->cb_break = fc->cb_break; + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to an YFS.InlineBulkStatus call + */ +static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) +{ + struct afs_file_status *statuses; + struct afs_callback *callbacks; + struct afs_vnode *vnode = call->reply[0]; + const __be32 *bp; + u32 tmp; + int ret; + + _enter("{%u}", call->unmarshall); + + switch (call->unmarshall) { + case 0: + afs_extract_to_tmp(call); + call->unmarshall++; + + /* Extract the file status count and array in two steps */ + case 1: + _debug("extract status count"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + tmp = ntohl(call->tmp); + _debug("status count: %u/%u", tmp, call->count2); + if (tmp != call->count2) + return afs_protocol_error(call, -EBADMSG, + afs_eproto_ibulkst_count); + + call->count = 0; + call->unmarshall++; + more_counts: + afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchStatus)); + + case 2: + _debug("extract status array %u", call->count); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + bp = call->buffer; + statuses = call->reply[1]; + ret = yfs_decode_status(call, &bp, &statuses[call->count], + call->count == 0 ? vnode : NULL, + NULL, NULL); + if (ret < 0) + return ret; + + call->count++; + if (call->count < call->count2) + goto more_counts; + + call->count = 0; + call->unmarshall++; + afs_extract_to_tmp(call); + + /* Extract the callback count and array in two steps */ + case 3: + _debug("extract CB count"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + tmp = ntohl(call->tmp); + _debug("CB count: %u", tmp); + if (tmp != call->count2) + return afs_protocol_error(call, -EBADMSG, + afs_eproto_ibulkst_cb_count); + call->count = 0; + call->unmarshall++; + more_cbs: + afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSCallBack)); + + case 4: + _debug("extract CB array"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + _debug("unmarshall CB array"); + bp = call->buffer; + callbacks = call->reply[2]; + xdr_decode_YFSCallBack_raw(&bp, &callbacks[call->count]); + statuses = call->reply[1]; + if (call->count == 0 && vnode && statuses[0].abort_code == 0) { + bp = call->buffer; + xdr_decode_YFSCallBack(call, vnode, &bp); + } + call->count++; + if (call->count < call->count2) + goto more_cbs; + + afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSVolSync)); + call->unmarshall++; + + case 5: + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; + + bp = call->buffer; + xdr_decode_YFSVolSync(&bp, call->reply[3]); + + call->unmarshall++; + + case 6: + break; + } + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.InlineBulkStatus operation type + */ +static const struct afs_call_type yfs_RXYFSInlineBulkStatus = { + .name = "YFS.InlineBulkStatus", + .op = yfs_FS_InlineBulkStatus, + .deliver = yfs_deliver_fs_inline_bulk_status, + .destructor = afs_flat_call_destructor, +}; + +/* + * Fetch the status information for up to 1024 files + */ +int yfs_fs_inline_bulk_status(struct afs_fs_cursor *fc, + struct afs_net *net, + struct afs_fid *fids, + struct afs_file_status *statuses, + struct afs_callback *callbacks, + unsigned int nr_fids, + struct afs_volsync *volsync) +{ + struct afs_call *call; + __be32 *bp; + int i; + + _enter(",%x,{%llx:%llu},%u", + key_serial(fc->key), fids[0].vid, fids[1].vnode, nr_fids); + + call = afs_alloc_flat_call(net, &yfs_RXYFSInlineBulkStatus, + sizeof(__be32) + + sizeof(__be32) + + sizeof(__be32) + + sizeof(struct yfs_xdr_YFSFid) * nr_fids, + sizeof(struct yfs_xdr_YFSFetchStatus)); + if (!call) { + fc->ac.error = -ENOMEM; + return -ENOMEM; + } + + call->key = fc->key; + call->reply[0] = NULL; /* vnode for fid[0] */ + call->reply[1] = statuses; + call->reply[2] = callbacks; + call->reply[3] = volsync; + call->count2 = nr_fids; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSINLINEBULKSTATUS); + bp = xdr_encode_u32(bp, 0); /* RPCFlags */ + bp = xdr_encode_u32(bp, nr_fids); + for (i = 0; i < nr_fids; i++) + bp = xdr_encode_YFSFid(bp, &fids[i]); + yfs_check_req(call, bp); + + call->cb_break = fc->cb_break; + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &fids[0]); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} @@ -2135,12 +2135,12 @@ COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, compat_long_t, min_nr, compat_long_t, nr, struct io_event __user *, events, - struct compat_timespec __user *, timeout) + struct old_timespec32 __user *, timeout) { struct timespec64 t; int ret; - if (timeout && compat_get_timespec64(&t, timeout)) + if (timeout && get_old_timespec32(&t, timeout)) return -EFAULT; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); @@ -2160,7 +2160,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents, compat_long_t, min_nr, compat_long_t, nr, struct io_event __user *, events, - struct compat_timespec __user *, timeout, + struct old_timespec32 __user *, timeout, const struct __compat_aio_sigset __user *, usig) { struct __compat_aio_sigset ksig = { NULL, }; @@ -2168,7 +2168,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents, struct timespec64 t; int ret; - if (timeout && compat_get_timespec64(&t, timeout)) + if (timeout && get_old_timespec32(&t, timeout)) return -EFAULT; if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) diff --git a/fs/block_dev.c b/fs/block_dev.c index 38b8ce05cbc7..a80b4f0ee7c4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -349,7 +349,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) dio->size = 0; dio->multi_bio = false; - dio->should_dirty = is_read && (iter->type == ITER_IOVEC); + dio->should_dirty = is_read && iter_is_iovec(iter); blk_start_plug(&plug); for (;;) { diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 8703ce68fe9d..2955a4ea2fa8 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -437,10 +437,8 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (pg_index > end_index) break; - rcu_read_lock(); - page = radix_tree_lookup(&mapping->i_pages, pg_index); - rcu_read_unlock(); - if (page && !radix_tree_exceptional_entry(page)) { + page = xa_load(&mapping->i_pages, pg_index); + if (page && !xa_is_value(page)) { misses++; if (misses > 4) break; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 2ee43b6a4f09..539901fb5165 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1014,9 +1014,26 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent) parent_start = parent->start; + /* + * If we are COWing a node/leaf from the extent, chunk or device trees, + * make sure that we do not finish block group creation of pending block + * groups. We do this to avoid a deadlock. + * COWing can result in allocation of a new chunk, and flushing pending + * block groups (btrfs_create_pending_block_groups()) can be triggered + * when finishing allocation of a new chunk. Creation of a pending block + * group modifies the extent, chunk and device trees, therefore we could + * deadlock with ourselves since we are holding a lock on an extent + * buffer that btrfs_create_pending_block_groups() may try to COW later. + */ + if (root == fs_info->extent_root || + root == fs_info->chunk_root || + root == fs_info->dev_root) + trans->can_flush_pending_bgs = false; + cow = btrfs_alloc_tree_block(trans, root, parent_start, root->root_key.objectid, &disk_key, level, search_start, empty_size); + trans->can_flush_pending_bgs = true; if (IS_ERR(cow)) return PTR_ERR(cow); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 68ca41dbbef3..80953528572d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3201,9 +3201,6 @@ void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); -int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff, - struct file *dst_file, loff_t dst_loff, - u64 olen); /* file.c */ int __init btrfs_auto_defrag_init(void); @@ -3233,8 +3230,9 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); -int btrfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len); +loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 5149165b49a4..9301b3ad9217 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -164,14 +164,27 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, return NULL; } +static struct btrfs_delayed_ref_head *find_first_ref_head( + struct btrfs_delayed_ref_root *dr) +{ + struct rb_node *n; + struct btrfs_delayed_ref_head *entry; + + n = rb_first_cached(&dr->href_root); + if (!n) + return NULL; + + entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); + + return entry; +} + /* - * find an head entry based on bytenr. This returns the delayed ref - * head if it was able to find one, or NULL if nothing was in that spot. - * If return_bigger is given, the next bigger entry is returned if no exact - * match is found. But if no bigger one is found then the first node of the - * ref head tree will be returned. + * Find a head entry based on bytenr. This returns the delayed ref head if it + * was able to find one, or NULL if nothing was in that spot. If return_bigger + * is given, the next bigger entry is returned if no exact match is found. */ -static struct btrfs_delayed_ref_head* find_ref_head( +static struct btrfs_delayed_ref_head *find_ref_head( struct btrfs_delayed_ref_root *dr, u64 bytenr, bool return_bigger) { @@ -195,10 +208,9 @@ static struct btrfs_delayed_ref_head* find_ref_head( if (bytenr > entry->bytenr) { n = rb_next(&entry->href_node); if (!n) - n = rb_first_cached(&dr->href_root); + return NULL; entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - return entry; } return entry; } @@ -355,33 +367,25 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head( struct btrfs_delayed_ref_root *delayed_refs) { struct btrfs_delayed_ref_head *head; - u64 start; - bool loop = false; again: - start = delayed_refs->run_delayed_start; - head = find_ref_head(delayed_refs, start, true); - if (!head && !loop) { + head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, + true); + if (!head && delayed_refs->run_delayed_start != 0) { delayed_refs->run_delayed_start = 0; - start = 0; - loop = true; - head = find_ref_head(delayed_refs, start, true); - if (!head) - return NULL; - } else if (!head && loop) { - return NULL; + head = find_first_ref_head(delayed_refs); } + if (!head) + return NULL; while (head->processing) { struct rb_node *node; node = rb_next(&head->href_node); if (!node) { - if (loop) + if (delayed_refs->run_delayed_start == 0) return NULL; delayed_refs->run_delayed_start = 0; - start = 0; - loop = true; goto again; } head = rb_entry(node, struct btrfs_delayed_ref_head, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a4cd0221bc8d..a1febf155747 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2366,6 +2366,9 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, insert_reserved); else BUG(); + if (ret && insert_reserved) + btrfs_pin_extent(trans->fs_info, node->bytenr, + node->num_bytes, 1); return ret; } @@ -2954,7 +2957,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head; int ret; int run_all = count == (unsigned long)-1; - bool can_flush_pending_bgs = trans->can_flush_pending_bgs; /* We'll clean this up in btrfs_cleanup_transaction */ if (trans->aborted) @@ -2971,7 +2973,6 @@ again: #ifdef SCRAMBLE_DELAYED_REFS delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif - trans->can_flush_pending_bgs = false; ret = __btrfs_run_delayed_refs(trans, count); if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -3002,7 +3003,6 @@ again: goto again; } out: - trans->can_flush_pending_bgs = can_flush_pending_bgs; return 0; } @@ -4568,6 +4568,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, goto out; } else { ret = 1; + space_info->max_extent_size = 0; } space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; @@ -4589,11 +4590,9 @@ out: * the block groups that were made dirty during the lifetime of the * transaction. */ - if (trans->can_flush_pending_bgs && - trans->chunk_bytes_reserved >= (u64)SZ_2M) { + if (trans->chunk_bytes_reserved >= (u64)SZ_2M) btrfs_create_pending_block_groups(trans); - btrfs_trans_release_chunk_metadata(trans); - } + return ret; } @@ -6464,6 +6463,7 @@ static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, space_info->bytes_readonly += num_bytes; cache->reserved -= num_bytes; space_info->bytes_reserved -= num_bytes; + space_info->max_extent_size = 0; if (delalloc) cache->delalloc_bytes -= num_bytes; @@ -7260,6 +7260,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group = NULL; u64 search_start = 0; u64 max_extent_size = 0; + u64 max_free_space = 0; u64 empty_cluster = 0; struct btrfs_space_info *space_info; int loop = 0; @@ -7555,8 +7556,8 @@ unclustered_alloc: spin_lock(&ctl->tree_lock); if (ctl->free_space < num_bytes + empty_cluster + empty_size) { - if (ctl->free_space > max_extent_size) - max_extent_size = ctl->free_space; + max_free_space = max(max_free_space, + ctl->free_space); spin_unlock(&ctl->tree_lock); goto loop; } @@ -7723,6 +7724,8 @@ loop: } out: if (ret == -ENOSPC) { + if (!max_extent_size) + max_extent_size = max_free_space; spin_lock(&space_info->lock); space_info->max_extent_size = max_extent_size; spin_unlock(&space_info->lock); @@ -8004,21 +8007,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, } path = btrfs_alloc_path(); - if (!path) { - btrfs_free_and_pin_reserved_extent(fs_info, - extent_key.objectid, - fs_info->nodesize); + if (!path) return -ENOMEM; - } path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, &extent_key, size); if (ret) { btrfs_free_path(path); - btrfs_free_and_pin_reserved_extent(fs_info, - extent_key.objectid, - fs_info->nodesize); return ret; } @@ -10132,9 +10128,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) struct btrfs_block_group_item item; struct btrfs_key key; int ret = 0; - bool can_flush_pending_bgs = trans->can_flush_pending_bgs; - trans->can_flush_pending_bgs = false; + if (!trans->can_flush_pending_bgs) + return; + while (!list_empty(&trans->new_bgs)) { block_group = list_first_entry(&trans->new_bgs, struct btrfs_block_group_cache, @@ -10159,7 +10156,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) next: list_del_init(&block_group->bg_list); } - trans->can_flush_pending_bgs = can_flush_pending_bgs; + btrfs_trans_release_chunk_metadata(trans); } int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6877a74c7469..d228f706ff3e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3784,7 +3784,7 @@ int btree_write_cache_pages(struct address_space *mapping, pgoff_t index; pgoff_t end; /* Inclusive */ int scanned = 0; - int tag; + xa_mark_t tag; pagevec_init(&pvec); if (wbc->range_cyclic) { @@ -3909,7 +3909,7 @@ static int extent_write_cache_pages(struct address_space *mapping, pgoff_t done_index; int range_whole = 0; int scanned = 0; - int tag; + xa_mark_t tag; /* * We have to hold onto the inode so that ordered extents can do their @@ -5159,11 +5159,9 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb) clear_page_dirty_for_io(page); xa_lock_irq(&page->mapping->i_pages); - if (!PageDirty(page)) { - radix_tree_tag_clear(&page->mapping->i_pages, - page_index(page), - PAGECACHE_TAG_DIRTY); - } + if (!PageDirty(page)) + __xa_clear_mark(&page->mapping->i_pages, + page_index(page), PAGECACHE_TAG_DIRTY); xa_unlock_irq(&page->mapping->i_pages); ClearPageError(page); unlock_page(page); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 15b925142793..a3c22e16509b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2078,6 +2078,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; inode_lock(inode); + + /* + * We take the dio_sem here because the tree log stuff can race with + * lockless dio writes and get an extent map logged for an extent we + * never waited on. We need it this high up for lockdep reasons. + */ + down_write(&BTRFS_I(inode)->dio_sem); + atomic_inc(&root->log_batch); /* @@ -2086,6 +2094,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = btrfs_wait_ordered_range(inode, start, len); if (ret) { + up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); goto out; } @@ -2109,6 +2118,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * checked called fsync. */ ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err); + up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); goto out; } @@ -2127,6 +2137,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); goto out; } @@ -2148,6 +2159,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ + up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); /* @@ -3286,8 +3298,7 @@ const struct file_operations btrfs_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_compat_ioctl, #endif - .clone_file_range = btrfs_clone_file_range, - .dedupe_file_range = btrfs_dedupe_file_range, + .remap_file_range = btrfs_remap_file_range, }; void __cold btrfs_auto_defrag_exit(void) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 67441219d6c9..4ba0aedc878b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1772,6 +1772,13 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, return -1; } +static inline u64 get_max_extent_size(struct btrfs_free_space *entry) +{ + if (entry->bitmap) + return entry->max_extent_size; + return entry->bytes; +} + /* Cache the size of the max extent in bytes */ static struct btrfs_free_space * find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, @@ -1793,8 +1800,8 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, for (node = &entry->offset_index; node; node = rb_next(node)) { entry = rb_entry(node, struct btrfs_free_space, offset_index); if (entry->bytes < *bytes) { - if (entry->bytes > *max_extent_size) - *max_extent_size = entry->bytes; + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); continue; } @@ -1812,8 +1819,8 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, } if (entry->bytes < *bytes + align_off) { - if (entry->bytes > *max_extent_size) - *max_extent_size = entry->bytes; + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); continue; } @@ -1825,8 +1832,10 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, *offset = tmp; *bytes = size; return entry; - } else if (size > *max_extent_size) { - *max_extent_size = size; + } else { + *max_extent_size = + max(get_max_extent_size(entry), + *max_extent_size); } continue; } @@ -2449,6 +2458,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, struct rb_node *n; int count = 0; + spin_lock(&ctl->tree_lock); for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes && !block_group->ro) @@ -2457,6 +2467,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, info->offset, info->bytes, (info->bitmap) ? "yes" : "no"); } + spin_unlock(&ctl->tree_lock); btrfs_info(fs_info, "block group has cluster?: %s", list_empty(&block_group->cluster_list) ? "no" : "yes"); btrfs_info(fs_info, @@ -2685,8 +2696,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); if (err) { - if (search_bytes > *max_extent_size) - *max_extent_size = search_bytes; + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); return 0; } @@ -2723,8 +2734,9 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, entry = rb_entry(node, struct btrfs_free_space, offset_index); while (1) { - if (entry->bytes < bytes && entry->bytes > *max_extent_size) - *max_extent_size = entry->bytes; + if (entry->bytes < bytes) + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); if (entry->bytes < bytes || (!entry->bitmap && entry->offset < min_start)) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f22f77172c5f..d3df5b52278c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -502,6 +502,7 @@ again: pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { /* just bail out to the uncompressed code */ + nr_pages = 0; goto cont; } @@ -2940,6 +2941,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) bool truncated = false; bool range_locked = false; bool clear_new_delalloc_bytes = false; + bool clear_reserved_extent = true; if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && @@ -3043,10 +3045,12 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) logical_len, logical_len, compress_type, 0, 0, BTRFS_FILE_EXTENT_REG); - if (!ret) + if (!ret) { + clear_reserved_extent = false; btrfs_release_delalloc_bytes(fs_info, ordered_extent->start, ordered_extent->disk_len); + } } unpin_extent_cache(&BTRFS_I(inode)->extent_tree, ordered_extent->file_offset, ordered_extent->len, @@ -3107,8 +3111,13 @@ out: * wrong we need to return the space for this ordered extent * back to the allocator. We only free the extent in the * truncated case if we didn't write out the extent at all. + * + * If we made it past insert_reserved_file_extent before we + * errored out then we don't need to do this as the accounting + * has already been done. */ if ((ret || !logical_len) && + clear_reserved_extent && !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) btrfs_free_reserved_extent(fs_info, @@ -5259,11 +5268,13 @@ static void evict_inode_truncate_pages(struct inode *inode) struct extent_state *cached_state = NULL; u64 start; u64 end; + unsigned state_flags; node = rb_first(&io_tree->state); state = rb_entry(node, struct extent_state, rb_node); start = state->start; end = state->end; + state_flags = state->state; spin_unlock(&io_tree->lock); lock_extent_bits(io_tree, start, end, &cached_state); @@ -5276,7 +5287,7 @@ static void evict_inode_truncate_pages(struct inode *inode) * * Note, end is the bytenr of last byte, so we need + 1 here. */ - if (state->state & EXTENT_DELALLOC) + if (state_flags & EXTENT_DELALLOC) btrfs_qgroup_free_data(inode, NULL, start, end - start + 1); clear_extent_bit(io_tree, start, end, @@ -5764,16 +5775,10 @@ static int btrfs_dentry_delete(const struct dentry *dentry) static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct inode *inode; - - inode = btrfs_lookup_dentry(dir, dentry); - if (IS_ERR(inode)) { - if (PTR_ERR(inode) == -ENOENT) - inode = NULL; - else - return ERR_CAST(inode); - } + struct inode *inode = btrfs_lookup_dentry(dir, dentry); + if (inode == ERR_PTR(-ENOENT)) + inode = NULL; return d_splice_alias(inode, dentry); } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a990a9045139..3ca6943827ef 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3629,26 +3629,6 @@ out_unlock: return ret; } -int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff, - struct file *dst_file, loff_t dst_loff, - u64 olen) -{ - struct inode *src = file_inode(src_file); - struct inode *dst = file_inode(dst_file); - u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; - - if (WARN_ON_ONCE(bs < PAGE_SIZE)) { - /* - * Btrfs does not support blocksize < page_size. As a - * result, btrfs_cmp_data() won't correctly handle - * this situation without an update. - */ - return -EINVAL; - } - - return btrfs_extent_same(src, src_loff, olen, dst, dst_loff); -} - static int clone_finish_inode_update(struct btrfs_trans_handle *trans, struct inode *inode, u64 endoff, @@ -4350,10 +4330,34 @@ out_unlock: return ret; } -int btrfs_clone_file_range(struct file *src_file, loff_t off, - struct file *dst_file, loff_t destoff, u64 len) +loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, + struct file *dst_file, loff_t destoff, loff_t len, + unsigned int remap_flags) { - return btrfs_clone_files(dst_file, src_file, off, len, destoff); + int ret; + + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) + return -EINVAL; + + if (remap_flags & REMAP_FILE_DEDUP) { + struct inode *src = file_inode(src_file); + struct inode *dst = file_inode(dst_file); + u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; + + if (WARN_ON_ONCE(bs < PAGE_SIZE)) { + /* + * Btrfs does not support blocksize < page_size. As a + * result, btrfs_cmp_data() won't correctly handle + * this situation without an update. + */ + return -EINVAL; + } + + ret = btrfs_extent_same(src, off, len, dst, destoff); + } else { + ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); + } + return ret < 0 ? ret : len; } static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5686290a50e1..d1eeef9ec5da 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2283,15 +2283,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) kmem_cache_free(btrfs_trans_handle_cachep, trans); - /* - * If fs has been frozen, we can not handle delayed iputs, otherwise - * it'll result in deadlock about SB_FREEZE_FS. - */ - if (current != fs_info->transaction_kthread && - current != fs_info->cleaner_kthread && - !test_bit(BTRFS_FS_FROZEN, &fs_info->flags)) - btrfs_run_delayed_iputs(fs_info); - return ret; scrub_continue: diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 0dba09334a16..e07f3376b7df 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4390,7 +4390,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&extents); - down_write(&inode->dio_sem); write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; logged_start = start; @@ -4456,7 +4455,6 @@ process: } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); - up_write(&inode->dio_sem); btrfs_release_path(path); if (!ret) @@ -4652,7 +4650,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, ASSERT(len == i_size || (len == fs_info->sectorsize && btrfs_file_extent_compression(leaf, extent) != - BTRFS_COMPRESS_NONE)); + BTRFS_COMPRESS_NONE) || + (len < i_size && i_size < fs_info->sectorsize)); return 0; } diff --git a/fs/buffer.c b/fs/buffer.c index 6f1ae3ac9789..1286c2b95498 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -562,7 +562,7 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) EXPORT_SYMBOL(mark_buffer_dirty_inode); /* - * Mark the page dirty, and set it dirty in the radix tree, and mark the inode + * Mark the page dirty, and set it dirty in the page cache, and mark the inode * dirty. * * If warn is true, then emit a warning if the page is not uptodate and has @@ -579,8 +579,8 @@ void __set_page_dirty(struct page *page, struct address_space *mapping, if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->i_pages, - page_index(page), PAGECACHE_TAG_DIRTY); + __xa_set_mark(&mapping->i_pages, page_index(page), + PAGECACHE_TAG_DIRTY); } xa_unlock_irqrestore(&mapping->i_pages, flags); } @@ -1050,7 +1050,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, * The relationship between dirty buffers and dirty pages: * * Whenever a page has any dirty buffers, the page's dirty bit is set, and - * the page is tagged dirty in its radix tree. + * the page is tagged dirty in the page cache. * * At all times, the dirtiness of the buffers represents the dirtiness of * subsections of the page. If the page has buffers, the page dirty bit is @@ -1073,9 +1073,9 @@ __getblk_slow(struct block_device *bdev, sector_t block, * mark_buffer_dirty - mark a buffer_head as needing writeout * @bh: the buffer_head to mark dirty * - * mark_buffer_dirty() will set the dirty bit against the buffer, then set its - * backing page dirty, then tag the page as dirty in its address_space's radix - * tree and then attach the address_space's inode to its superblock's dirty + * mark_buffer_dirty() will set the dirty bit against the buffer, then set + * its backing page dirty, then tag the page as dirty in the page cache + * and then attach the address_space's inode to its superblock's dirty * inode list. * * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 027408d55aee..5f0103f40079 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -104,6 +104,11 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) struct timespec64 old_ctime = inode->i_ctime; umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; + if (ceph_snap(inode) != CEPH_NOSNAP) { + ret = -EROFS; + goto out; + } + switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; @@ -138,11 +143,6 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) goto out_free; } - if (ceph_snap(inode) != CEPH_NOSNAP) { - ret = -EROFS; - goto out_free; - } - if (new_mode != old_mode) { newattrs.ia_ctime = current_time(inode); newattrs.ia_mode = new_mode; @@ -206,10 +206,9 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL); if (!tmp_buf) goto out_err; - pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_KERNEL); + pagelist = ceph_pagelist_alloc(GFP_KERNEL); if (!pagelist) goto out_err; - ceph_pagelist_init(pagelist); err = ceph_pagelist_reserve(pagelist, PAGE_SIZE); if (err) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 9c332a6f6667..8eade7a993c1 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -322,7 +322,7 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, /* caller of readpages does not hold buffer and read caps * (fadvise, madvise and readahead cases) */ int want = CEPH_CAP_FILE_CACHE; - ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, &got); + ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, true, &got); if (ret < 0) { dout("start_read %p, error getting cap\n", inode); } else if (!(got & want)) { diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index dd7dfdd2ba13..f3496db4bb3e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -519,9 +519,9 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, * -> we take mdsc->cap_delay_lock */ static void __cap_delay_requeue(struct ceph_mds_client *mdsc, - struct ceph_inode_info *ci) + struct ceph_inode_info *ci, + bool set_timeout) { - __cap_set_timeouts(mdsc, ci); dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode, ci->i_ceph_flags, ci->i_hold_caps_max); if (!mdsc->stopping) { @@ -531,6 +531,8 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc, goto no_change; list_del_init(&ci->i_cap_delay_list); } + if (set_timeout) + __cap_set_timeouts(mdsc, ci); list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); no_change: spin_unlock(&mdsc->cap_delay_lock); @@ -720,7 +722,7 @@ void ceph_add_cap(struct inode *inode, dout(" issued %s, mds wanted %s, actual %s, queueing\n", ceph_cap_string(issued), ceph_cap_string(wanted), ceph_cap_string(actual_wanted)); - __cap_delay_requeue(mdsc, ci); + __cap_delay_requeue(mdsc, ci, true); } if (flags & CEPH_CAP_FLAG_AUTH) { @@ -1647,7 +1649,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && (mask & CEPH_CAP_FILE_BUFFER)) dirty |= I_DIRTY_DATASYNC; - __cap_delay_requeue(mdsc, ci); + __cap_delay_requeue(mdsc, ci, true); return dirty; } @@ -2065,7 +2067,7 @@ ack: /* Reschedule delayed caps release if we delayed anything */ if (delayed) - __cap_delay_requeue(mdsc, ci); + __cap_delay_requeue(mdsc, ci, false); spin_unlock(&ci->i_ceph_lock); @@ -2125,7 +2127,7 @@ retry: if (delayed) { spin_lock(&ci->i_ceph_lock); - __cap_delay_requeue(mdsc, ci); + __cap_delay_requeue(mdsc, ci, true); spin_unlock(&ci->i_ceph_lock); } } else { @@ -2671,17 +2673,18 @@ static void check_max_size(struct inode *inode, loff_t endoff) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); } -int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, int *got) +int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, + bool nonblock, int *got) { int ret, err = 0; BUG_ON(need & ~CEPH_CAP_FILE_RD); - BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)); + BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED)); ret = ceph_pool_perm_check(ci, need); if (ret < 0) return ret; - ret = try_get_cap_refs(ci, need, want, 0, true, got, &err); + ret = try_get_cap_refs(ci, need, want, 0, nonblock, got, &err); if (ret) { if (err == -EAGAIN) { ret = 0; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 92ab20433682..27cad84dab23 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> +#include <linux/ceph/striper.h> #include <linux/module.h> #include <linux/sched.h> @@ -557,90 +558,26 @@ enum { }; /* - * Read a range of bytes striped over one or more objects. Iterate over - * objects we stripe over. (That's not atomic, but good enough for now.) - * - * If we get a short result from the OSD, check against i_size; we need to - * only return a short read to the caller if we hit EOF. - */ -static int striped_read(struct inode *inode, - u64 pos, u64 len, - struct page **pages, int num_pages, - int page_align, int *checkeof) -{ - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_inode_info *ci = ceph_inode(inode); - u64 this_len; - loff_t i_size; - int page_idx; - int ret, read = 0; - bool hit_stripe, was_short; - - /* - * we may need to do multiple reads. not atomic, unfortunately. - */ -more: - this_len = len; - page_idx = (page_align + read) >> PAGE_SHIFT; - ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), - &ci->i_layout, pos, &this_len, - ci->i_truncate_seq, ci->i_truncate_size, - pages + page_idx, num_pages - page_idx, - ((page_align + read) & ~PAGE_MASK)); - if (ret == -ENOENT) - ret = 0; - hit_stripe = this_len < len; - was_short = ret >= 0 && ret < this_len; - dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, len, read, - ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); - - i_size = i_size_read(inode); - if (ret >= 0) { - if (was_short && (pos + ret < i_size)) { - int zlen = min(this_len - ret, i_size - pos - ret); - int zoff = page_align + read + ret; - dout(" zero gap %llu to %llu\n", - pos + ret, pos + ret + zlen); - ceph_zero_page_vector_range(zoff, zlen, pages); - ret += zlen; - } - - read += ret; - pos += ret; - len -= ret; - - /* hit stripe and need continue*/ - if (len && hit_stripe && pos < i_size) - goto more; - } - - if (read > 0) { - ret = read; - /* did we bounce off eof? */ - if (pos + len > i_size) - *checkeof = CHECK_EOF; - } - - dout("striped_read returns %d\n", ret); - return ret; -} - -/* * Completely synchronous read and write methods. Direct from __user * buffer to osd, or directly to user pages (if O_DIRECT). * - * If the read spans object boundary, just do multiple reads. + * If the read spans object boundary, just do multiple reads. (That's not + * atomic, but good enough for now.) + * + * If we get a short result from the OSD, check against i_size; we need to + * only return a short read to the caller if we hit EOF. */ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, - int *checkeof) + int *retry_op) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - struct page **pages; - u64 off = iocb->ki_pos; - int num_pages; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_client *osdc = &fsc->client->osdc; ssize_t ret; - size_t len = iov_iter_count(to); + u64 off = iocb->ki_pos; + u64 len = iov_iter_count(to); dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); @@ -653,61 +590,118 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, * but it will at least behave sensibly when they are * in sequence. */ - ret = filemap_write_and_wait_range(inode->i_mapping, off, - off + len); + ret = filemap_write_and_wait_range(inode->i_mapping, off, off + len); if (ret < 0) return ret; - if (unlikely(to->type & ITER_PIPE)) { + ret = 0; + while ((len = iov_iter_count(to)) > 0) { + struct ceph_osd_request *req; + struct page **pages; + int num_pages; size_t page_off; - ret = iov_iter_get_pages_alloc(to, &pages, len, - &page_off); - if (ret <= 0) - return -ENOMEM; - num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE); + u64 i_size; + bool more; + + req = ceph_osdc_new_request(osdc, &ci->i_layout, + ci->i_vino, off, &len, 0, 1, + CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, + NULL, ci->i_truncate_seq, + ci->i_truncate_size, false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + break; + } + + more = len < iov_iter_count(to); - ret = striped_read(inode, off, ret, pages, num_pages, - page_off, checkeof); - if (ret > 0) { - iov_iter_advance(to, ret); - off += ret; + if (unlikely(iov_iter_is_pipe(to))) { + ret = iov_iter_get_pages_alloc(to, &pages, len, + &page_off); + if (ret <= 0) { + ceph_osdc_put_request(req); + ret = -ENOMEM; + break; + } + num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE); + if (ret < len) { + len = ret; + osd_req_op_extent_update(req, 0, len); + more = false; + } } else { - iov_iter_advance(to, 0); + num_pages = calc_pages_for(off, len); + page_off = off & ~PAGE_MASK; + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); + if (IS_ERR(pages)) { + ceph_osdc_put_request(req); + ret = PTR_ERR(pages); + break; + } } - ceph_put_page_vector(pages, num_pages, false); - } else { - num_pages = calc_pages_for(off, len); - pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - ret = striped_read(inode, off, len, pages, num_pages, - (off & ~PAGE_MASK), checkeof); - if (ret > 0) { - int l, k = 0; - size_t left = ret; - - while (left) { - size_t page_off = off & ~PAGE_MASK; - size_t copy = min_t(size_t, left, - PAGE_SIZE - page_off); - l = copy_page_to_iter(pages[k++], page_off, - copy, to); - off += l; - left -= l; - if (l < copy) + + osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off, + false, false); + ret = ceph_osdc_start_request(osdc, req, false); + if (!ret) + ret = ceph_osdc_wait_request(osdc, req); + ceph_osdc_put_request(req); + + i_size = i_size_read(inode); + dout("sync_read %llu~%llu got %zd i_size %llu%s\n", + off, len, ret, i_size, (more ? " MORE" : "")); + + if (ret == -ENOENT) + ret = 0; + if (ret >= 0 && ret < len && (off + ret < i_size)) { + int zlen = min(len - ret, i_size - off - ret); + int zoff = page_off + ret; + dout("sync_read zero gap %llu~%llu\n", + off + ret, off + ret + zlen); + ceph_zero_page_vector_range(zoff, zlen, pages); + ret += zlen; + } + + if (unlikely(iov_iter_is_pipe(to))) { + if (ret > 0) { + iov_iter_advance(to, ret); + off += ret; + } else { + iov_iter_advance(to, 0); + } + ceph_put_page_vector(pages, num_pages, false); + } else { + int idx = 0; + size_t left = ret > 0 ? ret : 0; + while (left > 0) { + size_t len, copied; + page_off = off & ~PAGE_MASK; + len = min_t(size_t, left, PAGE_SIZE - page_off); + copied = copy_page_to_iter(pages[idx++], + page_off, len, to); + off += copied; + left -= copied; + if (copied < len) { + ret = -EFAULT; break; + } } + ceph_release_page_vector(pages, num_pages); } - ceph_release_page_vector(pages, num_pages); + + if (ret <= 0 || off >= i_size || !more) + break; } if (off > iocb->ki_pos) { + if (ret >= 0 && + iov_iter_count(to) > 0 && off >= i_size_read(inode)) + *retry_op = CHECK_EOF; ret = off - iocb->ki_pos; iocb->ki_pos = off; } - dout("sync_read result %zd\n", ret); + dout("sync_read result %zd retry_op %d\n", ret, *retry_op); return ret; } @@ -821,7 +815,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) aio_req->total_len = rc + zlen; } - iov_iter_bvec(&i, ITER_BVEC, osd_data->bvec_pos.bvecs, + iov_iter_bvec(&i, READ, osd_data->bvec_pos.bvecs, osd_data->num_bvecs, osd_data->bvec_pos.iter.bi_size); iov_iter_advance(&i, rc); @@ -865,7 +859,7 @@ static void ceph_aio_retry_work(struct work_struct *work) } spin_unlock(&ci->i_ceph_lock); - req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2, + req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 1, false, GFP_NOFS); if (!req) { ret = -ENOMEM; @@ -877,6 +871,11 @@ static void ceph_aio_retry_work(struct work_struct *work) ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); + req->r_ops[0] = orig_req->r_ops[0]; + + req->r_mtime = aio_req->mtime; + req->r_data_offset = req->r_ops[0].extent.offset; + ret = ceph_osdc_alloc_messages(req, GFP_NOFS); if (ret) { ceph_osdc_put_request(req); @@ -884,11 +883,6 @@ static void ceph_aio_retry_work(struct work_struct *work) goto out; } - req->r_ops[0] = orig_req->r_ops[0]; - - req->r_mtime = aio_req->mtime; - req->r_data_offset = req->r_ops[0].extent.offset; - ceph_osdc_put_request(orig_req); req->r_callback = ceph_aio_complete_req; @@ -1044,8 +1038,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, int zlen = min_t(size_t, len - ret, size - pos - ret); - iov_iter_bvec(&i, ITER_BVEC, bvecs, num_pages, - len); + iov_iter_bvec(&i, READ, bvecs, num_pages, len); iov_iter_advance(&i, ret); iov_iter_zero(zlen, &i); ret += zlen; @@ -1735,7 +1728,6 @@ static long ceph_fallocate(struct file *file, int mode, struct ceph_file_info *fi = file->private_data; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_cap_flush *prealloc_cf; int want, got = 0; int dirty; @@ -1743,10 +1735,7 @@ static long ceph_fallocate(struct file *file, int mode, loff_t endoff = 0; loff_t size; - if ((offset + length) > max(i_size_read(inode), fsc->max_file_size)) - return -EFBIG; - - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; if (!S_ISREG(inode->i_mode)) @@ -1763,18 +1752,6 @@ static long ceph_fallocate(struct file *file, int mode, goto unlock; } - if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) && - ceph_quota_is_max_bytes_exceeded(inode, offset + length)) { - ret = -EDQUOT; - goto unlock; - } - - if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_FULL) && - !(mode & FALLOC_FL_PUNCH_HOLE)) { - ret = -ENOSPC; - goto unlock; - } - if (ci->i_inline_version != CEPH_INLINE_NONE) { ret = ceph_uninline_data(file, NULL); if (ret < 0) @@ -1782,12 +1759,12 @@ static long ceph_fallocate(struct file *file, int mode, } size = i_size_read(inode); - if (!(mode & FALLOC_FL_KEEP_SIZE)) { - endoff = offset + length; - ret = inode_newsize_ok(inode, endoff); - if (ret) - goto unlock; - } + + /* Are we punching a hole beyond EOF? */ + if (offset >= size) + goto unlock; + if ((offset + length) > size) + length = size - offset; if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; @@ -1798,16 +1775,8 @@ static long ceph_fallocate(struct file *file, int mode, if (ret < 0) goto unlock; - if (mode & FALLOC_FL_PUNCH_HOLE) { - if (offset < size) - ceph_zero_pagecache_range(inode, offset, length); - ret = ceph_zero_objects(inode, offset, length); - } else if (endoff > size) { - truncate_pagecache_range(inode, size, -1); - if (ceph_inode_set_size(inode, endoff)) - ceph_check_caps(ceph_inode(inode), - CHECK_CAPS_AUTHONLY, NULL); - } + ceph_zero_pagecache_range(inode, offset, length); + ret = ceph_zero_objects(inode, offset, length); if (!ret) { spin_lock(&ci->i_ceph_lock); @@ -1817,9 +1786,6 @@ static long ceph_fallocate(struct file *file, int mode, spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); - if ((endoff > size) && - ceph_quota_is_max_bytes_approaching(inode, endoff)) - ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL); } ceph_put_cap_refs(ci, got); @@ -1829,6 +1795,300 @@ unlock: return ret; } +/* + * This function tries to get FILE_WR capabilities for dst_ci and FILE_RD for + * src_ci. Two attempts are made to obtain both caps, and an error is return if + * this fails; zero is returned on success. + */ +static int get_rd_wr_caps(struct ceph_inode_info *src_ci, + loff_t src_endoff, int *src_got, + struct ceph_inode_info *dst_ci, + loff_t dst_endoff, int *dst_got) +{ + int ret = 0; + bool retrying = false; + +retry_caps: + ret = ceph_get_caps(dst_ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, + dst_endoff, dst_got, NULL); + if (ret < 0) + return ret; + + /* + * Since we're already holding the FILE_WR capability for the dst file, + * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some + * retry dance instead to try to get both capabilities. + */ + ret = ceph_try_get_caps(src_ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, + false, src_got); + if (ret <= 0) { + /* Start by dropping dst_ci caps and getting src_ci caps */ + ceph_put_cap_refs(dst_ci, *dst_got); + if (retrying) { + if (!ret) + /* ceph_try_get_caps masks EAGAIN */ + ret = -EAGAIN; + return ret; + } + ret = ceph_get_caps(src_ci, CEPH_CAP_FILE_RD, + CEPH_CAP_FILE_SHARED, src_endoff, + src_got, NULL); + if (ret < 0) + return ret; + /*... drop src_ci caps too, and retry */ + ceph_put_cap_refs(src_ci, *src_got); + retrying = true; + goto retry_caps; + } + return ret; +} + +static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got, + struct ceph_inode_info *dst_ci, int dst_got) +{ + ceph_put_cap_refs(src_ci, src_got); + ceph_put_cap_refs(dst_ci, dst_got); +} + +/* + * This function does several size-related checks, returning an error if: + * - source file is smaller than off+len + * - destination file size is not OK (inode_newsize_ok()) + * - max bytes quotas is exceeded + */ +static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode, + loff_t src_off, loff_t dst_off, size_t len) +{ + loff_t size, endoff; + + size = i_size_read(src_inode); + /* + * Don't copy beyond source file EOF. Instead of simply setting length + * to (size - src_off), just drop to VFS default implementation, as the + * local i_size may be stale due to other clients writing to the source + * inode. + */ + if (src_off + len > size) { + dout("Copy beyond EOF (%llu + %zu > %llu)\n", + src_off, len, size); + return -EOPNOTSUPP; + } + size = i_size_read(dst_inode); + + endoff = dst_off + len; + if (inode_newsize_ok(dst_inode, endoff)) + return -EOPNOTSUPP; + + if (ceph_quota_is_max_bytes_exceeded(dst_inode, endoff)) + return -EDQUOT; + + return 0; +} + +static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, + size_t len, unsigned int flags) +{ + struct inode *src_inode = file_inode(src_file); + struct inode *dst_inode = file_inode(dst_file); + struct ceph_inode_info *src_ci = ceph_inode(src_inode); + struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); + struct ceph_cap_flush *prealloc_cf; + struct ceph_object_locator src_oloc, dst_oloc; + struct ceph_object_id src_oid, dst_oid; + loff_t endoff = 0, size; + ssize_t ret = -EIO; + u64 src_objnum, dst_objnum, src_objoff, dst_objoff; + u32 src_objlen, dst_objlen, object_size; + int src_got = 0, dst_got = 0, err, dirty; + bool do_final_copy = false; + + if (src_inode == dst_inode) + return -EINVAL; + if (ceph_snap(dst_inode) != CEPH_NOSNAP) + return -EROFS; + + /* + * Some of the checks below will return -EOPNOTSUPP, which will force a + * fallback to the default VFS copy_file_range implementation. This is + * desirable in several cases (for ex, the 'len' is smaller than the + * size of the objects, or in cases where that would be more + * efficient). + */ + + if (ceph_test_mount_opt(ceph_inode_to_client(src_inode), NOCOPYFROM)) + return -EOPNOTSUPP; + + if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) || + (src_ci->i_layout.stripe_count != dst_ci->i_layout.stripe_count) || + (src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) + return -EOPNOTSUPP; + + if (len < src_ci->i_layout.object_size) + return -EOPNOTSUPP; /* no remote copy will be done */ + + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + + /* Start by sync'ing the source file */ + ret = file_write_and_wait_range(src_file, src_off, (src_off + len)); + if (ret < 0) + goto out; + + /* + * We need FILE_WR caps for dst_ci and FILE_RD for src_ci as other + * clients may have dirty data in their caches. And OSDs know nothing + * about caps, so they can't safely do the remote object copies. + */ + err = get_rd_wr_caps(src_ci, (src_off + len), &src_got, + dst_ci, (dst_off + len), &dst_got); + if (err < 0) { + dout("get_rd_wr_caps returned %d\n", err); + ret = -EOPNOTSUPP; + goto out; + } + + ret = is_file_size_ok(src_inode, dst_inode, src_off, dst_off, len); + if (ret < 0) + goto out_caps; + + size = i_size_read(dst_inode); + endoff = dst_off + len; + + /* Drop dst file cached pages */ + ret = invalidate_inode_pages2_range(dst_inode->i_mapping, + dst_off >> PAGE_SHIFT, + endoff >> PAGE_SHIFT); + if (ret < 0) { + dout("Failed to invalidate inode pages (%zd)\n", ret); + ret = 0; /* XXX */ + } + src_oloc.pool = src_ci->i_layout.pool_id; + src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns); + dst_oloc.pool = dst_ci->i_layout.pool_id; + dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns); + + ceph_calc_file_object_mapping(&src_ci->i_layout, src_off, + src_ci->i_layout.object_size, + &src_objnum, &src_objoff, &src_objlen); + ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off, + dst_ci->i_layout.object_size, + &dst_objnum, &dst_objoff, &dst_objlen); + /* object-level offsets need to the same */ + if (src_objoff != dst_objoff) { + ret = -EOPNOTSUPP; + goto out_caps; + } + + /* + * Do a manual copy if the object offset isn't object aligned. + * 'src_objlen' contains the bytes left until the end of the object, + * starting at the src_off + */ + if (src_objoff) { + /* + * we need to temporarily drop all caps as we'll be calling + * {read,write}_iter, which will get caps again. + */ + put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); + ret = do_splice_direct(src_file, &src_off, dst_file, + &dst_off, src_objlen, flags); + if (ret < 0) { + dout("do_splice_direct returned %d\n", err); + goto out; + } + len -= ret; + err = get_rd_wr_caps(src_ci, (src_off + len), + &src_got, dst_ci, + (dst_off + len), &dst_got); + if (err < 0) + goto out; + err = is_file_size_ok(src_inode, dst_inode, + src_off, dst_off, len); + if (err < 0) + goto out_caps; + } + object_size = src_ci->i_layout.object_size; + while (len >= object_size) { + ceph_calc_file_object_mapping(&src_ci->i_layout, src_off, + object_size, &src_objnum, + &src_objoff, &src_objlen); + ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off, + object_size, &dst_objnum, + &dst_objoff, &dst_objlen); + ceph_oid_init(&src_oid); + ceph_oid_printf(&src_oid, "%llx.%08llx", + src_ci->i_vino.ino, src_objnum); + ceph_oid_init(&dst_oid); + ceph_oid_printf(&dst_oid, "%llx.%08llx", + dst_ci->i_vino.ino, dst_objnum); + /* Do an object remote copy */ + err = ceph_osdc_copy_from( + &ceph_inode_to_client(src_inode)->client->osdc, + src_ci->i_vino.snap, 0, + &src_oid, &src_oloc, + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE, + &dst_oid, &dst_oloc, + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, 0); + if (err) { + dout("ceph_osdc_copy_from returned %d\n", err); + if (!ret) + ret = err; + goto out_caps; + } + len -= object_size; + src_off += object_size; + dst_off += object_size; + ret += object_size; + } + + if (len) + /* We still need one final local copy */ + do_final_copy = true; + + file_update_time(dst_file); + if (endoff > size) { + int caps_flags = 0; + + /* Let the MDS know about dst file size change */ + if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff)) + caps_flags |= CHECK_CAPS_NODELAY; + if (ceph_inode_set_size(dst_inode, endoff)) + caps_flags |= CHECK_CAPS_AUTHONLY; + if (caps_flags) + ceph_check_caps(dst_ci, caps_flags, NULL); + } + /* Mark Fw dirty */ + spin_lock(&dst_ci->i_ceph_lock); + dst_ci->i_inline_version = CEPH_INLINE_NONE; + dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf); + spin_unlock(&dst_ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(dst_inode, dirty); + +out_caps: + put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); + + if (do_final_copy) { + err = do_splice_direct(src_file, &src_off, dst_file, + &dst_off, len, flags); + if (err < 0) { + dout("do_splice_direct returned %d\n", err); + goto out; + } + len -= err; + ret += err; + } + +out: + ceph_free_cap_flush(prealloc_cf); + + return ret; +} + const struct file_operations ceph_file_fops = { .open = ceph_open, .release = ceph_release, @@ -1844,5 +2104,5 @@ const struct file_operations ceph_file_fops = { .unlocked_ioctl = ceph_ioctl, .compat_ioctl = ceph_ioctl, .fallocate = ceph_fallocate, + .copy_file_range = ceph_copy_file_range, }; - diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index ebc7bdaed2d0..79dd5e6ed755 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1132,8 +1132,12 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in) if (IS_ERR(realdn)) { pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n", PTR_ERR(realdn), dn, in, ceph_vinop(in)); - dput(dn); - dn = realdn; /* note realdn contains the error */ + dn = realdn; + /* + * Caller should release 'dn' in the case of error. + * If 'req->r_dentry' is passed to this function, + * caller should leave 'req->r_dentry' untouched. + */ goto out; } else if (realdn) { dout("dn %p (%d) spliced with %p (%d) " @@ -1196,7 +1200,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) WARN_ON_ONCE(1); } - if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) { + if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME && + test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && + !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { struct qstr dname; struct dentry *dn, *parent; @@ -1677,7 +1683,6 @@ retry_lookup: if (IS_ERR(realdn)) { err = PTR_ERR(realdn); d_drop(dn); - dn = NULL; goto next_item; } dn = realdn; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index bc43c822426a..67a9aeb2f4ec 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2071,7 +2071,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, if (req->r_old_dentry_drop) len += req->r_old_dentry->d_name.len; - msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false); + msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false); if (!msg) { msg = ERR_PTR(-ENOMEM); goto out_free2; @@ -2136,7 +2136,6 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, if (req->r_pagelist) { struct ceph_pagelist *pagelist = req->r_pagelist; - refcount_inc(&pagelist->refcnt); ceph_msg_data_add_pagelist(msg, pagelist); msg->hdr.data_len = cpu_to_le32(pagelist->length); } else { @@ -3126,12 +3125,11 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, pr_info("mds%d reconnect start\n", mds); - pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); + pagelist = ceph_pagelist_alloc(GFP_NOFS); if (!pagelist) goto fail_nopagelist; - ceph_pagelist_init(pagelist); - reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false); + reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false); if (!reply) goto fail_nomsg; @@ -3241,6 +3239,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, mutex_unlock(&mdsc->mutex); up_read(&mdsc->snap_rwsem); + ceph_pagelist_release(pagelist); return; fail: diff --git a/fs/ceph/super.c b/fs/ceph/super.c index eab1359d0553..b5ecd6f50360 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -165,6 +165,8 @@ enum { Opt_noacl, Opt_quotadf, Opt_noquotadf, + Opt_copyfrom, + Opt_nocopyfrom, }; static match_table_t fsopt_tokens = { @@ -203,6 +205,8 @@ static match_table_t fsopt_tokens = { {Opt_noacl, "noacl"}, {Opt_quotadf, "quotadf"}, {Opt_noquotadf, "noquotadf"}, + {Opt_copyfrom, "copyfrom"}, + {Opt_nocopyfrom, "nocopyfrom"}, {-1, NULL} }; @@ -355,6 +359,12 @@ static int parse_fsopt_token(char *c, void *private) case Opt_noquotadf: fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF; break; + case Opt_copyfrom: + fsopt->flags &= ~CEPH_MOUNT_OPT_NOCOPYFROM; + break; + case Opt_nocopyfrom: + fsopt->flags |= CEPH_MOUNT_OPT_NOCOPYFROM; + break; #ifdef CONFIG_CEPH_FS_POSIX_ACL case Opt_acl: fsopt->sb_flags |= SB_POSIXACL; @@ -553,6 +563,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",noacl"); #endif + if (fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) + seq_puts(m, ",nocopyfrom"); + if (fsopt->mds_namespace) seq_show_option(m, "mds_namespace", fsopt->mds_namespace); if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 582e28fd1b7b..c005a5400f2e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -40,6 +40,7 @@ #define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */ #define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */ #define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */ +#define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */ #define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE @@ -1008,7 +1009,7 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, loff_t endoff, int *got, struct page **pinned_page); extern int ceph_try_get_caps(struct ceph_inode_info *ci, - int need, int want, int *got); + int need, int want, bool nonblock, int *got); /* for counting open files by mode */ extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 5cc8b94f8206..316f6ad10644 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -951,11 +951,10 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, if (size > 0) { /* copy value into pagelist */ - pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); + pagelist = ceph_pagelist_alloc(GFP_NOFS); if (!pagelist) return -ENOMEM; - ceph_pagelist_init(pagelist); err = ceph_pagelist_append(pagelist, value, size); if (err) goto out; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index f1fbea947fef..3e812428ac8d 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -132,7 +132,7 @@ cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface) struct sockaddr_in *ipv4 = (struct sockaddr_in *)&iface->sockaddr; struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)&iface->sockaddr; - seq_printf(m, "\t\tSpeed: %zu bps\n", iface->speed); + seq_printf(m, "\tSpeed: %zu bps\n", iface->speed); seq_puts(m, "\t\tCapabilities: "); if (iface->rdma_capable) seq_puts(m, "rdma "); @@ -285,7 +285,7 @@ skip_rdma: if ((ses->serverDomain == NULL) || (ses->serverOS == NULL) || (ses->serverNOS == NULL)) { - seq_printf(m, "\n%d) Name: %s Uses: %d Capability: 0x%x\tSession Status: %d\t", + seq_printf(m, "\n%d) Name: %s Uses: %d Capability: 0x%x\tSession Status: %d ", i, ses->serverName, ses->ses_count, ses->capabilities, ses->status); if (ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST) @@ -296,16 +296,18 @@ skip_rdma: seq_printf(m, "\n%d) Name: %s Domain: %s Uses: %d OS:" " %s\n\tNOS: %s\tCapability: 0x%x\n\tSMB" - " session status: %d\t", + " session status: %d ", i, ses->serverName, ses->serverDomain, ses->ses_count, ses->serverOS, ses->serverNOS, ses->capabilities, ses->status); } if (server->rdma) seq_printf(m, "RDMA\n\t"); - seq_printf(m, "TCP status: %d\n\tLocal Users To " + seq_printf(m, "TCP status: %d Instance: %d\n\tLocal Users To " "Server: %d SecMode: 0x%x Req On Wire: %d", - server->tcpStatus, server->srv_count, + server->tcpStatus, + server->reconnect_instance, + server->srv_count, server->sec_mode, in_flight(server)); #ifdef CONFIG_CIFS_STATS2 @@ -352,7 +354,7 @@ skip_rdma: seq_printf(m, "\n\tServer interfaces: %zu\n", ses->iface_count); for (j = 0; j < ses->iface_count; j++) { - seq_printf(m, "\t%d)\n", j); + seq_printf(m, "\t%d)", j); cifs_dump_iface(m, &ses->iface_list[j]); } spin_unlock(&ses->iface_lock); @@ -383,6 +385,9 @@ static ssize_t cifs_stats_proc_write(struct file *file, atomic_set(&totBufAllocCount, 0); atomic_set(&totSmBufAllocCount, 0); #endif /* CONFIG_CIFS_STATS2 */ + atomic_set(&tcpSesReconnectCount, 0); + atomic_set(&tconInfoReconnectCount, 0); + spin_lock(&GlobalMid_Lock); GlobalMaxActiveXid = 0; GlobalCurrentXid = 0; diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index f4f3f0853c6e..631dc1bb21c1 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -47,6 +47,29 @@ extern int cifsFYI; */ #ifdef CONFIG_CIFS_DEBUG + +/* + * When adding tracepoints and debug messages we have various choices. + * Some considerations: + * + * Use cifs_dbg(VFS, ...) for things we always want logged, and the user to see + * cifs_info(...) slightly less important, admin can filter via loglevel > 6 + * cifs_dbg(FYI, ...) minor debugging messages, off by default + * trace_smb3_* ftrace functions are preferred for complex debug messages + * intended for developers or experienced admins, off by default + */ + +/* Information level messages, minor events */ +#define cifs_info_func(ratefunc, fmt, ...) \ +do { \ + pr_info_ ## ratefunc("CIFS: " fmt, ##__VA_ARGS__); \ +} while (0) + +#define cifs_info(fmt, ...) \ +do { \ + cifs_info_func(ratelimited, fmt, ##__VA_ARGS__); \ +} while (0) + /* information message: e.g., configuration, major event */ #define cifs_dbg_func(ratefunc, type, fmt, ...) \ do { \ @@ -81,6 +104,11 @@ do { \ if (0) \ pr_debug(fmt, ##__VA_ARGS__); \ } while (0) + +#define cifs_info(fmt, ...) \ +do { \ + pr_info("CIFS: "fmt, ##__VA_ARGS__); \ +} while (0) #endif #endif /* _H_CIFS_DEBUG */ diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 6b61df117fd4..b97c74efd04a 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -304,12 +304,17 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) */ mnt = ERR_PTR(-ENOMEM); + cifs_sb = CIFS_SB(mntpt->d_sb); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) { + mnt = ERR_PTR(-EREMOTE); + goto cdda_exit; + } + /* always use tree name prefix */ full_path = build_path_from_dentry_optional_prefix(mntpt, true); if (full_path == NULL) goto cdda_exit; - cifs_sb = CIFS_SB(mntpt->d_sb); tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { mnt = ERR_CAST(tlink); diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 9731d0d891e7..63d7530f2e1d 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -51,6 +51,7 @@ */ #define CIFS_MOUNT_UID_FROM_ACL 0x2000000 /* try to get UID via special SID */ #define CIFS_MOUNT_NO_HANDLE_CACHE 0x4000000 /* disable caching dir handles */ +#define CIFS_MOUNT_NO_DFS 0x8000000 /* disable DFS resolving */ struct cifs_sb_info { struct rb_root tlink_tree; diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index 57ff0756e30c..d8bce2f862de 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -43,8 +43,19 @@ struct smb_snapshot_array { /* snapshots[]; */ } __packed; +struct smb_query_info { + __u32 info_type; + __u32 file_info_class; + __u32 additional_information; + __u32 flags; + __u32 input_buffer_length; + __u32 output_buffer_length; + /* char buffer[]; */ +} __packed; + #define CIFS_IOCTL_MAGIC 0xCF #define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) #define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) #define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info) #define CIFS_ENUMERATE_SNAPSHOTS _IOR(CIFS_IOCTL_MAGIC, 6, struct smb_snapshot_array) +#define CIFS_QUERY_INFO _IOWR(CIFS_IOCTL_MAGIC, 7, struct smb_query_info) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 7065426b3280..b7ac09e38159 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -81,6 +81,14 @@ module_param(cifs_max_pending, uint, 0444); MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server for " "CIFS/SMB1 dialect (N/A for SMB3) " "Default: 32767 Range: 2 to 32767."); +#ifdef CONFIG_CIFS_STATS2 +unsigned int slow_rsp_threshold = 1; +module_param(slow_rsp_threshold, uint, 0644); +MODULE_PARM_DESC(slow_rsp_threshold, "Amount of time (in seconds) to wait " + "before logging that a response is delayed. " + "Default: 1 (if set to 0 disables msg)."); +#endif /* STATS2 */ + module_param(enable_oplocks, bool, 0644); MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1"); @@ -492,6 +500,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",unix"); else seq_puts(s, ",nounix"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) + seq_puts(s, ",nodfs"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) seq_puts(s, ",posixpaths"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) @@ -707,7 +717,14 @@ cifs_smb3_do_mount(struct file_system_type *fs_type, struct cifs_mnt_data mnt_data; struct dentry *root; - cifs_dbg(FYI, "Devname: %s flags: %d\n", dev_name, flags); + /* + * Prints in Kernel / CIFS log the attempted mount operation + * If CIFS_DEBUG && cifs_FYI + */ + if (cifsFYI) + cifs_dbg(FYI, "Devname: %s flags: %d\n", dev_name, flags); + else + cifs_info("Attempting to mount %s\n", dev_name); volume_info = cifs_get_volume_info((char *)data, dev_name, is_smb3); if (IS_ERR(volume_info)) @@ -975,8 +992,9 @@ const struct inode_operations cifs_symlink_inode_ops = { .listxattr = cifs_listxattr, }; -static int cifs_clone_file_range(struct file *src_file, loff_t off, - struct file *dst_file, loff_t destoff, u64 len) +static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, + struct file *dst_file, loff_t destoff, loff_t len, + unsigned int remap_flags) { struct inode *src_inode = file_inode(src_file); struct inode *target_inode = file_inode(dst_file); @@ -986,6 +1004,9 @@ static int cifs_clone_file_range(struct file *src_file, loff_t off, unsigned int xid; int rc; + if (remap_flags & ~REMAP_FILE_ADVISORY) + return -EINVAL; + cifs_dbg(FYI, "clone range\n"); xid = get_xid(); @@ -1025,7 +1046,7 @@ static int cifs_clone_file_range(struct file *src_file, loff_t off, unlock_two_nondirectories(src_inode, target_inode); out: free_xid(xid); - return rc; + return rc < 0 ? rc : len; } ssize_t cifs_file_copychunk_range(unsigned int xid, @@ -1134,7 +1155,7 @@ const struct file_operations cifs_file_ops = { .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, - .clone_file_range = cifs_clone_file_range, + .remap_file_range = cifs_remap_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; @@ -1153,7 +1174,7 @@ const struct file_operations cifs_file_strict_ops = { .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, - .clone_file_range = cifs_clone_file_range, + .remap_file_range = cifs_remap_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; @@ -1172,7 +1193,7 @@ const struct file_operations cifs_file_direct_ops = { .splice_write = iter_file_splice_write, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, - .clone_file_range = cifs_clone_file_range, + .remap_file_range = cifs_remap_file_range, .llseek = cifs_llseek, .setlease = cifs_setlease, .fallocate = cifs_fallocate, @@ -1191,7 +1212,7 @@ const struct file_operations cifs_file_nobrl_ops = { .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, - .clone_file_range = cifs_clone_file_range, + .remap_file_range = cifs_remap_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; @@ -1209,7 +1230,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = { .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, - .clone_file_range = cifs_clone_file_range, + .remap_file_range = cifs_remap_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; @@ -1227,7 +1248,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { .splice_write = iter_file_splice_write, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, - .clone_file_range = cifs_clone_file_range, + .remap_file_range = cifs_remap_file_range, .llseek = cifs_llseek, .setlease = cifs_setlease, .fallocate = cifs_fallocate, @@ -1239,7 +1260,7 @@ const struct file_operations cifs_dir_ops = { .read = generic_read_dir, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, - .clone_file_range = cifs_clone_file_range, + .remap_file_range = cifs_remap_file_range, .llseek = generic_file_llseek, .fsync = cifs_dir_fsync, }; @@ -1418,6 +1439,11 @@ init_cifs(void) #ifdef CONFIG_CIFS_STATS2 atomic_set(&totBufAllocCount, 0); atomic_set(&totSmBufAllocCount, 0); + if (slow_rsp_threshold < 1) + cifs_dbg(FYI, "slow_response_threshold msgs disabled\n"); + else if (slow_rsp_threshold > 32767) + cifs_dbg(VFS, + "slow response threshold set higher than recommended (0 to 32767)\n"); #endif /* CONFIG_CIFS_STATS2 */ atomic_set(&midCount, 0); @@ -1538,11 +1564,11 @@ exit_cifs(void) cifs_proc_clean(); } -MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>"); +MODULE_AUTHOR("Steve French"); MODULE_LICENSE("GPL"); /* combination of LGPL + GPL source behaves as GPL */ MODULE_DESCRIPTION - ("VFS to access servers complying with the SNIA CIFS Specification " - "e.g. Samba and Windows"); + ("VFS to access SMB3 servers e.g. Samba, Macs, Azure and Windows (and " + "also older servers complying with the SNIA CIFS Specification)"); MODULE_VERSION(CIFS_VERSION); MODULE_SOFTDEP("pre: arc4"); MODULE_SOFTDEP("pre: des"); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index f047e87871a1..24e265a51874 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -148,5 +148,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.13" +#define CIFS_VERSION "2.14" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 9dcaed031843..ed1e0fcb69e3 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -33,6 +33,7 @@ #define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ +#define SMB_PATH_MAX 260 #define CIFS_PORT 445 #define RFC1001_PORT 139 @@ -465,6 +466,11 @@ struct smb_version_operations { enum securityEnum (*select_sectype)(struct TCP_Server_Info *, enum securityEnum); int (*next_header)(char *); + /* ioctl passthrough for query_info */ + int (*ioctl_query_info)(const unsigned int xid, + struct cifs_tcon *tcon, + __le16 *path, int is_dir, + unsigned long p); }; struct smb_version_values { @@ -654,6 +660,7 @@ struct TCP_Server_Info { /* 16th byte of RFC1001 workstation name is always null */ char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; __u32 sequence_number; /* for signing, protected by srv_mutex */ + __u32 reconnect_instance; /* incremented on each reconnect */ struct session_key session_key; unsigned long lstrp; /* when we got last response from this server */ struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */ @@ -798,6 +805,7 @@ compare_mid(__u16 mid, const struct smb_hdr *smb) * a single wsize request with a single call. */ #define CIFS_DEFAULT_IOSIZE (1024 * 1024) +#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024) /* * Windows only supports a max of 60kb reads and 65535 byte writes. Default to @@ -924,6 +932,8 @@ struct cifs_tcon { struct list_head tcon_list; int tc_count; struct list_head rlist; /* reconnect list */ + atomic_t num_local_opens; /* num of all opens including disconnected */ + atomic_t num_remote_opens; /* num of all network opens on server */ struct list_head openFileList; spinlock_t open_file_lock; /* protects list above */ struct cifs_ses *ses; /* pointer to session associated with */ @@ -1072,7 +1082,8 @@ struct cifsLockInfo { __u64 offset; __u64 length; __u32 pid; - __u32 type; + __u16 type; + __u16 flags; }; /* @@ -1715,6 +1726,7 @@ GLOBAL_EXTERN atomic_t bufAllocCount; /* current number allocated */ #ifdef CONFIG_CIFS_STATS2 GLOBAL_EXTERN atomic_t totBufAllocCount; /* total allocated over all time */ GLOBAL_EXTERN atomic_t totSmBufAllocCount; +extern unsigned int slow_rsp_threshold; /* number of secs before logging */ #endif GLOBAL_EXTERN atomic_t smBufAllocCount; GLOBAL_EXTERN atomic_t midCount; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 20adda4de83b..fa361bc00602 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -219,7 +219,7 @@ extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon); extern void cifs_reopen_persistent_handles(struct cifs_tcon *tcon); extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, - __u64 length, __u8 type, + __u64 length, __u8 type, __u16 flags, struct cifsLockInfo **conf_lock, int rw_check); extern void cifs_add_pending_open(struct cifs_fid *fid, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 5657b79dbc99..f82fd342bca5 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1607,6 +1607,7 @@ cifs_readv_callback(struct mid_q_entry *mid) struct smb_rqst rqst = { .rq_iov = rdata->iov, .rq_nvec = 2, .rq_pages = rdata->pages, + .rq_offset = rdata->page_offset, .rq_npages = rdata->nr_pages, .rq_pagesz = rdata->pagesz, .rq_tailsz = rdata->tailsz }; @@ -2210,6 +2211,7 @@ cifs_async_writev(struct cifs_writedata *wdata, rqst.rq_iov = iov; rqst.rq_nvec = 2; rqst.rq_pages = wdata->pages; + rqst.rq_offset = wdata->page_offset; rqst.rq_npages = wdata->nr_pages; rqst.rq_pagesz = wdata->pagesz; rqst.rq_tailsz = wdata->tailsz; @@ -5027,6 +5029,13 @@ oldQFSInfoRetry: le16_to_cpu(response_data->BytesPerSector) * le32_to_cpu(response_data-> SectorsPerAllocationUnit); + /* + * much prefer larger but if server doesn't report + * a valid size than 4K is a reasonable minimum + */ + if (FSData->f_bsize < 512) + FSData->f_bsize = 4096; + FSData->f_blocks = le32_to_cpu(response_data->TotalAllocationUnits); FSData->f_bfree = FSData->f_bavail = @@ -5107,6 +5116,13 @@ QFSInfoRetry: le32_to_cpu(response_data->BytesPerSector) * le32_to_cpu(response_data-> SectorsPerAllocationUnit); + /* + * much prefer larger but if server doesn't report + * a valid size than 4K is a reasonable minimum + */ + if (FSData->f_bsize < 512) + FSData->f_bsize = 4096; + FSData->f_blocks = le64_to_cpu(response_data->TotalAllocationUnits); FSData->f_bfree = FSData->f_bavail = @@ -5470,6 +5486,13 @@ QFSPosixRetry: data_offset); FSData->f_bsize = le32_to_cpu(response_data->BlockSize); + /* + * much prefer larger but if server doesn't report + * a valid size than 4K is a reasonable minimum + */ + if (FSData->f_bsize < 512) + FSData->f_bsize = 4096; + FSData->f_blocks = le64_to_cpu(response_data->TotalBlocks); FSData->f_bfree = diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 52d71b64c0c6..6f24f129a751 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -250,6 +250,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_ignore, "dev" }, { Opt_ignore, "mand" }, { Opt_ignore, "nomand" }, + { Opt_ignore, "relatime" }, { Opt_ignore, "_netdev" }, { Opt_err, NULL } @@ -347,7 +348,7 @@ cifs_reconnect(struct TCP_Server_Info *server) server->maxBuf = 0; server->max_read = 0; - cifs_dbg(FYI, "Reconnecting tcp session\n"); + cifs_dbg(FYI, "Mark tcp session as need reconnect\n"); trace_smb3_reconnect(server->CurrentMid, server->hostname); /* before reconnecting the tcp session, mark the smb session (uid) @@ -588,7 +589,7 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, { struct msghdr smb_msg; struct kvec iov = {.iov_base = buf, .iov_len = to_read}; - iov_iter_kvec(&smb_msg.msg_iter, READ | ITER_KVEC, &iov, 1, to_read); + iov_iter_kvec(&smb_msg.msg_iter, READ, &iov, 1, to_read); return cifs_readv_from_socket(server, &smb_msg); } @@ -600,7 +601,7 @@ cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, struct msghdr smb_msg; struct bio_vec bv = { .bv_page = page, .bv_len = to_read, .bv_offset = page_offset}; - iov_iter_bvec(&smb_msg.msg_iter, READ | ITER_BVEC, &bv, 1, to_read); + iov_iter_bvec(&smb_msg.msg_iter, READ, &bv, 1, to_read); return cifs_readv_from_socket(server, &smb_msg); } @@ -2396,6 +2397,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); tcp_ses->session_estab = false; tcp_ses->sequence_number = 0; + tcp_ses->reconnect_instance = 0; tcp_ses->lstrp = jiffies; spin_lock_init(&tcp_ses->req_lock); INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); @@ -3085,10 +3087,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) if (rc) goto out_fail; - if (volume_info->nodfs) { - tcon->Flags &= ~SMB_SHARE_IS_IN_DFS; - cifs_dbg(FYI, "DFS disabled (%d)\n", tcon->Flags); - } tcon->use_persistent = false; /* check if SMB2 or later, CIFS does not support persistent handles */ if (volume_info->persistent) { @@ -3663,6 +3661,8 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->actimeo = pvolume_info->actimeo; cifs_sb->local_nls = pvolume_info->local_nls; + if (pvolume_info->nodfs) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_DFS; if (pvolume_info->noperm) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; if (pvolume_info->setuids) @@ -3819,6 +3819,9 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses, struct dfs_info3_param *referrals = NULL; char *full_path = NULL, *ref_path = NULL, *mdata = NULL; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) + return -EREMOTE; + full_path = build_unc_path_to_root(volume_info, cifs_sb); if (IS_ERR(full_path)) return PTR_ERR(full_path); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8d41ca7bfcf1..e262a05a98bf 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -334,6 +334,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, server->ops->set_fid(cfile, fid, oplock); list_add(&cfile->tlist, &tcon->openFileList); + atomic_inc(&tcon->num_local_opens); /* if readable file instance put first in list*/ if (file->f_mode & FMODE_READ) @@ -395,6 +396,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) /* remove it from the lists */ list_del(&cifs_file->flist); list_del(&cifs_file->tlist); + atomic_dec(&tcon->num_local_opens); if (list_empty(&cifsi->openFileList)) { cifs_dbg(FYI, "closing last open instance for inode %p\n", @@ -864,7 +866,7 @@ int cifs_closedir(struct inode *inode, struct file *file) } static struct cifsLockInfo * -cifs_lock_init(__u64 offset, __u64 length, __u8 type) +cifs_lock_init(__u64 offset, __u64 length, __u8 type, __u16 flags) { struct cifsLockInfo *lock = kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); @@ -874,6 +876,7 @@ cifs_lock_init(__u64 offset, __u64 length, __u8 type) lock->length = length; lock->type = type; lock->pid = current->tgid; + lock->flags = flags; INIT_LIST_HEAD(&lock->blist); init_waitqueue_head(&lock->block_q); return lock; @@ -896,7 +899,8 @@ cifs_del_lock_waiters(struct cifsLockInfo *lock) /* @rw_check : 0 - no op, 1 - read, 2 - write */ static bool cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset, - __u64 length, __u8 type, struct cifsFileInfo *cfile, + __u64 length, __u8 type, __u16 flags, + struct cifsFileInfo *cfile, struct cifsLockInfo **conf_lock, int rw_check) { struct cifsLockInfo *li; @@ -918,6 +922,10 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset, ((server->ops->compare_fids(cfile, cur_cfile) && current->tgid == li->pid) || type == li->type)) continue; + if (rw_check == CIFS_LOCK_OP && + (flags & FL_OFDLCK) && (li->flags & FL_OFDLCK) && + server->ops->compare_fids(cfile, cur_cfile)) + continue; if (conf_lock) *conf_lock = li; return true; @@ -927,8 +935,8 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset, bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length, - __u8 type, struct cifsLockInfo **conf_lock, - int rw_check) + __u8 type, __u16 flags, + struct cifsLockInfo **conf_lock, int rw_check) { bool rc = false; struct cifs_fid_locks *cur; @@ -936,7 +944,8 @@ cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length, list_for_each_entry(cur, &cinode->llist, llist) { rc = cifs_find_fid_lock_conflict(cur, offset, length, type, - cfile, conf_lock, rw_check); + flags, cfile, conf_lock, + rw_check); if (rc) break; } @@ -964,7 +973,8 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length, down_read(&cinode->lock_sem); exist = cifs_find_lock_conflict(cfile, offset, length, type, - &conf_lock, CIFS_LOCK_OP); + flock->fl_flags, &conf_lock, + CIFS_LOCK_OP); if (exist) { flock->fl_start = conf_lock->offset; flock->fl_end = conf_lock->offset + conf_lock->length - 1; @@ -1011,7 +1021,8 @@ try_again: down_write(&cinode->lock_sem); exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length, - lock->type, &conf_lock, CIFS_LOCK_OP); + lock->type, lock->flags, &conf_lock, + CIFS_LOCK_OP); if (!exist && cinode->can_cache_brlcks) { list_add_tail(&lock->llist, &cfile->llist->locks); up_write(&cinode->lock_sem); @@ -1321,7 +1332,7 @@ cifs_read_flock(struct file_lock *flock, __u32 *type, int *lock, int *unlock, cifs_dbg(FYI, "Lease on file - not implemented yet\n"); if (flock->fl_flags & (~(FL_POSIX | FL_FLOCK | FL_SLEEP | - FL_ACCESS | FL_LEASE | FL_CLOSE))) + FL_ACCESS | FL_LEASE | FL_CLOSE | FL_OFDLCK))) cifs_dbg(FYI, "Unknown lock flags 0x%x\n", flock->fl_flags); *type = server->vals->large_lock_type; @@ -1584,7 +1595,8 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, if (lock) { struct cifsLockInfo *lock; - lock = cifs_lock_init(flock->fl_start, length, type); + lock = cifs_lock_init(flock->fl_start, length, type, + flock->fl_flags); if (!lock) return -ENOMEM; @@ -1653,7 +1665,6 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock) cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag, tcon->ses->server); - cifs_sb = CIFS_FILE_SB(file); netfid = cfile->fid.netfid; cinode = CIFS_I(file_inode(file)); @@ -2098,6 +2109,7 @@ static int cifs_writepages(struct address_space *mapping, pgoff_t end, index; struct cifs_writedata *wdata; int rc = 0; + unsigned int xid; /* * If wsize is smaller than the page cache size, default to writing @@ -2106,6 +2118,7 @@ static int cifs_writepages(struct address_space *mapping, if (cifs_sb->wsize < PAGE_SIZE) return generic_writepages(mapping, wbc); + xid = get_xid(); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; @@ -2199,6 +2212,7 @@ retry: if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = index; + free_xid(xid); return rc; } @@ -2817,8 +2831,8 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) goto out; if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from), - server->vals->exclusive_lock_type, NULL, - CIFS_WRITE_OP)) + server->vals->exclusive_lock_type, 0, + NULL, CIFS_WRITE_OP)) rc = __generic_file_write_iter(iocb, from); else rc = -EACCES; @@ -2990,7 +3004,7 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) size_t copy = min_t(size_t, remaining, PAGE_SIZE); size_t written; - if (unlikely(iter->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(iter))) { void *addr = kmap_atomic(page); written = copy_to_iter(addr, copy, iter); @@ -3302,7 +3316,7 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) if (!is_sync_kiocb(iocb)) ctx->iocb = iocb; - if (to->type == ITER_IOVEC) + if (iter_is_iovec(to)) ctx->should_dirty = true; rc = setup_aio_ctx_iter(ctx, to, READ); @@ -3388,7 +3402,7 @@ cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to) down_read(&cinode->lock_sem); if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(to), tcon->ses->server->vals->shared_lock_type, - NULL, CIFS_READ_OP)) + 0, NULL, CIFS_READ_OP)) rc = generic_file_read_iter(iocb, to); up_read(&cinode->lock_sem); return rc; @@ -3743,7 +3757,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); struct TCP_Server_Info *server; pid_t pid; + unsigned int xid; + xid = get_xid(); /* * Reads as many pages as possible from fscache. Returns -ENOBUFS * immediately if the cookie is negative @@ -3753,8 +3769,10 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, */ rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list, &num_pages); - if (rc == 0) + if (rc == 0) { + free_xid(xid); return rc; + } if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; @@ -3798,6 +3816,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, */ if (unlikely(rsize < PAGE_SIZE)) { add_credits_and_wake_if(server, credits, 0); + free_xid(xid); return 0; } @@ -3862,6 +3881,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, * allocator. */ cifs_fscache_readpages_cancel(mapping->host, page_list); + free_xid(xid); return rc; } @@ -3889,8 +3909,12 @@ static int cifs_readpage_worker(struct file *file, struct page *page, else cifs_dbg(FYI, "Bytes read %d\n", rc); - file_inode(file)->i_atime = - current_time(file_inode(file)); + /* we do not want atime to be less than mtime, it broke some apps */ + file_inode(file)->i_atime = current_time(file_inode(file)); + if (timespec64_compare(&(file_inode(file)->i_atime), &(file_inode(file)->i_mtime))) + file_inode(file)->i_atime = file_inode(file)->i_mtime; + else + file_inode(file)->i_atime = current_time(file_inode(file)); if (PAGE_SIZE > rc) memset(read_data + rc, 0, PAGE_SIZE - rc); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 6e8765f44508..1023d78673fb 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -162,7 +162,11 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) cifs_revalidate_cache(inode, fattr); spin_lock(&inode->i_lock); - inode->i_atime = fattr->cf_atime; + /* we do not want atime to be less than mtime, it broke some apps */ + if (timespec64_compare(&fattr->cf_atime, &fattr->cf_mtime)) + inode->i_atime = fattr->cf_mtime; + else + inode->i_atime = fattr->cf_atime; inode->i_mtime = fattr->cf_mtime; inode->i_ctime = fattr->cf_ctime; inode->i_rdev = fattr->cf_rdev; @@ -777,38 +781,53 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, } else if (rc == -EREMOTE) { cifs_create_dfs_fattr(&fattr, sb); rc = 0; - } else if (rc == -EACCES && backup_cred(cifs_sb)) { - srchinf = kzalloc(sizeof(struct cifs_search_info), - GFP_KERNEL); - if (srchinf == NULL) { - rc = -ENOMEM; - goto cgii_exit; - } + } else if ((rc == -EACCES) && backup_cred(cifs_sb) && + (strcmp(server->vals->version_string, SMB1_VERSION_STRING) + == 0)) { + /* + * For SMB2 and later the backup intent flag is already + * sent if needed on open and there is no path based + * FindFirst operation to use to retry with + */ - srchinf->endOfSearch = false; + srchinf = kzalloc(sizeof(struct cifs_search_info), + GFP_KERNEL); + if (srchinf == NULL) { + rc = -ENOMEM; + goto cgii_exit; + } + + srchinf->endOfSearch = false; + if (tcon->unix_ext) + srchinf->info_level = SMB_FIND_FILE_UNIX; + else if ((tcon->ses->capabilities & + tcon->ses->server->vals->cap_nt_find) == 0) + srchinf->info_level = SMB_FIND_FILE_INFO_STANDARD; + else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) srchinf->info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO; + else /* no srvino useful for fallback to some netapp */ + srchinf->info_level = SMB_FIND_FILE_DIRECTORY_INFO; - srchflgs = CIFS_SEARCH_CLOSE_ALWAYS | - CIFS_SEARCH_CLOSE_AT_END | - CIFS_SEARCH_BACKUP_SEARCH; + srchflgs = CIFS_SEARCH_CLOSE_ALWAYS | + CIFS_SEARCH_CLOSE_AT_END | + CIFS_SEARCH_BACKUP_SEARCH; - rc = CIFSFindFirst(xid, tcon, full_path, - cifs_sb, NULL, srchflgs, srchinf, false); - if (!rc) { - data = - (FILE_ALL_INFO *)srchinf->srch_entries_start; + rc = CIFSFindFirst(xid, tcon, full_path, + cifs_sb, NULL, srchflgs, srchinf, false); + if (!rc) { + data = (FILE_ALL_INFO *)srchinf->srch_entries_start; - cifs_dir_info_to_fattr(&fattr, - (FILE_DIRECTORY_INFO *)data, cifs_sb); - fattr.cf_uniqueid = le64_to_cpu( - ((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId); - validinum = true; + cifs_dir_info_to_fattr(&fattr, + (FILE_DIRECTORY_INFO *)data, cifs_sb); + fattr.cf_uniqueid = le64_to_cpu( + ((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId); + validinum = true; - cifs_buf_release(srchinf->ntwrk_buf_start); - } - kfree(srchinf); - if (rc) - goto cgii_exit; + cifs_buf_release(srchinf->ntwrk_buf_start); + } + kfree(srchinf); + if (rc) + goto cgii_exit; } else goto cgii_exit; diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 54f32f9143a9..76ddd98b6298 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -32,8 +32,51 @@ #include "cifs_debug.h" #include "cifsfs.h" #include "cifs_ioctl.h" +#include "smb2proto.h" #include <linux/btrfs.h> +static long cifs_ioctl_query_info(unsigned int xid, struct file *filep, + unsigned long p) +{ + struct inode *inode = file_inode(filep); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + struct dentry *dentry = filep->f_path.dentry; + unsigned char *path; + __le16 *utf16_path = NULL, root_path; + int rc = 0; + + path = build_path_from_dentry(dentry); + if (path == NULL) + return -ENOMEM; + + cifs_dbg(FYI, "%s %s\n", __func__, path); + + if (!path[0]) { + root_path = 0; + utf16_path = &root_path; + } else { + utf16_path = cifs_convert_path_to_utf16(path + 1, cifs_sb); + if (!utf16_path) { + rc = -ENOMEM; + goto ici_exit; + } + } + + if (tcon->ses->server->ops->ioctl_query_info) + rc = tcon->ses->server->ops->ioctl_query_info( + xid, tcon, utf16_path, + filep->private_data ? 0 : 1, p); + else + rc = -EOPNOTSUPP; + + ici_exit: + if (utf16_path != &root_path) + kfree(utf16_path); + kfree(path); + return rc; +} + static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file, unsigned long srcfd) { @@ -123,7 +166,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) struct inode *inode = file_inode(filep); int rc = -ENOTTY; /* strange error - but the precedent */ unsigned int xid; - struct cifs_sb_info *cifs_sb; struct cifsFileInfo *pSMBFile = filep->private_data; struct cifs_tcon *tcon; __u64 ExtAttrBits = 0; @@ -131,7 +173,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) xid = get_xid(); - cifs_sb = CIFS_SB(inode->i_sb); cifs_dbg(FYI, "cifs ioctl 0x%x\n", command); switch (command) { case FS_IOC_GETFLAGS: @@ -196,6 +237,9 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) case CIFS_IOC_COPYCHUNK_FILE: rc = cifs_ioctl_copychunk(xid, filep, arg); break; + case CIFS_QUERY_INFO: + rc = cifs_ioctl_query_info(xid, filep, arg); + break; case CIFS_IOC_SET_INTEGRITY: if (pSMBFile == NULL) break; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 6926685e513c..8a41f4eba726 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -123,6 +123,8 @@ tconInfoAlloc(void) ret_buf->crfid.fid = kzalloc(sizeof(struct cifs_fid), GFP_KERNEL); spin_lock_init(&ret_buf->stat_lock); + atomic_set(&ret_buf->num_local_opens, 0); + atomic_set(&ret_buf->num_remote_opens, 0); } return ret_buf; } @@ -786,7 +788,7 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) struct page **pages = NULL; struct bio_vec *bv = NULL; - if (iter->type & ITER_KVEC) { + if (iov_iter_is_kvec(iter)) { memcpy(&ctx->iter, iter, sizeof(struct iov_iter)); ctx->len = count; iov_iter_advance(iter, count); @@ -857,7 +859,7 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) ctx->bv = bv; ctx->len = saved_len - count; ctx->npages = npages; - iov_iter_bvec(&ctx->iter, ITER_BVEC | rw, ctx->bv, npages, ctx->len); + iov_iter_bvec(&ctx->iter, rw, ctx->bv, npages, ctx->len); return 0; } diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index 0ffa18094335..dd10f0ce4cd5 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -33,7 +33,7 @@ /* * Identifiers for functions that use the open, operation, close pattern - * in smb2inode.c:smb2_open_op_close() + * in smb2inode.c:smb2_compound_op() */ #define SMB2_OP_SET_DELETE 1 #define SMB2_OP_SET_INFO 2 diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 1eef1791d0c4..9e7ef7ec2d70 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -38,54 +38,83 @@ #include "smb2proto.h" static int -smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - __u32 desired_access, __u32 create_disposition, - __u32 create_options, void *data, int command) +smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + __u32 desired_access, __u32 create_disposition, + __u32 create_options, void *ptr, int command) { - int rc, tmprc = 0; + int rc; __le16 *utf16_path = NULL; __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct cifs_open_parms oparms; struct cifs_fid fid; - bool use_cached_root_handle = false; - - if ((strcmp(full_path, "") == 0) && (create_options == 0) && - (desired_access == FILE_READ_ATTRIBUTES) && - (create_disposition == FILE_OPEN) && - (tcon->nohandlecache == false)) { - rc = open_shroot(xid, tcon, &fid); - if (rc == 0) - use_cached_root_handle = true; - } + struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = ses->server; + int num_rqst = 0; + struct smb_rqst rqst[3]; + int resp_buftype[3]; + struct kvec rsp_iov[3]; + struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; + struct kvec qi_iov[1]; + struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; + struct kvec close_iov[1]; + struct smb2_query_info_rsp *qi_rsp = NULL; + int flags = 0; + __u8 delete_pending[8] = {1, 0, 0, 0, 0, 0, 0, 0}; + unsigned int size[2]; + void *data[2]; + struct smb2_file_rename_info rename_info; + struct smb2_file_link_info link_info; + int len; - if (use_cached_root_handle == false) { - utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); - if (!utf16_path) - return -ENOMEM; - - oparms.tcon = tcon; - oparms.desired_access = desired_access; - oparms.disposition = create_disposition; - oparms.create_options = create_options; - oparms.fid = &fid; - oparms.reconnect = false; - - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, - NULL); - if (rc) { - kfree(utf16_path); - return rc; - } - } + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + memset(rqst, 0, sizeof(rqst)); + resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; + memset(rsp_iov, 0, sizeof(rsp_iov)); + + /* Open */ + utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); + if (!utf16_path) + return -ENOMEM; + + oparms.tcon = tcon; + oparms.desired_access = desired_access; + oparms.disposition = create_disposition; + oparms.create_options = create_options; + if (backup_cred(cifs_sb)) + oparms.create_options |= CREATE_OPEN_BACKUP_INTENT; + oparms.fid = &fid; + oparms.reconnect = false; + memset(&open_iov, 0, sizeof(open_iov)); + rqst[num_rqst].rq_iov = open_iov; + rqst[num_rqst].rq_nvec = SMB2_CREATE_IOV_SIZE; + rc = SMB2_open_init(tcon, &rqst[num_rqst], &oplock, &oparms, + utf16_path); + kfree(utf16_path); + if (rc) + goto finished; + + smb2_set_next_command(server, &rqst[num_rqst++]); + + /* Operation */ switch (command) { - case SMB2_OP_DELETE: - break; case SMB2_OP_QUERY_INFO: - tmprc = SMB2_query_info(xid, tcon, fid.persistent_fid, - fid.volatile_fid, - (struct smb2_file_all_info *)data); + memset(&qi_iov, 0, sizeof(qi_iov)); + rqst[num_rqst].rq_iov = qi_iov; + rqst[num_rqst].rq_nvec = 1; + + rc = SMB2_query_info_init(tcon, &rqst[num_rqst], COMPOUND_FID, + COMPOUND_FID, FILE_ALL_INFORMATION, + SMB2_O_INFO_FILE, 0, + sizeof(struct smb2_file_all_info) + + PATH_MAX * 2, 0, NULL); + smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst++]); + break; + case SMB2_OP_DELETE: break; case SMB2_OP_MKDIR: /* @@ -94,39 +123,156 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, */ break; case SMB2_OP_RMDIR: - tmprc = SMB2_rmdir(xid, tcon, fid.persistent_fid, - fid.volatile_fid); - break; - case SMB2_OP_RENAME: - tmprc = SMB2_rename(xid, tcon, fid.persistent_fid, - fid.volatile_fid, (__le16 *)data); - break; - case SMB2_OP_HARDLINK: - tmprc = SMB2_set_hardlink(xid, tcon, fid.persistent_fid, - fid.volatile_fid, (__le16 *)data); + memset(&si_iov, 0, sizeof(si_iov)); + rqst[num_rqst].rq_iov = si_iov; + rqst[num_rqst].rq_nvec = 1; + + size[0] = 8; + data[0] = &delete_pending[0]; + + rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID, + COMPOUND_FID, current->tgid, + FILE_DISPOSITION_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst++]); break; case SMB2_OP_SET_EOF: - tmprc = SMB2_set_eof(xid, tcon, fid.persistent_fid, - fid.volatile_fid, current->tgid, - (__le64 *)data, false); + memset(&si_iov, 0, sizeof(si_iov)); + rqst[num_rqst].rq_iov = si_iov; + rqst[num_rqst].rq_nvec = 1; + + size[0] = 8; /* sizeof __le64 */ + data[0] = ptr; + + rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID, + COMPOUND_FID, current->tgid, + FILE_END_OF_FILE_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst++]); break; case SMB2_OP_SET_INFO: - tmprc = SMB2_set_info(xid, tcon, fid.persistent_fid, - fid.volatile_fid, - (FILE_BASIC_INFO *)data); + memset(&si_iov, 0, sizeof(si_iov)); + rqst[num_rqst].rq_iov = si_iov; + rqst[num_rqst].rq_nvec = 1; + + + size[0] = sizeof(FILE_BASIC_INFO); + data[0] = ptr; + + rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID, + COMPOUND_FID, current->tgid, + FILE_BASIC_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst++]); + break; + case SMB2_OP_RENAME: + memset(&si_iov, 0, sizeof(si_iov)); + rqst[num_rqst].rq_iov = si_iov; + rqst[num_rqst].rq_nvec = 2; + + len = (2 * UniStrnlen((wchar_t *)ptr, PATH_MAX)); + + rename_info.ReplaceIfExists = 1; + rename_info.RootDirectory = 0; + rename_info.FileNameLength = cpu_to_le32(len); + + size[0] = sizeof(struct smb2_file_rename_info); + data[0] = &rename_info; + + size[1] = len + 2 /* null */; + data[1] = (__le16 *)ptr; + + rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID, + COMPOUND_FID, current->tgid, + FILE_RENAME_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst++]); + break; + case SMB2_OP_HARDLINK: + memset(&si_iov, 0, sizeof(si_iov)); + rqst[num_rqst].rq_iov = si_iov; + rqst[num_rqst].rq_nvec = 2; + + len = (2 * UniStrnlen((wchar_t *)ptr, PATH_MAX)); + + link_info.ReplaceIfExists = 0; + link_info.RootDirectory = 0; + link_info.FileNameLength = cpu_to_le32(len); + + size[0] = sizeof(struct smb2_file_link_info); + data[0] = &link_info; + + size[1] = len + 2 /* null */; + data[1] = (__le16 *)ptr; + + rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID, + COMPOUND_FID, current->tgid, + FILE_LINK_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst++]); break; default: cifs_dbg(VFS, "Invalid command\n"); - break; + rc = -EINVAL; } + if (rc) + goto finished; - if (use_cached_root_handle) - close_shroot(&tcon->crfid); - else - rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); - if (tmprc) - rc = tmprc; - kfree(utf16_path); + /* Close */ + memset(&close_iov, 0, sizeof(close_iov)); + rqst[num_rqst].rq_iov = close_iov; + rqst[num_rqst].rq_nvec = 1; + rc = SMB2_close_init(tcon, &rqst[num_rqst], COMPOUND_FID, + COMPOUND_FID); + smb2_set_related(&rqst[num_rqst++]); + if (rc) + goto finished; + + rc = compound_send_recv(xid, ses, flags, num_rqst, rqst, + resp_buftype, rsp_iov); + + finished: + SMB2_open_free(&rqst[0]); + switch (command) { + case SMB2_OP_QUERY_INFO: + if (rc == 0) { + qi_rsp = (struct smb2_query_info_rsp *) + rsp_iov[1].iov_base; + rc = smb2_validate_and_copy_iov( + le16_to_cpu(qi_rsp->OutputBufferOffset), + le32_to_cpu(qi_rsp->OutputBufferLength), + &rsp_iov[1], sizeof(struct smb2_file_all_info), + ptr); + } + if (rqst[1].rq_iov) + SMB2_query_info_free(&rqst[1]); + if (rqst[2].rq_iov) + SMB2_close_free(&rqst[2]); + break; + case SMB2_OP_DELETE: + case SMB2_OP_MKDIR: + if (rqst[1].rq_iov) + SMB2_close_free(&rqst[1]); + break; + case SMB2_OP_HARDLINK: + case SMB2_OP_RENAME: + case SMB2_OP_RMDIR: + case SMB2_OP_SET_EOF: + case SMB2_OP_SET_INFO: + if (rqst[1].rq_iov) + SMB2_set_info_free(&rqst[1]); + if (rqst[2].rq_iov) + SMB2_close_free(&rqst[2]); + break; + } + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); return rc; } @@ -147,6 +293,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, { int rc; struct smb2_file_all_info *smb2_data; + __u32 create_options = 0; *adjust_tz = false; *symlink = false; @@ -155,17 +302,21 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, GFP_KERNEL); if (smb2_data == NULL) return -ENOMEM; + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; - rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, 0, - smb2_data, SMB2_OP_QUERY_INFO); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, + FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, + smb2_data, SMB2_OP_QUERY_INFO); if (rc == -EOPNOTSUPP) { *symlink = true; + create_options |= OPEN_REPARSE_POINT; + /* Failed on a symbolic link - query a reparse point info */ - rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, - OPEN_REPARSE_POINT, smb2_data, - SMB2_OP_QUERY_INFO); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, + FILE_READ_ATTRIBUTES, FILE_OPEN, + create_options, smb2_data, + SMB2_OP_QUERY_INFO); } if (rc) goto out; @@ -180,9 +331,9 @@ int smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { - return smb2_open_op_close(xid, tcon, cifs_sb, name, - FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR); + return smb2_compound_op(xid, tcon, cifs_sb, name, + FILE_WRITE_ATTRIBUTES, FILE_CREATE, + CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR); } void @@ -199,9 +350,9 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, cifs_i = CIFS_I(inode); dosattrs = cifs_i->cifsAttrs | ATTR_READONLY; data.Attributes = cpu_to_le32(dosattrs); - tmprc = smb2_open_op_close(xid, tcon, cifs_sb, name, - FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO); + tmprc = smb2_compound_op(xid, tcon, cifs_sb, name, + FILE_WRITE_ATTRIBUTES, FILE_CREATE, + CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO); if (tmprc == 0) cifs_i->cifsAttrs = dosattrs; } @@ -210,18 +361,18 @@ int smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { - return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, - CREATE_NOT_FILE, - NULL, SMB2_OP_RMDIR); + return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, + CREATE_NOT_FILE, + NULL, SMB2_OP_RMDIR); } int smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { - return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, - CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - NULL, SMB2_OP_DELETE); + return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, + CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, + NULL, SMB2_OP_DELETE); } static int @@ -238,8 +389,8 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, goto smb2_rename_path; } - rc = smb2_open_op_close(xid, tcon, cifs_sb, from_name, access, - FILE_OPEN, 0, smb2_to_name, command); + rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access, + FILE_OPEN, 0, smb2_to_name, command); smb2_rename_path: kfree(smb2_to_name); return rc; @@ -269,9 +420,10 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, bool set_alloc) { __le64 eof = cpu_to_le64(size); - return smb2_open_op_close(xid, tcon, cifs_sb, full_path, - FILE_WRITE_DATA, FILE_OPEN, 0, &eof, - SMB2_OP_SET_EOF); + + return smb2_compound_op(xid, tcon, cifs_sb, full_path, + FILE_WRITE_DATA, FILE_OPEN, 0, &eof, + SMB2_OP_SET_EOF); } int @@ -291,9 +443,9 @@ smb2_set_file_info(struct inode *inode, const char *full_path, if (IS_ERR(tlink)) return PTR_ERR(tlink); - rc = smb2_open_op_close(xid, tlink_tcon(tlink), cifs_sb, full_path, - FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf, - SMB2_OP_SET_INFO); + rc = smb2_compound_op(xid, tlink_tcon(tlink), cifs_sb, full_path, + FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf, + SMB2_OP_SET_INFO); cifs_put_tlink(tlink); return rc; } diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 20a2d304c603..d47b7f5dfa6c 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -288,7 +288,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_FLT_BUFFER_TOO_SMALL, -ENOBUFS, "STATUS_FLT_BUFFER_TOO_SMALL"}, {STATUS_FVE_PARTIAL_METADATA, -EIO, "STATUS_FVE_PARTIAL_METADATA"}, {STATUS_UNSUCCESSFUL, -EIO, "STATUS_UNSUCCESSFUL"}, - {STATUS_NOT_IMPLEMENTED, -ENOSYS, "STATUS_NOT_IMPLEMENTED"}, + {STATUS_NOT_IMPLEMENTED, -EOPNOTSUPP, "STATUS_NOT_IMPLEMENTED"}, {STATUS_INVALID_INFO_CLASS, -EIO, "STATUS_INVALID_INFO_CLASS"}, {STATUS_INFO_LENGTH_MISMATCH, -EIO, "STATUS_INFO_LENGTH_MISMATCH"}, {STATUS_ACCESS_VIOLATION, -EACCES, "STATUS_ACCESS_VIOLATION"}, diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 89985a0a6819..23c0a21a66f8 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -74,6 +74,12 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, int *val, rc = 0; spin_lock(&server->req_lock); val = server->ops->get_credits_field(server, optype); + + /* eg found case where write overlapping reconnect messed up credits */ + if (((optype & CIFS_OP_MASK) == CIFS_NEG_OP) && (*val != 0)) + trace_smb3_reconnect_with_invalid_credits(server->CurrentMid, + server->hostname, *val); + *val += add; if (*val > 65000) { *val = 65000; /* Don't get near 64K credits, avoid srv bugs */ @@ -104,7 +110,12 @@ smb2_set_credits(struct TCP_Server_Info *server, const int val) { spin_lock(&server->req_lock); server->credits = val; + if (val == 1) + server->reconnect_instance++; spin_unlock(&server->req_lock); + /* don't log while holding the lock */ + if (val == 1) + cifs_dbg(FYI, "set credits to 1 due to smb2 reconnect\n"); } static int * @@ -270,6 +281,31 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) } static unsigned int +smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) +{ + struct TCP_Server_Info *server = tcon->ses->server; + unsigned int wsize; + + /* start with specified wsize, or default */ + wsize = volume_info->wsize ? volume_info->wsize : SMB3_DEFAULT_IOSIZE; + wsize = min_t(unsigned int, wsize, server->max_write); +#ifdef CONFIG_CIFS_SMB_DIRECT + if (server->rdma) { + if (server->sign) + wsize = min_t(unsigned int, + wsize, server->smbd_conn->max_fragmented_send_size); + else + wsize = min_t(unsigned int, + wsize, server->smbd_conn->max_readwrite_size); + } +#endif + if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) + wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); + + return wsize; +} + +static unsigned int smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) { struct TCP_Server_Info *server = tcon->ses->server; @@ -295,6 +331,31 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) return rsize; } +static unsigned int +smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) +{ + struct TCP_Server_Info *server = tcon->ses->server; + unsigned int rsize; + + /* start with specified rsize, or default */ + rsize = volume_info->rsize ? volume_info->rsize : SMB3_DEFAULT_IOSIZE; + rsize = min_t(unsigned int, rsize, server->max_read); +#ifdef CONFIG_CIFS_SMB_DIRECT + if (server->rdma) { + if (server->sign) + rsize = min_t(unsigned int, + rsize, server->smbd_conn->max_fragmented_recv_size); + else + rsize = min_t(unsigned int, + rsize, server->smbd_conn->max_readwrite_size); + } +#endif + + if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) + rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); + + return rsize; +} static int parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, @@ -962,6 +1023,9 @@ smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon) seq_printf(m, "\nBytes read: %llu Bytes written: %llu", (long long)(tcon->bytes_read), (long long)(tcon->bytes_written)); + seq_printf(m, "\nOpen files: %d total (local), %d open on server", + atomic_read(&tcon->num_local_opens), + atomic_read(&tcon->num_remote_opens)); seq_printf(m, "\nTreeConnects: %d total %d failed", atomic_read(&sent[SMB2_TREE_CONNECT_HE]), atomic_read(&failed[SMB2_TREE_CONNECT_HE])); @@ -1057,6 +1121,131 @@ req_res_key_exit: return rc; } +static int +smb2_ioctl_query_info(const unsigned int xid, + struct cifs_tcon *tcon, + __le16 *path, int is_dir, + unsigned long p) +{ + struct cifs_ses *ses = tcon->ses; + char __user *arg = (char __user *)p; + struct smb_query_info qi; + struct smb_query_info __user *pqi; + int rc = 0; + int flags = 0; + struct smb2_query_info_rsp *rsp = NULL; + void *buffer = NULL; + struct smb_rqst rqst[3]; + int resp_buftype[3]; + struct kvec rsp_iov[3]; + struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; + struct cifs_open_parms oparms; + u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct cifs_fid fid; + struct kvec qi_iov[1]; + struct kvec close_iov[1]; + + memset(rqst, 0, sizeof(rqst)); + resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; + memset(rsp_iov, 0, sizeof(rsp_iov)); + + if (copy_from_user(&qi, arg, sizeof(struct smb_query_info))) + return -EFAULT; + + if (qi.output_buffer_length > 1024) + return -EINVAL; + + if (!ses || !(ses->server)) + return -EIO; + + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + buffer = kmalloc(qi.output_buffer_length, GFP_KERNEL); + if (buffer == NULL) + return -ENOMEM; + + if (copy_from_user(buffer, arg + sizeof(struct smb_query_info), + qi.output_buffer_length)) { + rc = -EFAULT; + goto iqinf_exit; + } + + /* Open */ + memset(&open_iov, 0, sizeof(open_iov)); + rqst[0].rq_iov = open_iov; + rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; + + memset(&oparms, 0, sizeof(oparms)); + oparms.tcon = tcon; + oparms.desired_access = FILE_READ_ATTRIBUTES | READ_CONTROL; + oparms.disposition = FILE_OPEN; + if (is_dir) + oparms.create_options = CREATE_NOT_FILE; + else + oparms.create_options = CREATE_NOT_DIR; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, path); + if (rc) + goto iqinf_exit; + smb2_set_next_command(ses->server, &rqst[0]); + + /* Query */ + memset(&qi_iov, 0, sizeof(qi_iov)); + rqst[1].rq_iov = qi_iov; + rqst[1].rq_nvec = 1; + + rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, COMPOUND_FID, + qi.file_info_class, qi.info_type, + qi.additional_information, + qi.input_buffer_length, + qi.output_buffer_length, buffer); + if (rc) + goto iqinf_exit; + smb2_set_next_command(ses->server, &rqst[1]); + smb2_set_related(&rqst[1]); + + /* Close */ + memset(&close_iov, 0, sizeof(close_iov)); + rqst[2].rq_iov = close_iov; + rqst[2].rq_nvec = 1; + + rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID); + if (rc) + goto iqinf_exit; + smb2_set_related(&rqst[2]); + + rc = compound_send_recv(xid, ses, flags, 3, rqst, + resp_buftype, rsp_iov); + if (rc) + goto iqinf_exit; + pqi = (struct smb_query_info __user *)arg; + rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; + if (le32_to_cpu(rsp->OutputBufferLength) < qi.input_buffer_length) + qi.input_buffer_length = le32_to_cpu(rsp->OutputBufferLength); + if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, + sizeof(qi.input_buffer_length))) { + rc = -EFAULT; + goto iqinf_exit; + } + if (copy_to_user(pqi + 1, rsp->Buffer, qi.input_buffer_length)) { + rc = -EFAULT; + goto iqinf_exit; + } + + iqinf_exit: + kfree(buffer); + SMB2_open_free(&rqst[0]); + SMB2_query_info_free(&rqst[1]); + SMB2_close_free(&rqst[2]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + return rc; +} + static ssize_t smb2_copychunk_range(const unsigned int xid, struct cifsFileInfo *srcfile, @@ -1301,7 +1490,7 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, } return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, - cfile->fid.volatile_fid, cfile->pid, &eof, false); + cfile->fid.volatile_fid, cfile->pid, &eof); } static int @@ -1556,7 +1745,7 @@ smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid, CIFS_CACHE_READ(cinode) ? 1 : 0); } -static void +void smb2_set_related(struct smb_rqst *rqst) { struct smb2_sync_hdr *shdr; @@ -1567,7 +1756,7 @@ smb2_set_related(struct smb_rqst *rqst) char smb2_padding[7] = {0, 0, 0, 0, 0, 0, 0}; -static void +void smb2_set_next_command(struct TCP_Server_Info *server, struct smb_rqst *rqst) { struct smb2_sync_hdr *shdr; @@ -1610,7 +1799,7 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon, flags |= CIFS_TRANSFORM_REQ; memset(rqst, 0, sizeof(rqst)); - memset(resp_buftype, 0, sizeof(resp_buftype)); + resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; memset(rsp_iov, 0, sizeof(rsp_iov)); memset(&open_iov, 0, sizeof(open_iov)); @@ -1636,7 +1825,8 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, COMPOUND_FID, FS_FULL_SIZE_INFORMATION, SMB2_O_INFO_FILESYSTEM, 0, - sizeof(struct smb2_fs_full_size_info)); + sizeof(struct smb2_fs_full_size_info), 0, + NULL); if (rc) goto qfs_exit; smb2_set_next_command(server, &rqst[1]); @@ -2962,13 +3152,13 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - iov_iter_bvec(&iter, WRITE | ITER_BVEC, bvec, npages, data_len); + iov_iter_bvec(&iter, WRITE, bvec, npages, data_len); } else if (buf_len >= data_offset + data_len) { /* read response payload is in buf */ WARN_ONCE(npages > 0, "read data can be either in buf or in pages"); iov.iov_base = buf + data_offset; iov.iov_len = data_len; - iov_iter_kvec(&iter, WRITE | ITER_KVEC, &iov, 1, data_len); + iov_iter_kvec(&iter, WRITE, &iov, 1, data_len); } else { /* read response payload cannot be in both buf and pages */ WARN_ONCE(1, "buf can not contain only a part of read data"); @@ -3303,6 +3493,7 @@ struct smb_version_operations smb20_operations = { .set_acl = set_smb2_acl, #endif /* CIFS_ACL */ .next_header = smb2_next_header, + .ioctl_query_info = smb2_ioctl_query_info, }; struct smb_version_operations smb21_operations = { @@ -3398,6 +3589,7 @@ struct smb_version_operations smb21_operations = { .set_acl = set_smb2_acl, #endif /* CIFS_ACL */ .next_header = smb2_next_header, + .ioctl_query_info = smb2_ioctl_query_info, }; struct smb_version_operations smb30_operations = { @@ -3425,8 +3617,8 @@ struct smb_version_operations smb30_operations = { .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, - .negotiate_wsize = smb2_negotiate_wsize, - .negotiate_rsize = smb2_negotiate_rsize, + .negotiate_wsize = smb3_negotiate_wsize, + .negotiate_rsize = smb3_negotiate_rsize, .sess_setup = SMB2_sess_setup, .logoff = SMB2_logoff, .tree_connect = SMB2_tcon, @@ -3502,6 +3694,7 @@ struct smb_version_operations smb30_operations = { .set_acl = set_smb2_acl, #endif /* CIFS_ACL */ .next_header = smb2_next_header, + .ioctl_query_info = smb2_ioctl_query_info, }; struct smb_version_operations smb311_operations = { @@ -3529,8 +3722,8 @@ struct smb_version_operations smb311_operations = { .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, - .negotiate_wsize = smb2_negotiate_wsize, - .negotiate_rsize = smb2_negotiate_rsize, + .negotiate_wsize = smb3_negotiate_wsize, + .negotiate_rsize = smb3_negotiate_rsize, .sess_setup = SMB2_sess_setup, .logoff = SMB2_logoff, .tree_connect = SMB2_tcon, @@ -3607,6 +3800,7 @@ struct smb_version_operations smb311_operations = { .set_acl = set_smb2_acl, #endif /* CIFS_ACL */ .next_header = smb2_next_header, + .ioctl_query_info = smb2_ioctl_query_info, }; struct smb_version_values smb20_values = { diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index f54d07bda067..7d7b016fe8bb 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1478,7 +1478,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, /* SMB2 TREE_CONNECT request must be called with TreeId == 0 */ tcon->tid = 0; - + atomic_set(&tcon->num_remote_opens, 0); rc = smb2_plain_req_init(SMB2_TREE_CONNECT, tcon, (void **) &req, &total_len); if (rc) { @@ -2243,10 +2243,12 @@ SMB2_open_free(struct smb_rqst *rqst) { int i; - cifs_small_buf_release(rqst->rq_iov[0].iov_base); - for (i = 1; i < rqst->rq_nvec; i++) - if (rqst->rq_iov[i].iov_base != smb2_padding) - kfree(rqst->rq_iov[i].iov_base); + if (rqst && rqst->rq_iov) { + cifs_small_buf_release(rqst->rq_iov[0].iov_base); + for (i = 1; i < rqst->rq_nvec; i++) + if (rqst->rq_iov[i].iov_base != smb2_padding) + kfree(rqst->rq_iov[i].iov_base); + } } int @@ -2261,7 +2263,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, struct cifs_ses *ses = tcon->ses; struct kvec iov[SMB2_CREATE_IOV_SIZE]; struct kvec rsp_iov = {NULL, 0}; - int resp_buftype; + int resp_buftype = CIFS_NO_BUFFER; int rc = 0; int flags = 0; @@ -2303,6 +2305,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, ses->Suid, oparms->create_options, oparms->desired_access); + atomic_inc(&tcon->num_remote_opens); oparms->fid->persistent_fid = rsp->PersistentFileId; oparms->fid->volatile_fid = rsp->VolatileFileId; @@ -2474,13 +2477,13 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, goto ioctl_exit; } - *out_data = kmalloc(*plen, GFP_KERNEL); + *out_data = kmemdup((char *)rsp + le32_to_cpu(rsp->OutputOffset), + *plen, GFP_KERNEL); if (*out_data == NULL) { rc = -ENOMEM; goto ioctl_exit; } - memcpy(*out_data, (char *)rsp + le32_to_cpu(rsp->OutputOffset), *plen); ioctl_exit: free_rsp_buf(resp_buftype, rsp); return rc; @@ -2535,7 +2538,8 @@ SMB2_close_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, void SMB2_close_free(struct smb_rqst *rqst) { - cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ + if (rqst && rqst->rq_iov) + cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ } int @@ -2547,7 +2551,7 @@ SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_ses *ses = tcon->ses; struct kvec iov[1]; struct kvec rsp_iov; - int resp_buftype; + int resp_buftype = CIFS_NO_BUFFER; int rc = 0; cifs_dbg(FYI, "Close\n"); @@ -2577,6 +2581,8 @@ SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, goto close_exit; } + atomic_dec(&tcon->num_remote_opens); + /* BB FIXME - decode close response, update inode for caching */ close_exit: @@ -2627,10 +2633,10 @@ smb2_validate_iov(unsigned int offset, unsigned int buffer_length, * If SMB buffer fields are valid, copy into temporary buffer to hold result. * Caller must free buffer. */ -static int -validate_and_copy_iov(unsigned int offset, unsigned int buffer_length, - struct kvec *iov, unsigned int minbufsize, - char *data) +int +smb2_validate_and_copy_iov(unsigned int offset, unsigned int buffer_length, + struct kvec *iov, unsigned int minbufsize, + char *data) { char *begin_of_buf = offset + (char *)iov->iov_base; int rc; @@ -2651,7 +2657,7 @@ int SMB2_query_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, u64 persistent_fid, u64 volatile_fid, u8 info_class, u8 info_type, u32 additional_info, - size_t output_len) + size_t output_len, size_t input_len, void *input) { struct smb2_query_info_req *req; struct kvec *iov = rqst->rq_iov; @@ -2669,23 +2675,25 @@ SMB2_query_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, req->VolatileFileId = volatile_fid; req->AdditionalInformation = cpu_to_le32(additional_info); - /* - * We do not use the input buffer (do not send extra byte) - */ - req->InputBufferOffset = 0; - req->OutputBufferLength = cpu_to_le32(output_len); + if (input_len) { + req->InputBufferLength = cpu_to_le32(input_len); + /* total_len for smb query request never close to le16 max */ + req->InputBufferOffset = cpu_to_le16(total_len - 1); + memcpy(req->Buffer, input, input_len); + } iov[0].iov_base = (char *)req; /* 1 for Buffer */ - iov[0].iov_len = total_len - 1; + iov[0].iov_len = total_len - 1 + input_len; return 0; } void SMB2_query_info_free(struct smb_rqst *rqst) { - cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ + if (rqst && rqst->rq_iov) + cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ } static int @@ -2699,7 +2707,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, struct kvec iov[1]; struct kvec rsp_iov; int rc = 0; - int resp_buftype; + int resp_buftype = CIFS_NO_BUFFER; struct cifs_ses *ses = tcon->ses; int flags = 0; @@ -2718,7 +2726,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_query_info_init(tcon, &rqst, persistent_fid, volatile_fid, info_class, info_type, additional_info, - output_len); + output_len, 0, NULL); if (rc) goto qinf_exit; @@ -2746,9 +2754,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, } } - rc = validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset), - le32_to_cpu(rsp->OutputBufferLength), - &rsp_iov, min_len, *data); + rc = smb2_validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset), + le32_to_cpu(rsp->OutputBufferLength), + &rsp_iov, min_len, *data); qinf_exit: SMB2_query_info_free(&rqst); @@ -3754,45 +3762,22 @@ qdir_exit: return rc; } -static int -send_set_info(const unsigned int xid, struct cifs_tcon *tcon, +int +SMB2_set_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, u64 persistent_fid, u64 volatile_fid, u32 pid, u8 info_class, - u8 info_type, u32 additional_info, unsigned int num, + u8 info_type, u32 additional_info, void **data, unsigned int *size) { - struct smb_rqst rqst; struct smb2_set_info_req *req; - struct smb2_set_info_rsp *rsp = NULL; - struct kvec *iov; - struct kvec rsp_iov; - int rc = 0; - int resp_buftype; - unsigned int i; - struct cifs_ses *ses = tcon->ses; - int flags = 0; - unsigned int total_len; - - if (!ses || !(ses->server)) - return -EIO; - - if (!num) - return -EINVAL; - - iov = kmalloc_array(num, sizeof(struct kvec), GFP_KERNEL); - if (!iov) - return -ENOMEM; + struct kvec *iov = rqst->rq_iov; + unsigned int i, total_len; + int rc; rc = smb2_plain_req_init(SMB2_SET_INFO, tcon, (void **) &req, &total_len); - if (rc) { - kfree(iov); + if (rc) return rc; - } - - if (smb3_encryption_required(tcon)) - flags |= CIFS_TRANSFORM_REQ; req->sync_hdr.ProcessId = cpu_to_le32(pid); - req->InfoType = info_type; req->FileInfoClass = info_class; req->PersistentFileId = persistent_fid; @@ -3810,19 +3795,66 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, /* 1 for Buffer */ iov[0].iov_len = total_len - 1; - for (i = 1; i < num; i++) { + for (i = 1; i < rqst->rq_nvec; i++) { le32_add_cpu(&req->BufferLength, size[i]); iov[i].iov_base = (char *)data[i]; iov[i].iov_len = size[i]; } + return 0; +} + +void +SMB2_set_info_free(struct smb_rqst *rqst) +{ + if (rqst && rqst->rq_iov) + cifs_buf_release(rqst->rq_iov[0].iov_base); /* request */ +} + +static int +send_set_info(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, u32 pid, u8 info_class, + u8 info_type, u32 additional_info, unsigned int num, + void **data, unsigned int *size) +{ + struct smb_rqst rqst; + struct smb2_set_info_rsp *rsp = NULL; + struct kvec *iov; + struct kvec rsp_iov; + int rc = 0; + int resp_buftype; + struct cifs_ses *ses = tcon->ses; + int flags = 0; + + if (!ses || !(ses->server)) + return -EIO; + + if (!num) + return -EINVAL; + + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + iov = kmalloc_array(num, sizeof(struct kvec), GFP_KERNEL); + if (!iov) + return -ENOMEM; + memset(&rqst, 0, sizeof(struct smb_rqst)); rqst.rq_iov = iov; rqst.rq_nvec = num; + rc = SMB2_set_info_init(tcon, &rqst, persistent_fid, volatile_fid, pid, + info_class, info_type, additional_info, + data, size); + if (rc) { + kfree(iov); + return rc; + } + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_buf_release(req); + SMB2_set_info_free(&rqst); rsp = (struct smb2_set_info_rsp *)rsp_iov.iov_base; if (rc != 0) { @@ -3837,88 +3869,8 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, } int -SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid, __le16 *target_file) -{ - struct smb2_file_rename_info info; - void **data; - unsigned int size[2]; - int rc; - int len = (2 * UniStrnlen((wchar_t *)target_file, PATH_MAX)); - - data = kmalloc_array(2, sizeof(void *), GFP_KERNEL); - if (!data) - return -ENOMEM; - - info.ReplaceIfExists = 1; /* 1 = replace existing target with new */ - /* 0 = fail if target already exists */ - info.RootDirectory = 0; /* MBZ for network ops (why does spec say?) */ - info.FileNameLength = cpu_to_le32(len); - - data[0] = &info; - size[0] = sizeof(struct smb2_file_rename_info); - - data[1] = target_file; - size[1] = len + 2 /* null */; - - rc = send_set_info(xid, tcon, persistent_fid, volatile_fid, - current->tgid, FILE_RENAME_INFORMATION, SMB2_O_INFO_FILE, - 0, 2, data, size); - kfree(data); - return rc; -} - -int -SMB2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid) -{ - __u8 delete_pending = 1; - void *data; - unsigned int size; - - data = &delete_pending; - size = 1; /* sizeof __u8 */ - - return send_set_info(xid, tcon, persistent_fid, volatile_fid, - current->tgid, FILE_DISPOSITION_INFORMATION, SMB2_O_INFO_FILE, - 0, 1, &data, &size); -} - -int -SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid, __le16 *target_file) -{ - struct smb2_file_link_info info; - void **data; - unsigned int size[2]; - int rc; - int len = (2 * UniStrnlen((wchar_t *)target_file, PATH_MAX)); - - data = kmalloc_array(2, sizeof(void *), GFP_KERNEL); - if (!data) - return -ENOMEM; - - info.ReplaceIfExists = 0; /* 1 = replace existing link with new */ - /* 0 = fail if link already exists */ - info.RootDirectory = 0; /* MBZ for network ops (why does spec say?) */ - info.FileNameLength = cpu_to_le32(len); - - data[0] = &info; - size[0] = sizeof(struct smb2_file_link_info); - - data[1] = target_file; - size[1] = len + 2 /* null */; - - rc = send_set_info(xid, tcon, persistent_fid, volatile_fid, - current->tgid, FILE_LINK_INFORMATION, SMB2_O_INFO_FILE, - 0, 2, data, size); - kfree(data); - return rc; -} - -int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, - u64 volatile_fid, u32 pid, __le64 *eof, bool is_falloc) + u64 volatile_fid, u32 pid, __le64 *eof) { struct smb2_file_eof_info info; void *data; @@ -3929,28 +3881,12 @@ SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, data = &info; size = sizeof(struct smb2_file_eof_info); - if (is_falloc) - return send_set_info(xid, tcon, persistent_fid, volatile_fid, - pid, FILE_ALLOCATION_INFORMATION, SMB2_O_INFO_FILE, - 0, 1, &data, &size); - else - return send_set_info(xid, tcon, persistent_fid, volatile_fid, + return send_set_info(xid, tcon, persistent_fid, volatile_fid, pid, FILE_END_OF_FILE_INFORMATION, SMB2_O_INFO_FILE, 0, 1, &data, &size); } int -SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid, FILE_BASIC_INFO *buf) -{ - unsigned int size; - size = sizeof(FILE_BASIC_INFO); - return send_set_info(xid, tcon, persistent_fid, volatile_fid, - current->tgid, FILE_BASIC_INFORMATION, SMB2_O_INFO_FILE, - 0, 1, (void **)&buf, &size); -} - -int SMB2_set_acl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, struct cifs_ntsd *pnntsd, int pacllen, int aclflag) @@ -4350,6 +4286,8 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, struct kvec iov[1]; struct kvec rsp_iov; int resp_buf_type; + __u64 *please_key_high; + __u64 *please_key_low; cifs_dbg(FYI, "SMB2_lease_break\n"); rc = smb2_plain_req_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req, @@ -4379,10 +4317,16 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); + please_key_low = (__u64 *)req->LeaseKey; + please_key_high = (__u64 *)(req->LeaseKey+8); if (rc) { cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE); + trace_smb3_lease_err(le32_to_cpu(lease_state), tcon->tid, + ses->Suid, *please_key_low, *please_key_high, rc); cifs_dbg(FYI, "Send error in Lease Break = %d\n", rc); - } + } else + trace_smb3_lease_done(le32_to_cpu(lease_state), tcon->tid, + ses->Suid, *please_key_low, *please_key_high); return rc; } diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 8fb7887f2b3d..f753f424d7f1 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -613,6 +613,8 @@ struct smb2_tree_disconnect_rsp { #define SVHDX_OPEN_DEVICE_CONTEX 0x9CCBCF9E04C1E643980E158DA1F6EC83 #define SMB2_CREATE_TAG_POSIX 0x93AD25509CB411E7B42383DE968BCD7C +/* Flag (SMB3 open response) values */ +#define SMB2_CREATE_FLAG_REPARSEPOINT 0x01 /* * Maximum number of iovs we need for an open/create request. @@ -650,7 +652,7 @@ struct smb2_create_rsp { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 89 */ __u8 OplockLevel; - __u8 Reserved; + __u8 Flag; /* 0x01 if reparse point */ __le32 CreateAction; __le64 CreationTime; __le64 LastAccessTime; @@ -1174,6 +1176,15 @@ struct smb2_query_info_rsp { __u8 Buffer[1]; } __packed; +/* + * Maximum number of iovs we need for a set-info request. + * The largest one is rename/hardlink + * [0] : struct smb2_set_info_req + smb2_file_[rename|link]_info + * [1] : path + * [2] : compound padding + */ +#define SMB2_SET_INFO_IOV_SIZE 3 + struct smb2_set_info_req { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 33 */ diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index b4076577eeb7..9f4e9ed9ce53 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -116,6 +116,9 @@ extern void smb2_reconnect_server(struct work_struct *work); extern int smb3_crypto_aead_allocate(struct TCP_Server_Info *server); extern unsigned long smb_rqst_len(struct TCP_Server_Info *server, struct smb_rqst *rqst); +extern void smb2_set_next_command(struct TCP_Server_Info *server, + struct smb_rqst *rqst); +extern void smb2_set_related(struct smb_rqst *rqst); /* * SMB2 Worker functions - most of protocol specific implementation details @@ -160,7 +163,8 @@ extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, extern int SMB2_query_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, u64 persistent_fid, u64 volatile_fid, u8 info_class, u8 info_type, - u32 additional_info, size_t output_len); + u32 additional_info, size_t output_len, + size_t input_len, void *input); extern void SMB2_query_info_free(struct smb_rqst *rqst); extern int SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id, @@ -179,20 +183,14 @@ extern int SMB2_echo(struct TCP_Server_Info *server); extern int SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, int index, struct cifs_search_info *srch_inf); -extern int SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid, - __le16 *target_file); -extern int SMB2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid); -extern int SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid, - __le16 *target_file); extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 pid, - __le64 *eof, bool is_fallocate); -extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid, - FILE_BASIC_INFO *buf); + __le64 *eof); +extern int SMB2_set_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, + u64 persistent_fid, u64 volatile_fid, u32 pid, + u8 info_class, u8 info_type, u32 additional_info, + void **data, unsigned int *size); +extern void SMB2_set_info_free(struct smb_rqst *rqst); extern int SMB2_set_acl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, struct cifs_ntsd *pnntsd, int pacllen, int aclflag); @@ -232,6 +230,10 @@ extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *, extern int smb3_encryption_required(const struct cifs_tcon *tcon); extern int smb2_validate_iov(unsigned int offset, unsigned int buffer_length, struct kvec *iov, unsigned int min_buf_size); +extern int smb2_validate_and_copy_iov(unsigned int offset, + unsigned int buffer_length, + struct kvec *iov, + unsigned int minbufsize, char *data); extern void smb2_copy_fs_info_to_kstatfs( struct smb2_fs_full_size_info *pfs_inf, struct kstatfs *kst); diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 5fdb9a509a97..e94a8d1d08a3 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -2054,14 +2054,22 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) info->smbd_recv_pending++; - switch (msg->msg_iter.type) { - case READ | ITER_KVEC: + if (iov_iter_rw(&msg->msg_iter) == WRITE) { + /* It's a bug in upper layer to get there */ + cifs_dbg(VFS, "CIFS: invalid msg iter dir %u\n", + iov_iter_rw(&msg->msg_iter)); + rc = -EINVAL; + goto out; + } + + switch (iov_iter_type(&msg->msg_iter)) { + case ITER_KVEC: buf = msg->msg_iter.kvec->iov_base; to_read = msg->msg_iter.kvec->iov_len; rc = smbd_recv_buf(info, buf, to_read); break; - case READ | ITER_BVEC: + case ITER_BVEC: page = msg->msg_iter.bvec->bv_page; page_offset = msg->msg_iter.bvec->bv_offset; to_read = msg->msg_iter.bvec->bv_len; @@ -2071,10 +2079,11 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) default: /* It's a bug in upper layer to get there */ cifs_dbg(VFS, "CIFS: invalid msg type %d\n", - msg->msg_iter.type); + iov_iter_type(&msg->msg_iter)); rc = -EINVAL; } +out: info->smbd_recv_pending--; wake_up(&info->wait_smbd_recv_pending); @@ -2295,8 +2304,12 @@ static void smbd_mr_recovery_work(struct work_struct *work) int rc; list_for_each_entry(smbdirect_mr, &info->mr_list, list) { - if (smbdirect_mr->state == MR_INVALIDATED || - smbdirect_mr->state == MR_ERROR) { + if (smbdirect_mr->state == MR_INVALIDATED) + ib_dma_unmap_sg( + info->id->device, smbdirect_mr->sgl, + smbdirect_mr->sgl_count, + smbdirect_mr->dir); + else if (smbdirect_mr->state == MR_ERROR) { /* recover this MR entry */ rc = ib_dereg_mr(smbdirect_mr->mr); @@ -2320,25 +2333,21 @@ static void smbd_mr_recovery_work(struct work_struct *work) smbd_disconnect_rdma_connection(info); continue; } + } else + /* This MR is being used, don't recover it */ + continue; - if (smbdirect_mr->state == MR_INVALIDATED) - ib_dma_unmap_sg( - info->id->device, smbdirect_mr->sgl, - smbdirect_mr->sgl_count, - smbdirect_mr->dir); - - smbdirect_mr->state = MR_READY; + smbdirect_mr->state = MR_READY; - /* smbdirect_mr->state is updated by this function - * and is read and updated by I/O issuing CPUs trying - * to get a MR, the call to atomic_inc_return - * implicates a memory barrier and guarantees this - * value is updated before waking up any calls to - * get_mr() from the I/O issuing CPUs - */ - if (atomic_inc_return(&info->mr_ready_count) == 1) - wake_up_interruptible(&info->wait_mr); - } + /* smbdirect_mr->state is updated by this function + * and is read and updated by I/O issuing CPUs trying + * to get a MR, the call to atomic_inc_return + * implicates a memory barrier and guarantees this + * value is updated before waking up any calls to + * get_mr() from the I/O issuing CPUs + */ + if (atomic_inc_return(&info->mr_ready_count) == 1) + wake_up_interruptible(&info->wait_mr); } } diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index d4aed5217a56..cce8414fe7ec 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -460,6 +460,85 @@ DEFINE_EVENT(smb3_open_done_class, smb3_##name, \ DEFINE_SMB3_OPEN_DONE_EVENT(open_done); DEFINE_SMB3_OPEN_DONE_EVENT(posix_mkdir_done); + +DECLARE_EVENT_CLASS(smb3_lease_done_class, + TP_PROTO(__u32 lease_state, + __u32 tid, + __u64 sesid, + __u64 lease_key_low, + __u64 lease_key_high), + TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high), + TP_STRUCT__entry( + __field(__u32, lease_state) + __field(__u32, tid) + __field(__u64, sesid) + __field(__u64, lease_key_low) + __field(__u64, lease_key_high) + ), + TP_fast_assign( + __entry->lease_state = lease_state; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->lease_key_low = lease_key_low; + __entry->lease_key_high = lease_key_high; + ), + TP_printk("sid=0x%llx tid=0x%x lease_key=0x%llx%llx lease_state=0x%x", + __entry->sesid, __entry->tid, __entry->lease_key_high, + __entry->lease_key_low, __entry->lease_state) +) + +#define DEFINE_SMB3_LEASE_DONE_EVENT(name) \ +DEFINE_EVENT(smb3_lease_done_class, smb3_##name, \ + TP_PROTO(__u32 lease_state, \ + __u32 tid, \ + __u64 sesid, \ + __u64 lease_key_low, \ + __u64 lease_key_high), \ + TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high)) + +DEFINE_SMB3_LEASE_DONE_EVENT(lease_done); + +DECLARE_EVENT_CLASS(smb3_lease_err_class, + TP_PROTO(__u32 lease_state, + __u32 tid, + __u64 sesid, + __u64 lease_key_low, + __u64 lease_key_high, + int rc), + TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high, rc), + TP_STRUCT__entry( + __field(__u32, lease_state) + __field(__u32, tid) + __field(__u64, sesid) + __field(__u64, lease_key_low) + __field(__u64, lease_key_high) + __field(int, rc) + ), + TP_fast_assign( + __entry->lease_state = lease_state; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->lease_key_low = lease_key_low; + __entry->lease_key_high = lease_key_high; + __entry->rc = rc; + ), + TP_printk("sid=0x%llx tid=0x%x lease_key=0x%llx%llx lease_state=0x%x rc=%d", + __entry->sesid, __entry->tid, __entry->lease_key_high, + __entry->lease_key_low, __entry->lease_state, __entry->rc) +) + +#define DEFINE_SMB3_LEASE_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_lease_err_class, smb3_##name, \ + TP_PROTO(__u32 lease_state, \ + __u32 tid, \ + __u64 sesid, \ + __u64 lease_key_low, \ + __u64 lease_key_high, \ + int rc), \ + TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high, rc)) + +DEFINE_SMB3_LEASE_ERR_EVENT(lease_err); + DECLARE_EVENT_CLASS(smb3_reconnect_class, TP_PROTO(__u64 currmid, char *hostname), @@ -486,6 +565,36 @@ DEFINE_EVENT(smb3_reconnect_class, smb3_##name, \ DEFINE_SMB3_RECONNECT_EVENT(reconnect); DEFINE_SMB3_RECONNECT_EVENT(partial_send_reconnect); +DECLARE_EVENT_CLASS(smb3_credit_class, + TP_PROTO(__u64 currmid, + char *hostname, + int credits), + TP_ARGS(currmid, hostname, credits), + TP_STRUCT__entry( + __field(__u64, currmid) + __field(char *, hostname) + __field(int, credits) + ), + TP_fast_assign( + __entry->currmid = currmid; + __entry->hostname = hostname; + __entry->credits = credits; + ), + TP_printk("server=%s current_mid=0x%llx credits=%d", + __entry->hostname, + __entry->currmid, + __entry->credits) +) + +#define DEFINE_SMB3_CREDIT_EVENT(name) \ +DEFINE_EVENT(smb3_credit_class, smb3_##name, \ + TP_PROTO(__u64 currmid, \ + char *hostname, \ + int credits), \ + TP_ARGS(currmid, hostname, credits)) + +DEFINE_SMB3_CREDIT_EVENT(reconnect_with_invalid_credits); + #endif /* _CIFS_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index b48f43963da6..83ff0c25710d 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -113,9 +113,18 @@ DeleteMidQEntry(struct mid_q_entry *midEntry) cifs_small_buf_release(midEntry->resp_buf); #ifdef CONFIG_CIFS_STATS2 now = jiffies; - /* commands taking longer than one second are indications that - something is wrong, unless it is quite a slow link or server */ - if (time_after(now, midEntry->when_alloc + HZ) && + /* + * commands taking longer than one second (default) can be indications + * that something is wrong, unless it is quite a slow link or a very + * busy server. Note that this calc is unlikely or impossible to wrap + * as long as slow_rsp_threshold is not set way above recommended max + * value (32767 ie 9 hours) and is generally harmless even if wrong + * since only affects debug counters - so leaving the calc as simple + * comparison rather than doing multiple conversions and overflow + * checks + */ + if ((slow_rsp_threshold != 0) && + time_after(now, midEntry->when_alloc + (slow_rsp_threshold * HZ)) && (midEntry->command != command)) { /* smb2slowcmd[NUMBER_OF_SMB2_COMMANDS] counts by command */ if ((le16_to_cpu(midEntry->command) < NUMBER_OF_SMB2_COMMANDS) && @@ -128,7 +137,7 @@ DeleteMidQEntry(struct mid_q_entry *midEntry) if (cifsFYI & CIFS_TIMER) { pr_debug(" CIFS slow rsp: cmd %d mid %llu", midEntry->command, midEntry->mid); - pr_info(" A: 0x%lx S: 0x%lx R: 0x%lx\n", + cifs_info(" A: 0x%lx S: 0x%lx R: 0x%lx\n", now - midEntry->when_alloc, now - midEntry->when_sent, now - midEntry->when_received); @@ -307,8 +316,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, .iov_base = &rfc1002_marker, .iov_len = 4 }; - iov_iter_kvec(&smb_msg.msg_iter, WRITE | ITER_KVEC, &hiov, - 1, 4); + iov_iter_kvec(&smb_msg.msg_iter, WRITE, &hiov, 1, 4); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) goto uncork; @@ -329,8 +337,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, size += iov[i].iov_len; } - iov_iter_kvec(&smb_msg.msg_iter, WRITE | ITER_KVEC, - iov, n_vec, size); + iov_iter_kvec(&smb_msg.msg_iter, WRITE, iov, n_vec, size); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) @@ -346,7 +353,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, rqst_page_get_length(&rqst[j], i, &bvec.bv_len, &bvec.bv_offset); - iov_iter_bvec(&smb_msg.msg_iter, WRITE | ITER_BVEC, + iov_iter_bvec(&smb_msg.msg_iter, WRITE, &bvec, 1, bvec.bv_len); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) @@ -786,7 +793,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, int i, j, rc = 0; int timeout, optype; struct mid_q_entry *midQ[MAX_COMPOUND]; - unsigned int credits = 1; + unsigned int credits = 0; char *buf; timeout = flags & CIFS_TIMEOUT_MASK; @@ -851,21 +858,24 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, mutex_unlock(&ses->server->srv_mutex); - for (i = 0; i < num_rqst; i++) { - if (rc < 0) - goto out; + if (rc < 0) + goto out; - if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) - smb311_update_preauth_hash(ses, rqst[i].rq_iov, - rqst[i].rq_nvec); + /* + * Compounding is never used during session establish. + */ + if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) + smb311_update_preauth_hash(ses, rqst[0].rq_iov, + rqst[0].rq_nvec); - if (timeout == CIFS_ASYNC_OP) - goto out; + if (timeout == CIFS_ASYNC_OP) + goto out; + for (i = 0; i < num_rqst; i++) { rc = wait_for_response(ses->server, midQ[i]); if (rc != 0) { - cifs_dbg(FYI, "Cancelling wait for mid %llu\n", - midQ[i]->mid); + cifs_dbg(VFS, "Cancelling wait for mid %llu cmd: %d\n", + midQ[i]->mid, le16_to_cpu(midQ[i]->command)); send_cancel(ses->server, &rqst[i], midQ[i]); spin_lock(&GlobalMid_Lock); if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) { @@ -877,10 +887,21 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, } spin_unlock(&GlobalMid_Lock); } + } + + for (i = 0; i < num_rqst; i++) + if (midQ[i]->resp_buf) + credits += ses->server->ops->get_credits(midQ[i]); + if (!credits) + credits = 1; + + for (i = 0; i < num_rqst; i++) { + if (rc < 0) + goto out; rc = cifs_sync_mid_result(midQ[i], ses->server); if (rc != 0) { - add_credits(ses->server, 1, optype); + add_credits(ses->server, credits, optype); return rc; } @@ -901,23 +922,26 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, else resp_buf_type[i] = CIFS_SMALL_BUFFER; - if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) { - struct kvec iov = { - .iov_base = resp_iov[i].iov_base, - .iov_len = resp_iov[i].iov_len - }; - smb311_update_preauth_hash(ses, &iov, 1); - } - - credits = ses->server->ops->get_credits(midQ[i]); - rc = ses->server->ops->check_receive(midQ[i], ses->server, flags & CIFS_LOG_ERROR); /* mark it so buf will not be freed by cifs_delete_mid */ if ((flags & CIFS_NO_RESP) == 0) midQ[i]->resp_buf = NULL; + + } + + /* + * Compounding is never used during session establish. + */ + if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) { + struct kvec iov = { + .iov_base = resp_iov[0].iov_base, + .iov_len = resp_iov[0].iov_len + }; + smb311_update_preauth_hash(ses, &iov, 1); } + out: /* * This will dequeue all mids. After this it is important that the diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 504b3c3539dc..15f6e96b3bd9 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -52,7 +52,7 @@ #define elf_prpsinfo compat_elf_prpsinfo #undef ns_to_timeval -#define ns_to_timeval ns_to_compat_timeval +#define ns_to_timeval ns_to_old_timeval32 /* * To use this file, asm/elf.h must define compat_elf_check_arch. diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 0c445a03e682..6e30949d9f77 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -22,37 +22,21 @@ #include <linux/smp.h> #include <linux/ioctl.h> #include <linux/if.h> -#include <linux/if_bridge.h> #include <linux/raid/md_u.h> -#include <linux/kd.h> -#include <linux/route.h> -#include <linux/in6.h> -#include <linux/ipv6_route.h> -#include <linux/skbuff.h> -#include <linux/netlink.h> -#include <linux/vt.h> #include <linux/falloc.h> -#include <linux/fs.h> #include <linux/file.h> -#include <linux/ppp_defs.h> #include <linux/ppp-ioctl.h> #include <linux/if_pppox.h> #include <linux/mtio.h> #include <linux/tty.h> #include <linux/vt_kern.h> -#include <linux/fb.h> -#include <linux/videodev2.h> -#include <linux/netdevice.h> #include <linux/raw.h> #include <linux/blkdev.h> -#include <linux/elevator.h> #include <linux/rtc.h> #include <linux/pci.h> #include <linux/serial.h> -#include <linux/if_tun.h> #include <linux/ctype.h> #include <linux/syscalls.h> -#include <linux/atalk.h> #include <linux/gfp.h> #include <linux/cec.h> @@ -74,44 +58,17 @@ #endif #include <linux/uaccess.h> -#include <linux/ethtool.h> -#include <linux/mii.h> -#include <linux/if_bonding.h> #include <linux/watchdog.h> #include <linux/soundcard.h> -#include <linux/lp.h> -#include <linux/ppdev.h> - -#include <linux/atm.h> -#include <linux/atmarp.h> -#include <linux/atmclip.h> -#include <linux/atmdev.h> -#include <linux/atmioc.h> -#include <linux/atmlec.h> -#include <linux/atmmpc.h> -#include <linux/atmsvc.h> -#include <linux/atm_tcp.h> -#include <linux/sonet.h> -#include <linux/atm_suni.h> - -#include <linux/usb.h> -#include <linux/usbdevice_fs.h> -#include <linux/nbd.h> -#include <linux/random.h> -#include <linux/filter.h> #include <linux/hiddev.h> -#define __DVB_CORE__ -#include <linux/dvb/audio.h> -#include <linux/dvb/dmx.h> -#include <linux/dvb/frontend.h> -#include <linux/dvb/video.h> #include <linux/sort.h> #ifdef CONFIG_SPARC +#include <linux/fb.h> #include <asm/fbio.h> #endif @@ -133,71 +90,6 @@ static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return vfs_ioctl(file, cmd, arg); } -struct compat_video_event { - int32_t type; - compat_time_t timestamp; - union { - video_size_t size; - unsigned int frame_rate; - } u; -}; - -static int do_video_get_event(struct file *file, - unsigned int cmd, struct compat_video_event __user *up) -{ - struct video_event __user *kevent = - compat_alloc_user_space(sizeof(*kevent)); - int err; - - if (kevent == NULL) - return -EFAULT; - - err = do_ioctl(file, cmd, (unsigned long)kevent); - if (!err) { - err = convert_in_user(&kevent->type, &up->type); - err |= convert_in_user(&kevent->timestamp, &up->timestamp); - err |= convert_in_user(&kevent->u.size.w, &up->u.size.w); - err |= convert_in_user(&kevent->u.size.h, &up->u.size.h); - err |= convert_in_user(&kevent->u.size.aspect_ratio, - &up->u.size.aspect_ratio); - if (err) - err = -EFAULT; - } - - return err; -} - -struct compat_video_still_picture { - compat_uptr_t iFrame; - int32_t size; -}; - -static int do_video_stillpicture(struct file *file, - unsigned int cmd, struct compat_video_still_picture __user *up) -{ - struct video_still_picture __user *up_native; - compat_uptr_t fp; - int32_t size; - int err; - - err = get_user(fp, &up->iFrame); - err |= get_user(size, &up->size); - if (err) - return -EFAULT; - - up_native = - compat_alloc_user_space(sizeof(struct video_still_picture)); - - err = put_user(compat_ptr(fp), &up_native->iFrame); - err |= put_user(size, &up_native->size); - if (err) - return -EFAULT; - - err = do_ioctl(file, cmd, (unsigned long) up_native); - - return err; -} - #ifdef CONFIG_BLOCK typedef struct sg_io_hdr32 { compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */ @@ -544,22 +436,6 @@ static int mt_ioctl_trans(struct file *file, #define HCIUARTSETFLAGS _IOW('U', 203, int) #define HCIUARTGETFLAGS _IOR('U', 204, int) -#define BNEPCONNADD _IOW('B', 200, int) -#define BNEPCONNDEL _IOW('B', 201, int) -#define BNEPGETCONNLIST _IOR('B', 210, int) -#define BNEPGETCONNINFO _IOR('B', 211, int) -#define BNEPGETSUPPFEAT _IOR('B', 212, int) - -#define CMTPCONNADD _IOW('C', 200, int) -#define CMTPCONNDEL _IOW('C', 201, int) -#define CMTPGETCONNLIST _IOR('C', 210, int) -#define CMTPGETCONNINFO _IOR('C', 211, int) - -#define HIDPCONNADD _IOW('H', 200, int) -#define HIDPCONNDEL _IOW('H', 201, int) -#define HIDPGETCONNLIST _IOR('H', 210, int) -#define HIDPGETCONNINFO _IOR('H', 211, int) - #define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t) #define RTC_IRQP_SET32 _IOW('p', 0x0c, compat_ulong_t) #define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) @@ -974,19 +850,6 @@ COMPATIBLE_IOCTL(RFCOMMRELEASEDEV) COMPATIBLE_IOCTL(RFCOMMGETDEVLIST) COMPATIBLE_IOCTL(RFCOMMGETDEVINFO) COMPATIBLE_IOCTL(RFCOMMSTEALDLC) -COMPATIBLE_IOCTL(BNEPCONNADD) -COMPATIBLE_IOCTL(BNEPCONNDEL) -COMPATIBLE_IOCTL(BNEPGETCONNLIST) -COMPATIBLE_IOCTL(BNEPGETCONNINFO) -COMPATIBLE_IOCTL(BNEPGETSUPPFEAT) -COMPATIBLE_IOCTL(CMTPCONNADD) -COMPATIBLE_IOCTL(CMTPCONNDEL) -COMPATIBLE_IOCTL(CMTPGETCONNLIST) -COMPATIBLE_IOCTL(CMTPGETCONNINFO) -COMPATIBLE_IOCTL(HIDPCONNADD) -COMPATIBLE_IOCTL(HIDPCONNDEL) -COMPATIBLE_IOCTL(HIDPGETCONNLIST) -COMPATIBLE_IOCTL(HIDPGETCONNINFO) /* CAPI */ COMPATIBLE_IOCTL(CAPI_REGISTER) COMPATIBLE_IOCTL(CAPI_GET_MANUFACTURER) @@ -1025,61 +888,6 @@ COMPATIBLE_IOCTL(HIDIOCGFLAG) COMPATIBLE_IOCTL(HIDIOCSFLAG) COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINDEX) COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINFO) -/* dvb */ -COMPATIBLE_IOCTL(AUDIO_STOP) -COMPATIBLE_IOCTL(AUDIO_PLAY) -COMPATIBLE_IOCTL(AUDIO_PAUSE) -COMPATIBLE_IOCTL(AUDIO_CONTINUE) -COMPATIBLE_IOCTL(AUDIO_SELECT_SOURCE) -COMPATIBLE_IOCTL(AUDIO_SET_MUTE) -COMPATIBLE_IOCTL(AUDIO_SET_AV_SYNC) -COMPATIBLE_IOCTL(AUDIO_SET_BYPASS_MODE) -COMPATIBLE_IOCTL(AUDIO_CHANNEL_SELECT) -COMPATIBLE_IOCTL(AUDIO_GET_STATUS) -COMPATIBLE_IOCTL(AUDIO_GET_CAPABILITIES) -COMPATIBLE_IOCTL(AUDIO_CLEAR_BUFFER) -COMPATIBLE_IOCTL(AUDIO_SET_ID) -COMPATIBLE_IOCTL(AUDIO_SET_MIXER) -COMPATIBLE_IOCTL(AUDIO_SET_STREAMTYPE) -COMPATIBLE_IOCTL(DMX_START) -COMPATIBLE_IOCTL(DMX_STOP) -COMPATIBLE_IOCTL(DMX_SET_FILTER) -COMPATIBLE_IOCTL(DMX_SET_PES_FILTER) -COMPATIBLE_IOCTL(DMX_SET_BUFFER_SIZE) -COMPATIBLE_IOCTL(DMX_GET_PES_PIDS) -COMPATIBLE_IOCTL(DMX_GET_STC) -COMPATIBLE_IOCTL(DMX_REQBUFS) -COMPATIBLE_IOCTL(DMX_QUERYBUF) -COMPATIBLE_IOCTL(DMX_EXPBUF) -COMPATIBLE_IOCTL(DMX_QBUF) -COMPATIBLE_IOCTL(DMX_DQBUF) -COMPATIBLE_IOCTL(VIDEO_STOP) -COMPATIBLE_IOCTL(VIDEO_PLAY) -COMPATIBLE_IOCTL(VIDEO_FREEZE) -COMPATIBLE_IOCTL(VIDEO_CONTINUE) -COMPATIBLE_IOCTL(VIDEO_SELECT_SOURCE) -COMPATIBLE_IOCTL(VIDEO_SET_BLANK) -COMPATIBLE_IOCTL(VIDEO_GET_STATUS) -COMPATIBLE_IOCTL(VIDEO_SET_DISPLAY_FORMAT) -COMPATIBLE_IOCTL(VIDEO_FAST_FORWARD) -COMPATIBLE_IOCTL(VIDEO_SLOWMOTION) -COMPATIBLE_IOCTL(VIDEO_GET_CAPABILITIES) -COMPATIBLE_IOCTL(VIDEO_CLEAR_BUFFER) -COMPATIBLE_IOCTL(VIDEO_SET_STREAMTYPE) -COMPATIBLE_IOCTL(VIDEO_SET_FORMAT) -COMPATIBLE_IOCTL(VIDEO_GET_SIZE) -/* cec */ -COMPATIBLE_IOCTL(CEC_ADAP_G_CAPS) -COMPATIBLE_IOCTL(CEC_ADAP_G_LOG_ADDRS) -COMPATIBLE_IOCTL(CEC_ADAP_S_LOG_ADDRS) -COMPATIBLE_IOCTL(CEC_ADAP_G_PHYS_ADDR) -COMPATIBLE_IOCTL(CEC_ADAP_S_PHYS_ADDR) -COMPATIBLE_IOCTL(CEC_G_MODE) -COMPATIBLE_IOCTL(CEC_S_MODE) -COMPATIBLE_IOCTL(CEC_TRANSMIT) -COMPATIBLE_IOCTL(CEC_RECEIVE) -COMPATIBLE_IOCTL(CEC_DQEVENT) - /* joystick */ COMPATIBLE_IOCTL(JSIOCGVERSION) COMPATIBLE_IOCTL(JSIOCGAXES) @@ -1147,12 +955,6 @@ static long do_ioctl_trans(unsigned int cmd, case RTC_EPOCH_READ32: case RTC_EPOCH_SET32: return rtc_ioctl(file, cmd, argp); - - /* dvb */ - case VIDEO_GET_EVENT: - return do_video_get_event(file, cmd, argp); - case VIDEO_STILLPICTURE: - return do_video_stillpicture(file, cmd, argp); } /* diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index f408994fc632..9352487bd0fc 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -202,7 +202,8 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset, continue; blk_offset = (blocknr - buffer_blocknr[i]) << PAGE_SHIFT; blk_offset += offset; - if (blk_offset + len > BUFFER_SIZE) + if (blk_offset > BUFFER_SIZE || + blk_offset + len > BUFFER_SIZE) continue; return read_buffers[i] + blk_offset; } @@ -418,9 +419,12 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) int i; vma->vm_flags |= VM_MIXEDMAP; for (i = 0; i < pages && !ret; i++) { + vm_fault_t vmf; unsigned long off = i * PAGE_SIZE; pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV); - ret = vm_insert_mixed(vma, vma->vm_start + off, pfn); + vmf = vmf_insert_mixed(vma, vma->vm_start + off, pfn); + if (vmf & VM_FAULT_ERROR) + ret = vm_fault_to_errno(vmf, 0); } } @@ -869,8 +873,8 @@ static int cramfs_readpage(struct file *file, struct page *page) if (unlikely(block_start & CRAMFS_BLK_FLAG_DIRECT_PTR)) { /* See comments on earlier code. */ u32 prev_start = block_start; - block_start = prev_start & ~CRAMFS_BLK_FLAGS; - block_start <<= CRAMFS_BLK_DIRECT_PTR_SHIFT; + block_start = prev_start & ~CRAMFS_BLK_FLAGS; + block_start <<= CRAMFS_BLK_DIRECT_PTR_SHIFT; if (prev_start & CRAMFS_BLK_FLAG_UNCOMPRESSED) { block_start += PAGE_SIZE; } else { diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 39c20ef26db4..79debfc9cef9 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -83,10 +83,6 @@ static inline bool fscrypt_valid_enc_modes(u32 contents_mode, filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS) return true; - if (contents_mode == FS_ENCRYPTION_MODE_SPECK128_256_XTS && - filenames_mode == FS_ENCRYPTION_MODE_SPECK128_256_CTS) - return true; - return false; } diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index e997ca51192f..7874c9bb2fc5 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -174,16 +174,6 @@ static struct fscrypt_mode { .cipher_str = "cts(cbc(aes))", .keysize = 16, }, - [FS_ENCRYPTION_MODE_SPECK128_256_XTS] = { - .friendly_name = "Speck128/256-XTS", - .cipher_str = "xts(speck128)", - .keysize = 64, - }, - [FS_ENCRYPTION_MODE_SPECK128_256_CTS] = { - .friendly_name = "Speck128/256-CTS-CBC", - .cipher_str = "cts(cbc(speck128))", - .keysize = 32, - }, }; static struct fscrypt_mode * @@ -38,6 +38,17 @@ #define CREATE_TRACE_POINTS #include <trace/events/fs_dax.h> +static inline unsigned int pe_order(enum page_entry_size pe_size) +{ + if (pe_size == PE_SIZE_PTE) + return PAGE_SHIFT - PAGE_SHIFT; + if (pe_size == PE_SIZE_PMD) + return PMD_SHIFT - PAGE_SHIFT; + if (pe_size == PE_SIZE_PUD) + return PUD_SHIFT - PAGE_SHIFT; + return ~0; +} + /* We choose 4096 entries - same as per-zone page wait tables */ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) @@ -46,6 +57,9 @@ #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) #define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT) +/* The order of a PMD entry */ +#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) + static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; static int __init init_dax_wait_table(void) @@ -59,63 +73,74 @@ static int __init init_dax_wait_table(void) fs_initcall(init_dax_wait_table); /* - * We use lowest available bit in exceptional entry for locking, one bit for - * the entry size (PMD) and two more to tell us if the entry is a zero page or - * an empty entry that is just used for locking. In total four special bits. + * DAX pagecache entries use XArray value entries so they can't be mistaken + * for pages. We use one bit for locking, one bit for the entry size (PMD) + * and two more to tell us if the entry is a zero page or an empty entry that + * is just used for locking. In total four special bits. * * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem * block allocation. */ -#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4) -#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT) -#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) -#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) -#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) +#define DAX_SHIFT (4) +#define DAX_LOCKED (1UL << 0) +#define DAX_PMD (1UL << 1) +#define DAX_ZERO_PAGE (1UL << 2) +#define DAX_EMPTY (1UL << 3) -static unsigned long dax_radix_pfn(void *entry) +static unsigned long dax_to_pfn(void *entry) { - return (unsigned long)entry >> RADIX_DAX_SHIFT; + return xa_to_value(entry) >> DAX_SHIFT; } -static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags) +static void *dax_make_entry(pfn_t pfn, unsigned long flags) { - return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | - (pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK); + return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT)); } -static unsigned int dax_radix_order(void *entry) +static void *dax_make_page_entry(struct page *page) { - if ((unsigned long)entry & RADIX_DAX_PMD) - return PMD_SHIFT - PAGE_SHIFT; + pfn_t pfn = page_to_pfn_t(page); + return dax_make_entry(pfn, PageHead(page) ? DAX_PMD : 0); +} + +static bool dax_is_locked(void *entry) +{ + return xa_to_value(entry) & DAX_LOCKED; +} + +static unsigned int dax_entry_order(void *entry) +{ + if (xa_to_value(entry) & DAX_PMD) + return PMD_ORDER; return 0; } static int dax_is_pmd_entry(void *entry) { - return (unsigned long)entry & RADIX_DAX_PMD; + return xa_to_value(entry) & DAX_PMD; } static int dax_is_pte_entry(void *entry) { - return !((unsigned long)entry & RADIX_DAX_PMD); + return !(xa_to_value(entry) & DAX_PMD); } static int dax_is_zero_entry(void *entry) { - return (unsigned long)entry & RADIX_DAX_ZERO_PAGE; + return xa_to_value(entry) & DAX_ZERO_PAGE; } static int dax_is_empty_entry(void *entry) { - return (unsigned long)entry & RADIX_DAX_EMPTY; + return xa_to_value(entry) & DAX_EMPTY; } /* - * DAX radix tree locking + * DAX page cache entry locking */ struct exceptional_entry_key { - struct address_space *mapping; + struct xarray *xa; pgoff_t entry_start; }; @@ -124,10 +149,11 @@ struct wait_exceptional_entry_queue { struct exceptional_entry_key key; }; -static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, - pgoff_t index, void *entry, struct exceptional_entry_key *key) +static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas, + void *entry, struct exceptional_entry_key *key) { unsigned long hash; + unsigned long index = xas->xa_index; /* * If 'entry' is a PMD, align the 'index' that we use for the wait @@ -136,22 +162,21 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, */ if (dax_is_pmd_entry(entry)) index &= ~PG_PMD_COLOUR; - - key->mapping = mapping; + key->xa = xas->xa; key->entry_start = index; - hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS); + hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS); return wait_table + hash; } -static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode, - int sync, void *keyp) +static int wake_exceptional_entry_func(wait_queue_entry_t *wait, + unsigned int mode, int sync, void *keyp) { struct exceptional_entry_key *key = keyp; struct wait_exceptional_entry_queue *ewait = container_of(wait, struct wait_exceptional_entry_queue, wait); - if (key->mapping != ewait->key.mapping || + if (key->xa != ewait->key.xa || key->entry_start != ewait->key.entry_start) return 0; return autoremove_wake_function(wait, mode, sync, NULL); @@ -162,13 +187,12 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo * The important information it's conveying is whether the entry at * this index used to be a PMD entry. */ -static void dax_wake_mapping_entry_waiter(struct address_space *mapping, - pgoff_t index, void *entry, bool wake_all) +static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all) { struct exceptional_entry_key key; wait_queue_head_t *wq; - wq = dax_entry_waitqueue(mapping, index, entry, &key); + wq = dax_entry_waitqueue(xas, entry, &key); /* * Checking for locked entry and prepare_to_wait_exclusive() happens @@ -181,55 +205,16 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping, } /* - * Check whether the given slot is locked. Must be called with the i_pages - * lock held. - */ -static inline int slot_locked(struct address_space *mapping, void **slot) -{ - unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); - return entry & RADIX_DAX_ENTRY_LOCK; -} - -/* - * Mark the given slot as locked. Must be called with the i_pages lock held. - */ -static inline void *lock_slot(struct address_space *mapping, void **slot) -{ - unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); - - entry |= RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); - return (void *)entry; -} - -/* - * Mark the given slot as unlocked. Must be called with the i_pages lock held. - */ -static inline void *unlock_slot(struct address_space *mapping, void **slot) -{ - unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); - - entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); - return (void *)entry; -} - -/* - * Lookup entry in radix tree, wait for it to become unlocked if it is - * exceptional entry and return it. The caller must call - * put_unlocked_mapping_entry() when he decided not to lock the entry or - * put_locked_mapping_entry() when he locked the entry and now wants to - * unlock it. + * Look up entry in page cache, wait for it to become unlocked if it + * is a DAX entry and return it. The caller must subsequently call + * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry() + * if it did. * * Must be called with the i_pages lock held. */ -static void *__get_unlocked_mapping_entry(struct address_space *mapping, - pgoff_t index, void ***slotp, bool (*wait_fn)(void)) +static void *get_unlocked_entry(struct xa_state *xas) { - void *entry, **slot; + void *entry; struct wait_exceptional_entry_queue ewait; wait_queue_head_t *wq; @@ -237,80 +222,54 @@ static void *__get_unlocked_mapping_entry(struct address_space *mapping, ewait.wait.func = wake_exceptional_entry_func; for (;;) { - bool revalidate; - - entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, - &slot); - if (!entry || - WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) || - !slot_locked(mapping, slot)) { - if (slotp) - *slotp = slot; + entry = xas_load(xas); + if (!entry || xa_is_internal(entry) || + WARN_ON_ONCE(!xa_is_value(entry)) || + !dax_is_locked(entry)) return entry; - } - wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); + wq = dax_entry_waitqueue(xas, entry, &ewait.key); prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); - xa_unlock_irq(&mapping->i_pages); - revalidate = wait_fn(); + xas_unlock_irq(xas); + xas_reset(xas); + schedule(); finish_wait(wq, &ewait.wait); - xa_lock_irq(&mapping->i_pages); - if (revalidate) - return ERR_PTR(-EAGAIN); + xas_lock_irq(xas); } } -static bool entry_wait(void) -{ - schedule(); - /* - * Never return an ERR_PTR() from - * __get_unlocked_mapping_entry(), just keep looping. - */ - return false; -} - -static void *get_unlocked_mapping_entry(struct address_space *mapping, - pgoff_t index, void ***slotp) +static void put_unlocked_entry(struct xa_state *xas, void *entry) { - return __get_unlocked_mapping_entry(mapping, index, slotp, entry_wait); -} - -static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index) -{ - void *entry, **slot; - - xa_lock_irq(&mapping->i_pages); - entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); - if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || - !slot_locked(mapping, slot))) { - xa_unlock_irq(&mapping->i_pages); - return; - } - unlock_slot(mapping, slot); - xa_unlock_irq(&mapping->i_pages); - dax_wake_mapping_entry_waiter(mapping, index, entry, false); + /* If we were the only waiter woken, wake the next one */ + if (entry) + dax_wake_entry(xas, entry, false); } -static void put_locked_mapping_entry(struct address_space *mapping, - pgoff_t index) +/* + * We used the xa_state to get the entry, but then we locked the entry and + * dropped the xa_lock, so we know the xa_state is stale and must be reset + * before use. + */ +static void dax_unlock_entry(struct xa_state *xas, void *entry) { - unlock_mapping_entry(mapping, index); + void *old; + + xas_reset(xas); + xas_lock_irq(xas); + old = xas_store(xas, entry); + xas_unlock_irq(xas); + BUG_ON(!dax_is_locked(old)); + dax_wake_entry(xas, entry, false); } /* - * Called when we are done with radix tree entry we looked up via - * get_unlocked_mapping_entry() and which we didn't lock in the end. + * Return: The entry stored at this location before it was locked. */ -static void put_unlocked_mapping_entry(struct address_space *mapping, - pgoff_t index, void *entry) +static void *dax_lock_entry(struct xa_state *xas, void *entry) { - if (!entry) - return; - - /* We have to wake up next waiter for the radix tree entry lock */ - dax_wake_mapping_entry_waiter(mapping, index, entry, false); + unsigned long v = xa_to_value(entry); + return xas_store(xas, xa_mk_value(v | DAX_LOCKED)); } static unsigned long dax_entry_size(void *entry) @@ -325,9 +284,9 @@ static unsigned long dax_entry_size(void *entry) return PAGE_SIZE; } -static unsigned long dax_radix_end_pfn(void *entry) +static unsigned long dax_end_pfn(void *entry) { - return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; + return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; } /* @@ -335,8 +294,8 @@ static unsigned long dax_radix_end_pfn(void *entry) * 'empty' and 'zero' entries. */ #define for_each_mapped_pfn(entry, pfn) \ - for (pfn = dax_radix_pfn(entry); \ - pfn < dax_radix_end_pfn(entry); pfn++) + for (pfn = dax_to_pfn(entry); \ + pfn < dax_end_pfn(entry); pfn++) /* * TODO: for reflink+dax we need a way to associate a single page with @@ -393,33 +352,16 @@ static struct page *dax_busy_page(void *entry) return NULL; } -static bool entry_wait_revalidate(void) -{ - rcu_read_unlock(); - schedule(); - rcu_read_lock(); - - /* - * Tell __get_unlocked_mapping_entry() to take a break, we need - * to revalidate page->mapping after dropping locks - */ - return true; -} - bool dax_lock_mapping_entry(struct page *page) { - pgoff_t index; - struct inode *inode; - bool did_lock = false; - void *entry = NULL, **slot; - struct address_space *mapping; + XA_STATE(xas, NULL, 0); + void *entry; - rcu_read_lock(); for (;;) { - mapping = READ_ONCE(page->mapping); + struct address_space *mapping = READ_ONCE(page->mapping); if (!dax_mapping(mapping)) - break; + return false; /* * In the device-dax case there's no need to lock, a @@ -428,98 +370,94 @@ bool dax_lock_mapping_entry(struct page *page) * otherwise we would not have a valid pfn_to_page() * translation. */ - inode = mapping->host; - if (S_ISCHR(inode->i_mode)) { - did_lock = true; - break; - } + if (S_ISCHR(mapping->host->i_mode)) + return true; - xa_lock_irq(&mapping->i_pages); + xas.xa = &mapping->i_pages; + xas_lock_irq(&xas); if (mapping != page->mapping) { - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); continue; } - index = page->index; - - entry = __get_unlocked_mapping_entry(mapping, index, &slot, - entry_wait_revalidate); - if (!entry) { - xa_unlock_irq(&mapping->i_pages); - break; - } else if (IS_ERR(entry)) { - xa_unlock_irq(&mapping->i_pages); - WARN_ON_ONCE(PTR_ERR(entry) != -EAGAIN); - continue; + xas_set(&xas, page->index); + entry = xas_load(&xas); + if (dax_is_locked(entry)) { + entry = get_unlocked_entry(&xas); + /* Did the page move while we slept? */ + if (dax_to_pfn(entry) != page_to_pfn(page)) { + xas_unlock_irq(&xas); + continue; + } } - lock_slot(mapping, slot); - did_lock = true; - xa_unlock_irq(&mapping->i_pages); - break; + dax_lock_entry(&xas, entry); + xas_unlock_irq(&xas); + return true; } - rcu_read_unlock(); - - return did_lock; } void dax_unlock_mapping_entry(struct page *page) { struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; + XA_STATE(xas, &mapping->i_pages, page->index); - if (S_ISCHR(inode->i_mode)) + if (S_ISCHR(mapping->host->i_mode)) return; - unlock_mapping_entry(mapping, page->index); + dax_unlock_entry(&xas, dax_make_page_entry(page)); } /* - * Find radix tree entry at given index. If it points to an exceptional entry, - * return it with the radix tree entry locked. If the radix tree doesn't - * contain given index, create an empty exceptional entry for the index and - * return with it locked. + * Find page cache entry at given index. If it is a DAX entry, return it + * with the entry locked. If the page cache doesn't contain an entry at + * that index, add a locked empty entry. * - * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will - * either return that locked entry or will return an error. This error will - * happen if there are any 4k entries within the 2MiB range that we are - * requesting. + * When requesting an entry with size DAX_PMD, grab_mapping_entry() will + * either return that locked entry or will return VM_FAULT_FALLBACK. + * This will happen if there are any PTE entries within the PMD range + * that we are requesting. * - * We always favor 4k entries over 2MiB entries. There isn't a flow where we - * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB - * insertion will fail if it finds any 4k entries already in the tree, and a - * 4k insertion will cause an existing 2MiB entry to be unmapped and - * downgraded to 4k entries. This happens for both 2MiB huge zero pages as - * well as 2MiB empty entries. + * We always favor PTE entries over PMD entries. There isn't a flow where we + * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD + * insertion will fail if it finds any PTE entries already in the tree, and a + * PTE insertion will cause an existing PMD entry to be unmapped and + * downgraded to PTE entries. This happens for both PMD zero pages as + * well as PMD empty entries. * - * The exception to this downgrade path is for 2MiB DAX PMD entries that have - * real storage backing them. We will leave these real 2MiB DAX entries in - * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry. + * The exception to this downgrade path is for PMD entries that have + * real storage backing them. We will leave these real PMD entries in + * the tree, and PTE writes will simply dirty the entire PMD entry. * * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For * persistent memory the benefit is doubtful. We can add that later if we can * show it helps. + * + * On error, this function does not return an ERR_PTR. Instead it returns + * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values + * overlap with xarray value entries. */ -static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, - unsigned long size_flag) +static void *grab_mapping_entry(struct xa_state *xas, + struct address_space *mapping, unsigned long size_flag) { - bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */ - void *entry, **slot; - -restart: - xa_lock_irq(&mapping->i_pages); - entry = get_unlocked_mapping_entry(mapping, index, &slot); + unsigned long index = xas->xa_index; + bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */ + void *entry; - if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) { - entry = ERR_PTR(-EIO); - goto out_unlock; - } +retry: + xas_lock_irq(xas); + entry = get_unlocked_entry(xas); + if (xa_is_internal(entry)) + goto fallback; if (entry) { - if (size_flag & RADIX_DAX_PMD) { + if (WARN_ON_ONCE(!xa_is_value(entry))) { + xas_set_err(xas, EIO); + goto out_unlock; + } + + if (size_flag & DAX_PMD) { if (dax_is_pte_entry(entry)) { - put_unlocked_mapping_entry(mapping, index, - entry); - entry = ERR_PTR(-EEXIST); - goto out_unlock; + put_unlocked_entry(xas, entry); + goto fallback; } } else { /* trying to grab a PTE entry */ if (dax_is_pmd_entry(entry) && @@ -530,87 +468,57 @@ restart: } } - /* No entry for given index? Make sure radix tree is big enough. */ - if (!entry || pmd_downgrade) { - int err; - - if (pmd_downgrade) { - /* - * Make sure 'entry' remains valid while we drop - * the i_pages lock. - */ - entry = lock_slot(mapping, slot); - } + if (pmd_downgrade) { + /* + * Make sure 'entry' remains valid while we drop + * the i_pages lock. + */ + dax_lock_entry(xas, entry); - xa_unlock_irq(&mapping->i_pages); /* * Besides huge zero pages the only other thing that gets * downgraded are empty entries which don't need to be * unmapped. */ - if (pmd_downgrade && dax_is_zero_entry(entry)) - unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, - PG_PMD_NR, false); - - err = radix_tree_preload( - mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); - if (err) { - if (pmd_downgrade) - put_locked_mapping_entry(mapping, index); - return ERR_PTR(err); - } - xa_lock_irq(&mapping->i_pages); - - if (!entry) { - /* - * We needed to drop the i_pages lock while calling - * radix_tree_preload() and we didn't have an entry to - * lock. See if another thread inserted an entry at - * our index during this time. - */ - entry = __radix_tree_lookup(&mapping->i_pages, index, - NULL, &slot); - if (entry) { - radix_tree_preload_end(); - xa_unlock_irq(&mapping->i_pages); - goto restart; - } + if (dax_is_zero_entry(entry)) { + xas_unlock_irq(xas); + unmap_mapping_pages(mapping, + xas->xa_index & ~PG_PMD_COLOUR, + PG_PMD_NR, false); + xas_reset(xas); + xas_lock_irq(xas); } - if (pmd_downgrade) { - dax_disassociate_entry(entry, mapping, false); - radix_tree_delete(&mapping->i_pages, index); - mapping->nrexceptional--; - dax_wake_mapping_entry_waiter(mapping, index, entry, - true); - } + dax_disassociate_entry(entry, mapping, false); + xas_store(xas, NULL); /* undo the PMD join */ + dax_wake_entry(xas, entry, true); + mapping->nrexceptional--; + entry = NULL; + xas_set(xas, index); + } - entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY); - - err = __radix_tree_insert(&mapping->i_pages, index, - dax_radix_order(entry), entry); - radix_tree_preload_end(); - if (err) { - xa_unlock_irq(&mapping->i_pages); - /* - * Our insertion of a DAX entry failed, most likely - * because we were inserting a PMD entry and it - * collided with a PTE sized entry at a different - * index in the PMD range. We haven't inserted - * anything into the radix tree and have no waiters to - * wake. - */ - return ERR_PTR(err); - } - /* Good, we have inserted empty locked entry into the tree. */ + if (entry) { + dax_lock_entry(xas, entry); + } else { + entry = dax_make_entry(pfn_to_pfn_t(0), size_flag | DAX_EMPTY); + dax_lock_entry(xas, entry); + if (xas_error(xas)) + goto out_unlock; mapping->nrexceptional++; - xa_unlock_irq(&mapping->i_pages); - return entry; } - entry = lock_slot(mapping, slot); - out_unlock: - xa_unlock_irq(&mapping->i_pages); + +out_unlock: + xas_unlock_irq(xas); + if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM)) + goto retry; + if (xas->xa_node == XA_ERROR(-ENOMEM)) + return xa_mk_internal(VM_FAULT_OOM); + if (xas_error(xas)) + return xa_mk_internal(VM_FAULT_SIGBUS); return entry; +fallback: + xas_unlock_irq(xas); + return xa_mk_internal(VM_FAULT_FALLBACK); } /** @@ -630,11 +538,10 @@ restart: */ struct page *dax_layout_busy_page(struct address_space *mapping) { - pgoff_t indices[PAGEVEC_SIZE]; + XA_STATE(xas, &mapping->i_pages, 0); + void *entry; + unsigned int scanned = 0; struct page *page = NULL; - struct pagevec pvec; - pgoff_t index, end; - unsigned i; /* * In the 'limited' case get_user_pages() for dax is disabled. @@ -645,13 +552,9 @@ struct page *dax_layout_busy_page(struct address_space *mapping) if (!dax_mapping(mapping) || !mapping_mapped(mapping)) return NULL; - pagevec_init(&pvec); - index = 0; - end = -1; - /* * If we race get_user_pages_fast() here either we'll see the - * elevated page count in the pagevec_lookup and wait, or + * elevated page count in the iteration and wait, or * get_user_pages_fast() will see that the page it took a reference * against is no longer mapped in the page tables and bail to the * get_user_pages() slow path. The slow path is protected by @@ -663,94 +566,68 @@ struct page *dax_layout_busy_page(struct address_space *mapping) */ unmap_mapping_range(mapping, 0, 0, 1); - while (index < end && pagevec_lookup_entries(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - indices)) { - pgoff_t nr_pages = 1; - - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *pvec_ent = pvec.pages[i]; - void *entry; - - index = indices[i]; - if (index >= end) - break; - - if (WARN_ON_ONCE( - !radix_tree_exceptional_entry(pvec_ent))) - continue; - - xa_lock_irq(&mapping->i_pages); - entry = get_unlocked_mapping_entry(mapping, index, NULL); - if (entry) { - page = dax_busy_page(entry); - /* - * Account for multi-order entries at - * the end of the pagevec. - */ - if (i + 1 >= pagevec_count(&pvec)) - nr_pages = 1UL << dax_radix_order(entry); - } - put_unlocked_mapping_entry(mapping, index, entry); - xa_unlock_irq(&mapping->i_pages); - if (page) - break; - } - - /* - * We don't expect normal struct page entries to exist in our - * tree, but we keep these pagevec calls so that this code is - * consistent with the common pattern for handling pagevecs - * throughout the kernel. - */ - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); - index += nr_pages; - + xas_lock_irq(&xas); + xas_for_each(&xas, entry, ULONG_MAX) { + if (WARN_ON_ONCE(!xa_is_value(entry))) + continue; + if (unlikely(dax_is_locked(entry))) + entry = get_unlocked_entry(&xas); + if (entry) + page = dax_busy_page(entry); + put_unlocked_entry(&xas, entry); if (page) break; + if (++scanned % XA_CHECK_SCHED) + continue; + + xas_pause(&xas); + xas_unlock_irq(&xas); + cond_resched(); + xas_lock_irq(&xas); } + xas_unlock_irq(&xas); return page; } EXPORT_SYMBOL_GPL(dax_layout_busy_page); -static int __dax_invalidate_mapping_entry(struct address_space *mapping, +static int __dax_invalidate_entry(struct address_space *mapping, pgoff_t index, bool trunc) { + XA_STATE(xas, &mapping->i_pages, index); int ret = 0; void *entry; - struct radix_tree_root *pages = &mapping->i_pages; - xa_lock_irq(pages); - entry = get_unlocked_mapping_entry(mapping, index, NULL); - if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry))) + xas_lock_irq(&xas); + entry = get_unlocked_entry(&xas); + if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) goto out; if (!trunc && - (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) || - radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))) + (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) || + xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE))) goto out; dax_disassociate_entry(entry, mapping, trunc); - radix_tree_delete(pages, index); + xas_store(&xas, NULL); mapping->nrexceptional--; ret = 1; out: - put_unlocked_mapping_entry(mapping, index, entry); - xa_unlock_irq(pages); + put_unlocked_entry(&xas, entry); + xas_unlock_irq(&xas); return ret; } + /* - * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree - * entry to get unlocked before deleting it. + * Delete DAX entry at @index from @mapping. Wait for it + * to be unlocked before deleting it. */ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) { - int ret = __dax_invalidate_mapping_entry(mapping, index, true); + int ret = __dax_invalidate_entry(mapping, index, true); /* * This gets called from truncate / punch_hole path. As such, the caller * must hold locks protecting against concurrent modifications of the - * radix tree (usually fs-private i_mmap_sem for writing). Since the - * caller has seen exceptional entry for this index, we better find it + * page cache (usually fs-private i_mmap_sem for writing). Since the + * caller has seen a DAX entry for this index, we better find it * at that index as well... */ WARN_ON_ONCE(!ret); @@ -758,12 +635,12 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) } /* - * Invalidate exceptional DAX entry if it is clean. + * Invalidate DAX entry if it is clean. */ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index) { - return __dax_invalidate_mapping_entry(mapping, index, false); + return __dax_invalidate_entry(mapping, index, false); } static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, @@ -799,30 +676,27 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, * already in the tree, we will skip the insertion and just dirty the PMD as * appropriate. */ -static void *dax_insert_mapping_entry(struct address_space *mapping, - struct vm_fault *vmf, - void *entry, pfn_t pfn_t, - unsigned long flags, bool dirty) +static void *dax_insert_entry(struct xa_state *xas, + struct address_space *mapping, struct vm_fault *vmf, + void *entry, pfn_t pfn, unsigned long flags, bool dirty) { - struct radix_tree_root *pages = &mapping->i_pages; - unsigned long pfn = pfn_t_to_pfn(pfn_t); - pgoff_t index = vmf->pgoff; - void *new_entry; + void *new_entry = dax_make_entry(pfn, flags); if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { + if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) { + unsigned long index = xas->xa_index; /* we are replacing a zero page with block mapping */ if (dax_is_pmd_entry(entry)) unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, - PG_PMD_NR, false); + PG_PMD_NR, false); else /* pte entry */ - unmap_mapping_pages(mapping, vmf->pgoff, 1, false); + unmap_mapping_pages(mapping, index, 1, false); } - xa_lock_irq(pages); - new_entry = dax_radix_locked_entry(pfn, flags); + xas_reset(xas); + xas_lock_irq(xas); if (dax_entry_size(entry) != dax_entry_size(new_entry)) { dax_disassociate_entry(entry, mapping, false); dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address); @@ -830,33 +704,30 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { /* - * Only swap our new entry into the radix tree if the current + * Only swap our new entry into the page cache if the current * entry is a zero page or an empty entry. If a normal PTE or - * PMD entry is already in the tree, we leave it alone. This + * PMD entry is already in the cache, we leave it alone. This * means that if we are trying to insert a PTE and the * existing entry is a PMD, we will just leave the PMD in the * tree and dirty it if necessary. */ - struct radix_tree_node *node; - void **slot; - void *ret; - - ret = __radix_tree_lookup(pages, index, &node, &slot); - WARN_ON_ONCE(ret != entry); - __radix_tree_replace(pages, node, slot, - new_entry, NULL); + void *old = dax_lock_entry(xas, new_entry); + WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) | + DAX_LOCKED)); entry = new_entry; + } else { + xas_load(xas); /* Walk the xa_state */ } if (dirty) - radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY); + xas_set_mark(xas, PAGECACHE_TAG_DIRTY); - xa_unlock_irq(pages); + xas_unlock_irq(xas); return entry; } -static inline unsigned long -pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) +static inline +unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) { unsigned long address; @@ -866,8 +737,8 @@ pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) } /* Walk all mappings of a given index of a file and writeprotect them */ -static void dax_mapping_entry_mkclean(struct address_space *mapping, - pgoff_t index, unsigned long pfn) +static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, + unsigned long pfn) { struct vm_area_struct *vma; pte_t pte, *ptep = NULL; @@ -937,11 +808,9 @@ unlock_pte: i_mmap_unlock_read(mapping); } -static int dax_writeback_one(struct dax_device *dax_dev, - struct address_space *mapping, pgoff_t index, void *entry) +static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, + struct address_space *mapping, void *entry) { - struct radix_tree_root *pages = &mapping->i_pages; - void *entry2, **slot; unsigned long pfn; long ret = 0; size_t size; @@ -950,32 +819,38 @@ static int dax_writeback_one(struct dax_device *dax_dev, * A page got tagged dirty in DAX mapping? Something is seriously * wrong. */ - if (WARN_ON(!radix_tree_exceptional_entry(entry))) + if (WARN_ON(!xa_is_value(entry))) return -EIO; - xa_lock_irq(pages); - entry2 = get_unlocked_mapping_entry(mapping, index, &slot); - /* Entry got punched out / reallocated? */ - if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2))) - goto put_unlocked; - /* - * Entry got reallocated elsewhere? No need to writeback. We have to - * compare pfns as we must not bail out due to difference in lockbit - * or entry type. - */ - if (dax_radix_pfn(entry2) != dax_radix_pfn(entry)) - goto put_unlocked; - if (WARN_ON_ONCE(dax_is_empty_entry(entry) || - dax_is_zero_entry(entry))) { - ret = -EIO; - goto put_unlocked; + if (unlikely(dax_is_locked(entry))) { + void *old_entry = entry; + + entry = get_unlocked_entry(xas); + + /* Entry got punched out / reallocated? */ + if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) + goto put_unlocked; + /* + * Entry got reallocated elsewhere? No need to writeback. + * We have to compare pfns as we must not bail out due to + * difference in lockbit or entry type. + */ + if (dax_to_pfn(old_entry) != dax_to_pfn(entry)) + goto put_unlocked; + if (WARN_ON_ONCE(dax_is_empty_entry(entry) || + dax_is_zero_entry(entry))) { + ret = -EIO; + goto put_unlocked; + } + + /* Another fsync thread may have already done this entry */ + if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE)) + goto put_unlocked; } - /* Another fsync thread may have already written back this entry */ - if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)) - goto put_unlocked; /* Lock the entry to serialize with page faults */ - entry = lock_slot(mapping, slot); + dax_lock_entry(xas, entry); + /* * We can clear the tag now but we have to be careful so that concurrent * dax_writeback_one() calls for the same index cannot finish before we @@ -983,8 +858,8 @@ static int dax_writeback_one(struct dax_device *dax_dev, * at the entry only under the i_pages lock and once they do that * they will see the entry locked and wait for it to unlock. */ - radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE); - xa_unlock_irq(pages); + xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE); + xas_unlock_irq(xas); /* * Even if dax_writeback_mapping_range() was given a wbc->range_start @@ -993,10 +868,10 @@ static int dax_writeback_one(struct dax_device *dax_dev, * This allows us to flush for PMD_SIZE and not have to worry about * partial PMD writebacks. */ - pfn = dax_radix_pfn(entry); - size = PAGE_SIZE << dax_radix_order(entry); + pfn = dax_to_pfn(entry); + size = PAGE_SIZE << dax_entry_order(entry); - dax_mapping_entry_mkclean(mapping, index, pfn); + dax_entry_mkclean(mapping, xas->xa_index, pfn); dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size); /* * After we have flushed the cache, we can clear the dirty tag. There @@ -1004,16 +879,18 @@ static int dax_writeback_one(struct dax_device *dax_dev, * the pfn mappings are writeprotected and fault waits for mapping * entry lock. */ - xa_lock_irq(pages); - radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY); - xa_unlock_irq(pages); - trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); - put_locked_mapping_entry(mapping, index); + xas_reset(xas); + xas_lock_irq(xas); + xas_store(xas, entry); + xas_clear_mark(xas, PAGECACHE_TAG_DIRTY); + dax_wake_entry(xas, entry, false); + + trace_dax_writeback_one(mapping->host, xas->xa_index, + size >> PAGE_SHIFT); return ret; put_unlocked: - put_unlocked_mapping_entry(mapping, index, entry2); - xa_unlock_irq(pages); + put_unlocked_entry(xas, entry); return ret; } @@ -1025,13 +902,13 @@ static int dax_writeback_one(struct dax_device *dax_dev, int dax_writeback_mapping_range(struct address_space *mapping, struct block_device *bdev, struct writeback_control *wbc) { + XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT); struct inode *inode = mapping->host; - pgoff_t start_index, end_index; - pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t end_index = wbc->range_end >> PAGE_SHIFT; struct dax_device *dax_dev; - struct pagevec pvec; - bool done = false; - int i, ret = 0; + void *entry; + int ret = 0; + unsigned int scanned = 0; if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) return -EIO; @@ -1043,41 +920,29 @@ int dax_writeback_mapping_range(struct address_space *mapping, if (!dax_dev) return -EIO; - start_index = wbc->range_start >> PAGE_SHIFT; - end_index = wbc->range_end >> PAGE_SHIFT; - - trace_dax_writeback_range(inode, start_index, end_index); + trace_dax_writeback_range(inode, xas.xa_index, end_index); - tag_pages_for_writeback(mapping, start_index, end_index); + tag_pages_for_writeback(mapping, xas.xa_index, end_index); - pagevec_init(&pvec); - while (!done) { - pvec.nr = find_get_entries_tag(mapping, start_index, - PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, - pvec.pages, indices); - - if (pvec.nr == 0) + xas_lock_irq(&xas); + xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) { + ret = dax_writeback_one(&xas, dax_dev, mapping, entry); + if (ret < 0) { + mapping_set_error(mapping, ret); break; - - for (i = 0; i < pvec.nr; i++) { - if (indices[i] > end_index) { - done = true; - break; - } - - ret = dax_writeback_one(dax_dev, mapping, indices[i], - pvec.pages[i]); - if (ret < 0) { - mapping_set_error(mapping, ret); - goto out; - } } - start_index = indices[pvec.nr - 1] + 1; + if (++scanned % XA_CHECK_SCHED) + continue; + + xas_pause(&xas); + xas_unlock_irq(&xas); + cond_resched(); + xas_lock_irq(&xas); } -out: + xas_unlock_irq(&xas); put_dax(dax_dev); - trace_dax_writeback_range_done(inode, start_index, end_index); - return (ret < 0 ? ret : 0); + trace_dax_writeback_range_done(inode, xas.xa_index, end_index); + return ret; } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); @@ -1125,16 +990,18 @@ out: * If this page is ever written to we will re-fault and change the mapping to * point to real DAX storage instead. */ -static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry, - struct vm_fault *vmf) +static vm_fault_t dax_load_hole(struct xa_state *xas, + struct address_space *mapping, void **entry, + struct vm_fault *vmf) { struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr)); vm_fault_t ret; - dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE, - false); + *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, + DAX_ZERO_PAGE, false); + ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); trace_dax_load_hole(inode, vmf, ret); return ret; @@ -1342,6 +1209,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping = vma->vm_file->f_mapping; + XA_STATE(xas, &mapping->i_pages, vmf->pgoff); struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; @@ -1368,9 +1236,9 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, if (write && !vmf->cow_page) flags |= IOMAP_WRITE; - entry = grab_mapping_entry(mapping, vmf->pgoff, 0); - if (IS_ERR(entry)) { - ret = dax_fault_return(PTR_ERR(entry)); + entry = grab_mapping_entry(&xas, mapping, 0); + if (xa_is_internal(entry)) { + ret = xa_to_internal(entry); goto out; } @@ -1443,7 +1311,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, if (error < 0) goto error_finish_iomap; - entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, + entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, 0, write && !sync); /* @@ -1471,7 +1339,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (!write) { - ret = dax_load_hole(mapping, entry, vmf); + ret = dax_load_hole(&xas, mapping, &entry, vmf); goto finish_iomap; } /*FALLTHRU*/ @@ -1498,21 +1366,20 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); } unlock_entry: - put_locked_mapping_entry(mapping, vmf->pgoff); + dax_unlock_entry(&xas, entry); out: trace_dax_pte_fault_done(inode, vmf, ret); return ret | major; } #ifdef CONFIG_FS_DAX_PMD -static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, - void *entry) +static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, + struct iomap *iomap, void **entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; struct inode *inode = mapping->host; struct page *zero_page; - void *ret = NULL; spinlock_t *ptl; pmd_t pmd_entry; pfn_t pfn; @@ -1523,8 +1390,8 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, goto fallback; pfn = page_to_pfn_t(zero_page); - ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn, - RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false); + *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, + DAX_PMD | DAX_ZERO_PAGE, false); ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (!pmd_none(*(vmf->pmd))) { @@ -1536,11 +1403,11 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, pmd_entry = pmd_mkhuge(pmd_entry); set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); spin_unlock(ptl); - trace_dax_pmd_load_hole(inode, vmf, zero_page, ret); + trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry); return VM_FAULT_NOPAGE; fallback: - trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret); + trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry); return VM_FAULT_FALLBACK; } @@ -1549,6 +1416,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping = vma->vm_file->f_mapping; + XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER); unsigned long pmd_addr = vmf->address & PMD_MASK; bool write = vmf->flags & FAULT_FLAG_WRITE; bool sync; @@ -1556,7 +1424,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, struct inode *inode = mapping->host; vm_fault_t result = VM_FAULT_FALLBACK; struct iomap iomap = { 0 }; - pgoff_t max_pgoff, pgoff; + pgoff_t max_pgoff; void *entry; loff_t pos; int error; @@ -1567,7 +1435,6 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * supposed to hold locks serializing us with truncate / punch hole so * this is a reliable test. */ - pgoff = linear_page_index(vma, pmd_addr); max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); @@ -1576,7 +1443,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * Make sure that the faulting address's PMD offset (color) matches * the PMD offset from the start of the file. This is necessary so * that a PMD range in the page table overlaps exactly with a PMD - * range in the radix tree. + * range in the page cache. */ if ((vmf->pgoff & PG_PMD_COLOUR) != ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) @@ -1592,24 +1459,26 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, if ((pmd_addr + PMD_SIZE) > vma->vm_end) goto fallback; - if (pgoff >= max_pgoff) { + if (xas.xa_index >= max_pgoff) { result = VM_FAULT_SIGBUS; goto out; } /* If the PMD would extend beyond the file size */ - if ((pgoff | PG_PMD_COLOUR) >= max_pgoff) + if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff) goto fallback; /* - * grab_mapping_entry() will make sure we get a 2MiB empty entry, a - * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page - * is already in the tree, for instance), it will return -EEXIST and - * we just fall back to 4k entries. + * grab_mapping_entry() will make sure we get an empty PMD entry, + * a zero PMD entry or a DAX PMD. If it can't (because a PTE + * entry is already in the array, for instance), it will return + * VM_FAULT_FALLBACK. */ - entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); - if (IS_ERR(entry)) + entry = grab_mapping_entry(&xas, mapping, DAX_PMD); + if (xa_is_internal(entry)) { + result = xa_to_internal(entry); goto fallback; + } /* * It is possible, particularly with mixed reads & writes to private @@ -1628,7 +1497,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * setting up a mapping, so really we're using iomap_begin() as a way * to look up our filesystem block. */ - pos = (loff_t)pgoff << PAGE_SHIFT; + pos = (loff_t)xas.xa_index << PAGE_SHIFT; error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); if (error) goto unlock_entry; @@ -1644,8 +1513,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, if (error < 0) goto finish_iomap; - entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, - RADIX_DAX_PMD, write && !sync); + entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, + DAX_PMD, write && !sync); /* * If we are doing synchronous page fault and inode needs fsync, @@ -1669,7 +1538,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, case IOMAP_HOLE: if (WARN_ON_ONCE(write)) break; - result = dax_pmd_load_hole(vmf, &iomap, entry); + result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry); break; default: WARN_ON_ONCE(1); @@ -1692,7 +1561,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, &iomap); } unlock_entry: - put_locked_mapping_entry(mapping, pgoff); + dax_unlock_entry(&xas, entry); fallback: if (result == VM_FAULT_FALLBACK) { split_huge_pmd(vma, vmf->pmd, vmf->address); @@ -1737,54 +1606,49 @@ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, } EXPORT_SYMBOL_GPL(dax_iomap_fault); -/** +/* * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables * @vmf: The description of the fault - * @pe_size: Size of entry to be inserted * @pfn: PFN to insert + * @order: Order of entry to insert. * - * This function inserts writeable PTE or PMD entry into page tables for mmaped - * DAX file. It takes care of marking corresponding radix tree entry as dirty - * as well. + * This function inserts a writeable PTE or PMD entry into the page tables + * for an mmaped DAX file. It also marks the page cache entry as dirty. */ -static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, - enum page_entry_size pe_size, - pfn_t pfn) +static vm_fault_t +dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; - void *entry, **slot; - pgoff_t index = vmf->pgoff; + XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order); + void *entry; vm_fault_t ret; - xa_lock_irq(&mapping->i_pages); - entry = get_unlocked_mapping_entry(mapping, index, &slot); + xas_lock_irq(&xas); + entry = get_unlocked_entry(&xas); /* Did we race with someone splitting entry or so? */ if (!entry || - (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) || - (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) { - put_unlocked_mapping_entry(mapping, index, entry); - xa_unlock_irq(&mapping->i_pages); + (order == 0 && !dax_is_pte_entry(entry)) || + (order == PMD_ORDER && (xa_is_internal(entry) || + !dax_is_pmd_entry(entry)))) { + put_unlocked_entry(&xas, entry); + xas_unlock_irq(&xas); trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } - radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY); - entry = lock_slot(mapping, slot); - xa_unlock_irq(&mapping->i_pages); - switch (pe_size) { - case PE_SIZE_PTE: + xas_set_mark(&xas, PAGECACHE_TAG_DIRTY); + dax_lock_entry(&xas, entry); + xas_unlock_irq(&xas); + if (order == 0) ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); - break; #ifdef CONFIG_FS_DAX_PMD - case PE_SIZE_PMD: + else if (order == PMD_ORDER) ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn, true); - break; #endif - default: + else ret = VM_FAULT_FALLBACK; - } - put_locked_mapping_entry(mapping, index); + dax_unlock_entry(&xas, entry); trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret); return ret; } @@ -1804,17 +1668,12 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, { int err; loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; - size_t len = 0; + unsigned int order = pe_order(pe_size); + size_t len = PAGE_SIZE << order; - if (pe_size == PE_SIZE_PTE) - len = PAGE_SIZE; - else if (pe_size == PE_SIZE_PMD) - len = PMD_SIZE; - else - WARN_ON_ONCE(1); err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); if (err) return VM_FAULT_SIGBUS; - return dax_insert_pfn_mkwrite(vmf, pe_size, pfn); + return dax_insert_pfn_mkwrite(vmf, pfn, order); } EXPORT_SYMBOL_GPL(dax_finish_sync_fault); diff --git a/fs/dcache.c b/fs/dcache.c index 2e7e8d85e9b4..2593153471cf 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -26,7 +26,7 @@ #include <linux/export.h> #include <linux/security.h> #include <linux/seqlock.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/bit_spinlock.h> #include <linux/rculist_bl.h> #include <linux/list_lru.h> @@ -257,24 +257,10 @@ static void __d_free(struct rcu_head *head) kmem_cache_free(dentry_cache, dentry); } -static void __d_free_external_name(struct rcu_head *head) -{ - struct external_name *name = container_of(head, struct external_name, - u.head); - - mod_node_page_state(page_pgdat(virt_to_page(name)), - NR_INDIRECTLY_RECLAIMABLE_BYTES, - -ksize(name)); - - kfree(name); -} - static void __d_free_external(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); - - __d_free_external_name(&external_name(dentry)->u.head); - + kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); } @@ -306,7 +292,7 @@ void release_dentry_name_snapshot(struct name_snapshot *name) struct external_name *p; p = container_of(name->name, struct external_name, name[0]); if (unlikely(atomic_dec_and_test(&p->u.count))) - call_rcu(&p->u.head, __d_free_external_name); + kfree_rcu(p, u.head); } } EXPORT_SYMBOL(release_dentry_name_snapshot); @@ -1606,7 +1592,6 @@ EXPORT_SYMBOL(d_invalidate); struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { - struct external_name *ext = NULL; struct dentry *dentry; char *dname; int err; @@ -1627,14 +1612,15 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dname = dentry->d_iname; } else if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); - - ext = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT); - if (!ext) { + struct external_name *p = kmalloc(size + name->len, + GFP_KERNEL_ACCOUNT | + __GFP_RECLAIMABLE); + if (!p) { kmem_cache_free(dentry_cache, dentry); return NULL; } - atomic_set(&ext->u.count, 1); - dname = ext->name; + atomic_set(&p->u.count, 1); + dname = p->name; } else { dname = dentry->d_iname; } @@ -1673,12 +1659,6 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) } } - if (unlikely(ext)) { - pg_data_t *pgdat = page_pgdat(virt_to_page(ext)); - mod_node_page_state(pgdat, NR_INDIRECTLY_RECLAIMABLE_BYTES, - ksize(ext)); - } - this_cpu_inc(nr_dentry); return dentry; @@ -2707,7 +2687,7 @@ static void copy_name(struct dentry *dentry, struct dentry *target) dentry->d_name.hash_len = target->d_name.hash_len; } if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) - call_rcu(&old_name->u.head, __d_free_external_name); + kfree_rcu(old_name, u.head); } /* diff --git a/fs/direct-io.c b/fs/direct-io.c index 093fb54cd316..722d17c88edb 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1313,7 +1313,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, spin_lock_init(&dio->bio_lock); dio->refcount = 1; - dio->should_dirty = (iter->type == ITER_IOVEC); + dio->should_dirty = iter_is_iovec(iter) && iov_iter_rw(iter) == READ; sdio.iter = iter; sdio.final_block_in_request = end >> blkbits; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index a5e4a221435c..76976d6e50f9 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -674,7 +674,7 @@ static int receive_from_sock(struct connection *con) nvec = 2; } len = iov[0].iov_len + iov[1].iov_len; - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, iov, nvec, len); + iov_iter_kvec(&msg.msg_iter, READ, iov, nvec, len); r = ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT | MSG_NOSIGNAL); if (ret <= 0) diff --git a/fs/exec.c b/fs/exec.c index 1ebf6e5a521d..fc281b738a98 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -908,14 +908,14 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size, goto out; i_size = i_size_read(file_inode(file)); - if (max_size > 0 && i_size > max_size) { - ret = -EFBIG; - goto out; - } if (i_size <= 0) { ret = -EINVAL; goto out; } + if (i_size > SIZE_MAX || (max_size > 0 && i_size > max_size)) { + ret = -EFBIG; + goto out; + } if (id != READING_FIRMWARE_PREALLOC_BUFFER) *buf = vmalloc(i_size); diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 41cf2fbee50d..906839a4da8f 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -101,6 +101,7 @@ static int parse_options(char *options, struct exofs_mountopt *opts) token = match_token(p, tokens, args); switch (token) { case Opt_name: + kfree(opts->dev_name); opts->dev_name = match_strdup(&args[0]); if (unlikely(!opts->dev_name)) { EXOFS_ERR("Error allocating dev_name"); @@ -117,7 +118,7 @@ static int parse_options(char *options, struct exofs_mountopt *opts) EXOFS_MIN_PID); return -EINVAL; } - s_pid = 1; + s_pid = true; break; case Opt_to: if (match_int(&args[0], &option)) @@ -866,8 +867,10 @@ static struct dentry *exofs_mount(struct file_system_type *type, int ret; ret = parse_options(data, &opts); - if (ret) + if (ret) { + kfree(opts.dev_name); return ERR_PTR(ret); + } if (!opts.dev_name) opts.dev_name = dev_name; diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 224c04abb2e5..cf4c77f8dd08 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -256,11 +256,15 @@ ext2_init_acl(struct inode *inode, struct inode *dir) if (default_acl) { error = __ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); posix_acl_release(default_acl); + } else { + inode->i_default_acl = NULL; } if (acl) { if (!error) error = __ext2_set_acl(inode, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); + } else { + inode->i_acl = NULL; } return error; } diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 00e759f05161..e770cd100a6a 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -390,11 +390,7 @@ struct ext2_inode { #define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */ #define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */ #define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */ -#ifdef CONFIG_FS_DAX #define EXT2_MOUNT_DAX 0x100000 /* Direct Access */ -#else -#define EXT2_MOUNT_DAX 0 -#endif #define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 73bd58fa13de..cb91baa4275d 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -309,20 +309,17 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sb, NOBH)) seq_puts(seq, ",nobh"); -#if defined(CONFIG_QUOTA) if (sbi->s_mount_opt & EXT2_MOUNT_USRQUOTA) seq_puts(seq, ",usrquota"); if (sbi->s_mount_opt & EXT2_MOUNT_GRPQUOTA) seq_puts(seq, ",grpquota"); -#endif -#ifdef CONFIG_FS_DAX if (sbi->s_mount_opt & EXT2_MOUNT_XIP) seq_puts(seq, ",xip"); + if (sbi->s_mount_opt & EXT2_MOUNT_DAX) seq_puts(seq, ",dax"); -#endif if (!test_opt(sb, RESERVATION)) seq_puts(seq, ",noreservation"); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 12f90d48ba61..3f89d0ab08fc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -45,15 +45,6 @@ #include <linux/compiler.h> -/* Until this gets included into linux/compiler-gcc.h */ -#ifndef __nonstring -#if defined(GCC_VERSION) && (GCC_VERSION >= 80000) -#define __nonstring __attribute__((nonstring)) -#else -#define __nonstring -#endif -#endif - /* * The fourth extended filesystem constants/structures */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 2addcb8730e1..014f6a698cb7 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1216,7 +1216,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); if (IS_ERR(bitmap_bh)) - return (struct inode *) bitmap_bh; + return ERR_CAST(bitmap_bh); /* Having the inode bit set should be a 100% indicator that this * is a valid orphan (no e2fsck run on fs). Orphans also include diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c3d9a42c561e..05f01fbd9c7f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2643,7 +2643,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) long left = mpd->wbc->nr_to_write; pgoff_t index = mpd->first_page; pgoff_t end = mpd->last_page; - int tag; + xa_mark_t tag; int i, err = 0; int blkbits = mpd->inode->i_blkbits; ext4_lblk_t lblk; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 67a38532032a..17adcb16a9c8 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1556,7 +1556,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (IS_ERR(bh)) - return (struct dentry *) bh; + return ERR_CAST(bh); inode = NULL; if (bh) { __u32 ino = le32_to_cpu(de->inode); @@ -1600,7 +1600,7 @@ struct dentry *ext4_get_parent(struct dentry *child) bh = ext4_find_entry(d_inode(child), &dotdot, &de, NULL); if (IS_ERR(bh)) - return (struct dentry *) bh; + return ERR_CAST(bh); if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 106f116466bf..b293cb3e27a2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2071,7 +2071,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, pgoff_t done_index; int cycled; int range_whole = 0; - int tag; + xa_mark_t tag; int nwritten = 0; pagevec_init(&pvec); @@ -2787,13 +2787,13 @@ const struct address_space_operations f2fs_dblock_aops = { #endif }; -void f2fs_clear_radix_tree_dirty_tag(struct page *page) +void f2fs_clear_page_cache_dirty_tag(struct page *page) { struct address_space *mapping = page_mapping(page); unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); - radix_tree_tag_clear(&mapping->i_pages, page_index(page), + __xa_clear_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); xa_unlock_irqrestore(&mapping->i_pages, flags); } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 2ef84b4590ea..bacc667950b6 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -726,7 +726,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (bit_pos == NR_DENTRY_IN_BLOCK && !f2fs_truncate_hole(dir, page->index, page->index + 1)) { - f2fs_clear_radix_tree_dirty_tag(page); + f2fs_clear_page_cache_dirty_tag(page); clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 56204a8f8a12..1e031971a466 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3108,7 +3108,7 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode); #endif bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); -void f2fs_clear_radix_tree_dirty_tag(struct page *page); +void f2fs_clear_page_cache_dirty_tag(struct page *page); /* * gc.c diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index cb31a719b048..7b0cff7e6051 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -243,7 +243,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); - f2fs_clear_radix_tree_dirty_tag(page); + f2fs_clear_page_cache_dirty_tag(page); set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 2b34206486d8..d338740d0fda 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -101,7 +101,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) static void clear_node_page_dirty(struct page *page) { if (PageDirty(page)) { - f2fs_clear_radix_tree_dirty_tag(page); + f2fs_clear_page_cache_dirty_tag(page); clear_page_dirty_for_io(page); dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); } @@ -1306,9 +1306,7 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) if (f2fs_check_nid_range(sbi, nid)) return; - rcu_read_lock(); - apage = radix_tree_lookup(&NODE_MAPPING(sbi)->i_pages, nid); - rcu_read_unlock(); + apage = xa_load(&NODE_MAPPING(sbi)->i_pages, nid); if (apage) return; diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 7f5f3699fc6c..c8366cb8eccd 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -369,7 +369,9 @@ static int fat_parse_short(struct super_block *sb, } memcpy(work, de->name, sizeof(work)); - /* see namei.c, msdos_format_name */ + /* For an explanation of the special treatment of 0x05 in + * filenames, see msdos_format_name in namei_msdos.c + */ if (work[0] == 0x05) work[0] = 0xE5; @@ -1071,7 +1073,7 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo) } } - dir->i_mtime = dir->i_atime = current_time(dir); + fat_truncate_time(dir, NULL, S_ATIME|S_MTIME); if (IS_DIRSYNC(dir)) (void)fat_sync_inode(dir); else diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 9d7d2d5da28b..4e1b2f6df5e6 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -416,6 +416,10 @@ extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 __time, __le16 __date, u8 time_cs); extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 *time, __le16 *date, u8 *time_cs); +extern int fat_truncate_time(struct inode *inode, struct timespec64 *now, + int flags); +extern int fat_update_time(struct inode *inode, struct timespec64 *now, + int flags); extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs); int fat_cache_init(void); diff --git a/fs/fat/file.c b/fs/fat/file.c index 4f3d72fb1e60..13935ee99e1e 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -227,7 +227,7 @@ static int fat_cont_expand(struct inode *inode, loff_t size) if (err) goto out; - inode->i_ctime = inode->i_mtime = current_time(inode); + fat_truncate_time(inode, NULL, S_CTIME|S_MTIME); mark_inode_dirty(inode); if (IS_SYNC(inode)) { int err2; @@ -330,7 +330,7 @@ static int fat_free(struct inode *inode, int skip) MSDOS_I(inode)->i_logstart = 0; } MSDOS_I(inode)->i_attrs |= ATTR_ARCH; - inode->i_ctime = inode->i_mtime = current_time(inode); + fat_truncate_time(inode, NULL, S_CTIME|S_MTIME); if (wait) { err = fat_sync_inode(inode); if (err) { @@ -542,6 +542,18 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) up_write(&MSDOS_I(inode)->truncate_lock); } + /* + * setattr_copy can't truncate these appropriately, so we'll + * copy them ourselves + */ + if (attr->ia_valid & ATTR_ATIME) + fat_truncate_time(inode, &attr->ia_atime, S_ATIME); + if (attr->ia_valid & ATTR_CTIME) + fat_truncate_time(inode, &attr->ia_ctime, S_CTIME); + if (attr->ia_valid & ATTR_MTIME) + fat_truncate_time(inode, &attr->ia_mtime, S_MTIME); + attr->ia_valid &= ~(ATTR_ATIME|ATTR_CTIME|ATTR_MTIME); + setattr_copy(inode, attr); mark_inode_dirty(inode); out: @@ -552,4 +564,5 @@ EXPORT_SYMBOL_GPL(fat_setattr); const struct inode_operations fat_file_inode_operations = { .setattr = fat_setattr, .getattr = fat_getattr, + .update_time = fat_update_time, }; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index d6b81e31f9f5..c0b5b5c3373b 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -244,7 +244,7 @@ static int fat_write_end(struct file *file, struct address_space *mapping, if (err < len) fat_write_failed(mapping, pos + len); if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { - inode->i_mtime = inode->i_ctime = current_time(inode); + fat_truncate_time(inode, NULL, S_CTIME|S_MTIME); MSDOS_I(inode)->i_attrs |= ATTR_ARCH; mark_inode_dirty(inode); } @@ -564,7 +564,7 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) de->cdate, de->ctime_cs); fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0); } else - inode->i_ctime = inode->i_atime = inode->i_mtime; + fat_truncate_time(inode, &inode->i_mtime, S_ATIME|S_CTIME); return 0; } @@ -1626,6 +1626,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, sb->s_magic = MSDOS_SUPER_MAGIC; sb->s_op = &fat_sops; sb->s_export_op = &fat_export_ops; + /* + * fat timestamps are complex and truncated by fat itself, so + * we set 1 here to be fast + */ + sb->s_time_gran = 1; mutex_init(&sbi->nfs_build_inode_lock); ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 573836dcaefc..fce0a76f3f1e 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -7,6 +7,7 @@ */ #include "fat.h" +#include <linux/iversion.h> /* * fat_fs_error reports a file system problem that might indicate fa data @@ -185,6 +186,13 @@ static long days_in_year[] = { 0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, }; +static inline int fat_tz_offset(struct msdos_sb_info *sbi) +{ + return (sbi->options.tz_set ? + -sbi->options.time_offset : + sys_tz.tz_minuteswest) * SECS_PER_MIN; +} + /* Convert a FAT time/date pair to a UNIX date (seconds since 1 1 70). */ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 __time, __le16 __date, u8 time_cs) @@ -210,10 +218,7 @@ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts, + days_in_year[month] + day + DAYS_DELTA) * SECS_PER_DAY; - if (!sbi->options.tz_set) - second += sys_tz.tz_minuteswest * SECS_PER_MIN; - else - second -= sbi->options.time_offset * SECS_PER_MIN; + second += fat_tz_offset(sbi); if (time_cs) { ts->tv_sec = second + (time_cs / 100); @@ -229,9 +234,7 @@ void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 *time, __le16 *date, u8 *time_cs) { struct tm tm; - time64_to_tm(ts->tv_sec, - (sbi->options.tz_set ? sbi->options.time_offset : - -sys_tz.tz_minuteswest) * SECS_PER_MIN, &tm); + time64_to_tm(ts->tv_sec, -fat_tz_offset(sbi), &tm); /* FAT can only support year between 1980 to 2107 */ if (tm.tm_year < 1980 - 1900) { @@ -263,6 +266,80 @@ void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts, } EXPORT_SYMBOL_GPL(fat_time_unix2fat); +static inline struct timespec64 fat_timespec64_trunc_2secs(struct timespec64 ts) +{ + return (struct timespec64){ ts.tv_sec & ~1ULL, 0 }; +} +/* + * truncate the various times with appropriate granularity: + * root inode: + * all times always 0 + * all other inodes: + * mtime - 2 seconds + * ctime + * msdos - 2 seconds + * vfat - 10 milliseconds + * atime - 24 hours (00:00:00 in local timezone) + */ +int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + struct timespec64 ts; + + if (inode->i_ino == MSDOS_ROOT_INO) + return 0; + + if (now == NULL) { + now = &ts; + ts = current_time(inode); + } + + if (flags & S_ATIME) { + /* to localtime */ + time64_t seconds = now->tv_sec - fat_tz_offset(sbi); + s32 remainder; + + div_s64_rem(seconds, SECS_PER_DAY, &remainder); + /* to day boundary, and back to unix time */ + seconds = seconds + fat_tz_offset(sbi) - remainder; + + inode->i_atime = (struct timespec64){ seconds, 0 }; + } + if (flags & S_CTIME) { + if (sbi->options.isvfat) + inode->i_ctime = timespec64_trunc(*now, 10000000); + else + inode->i_ctime = fat_timespec64_trunc_2secs(*now); + } + if (flags & S_MTIME) + inode->i_mtime = fat_timespec64_trunc_2secs(*now); + + return 0; +} +EXPORT_SYMBOL_GPL(fat_truncate_time); + +int fat_update_time(struct inode *inode, struct timespec64 *now, int flags) +{ + int iflags = I_DIRTY_TIME; + bool dirty = false; + + if (inode->i_ino == MSDOS_ROOT_INO) + return 0; + + fat_truncate_time(inode, now, flags); + if (flags & S_VERSION) + dirty = inode_maybe_inc_iversion(inode, false); + if ((flags & (S_ATIME | S_CTIME | S_MTIME)) && + !(inode->i_sb->s_flags & SB_LAZYTIME)) + dirty = true; + + if (dirty) + iflags |= I_DIRTY_SYNC; + __mark_inode_dirty(inode, iflags); + return 0; +} +EXPORT_SYMBOL_GPL(fat_update_time); + int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs) { int i, err = 0; diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index efb8c40c9d27..f2cd365a4e86 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -250,7 +250,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name, if (err) return err; - dir->i_ctime = dir->i_mtime = *ts; + fat_truncate_time(dir, ts, S_CTIME|S_MTIME); if (IS_DIRSYNC(dir)) (void)fat_sync_inode(dir); else @@ -294,7 +294,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode, err = PTR_ERR(inode); goto out; } - inode->i_mtime = inode->i_atime = inode->i_ctime = ts; + fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME); /* timestamp is already written, so mark_inode_dirty() is unneeded. */ d_instantiate(dentry, inode); @@ -327,7 +327,7 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry) drop_nlink(dir); clear_nlink(inode); - inode->i_ctime = current_time(inode); + fat_truncate_time(inode, NULL, S_CTIME); fat_detach(inode); out: mutex_unlock(&MSDOS_SB(sb)->s_lock); @@ -380,7 +380,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out; } set_nlink(inode, 2); - inode->i_mtime = inode->i_atime = inode->i_ctime = ts; + fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME); /* timestamp is already written, so mark_inode_dirty() is unneeded. */ d_instantiate(dentry, inode); @@ -413,7 +413,7 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry) if (err) goto out; clear_nlink(inode); - inode->i_ctime = current_time(inode); + fat_truncate_time(inode, NULL, S_CTIME); fat_detach(inode); out: mutex_unlock(&MSDOS_SB(sb)->s_lock); @@ -478,7 +478,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, mark_inode_dirty(old_inode); inode_inc_iversion(old_dir); - old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); + fat_truncate_time(old_dir, NULL, S_CTIME|S_MTIME); if (IS_DIRSYNC(old_dir)) (void)fat_sync_inode(old_dir); else @@ -538,7 +538,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, if (err) goto error_dotdot; inode_inc_iversion(old_dir); - old_dir->i_ctime = old_dir->i_mtime = ts; + fat_truncate_time(old_dir, &ts, S_CTIME|S_MTIME); if (IS_DIRSYNC(old_dir)) (void)fat_sync_inode(old_dir); else @@ -548,7 +548,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, drop_nlink(new_inode); if (is_dir) drop_nlink(new_inode); - new_inode->i_ctime = ts; + fat_truncate_time(new_inode, &ts, S_CTIME); } out: brelse(sinfo.bh); @@ -637,6 +637,7 @@ static const struct inode_operations msdos_dir_inode_operations = { .rename = msdos_rename, .setattr = fat_setattr, .getattr = fat_getattr, + .update_time = fat_update_time, }; static void setup(struct super_block *sb) diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 82cd1e69cbdf..996c8c25e9c6 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -678,7 +678,7 @@ static int vfat_add_entry(struct inode *dir, const struct qstr *qname, goto cleanup; /* update timestamp */ - dir->i_ctime = dir->i_mtime = dir->i_atime = *ts; + fat_truncate_time(dir, ts, S_CTIME|S_MTIME); if (IS_DIRSYNC(dir)) (void)fat_sync_inode(dir); else @@ -779,7 +779,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, goto out; } inode_inc_iversion(inode); - inode->i_mtime = inode->i_atime = inode->i_ctime = ts; + fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME); /* timestamp is already written, so mark_inode_dirty() is unneeded. */ d_instantiate(dentry, inode); @@ -810,7 +810,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry) drop_nlink(dir); clear_nlink(inode); - inode->i_mtime = inode->i_atime = current_time(inode); + fat_truncate_time(inode, NULL, S_ATIME|S_MTIME); fat_detach(inode); vfat_d_version_set(dentry, inode_query_iversion(dir)); out: @@ -836,7 +836,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry) if (err) goto out; clear_nlink(inode); - inode->i_mtime = inode->i_atime = current_time(inode); + fat_truncate_time(inode, NULL, S_ATIME|S_MTIME); fat_detach(inode); vfat_d_version_set(dentry, inode_query_iversion(dir)); out: @@ -876,7 +876,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) } inode_inc_iversion(inode); set_nlink(inode, 2); - inode->i_mtime = inode->i_atime = inode->i_ctime = ts; + fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME); /* timestamp is already written, so mark_inode_dirty() is unneeded. */ d_instantiate(dentry, inode); @@ -969,7 +969,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto error_dotdot; inode_inc_iversion(old_dir); - old_dir->i_ctime = old_dir->i_mtime = ts; + fat_truncate_time(old_dir, &ts, S_CTIME|S_MTIME); if (IS_DIRSYNC(old_dir)) (void)fat_sync_inode(old_dir); else @@ -979,7 +979,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, drop_nlink(new_inode); if (is_dir) drop_nlink(new_inode); - new_inode->i_ctime = ts; + fat_truncate_time(new_inode, &ts, S_CTIME); } out: brelse(sinfo.bh); @@ -1032,6 +1032,7 @@ static const struct inode_operations vfat_dir_inode_operations = { .rename = vfat_rename, .setattr = fat_setattr, .getattr = fat_getattr, + .update_time = fat_update_time, }; static void setup(struct super_block *sb) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 471d863958bc..b40168fcc94a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -339,9 +339,9 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) struct address_space *mapping = inode->i_mapping; struct bdi_writeback *old_wb = inode->i_wb; struct bdi_writeback *new_wb = isw->new_wb; - struct radix_tree_iter iter; + XA_STATE(xas, &mapping->i_pages, 0); + struct page *page; bool switched = false; - void **slot; /* * By the time control reaches here, RCU grace period has passed @@ -375,25 +375,18 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to * pages actually under writeback. */ - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0, - PAGECACHE_TAG_DIRTY) { - struct page *page = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); - if (likely(page) && PageDirty(page)) { + xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) { + if (PageDirty(page)) { dec_wb_stat(old_wb, WB_RECLAIMABLE); inc_wb_stat(new_wb, WB_RECLAIMABLE); } } - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0, - PAGECACHE_TAG_WRITEBACK) { - struct page *page = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); - if (likely(page)) { - WARN_ON_ONCE(!PageWriteback(page)); - dec_wb_stat(old_wb, WB_WRITEBACK); - inc_wb_stat(new_wb, WB_WRITEBACK); - } + xas_set(&xas, 0); + xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) { + WARN_ON_ONCE(!PageWriteback(page)); + dec_wb_stat(old_wb, WB_WRITEBACK); + inc_wb_stat(new_wb, WB_WRITEBACK); } wb_get(new_wb); diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 60da84a86dab..f7b807bc1027 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -5,4 +5,4 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o -fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o +fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 0b694655d988..989df5accaee 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -107,7 +107,7 @@ static ssize_t fuse_conn_max_background_read(struct file *file, if (!fc) return 0; - val = fc->max_background; + val = READ_ONCE(fc->max_background); fuse_conn_put(fc); return fuse_conn_limit_read(file, buf, len, ppos, val); @@ -125,7 +125,12 @@ static ssize_t fuse_conn_max_background_write(struct file *file, if (ret > 0) { struct fuse_conn *fc = fuse_ctl_file_conn_get(file); if (fc) { + spin_lock(&fc->bg_lock); fc->max_background = val; + fc->blocked = fc->num_background >= fc->max_background; + if (!fc->blocked) + wake_up(&fc->blocked_waitq); + spin_unlock(&fc->bg_lock); fuse_conn_put(fc); } } @@ -144,7 +149,7 @@ static ssize_t fuse_conn_congestion_threshold_read(struct file *file, if (!fc) return 0; - val = fc->congestion_threshold; + val = READ_ONCE(fc->congestion_threshold); fuse_conn_put(fc); return fuse_conn_limit_read(file, buf, len, ppos, val); @@ -155,18 +160,31 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, size_t count, loff_t *ppos) { unsigned uninitialized_var(val); + struct fuse_conn *fc; ssize_t ret; ret = fuse_conn_limit_write(file, buf, count, ppos, &val, max_user_congthresh); - if (ret > 0) { - struct fuse_conn *fc = fuse_ctl_file_conn_get(file); - if (fc) { - fc->congestion_threshold = val; - fuse_conn_put(fc); + if (ret <= 0) + goto out; + fc = fuse_ctl_file_conn_get(file); + if (!fc) + goto out; + + spin_lock(&fc->bg_lock); + fc->congestion_threshold = val; + if (fc->sb) { + if (fc->num_background < fc->congestion_threshold) { + clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); + clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); + } else { + set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); + set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); } } - + spin_unlock(&fc->bg_lock); + fuse_conn_put(fc); +out: return ret; } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 11ea2c4a38ab..ae813e609932 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -25,6 +25,10 @@ MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); +/* Ordinary requests have even IDs, while interrupts IDs are odd */ +#define FUSE_INT_REQ_BIT (1ULL << 0) +#define FUSE_REQ_ID_STEP (1ULL << 1) + static struct kmem_cache *fuse_req_cachep; static struct fuse_dev *fuse_get_dev(struct file *file) @@ -40,9 +44,6 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages, struct fuse_page_desc *page_descs, unsigned npages) { - memset(req, 0, sizeof(*req)); - memset(pages, 0, sizeof(*pages) * npages); - memset(page_descs, 0, sizeof(*page_descs) * npages); INIT_LIST_HEAD(&req->list); INIT_LIST_HEAD(&req->intr_entry); init_waitqueue_head(&req->waitq); @@ -53,30 +54,36 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages, __set_bit(FR_PENDING, &req->flags); } +static struct page **fuse_req_pages_alloc(unsigned int npages, gfp_t flags, + struct fuse_page_desc **desc) +{ + struct page **pages; + + pages = kzalloc(npages * (sizeof(struct page *) + + sizeof(struct fuse_page_desc)), flags); + *desc = (void *) pages + npages * sizeof(struct page *); + + return pages; +} + static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) { - struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, flags); + struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags); if (req) { - struct page **pages; - struct fuse_page_desc *page_descs; - - if (npages <= FUSE_REQ_INLINE_PAGES) { + struct page **pages = NULL; + struct fuse_page_desc *page_descs = NULL; + + WARN_ON(npages > FUSE_MAX_MAX_PAGES); + if (npages > FUSE_REQ_INLINE_PAGES) { + pages = fuse_req_pages_alloc(npages, flags, + &page_descs); + if (!pages) { + kmem_cache_free(fuse_req_cachep, req); + return NULL; + } + } else if (npages) { pages = req->inline_pages; page_descs = req->inline_page_descs; - } else { - pages = kmalloc_array(npages, sizeof(struct page *), - flags); - page_descs = - kmalloc_array(npages, - sizeof(struct fuse_page_desc), - flags); - } - - if (!pages || !page_descs) { - kfree(pages); - kfree(page_descs); - kmem_cache_free(fuse_req_cachep, req); - return NULL; } fuse_request_init(req, pages, page_descs, npages); @@ -95,12 +102,41 @@ struct fuse_req *fuse_request_alloc_nofs(unsigned npages) return __fuse_request_alloc(npages, GFP_NOFS); } -void fuse_request_free(struct fuse_req *req) +static void fuse_req_pages_free(struct fuse_req *req) { - if (req->pages != req->inline_pages) { + if (req->pages != req->inline_pages) kfree(req->pages); - kfree(req->page_descs); - } +} + +bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, + gfp_t flags) +{ + struct page **pages; + struct fuse_page_desc *page_descs; + unsigned int npages = min_t(unsigned int, + max_t(unsigned int, req->max_pages * 2, + FUSE_DEFAULT_MAX_PAGES_PER_REQ), + fc->max_pages); + WARN_ON(npages <= req->max_pages); + + pages = fuse_req_pages_alloc(npages, flags, &page_descs); + if (!pages) + return false; + + memcpy(pages, req->pages, sizeof(struct page *) * req->max_pages); + memcpy(page_descs, req->page_descs, + sizeof(struct fuse_page_desc) * req->max_pages); + fuse_req_pages_free(req); + req->pages = pages; + req->page_descs = page_descs; + req->max_pages = npages; + + return true; +} + +void fuse_request_free(struct fuse_req *req) +{ + fuse_req_pages_free(req); kmem_cache_free(fuse_req_cachep, req); } @@ -235,8 +271,10 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) struct file *file = req->stolen_file; struct fuse_file *ff = file->private_data; + WARN_ON(req->max_pages); spin_lock(&fc->lock); - fuse_request_init(req, req->pages, req->page_descs, req->max_pages); + memset(req, 0, sizeof(*req)); + fuse_request_init(req, NULL, NULL, 0); BUG_ON(ff->reserved_req); ff->reserved_req = req; wake_up_all(&fc->reserved_req_waitq); @@ -287,10 +325,10 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) * We get here in the unlikely case that a background * request was allocated but not sent */ - spin_lock(&fc->lock); + spin_lock(&fc->bg_lock); if (!fc->blocked) wake_up(&fc->blocked_waitq); - spin_unlock(&fc->lock); + spin_unlock(&fc->bg_lock); } if (test_bit(FR_WAITING, &req->flags)) { @@ -319,7 +357,13 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args) static u64 fuse_get_unique(struct fuse_iqueue *fiq) { - return ++fiq->reqctr; + fiq->reqctr += FUSE_REQ_ID_STEP; + return fiq->reqctr; +} + +static unsigned int fuse_req_hash(u64 unique) +{ + return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS); } static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req) @@ -353,12 +397,13 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, static void flush_bg_queue(struct fuse_conn *fc) { + struct fuse_iqueue *fiq = &fc->iq; + while (fc->active_background < fc->max_background && !list_empty(&fc->bg_queue)) { struct fuse_req *req; - struct fuse_iqueue *fiq = &fc->iq; - req = list_entry(fc->bg_queue.next, struct fuse_req, list); + req = list_first_entry(&fc->bg_queue, struct fuse_req, list); list_del(&req->list); fc->active_background++; spin_lock(&fiq->waitq.lock); @@ -389,14 +434,21 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) WARN_ON(test_bit(FR_PENDING, &req->flags)); WARN_ON(test_bit(FR_SENT, &req->flags)); if (test_bit(FR_BACKGROUND, &req->flags)) { - spin_lock(&fc->lock); + spin_lock(&fc->bg_lock); clear_bit(FR_BACKGROUND, &req->flags); - if (fc->num_background == fc->max_background) + if (fc->num_background == fc->max_background) { fc->blocked = 0; - - /* Wake up next waiter, if any */ - if (!fc->blocked && waitqueue_active(&fc->blocked_waitq)) wake_up(&fc->blocked_waitq); + } else if (!fc->blocked) { + /* + * Wake up next waiter, if any. It's okay to use + * waitqueue_active(), as we've already synced up + * fc->blocked with waiters with the wake_up() call + * above. + */ + if (waitqueue_active(&fc->blocked_waitq)) + wake_up(&fc->blocked_waitq); + } if (fc->num_background == fc->congestion_threshold && fc->sb) { clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); @@ -405,7 +457,7 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) fc->num_background--; fc->active_background--; flush_bg_queue(fc); - spin_unlock(&fc->lock); + spin_unlock(&fc->bg_lock); } wake_up(&req->waitq); if (req->end) @@ -573,40 +625,38 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) return ret; } -/* - * Called under fc->lock - * - * fc->connected must have been checked previously - */ -void fuse_request_send_background_locked(struct fuse_conn *fc, - struct fuse_req *req) +bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req) { - BUG_ON(!test_bit(FR_BACKGROUND, &req->flags)); + bool queued = false; + + WARN_ON(!test_bit(FR_BACKGROUND, &req->flags)); if (!test_bit(FR_WAITING, &req->flags)) { __set_bit(FR_WAITING, &req->flags); atomic_inc(&fc->num_waiting); } __set_bit(FR_ISREPLY, &req->flags); - fc->num_background++; - if (fc->num_background == fc->max_background) - fc->blocked = 1; - if (fc->num_background == fc->congestion_threshold && fc->sb) { - set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); - set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); + spin_lock(&fc->bg_lock); + if (likely(fc->connected)) { + fc->num_background++; + if (fc->num_background == fc->max_background) + fc->blocked = 1; + if (fc->num_background == fc->congestion_threshold && fc->sb) { + set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); + set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); + } + list_add_tail(&req->list, &fc->bg_queue); + flush_bg_queue(fc); + queued = true; } - list_add_tail(&req->list, &fc->bg_queue); - flush_bg_queue(fc); + spin_unlock(&fc->bg_lock); + + return queued; } void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) { - BUG_ON(!req->end); - spin_lock(&fc->lock); - if (fc->connected) { - fuse_request_send_background_locked(fc, req); - spin_unlock(&fc->lock); - } else { - spin_unlock(&fc->lock); + WARN_ON(!req->end); + if (!fuse_request_queue_background(fc, req)) { req->out.h.error = -ENOTCONN; req->end(fc, req); fuse_put_request(fc, req); @@ -1084,12 +1134,11 @@ __releases(fiq->waitq.lock) int err; list_del_init(&req->intr_entry); - req->intr_unique = fuse_get_unique(fiq); memset(&ih, 0, sizeof(ih)); memset(&arg, 0, sizeof(arg)); ih.len = reqsize; ih.opcode = FUSE_INTERRUPT; - ih.unique = req->intr_unique; + ih.unique = (req->in.h.unique | FUSE_INT_REQ_BIT); arg.unique = req->in.h.unique; spin_unlock(&fiq->waitq.lock); @@ -1238,6 +1287,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, struct fuse_req *req; struct fuse_in *in; unsigned reqsize; + unsigned int hash; restart: spin_lock(&fiq->waitq.lock); @@ -1310,13 +1360,16 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, err = reqsize; goto out_end; } - list_move_tail(&req->list, &fpq->processing); - spin_unlock(&fpq->lock); + hash = fuse_req_hash(req->in.h.unique); + list_move_tail(&req->list, &fpq->processing[hash]); + __fuse_get_request(req); set_bit(FR_SENT, &req->flags); + spin_unlock(&fpq->lock); /* matches barrier in request_wait_answer() */ smp_mb__after_atomic(); if (test_bit(FR_INTERRUPTED, &req->flags)) queue_interrupt(fiq, req); + fuse_put_request(fc, req); return reqsize; @@ -1663,7 +1716,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, unsigned int num; unsigned int offset; size_t total_len = 0; - int num_pages; + unsigned int num_pages; offset = outarg->offset & ~PAGE_MASK; file_size = i_size_read(inode); @@ -1675,7 +1728,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, num = file_size - outarg->offset; num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; - num_pages = min(num_pages, FUSE_MAX_PAGES_PER_REQ); + num_pages = min(num_pages, fc->max_pages); req = fuse_get_req(fc, num_pages); if (IS_ERR(req)) @@ -1792,10 +1845,11 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, /* Look up request on processing list by unique ID */ static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) { + unsigned int hash = fuse_req_hash(unique); struct fuse_req *req; - list_for_each_entry(req, &fpq->processing, list) { - if (req->in.h.unique == unique || req->intr_unique == unique) + list_for_each_entry(req, &fpq->processing[hash], list) { + if (req->in.h.unique == unique) return req; } return NULL; @@ -1869,22 +1923,26 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, if (!fpq->connected) goto err_unlock_pq; - req = request_find(fpq, oh.unique); + req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT); if (!req) goto err_unlock_pq; - /* Is it an interrupt reply? */ - if (req->intr_unique == oh.unique) { + /* Is it an interrupt reply ID? */ + if (oh.unique & FUSE_INT_REQ_BIT) { + __fuse_get_request(req); spin_unlock(&fpq->lock); err = -EINVAL; - if (nbytes != sizeof(struct fuse_out_header)) + if (nbytes != sizeof(struct fuse_out_header)) { + fuse_put_request(fc, req); goto err_finish; + } if (oh.error == -ENOSYS) fc->no_interrupt = 1; else if (oh.error == -EAGAIN) queue_interrupt(&fc->iq, req); + fuse_put_request(fc, req); fuse_copy_finish(cs); return nbytes; @@ -2102,9 +2160,13 @@ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort) struct fuse_dev *fud; struct fuse_req *req, *next; LIST_HEAD(to_end); + unsigned int i; + /* Background queuing checks fc->connected under bg_lock */ + spin_lock(&fc->bg_lock); fc->connected = 0; - fc->blocked = 0; + spin_unlock(&fc->bg_lock); + fc->aborted = is_abort; fuse_set_initialized(fc); list_for_each_entry(fud, &fc->devices, entry) { @@ -2123,11 +2185,16 @@ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort) } spin_unlock(&req->waitq.lock); } - list_splice_tail_init(&fpq->processing, &to_end); + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + list_splice_tail_init(&fpq->processing[i], + &to_end); spin_unlock(&fpq->lock); } + spin_lock(&fc->bg_lock); + fc->blocked = 0; fc->max_background = UINT_MAX; flush_bg_queue(fc); + spin_unlock(&fc->bg_lock); spin_lock(&fiq->waitq.lock); fiq->connected = 0; @@ -2163,10 +2230,12 @@ int fuse_dev_release(struct inode *inode, struct file *file) struct fuse_conn *fc = fud->fc; struct fuse_pqueue *fpq = &fud->pq; LIST_HEAD(to_end); + unsigned int i; spin_lock(&fpq->lock); WARN_ON(!list_empty(&fpq->io)); - list_splice_init(&fpq->processing, &to_end); + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + list_splice_init(&fpq->processing[i], &to_end); spin_unlock(&fpq->lock); end_requests(fc, &to_end); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 0979609d6eba..47395b0c3b35 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -14,24 +14,9 @@ #include <linux/namei.h> #include <linux/slab.h> #include <linux/xattr.h> +#include <linux/iversion.h> #include <linux/posix_acl.h> -static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) -{ - struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_inode *fi = get_fuse_inode(dir); - - if (!fc->do_readdirplus) - return false; - if (!fc->readdirplus_auto) - return true; - if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) - return true; - if (ctx->pos == 0) - return true; - return false; -} - static void fuse_advise_use_readdirplus(struct inode *dir) { struct fuse_inode *fi = get_fuse_inode(dir); @@ -80,8 +65,7 @@ static u64 time_to_jiffies(u64 sec, u32 nsec) * Set dentry and possibly attribute timeouts from the lookup/mk* * replies */ -static void fuse_change_entry_timeout(struct dentry *entry, - struct fuse_entry_out *o) +void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o) { fuse_dentry_settime(entry, time_to_jiffies(o->entry_valid, o->entry_valid_nsec)); @@ -92,18 +76,29 @@ static u64 attr_timeout(struct fuse_attr_out *o) return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); } -static u64 entry_attr_timeout(struct fuse_entry_out *o) +u64 entry_attr_timeout(struct fuse_entry_out *o) { return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); } +static void fuse_invalidate_attr_mask(struct inode *inode, u32 mask) +{ + set_mask_bits(&get_fuse_inode(inode)->inval_mask, 0, mask); +} + /* * Mark the attributes as stale, so that at the next call to * ->getattr() they will be fetched from userspace */ void fuse_invalidate_attr(struct inode *inode) { - get_fuse_inode(inode)->i_time = 0; + fuse_invalidate_attr_mask(inode, STATX_BASIC_STATS); +} + +static void fuse_dir_changed(struct inode *dir) +{ + fuse_invalidate_attr(dir); + inode_maybe_inc_iversion(dir, false); } /** @@ -113,7 +108,7 @@ void fuse_invalidate_attr(struct inode *inode) void fuse_invalidate_atime(struct inode *inode) { if (!IS_RDONLY(inode)) - fuse_invalidate_attr(inode); + fuse_invalidate_attr_mask(inode, STATX_ATIME); } /* @@ -262,11 +257,6 @@ invalid: goto out; } -static int invalid_nodeid(u64 nodeid) -{ - return !nodeid || nodeid == FUSE_ROOT_ID; -} - static int fuse_dentry_init(struct dentry *dentry) { dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL); @@ -469,7 +459,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, kfree(forget); d_instantiate(entry, inode); fuse_change_entry_timeout(entry, &outentry); - fuse_invalidate_attr(dir); + fuse_dir_changed(dir); err = finish_open(file, entry, generic_file_open); if (err) { fuse_sync_release(ff, flags); @@ -583,7 +573,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, } else { fuse_change_entry_timeout(entry, &outarg); } - fuse_invalidate_attr(dir); + fuse_dir_changed(dir); return 0; out_put_forget_req: @@ -693,7 +683,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) drop_nlink(inode); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); - fuse_invalidate_attr(dir); + fuse_dir_changed(dir); fuse_invalidate_entry_cache(entry); fuse_update_ctime(inode); } else if (err == -EINTR) @@ -715,7 +705,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) err = fuse_simple_request(fc, &args); if (!err) { clear_nlink(d_inode(entry)); - fuse_invalidate_attr(dir); + fuse_dir_changed(dir); fuse_invalidate_entry_cache(entry); } else if (err == -EINTR) fuse_invalidate_entry(entry); @@ -754,9 +744,9 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, fuse_update_ctime(d_inode(newent)); } - fuse_invalidate_attr(olddir); + fuse_dir_changed(olddir); if (olddir != newdir) - fuse_invalidate_attr(newdir); + fuse_dir_changed(newdir); /* newent will end up negative */ if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) { @@ -932,7 +922,8 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, } static int fuse_update_get_attr(struct inode *inode, struct file *file, - struct kstat *stat, unsigned int flags) + struct kstat *stat, u32 request_mask, + unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); int err = 0; @@ -942,6 +933,8 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, sync = true; else if (flags & AT_STATX_DONT_SYNC) sync = false; + else if (request_mask & READ_ONCE(fi->inval_mask)) + sync = true; else sync = time_before64(fi->i_time, get_jiffies_64()); @@ -959,7 +952,9 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, int fuse_update_attributes(struct inode *inode, struct file *file) { - return fuse_update_get_attr(inode, file, NULL, 0); + /* Do *not* need to get atime for internal purposes */ + return fuse_update_get_attr(inode, file, NULL, + STATX_BASIC_STATS & ~STATX_ATIME, 0); } int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, @@ -989,7 +984,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, if (!entry) goto unlock; - fuse_invalidate_attr(parent); + fuse_dir_changed(parent); fuse_invalidate_entry(entry); if (child_nodeid != 0 && d_really_is_positive(entry)) { @@ -1165,271 +1160,78 @@ static int fuse_permission(struct inode *inode, int mask) return err; } -static int parse_dirfile(char *buf, size_t nbytes, struct file *file, - struct dir_context *ctx) -{ - while (nbytes >= FUSE_NAME_OFFSET) { - struct fuse_dirent *dirent = (struct fuse_dirent *) buf; - size_t reclen = FUSE_DIRENT_SIZE(dirent); - if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) - return -EIO; - if (reclen > nbytes) - break; - if (memchr(dirent->name, '/', dirent->namelen) != NULL) - return -EIO; - - if (!dir_emit(ctx, dirent->name, dirent->namelen, - dirent->ino, dirent->type)) - break; - - buf += reclen; - nbytes -= reclen; - ctx->pos = dirent->off; - } - - return 0; -} - -static int fuse_direntplus_link(struct file *file, - struct fuse_direntplus *direntplus, - u64 attr_version) -{ - struct fuse_entry_out *o = &direntplus->entry_out; - struct fuse_dirent *dirent = &direntplus->dirent; - struct dentry *parent = file->f_path.dentry; - struct qstr name = QSTR_INIT(dirent->name, dirent->namelen); - struct dentry *dentry; - struct dentry *alias; - struct inode *dir = d_inode(parent); - struct fuse_conn *fc; - struct inode *inode; - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); - - if (!o->nodeid) { - /* - * Unlike in the case of fuse_lookup, zero nodeid does not mean - * ENOENT. Instead, it only means the userspace filesystem did - * not want to return attributes/handle for this entry. - * - * So do nothing. - */ - return 0; - } - - if (name.name[0] == '.') { - /* - * We could potentially refresh the attributes of the directory - * and its parent? - */ - if (name.len == 1) - return 0; - if (name.name[1] == '.' && name.len == 2) - return 0; - } - - if (invalid_nodeid(o->nodeid)) - return -EIO; - if (!fuse_valid_type(o->attr.mode)) - return -EIO; - - fc = get_fuse_conn(dir); - - name.hash = full_name_hash(parent, name.name, name.len); - dentry = d_lookup(parent, &name); - if (!dentry) { -retry: - dentry = d_alloc_parallel(parent, &name, &wq); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - } - if (!d_in_lookup(dentry)) { - struct fuse_inode *fi; - inode = d_inode(dentry); - if (!inode || - get_node_id(inode) != o->nodeid || - ((o->attr.mode ^ inode->i_mode) & S_IFMT)) { - d_invalidate(dentry); - dput(dentry); - goto retry; - } - if (is_bad_inode(inode)) { - dput(dentry); - return -EIO; - } - - fi = get_fuse_inode(inode); - spin_lock(&fc->lock); - fi->nlookup++; - spin_unlock(&fc->lock); - - forget_all_cached_acls(inode); - fuse_change_attributes(inode, &o->attr, - entry_attr_timeout(o), - attr_version); - /* - * The other branch comes via fuse_iget() - * which bumps nlookup inside - */ - } else { - inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, - &o->attr, entry_attr_timeout(o), - attr_version); - if (!inode) - inode = ERR_PTR(-ENOMEM); - - alias = d_splice_alias(inode, dentry); - d_lookup_done(dentry); - if (alias) { - dput(dentry); - dentry = alias; - } - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - } - if (fc->readdirplus_auto) - set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); - fuse_change_entry_timeout(dentry, o); - - dput(dentry); - return 0; -} - -static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, - struct dir_context *ctx, u64 attr_version) +static int fuse_readlink_page(struct inode *inode, struct page *page) { - struct fuse_direntplus *direntplus; - struct fuse_dirent *dirent; - size_t reclen; - int over = 0; - int ret; - - while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) { - direntplus = (struct fuse_direntplus *) buf; - dirent = &direntplus->dirent; - reclen = FUSE_DIRENTPLUS_SIZE(direntplus); - - if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) - return -EIO; - if (reclen > nbytes) - break; - if (memchr(dirent->name, '/', dirent->namelen) != NULL) - return -EIO; - - if (!over) { - /* We fill entries into dstbuf only as much as - it can hold. But we still continue iterating - over remaining entries to link them. If not, - we need to send a FORGET for each of those - which we did not link. - */ - over = !dir_emit(ctx, dirent->name, dirent->namelen, - dirent->ino, dirent->type); - if (!over) - ctx->pos = dirent->off; - } - - buf += reclen; - nbytes -= reclen; - - ret = fuse_direntplus_link(file, direntplus, attr_version); - if (ret) - fuse_force_forget(file, direntplus->entry_out.nodeid); - } - - return 0; -} - -static int fuse_readdir(struct file *file, struct dir_context *ctx) -{ - int plus, err; - size_t nbytes; - struct page *page; - struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; - u64 attr_version = 0; - bool locked; - - if (is_bad_inode(inode)) - return -EIO; + int err; req = fuse_get_req(fc, 1); if (IS_ERR(req)) return PTR_ERR(req); - page = alloc_page(GFP_KERNEL); - if (!page) { - fuse_put_request(fc, req); - return -ENOMEM; - } - - plus = fuse_use_readdirplus(inode, ctx); + req->out.page_zeroing = 1; req->out.argpages = 1; req->num_pages = 1; req->pages[0] = page; - req->page_descs[0].length = PAGE_SIZE; - if (plus) { - attr_version = fuse_get_attr_version(fc); - fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, - FUSE_READDIRPLUS); - } else { - fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, - FUSE_READDIR); - } - locked = fuse_lock_inode(inode); + req->page_descs[0].length = PAGE_SIZE - 1; + req->in.h.opcode = FUSE_READLINK; + req->in.h.nodeid = get_node_id(inode); + req->out.argvar = 1; + req->out.numargs = 1; + req->out.args[0].size = PAGE_SIZE - 1; fuse_request_send(fc, req); - fuse_unlock_inode(inode, locked); - nbytes = req->out.args[0].size; err = req->out.h.error; - fuse_put_request(fc, req); + if (!err) { - if (plus) { - err = parse_dirplusfile(page_address(page), nbytes, - file, ctx, - attr_version); - } else { - err = parse_dirfile(page_address(page), nbytes, file, - ctx); - } + char *link = page_address(page); + size_t len = req->out.args[0].size; + + BUG_ON(len >= PAGE_SIZE); + link[len] = '\0'; } - __free_page(page); + fuse_put_request(fc, req); fuse_invalidate_atime(inode); + return err; } -static const char *fuse_get_link(struct dentry *dentry, - struct inode *inode, - struct delayed_call *done) +static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) { struct fuse_conn *fc = get_fuse_conn(inode); - FUSE_ARGS(args); - char *link; - ssize_t ret; + struct page *page; + int err; + + err = -EIO; + if (is_bad_inode(inode)) + goto out_err; + if (fc->cache_symlinks) + return page_get_link(dentry, inode, callback); + + err = -ECHILD; if (!dentry) - return ERR_PTR(-ECHILD); + goto out_err; - link = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!link) - return ERR_PTR(-ENOMEM); + page = alloc_page(GFP_KERNEL); + err = -ENOMEM; + if (!page) + goto out_err; - args.in.h.opcode = FUSE_READLINK; - args.in.h.nodeid = get_node_id(inode); - args.out.argvar = 1; - args.out.numargs = 1; - args.out.args[0].size = PAGE_SIZE - 1; - args.out.args[0].value = link; - ret = fuse_simple_request(fc, &args); - if (ret < 0) { - kfree(link); - link = ERR_PTR(ret); - } else { - link[ret] = '\0'; - set_delayed_call(done, kfree_link, link); + err = fuse_readlink_page(inode, page); + if (err) { + __free_page(page); + goto out_err; } - fuse_invalidate_atime(inode); - return link; + + set_delayed_call(callback, page_put_link, page); + + return page_address(page); + +out_err: + return ERR_PTR(err); } static int fuse_dir_open(struct inode *inode, struct file *file) @@ -1662,8 +1464,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, file = NULL; } - if (attr->ia_valid & ATTR_SIZE) + if (attr->ia_valid & ATTR_SIZE) { + if (WARN_ON(!S_ISREG(inode->i_mode))) + return -EIO; is_truncate = true; + } if (is_truncate) { fuse_set_nowrite(inode); @@ -1811,7 +1616,7 @@ static int fuse_getattr(const struct path *path, struct kstat *stat, if (!fuse_allow_current_process(fc)) return -EACCES; - return fuse_update_get_attr(inode, NULL, stat, flags); + return fuse_update_get_attr(inode, NULL, stat, request_mask, flags); } static const struct inode_operations fuse_dir_inode_operations = { @@ -1867,11 +1672,37 @@ void fuse_init_common(struct inode *inode) void fuse_init_dir(struct inode *inode) { + struct fuse_inode *fi = get_fuse_inode(inode); + inode->i_op = &fuse_dir_inode_operations; inode->i_fop = &fuse_dir_operations; + + spin_lock_init(&fi->rdc.lock); + fi->rdc.cached = false; + fi->rdc.size = 0; + fi->rdc.pos = 0; + fi->rdc.version = 0; } +static int fuse_symlink_readpage(struct file *null, struct page *page) +{ + int err = fuse_readlink_page(page->mapping->host, page); + + if (!err) + SetPageUptodate(page); + + unlock_page(page); + + return err; +} + +static const struct address_space_operations fuse_symlink_aops = { + .readpage = fuse_symlink_readpage, +}; + void fuse_init_symlink(struct inode *inode) { inode->i_op = &fuse_symlink_inode_operations; + inode->i_data.a_ops = &fuse_symlink_aops; + inode_nohighmem(inode); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 32d0b883e74f..cc2121b37bf5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -59,6 +59,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) } INIT_LIST_HEAD(&ff->write_entry); + mutex_init(&ff->readdir.lock); refcount_set(&ff->count, 1); RB_CLEAR_NODE(&ff->polled_node); init_waitqueue_head(&ff->poll_wait); @@ -73,6 +74,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) void fuse_file_free(struct fuse_file *ff) { fuse_request_free(ff->reserved_req); + mutex_destroy(&ff->readdir.lock); kfree(ff); } @@ -848,11 +850,11 @@ static int fuse_readpages_fill(void *_data, struct page *page) fuse_wait_on_page_writeback(inode, page->index); if (req->num_pages && - (req->num_pages == FUSE_MAX_PAGES_PER_REQ || + (req->num_pages == fc->max_pages || (req->num_pages + 1) * PAGE_SIZE > fc->max_read || req->pages[req->num_pages - 1]->index + 1 != page->index)) { - int nr_alloc = min_t(unsigned, data->nr_pages, - FUSE_MAX_PAGES_PER_REQ); + unsigned int nr_alloc = min_t(unsigned int, data->nr_pages, + fc->max_pages); fuse_send_readpages(req, data->file); if (fc->async_read) req = fuse_get_req_for_background(fc, nr_alloc); @@ -887,7 +889,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_fill_data data; int err; - int nr_alloc = min_t(unsigned, nr_pages, FUSE_MAX_PAGES_PER_REQ); + unsigned int nr_alloc = min_t(unsigned int, nr_pages, fc->max_pages); err = -EIO; if (is_bad_inode(inode)) @@ -1102,12 +1104,13 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, return count > 0 ? count : err; } -static inline unsigned fuse_wr_pages(loff_t pos, size_t len) +static inline unsigned int fuse_wr_pages(loff_t pos, size_t len, + unsigned int max_pages) { - return min_t(unsigned, + return min_t(unsigned int, ((pos + len - 1) >> PAGE_SHIFT) - (pos >> PAGE_SHIFT) + 1, - FUSE_MAX_PAGES_PER_REQ); + max_pages); } static ssize_t fuse_perform_write(struct kiocb *iocb, @@ -1129,7 +1132,8 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, do { struct fuse_req *req; ssize_t count; - unsigned nr_pages = fuse_wr_pages(pos, iov_iter_count(ii)); + unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii), + fc->max_pages); req = fuse_get_req(fc, nr_pages); if (IS_ERR(req)) { @@ -1271,7 +1275,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, ssize_t ret = 0; /* Special case for kernel I/O: can copy directly into the buffer */ - if (ii->type & ITER_KVEC) { + if (iov_iter_is_kvec(ii)) { unsigned long user_addr = fuse_get_user_addr(ii); size_t frag_size = fuse_get_frag_size(ii, *nbytesp); @@ -1319,11 +1323,6 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, return ret < 0 ? ret : 0; } -static inline int fuse_iter_npages(const struct iov_iter *ii_p) -{ - return iov_iter_npages(ii_p, FUSE_MAX_PAGES_PER_REQ); -} - ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, loff_t *ppos, int flags) { @@ -1343,9 +1342,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, int err = 0; if (io->async) - req = fuse_get_req_for_background(fc, fuse_iter_npages(iter)); + req = fuse_get_req_for_background(fc, iov_iter_npages(iter, + fc->max_pages)); else - req = fuse_get_req(fc, fuse_iter_npages(iter)); + req = fuse_get_req(fc, iov_iter_npages(iter, fc->max_pages)); if (IS_ERR(req)) return PTR_ERR(req); @@ -1390,9 +1390,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, fuse_put_request(fc, req); if (io->async) req = fuse_get_req_for_background(fc, - fuse_iter_npages(iter)); + iov_iter_npages(iter, fc->max_pages)); else - req = fuse_get_req(fc, fuse_iter_npages(iter)); + req = fuse_get_req(fc, iov_iter_npages(iter, + fc->max_pages)); if (IS_ERR(req)) break; } @@ -1418,7 +1419,7 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io, res = fuse_direct_io(io, iter, ppos, 0); - fuse_invalidate_attr(inode); + fuse_invalidate_atime(inode); return res; } @@ -1487,6 +1488,7 @@ __acquires(fc->lock) struct fuse_inode *fi = get_fuse_inode(req->inode); struct fuse_write_in *inarg = &req->misc.write.in; __u64 data_size = req->num_pages * PAGE_SIZE; + bool queued; if (!fc->connected) goto out_free; @@ -1502,7 +1504,8 @@ __acquires(fc->lock) req->in.args[1].size = inarg->size; fi->writectr++; - fuse_request_send_background_locked(fc, req); + queued = fuse_request_queue_background(fc, req); + WARN_ON(!queued); return; out_free: @@ -1819,12 +1822,18 @@ static int fuse_writepages_fill(struct page *page, is_writeback = fuse_page_is_writeback(inode, page->index); if (req && req->num_pages && - (is_writeback || req->num_pages == FUSE_MAX_PAGES_PER_REQ || + (is_writeback || req->num_pages == fc->max_pages || (req->num_pages + 1) * PAGE_SIZE > fc->max_write || data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) { fuse_writepages_send(data); data->req = NULL; + } else if (req && req->num_pages == req->max_pages) { + if (!fuse_req_realloc_pages(fc, req, GFP_NOFS)) { + fuse_writepages_send(data); + req = data->req = NULL; + } } + err = -ENOMEM; tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (!tmp_page) @@ -1847,7 +1856,7 @@ static int fuse_writepages_fill(struct page *page, struct fuse_inode *fi = get_fuse_inode(inode); err = -ENOMEM; - req = fuse_request_alloc_nofs(FUSE_MAX_PAGES_PER_REQ); + req = fuse_request_alloc_nofs(FUSE_REQ_INLINE_PAGES); if (!req) { __free_page(tmp_page); goto out_unlock; @@ -1904,6 +1913,7 @@ static int fuse_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_fill_wb_data data; int err; @@ -1916,7 +1926,7 @@ static int fuse_writepages(struct address_space *mapping, data.ff = NULL; err = -ENOMEM; - data.orig_pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, + data.orig_pages = kcalloc(fc->max_pages, sizeof(struct page *), GFP_NOFS); if (!data.orig_pages) @@ -2387,10 +2397,11 @@ static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src, } /* Make sure iov_length() won't overflow */ -static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count) +static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov, + size_t count) { size_t n; - u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT; + u32 max = fc->max_pages << PAGE_SHIFT; for (n = 0; n < count; n++, iov++) { if (iov->iov_len > (size_t) max) @@ -2514,7 +2525,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); err = -ENOMEM; - pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL); + pages = kcalloc(fc->max_pages, sizeof(pages[0]), GFP_KERNEL); iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); if (!pages || !iov_page) goto out; @@ -2553,7 +2564,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, /* make sure there are enough buffer pages and init request with them */ err = -ENOMEM; - if (max_pages > FUSE_MAX_PAGES_PER_REQ) + if (max_pages > fc->max_pages) goto out; while (num_pages < max_pages) { pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); @@ -2640,11 +2651,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, in_iov = iov_page; out_iov = in_iov + in_iovs; - err = fuse_verify_ioctl_iov(in_iov, in_iovs); + err = fuse_verify_ioctl_iov(fc, in_iov, in_iovs); if (err) goto out; - err = fuse_verify_ioctl_iov(out_iov, out_iovs); + err = fuse_verify_ioctl_iov(fc, out_iov, out_iovs); if (err) goto out; @@ -2835,9 +2846,9 @@ static void fuse_do_truncate(struct file *file) fuse_do_setattr(file_dentry(file), &attr, file); } -static inline loff_t fuse_round_up(loff_t off) +static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off) { - return round_up(off, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); + return round_up(off, fc->max_pages << PAGE_SHIFT); } static ssize_t @@ -2866,7 +2877,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) { if (offset >= i_size) return 0; - iov_iter_truncate(iter, fuse_round_up(i_size - offset)); + iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset)); count = iov_iter_count(iter); } @@ -3011,6 +3022,82 @@ out: return err; } +static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, unsigned int flags) +{ + struct fuse_file *ff_in = file_in->private_data; + struct fuse_file *ff_out = file_out->private_data; + struct inode *inode_out = file_inode(file_out); + struct fuse_inode *fi_out = get_fuse_inode(inode_out); + struct fuse_conn *fc = ff_in->fc; + FUSE_ARGS(args); + struct fuse_copy_file_range_in inarg = { + .fh_in = ff_in->fh, + .off_in = pos_in, + .nodeid_out = ff_out->nodeid, + .fh_out = ff_out->fh, + .off_out = pos_out, + .len = len, + .flags = flags + }; + struct fuse_write_out outarg; + ssize_t err; + /* mark unstable when write-back is not used, and file_out gets + * extended */ + bool is_unstable = (!fc->writeback_cache) && + ((pos_out + len) > inode_out->i_size); + + if (fc->no_copy_file_range) + return -EOPNOTSUPP; + + inode_lock(inode_out); + + if (fc->writeback_cache) { + err = filemap_write_and_wait_range(inode_out->i_mapping, + pos_out, pos_out + len); + if (err) + goto out; + + fuse_sync_writes(inode_out); + } + + if (is_unstable) + set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); + + args.in.h.opcode = FUSE_COPY_FILE_RANGE; + args.in.h.nodeid = ff_in->nodeid; + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.out.numargs = 1; + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + err = fuse_simple_request(fc, &args); + if (err == -ENOSYS) { + fc->no_copy_file_range = 1; + err = -EOPNOTSUPP; + } + if (err) + goto out; + + if (fc->writeback_cache) { + fuse_write_update_size(inode_out, pos_out + outarg.size); + file_update_time(file_out); + } + + fuse_invalidate_attr(inode_out); + + err = outarg.size; +out: + if (is_unstable) + clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); + + inode_unlock(inode_out); + + return err; +} + static const struct file_operations fuse_file_operations = { .llseek = fuse_file_llseek, .read_iter = fuse_file_read_iter, @@ -3027,6 +3114,7 @@ static const struct file_operations fuse_file_operations = { .compat_ioctl = fuse_file_compat_ioctl, .poll = fuse_file_poll, .fallocate = fuse_file_fallocate, + .copy_file_range = fuse_copy_file_range, }; static const struct file_operations fuse_direct_io_file_operations = { @@ -3062,6 +3150,14 @@ static const struct address_space_operations fuse_file_aops = { void fuse_init_file_inode(struct inode *inode) { + struct fuse_inode *fi = get_fuse_inode(inode); + inode->i_fop = &fuse_file_operations; inode->i_data.a_ops = &fuse_file_aops; + + INIT_LIST_HEAD(&fi->write_files); + INIT_LIST_HEAD(&fi->queued_writes); + fi->writectr = 0; + init_waitqueue_head(&fi->page_waitq); + INIT_LIST_HEAD(&fi->writepages); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index f78e9614bb5f..e9f712e81c7d 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -28,8 +28,11 @@ #include <linux/refcount.h> #include <linux/user_namespace.h> -/** Max number of pages that can be used in a single read request */ -#define FUSE_MAX_PAGES_PER_REQ 32 +/** Default max number of pages that can be used in a single read request */ +#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 + +/** Maximum of max_pages received in init_out */ +#define FUSE_MAX_MAX_PAGES 256 /** Bias for fi->writectr, meaning new writepages must not be sent */ #define FUSE_NOWRITE INT_MIN @@ -77,6 +80,9 @@ struct fuse_inode { /** Time in jiffies until the file attributes are valid */ u64 i_time; + /* Which attributes are invalid */ + u32 inval_mask; + /** The sticky bit in inode->i_mode may have been removed, so preserve the original mode */ umode_t orig_i_mode; @@ -87,21 +93,51 @@ struct fuse_inode { /** Version of last attribute change */ u64 attr_version; - /** Files usable in writepage. Protected by fc->lock */ - struct list_head write_files; + union { + /* Write related fields (regular file only) */ + struct { + /* Files usable in writepage. Protected by fc->lock */ + struct list_head write_files; + + /* Writepages pending on truncate or fsync */ + struct list_head queued_writes; - /** Writepages pending on truncate or fsync */ - struct list_head queued_writes; + /* Number of sent writes, a negative bias + * (FUSE_NOWRITE) means more writes are blocked */ + int writectr; + + /* Waitq for writepage completion */ + wait_queue_head_t page_waitq; + + /* List of writepage requestst (pending or sent) */ + struct list_head writepages; + }; + + /* readdir cache (directory only) */ + struct { + /* true if fully cached */ + bool cached; - /** Number of sent writes, a negative bias (FUSE_NOWRITE) - * means more writes are blocked */ - int writectr; + /* size of cache */ + loff_t size; - /** Waitq for writepage completion */ - wait_queue_head_t page_waitq; + /* position at end of cache (position of next entry) */ + loff_t pos; - /** List of writepage requestst (pending or sent) */ - struct list_head writepages; + /* version of the cache */ + u64 version; + + /* modification time of directory when cache was + * started */ + struct timespec64 mtime; + + /* iversion of directory when cache was started */ + u64 iversion; + + /* protects above fields */ + spinlock_t lock; + } rdc; + }; /** Miscellaneous bits describing inode state */ unsigned long state; @@ -148,6 +184,25 @@ struct fuse_file { /** Entry on inode's write_files list */ struct list_head write_entry; + /* Readdir related */ + struct { + /* + * Protects below fields against (crazy) parallel readdir on + * same open file. Uncontended in the normal case. + */ + struct mutex lock; + + /* Dir stream position */ + loff_t pos; + + /* Offset in cache */ + loff_t cache_off; + + /* Version of cache we are reading */ + u64 version; + + } readdir; + /** RB node to be linked on fuse_conn->polled_files */ struct rb_node polled_node; @@ -311,9 +366,6 @@ struct fuse_req { /** refcount */ refcount_t count; - /** Unique ID for the interrupt request */ - u64 intr_unique; - /* Request flags, updated with test/set/clear_bit() */ unsigned long flags; @@ -411,6 +463,9 @@ struct fuse_iqueue { struct fasync_struct *fasync; }; +#define FUSE_PQ_HASH_BITS 8 +#define FUSE_PQ_HASH_SIZE (1 << FUSE_PQ_HASH_BITS) + struct fuse_pqueue { /** Connection established */ unsigned connected; @@ -418,8 +473,8 @@ struct fuse_pqueue { /** Lock protecting accessess to members of this structure */ spinlock_t lock; - /** The list of requests being processed */ - struct list_head processing; + /** Hash table of requests being processed */ + struct list_head *processing; /** The list of requests under I/O */ struct list_head io; @@ -476,6 +531,9 @@ struct fuse_conn { /** Maximum write size */ unsigned max_write; + /** Maxmum number of pages that can be used in a single request */ + unsigned int max_pages; + /** Input queue */ struct fuse_iqueue iq; @@ -500,6 +558,10 @@ struct fuse_conn { /** The list of background requests set aside for later queuing */ struct list_head bg_queue; + /** Protects: max_background, congestion_threshold, num_background, + * active_background, bg_queue, blocked */ + spinlock_t bg_lock; + /** Flag indicating that INIT reply has been received. Allocating * any fuse request will be suspended until the flag is set */ int initialized; @@ -551,6 +613,9 @@ struct fuse_conn { /** handle fs handles killing suid/sgid/cap on write/chown/trunc */ unsigned handle_killpriv:1; + /** cache READLINK responses in page cache */ + unsigned cache_symlinks:1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction @@ -637,6 +702,9 @@ struct fuse_conn { /** Allow other than the mounter user to access the filesystem ? */ unsigned allow_other:1; + /** Does the filesystem support copy_file_range? */ + unsigned no_copy_file_range:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -697,6 +765,11 @@ static inline u64 get_node_id(struct inode *inode) return get_fuse_inode(inode)->nodeid; } +static inline int invalid_nodeid(u64 nodeid) +{ + return !nodeid || nodeid == FUSE_ROOT_ID; +} + /** Device operations */ extern const struct file_operations fuse_dev_operations; @@ -812,6 +885,10 @@ struct fuse_req *fuse_request_alloc(unsigned npages); struct fuse_req *fuse_request_alloc_nofs(unsigned npages); +bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, + gfp_t flags); + + /** * Free a request */ @@ -856,9 +933,7 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args); * Send a request in the background */ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); - -void fuse_request_send_background_locked(struct fuse_conn *fc, - struct fuse_req *req); +bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req); /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort); @@ -873,6 +948,9 @@ void fuse_invalidate_entry_cache(struct dentry *entry); void fuse_invalidate_atime(struct inode *inode); +u64 entry_attr_timeout(struct fuse_entry_out *o); +void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o); + /** * Acquire reference to fuse_conn */ @@ -992,4 +1070,8 @@ struct posix_acl; struct posix_acl *fuse_get_acl(struct inode *inode, int type); int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type); + +/* readdir.c */ +int fuse_readdir(struct file *file, struct dir_context *ctx); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index db9e60b7eb69..0b94b23b02d4 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -90,16 +90,12 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi = get_fuse_inode(inode); fi->i_time = 0; + fi->inval_mask = 0; fi->nodeid = 0; fi->nlookup = 0; fi->attr_version = 0; - fi->writectr = 0; fi->orig_ino = 0; fi->state = 0; - INIT_LIST_HEAD(&fi->write_files); - INIT_LIST_HEAD(&fi->queued_writes); - INIT_LIST_HEAD(&fi->writepages); - init_waitqueue_head(&fi->page_waitq); mutex_init(&fi->mutex); fi->forget = fuse_alloc_forget(); if (!fi->forget) { @@ -119,8 +115,10 @@ static void fuse_i_callback(struct rcu_head *head) static void fuse_destroy_inode(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); - BUG_ON(!list_empty(&fi->write_files)); - BUG_ON(!list_empty(&fi->queued_writes)); + if (S_ISREG(inode->i_mode)) { + WARN_ON(!list_empty(&fi->write_files)); + WARN_ON(!list_empty(&fi->queued_writes)); + } mutex_destroy(&fi->mutex); kfree(fi->forget); call_rcu(&inode->i_rcu, fuse_i_callback); @@ -167,6 +165,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, fi->attr_version = ++fc->attr_version; fi->i_time = attr_valid; + WRITE_ONCE(fi->inval_mask, 0); inode->i_ino = fuse_squash_ino(attr->ino); inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); @@ -594,9 +593,11 @@ static void fuse_iqueue_init(struct fuse_iqueue *fiq) static void fuse_pqueue_init(struct fuse_pqueue *fpq) { - memset(fpq, 0, sizeof(struct fuse_pqueue)); + unsigned int i; + spin_lock_init(&fpq->lock); - INIT_LIST_HEAD(&fpq->processing); + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + INIT_LIST_HEAD(&fpq->processing[i]); INIT_LIST_HEAD(&fpq->io); fpq->connected = 1; } @@ -605,6 +606,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) { memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); + spin_lock_init(&fc->bg_lock); init_rwsem(&fc->killsb); refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); @@ -852,6 +854,7 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) sanitize_global_limit(&max_user_bgreq); sanitize_global_limit(&max_user_congthresh); + spin_lock(&fc->bg_lock); if (arg->max_background) { fc->max_background = arg->max_background; @@ -865,6 +868,7 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) fc->congestion_threshold > max_user_congthresh) fc->congestion_threshold = max_user_congthresh; } + spin_unlock(&fc->bg_lock); } static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) @@ -924,8 +928,15 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->posix_acl = 1; fc->sb->s_xattr = fuse_acl_xattr_handlers; } + if (arg->flags & FUSE_CACHE_SYMLINKS) + fc->cache_symlinks = 1; if (arg->flags & FUSE_ABORT_ERROR) fc->abort_err = 1; + if (arg->flags & FUSE_MAX_PAGES) { + fc->max_pages = + min_t(unsigned int, FUSE_MAX_MAX_PAGES, + max_t(unsigned int, arg->max_pages, 1)); + } } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -957,7 +968,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | - FUSE_ABORT_ERROR; + FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -1022,17 +1033,26 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc) { struct fuse_dev *fud; + struct list_head *pq; fud = kzalloc(sizeof(struct fuse_dev), GFP_KERNEL); - if (fud) { - fud->fc = fuse_conn_get(fc); - fuse_pqueue_init(&fud->pq); + if (!fud) + return NULL; - spin_lock(&fc->lock); - list_add_tail(&fud->entry, &fc->devices); - spin_unlock(&fc->lock); + pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); + if (!pq) { + kfree(fud); + return NULL; } + fud->pq.processing = pq; + fud->fc = fuse_conn_get(fc); + fuse_pqueue_init(&fud->pq); + + spin_lock(&fc->lock); + list_add_tail(&fud->entry, &fc->devices); + spin_unlock(&fc->lock); + return fud; } EXPORT_SYMBOL_GPL(fuse_dev_alloc); @@ -1141,6 +1161,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->user_id = d.user_id; fc->group_id = d.group_id; fc->max_read = max_t(unsigned, 4096, d.max_read); + fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; /* Used by get_root_inode() */ sb->s_fs_info = fc; diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c new file mode 100644 index 000000000000..ab18b78f4755 --- /dev/null +++ b/fs/fuse/readdir.c @@ -0,0 +1,569 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2018 Miklos Szeredi <miklos@szeredi.hu> + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + + +#include "fuse_i.h" +#include <linux/iversion.h> +#include <linux/posix_acl.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> + +static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) +{ + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_inode *fi = get_fuse_inode(dir); + + if (!fc->do_readdirplus) + return false; + if (!fc->readdirplus_auto) + return true; + if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) + return true; + if (ctx->pos == 0) + return true; + return false; +} + +static void fuse_add_dirent_to_cache(struct file *file, + struct fuse_dirent *dirent, loff_t pos) +{ + struct fuse_inode *fi = get_fuse_inode(file_inode(file)); + size_t reclen = FUSE_DIRENT_SIZE(dirent); + pgoff_t index; + struct page *page; + loff_t size; + u64 version; + unsigned int offset; + void *addr; + + spin_lock(&fi->rdc.lock); + /* + * Is cache already completed? Or this entry does not go at the end of + * cache? + */ + if (fi->rdc.cached || pos != fi->rdc.pos) { + spin_unlock(&fi->rdc.lock); + return; + } + version = fi->rdc.version; + size = fi->rdc.size; + offset = size & ~PAGE_MASK; + index = size >> PAGE_SHIFT; + /* Dirent doesn't fit in current page? Jump to next page. */ + if (offset + reclen > PAGE_SIZE) { + index++; + offset = 0; + } + spin_unlock(&fi->rdc.lock); + + if (offset) { + page = find_lock_page(file->f_mapping, index); + } else { + page = find_or_create_page(file->f_mapping, index, + mapping_gfp_mask(file->f_mapping)); + } + if (!page) + return; + + spin_lock(&fi->rdc.lock); + /* Raced with another readdir */ + if (fi->rdc.version != version || fi->rdc.size != size || + WARN_ON(fi->rdc.pos != pos)) + goto unlock; + + addr = kmap_atomic(page); + if (!offset) + clear_page(addr); + memcpy(addr + offset, dirent, reclen); + kunmap_atomic(addr); + fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen; + fi->rdc.pos = dirent->off; +unlock: + spin_unlock(&fi->rdc.lock); + unlock_page(page); + put_page(page); +} + +static void fuse_readdir_cache_end(struct file *file, loff_t pos) +{ + struct fuse_inode *fi = get_fuse_inode(file_inode(file)); + loff_t end; + + spin_lock(&fi->rdc.lock); + /* does cache end position match current position? */ + if (fi->rdc.pos != pos) { + spin_unlock(&fi->rdc.lock); + return; + } + + fi->rdc.cached = true; + end = ALIGN(fi->rdc.size, PAGE_SIZE); + spin_unlock(&fi->rdc.lock); + + /* truncate unused tail of cache */ + truncate_inode_pages(file->f_mapping, end); +} + +static bool fuse_emit(struct file *file, struct dir_context *ctx, + struct fuse_dirent *dirent) +{ + struct fuse_file *ff = file->private_data; + + if (ff->open_flags & FOPEN_CACHE_DIR) + fuse_add_dirent_to_cache(file, dirent, ctx->pos); + + return dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino, + dirent->type); +} + +static int parse_dirfile(char *buf, size_t nbytes, struct file *file, + struct dir_context *ctx) +{ + while (nbytes >= FUSE_NAME_OFFSET) { + struct fuse_dirent *dirent = (struct fuse_dirent *) buf; + size_t reclen = FUSE_DIRENT_SIZE(dirent); + if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) + return -EIO; + if (reclen > nbytes) + break; + if (memchr(dirent->name, '/', dirent->namelen) != NULL) + return -EIO; + + if (!fuse_emit(file, ctx, dirent)) + break; + + buf += reclen; + nbytes -= reclen; + ctx->pos = dirent->off; + } + + return 0; +} + +static int fuse_direntplus_link(struct file *file, + struct fuse_direntplus *direntplus, + u64 attr_version) +{ + struct fuse_entry_out *o = &direntplus->entry_out; + struct fuse_dirent *dirent = &direntplus->dirent; + struct dentry *parent = file->f_path.dentry; + struct qstr name = QSTR_INIT(dirent->name, dirent->namelen); + struct dentry *dentry; + struct dentry *alias; + struct inode *dir = d_inode(parent); + struct fuse_conn *fc; + struct inode *inode; + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + + if (!o->nodeid) { + /* + * Unlike in the case of fuse_lookup, zero nodeid does not mean + * ENOENT. Instead, it only means the userspace filesystem did + * not want to return attributes/handle for this entry. + * + * So do nothing. + */ + return 0; + } + + if (name.name[0] == '.') { + /* + * We could potentially refresh the attributes of the directory + * and its parent? + */ + if (name.len == 1) + return 0; + if (name.name[1] == '.' && name.len == 2) + return 0; + } + + if (invalid_nodeid(o->nodeid)) + return -EIO; + if (!fuse_valid_type(o->attr.mode)) + return -EIO; + + fc = get_fuse_conn(dir); + + name.hash = full_name_hash(parent, name.name, name.len); + dentry = d_lookup(parent, &name); + if (!dentry) { +retry: + dentry = d_alloc_parallel(parent, &name, &wq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + } + if (!d_in_lookup(dentry)) { + struct fuse_inode *fi; + inode = d_inode(dentry); + if (!inode || + get_node_id(inode) != o->nodeid || + ((o->attr.mode ^ inode->i_mode) & S_IFMT)) { + d_invalidate(dentry); + dput(dentry); + goto retry; + } + if (is_bad_inode(inode)) { + dput(dentry); + return -EIO; + } + + fi = get_fuse_inode(inode); + spin_lock(&fc->lock); + fi->nlookup++; + spin_unlock(&fc->lock); + + forget_all_cached_acls(inode); + fuse_change_attributes(inode, &o->attr, + entry_attr_timeout(o), + attr_version); + /* + * The other branch comes via fuse_iget() + * which bumps nlookup inside + */ + } else { + inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, + &o->attr, entry_attr_timeout(o), + attr_version); + if (!inode) + inode = ERR_PTR(-ENOMEM); + + alias = d_splice_alias(inode, dentry); + d_lookup_done(dentry); + if (alias) { + dput(dentry); + dentry = alias; + } + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + } + if (fc->readdirplus_auto) + set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); + fuse_change_entry_timeout(dentry, o); + + dput(dentry); + return 0; +} + +static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, + struct dir_context *ctx, u64 attr_version) +{ + struct fuse_direntplus *direntplus; + struct fuse_dirent *dirent; + size_t reclen; + int over = 0; + int ret; + + while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) { + direntplus = (struct fuse_direntplus *) buf; + dirent = &direntplus->dirent; + reclen = FUSE_DIRENTPLUS_SIZE(direntplus); + + if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) + return -EIO; + if (reclen > nbytes) + break; + if (memchr(dirent->name, '/', dirent->namelen) != NULL) + return -EIO; + + if (!over) { + /* We fill entries into dstbuf only as much as + it can hold. But we still continue iterating + over remaining entries to link them. If not, + we need to send a FORGET for each of those + which we did not link. + */ + over = !fuse_emit(file, ctx, dirent); + if (!over) + ctx->pos = dirent->off; + } + + buf += reclen; + nbytes -= reclen; + + ret = fuse_direntplus_link(file, direntplus, attr_version); + if (ret) + fuse_force_forget(file, direntplus->entry_out.nodeid); + } + + return 0; +} + +static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) +{ + int plus, err; + size_t nbytes; + struct page *page; + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + u64 attr_version = 0; + bool locked; + + req = fuse_get_req(fc, 1); + if (IS_ERR(req)) + return PTR_ERR(req); + + page = alloc_page(GFP_KERNEL); + if (!page) { + fuse_put_request(fc, req); + return -ENOMEM; + } + + plus = fuse_use_readdirplus(inode, ctx); + req->out.argpages = 1; + req->num_pages = 1; + req->pages[0] = page; + req->page_descs[0].length = PAGE_SIZE; + if (plus) { + attr_version = fuse_get_attr_version(fc); + fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, + FUSE_READDIRPLUS); + } else { + fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, + FUSE_READDIR); + } + locked = fuse_lock_inode(inode); + fuse_request_send(fc, req); + fuse_unlock_inode(inode, locked); + nbytes = req->out.args[0].size; + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) { + if (!nbytes) { + struct fuse_file *ff = file->private_data; + + if (ff->open_flags & FOPEN_CACHE_DIR) + fuse_readdir_cache_end(file, ctx->pos); + } else if (plus) { + err = parse_dirplusfile(page_address(page), nbytes, + file, ctx, attr_version); + } else { + err = parse_dirfile(page_address(page), nbytes, file, + ctx); + } + } + + __free_page(page); + fuse_invalidate_atime(inode); + return err; +} + +enum fuse_parse_result { + FOUND_ERR = -1, + FOUND_NONE = 0, + FOUND_SOME, + FOUND_ALL, +}; + +static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff, + void *addr, unsigned int size, + struct dir_context *ctx) +{ + unsigned int offset = ff->readdir.cache_off & ~PAGE_MASK; + enum fuse_parse_result res = FOUND_NONE; + + WARN_ON(offset >= size); + + for (;;) { + struct fuse_dirent *dirent = addr + offset; + unsigned int nbytes = size - offset; + size_t reclen = FUSE_DIRENT_SIZE(dirent); + + if (nbytes < FUSE_NAME_OFFSET || !dirent->namelen) + break; + + if (WARN_ON(dirent->namelen > FUSE_NAME_MAX)) + return FOUND_ERR; + if (WARN_ON(reclen > nbytes)) + return FOUND_ERR; + if (WARN_ON(memchr(dirent->name, '/', dirent->namelen) != NULL)) + return FOUND_ERR; + + if (ff->readdir.pos == ctx->pos) { + res = FOUND_SOME; + if (!dir_emit(ctx, dirent->name, dirent->namelen, + dirent->ino, dirent->type)) + return FOUND_ALL; + ctx->pos = dirent->off; + } + ff->readdir.pos = dirent->off; + ff->readdir.cache_off += reclen; + + offset += reclen; + } + + return res; +} + +static void fuse_rdc_reset(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + fi->rdc.cached = false; + fi->rdc.version++; + fi->rdc.size = 0; + fi->rdc.pos = 0; +} + +#define UNCACHED 1 + +static int fuse_readdir_cached(struct file *file, struct dir_context *ctx) +{ + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + enum fuse_parse_result res; + pgoff_t index; + unsigned int size; + struct page *page; + void *addr; + + /* Seeked? If so, reset the cache stream */ + if (ff->readdir.pos != ctx->pos) { + ff->readdir.pos = 0; + ff->readdir.cache_off = 0; + } + + /* + * We're just about to start reading into the cache or reading the + * cache; both cases require an up-to-date mtime value. + */ + if (!ctx->pos && fc->auto_inval_data) { + int err = fuse_update_attributes(inode, file); + + if (err) + return err; + } + +retry: + spin_lock(&fi->rdc.lock); +retry_locked: + if (!fi->rdc.cached) { + /* Starting cache? Set cache mtime. */ + if (!ctx->pos && !fi->rdc.size) { + fi->rdc.mtime = inode->i_mtime; + fi->rdc.iversion = inode_query_iversion(inode); + } + spin_unlock(&fi->rdc.lock); + return UNCACHED; + } + /* + * When at the beginning of the directory (i.e. just after opendir(3) or + * rewinddir(3)), then need to check whether directory contents have + * changed, and reset the cache if so. + */ + if (!ctx->pos) { + if (inode_peek_iversion(inode) != fi->rdc.iversion || + !timespec64_equal(&fi->rdc.mtime, &inode->i_mtime)) { + fuse_rdc_reset(inode); + goto retry_locked; + } + } + + /* + * If cache version changed since the last getdents() call, then reset + * the cache stream. + */ + if (ff->readdir.version != fi->rdc.version) { + ff->readdir.pos = 0; + ff->readdir.cache_off = 0; + } + /* + * If at the beginning of the cache, than reset version to + * current. + */ + if (ff->readdir.pos == 0) + ff->readdir.version = fi->rdc.version; + + WARN_ON(fi->rdc.size < ff->readdir.cache_off); + + index = ff->readdir.cache_off >> PAGE_SHIFT; + + if (index == (fi->rdc.size >> PAGE_SHIFT)) + size = fi->rdc.size & ~PAGE_MASK; + else + size = PAGE_SIZE; + spin_unlock(&fi->rdc.lock); + + /* EOF? */ + if ((ff->readdir.cache_off & ~PAGE_MASK) == size) + return 0; + + page = find_get_page_flags(file->f_mapping, index, + FGP_ACCESSED | FGP_LOCK); + spin_lock(&fi->rdc.lock); + if (!page) { + /* + * Uh-oh: page gone missing, cache is useless + */ + if (fi->rdc.version == ff->readdir.version) + fuse_rdc_reset(inode); + goto retry_locked; + } + + /* Make sure it's still the same version after getting the page. */ + if (ff->readdir.version != fi->rdc.version) { + spin_unlock(&fi->rdc.lock); + unlock_page(page); + put_page(page); + goto retry; + } + spin_unlock(&fi->rdc.lock); + + /* + * Contents of the page are now protected against changing by holding + * the page lock. + */ + addr = kmap(page); + res = fuse_parse_cache(ff, addr, size, ctx); + kunmap(page); + unlock_page(page); + put_page(page); + + if (res == FOUND_ERR) + return -EIO; + + if (res == FOUND_ALL) + return 0; + + if (size == PAGE_SIZE) { + /* We hit end of page: skip to next page. */ + ff->readdir.cache_off = ALIGN(ff->readdir.cache_off, PAGE_SIZE); + goto retry; + } + + /* + * End of cache reached. If found position, then we are done, otherwise + * need to fall back to uncached, since the position we were looking for + * wasn't in the cache. + */ + return res == FOUND_SOME ? 0 : UNCACHED; +} + +int fuse_readdir(struct file *file, struct dir_context *ctx) +{ + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + int err; + + if (is_bad_inode(inode)) + return -EIO; + + mutex_lock(&ff->readdir.lock); + + err = UNCACHED; + if (ff->open_flags & FOPEN_CACHE_DIR) + err = fuse_readdir_cached(file, ctx); + if (err == UNCACHED) + err = fuse_readdir_uncached(file, ctx); + + mutex_unlock(&ff->readdir.lock); + + return err; +} diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 31e8270d0b26..8afbb35559b9 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -366,7 +366,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, pgoff_t done_index; int cycled; int range_whole = 0; - int tag; + xa_mark_t tag; pagevec_init(&pvec); if (wbc->range_cyclic) { diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c index 9a8772465a90..896396554bcc 100644 --- a/fs/hfs/brec.c +++ b/fs/hfs/brec.c @@ -425,6 +425,10 @@ skip: if (new_node) { __be32 cnid; + if (!new_node->parent) { + hfs_btree_inc_height(tree); + new_node->parent = tree->root; + } fd->bnode = hfs_bnode_find(tree, new_node->parent); /* create index key and entry */ hfs_bnode_read_key(new_node, fd->search_key, 14); @@ -441,6 +445,7 @@ skip: /* restore search_key */ hfs_bnode_read_key(node, fd->search_key, 14); } + new_node = NULL; } if (!rec && node->parent) diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 374b5688e29e..98b96ffb95ed 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -220,25 +220,17 @@ static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx) return node; } -struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) +/* Make sure @tree has enough space for the @rsvd_nodes */ +int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes) { - struct hfs_bnode *node, *next_node; - struct page **pagep; - u32 nidx, idx; - unsigned off; - u16 off16; - u16 len; - u8 *data, byte, m; - int i; - - while (!tree->free_nodes) { - struct inode *inode = tree->inode; - u32 count; - int res; + struct inode *inode = tree->inode; + u32 count; + int res; + while (tree->free_nodes < rsvd_nodes) { res = hfs_extend_file(inode); if (res) - return ERR_PTR(res); + return res; HFS_I(inode)->phys_size = inode->i_size = (loff_t)HFS_I(inode)->alloc_blocks * HFS_SB(tree->sb)->alloc_blksz; @@ -246,9 +238,26 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) tree->sb->s_blocksize_bits; inode_set_bytes(inode, inode->i_size); count = inode->i_size >> tree->node_size_shift; - tree->free_nodes = count - tree->node_count; + tree->free_nodes += count - tree->node_count; tree->node_count = count; } + return 0; +} + +struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) +{ + struct hfs_bnode *node, *next_node; + struct page **pagep; + u32 nidx, idx; + unsigned off; + u16 off16; + u16 len; + u8 *data, byte, m; + int i, res; + + res = hfs_bmap_reserve(tree, 1); + if (res) + return ERR_PTR(res); nidx = 0; node = hfs_bnode_find(tree, nidx); diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h index c8b252dbb26c..dcc2aab1b2c4 100644 --- a/fs/hfs/btree.h +++ b/fs/hfs/btree.h @@ -82,6 +82,7 @@ struct hfs_find_data { extern struct hfs_btree *hfs_btree_open(struct super_block *, u32, btree_keycmp); extern void hfs_btree_close(struct hfs_btree *); extern void hfs_btree_write(struct hfs_btree *); +extern int hfs_bmap_reserve(struct hfs_btree *, int); extern struct hfs_bnode * hfs_bmap_alloc(struct hfs_btree *); extern void hfs_bmap_free(struct hfs_bnode *node); diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index 8a66405b0f8b..d365bf0b8c77 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -97,6 +97,14 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i if (err) return err; + /* + * Fail early and avoid ENOSPC during the btree operations. We may + * have to split the root node at most once. + */ + err = hfs_bmap_reserve(fd.tree, 2 * fd.tree->depth); + if (err) + goto err2; + hfs_cat_build_key(sb, fd.search_key, cnid, NULL); entry_size = hfs_cat_build_thread(sb, &entry, S_ISDIR(inode->i_mode) ? HFS_CDR_THD : HFS_CDR_FTH, @@ -295,6 +303,14 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, return err; dst_fd = src_fd; + /* + * Fail early and avoid ENOSPC during the btree operations. We may + * have to split the root node at most once. + */ + err = hfs_bmap_reserve(src_fd.tree, 2 * src_fd.tree->depth); + if (err) + goto out; + /* find the old dir entry and read the data */ hfs_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); err = hfs_brec_find(&src_fd); diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index 5d0182654580..263d5028d9d1 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -117,6 +117,10 @@ static int __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) if (HFS_I(inode)->flags & HFS_FLG_EXT_NEW) { if (res != -ENOENT) return res; + /* Fail early and avoid ENOSPC during the btree operation */ + res = hfs_bmap_reserve(fd->tree, fd->tree->depth + 1); + if (res) + return res; hfs_brec_insert(fd, HFS_I(inode)->cached_extents, sizeof(hfs_extent_rec)); HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW); } else { @@ -300,7 +304,7 @@ int hfs_free_fork(struct super_block *sb, struct hfs_cat_file *file, int type) return 0; blocks = 0; - for (i = 0; i < 3; extent++, i++) + for (i = 0; i < 3; i++) blocks += be16_to_cpu(extent[i].count); res = hfs_free_extents(sb, extent, blocks, blocks); @@ -341,7 +345,9 @@ int hfs_get_block(struct inode *inode, sector_t block, ablock = (u32)block / HFS_SB(sb)->fs_div; if (block >= HFS_I(inode)->fs_blocks) { - if (block > HFS_I(inode)->fs_blocks || !create) + if (!create) + return 0; + if (block > HFS_I(inode)->fs_blocks) return -EIO; if (ablock >= HFS_I(inode)->alloc_blocks) { res = hfs_extend_file(inode); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index a2dfa1b2a89c..da243c84e93b 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -642,6 +642,8 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr) truncate_setsize(inode, attr->ia_size); hfs_file_truncate(inode); + inode->i_atime = inode->i_mtime = inode->i_ctime = + current_time(inode); } setattr_copy(inode, attr); diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c index 2bab6b3cdba4..e6d554476db4 100644 --- a/fs/hfsplus/attributes.c +++ b/fs/hfsplus/attributes.c @@ -217,6 +217,11 @@ int hfsplus_create_attr(struct inode *inode, if (err) goto failed_init_create_attr; + /* Fail early and avoid ENOSPC during the btree operation */ + err = hfs_bmap_reserve(fd.tree, fd.tree->depth + 1); + if (err) + goto failed_create_attr; + if (name) { err = hfsplus_attr_build_key(sb, fd.search_key, inode->i_ino, name); @@ -313,6 +318,11 @@ int hfsplus_delete_attr(struct inode *inode, const char *name) if (err) return err; + /* Fail early and avoid ENOSPC during the btree operation */ + err = hfs_bmap_reserve(fd.tree, fd.tree->depth); + if (err) + goto out; + if (name) { err = hfsplus_attr_build_key(sb, fd.search_key, inode->i_ino, name); diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index ed8eacb34452..1918544a7871 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -429,6 +429,10 @@ skip: if (new_node) { __be32 cnid; + if (!new_node->parent) { + hfs_btree_inc_height(tree); + new_node->parent = tree->root; + } fd->bnode = hfs_bnode_find(tree, new_node->parent); /* create index key and entry */ hfs_bnode_read_key(new_node, fd->search_key, 14); @@ -445,6 +449,7 @@ skip: /* restore search_key */ hfs_bnode_read_key(node, fd->search_key, 14); } + new_node = NULL; } if (!rec && node->parent) diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index de14b2b6881b..236efe51eca6 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -342,26 +342,21 @@ static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx) return node; } -struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) +/* Make sure @tree has enough space for the @rsvd_nodes */ +int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes) { - struct hfs_bnode *node, *next_node; - struct page **pagep; - u32 nidx, idx; - unsigned off; - u16 off16; - u16 len; - u8 *data, byte, m; - int i; + struct inode *inode = tree->inode; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + u32 count; + int res; - while (!tree->free_nodes) { - struct inode *inode = tree->inode; - struct hfsplus_inode_info *hip = HFSPLUS_I(inode); - u32 count; - int res; + if (rsvd_nodes <= 0) + return 0; + while (tree->free_nodes < rsvd_nodes) { res = hfsplus_file_extend(inode, hfs_bnode_need_zeroout(tree)); if (res) - return ERR_PTR(res); + return res; hip->phys_size = inode->i_size = (loff_t)hip->alloc_blocks << HFSPLUS_SB(tree->sb)->alloc_blksz_shift; @@ -369,9 +364,26 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift; inode_set_bytes(inode, inode->i_size); count = inode->i_size >> tree->node_size_shift; - tree->free_nodes = count - tree->node_count; + tree->free_nodes += count - tree->node_count; tree->node_count = count; } + return 0; +} + +struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) +{ + struct hfs_bnode *node, *next_node; + struct page **pagep; + u32 nidx, idx; + unsigned off; + u16 off16; + u16 len; + u8 *data, byte, m; + int i, res; + + res = hfs_bmap_reserve(tree, 1); + if (res) + return ERR_PTR(res); nidx = 0; node = hfs_bnode_find(tree, nidx); diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index a196369ba779..35472cba750e 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -265,6 +265,14 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, if (err) return err; + /* + * Fail early and avoid ENOSPC during the btree operations. We may + * have to split the root node at most once. + */ + err = hfs_bmap_reserve(fd.tree, 2 * fd.tree->depth); + if (err) + goto err2; + hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid); entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ? @@ -333,6 +341,14 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str) if (err) return err; + /* + * Fail early and avoid ENOSPC during the btree operations. We may + * have to split the root node at most once. + */ + err = hfs_bmap_reserve(fd.tree, 2 * (int)fd.tree->depth - 2); + if (err) + goto out; + if (!str) { int len; @@ -433,6 +449,14 @@ int hfsplus_rename_cat(u32 cnid, return err; dst_fd = src_fd; + /* + * Fail early and avoid ENOSPC during the btree operations. We may + * have to split the root node at most twice. + */ + err = hfs_bmap_reserve(src_fd.tree, 4 * (int)src_fd.tree->depth - 1); + if (err) + goto out; + /* find the old dir entry and read the data */ err = hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 8e0f59767694..a930ddd15681 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -100,6 +100,10 @@ static int __hfsplus_ext_write_extent(struct inode *inode, if (hip->extent_state & HFSPLUS_EXT_NEW) { if (res != -ENOENT) return res; + /* Fail early and avoid ENOSPC during the btree operation */ + res = hfs_bmap_reserve(fd->tree, fd->tree->depth + 1); + if (res) + return res; hfs_brec_insert(fd, hip->cached_extents, sizeof(hfsplus_extent_rec)); hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); @@ -233,7 +237,9 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, ablock = iblock >> sbi->fs_shift; if (iblock >= hip->fs_blocks) { - if (iblock > hip->fs_blocks || !create) + if (!create) + return 0; + if (iblock > hip->fs_blocks) return -EIO; if (ablock >= hip->alloc_blocks) { res = hfsplus_file_extend(inode, false); diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 8e039435958a..dd7ad9f13e3a 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -311,6 +311,7 @@ static inline unsigned short hfsplus_min_io_size(struct super_block *sb) #define hfs_btree_open hfsplus_btree_open #define hfs_btree_close hfsplus_btree_close #define hfs_btree_write hfsplus_btree_write +#define hfs_bmap_reserve hfsplus_bmap_reserve #define hfs_bmap_alloc hfsplus_bmap_alloc #define hfs_bmap_free hfsplus_bmap_free #define hfs_bnode_read hfsplus_bnode_read @@ -395,6 +396,7 @@ u32 hfsplus_calc_btree_clump_size(u32 block_size, u32 node_size, u64 sectors, struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id); void hfs_btree_close(struct hfs_btree *tree); int hfs_btree_write(struct hfs_btree *tree); +int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes); struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree); void hfs_bmap_free(struct hfs_bnode *node); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 8e9427a42b81..d7ab9d8c4b67 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -261,6 +261,7 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) } truncate_setsize(inode, attr->ia_size); hfsplus_file_truncate(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); } setattr_copy(inode, attr); diff --git a/fs/inode.c b/fs/inode.c index 42f6d25f32a5..9e198f00b64c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -10,7 +10,7 @@ #include <linux/swap.h> #include <linux/security.h> #include <linux/cdev.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/fsnotify.h> #include <linux/mount.h> #include <linux/posix_acl.h> @@ -349,7 +349,7 @@ EXPORT_SYMBOL(inc_nlink); static void __address_space_init_once(struct address_space *mapping) { - INIT_RADIX_TREE(&mapping->i_pages, GFP_ATOMIC | __GFP_ACCOUNT); + xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ); init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); diff --git a/fs/ioctl.c b/fs/ioctl.c index 2005529af560..d64f622cac8b 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -223,6 +223,7 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, u64 off, u64 olen, u64 destoff) { struct fd src_file = fdget(srcfd); + loff_t cloned; int ret; if (!src_file.file) @@ -230,7 +231,14 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, ret = -EXDEV; if (src_file.file->f_path.mnt != dst_file->f_path.mnt) goto fdput; - ret = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen); + cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff, + olen, 0); + if (cloned < 0) + ret = cloned; + else if (olen && cloned != olen) + ret = -EINVAL; + else + ret = 0; fdput: fdput(src_file); return ret; @@ -669,6 +677,9 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, return ioctl_fiemap(filp, arg); case FIGETBSZ: + /* anon_bdev filesystems may not have a block size */ + if (!inode->i_sb->s_blocksize) + return -EINVAL; return put_user(inode->i_sb->s_blocksize, argp); case FICLONE: diff --git a/fs/iomap.c b/fs/iomap.c index ec15cf2ec696..64ce240217a1 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -30,7 +30,6 @@ #include <linux/task_io_accounting_ops.h> #include <linux/dax.h> #include <linux/sched/signal.h> -#include <linux/swap.h> #include "internal.h" @@ -1057,7 +1056,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, return length; } -int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); @@ -1795,7 +1794,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (pos >= dio->i_size) goto out_free_dio; - if (iter->type == ITER_IOVEC) + if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ) dio->flags |= IOMAP_DIO_DIRTY; } else { flags |= IOMAP_WRITE; diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index 947ce22f5b3c..f0fe641893a5 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -46,7 +46,7 @@ int isofs_name_translate(struct iso_directory_record *de, char *new, struct inod return i; } -/* Acorn extensions written by Matthew Wilcox <willy@bofh.ai> 1998 */ +/* Acorn extensions written by Matthew Wilcox <willy@infradead.org> 1998 */ int get_acorn_filename(struct iso_directory_record *de, char *retname, struct inode *inode) { diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index ff2716f9322e..fdf527b6d79c 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -236,6 +236,9 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) sb->s_export_op = &kernfs_export_ops; sb->s_time_gran = 1; + /* sysfs dentries and inodes don't require IO to create */ + sb->s_shrink.seeks = 0; + /* get root inode, initialize and unlock it */ mutex_lock(&kernfs_mutex); inode = kernfs_get_inode(sb, info->root->kn); diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 305b220af45d..162f43b80c84 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -72,6 +72,9 @@ static int kernfs_get_target_path(struct kernfs_node *parent, if (base == kn) break; + if ((s - path) + 3 >= PATH_MAX) + return -ENAMETOOLONG; + strcpy(s, "../"); s += 3; base = base->parent; @@ -88,7 +91,7 @@ static int kernfs_get_target_path(struct kernfs_node *parent, if (len < 2) return -EINVAL; len--; - if ((s - path) + len > PATH_MAX) + if ((s - path) + len >= PATH_MAX) return -ENAMETOOLONG; /* reverse fillup of target string from target to base */ diff --git a/fs/lockd/host.c b/fs/lockd/host.c index d35cd6be0675..93fb7cf0b92b 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -341,7 +341,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, }; struct lockd_net *ln = net_generic(net, lockd_net_id); - dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, + dprintk("lockd: %s(host='%.*s', vers=%u, proto=%s)\n", __func__, (int)hostname_len, hostname, rqstp->rq_vers, (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp")); diff --git a/fs/namespace.c b/fs/namespace.c index d86830c86ce8..98d27da43304 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -23,7 +23,7 @@ #include <linux/uaccess.h> #include <linux/proc_ns.h> #include <linux/magic.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/task_work.h> #include <linux/sched/task.h> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 06cb0c1d9aee..d3781cd983f6 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -896,7 +896,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx) end = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (end != inode->i_mapping->nrpages) { rcu_read_lock(); - end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX); + end = page_cache_next_miss(mapping, idx + 1, ULONG_MAX); rcu_read_unlock(); } diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index f033f3a69a3b..07b839560576 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -93,7 +93,7 @@ int nfs4_check_delegation(struct inode *inode, fmode_t flags) return nfs4_do_check_delegation(inode, flags, false); } -static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) +static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_stateid *stateid) { struct inode *inode = state->inode; struct file_lock *fl; @@ -108,7 +108,7 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_ spin_lock(&flctx->flc_lock); restart: list_for_each_entry(fl, list, fl_list) { - if (nfs_file_open_context(fl->fl_file) != ctx) + if (nfs_file_open_context(fl->fl_file)->state != state) continue; spin_unlock(&flctx->flc_lock); status = nfs4_lock_delegation_recall(fl, state, stateid); @@ -136,8 +136,8 @@ static int nfs_delegation_claim_opens(struct inode *inode, int err; again: - spin_lock(&inode->i_lock); - list_for_each_entry(ctx, &nfsi->open_files, list) { + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { state = ctx->state; if (state == NULL) continue; @@ -147,15 +147,16 @@ again: continue; if (!nfs4_stateid_match(&state->stateid, stateid)) continue; - get_nfs_open_context(ctx); - spin_unlock(&inode->i_lock); + if (!get_nfs_open_context(ctx)) + continue; + rcu_read_unlock(); sp = state->owner; /* Block nfs4_proc_unlck */ mutex_lock(&sp->so_delegreturn_mutex); seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); err = nfs4_open_delegation_recall(ctx, state, stateid, type); if (!err) - err = nfs_delegation_claim_locks(ctx, state, stateid); + err = nfs_delegation_claim_locks(state, stateid); if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) err = -EAGAIN; mutex_unlock(&sp->so_delegreturn_mutex); @@ -164,7 +165,7 @@ again: return err; goto again; } - spin_unlock(&inode->i_lock); + rcu_read_unlock(); return 0; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 8bfaa658b2c1..71b2e390becf 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1072,6 +1072,100 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU); } +static int +nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry, + struct inode *inode, int error) +{ + switch (error) { + case 1: + dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n", + __func__, dentry); + return 1; + case 0: + nfs_mark_for_revalidate(dir); + if (inode && S_ISDIR(inode->i_mode)) { + /* Purge readdir caches. */ + nfs_zap_caches(inode); + /* + * We can't d_drop the root of a disconnected tree: + * its d_hash is on the s_anon list and d_drop() would hide + * it from shrink_dcache_for_unmount(), leading to busy + * inodes on unmount and further oopses. + */ + if (IS_ROOT(dentry)) + return 1; + } + dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n", + __func__, dentry); + return 0; + } + dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n", + __func__, dentry, error); + return error; +} + +static int +nfs_lookup_revalidate_negative(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + int ret = 1; + if (nfs_neg_need_reval(dir, dentry, flags)) { + if (flags & LOOKUP_RCU) + return -ECHILD; + ret = 0; + } + return nfs_lookup_revalidate_done(dir, dentry, NULL, ret); +} + +static int +nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + return nfs_lookup_revalidate_done(dir, dentry, inode, 1); +} + +static int +nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + struct nfs_fh *fhandle; + struct nfs_fattr *fattr; + struct nfs4_label *label; + int ret; + + ret = -ENOMEM; + fhandle = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); + if (fhandle == NULL || fattr == NULL || IS_ERR(label)) + goto out; + + ret = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); + if (ret < 0) { + if (ret == -ESTALE || ret == -ENOENT) + ret = 0; + goto out; + } + ret = 0; + if (nfs_compare_fh(NFS_FH(inode), fhandle)) + goto out; + if (nfs_refresh_inode(inode, fattr) < 0) + goto out; + + nfs_setsecurity(inode, fattr, label); + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + + /* set a readdirplus hint that we had a cache miss */ + nfs_force_use_readdirplus(dir); + ret = 1; +out: + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); + nfs4_label_free(label); + return nfs_lookup_revalidate_done(dir, dentry, inode, ret); +} + /* * This is called every time the dcache has a lookup hit, * and we should check whether we can really trust that @@ -1083,58 +1177,36 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, * If the parent directory is seen to have changed, we throw out the * cached dentry and do a new lookup. */ -static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) +static int +nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, + unsigned int flags) { - struct inode *dir; struct inode *inode; - struct dentry *parent; - struct nfs_fh *fhandle = NULL; - struct nfs_fattr *fattr = NULL; - struct nfs4_label *label = NULL; int error; - if (flags & LOOKUP_RCU) { - parent = READ_ONCE(dentry->d_parent); - dir = d_inode_rcu(parent); - if (!dir) - return -ECHILD; - } else { - parent = dget_parent(dentry); - dir = d_inode(parent); - } nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); inode = d_inode(dentry); - if (!inode) { - if (nfs_neg_need_reval(dir, dentry, flags)) { - if (flags & LOOKUP_RCU) - return -ECHILD; - goto out_bad; - } - goto out_valid; - } + if (!inode) + return nfs_lookup_revalidate_negative(dir, dentry, flags); if (is_bad_inode(inode)) { - if (flags & LOOKUP_RCU) - return -ECHILD; dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n", __func__, dentry); goto out_bad; } if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ)) - goto out_set_verifier; + return nfs_lookup_revalidate_delegated(dir, dentry, inode); /* Force a full look up iff the parent directory has changed */ if (!(flags & (LOOKUP_EXCL | LOOKUP_REVAL)) && nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) { error = nfs_lookup_verify_inode(inode, flags); if (error) { - if (flags & LOOKUP_RCU) - return -ECHILD; if (error == -ESTALE) - goto out_zap_parent; - goto out_error; + nfs_zap_caches(dir); + goto out_bad; } nfs_advise_use_readdirplus(dir); goto out_valid; @@ -1146,81 +1218,45 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) if (NFS_STALE(inode)) goto out_bad; - error = -ENOMEM; - fhandle = nfs_alloc_fhandle(); - fattr = nfs_alloc_fattr(); - if (fhandle == NULL || fattr == NULL) - goto out_error; - - label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); - if (IS_ERR(label)) - goto out_error; - trace_nfs_lookup_revalidate_enter(dir, dentry, flags); - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); + error = nfs_lookup_revalidate_dentry(dir, dentry, inode); trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error); - if (error == -ESTALE || error == -ENOENT) - goto out_bad; - if (error) - goto out_error; - if (nfs_compare_fh(NFS_FH(inode), fhandle)) - goto out_bad; - if ((error = nfs_refresh_inode(inode, fattr)) != 0) - goto out_bad; - - nfs_setsecurity(inode, fattr, label); - - nfs_free_fattr(fattr); - nfs_free_fhandle(fhandle); - nfs4_label_free(label); + return error; +out_valid: + return nfs_lookup_revalidate_done(dir, dentry, inode, 1); +out_bad: + if (flags & LOOKUP_RCU) + return -ECHILD; + return nfs_lookup_revalidate_done(dir, dentry, inode, 0); +} - /* set a readdirplus hint that we had a cache miss */ - nfs_force_use_readdirplus(dir); +static int +__nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags, + int (*reval)(struct inode *, struct dentry *, unsigned int)) +{ + struct dentry *parent; + struct inode *dir; + int ret; -out_set_verifier: - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - out_valid: if (flags & LOOKUP_RCU) { + parent = READ_ONCE(dentry->d_parent); + dir = d_inode_rcu(parent); + if (!dir) + return -ECHILD; + ret = reval(dir, dentry, flags); if (parent != READ_ONCE(dentry->d_parent)) return -ECHILD; - } else + } else { + parent = dget_parent(dentry); + ret = reval(d_inode(parent), dentry, flags); dput(parent); - dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n", - __func__, dentry); - return 1; -out_zap_parent: - nfs_zap_caches(dir); - out_bad: - WARN_ON(flags & LOOKUP_RCU); - nfs_free_fattr(fattr); - nfs_free_fhandle(fhandle); - nfs4_label_free(label); - nfs_mark_for_revalidate(dir); - if (inode && S_ISDIR(inode->i_mode)) { - /* Purge readdir caches. */ - nfs_zap_caches(inode); - /* - * We can't d_drop the root of a disconnected tree: - * its d_hash is on the s_anon list and d_drop() would hide - * it from shrink_dcache_for_unmount(), leading to busy - * inodes on unmount and further oopses. - */ - if (IS_ROOT(dentry)) - goto out_valid; } - dput(parent); - dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n", - __func__, dentry); - return 0; -out_error: - WARN_ON(flags & LOOKUP_RCU); - nfs_free_fattr(fattr); - nfs_free_fhandle(fhandle); - nfs4_label_free(label); - dput(parent); - dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n", - __func__, dentry, error); - return error; + return ret; +} + +static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) +{ + return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate); } /* @@ -1579,62 +1615,55 @@ no_open: } EXPORT_SYMBOL_GPL(nfs_atomic_open); -static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) +static int +nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, + unsigned int flags) { struct inode *inode; - int ret = 0; if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY)) - goto no_open; + goto full_reval; if (d_mountpoint(dentry)) - goto no_open; - if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1) - goto no_open; + goto full_reval; inode = d_inode(dentry); /* We can't create new files in nfs_open_revalidate(), so we * optimize away revalidation of negative dentries. */ - if (inode == NULL) { - struct dentry *parent; - struct inode *dir; - - if (flags & LOOKUP_RCU) { - parent = READ_ONCE(dentry->d_parent); - dir = d_inode_rcu(parent); - if (!dir) - return -ECHILD; - } else { - parent = dget_parent(dentry); - dir = d_inode(parent); - } - if (!nfs_neg_need_reval(dir, dentry, flags)) - ret = 1; - else if (flags & LOOKUP_RCU) - ret = -ECHILD; - if (!(flags & LOOKUP_RCU)) - dput(parent); - else if (parent != READ_ONCE(dentry->d_parent)) - return -ECHILD; - goto out; - } + if (inode == NULL) + goto full_reval; + + if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ)) + return nfs_lookup_revalidate_delegated(dir, dentry, inode); /* NFS only supports OPEN on regular files */ if (!S_ISREG(inode->i_mode)) - goto no_open; + goto full_reval; + /* We cannot do exclusive creation on a positive dentry */ - if (flags & LOOKUP_EXCL) - goto no_open; + if (flags & (LOOKUP_EXCL | LOOKUP_REVAL)) + goto reval_dentry; + + /* Check if the directory changed */ + if (!nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) + goto reval_dentry; /* Let f_op->open() actually open (and revalidate) the file */ - ret = 1; + return 1; +reval_dentry: + if (flags & LOOKUP_RCU) + return -ECHILD; + return nfs_lookup_revalidate_dentry(dir, dentry, inode);; -out: - return ret; +full_reval: + return nfs_do_lookup_revalidate(dir, dentry, flags); +} -no_open: - return nfs_lookup_revalidate(dentry, flags); +static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) +{ + return __nfs_lookup_revalidate(dentry, flags, + nfs4_do_lookup_revalidate); } #endif /* CONFIG_NFSV4 */ diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index 060c658eab66..a7d3df85736d 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -65,6 +65,7 @@ struct nfs_dns_ent { struct sockaddr_storage addr; size_t addrlen; + struct rcu_head rcu_head; }; @@ -101,15 +102,23 @@ static void nfs_dns_ent_init(struct cache_head *cnew, } } -static void nfs_dns_ent_put(struct kref *ref) +static void nfs_dns_ent_free_rcu(struct rcu_head *head) { struct nfs_dns_ent *item; - item = container_of(ref, struct nfs_dns_ent, h.ref); + item = container_of(head, struct nfs_dns_ent, rcu_head); kfree(item->hostname); kfree(item); } +static void nfs_dns_ent_put(struct kref *ref) +{ + struct nfs_dns_ent *item; + + item = container_of(ref, struct nfs_dns_ent, h.ref); + call_rcu(&item->rcu_head, nfs_dns_ent_free_rcu); +} + static struct cache_head *nfs_dns_ent_alloc(void) { struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL); @@ -195,7 +204,7 @@ static struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd, { struct cache_head *ch; - ch = sunrpc_cache_lookup(cd, + ch = sunrpc_cache_lookup_rcu(cd, &key->h, nfs_dns_hash(key)); if (!ch) diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index d175724ff566..61f46facb39c 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -1164,6 +1164,7 @@ static struct pnfs_layoutdriver_type filelayout_type = { .id = LAYOUT_NFSV4_1_FILES, .name = "LAYOUT_NFSV4_1_FILES", .owner = THIS_MODULE, + .max_layoutget_response = 4096, /* 1 page or so... */ .alloc_layout_hdr = filelayout_alloc_layout_hdr, .free_layout_hdr = filelayout_free_layout_hdr, .alloc_lseg = filelayout_alloc_lseg, diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index cae43333ef16..86bcba40ca61 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -2356,6 +2356,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .name = "LAYOUT_FLEX_FILES", .owner = THIS_MODULE, .flags = PNFS_LAYOUTGET_ON_OPEN, + .max_layoutget_response = 4096, /* 1 page or so... */ .set_layoutdriver = ff_layout_set_layoutdriver, .alloc_layout_hdr = ff_layout_alloc_layout_hdr, .free_layout_hdr = ff_layout_free_layout_hdr, diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 59aa04976331..74d8d5352438 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -453,7 +453,7 @@ ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); struct rpc_cred *cred; - if (mirror) { + if (mirror && !mirror->mirror_ds->ds_versions[0].tightly_coupled) { cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode); if (!cred) cred = get_rpccred(mdscred); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b65aee481d13..5b1eee4952b7 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -857,15 +857,14 @@ static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) { - struct nfs_lock_context *head = &ctx->lock_context; - struct nfs_lock_context *pos = head; + struct nfs_lock_context *pos; - do { + list_for_each_entry_rcu(pos, &ctx->lock_context.list, list) { if (pos->lockowner != current->files) continue; - refcount_inc(&pos->count); - return pos; - } while ((pos = list_entry(pos->list.next, typeof(*pos), list)) != head); + if (refcount_inc_not_zero(&pos->count)) + return pos; + } return NULL; } @@ -874,10 +873,10 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) struct nfs_lock_context *res, *new = NULL; struct inode *inode = d_inode(ctx->dentry); - spin_lock(&inode->i_lock); + rcu_read_lock(); res = __nfs_find_lock_context(ctx); + rcu_read_unlock(); if (res == NULL) { - spin_unlock(&inode->i_lock); new = kmalloc(sizeof(*new), GFP_KERNEL); if (new == NULL) return ERR_PTR(-ENOMEM); @@ -885,14 +884,14 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) spin_lock(&inode->i_lock); res = __nfs_find_lock_context(ctx); if (res == NULL) { - list_add_tail(&new->list, &ctx->lock_context.list); + list_add_tail_rcu(&new->list, &ctx->lock_context.list); new->open_context = ctx; res = new; new = NULL; } + spin_unlock(&inode->i_lock); + kfree(new); } - spin_unlock(&inode->i_lock); - kfree(new); return res; } EXPORT_SYMBOL_GPL(nfs_get_lock_context); @@ -904,9 +903,9 @@ void nfs_put_lock_context(struct nfs_lock_context *l_ctx) if (!refcount_dec_and_lock(&l_ctx->count, &inode->i_lock)) return; - list_del(&l_ctx->list); + list_del_rcu(&l_ctx->list); spin_unlock(&inode->i_lock); - kfree(l_ctx); + kfree_rcu(l_ctx, rcu_head); } EXPORT_SYMBOL_GPL(nfs_put_lock_context); @@ -978,9 +977,9 @@ EXPORT_SYMBOL_GPL(alloc_nfs_open_context); struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) { - if (ctx != NULL) - refcount_inc(&ctx->lock_context.count); - return ctx; + if (ctx != NULL && refcount_inc_not_zero(&ctx->lock_context.count)) + return ctx; + return NULL; } EXPORT_SYMBOL_GPL(get_nfs_open_context); @@ -989,13 +988,13 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) struct inode *inode = d_inode(ctx->dentry); struct super_block *sb = ctx->dentry->d_sb; + if (!refcount_dec_and_test(&ctx->lock_context.count)) + return; if (!list_empty(&ctx->list)) { - if (!refcount_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) - return; - list_del(&ctx->list); + spin_lock(&inode->i_lock); + list_del_rcu(&ctx->list); spin_unlock(&inode->i_lock); - } else if (!refcount_dec_and_test(&ctx->lock_context.count)) - return; + } if (inode != NULL) NFS_PROTO(inode)->close_context(ctx, is_sync); if (ctx->cred != NULL) @@ -1003,7 +1002,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) dput(ctx->dentry); nfs_sb_deactive(sb); kfree(ctx->mdsthreshold); - kfree(ctx); + kfree_rcu(ctx, rcu_head); } void put_nfs_open_context(struct nfs_open_context *ctx) @@ -1027,10 +1026,7 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx) struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); - if (ctx->mode & FMODE_WRITE) - list_add(&ctx->list, &nfsi->open_files); - else - list_add_tail(&ctx->list, &nfsi->open_files); + list_add_tail_rcu(&ctx->list, &nfsi->open_files); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); @@ -1051,16 +1047,17 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_context *pos, *ctx = NULL; - spin_lock(&inode->i_lock); - list_for_each_entry(pos, &nfsi->open_files, list) { + rcu_read_lock(); + list_for_each_entry_rcu(pos, &nfsi->open_files, list) { if (cred != NULL && pos->cred != cred) continue; if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode) continue; ctx = get_nfs_open_context(pos); - break; + if (ctx) + break; } - spin_unlock(&inode->i_lock); + rcu_read_unlock(); return ctx; } @@ -1078,9 +1075,6 @@ void nfs_file_clear_open_context(struct file *filp) if (ctx->error < 0) invalidate_inode_pages2(inode->i_mapping); filp->private_data = NULL; - spin_lock(&inode->i_lock); - list_move_tail(&ctx->list, &NFS_I(inode)->open_files); - spin_unlock(&inode->i_lock); put_nfs_open_context_sync(ctx); } } @@ -1329,19 +1323,11 @@ static bool nfs_file_has_writers(struct nfs_inode *nfsi) { struct inode *inode = &nfsi->vfs_inode; - assert_spin_locked(&inode->i_lock); - if (!S_ISREG(inode->i_mode)) return false; if (list_empty(&nfsi->open_files)) return false; - /* Note: This relies on nfsi->open_files being ordered with writers - * being placed at the head of the list. - * See nfs_inode_attach_open_context() - */ - return (list_first_entry(&nfsi->open_files, - struct nfs_open_context, - list)->mode & FMODE_WRITE) == FMODE_WRITE; + return inode_is_open_for_write(inode); } static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index ec8a9efa268f..71bc16225b98 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -786,6 +786,7 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { struct inode *inode = hdr->inode; + struct nfs_server *server = NFS_SERVER(inode); if (hdr->pgio_done_cb != NULL) return hdr->pgio_done_cb(task, hdr); @@ -793,6 +794,9 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) if (nfs3_async_handle_jukebox(task, inode)) return -EAGAIN; + if (task->tk_status >= 0 && !server->read_hdrsize) + cmpxchg(&server->read_hdrsize, 0, hdr->res.replen); + nfs_invalidate_atime(inode); nfs_refresh_inode(inode, &hdr->fattr); return 0; @@ -802,6 +806,7 @@ static void nfs3_proc_read_setup(struct nfs_pgio_header *hdr, struct rpc_message *msg) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ]; + hdr->args.replen = NFS_SERVER(hdr->inode)->read_hdrsize; } static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 64e4fa33d89f..78df4eb60f85 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -983,10 +983,11 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, const void *data) { const struct nfs_pgio_args *args = data; + unsigned int replen = args->replen ? args->replen : NFS3_readres_sz; encode_read3args(xdr, args); prepare_reply_buffer(req, args->pages, args->pgbase, - args->count, NFS3_readres_sz); + args->count, replen); req->rq_rcv_buf.flags |= XDRBUF_READ; } @@ -1364,10 +1365,12 @@ static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req, encode_nfs_fh3(xdr, args->fh); encode_uint32(xdr, args->mask); - if (args->mask & (NFS_ACL | NFS_DFACL)) + if (args->mask & (NFS_ACL | NFS_DFACL)) { prepare_reply_buffer(req, args->pages, 0, NFSACL_MAXPAGES << PAGE_SHIFT, ACL3_getaclres_sz); + req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; + } } static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req, @@ -1673,9 +1676,11 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, void *data) { struct nfs_pgio_res *result = data; + unsigned int pos; enum nfs_stat status; int error; + pos = xdr_stream_pos(xdr); error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; @@ -1685,6 +1690,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, result->op_status = status; if (status != NFS3_OK) goto out_status; + result->replen = 3 + ((xdr_stream_pos(xdr) - pos) >> 2); error = decode_read3resok(xdr, result); out: return error; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 3a6904173214..8d59c9655ec4 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -188,9 +188,10 @@ struct nfs4_state { unsigned int n_wronly; /* Number of write-only references */ unsigned int n_rdwr; /* Number of read/write references */ fmode_t state; /* State on the server (R,W, or RW) */ - atomic_t count; + refcount_t count; wait_queue_head_t waitq; + struct rcu_head rcu_head; }; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 146e30862234..8f53455c4765 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -950,10 +950,10 @@ EXPORT_SYMBOL_GPL(nfs4_set_ds_client); /* * Session has been established, and the client marked ready. - * Set the mount rsize and wsize with negotiated fore channel - * attributes which will be bound checked in nfs_server_set_fsinfo. + * Limit the mount rsize, wsize and dtsize using negotiated fore + * channel attributes. */ -static void nfs4_session_set_rwsize(struct nfs_server *server) +static void nfs4_session_limit_rwsize(struct nfs_server *server) { #ifdef CONFIG_NFS_V4_1 struct nfs4_session *sess; @@ -966,9 +966,11 @@ static void nfs4_session_set_rwsize(struct nfs_server *server) server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead; server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead; - if (!server->rsize || server->rsize > server_resp_sz) + if (server->dtsize > server_resp_sz) + server->dtsize = server_resp_sz; + if (server->rsize > server_resp_sz) server->rsize = server_resp_sz; - if (!server->wsize || server->wsize > server_rqst_sz) + if (server->wsize > server_rqst_sz) server->wsize = server_rqst_sz; #endif /* CONFIG_NFS_V4_1 */ } @@ -1015,12 +1017,12 @@ static int nfs4_server_common_setup(struct nfs_server *server, (unsigned long long) server->fsid.minor); nfs_display_fhandle(mntfh, "Pseudo-fs root FH"); - nfs4_session_set_rwsize(server); - error = nfs_probe_fsinfo(server, mntfh, fattr); if (error < 0) goto out; + nfs4_session_limit_rwsize(server); + if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) server->namelen = NFS4_MAXNAMLEN; diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 4288a6ecaf75..46d691ba04bc 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -180,8 +180,9 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t return nfs42_proc_allocate(filep, offset, len); } -static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, - struct file *dst_file, loff_t dst_off, u64 count) +static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, loff_t count, + unsigned int remap_flags) { struct inode *dst_inode = file_inode(dst_file); struct nfs_server *server = NFS_SERVER(dst_inode); @@ -190,6 +191,9 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, bool same_inode = false; int ret; + if (remap_flags & ~REMAP_FILE_ADVISORY) + return -EINVAL; + /* check alignment w.r.t. clone_blksize */ ret = -EINVAL; if (bs) { @@ -240,7 +244,7 @@ out_unlock: inode_unlock(src_inode); } out: - return ret; + return ret < 0 ? ret : count; } #endif /* CONFIG_NFS_V4_2 */ @@ -262,7 +266,7 @@ const struct file_operations nfs4_file_operations = { .copy_file_range = nfs4_copy_file_range, .llseek = nfs4_file_llseek, .fallocate = nfs42_fallocate, - .clone_file_range = nfs42_clone_file_range, + .remap_file_range = nfs42_remap_file_range, #else .llseek = nfs_file_llseek, #endif diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8220a168282e..db84b4adbc49 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1349,12 +1349,20 @@ static bool nfs4_mode_match_open_stateid(struct nfs4_state *state, return false; } -static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode) +static int can_open_cached(struct nfs4_state *state, fmode_t mode, + int open_mode, enum open_claim_type4 claim) { int ret = 0; if (open_mode & (O_EXCL|O_TRUNC)) goto out; + switch (claim) { + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_FH: + goto out; + default: + break; + } switch (mode & (FMODE_READ|FMODE_WRITE)) { case FMODE_READ: ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 @@ -1747,7 +1755,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) for (;;) { spin_lock(&state->owner->so_lock); - if (can_open_cached(state, fmode, open_mode)) { + if (can_open_cached(state, fmode, open_mode, claim)) { update_open_stateflags(state, fmode); spin_unlock(&state->owner->so_lock); goto out_return_state; @@ -1777,7 +1785,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) out: return ERR_PTR(ret); out_return_state: - atomic_inc(&state->count); + refcount_inc(&state->count); return state; } @@ -1849,7 +1857,7 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) update: update_open_stateid(state, &data->o_res.stateid, NULL, data->o_arg.fmode); - atomic_inc(&state->count); + refcount_inc(&state->count); return state; } @@ -1887,7 +1895,7 @@ nfs4_opendata_find_nfs4_state(struct nfs4_opendata *data) return ERR_CAST(inode); if (data->state != NULL && data->state->inode == inode) { state = data->state; - atomic_inc(&state->count); + refcount_inc(&state->count); } else state = nfs4_get_open_state(inode, data->owner); iput(inode); @@ -1933,23 +1941,41 @@ nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) return ret; } -static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state) +static struct nfs_open_context * +nfs4_state_find_open_context_mode(struct nfs4_state *state, fmode_t mode) { struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_open_context *ctx; - spin_lock(&state->inode->i_lock); - list_for_each_entry(ctx, &nfsi->open_files, list) { + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { if (ctx->state != state) continue; - get_nfs_open_context(ctx); - spin_unlock(&state->inode->i_lock); + if ((ctx->mode & mode) != mode) + continue; + if (!get_nfs_open_context(ctx)) + continue; + rcu_read_unlock(); return ctx; } - spin_unlock(&state->inode->i_lock); + rcu_read_unlock(); return ERR_PTR(-ENOENT); } +static struct nfs_open_context * +nfs4_state_find_open_context(struct nfs4_state *state) +{ + struct nfs_open_context *ctx; + + ctx = nfs4_state_find_open_context_mode(state, FMODE_READ|FMODE_WRITE); + if (!IS_ERR(ctx)) + return ctx; + ctx = nfs4_state_find_open_context_mode(state, FMODE_WRITE); + if (!IS_ERR(ctx)) + return ctx; + return nfs4_state_find_open_context_mode(state, FMODE_READ); +} + static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx, struct nfs4_state *state, enum open_claim_type4 claim) { @@ -1960,7 +1986,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context if (opendata == NULL) return ERR_PTR(-ENOMEM); opendata->state = state; - atomic_inc(&state->count); + refcount_inc(&state->count); return opendata; } @@ -2276,7 +2302,8 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) if (data->state != NULL) { struct nfs_delegation *delegation; - if (can_open_cached(data->state, data->o_arg.fmode, data->o_arg.open_flags)) + if (can_open_cached(data->state, data->o_arg.fmode, + data->o_arg.open_flags, claim)) goto out_no_action; rcu_read_lock(); delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 40a08cd483f0..62ae0fd345ad 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -655,7 +655,7 @@ nfs4_alloc_open_state(void) state = kzalloc(sizeof(*state), GFP_NOFS); if (!state) return NULL; - atomic_set(&state->count, 1); + refcount_set(&state->count, 1); INIT_LIST_HEAD(&state->lock_states); spin_lock_init(&state->state_lock); seqlock_init(&state->seqlock); @@ -684,12 +684,12 @@ __nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner) struct nfs_inode *nfsi = NFS_I(inode); struct nfs4_state *state; - list_for_each_entry(state, &nfsi->open_states, inode_states) { + list_for_each_entry_rcu(state, &nfsi->open_states, inode_states) { if (state->owner != owner) continue; if (!nfs4_valid_open_stateid(state)) continue; - if (atomic_inc_not_zero(&state->count)) + if (refcount_inc_not_zero(&state->count)) return state; } return NULL; @@ -698,7 +698,7 @@ __nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner) static void nfs4_free_open_state(struct nfs4_state *state) { - kfree(state); + kfree_rcu(state, rcu_head); } struct nfs4_state * @@ -707,9 +707,9 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner) struct nfs4_state *state, *new; struct nfs_inode *nfsi = NFS_I(inode); - spin_lock(&inode->i_lock); + rcu_read_lock(); state = __nfs4_find_state_byowner(inode, owner); - spin_unlock(&inode->i_lock); + rcu_read_unlock(); if (state) goto out; new = nfs4_alloc_open_state(); @@ -720,7 +720,7 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner) state = new; state->owner = owner; atomic_inc(&owner->so_count); - list_add(&state->inode_states, &nfsi->open_states); + list_add_rcu(&state->inode_states, &nfsi->open_states); ihold(inode); state->inode = inode; spin_unlock(&inode->i_lock); @@ -743,10 +743,10 @@ void nfs4_put_open_state(struct nfs4_state *state) struct inode *inode = state->inode; struct nfs4_state_owner *owner = state->owner; - if (!atomic_dec_and_lock(&state->count, &owner->so_lock)) + if (!refcount_dec_and_lock(&state->count, &owner->so_lock)) return; spin_lock(&inode->i_lock); - list_del(&state->inode_states); + list_del_rcu(&state->inode_states); list_del(&state->open_states); spin_unlock(&inode->i_lock); spin_unlock(&owner->so_lock); @@ -1437,8 +1437,8 @@ void nfs_inode_find_state_and_recover(struct inode *inode, struct nfs4_state *state; bool found = false; - spin_lock(&inode->i_lock); - list_for_each_entry(ctx, &nfsi->open_files, list) { + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { state = ctx->state; if (state == NULL) continue; @@ -1456,7 +1456,7 @@ void nfs_inode_find_state_and_recover(struct inode *inode, nfs4_state_mark_reclaim_nograce(clp, state)) found = true; } - spin_unlock(&inode->i_lock); + rcu_read_unlock(); nfs_inode_find_delegation_state_and_recover(inode, stateid); if (found) @@ -1469,13 +1469,13 @@ static void nfs4_state_mark_open_context_bad(struct nfs4_state *state) struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_context *ctx; - spin_lock(&inode->i_lock); - list_for_each_entry(ctx, &nfsi->open_files, list) { + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { if (ctx->state != state) continue; set_bit(NFS_CONTEXT_BAD, &ctx->flags); } - spin_unlock(&inode->i_lock); + rcu_read_unlock(); } static void nfs4_state_mark_recovery_failed(struct nfs4_state *state, int error) @@ -1549,10 +1549,62 @@ out: return status; } +#ifdef CONFIG_NFS_V4_2 +static void nfs42_complete_copies(struct nfs4_state_owner *sp, struct nfs4_state *state) +{ + struct nfs4_copy_state *copy; + + if (!test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags)) + return; + + spin_lock(&sp->so_server->nfs_client->cl_lock); + list_for_each_entry(copy, &sp->so_server->ss_copies, copies) { + if (nfs4_stateid_match_other(&state->stateid, ©->parent_state->stateid)) + continue; + copy->flags = 1; + complete(©->completion); + break; + } + spin_unlock(&sp->so_server->nfs_client->cl_lock); +} +#else /* !CONFIG_NFS_V4_2 */ +static inline void nfs42_complete_copies(struct nfs4_state_owner *sp, + struct nfs4_state *state) +{ +} +#endif /* CONFIG_NFS_V4_2 */ + +static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_state *state, + const struct nfs4_state_recovery_ops *ops) +{ + struct nfs4_lock_state *lock; + int status; + + status = ops->recover_open(sp, state); + if (status < 0) + return status; + + status = nfs4_reclaim_locks(state, ops); + if (status < 0) + return status; + + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) { + spin_lock(&state->state_lock); + list_for_each_entry(lock, &state->lock_states, ls_locks) { + if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) + pr_warn_ratelimited("NFS: %s: Lock reclaim failed!\n", __func__); + } + spin_unlock(&state->state_lock); + } + + nfs42_complete_copies(sp, state); + clear_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); + return status; +} + static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops) { struct nfs4_state *state; - struct nfs4_lock_state *lock; int status = 0; /* Note: we rely on the sp->so_states list being ordered @@ -1573,79 +1625,45 @@ restart: continue; if (state->state == 0) continue; - atomic_inc(&state->count); + refcount_inc(&state->count); spin_unlock(&sp->so_lock); - status = ops->recover_open(sp, state); - if (status >= 0) { - status = nfs4_reclaim_locks(state, ops); - if (status >= 0) { - if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) { - spin_lock(&state->state_lock); - list_for_each_entry(lock, &state->lock_states, ls_locks) { - if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) - pr_warn_ratelimited("NFS: " - "%s: Lock reclaim " - "failed!\n", __func__); - } - spin_unlock(&state->state_lock); - } - clear_bit(NFS_STATE_RECLAIM_NOGRACE, - &state->flags); -#ifdef CONFIG_NFS_V4_2 - if (test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags)) { - struct nfs4_copy_state *copy; - - spin_lock(&sp->so_server->nfs_client->cl_lock); - list_for_each_entry(copy, &sp->so_server->ss_copies, copies) { - if (memcmp(&state->stateid.other, ©->parent_state->stateid.other, NFS4_STATEID_SIZE)) - continue; - copy->flags = 1; - complete(©->completion); - printk("AGLO: server rebooted waking up the copy\n"); - break; - } - spin_unlock(&sp->so_server->nfs_client->cl_lock); - } -#endif /* CONFIG_NFS_V4_2 */ - nfs4_put_open_state(state); - spin_lock(&sp->so_lock); - goto restart; - } - } + status = __nfs4_reclaim_open_state(sp, state, ops); + switch (status) { - default: - printk(KERN_ERR "NFS: %s: unhandled error %d\n", - __func__, status); - /* Fall through */ - case -ENOENT: - case -ENOMEM: - case -EACCES: - case -EROFS: - case -EIO: - case -ESTALE: - /* Open state on this file cannot be recovered */ - nfs4_state_mark_recovery_failed(state, status); - break; - case -EAGAIN: - ssleep(1); - /* Fall through */ - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_OLD_STATEID: - case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_RECLAIM_BAD: - case -NFS4ERR_RECLAIM_CONFLICT: - nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + default: + if (status >= 0) break; - case -NFS4ERR_EXPIRED: - case -NFS4ERR_NO_GRACE: - nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_BADSESSION: - case -NFS4ERR_BADSLOT: - case -NFS4ERR_BAD_HIGH_SLOT: - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - goto out_err; + printk(KERN_ERR "NFS: %s: unhandled error %d\n", __func__, status); + /* Fall through */ + case -ENOENT: + case -ENOMEM: + case -EACCES: + case -EROFS: + case -EIO: + case -ESTALE: + /* Open state on this file cannot be recovered */ + nfs4_state_mark_recovery_failed(state, status); + break; + case -EAGAIN: + ssleep(1); + /* Fall through */ + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_RECLAIM_BAD: + case -NFS4ERR_RECLAIM_CONFLICT: + nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_NO_GRACE: + nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + goto out_err; } nfs4_put_open_state(state); spin_lock(&sp->so_lock); @@ -1795,38 +1813,38 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) { switch (error) { - case 0: - break; - case -NFS4ERR_CB_PATH_DOWN: - nfs40_handle_cb_pathdown(clp); - break; - case -NFS4ERR_NO_GRACE: - nfs4_state_end_reclaim_reboot(clp); - break; - case -NFS4ERR_STALE_CLIENTID: - set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - nfs4_state_start_reclaim_reboot(clp); - break; - case -NFS4ERR_EXPIRED: - set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - nfs4_state_start_reclaim_nograce(clp); - break; - case -NFS4ERR_BADSESSION: - case -NFS4ERR_BADSLOT: - case -NFS4ERR_BAD_HIGH_SLOT: - case -NFS4ERR_DEADSESSION: - case -NFS4ERR_SEQ_FALSE_RETRY: - case -NFS4ERR_SEQ_MISORDERED: - set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); - /* Zero session reset errors */ - break; - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); - break; - default: - dprintk("%s: failed to handle error %d for server %s\n", - __func__, error, clp->cl_hostname); - return error; + case 0: + break; + case -NFS4ERR_CB_PATH_DOWN: + nfs40_handle_cb_pathdown(clp); + break; + case -NFS4ERR_NO_GRACE: + nfs4_state_end_reclaim_reboot(clp); + break; + case -NFS4ERR_STALE_CLIENTID: + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_start_reclaim_reboot(clp); + break; + case -NFS4ERR_EXPIRED: + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_start_reclaim_nograce(clp); + break; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + /* Zero session reset errors */ + break; + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); + break; + default: + dprintk("%s: failed to handle error %d for server %s\n", + __func__, error, clp->cl_hostname); + return error; } dprintk("%s: handled error %d for server %s\n", __func__, error, clp->cl_hostname); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index b7bde12d8cd5..2fc8f6fa25e4 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3516,7 +3516,7 @@ static int decode_attr_exclcreat_supported(struct xdr_stream *xdr, static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh) { __be32 *p; - int len; + u32 len; if (fh != NULL) memset(fh, 0, sizeof(*fh)); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index bb5476a6d264..5c4568a0804b 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -63,14 +63,14 @@ EXPORT_SYMBOL_GPL(nfs_pgheader_init); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos) { - spin_lock(&hdr->lock); - if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags) - || pos < hdr->io_start + hdr->good_bytes) { + unsigned int new = pos - hdr->io_start; + + if (hdr->good_bytes > new) { + hdr->good_bytes = new; clear_bit(NFS_IOHDR_EOF, &hdr->flags); - hdr->good_bytes = pos - hdr->io_start; - hdr->error = error; + if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags)) + hdr->error = error; } - spin_unlock(&hdr->lock); } static inline struct nfs_page * @@ -494,7 +494,6 @@ struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops) if (hdr) { INIT_LIST_HEAD(&hdr->pages); - spin_lock_init(&hdr->lock); hdr->rw_ops = ops; } return hdr; @@ -1111,6 +1110,20 @@ static int nfs_pageio_add_request_mirror(struct nfs_pageio_descriptor *desc, return ret; } +static void nfs_pageio_error_cleanup(struct nfs_pageio_descriptor *desc) +{ + u32 midx; + struct nfs_pgio_mirror *mirror; + + if (!desc->pg_error) + return; + + for (midx = 0; midx < desc->pg_mirror_count; midx++) { + mirror = &desc->pg_mirrors[midx]; + desc->pg_completion_ops->error_cleanup(&mirror->pg_list); + } +} + int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { @@ -1161,25 +1174,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, return 1; out_failed: - /* - * We might have failed before sending any reqs over wire. - * Clean up rest of the reqs in mirror pg_list. - */ - if (desc->pg_error) { - struct nfs_pgio_mirror *mirror; - void (*func)(struct list_head *); - - /* remember fatal errors */ - if (nfs_error_is_fatal(desc->pg_error)) - nfs_context_set_write_error(req->wb_context, - desc->pg_error); - - func = desc->pg_completion_ops->error_cleanup; - for (midx = 0; midx < desc->pg_mirror_count; midx++) { - mirror = &desc->pg_mirrors[midx]; - func(&mirror->pg_list); - } - } + nfs_pageio_error_cleanup(desc); return 0; } @@ -1251,6 +1246,8 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) for (midx = 0; midx < desc->pg_mirror_count; midx++) nfs_pageio_complete_mirror(desc, midx); + if (desc->pg_error < 0) + nfs_pageio_error_cleanup(desc); if (desc->pg_ops->pg_cleanup) desc->pg_ops->pg_cleanup(desc); nfs_pageio_cleanup_mirroring(desc); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 7d9a51e6b847..06cb90e9bc6e 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -965,7 +965,7 @@ static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags) struct page **pages; int i; - pages = kcalloc(size, sizeof(struct page *), gfp_flags); + pages = kmalloc_array(size, sizeof(struct page *), gfp_flags); if (!pages) { dprintk("%s: can't alloc array of %zu pages\n", __func__, size); return NULL; @@ -975,7 +975,7 @@ static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags) pages[i] = alloc_page(gfp_flags); if (!pages[i]) { dprintk("%s: failed to allocate page\n", __func__); - nfs4_free_pages(pages, size); + nfs4_free_pages(pages, i); return NULL; } } @@ -991,6 +991,7 @@ pnfs_alloc_init_layoutget_args(struct inode *ino, gfp_t gfp_flags) { struct nfs_server *server = pnfs_find_server(ino, ctx); + size_t max_reply_sz = server->pnfs_curr_ld->max_layoutget_response; size_t max_pages = max_response_pages(server); struct nfs4_layoutget *lgp; @@ -1000,6 +1001,12 @@ pnfs_alloc_init_layoutget_args(struct inode *ino, if (lgp == NULL) return NULL; + if (max_reply_sz) { + size_t npages = (max_reply_sz + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (npages < max_pages) + max_pages = npages; + } + lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags); if (!lgp->args.layout.pages) { kfree(lgp); @@ -1332,6 +1339,7 @@ bool pnfs_roc(struct inode *ino, if (!nfs_have_layout(ino)) return false; retry: + rcu_read_lock(); spin_lock(&ino->i_lock); lo = nfsi->layout; if (!lo || !pnfs_layout_is_valid(lo) || @@ -1342,6 +1350,7 @@ retry: pnfs_get_layout_hdr(lo); if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { spin_unlock(&ino->i_lock); + rcu_read_unlock(); wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, TASK_UNINTERRUPTIBLE); pnfs_put_layout_hdr(lo); @@ -1355,7 +1364,7 @@ retry: skip_read = true; } - list_for_each_entry(ctx, &nfsi->open_files, list) { + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { state = ctx->state; if (state == NULL) continue; @@ -1403,6 +1412,7 @@ retry: out_noroc: spin_unlock(&ino->i_lock); + rcu_read_unlock(); pnfs_layoutcommit_inode(ino, true); if (roc) { struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index ece367ebde69..e2e9fcd5341d 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -125,6 +125,7 @@ struct pnfs_layoutdriver_type { struct module *owner; unsigned flags; unsigned max_deviceinfo_size; + unsigned max_layoutget_response; int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *); int (*clear_layoutdriver) (struct nfs_server *); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 48d7277c60a9..f9f19784db82 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -276,16 +276,14 @@ static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_header *hdr) { if (hdr->res.eof) { - loff_t bound; + loff_t pos = hdr->args.offset + hdr->res.count; + unsigned int new = pos - hdr->io_start; - bound = hdr->args.offset + hdr->res.count; - spin_lock(&hdr->lock); - if (bound < hdr->io_start + hdr->good_bytes) { + if (hdr->good_bytes > new) { + hdr->good_bytes = new; set_bit(NFS_IOHDR_EOF, &hdr->flags); clear_bit(NFS_IOHDR_ERROR, &hdr->flags); - hdr->good_bytes = bound - hdr->io_start; } - spin_unlock(&hdr->lock); } else if (hdr->res.count < hdr->args.count) nfs_readpage_retry(task, hdr); } diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index b7559c6f2b97..4a98537efb0f 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -19,18 +19,22 @@ * is much larger than a sockaddr_in6. */ struct svc_cacherep { - struct list_head c_lru; + struct { + /* Keep often-read xid, csum in the same cache line: */ + __be32 k_xid; + __wsum k_csum; + u32 k_proc; + u32 k_prot; + u32 k_vers; + unsigned int k_len; + struct sockaddr_in6 k_addr; + } c_key; + struct rb_node c_node; + struct list_head c_lru; unsigned char c_state, /* unused, inprog, done */ c_type, /* status, buffer */ c_secure : 1; /* req came from port < 1024 */ - struct sockaddr_in6 c_addr; - __be32 c_xid; - u32 c_prot; - u32 c_proc; - u32 c_vers; - unsigned int c_len; - __wsum c_csum; unsigned long c_timestamp; union { struct kvec u_vec; diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index a1143f7c2201..802993d8912f 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -46,7 +46,7 @@ static void expkey_put(struct kref *ref) !test_bit(CACHE_NEGATIVE, &key->h.flags)) path_put(&key->ek_path); auth_domain_put(key->ek_client); - kfree(key); + kfree_rcu(key, ek_rcu); } static void expkey_request(struct cache_detail *cd, @@ -265,7 +265,7 @@ svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *item) struct cache_head *ch; int hash = svc_expkey_hash(item); - ch = sunrpc_cache_lookup(cd, &item->h, hash); + ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash); if (ch) return container_of(ch, struct svc_expkey, h); else @@ -314,7 +314,7 @@ static void svc_export_put(struct kref *ref) auth_domain_put(exp->ex_client); nfsd4_fslocs_free(&exp->ex_fslocs); kfree(exp->ex_uuid); - kfree(exp); + kfree_rcu(exp, ex_rcu); } static void svc_export_request(struct cache_detail *cd, @@ -780,7 +780,7 @@ svc_export_lookup(struct svc_export *exp) struct cache_head *ch; int hash = svc_export_hash(exp); - ch = sunrpc_cache_lookup(exp->cd, &exp->h, hash); + ch = sunrpc_cache_lookup_rcu(exp->cd, &exp->h, hash); if (ch) return container_of(ch, struct svc_export, h); else @@ -1216,9 +1216,9 @@ static int e_show(struct seq_file *m, void *p) } const struct seq_operations nfs_exports_op = { - .start = cache_seq_start, - .next = cache_seq_next, - .stop = cache_seq_stop, + .start = cache_seq_start_rcu, + .next = cache_seq_next_rcu, + .stop = cache_seq_stop_rcu, .show = e_show, }; diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index c8b74126ddaa..e7daa1f246f0 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -61,6 +61,7 @@ struct svc_export { u32 ex_layout_types; struct nfsd4_deviceid_map *ex_devid_map; struct cache_detail *cd; + struct rcu_head ex_rcu; }; /* an "export key" (expkey) maps a filehandlefragement to an @@ -75,6 +76,7 @@ struct svc_expkey { u32 ek_fsid[6]; struct path ek_path; + struct rcu_head ek_rcu; }; #define EX_ISSYNC(exp) (!((exp)->ex_flags & NFSEXP_ASYNC)) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 426f55005697..32cb8c027483 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -123,6 +123,14 @@ struct nfsd_net { wait_queue_head_t ntf_wq; atomic_t ntf_refcnt; + + /* + * clientid and stateid data for construction of net unique COPY + * stateids. + */ + u32 s2s_cp_cl_id; + struct idr s2s_cp_stateids; + spinlock_t s2s_cp_lock; }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 601bf33c26a0..25987bcdf96f 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -39,6 +39,7 @@ #include "state.h" #include "netns.h" #include "xdr4cb.h" +#include "xdr4.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -105,6 +106,7 @@ enum nfs_cb_opnum4 { OP_CB_WANTS_CANCELLED = 12, OP_CB_NOTIFY_LOCK = 13, OP_CB_NOTIFY_DEVICEID = 14, + OP_CB_OFFLOAD = 15, OP_CB_ILLEGAL = 10044 }; @@ -683,6 +685,101 @@ static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp, } /* + * struct write_response4 { + * stateid4 wr_callback_id<1>; + * length4 wr_count; + * stable_how4 wr_committed; + * verifier4 wr_writeverf; + * }; + * union offload_info4 switch (nfsstat4 coa_status) { + * case NFS4_OK: + * write_response4 coa_resok4; + * default: + * length4 coa_bytes_copied; + * }; + * struct CB_OFFLOAD4args { + * nfs_fh4 coa_fh; + * stateid4 coa_stateid; + * offload_info4 coa_offload_info; + * }; + */ +static void encode_offload_info4(struct xdr_stream *xdr, + __be32 nfserr, + const struct nfsd4_copy *cp) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p++ = nfserr; + if (!nfserr) { + p = xdr_reserve_space(xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE); + p = xdr_encode_empty_array(p); + p = xdr_encode_hyper(p, cp->cp_res.wr_bytes_written); + *p++ = cpu_to_be32(cp->cp_res.wr_stable_how); + p = xdr_encode_opaque_fixed(p, cp->cp_res.wr_verifier.data, + NFS4_VERIFIER_SIZE); + } else { + p = xdr_reserve_space(xdr, 8); + /* We always return success if bytes were written */ + p = xdr_encode_hyper(p, 0); + } +} + +static void encode_cb_offload4args(struct xdr_stream *xdr, + __be32 nfserr, + const struct knfsd_fh *fh, + const struct nfsd4_copy *cp, + struct nfs4_cb_compound_hdr *hdr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p++ = cpu_to_be32(OP_CB_OFFLOAD); + encode_nfs_fh4(xdr, fh); + encode_stateid4(xdr, &cp->cp_res.cb_stateid); + encode_offload_info4(xdr, nfserr, cp); + + hdr->nops++; +} + +static void nfs4_xdr_enc_cb_offload(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfsd4_callback *cb = data; + const struct nfsd4_copy *cp = + container_of(cb, struct nfsd4_copy, cp_cb); + struct nfs4_cb_compound_hdr hdr = { + .ident = 0, + .minorversion = cb->cb_clp->cl_minorversion, + }; + + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_offload4args(xdr, cp->nfserr, &cp->fh, cp, &hdr); + encode_cb_nops(&hdr); +} + +static int nfs4_xdr_dec_cb_offload(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfsd4_callback *cb = data; + struct nfs4_cb_compound_hdr hdr; + int status; + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + return status; + + if (cb) { + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + } + return decode_cb_op_status(xdr, OP_CB_OFFLOAD, &cb->cb_status); +} +/* * RPC procedure tables */ #define PROC(proc, call, argtype, restype) \ @@ -703,6 +800,7 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = { PROC(CB_LAYOUT, COMPOUND, cb_layout, cb_layout), #endif PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock), + PROC(CB_OFFLOAD, COMPOUND, cb_offload, cb_offload), }; static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)]; diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index a5bb76593ce7..bf137fec33ff 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -65,6 +65,7 @@ struct ent { u32 id; char name[IDMAP_NAMESZ]; char authname[IDMAP_NAMESZ]; + struct rcu_head rcu_head; }; /* Common entry handling */ @@ -89,7 +90,7 @@ static void ent_put(struct kref *ref) { struct ent *map = container_of(ref, struct ent, h.ref); - kfree(map); + kfree_rcu(map, rcu_head); } static struct cache_head * @@ -264,8 +265,8 @@ out: static struct ent * idtoname_lookup(struct cache_detail *cd, struct ent *item) { - struct cache_head *ch = sunrpc_cache_lookup(cd, &item->h, - idtoname_hash(item)); + struct cache_head *ch = sunrpc_cache_lookup_rcu(cd, &item->h, + idtoname_hash(item)); if (ch) return container_of(ch, struct ent, h); else @@ -422,8 +423,8 @@ out: static struct ent * nametoid_lookup(struct cache_detail *cd, struct ent *item) { - struct cache_head *ch = sunrpc_cache_lookup(cd, &item->h, - nametoid_hash(item)); + struct cache_head *ch = sunrpc_cache_lookup_rcu(cd, &item->h, + nametoid_hash(item)); if (ch) return container_of(ch, struct ent, h); else diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index b7bc6e1a85ac..edff074d38c7 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -36,6 +36,7 @@ #include <linux/file.h> #include <linux/falloc.h> #include <linux/slab.h> +#include <linux/kthread.h> #include "idmap.h" #include "cache.h" @@ -1089,36 +1090,254 @@ out: return status; } +void nfs4_put_copy(struct nfsd4_copy *copy) +{ + if (!refcount_dec_and_test(©->refcount)) + return; + kfree(copy); +} + +static bool +check_and_set_stop_copy(struct nfsd4_copy *copy) +{ + bool value; + + spin_lock(©->cp_clp->async_lock); + value = copy->stopped; + if (!copy->stopped) + copy->stopped = true; + spin_unlock(©->cp_clp->async_lock); + return value; +} + +static void nfsd4_stop_copy(struct nfsd4_copy *copy) +{ + /* only 1 thread should stop the copy */ + if (!check_and_set_stop_copy(copy)) + kthread_stop(copy->copy_task); + nfs4_put_copy(copy); +} + +static struct nfsd4_copy *nfsd4_get_copy(struct nfs4_client *clp) +{ + struct nfsd4_copy *copy = NULL; + + spin_lock(&clp->async_lock); + if (!list_empty(&clp->async_copies)) { + copy = list_first_entry(&clp->async_copies, struct nfsd4_copy, + copies); + refcount_inc(©->refcount); + } + spin_unlock(&clp->async_lock); + return copy; +} + +void nfsd4_shutdown_copy(struct nfs4_client *clp) +{ + struct nfsd4_copy *copy; + + while ((copy = nfsd4_get_copy(clp)) != NULL) + nfsd4_stop_copy(copy); +} + +static void nfsd4_cb_offload_release(struct nfsd4_callback *cb) +{ + struct nfsd4_copy *copy = container_of(cb, struct nfsd4_copy, cp_cb); + + nfs4_put_copy(copy); +} + +static int nfsd4_cb_offload_done(struct nfsd4_callback *cb, + struct rpc_task *task) +{ + return 1; +} + +static const struct nfsd4_callback_ops nfsd4_cb_offload_ops = { + .release = nfsd4_cb_offload_release, + .done = nfsd4_cb_offload_done +}; + +static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync) +{ + copy->cp_res.wr_stable_how = NFS_UNSTABLE; + copy->cp_synchronous = sync; + gen_boot_verifier(©->cp_res.wr_verifier, copy->cp_clp->net); +} + +static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy) +{ + ssize_t bytes_copied = 0; + size_t bytes_total = copy->cp_count; + u64 src_pos = copy->cp_src_pos; + u64 dst_pos = copy->cp_dst_pos; + + do { + if (kthread_should_stop()) + break; + bytes_copied = nfsd_copy_file_range(copy->file_src, src_pos, + copy->file_dst, dst_pos, bytes_total); + if (bytes_copied <= 0) + break; + bytes_total -= bytes_copied; + copy->cp_res.wr_bytes_written += bytes_copied; + src_pos += bytes_copied; + dst_pos += bytes_copied; + } while (bytes_total > 0 && !copy->cp_synchronous); + return bytes_copied; +} + +static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync) +{ + __be32 status; + ssize_t bytes; + + bytes = _nfsd_copy_file_range(copy); + /* for async copy, we ignore the error, client can always retry + * to get the error + */ + if (bytes < 0 && !copy->cp_res.wr_bytes_written) + status = nfserrno(bytes); + else { + nfsd4_init_copy_res(copy, sync); + status = nfs_ok; + } + + fput(copy->file_src); + fput(copy->file_dst); + return status; +} + +static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst) +{ + dst->cp_src_pos = src->cp_src_pos; + dst->cp_dst_pos = src->cp_dst_pos; + dst->cp_count = src->cp_count; + dst->cp_synchronous = src->cp_synchronous; + memcpy(&dst->cp_res, &src->cp_res, sizeof(src->cp_res)); + memcpy(&dst->fh, &src->fh, sizeof(src->fh)); + dst->cp_clp = src->cp_clp; + dst->file_dst = get_file(src->file_dst); + dst->file_src = get_file(src->file_src); + memcpy(&dst->cp_stateid, &src->cp_stateid, sizeof(src->cp_stateid)); +} + +static void cleanup_async_copy(struct nfsd4_copy *copy) +{ + nfs4_free_cp_state(copy); + fput(copy->file_dst); + fput(copy->file_src); + spin_lock(©->cp_clp->async_lock); + list_del(©->copies); + spin_unlock(©->cp_clp->async_lock); + nfs4_put_copy(copy); +} + +static int nfsd4_do_async_copy(void *data) +{ + struct nfsd4_copy *copy = (struct nfsd4_copy *)data; + struct nfsd4_copy *cb_copy; + + copy->nfserr = nfsd4_do_copy(copy, 0); + cb_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL); + if (!cb_copy) + goto out; + memcpy(&cb_copy->cp_res, ©->cp_res, sizeof(copy->cp_res)); + cb_copy->cp_clp = copy->cp_clp; + cb_copy->nfserr = copy->nfserr; + memcpy(&cb_copy->fh, ©->fh, sizeof(copy->fh)); + nfsd4_init_cb(&cb_copy->cp_cb, cb_copy->cp_clp, + &nfsd4_cb_offload_ops, NFSPROC4_CLNT_CB_OFFLOAD); + nfsd4_run_cb(&cb_copy->cp_cb); +out: + cleanup_async_copy(copy); + return 0; +} + static __be32 nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_copy *copy = &u->copy; - struct file *src, *dst; __be32 status; - ssize_t bytes; + struct nfsd4_copy *async_copy = NULL; - status = nfsd4_verify_copy(rqstp, cstate, ©->cp_src_stateid, &src, - ©->cp_dst_stateid, &dst); + status = nfsd4_verify_copy(rqstp, cstate, ©->cp_src_stateid, + ©->file_src, ©->cp_dst_stateid, + ©->file_dst); if (status) goto out; - bytes = nfsd_copy_file_range(src, copy->cp_src_pos, - dst, copy->cp_dst_pos, copy->cp_count); + copy->cp_clp = cstate->clp; + memcpy(©->fh, &cstate->current_fh.fh_handle, + sizeof(struct knfsd_fh)); + if (!copy->cp_synchronous) { + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - if (bytes < 0) - status = nfserrno(bytes); - else { - copy->cp_res.wr_bytes_written = bytes; - copy->cp_res.wr_stable_how = NFS_UNSTABLE; - copy->cp_synchronous = 1; - gen_boot_verifier(©->cp_res.wr_verifier, SVC_NET(rqstp)); + status = nfserrno(-ENOMEM); + async_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL); + if (!async_copy) + goto out; + if (!nfs4_init_cp_state(nn, copy)) { + kfree(async_copy); + goto out; + } + refcount_set(&async_copy->refcount, 1); + memcpy(©->cp_res.cb_stateid, ©->cp_stateid, + sizeof(copy->cp_stateid)); + dup_copy_fields(copy, async_copy); + async_copy->copy_task = kthread_create(nfsd4_do_async_copy, + async_copy, "%s", "copy thread"); + if (IS_ERR(async_copy->copy_task)) + goto out_err; + spin_lock(&async_copy->cp_clp->async_lock); + list_add(&async_copy->copies, + &async_copy->cp_clp->async_copies); + spin_unlock(&async_copy->cp_clp->async_lock); + wake_up_process(async_copy->copy_task); status = nfs_ok; + } else + status = nfsd4_do_copy(copy, 1); +out: + return status; +out_err: + cleanup_async_copy(async_copy); + goto out; +} + +struct nfsd4_copy * +find_async_copy(struct nfs4_client *clp, stateid_t *stateid) +{ + struct nfsd4_copy *copy; + + spin_lock(&clp->async_lock); + list_for_each_entry(copy, &clp->async_copies, copies) { + if (memcmp(©->cp_stateid, stateid, NFS4_STATEID_SIZE)) + continue; + refcount_inc(©->refcount); + spin_unlock(&clp->async_lock); + return copy; } + spin_unlock(&clp->async_lock); + return NULL; +} + +static __be32 +nfsd4_offload_cancel(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_offload_status *os = &u->offload_status; + __be32 status = 0; + struct nfsd4_copy *copy; + struct nfs4_client *clp = cstate->clp; + + copy = find_async_copy(clp, &os->stateid); + if (copy) + nfsd4_stop_copy(copy); + else + status = nfserr_bad_stateid; - fput(src); - fput(dst); -out: return status; } @@ -1144,6 +1363,25 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fput(file); return status; } +static __be32 +nfsd4_offload_status(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_offload_status *os = &u->offload_status; + __be32 status = 0; + struct nfsd4_copy *copy; + struct nfs4_client *clp = cstate->clp; + + copy = find_async_copy(clp, &os->stateid); + if (copy) { + os->count = copy->cp_res.wr_bytes_written; + nfs4_put_copy(copy); + } else + status = nfserr_bad_stateid; + + return status; +} static __be32 nfsd4_allocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, @@ -2047,6 +2285,14 @@ static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) 1 /* cr_synchronous */) * sizeof(__be32); } +static inline u32 nfsd4_offload_status_rsize(struct svc_rqst *rqstp, + struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 2 /* osr_count */ + + 1 /* osr_complete<1> optional 0 for now */) * sizeof(__be32); +} + #ifdef CONFIG_NFSD_PNFS static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { @@ -2460,6 +2706,17 @@ static const struct nfsd4_operation nfsd4_ops[] = { .op_name = "OP_SEEK", .op_rsize_bop = nfsd4_seek_rsize, }, + [OP_OFFLOAD_STATUS] = { + .op_func = nfsd4_offload_status, + .op_name = "OP_OFFLOAD_STATUS", + .op_rsize_bop = nfsd4_offload_status_rsize, + }, + [OP_OFFLOAD_CANCEL] = { + .op_func = nfsd4_offload_cancel, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_OFFLOAD_CANCEL", + .op_rsize_bop = nfsd4_only_status_rsize, + }, }; /** diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b0ca0efd2875..f093fbe47133 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -713,6 +713,36 @@ out_free: return NULL; } +/* + * Create a unique stateid_t to represent each COPY. + */ +int nfs4_init_cp_state(struct nfsd_net *nn, struct nfsd4_copy *copy) +{ + int new_id; + + idr_preload(GFP_KERNEL); + spin_lock(&nn->s2s_cp_lock); + new_id = idr_alloc_cyclic(&nn->s2s_cp_stateids, copy, 0, 0, GFP_NOWAIT); + spin_unlock(&nn->s2s_cp_lock); + idr_preload_end(); + if (new_id < 0) + return 0; + copy->cp_stateid.si_opaque.so_id = new_id; + copy->cp_stateid.si_opaque.so_clid.cl_boot = nn->boot_time; + copy->cp_stateid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id; + return 1; +} + +void nfs4_free_cp_state(struct nfsd4_copy *copy) +{ + struct nfsd_net *nn; + + nn = net_generic(copy->cp_clp->net, nfsd_net_id); + spin_lock(&nn->s2s_cp_lock); + idr_remove(&nn->s2s_cp_stateids, copy->cp_stateid.si_opaque.so_id); + spin_unlock(&nn->s2s_cp_lock); +} + static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp) { struct nfs4_stid *stid; @@ -1827,6 +1857,8 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) #ifdef CONFIG_NFSD_PNFS INIT_LIST_HEAD(&clp->cl_lo_states); #endif + INIT_LIST_HEAD(&clp->async_copies); + spin_lock_init(&clp->async_lock); spin_lock_init(&clp->cl_lock); rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); return clp; @@ -1942,6 +1974,7 @@ __destroy_client(struct nfs4_client *clp) } } nfsd4_return_all_client_layouts(clp); + nfsd4_shutdown_copy(clp); nfsd4_shutdown_callback(clp); if (clp->cl_cb_conn.cb_xprt) svc_xprt_put(clp->cl_cb_conn.cb_xprt); @@ -2475,7 +2508,8 @@ static bool client_has_state(struct nfs4_client *clp) || !list_empty(&clp->cl_lo_states) #endif || !list_empty(&clp->cl_delegations) - || !list_empty(&clp->cl_sessions); + || !list_empty(&clp->cl_sessions) + || !list_empty(&clp->async_copies); } __be32 @@ -4364,7 +4398,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ); if (!fl) - goto out_stid; + goto out_clnt_odstate; status = vfs_setlease(fp->fi_deleg_file, fl->fl_type, &fl, NULL); if (fl) @@ -4389,7 +4423,6 @@ out_unlock: vfs_setlease(fp->fi_deleg_file, F_UNLCK, NULL, (void **)&dp); out_clnt_odstate: put_clnt_odstate(dp->dl_clnt_odstate); -out_stid: nfs4_put_stid(&dp->dl_stid); out_delegees: put_deleg_file(fp); @@ -7161,6 +7194,8 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->close_lru); INIT_LIST_HEAD(&nn->del_recall_lru); spin_lock_init(&nn->client_lock); + spin_lock_init(&nn->s2s_cp_lock); + idr_init(&nn->s2s_cp_stateids); spin_lock_init(&nn->blocked_locks_lock); INIT_LIST_HEAD(&nn->blocked_locks_lru); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 418fa9c78186..3de42a729093 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1768,6 +1768,13 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) } static __be32 +nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp, + struct nfsd4_offload_status *os) +{ + return nfsd4_decode_stateid(argp, &os->stateid); +} + +static __be32 nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) { DECODE_HEAD; @@ -1873,8 +1880,8 @@ static const nfsd4_dec nfsd4_dec_ops[] = { [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_offload_status, + [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_offload_status, [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek, [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp, @@ -4224,15 +4231,27 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, #endif /* CONFIG_NFSD_PNFS */ static __be32 -nfsd42_encode_write_res(struct nfsd4_compoundres *resp, struct nfsd42_write_res *write) +nfsd42_encode_write_res(struct nfsd4_compoundres *resp, + struct nfsd42_write_res *write, bool sync) { __be32 *p; + p = xdr_reserve_space(&resp->xdr, 4); + if (!p) + return nfserr_resource; - p = xdr_reserve_space(&resp->xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE); + if (sync) + *p++ = cpu_to_be32(0); + else { + __be32 nfserr; + *p++ = cpu_to_be32(1); + nfserr = nfsd4_encode_stateid(&resp->xdr, &write->cb_stateid); + if (nfserr) + return nfserr; + } + p = xdr_reserve_space(&resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE); if (!p) return nfserr_resource; - *p++ = cpu_to_be32(0); p = xdr_encode_hyper(p, write->wr_bytes_written); *p++ = cpu_to_be32(write->wr_stable_how); p = xdr_encode_opaque_fixed(p, write->wr_verifier.data, @@ -4246,7 +4265,8 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, { __be32 *p; - nfserr = nfsd42_encode_write_res(resp, ©->cp_res); + nfserr = nfsd42_encode_write_res(resp, ©->cp_res, + copy->cp_synchronous); if (nfserr) return nfserr; @@ -4257,6 +4277,22 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, } static __be32 +nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_offload_status *os) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + p = xdr_reserve_space(xdr, 8 + 4); + if (!p) + return nfserr_resource; + p = xdr_encode_hyper(p, os->count); + *p++ = cpu_to_be32(0); + + return nfserr; +} + +static __be32 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_seek *seek) { @@ -4359,7 +4395,7 @@ static const nfsd4_enc nfsd4_enc_ops[] = { [OP_LAYOUTERROR] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop, [OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop, - [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_noop, + [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_offload_status, [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop, [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek, [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop, diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index dbdeb9d6af03..e2fe0e9ce0df 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -30,6 +30,7 @@ #define TARGET_BUCKET_SIZE 64 struct nfsd_drc_bucket { + struct rb_root rb_head; struct list_head lru_head; spinlock_t cache_lock; }; @@ -121,7 +122,7 @@ nfsd_cache_hash(__be32 xid) } static struct svc_cacherep * -nfsd_reply_cache_alloc(void) +nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum) { struct svc_cacherep *rp; @@ -129,21 +130,35 @@ nfsd_reply_cache_alloc(void) if (rp) { rp->c_state = RC_UNUSED; rp->c_type = RC_NOCACHE; + RB_CLEAR_NODE(&rp->c_node); INIT_LIST_HEAD(&rp->c_lru); + + memset(&rp->c_key, 0, sizeof(rp->c_key)); + rp->c_key.k_xid = rqstp->rq_xid; + rp->c_key.k_proc = rqstp->rq_proc; + rpc_copy_addr((struct sockaddr *)&rp->c_key.k_addr, svc_addr(rqstp)); + rpc_set_port((struct sockaddr *)&rp->c_key.k_addr, rpc_get_port(svc_addr(rqstp))); + rp->c_key.k_prot = rqstp->rq_prot; + rp->c_key.k_vers = rqstp->rq_vers; + rp->c_key.k_len = rqstp->rq_arg.len; + rp->c_key.k_csum = csum; } return rp; } static void -nfsd_reply_cache_free_locked(struct svc_cacherep *rp) +nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp) { if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) { drc_mem_usage -= rp->c_replvec.iov_len; kfree(rp->c_replvec.iov_base); } - list_del(&rp->c_lru); - atomic_dec(&num_drc_entries); - drc_mem_usage -= sizeof(*rp); + if (rp->c_state != RC_UNUSED) { + rb_erase(&rp->c_node, &b->rb_head); + list_del(&rp->c_lru); + atomic_dec(&num_drc_entries); + drc_mem_usage -= sizeof(*rp); + } kmem_cache_free(drc_slab, rp); } @@ -151,7 +166,7 @@ static void nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp) { spin_lock(&b->cache_lock); - nfsd_reply_cache_free_locked(rp); + nfsd_reply_cache_free_locked(b, rp); spin_unlock(&b->cache_lock); } @@ -207,7 +222,7 @@ void nfsd_reply_cache_shutdown(void) struct list_head *head = &drc_hashtbl[i].lru_head; while (!list_empty(head)) { rp = list_first_entry(head, struct svc_cacherep, c_lru); - nfsd_reply_cache_free_locked(rp); + nfsd_reply_cache_free_locked(&drc_hashtbl[i], rp); } } @@ -246,7 +261,7 @@ prune_bucket(struct nfsd_drc_bucket *b) if (atomic_read(&num_drc_entries) <= max_drc_entries && time_before(jiffies, rp->c_timestamp + RC_EXPIRE)) break; - nfsd_reply_cache_free_locked(rp); + nfsd_reply_cache_free_locked(b, rp); freed++; } return freed; @@ -318,51 +333,48 @@ nfsd_cache_csum(struct svc_rqst *rqstp) return csum; } -static bool -nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp) +static int +nfsd_cache_key_cmp(const struct svc_cacherep *key, const struct svc_cacherep *rp) { - /* Check RPC XID first */ - if (rqstp->rq_xid != rp->c_xid) - return false; - /* compare checksum of NFS data */ - if (csum != rp->c_csum) { + if (key->c_key.k_xid == rp->c_key.k_xid && + key->c_key.k_csum != rp->c_key.k_csum) ++payload_misses; - return false; - } - /* Other discriminators */ - if (rqstp->rq_proc != rp->c_proc || - rqstp->rq_prot != rp->c_prot || - rqstp->rq_vers != rp->c_vers || - rqstp->rq_arg.len != rp->c_len || - !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) || - rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr)) - return false; - - return true; + return memcmp(&key->c_key, &rp->c_key, sizeof(key->c_key)); } /* * Search the request hash for an entry that matches the given rqstp. * Must be called with cache_lock held. Returns the found entry or - * NULL on failure. + * inserts an empty key on failure. */ static struct svc_cacherep * -nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp, - __wsum csum) +nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key) { - struct svc_cacherep *rp, *ret = NULL; - struct list_head *rh = &b->lru_head; + struct svc_cacherep *rp, *ret = key; + struct rb_node **p = &b->rb_head.rb_node, + *parent = NULL; unsigned int entries = 0; + int cmp; - list_for_each_entry(rp, rh, c_lru) { + while (*p != NULL) { ++entries; - if (nfsd_cache_match(rqstp, csum, rp)) { + parent = *p; + rp = rb_entry(parent, struct svc_cacherep, c_node); + + cmp = nfsd_cache_key_cmp(key, rp); + if (cmp < 0) + p = &parent->rb_left; + else if (cmp > 0) + p = &parent->rb_right; + else { ret = rp; - break; + goto out; } } - + rb_link_node(&key->c_node, parent, p); + rb_insert_color(&key->c_node, &b->rb_head); +out: /* tally hash chain length stats */ if (entries > longest_chain) { longest_chain = entries; @@ -374,6 +386,7 @@ nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp, atomic_read(&num_drc_entries)); } + lru_put_end(b, ret); return ret; } @@ -389,9 +402,6 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) { struct svc_cacherep *rp, *found; __be32 xid = rqstp->rq_xid; - u32 proto = rqstp->rq_prot, - vers = rqstp->rq_vers, - proc = rqstp->rq_proc; __wsum csum; u32 hash = nfsd_cache_hash(xid); struct nfsd_drc_bucket *b = &drc_hashtbl[hash]; @@ -410,60 +420,38 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) * Since the common case is a cache miss followed by an insert, * preallocate an entry. */ - rp = nfsd_reply_cache_alloc(); - spin_lock(&b->cache_lock); - if (likely(rp)) { - atomic_inc(&num_drc_entries); - drc_mem_usage += sizeof(*rp); + rp = nfsd_reply_cache_alloc(rqstp, csum); + if (!rp) { + dprintk("nfsd: unable to allocate DRC entry!\n"); + return rtn; } - /* go ahead and prune the cache */ - prune_bucket(b); - - found = nfsd_cache_search(b, rqstp, csum); - if (found) { - if (likely(rp)) - nfsd_reply_cache_free_locked(rp); + spin_lock(&b->cache_lock); + found = nfsd_cache_insert(b, rp); + if (found != rp) { + nfsd_reply_cache_free_locked(NULL, rp); rp = found; goto found_entry; } - if (!rp) { - dprintk("nfsd: unable to allocate DRC entry!\n"); - goto out; - } - nfsdstats.rcmisses++; rqstp->rq_cacherep = rp; rp->c_state = RC_INPROG; - rp->c_xid = xid; - rp->c_proc = proc; - rpc_copy_addr((struct sockaddr *)&rp->c_addr, svc_addr(rqstp)); - rpc_set_port((struct sockaddr *)&rp->c_addr, rpc_get_port(svc_addr(rqstp))); - rp->c_prot = proto; - rp->c_vers = vers; - rp->c_len = rqstp->rq_arg.len; - rp->c_csum = csum; - lru_put_end(b, rp); + atomic_inc(&num_drc_entries); + drc_mem_usage += sizeof(*rp); - /* release any buffer */ - if (rp->c_type == RC_REPLBUFF) { - drc_mem_usage -= rp->c_replvec.iov_len; - kfree(rp->c_replvec.iov_base); - rp->c_replvec.iov_base = NULL; - } - rp->c_type = RC_NOCACHE; + /* go ahead and prune the cache */ + prune_bucket(b); out: spin_unlock(&b->cache_lock); return rtn; found_entry: - nfsdstats.rchits++; /* We found a matching entry which is either in progress or done. */ - lru_put_end(b, rp); - + nfsdstats.rchits++; rtn = RC_DROPIT; + /* Request being processed */ if (rp->c_state == RC_INPROG) goto out; @@ -489,7 +477,7 @@ found_entry: break; default: printk(KERN_WARNING "nfsd: bad repcache type %d\n", rp->c_type); - nfsd_reply_cache_free_locked(rp); + nfsd_reply_cache_free_locked(b, rp); } goto out; @@ -524,7 +512,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) if (!rp) return; - hash = nfsd_cache_hash(rp->c_xid); + hash = nfsd_cache_hash(rp->c_key.k_xid); b = &drc_hashtbl[hash]; len = resv->iov_len - ((char*)statp - (char*)resv->iov_base); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 7fb9f7c667b1..6384c9b94898 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1242,6 +1242,7 @@ static __net_init int nfsd_init_net(struct net *net) nn->somebody_reclaimed = false; nn->clverifier_counter = prandom_u32(); nn->clientid_counter = prandom_u32(); + nn->s2s_cp_cl_id = nn->clientid_counter++; atomic_set(&nn->ntf_refcnt, 0); init_waitqueue_head(&nn->ntf_wq); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 0b15dac7e609..6aacb325b6a0 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -355,6 +355,8 @@ struct nfs4_client { struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ /* wait here for slots */ struct net *net; + struct list_head async_copies; /* list of async copies */ + spinlock_t async_lock; /* lock for async copies */ }; /* struct nfs4_client_reset @@ -573,6 +575,7 @@ enum nfsd4_cb_op { NFSPROC4_CLNT_CB_NULL = 0, NFSPROC4_CLNT_CB_RECALL, NFSPROC4_CLNT_CB_LAYOUT, + NFSPROC4_CLNT_CB_OFFLOAD, NFSPROC4_CLNT_CB_SEQUENCE, NFSPROC4_CLNT_CB_NOTIFY_LOCK, }; @@ -599,6 +602,7 @@ struct nfsd4_blocked_lock { struct nfsd4_compound_state; struct nfsd_net; +struct nfsd4_copy; extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct svc_fh *fhp, @@ -608,6 +612,8 @@ __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, struct nfs4_stid **s, struct nfsd_net *nn); struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab, void (*sc_free)(struct nfs4_stid *)); +int nfs4_init_cp_state(struct nfsd_net *nn, struct nfsd4_copy *copy); +void nfs4_free_cp_state(struct nfsd4_copy *copy); void nfs4_unhash_stid(struct nfs4_stid *s); void nfs4_put_stid(struct nfs4_stid *s); void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid); @@ -626,6 +632,7 @@ extern void nfsd4_run_cb(struct nfsd4_callback *cb); extern int nfsd4_create_callback_queue(void); extern void nfsd4_destroy_callback_queue(void); extern void nfsd4_shutdown_callback(struct nfs4_client *); +extern void nfsd4_shutdown_copy(struct nfs4_client *clp); extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp); extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name, struct nfsd_net *nn); @@ -633,6 +640,9 @@ extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn); struct nfs4_file *find_file(struct knfsd_fh *fh); void put_nfs4_file(struct nfs4_file *fi); +extern void nfs4_put_copy(struct nfsd4_copy *copy); +extern struct nfsd4_copy * +find_async_copy(struct nfs4_client *clp, stateid_t *staetid); static inline void get_nfs4_file(struct nfs4_file *fi) { refcount_inc(&fi->fi_ref); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index b53e76391e52..eb67098117b4 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -541,8 +541,12 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, u64 dst_pos, u64 count) { - return nfserrno(vfs_clone_file_range(src, src_pos, dst, dst_pos, - count)); + loff_t cloned; + + cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0); + if (count && cloned != count) + cloned = -EINVAL; + return nfserrno(cloned < 0 ? cloned : 0); } ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, @@ -923,7 +927,7 @@ __be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, int host_err; trace_nfsd_read_vector(rqstp, fhp, offset, *count); - iov_iter_kvec(&iter, READ | ITER_KVEC, vec, vlen, *count); + iov_iter_kvec(&iter, READ, vec, vlen, *count); host_err = vfs_iter_read(file, &iter, &offset, 0); return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err); } @@ -999,7 +1003,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (stable && !use_wgather) flags |= RWF_SYNC; - iov_iter_kvec(&iter, WRITE | ITER_KVEC, vec, vlen, *cnt); + iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt); host_err = vfs_iter_write(file, &iter, &pos, flags); if (host_err < 0) goto out_nfserr; @@ -1276,7 +1280,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, dev_t rdev, struct svc_fh *resfhp) { struct dentry *dentry, *dchild = NULL; - struct inode *dirp; __be32 err; int host_err; @@ -1288,7 +1291,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, return err; dentry = fhp->fh_dentry; - dirp = d_inode(dentry); host_err = fh_want_write(fhp); if (host_err) @@ -1409,6 +1411,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, *created = 1; break; } + /* fall through */ case NFS4_CREATE_EXCLUSIVE4_1: if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime && d_inode(dchild)->i_atime.tv_sec == v_atime @@ -1417,7 +1420,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, *created = 1; goto set_attr; } - /* fallthru */ + /* fall through */ case NFS3_CREATE_GUARDED: err = nfserr_exist; } diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 17c453a7999c..feeb6d4bdffd 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -511,6 +511,7 @@ struct nfsd42_write_res { u64 wr_bytes_written; u32 wr_stable_how; nfs4_verifier wr_verifier; + stateid_t cb_stateid; }; struct nfsd4_copy { @@ -526,6 +527,23 @@ struct nfsd4_copy { /* response */ struct nfsd42_write_res cp_res; + + /* for cb_offload */ + struct nfsd4_callback cp_cb; + __be32 nfserr; + struct knfsd_fh fh; + + struct nfs4_client *cp_clp; + + struct file *file_src; + struct file *file_dst; + + stateid_t cp_stateid; + + struct list_head copies; + struct task_struct *copy_task; + refcount_t refcount; + bool stopped; }; struct nfsd4_seek { @@ -539,6 +557,15 @@ struct nfsd4_seek { loff_t seek_pos; }; +struct nfsd4_offload_status { + /* request */ + stateid_t stateid; + + /* response */ + u64 count; + u32 status; +}; + struct nfsd4_op { int opnum; const struct nfsd4_operation * opdesc; @@ -597,6 +624,7 @@ struct nfsd4_op { struct nfsd4_fallocate deallocate; struct nfsd4_clone clone; struct nfsd4_copy copy; + struct nfsd4_offload_status offload_status; struct nfsd4_seek seek; } u; struct nfs4_replay * replay; diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h index 517239af0302..547cf07cf4e0 100644 --- a/fs/nfsd/xdr4cb.h +++ b/fs/nfsd/xdr4cb.h @@ -38,3 +38,13 @@ #define NFS4_dec_cb_notify_lock_sz (cb_compound_dec_hdr_sz + \ cb_sequence_dec_sz + \ op_dec_sz) +#define enc_cb_offload_info_sz (1 + 1 + 2 + 1 + \ + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) +#define NFS4_enc_cb_offload_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + enc_nfs4_fh_sz + \ + enc_stateid_sz + \ + enc_cb_offload_info_sz) +#define NFS4_dec_cb_offload_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index ebb24a314f43..de99db518571 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -168,24 +168,18 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, ctxt->newbh = NULL; if (inode->i_blkbits == PAGE_SHIFT) { - lock_page(obh->b_page); - /* - * We cannot call radix_tree_preload for the kernels older - * than 2.6.23, because it is not exported for modules. - */ + struct page *opage = obh->b_page; + lock_page(opage); retry: - err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); - if (err) - goto failed_unlock; /* BUG_ON(oldkey != obh->b_page->index); */ - if (unlikely(oldkey != obh->b_page->index)) - NILFS_PAGE_BUG(obh->b_page, + if (unlikely(oldkey != opage->index)) + NILFS_PAGE_BUG(opage, "invalid oldkey %lld (newkey=%lld)", (unsigned long long)oldkey, (unsigned long long)newkey); xa_lock_irq(&btnc->i_pages); - err = radix_tree_insert(&btnc->i_pages, newkey, obh->b_page); + err = __xa_insert(&btnc->i_pages, newkey, opage, GFP_NOFS); xa_unlock_irq(&btnc->i_pages); /* * Note: page->index will not change to newkey until @@ -193,7 +187,6 @@ retry: * To protect the page in intermediate state, the page lock * is held. */ - radix_tree_preload_end(); if (!err) return 0; else if (err != -EEXIST) @@ -203,7 +196,7 @@ retry: if (!err) goto retry; /* fallback to copy mode */ - unlock_page(obh->b_page); + unlock_page(opage); } nbh = nilfs_btnode_create_block(btnc, newkey); @@ -243,9 +236,8 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc, mark_buffer_dirty(obh); xa_lock_irq(&btnc->i_pages); - radix_tree_delete(&btnc->i_pages, oldkey); - radix_tree_tag_set(&btnc->i_pages, newkey, - PAGECACHE_TAG_DIRTY); + __xa_erase(&btnc->i_pages, oldkey); + __xa_set_mark(&btnc->i_pages, newkey, PAGECACHE_TAG_DIRTY); xa_unlock_irq(&btnc->i_pages); opage->index = obh->b_blocknr = newkey; @@ -275,7 +267,7 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc, if (nbh == NULL) { /* blocksize == pagesize */ xa_lock_irq(&btnc->i_pages); - radix_tree_delete(&btnc->i_pages, newkey); + __xa_erase(&btnc->i_pages, newkey); xa_unlock_irq(&btnc->i_pages); unlock_page(ctxt->bh->b_page); } else diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 329a056b73b1..d7fc8d369d89 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -289,7 +289,7 @@ repeat: * @dmap: destination page cache * @smap: source page cache * - * No pages must no be added to the cache during this process. + * No pages must be added to the cache during this process. * This must be ensured by the caller. */ void nilfs_copy_back_pages(struct address_space *dmap, @@ -298,7 +298,6 @@ void nilfs_copy_back_pages(struct address_space *dmap, struct pagevec pvec; unsigned int i, n; pgoff_t index = 0; - int err; pagevec_init(&pvec); repeat: @@ -313,35 +312,34 @@ repeat: lock_page(page); dpage = find_lock_page(dmap, offset); if (dpage) { - /* override existing page on the destination cache */ + /* overwrite existing page in the destination cache */ WARN_ON(PageDirty(dpage)); nilfs_copy_page(dpage, page, 0); unlock_page(dpage); put_page(dpage); + /* Do we not need to remove page from smap here? */ } else { - struct page *page2; + struct page *p; /* move the page to the destination cache */ xa_lock_irq(&smap->i_pages); - page2 = radix_tree_delete(&smap->i_pages, offset); - WARN_ON(page2 != page); - + p = __xa_erase(&smap->i_pages, offset); + WARN_ON(page != p); smap->nrpages--; xa_unlock_irq(&smap->i_pages); xa_lock_irq(&dmap->i_pages); - err = radix_tree_insert(&dmap->i_pages, offset, page); - if (unlikely(err < 0)) { - WARN_ON(err == -EEXIST); + p = __xa_store(&dmap->i_pages, offset, page, GFP_NOFS); + if (unlikely(p)) { + /* Probably -ENOMEM */ page->mapping = NULL; - put_page(page); /* for cache */ + put_page(page); } else { page->mapping = dmap; dmap->nrpages++; if (PageDirty(page)) - radix_tree_tag_set(&dmap->i_pages, - offset, - PAGECACHE_TAG_DIRTY); + __xa_set_mark(&dmap->i_pages, offset, + PAGECACHE_TAG_DIRTY); } xa_unlock_irq(&dmap->i_pages); } @@ -467,8 +465,7 @@ int __nilfs_clear_page_dirty(struct page *page) if (mapping) { xa_lock_irq(&mapping->i_pages); if (test_bit(PG_dirty, &page->flags)) { - radix_tree_tag_clear(&mapping->i_pages, - page_index(page), + __xa_clear_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); xa_unlock_irq(&mapping->i_pages); return clear_page_dirty_for_io(page); diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 94b52157bf8d..5769cf3ff035 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -25,7 +25,7 @@ static bool should_merge(struct fsnotify_event *old_fsn, old = FANOTIFY_E(old_fsn); new = FANOTIFY_E(new_fsn); - if (old_fsn->inode == new_fsn->inode && old->tgid == new->tgid && + if (old_fsn->inode == new_fsn->inode && old->pid == new->pid && old->path.mnt == new->path.mnt && old->path.dentry == new->path.dentry) return true; @@ -131,8 +131,8 @@ static bool fanotify_should_send_event(struct fsnotify_iter_info *iter_info, !(marks_mask & FS_ISDIR & ~marks_ignored_mask)) return false; - if (event_mask & FAN_ALL_OUTGOING_EVENTS & marks_mask & - ~marks_ignored_mask) + if (event_mask & FANOTIFY_OUTGOING_EVENTS & + marks_mask & ~marks_ignored_mask) return true; return false; @@ -171,7 +171,10 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, goto out; init: __maybe_unused fsnotify_init_event(&event->fse, inode, mask); - event->tgid = get_pid(task_tgid(current)); + if (FAN_GROUP_FLAG(group, FAN_REPORT_TID)) + event->pid = get_pid(task_pid(current)); + else + event->pid = get_pid(task_tgid(current)); if (path) { event->path = *path; path_get(&event->path); @@ -205,6 +208,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 10); + if (!fanotify_should_send_event(iter_info, mask, data, data_type)) return 0; @@ -236,7 +241,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, ret = fsnotify_add_event(group, fsn_event, fanotify_merge); if (ret) { /* Permission events shouldn't be merged */ - BUG_ON(ret == 1 && mask & FAN_ALL_PERM_EVENTS); + BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS); /* Our event wasn't used in the end. Free it. */ fsnotify_destroy_event(group, fsn_event); @@ -268,7 +273,7 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event) event = FANOTIFY_E(fsn_event); path_put(&event->path); - put_pid(event->tgid); + put_pid(event->pid); if (fanotify_is_perm_event(fsn_event->mask)) { kmem_cache_free(fanotify_perm_event_cachep, FANOTIFY_PE(fsn_event)); diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 8609ba06f474..ea05b8a401e7 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -19,7 +19,7 @@ struct fanotify_event_info { * during this object's lifetime */ struct path path; - struct pid *tgid; + struct pid *pid; }; /* @@ -44,7 +44,7 @@ FANOTIFY_PE(struct fsnotify_event *fse) static inline bool fanotify_is_perm_event(u32 mask) { return IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS) && - mask & FAN_ALL_PERM_EVENTS; + mask & FANOTIFY_PERM_EVENTS; } static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 69054886915b..e03be5071362 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -131,8 +131,8 @@ static int fill_event_metadata(struct fsnotify_group *group, metadata->metadata_len = FAN_EVENT_METADATA_LEN; metadata->vers = FANOTIFY_METADATA_VERSION; metadata->reserved = 0; - metadata->mask = fsn_event->mask & FAN_ALL_OUTGOING_EVENTS; - metadata->pid = pid_vnr(event->tgid); + metadata->mask = fsn_event->mask & FANOTIFY_OUTGOING_EVENTS; + metadata->pid = pid_vnr(event->pid); if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW)) metadata->fd = FAN_NOFD; else { @@ -191,7 +191,7 @@ static int process_access_response(struct fsnotify_group *group, if (fd < 0) return -EINVAL; - if ((response & FAN_AUDIT) && !group->fanotify_data.audit) + if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT)) return -EINVAL; event = dequeue_event(group, fd); @@ -395,7 +395,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) */ while (!fsnotify_notify_queue_is_empty(group)) { fsn_event = fsnotify_remove_first_event(group); - if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) { + if (!(fsn_event->mask & FANOTIFY_PERM_EVENTS)) { spin_unlock(&group->notification_lock); fsnotify_destroy_event(group, fsn_event); spin_lock(&group->notification_lock); @@ -506,18 +506,10 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, spin_lock(&fsn_mark->lock); if (!(flags & FAN_MARK_IGNORED_MASK)) { - __u32 tmask = fsn_mark->mask & ~mask; - - if (flags & FAN_MARK_ONDIR) - tmask &= ~FAN_ONDIR; - oldmask = fsn_mark->mask; - fsn_mark->mask = tmask; + fsn_mark->mask &= ~mask; } else { - __u32 tmask = fsn_mark->ignored_mask & ~mask; - if (flags & FAN_MARK_ONDIR) - tmask &= ~FAN_ONDIR; - fsn_mark->ignored_mask = tmask; + fsn_mark->ignored_mask &= ~mask; } *destroy = !(fsn_mark->mask | fsn_mark->ignored_mask); spin_unlock(&fsn_mark->lock); @@ -563,6 +555,13 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, mask, flags); } +static int fanotify_remove_sb_mark(struct fsnotify_group *group, + struct super_block *sb, __u32 mask, + unsigned int flags) +{ + return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask, flags); +} + static int fanotify_remove_inode_mark(struct fsnotify_group *group, struct inode *inode, __u32 mask, unsigned int flags) @@ -579,19 +578,10 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, spin_lock(&fsn_mark->lock); if (!(flags & FAN_MARK_IGNORED_MASK)) { - __u32 tmask = fsn_mark->mask | mask; - - if (flags & FAN_MARK_ONDIR) - tmask |= FAN_ONDIR; - oldmask = fsn_mark->mask; - fsn_mark->mask = tmask; + fsn_mark->mask |= mask; } else { - __u32 tmask = fsn_mark->ignored_mask | mask; - if (flags & FAN_MARK_ONDIR) - tmask |= FAN_ONDIR; - - fsn_mark->ignored_mask = tmask; + fsn_mark->ignored_mask |= mask; if (flags & FAN_MARK_IGNORED_SURV_MODIFY) fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; } @@ -658,6 +648,14 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags); } +static int fanotify_add_sb_mark(struct fsnotify_group *group, + struct super_block *sb, __u32 mask, + unsigned int flags) +{ + return fanotify_add_mark(group, &sb->s_fsnotify_marks, + FSNOTIFY_OBJ_TYPE_SB, mask, flags); +} + static int fanotify_add_inode_mark(struct fsnotify_group *group, struct inode *inode, __u32 mask, unsigned int flags) @@ -686,16 +684,16 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) struct user_struct *user; struct fanotify_event_info *oevent; - pr_debug("%s: flags=%d event_f_flags=%d\n", - __func__, flags, event_f_flags); + pr_debug("%s: flags=%x event_f_flags=%x\n", + __func__, flags, event_f_flags); if (!capable(CAP_SYS_ADMIN)) return -EPERM; #ifdef CONFIG_AUDITSYSCALL - if (flags & ~(FAN_ALL_INIT_FLAGS | FAN_ENABLE_AUDIT)) + if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT)) #else - if (flags & ~FAN_ALL_INIT_FLAGS) + if (flags & ~FANOTIFY_INIT_FLAGS) #endif return -EINVAL; @@ -731,6 +729,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) } group->fanotify_data.user = user; + group->fanotify_data.flags = flags; atomic_inc(&user->fanotify_listeners); group->memcg = get_mem_cgroup_from_mm(current->mm); @@ -746,7 +745,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.f_flags = event_f_flags; init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); - switch (flags & FAN_ALL_CLASS_BITS) { + switch (flags & FANOTIFY_CLASS_BITS) { case FAN_CLASS_NOTIF: group->priority = FS_PRIO_0; break; @@ -783,7 +782,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) fd = -EPERM; if (!capable(CAP_AUDIT_WRITE)) goto out_destroy_group; - group->fanotify_data.audit = true; } fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags); @@ -805,7 +803,8 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, struct fsnotify_group *group; struct fd f; struct path path; - u32 valid_mask = FAN_ALL_EVENTS | FAN_EVENT_ON_CHILD; + u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; + unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; int ret; pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n", @@ -815,8 +814,18 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (mask & ((__u64)0xffffffff << 32)) return -EINVAL; - if (flags & ~FAN_ALL_MARK_FLAGS) + if (flags & ~FANOTIFY_MARK_FLAGS) + return -EINVAL; + + switch (mark_type) { + case FAN_MARK_INODE: + case FAN_MARK_MOUNT: + case FAN_MARK_FILESYSTEM: + break; + default: return -EINVAL; + } + switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { case FAN_MARK_ADD: /* fallthrough */ case FAN_MARK_REMOVE: @@ -824,20 +833,15 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, return -EINVAL; break; case FAN_MARK_FLUSH: - if (flags & ~(FAN_MARK_MOUNT | FAN_MARK_FLUSH)) + if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH)) return -EINVAL; break; default: return -EINVAL; } - if (mask & FAN_ONDIR) { - flags |= FAN_MARK_ONDIR; - mask &= ~FAN_ONDIR; - } - if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) - valid_mask |= FAN_ALL_PERM_EVENTS; + valid_mask |= FANOTIFY_PERM_EVENTS; if (mask & ~valid_mask) return -EINVAL; @@ -857,14 +861,16 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, * allowed to set permissions events. */ ret = -EINVAL; - if (mask & FAN_ALL_PERM_EVENTS && + if (mask & FANOTIFY_PERM_EVENTS && group->priority == FS_PRIO_0) goto fput_and_out; if (flags & FAN_MARK_FLUSH) { ret = 0; - if (flags & FAN_MARK_MOUNT) + if (mark_type == FAN_MARK_MOUNT) fsnotify_clear_vfsmount_marks_by_group(group); + else if (mark_type == FAN_MARK_FILESYSTEM) + fsnotify_clear_sb_marks_by_group(group); else fsnotify_clear_inode_marks_by_group(group); goto fput_and_out; @@ -875,7 +881,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, goto fput_and_out; /* inode held in place by reference to path; group by fget on fd */ - if (!(flags & FAN_MARK_MOUNT)) + if (mark_type == FAN_MARK_INODE) inode = path.dentry->d_inode; else mnt = path.mnt; @@ -883,14 +889,18 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, /* create/update an inode mark */ switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) { case FAN_MARK_ADD: - if (flags & FAN_MARK_MOUNT) + if (mark_type == FAN_MARK_MOUNT) ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags); + else if (mark_type == FAN_MARK_FILESYSTEM) + ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask, flags); else ret = fanotify_add_inode_mark(group, inode, mask, flags); break; case FAN_MARK_REMOVE: - if (flags & FAN_MARK_MOUNT) + if (mark_type == FAN_MARK_MOUNT) ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags); + else if (mark_type == FAN_MARK_FILESYSTEM) + ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask, flags); else ret = fanotify_remove_inode_mark(group, inode, mask, flags); break; @@ -934,6 +944,9 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, */ static int __init fanotify_user_setup(void) { + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 7); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); + fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 86fcf5814279..348a184bcdda 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -131,37 +131,20 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", mnt->mnt_id, mflags, mark->mask, mark->ignored_mask); + } else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_SB) { + struct super_block *sb = fsnotify_conn_sb(mark->connector); + + seq_printf(m, "fanotify sdev:%x mflags:%x mask:%x ignored_mask:%x\n", + sb->s_dev, mflags, mark->mask, mark->ignored_mask); } } void fanotify_show_fdinfo(struct seq_file *m, struct file *f) { struct fsnotify_group *group = f->private_data; - unsigned int flags = 0; - - switch (group->priority) { - case FS_PRIO_0: - flags |= FAN_CLASS_NOTIF; - break; - case FS_PRIO_1: - flags |= FAN_CLASS_CONTENT; - break; - case FS_PRIO_2: - flags |= FAN_CLASS_PRE_CONTENT; - break; - } - - if (group->max_events == UINT_MAX) - flags |= FAN_UNLIMITED_QUEUE; - - if (group->fanotify_data.max_marks == UINT_MAX) - flags |= FAN_UNLIMITED_MARKS; - - if (group->fanotify_data.audit) - flags |= FAN_ENABLE_AUDIT; seq_printf(m, "fanotify flags:%x event-flags:%x\n", - flags, group->fanotify_data.f_flags); + group->fanotify_data.flags, group->fanotify_data.f_flags); show_fdinfo(m, f, fanotify_fdinfo); } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index ababdbfab537..2172ba516c61 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -48,7 +48,7 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt) * Called during unmount with no locks held, so needs to be safe against * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. */ -void fsnotify_unmount_inodes(struct super_block *sb) +static void fsnotify_unmount_inodes(struct super_block *sb) { struct inode *inode, *iput_inode = NULL; @@ -96,6 +96,15 @@ void fsnotify_unmount_inodes(struct super_block *sb) if (iput_inode) iput(iput_inode); + /* Wait for outstanding inode references from connectors */ + wait_var_event(&sb->s_fsnotify_inode_refs, + !atomic_long_read(&sb->s_fsnotify_inode_refs)); +} + +void fsnotify_sb_delete(struct super_block *sb) +{ + fsnotify_unmount_inodes(sb); + fsnotify_clear_marks_by_sb(sb); } /* @@ -190,7 +199,7 @@ static int send_to_group(struct inode *to_tell, struct fsnotify_iter_info *iter_info) { struct fsnotify_group *group = NULL; - __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); + __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); __u32 marks_mask = 0; __u32 marks_ignored_mask = 0; struct fsnotify_mark *mark; @@ -319,15 +328,17 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, const unsigned char *file_name, u32 cookie) { struct fsnotify_iter_info iter_info = {}; - struct mount *mnt; + struct super_block *sb = NULL; + struct mount *mnt = NULL; + __u32 mnt_or_sb_mask = 0; int ret = 0; - /* global tests shouldn't care about events on child only the specific event */ - __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); + __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); - if (data_is == FSNOTIFY_EVENT_PATH) + if (data_is == FSNOTIFY_EVENT_PATH) { mnt = real_mount(((const struct path *)data)->mnt); - else - mnt = NULL; + sb = mnt->mnt.mnt_sb; + mnt_or_sb_mask = mnt->mnt_fsnotify_mask | sb->s_fsnotify_mask; + } /* * Optimization: srcu_read_lock() has a memory barrier which can @@ -337,16 +348,15 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, * need SRCU to keep them "alive". */ if (!to_tell->i_fsnotify_marks && - (!mnt || !mnt->mnt_fsnotify_marks)) + (!mnt || (!mnt->mnt_fsnotify_marks && !sb->s_fsnotify_marks))) return 0; /* * if this is a modify event we may need to clear the ignored masks - * otherwise return if neither the inode nor the vfsmount care about + * otherwise return if neither the inode nor the vfsmount/sb care about * this type of event. */ if (!(mask & FS_MODIFY) && - !(test_mask & to_tell->i_fsnotify_mask) && - !(mnt && test_mask & mnt->mnt_fsnotify_mask)) + !(test_mask & (to_tell->i_fsnotify_mask | mnt_or_sb_mask))) return 0; iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); @@ -356,11 +366,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, if (mnt) { iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] = fsnotify_first_mark(&mnt->mnt_fsnotify_marks); + iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] = + fsnotify_first_mark(&sb->s_fsnotify_marks); } /* - * We need to merge inode & vfsmount mark lists so that inode mark - * ignore masks are properly reflected for mount mark notifications. + * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark + * ignore masks are properly reflected for mount/sb mark notifications. * That's why this traversal is so complicated... */ while (fsnotify_iter_select_report_types(&iter_info)) { @@ -386,7 +398,7 @@ static __init int fsnotify_init(void) { int ret; - BUG_ON(hweight32(ALL_FSNOTIFY_EVENTS) != 23); + BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 23); ret = init_srcu_struct(&fsnotify_mark_srcu); if (ret) diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 7902653dd577..5a00121fb219 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -21,6 +21,12 @@ static inline struct mount *fsnotify_conn_mount( return container_of(conn->obj, struct mount, mnt_fsnotify_marks); } +static inline struct super_block *fsnotify_conn_sb( + struct fsnotify_mark_connector *conn) +{ + return container_of(conn->obj, struct super_block, s_fsnotify_marks); +} + /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); @@ -43,6 +49,11 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) { fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks); } +/* run the list of all marks associated with sb and destroy them */ +static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) +{ + fsnotify_destroy_marks(&sb->s_fsnotify_marks); +} /* Wait until all marks queued for destruction are destroyed */ extern void fsnotify_wait_marks_destroyed(void); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index ac6978d3208c..105576daca4a 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -815,7 +815,7 @@ static int __init inotify_user_setup(void) BUILD_BUG_ON(IN_ISDIR != FS_ISDIR); BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT); - BUG_ON(hweight32(ALL_INOTIFY_BITS) != 22); + BUILD_BUG_ON(HWEIGHT32(ALL_INOTIFY_BITS) != 22); inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC|SLAB_ACCOUNT); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 59cdb27826de..d2dd16cb5989 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -115,6 +115,8 @@ static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn) return &fsnotify_conn_inode(conn)->i_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask; + else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) + return &fsnotify_conn_sb(conn)->s_fsnotify_mask; return NULL; } @@ -179,19 +181,24 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work) } } -static struct inode *fsnotify_detach_connector_from_object( - struct fsnotify_mark_connector *conn) +static void *fsnotify_detach_connector_from_object( + struct fsnotify_mark_connector *conn, + unsigned int *type) { struct inode *inode = NULL; + *type = conn->type; if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) return NULL; if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) { inode = fsnotify_conn_inode(conn); inode->i_fsnotify_mask = 0; + atomic_long_inc(&inode->i_sb->s_fsnotify_inode_refs); } else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0; + } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) { + fsnotify_conn_sb(conn)->s_fsnotify_mask = 0; } rcu_assign_pointer(*(conn->obj), NULL); @@ -211,10 +218,29 @@ static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark) fsnotify_put_group(group); } +/* Drop object reference originally held by a connector */ +static void fsnotify_drop_object(unsigned int type, void *objp) +{ + struct inode *inode; + struct super_block *sb; + + if (!objp) + return; + /* Currently only inode references are passed to be dropped */ + if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE)) + return; + inode = objp; + sb = inode->i_sb; + iput(inode); + if (atomic_long_dec_and_test(&sb->s_fsnotify_inode_refs)) + wake_up_var(&sb->s_fsnotify_inode_refs); +} + void fsnotify_put_mark(struct fsnotify_mark *mark) { struct fsnotify_mark_connector *conn; - struct inode *inode = NULL; + void *objp = NULL; + unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED; bool free_conn = false; /* Catch marks that were actually never attached to object */ @@ -234,7 +260,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) conn = mark->connector; hlist_del_init_rcu(&mark->obj_list); if (hlist_empty(&conn->list)) { - inode = fsnotify_detach_connector_from_object(conn); + objp = fsnotify_detach_connector_from_object(conn, &type); free_conn = true; } else { __fsnotify_recalc_mask(conn); @@ -242,7 +268,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) mark->connector = NULL; spin_unlock(&conn->lock); - iput(inode); + fsnotify_drop_object(type, objp); if (free_conn) { spin_lock(&destroy_lock); @@ -709,7 +735,8 @@ void fsnotify_destroy_marks(fsnotify_connp_t *connp) { struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark, *old_mark = NULL; - struct inode *inode; + void *objp; + unsigned int type; conn = fsnotify_grab_connector(connp); if (!conn) @@ -735,11 +762,11 @@ void fsnotify_destroy_marks(fsnotify_connp_t *connp) * mark references get dropped. It would lead to strange results such * as delaying inode deletion or blocking unmount. */ - inode = fsnotify_detach_connector_from_object(conn); + objp = fsnotify_detach_connector_from_object(conn, &type); spin_unlock(&conn->lock); if (old_mark) fsnotify_put_mark(old_mark); - iput(inode); + fsnotify_drop_object(type, objp); } /* diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 4690cd75d8d7..3986c7a1f6a8 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -312,7 +312,7 @@ static struct dentry *ntfs_get_parent(struct dentry *child_dent) /* Get the mft record of the inode belonging to the child dentry. */ mrec = map_mft_record(ni); if (IS_ERR(mrec)) - return (struct dentry *)mrec; + return ERR_CAST(mrec); /* Find the first file name attribute in the mft record. */ ctx = ntfs_attr_get_search_ctx(ni, mrec); if (unlikely(!ctx)) { diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index a342f008e42f..d1cbb27808e2 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5106,8 +5106,6 @@ int ocfs2_split_extent(handle_t *handle, * rightmost extent list. */ if (path->p_tree_depth) { - struct ocfs2_extent_block *eb; - ret = ocfs2_read_extent_block(et->et_ci, ocfs2_et_get_last_eb_blk(et), &last_eb_bh); @@ -5115,8 +5113,6 @@ int ocfs2_split_extent(handle_t *handle, mlog_errno(ret); goto out; } - - eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; } if (rec->e_cpos == split_rec->e_cpos && diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 302cd7caa4a7..da578ad4c08f 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1392,8 +1392,7 @@ retry: unlock: spin_unlock(&oi->ip_lock); out: - if (new) - kfree(new); + kfree(new); return ret; } diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 7d9eea7d4a87..e9f236af1927 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -916,7 +916,7 @@ static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) { struct kvec vec = { .iov_len = len, .iov_base = data, }; struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, len); + iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, len); return sock_recvmsg(sock, &msg, MSG_DONTWAIT); } diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 9b984cae4c4e..1d6dc8422899 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -329,7 +329,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle) { char *buf; - buf = (char *) get_zeroed_page(GFP_NOFS); + buf = (char *) get_zeroed_page(GFP_ATOMIC); if (buf) { dump_mle(mle, buf, PAGE_SIZE - 1); free_page((unsigned long)buf); diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 838a06d4066a..074d5de17bb2 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -531,7 +531,7 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) assert_spin_locked(&res->spinlock); /* don't shuffle secondary queues */ - if ((res->owner == dlm->node_num)) { + if (res->owner == dlm->node_num) { if (res->state & (DLM_LOCK_RES_MIGRATING | DLM_LOCK_RES_BLOCK_DIRTY)) return; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 9fa35cb6f6e0..fe570824b991 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2527,24 +2527,79 @@ out: return offset; } -static int ocfs2_file_clone_range(struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len) +static loff_t ocfs2_remap_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { - return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, false); -} + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb); + struct buffer_head *in_bh = NULL, *out_bh = NULL; + bool same_inode = (inode_in == inode_out); + loff_t remapped = 0; + ssize_t ret; -static int ocfs2_file_dedupe_range(struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len) -{ - return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, true); + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) + return -EINVAL; + if (!ocfs2_refcount_tree(osb)) + return -EOPNOTSUPP; + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + /* Lock both files against IO */ + ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh); + if (ret) + return ret; + + /* Check file eligibility and prepare for block sharing. */ + ret = -EINVAL; + if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) || + (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE)) + goto out_unlock; + + ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, + &len, remap_flags); + if (ret < 0 || len == 0) + goto out_unlock; + + /* Lock out changes to the allocation maps and remap. */ + down_write(&OCFS2_I(inode_in)->ip_alloc_sem); + if (!same_inode) + down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem, + SINGLE_DEPTH_NESTING); + + /* Zap any page cache for the destination file's range. */ + truncate_inode_pages_range(&inode_out->i_data, + round_down(pos_out, PAGE_SIZE), + round_up(pos_out + len, PAGE_SIZE) - 1); + + remapped = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, + inode_out, out_bh, pos_out, len); + up_write(&OCFS2_I(inode_in)->ip_alloc_sem); + if (!same_inode) + up_write(&OCFS2_I(inode_out)->ip_alloc_sem); + if (remapped < 0) { + ret = remapped; + mlog_errno(ret); + goto out_unlock; + } + + /* + * Empty the extent map so that we may get the right extent + * record from the disk. + */ + ocfs2_extent_map_trunc(inode_in, 0); + ocfs2_extent_map_trunc(inode_out, 0); + + ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + +out_unlock: + ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh); + return remapped > 0 ? remapped : ret; } const struct inode_operations ocfs2_file_iops = { @@ -2586,8 +2641,7 @@ const struct file_operations ocfs2_fops = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, - .clone_file_range = ocfs2_file_clone_range, - .dedupe_file_range = ocfs2_file_dedupe_range, + .remap_file_range = ocfs2_remap_file_range, }; const struct file_operations ocfs2_dops = { @@ -2633,8 +2687,7 @@ const struct file_operations ocfs2_fops_no_plocks = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, - .clone_file_range = ocfs2_file_clone_range, - .dedupe_file_range = ocfs2_file_dedupe_range, + .remap_file_range = ocfs2_remap_file_range, }; const struct file_operations ocfs2_dops_no_plocks = { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7a5ee145c733..a35259eebc56 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4135,7 +4135,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode, struct buffer_head *ref_root_bh = NULL; struct ocfs2_cached_dealloc_ctxt dealloc; struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); - struct ocfs2_refcount_block *rb; struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data; struct ocfs2_refcount_tree *ref_tree; @@ -4162,7 +4161,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode, mlog_errno(ret); goto out; } - rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh, &ref_tree->rf_ci, ref_root_bh, @@ -4468,9 +4466,9 @@ out: } /* Update destination inode size, if necessary. */ -static int ocfs2_reflink_update_dest(struct inode *dest, - struct buffer_head *d_bh, - loff_t newlen) +int ocfs2_reflink_update_dest(struct inode *dest, + struct buffer_head *d_bh, + loff_t newlen) { handle_t *handle; int ret; @@ -4507,14 +4505,14 @@ out_commit: } /* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */ -static int ocfs2_reflink_remap_extent(struct inode *s_inode, - struct buffer_head *s_bh, - loff_t pos_in, - struct inode *t_inode, - struct buffer_head *t_bh, - loff_t pos_out, - loff_t len, - struct ocfs2_cached_dealloc_ctxt *dealloc) +static loff_t ocfs2_reflink_remap_extent(struct inode *s_inode, + struct buffer_head *s_bh, + loff_t pos_in, + struct inode *t_inode, + struct buffer_head *t_bh, + loff_t pos_out, + loff_t len, + struct ocfs2_cached_dealloc_ctxt *dealloc) { struct ocfs2_extent_tree s_et; struct ocfs2_extent_tree t_et; @@ -4522,8 +4520,9 @@ static int ocfs2_reflink_remap_extent(struct inode *s_inode, struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *ref_tree; struct ocfs2_super *osb; + loff_t remapped_bytes = 0; loff_t pstart, plen; - u32 p_cluster, num_clusters, slast, spos, tpos; + u32 p_cluster, num_clusters, slast, spos, tpos, remapped_clus = 0; unsigned int ext_flags; int ret = 0; @@ -4605,30 +4604,34 @@ static int ocfs2_reflink_remap_extent(struct inode *s_inode, next_loop: spos += num_clusters; tpos += num_clusters; + remapped_clus += num_clusters; } -out: - return ret; + goto out; out_unlock_refcount: ocfs2_unlock_refcount_tree(osb, ref_tree, 1); brelse(ref_root_bh); - return ret; +out: + remapped_bytes = ocfs2_clusters_to_bytes(t_inode->i_sb, remapped_clus); + remapped_bytes = min_t(loff_t, len, remapped_bytes); + + return remapped_bytes > 0 ? remapped_bytes : ret; } /* Set up refcount tree and remap s_inode to t_inode. */ -static int ocfs2_reflink_remap_blocks(struct inode *s_inode, - struct buffer_head *s_bh, - loff_t pos_in, - struct inode *t_inode, - struct buffer_head *t_bh, - loff_t pos_out, - loff_t len) +loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode, + struct buffer_head *s_bh, + loff_t pos_in, + struct inode *t_inode, + struct buffer_head *t_bh, + loff_t pos_out, + loff_t len) { struct ocfs2_cached_dealloc_ctxt dealloc; struct ocfs2_super *osb; struct ocfs2_dinode *dis; struct ocfs2_dinode *dit; - int ret; + loff_t ret; osb = OCFS2_SB(s_inode->i_sb); dis = (struct ocfs2_dinode *)s_bh->b_data; @@ -4700,7 +4703,7 @@ static int ocfs2_reflink_remap_blocks(struct inode *s_inode, /* Actually remap extents now. */ ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh, pos_out, len, &dealloc); - if (ret) { + if (ret < 0) { mlog_errno(ret); goto out; } @@ -4715,10 +4718,10 @@ out: } /* Lock an inode and grab a bh pointing to the inode. */ -static int ocfs2_reflink_inodes_lock(struct inode *s_inode, - struct buffer_head **bh1, - struct inode *t_inode, - struct buffer_head **bh2) +int ocfs2_reflink_inodes_lock(struct inode *s_inode, + struct buffer_head **bh1, + struct inode *t_inode, + struct buffer_head **bh2) { struct inode *inode1; struct inode *inode2; @@ -4803,10 +4806,10 @@ out_i1: } /* Unlock both inodes and release buffers. */ -static void ocfs2_reflink_inodes_unlock(struct inode *s_inode, - struct buffer_head *s_bh, - struct inode *t_inode, - struct buffer_head *t_bh) +void ocfs2_reflink_inodes_unlock(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh) { ocfs2_inode_unlock(s_inode, 1); ocfs2_rw_unlock(s_inode, 1); @@ -4818,82 +4821,3 @@ static void ocfs2_reflink_inodes_unlock(struct inode *s_inode, } unlock_two_nondirectories(s_inode, t_inode); } - -/* Link a range of blocks from one file to another. */ -int ocfs2_reflink_remap_range(struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len, - bool is_dedupe) -{ - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb); - struct buffer_head *in_bh = NULL, *out_bh = NULL; - bool same_inode = (inode_in == inode_out); - ssize_t ret; - - if (!ocfs2_refcount_tree(osb)) - return -EOPNOTSUPP; - if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) - return -EROFS; - - /* Lock both files against IO */ - ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh); - if (ret) - return ret; - - /* Check file eligibility and prepare for block sharing. */ - ret = -EINVAL; - if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) || - (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE)) - goto out_unlock; - - ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, - &len, is_dedupe); - if (ret <= 0) - goto out_unlock; - - /* Lock out changes to the allocation maps and remap. */ - down_write(&OCFS2_I(inode_in)->ip_alloc_sem); - if (!same_inode) - down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem, - SINGLE_DEPTH_NESTING); - - ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out, - out_bh, pos_out, len); - - /* Zap any page cache for the destination file's range. */ - if (!ret) - truncate_inode_pages_range(&inode_out->i_data, pos_out, - PAGE_ALIGN(pos_out + len) - 1); - - up_write(&OCFS2_I(inode_in)->ip_alloc_sem); - if (!same_inode) - up_write(&OCFS2_I(inode_out)->ip_alloc_sem); - if (ret) { - mlog_errno(ret); - goto out_unlock; - } - - /* - * Empty the extent map so that we may get the right extent - * record from the disk. - */ - ocfs2_extent_map_trunc(inode_in, 0); - ocfs2_extent_map_trunc(inode_out, 0); - - ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len); - if (ret) { - mlog_errno(ret); - goto out_unlock; - } - - ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh); - return 0; - -out_unlock: - ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh); - return ret; -} diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 4af55bf4b35b..e9e862be4a1e 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -115,11 +115,23 @@ int ocfs2_reflink_ioctl(struct inode *inode, const char __user *oldname, const char __user *newname, bool preserve); -int ocfs2_reflink_remap_range(struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len, - bool is_dedupe); +loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode, + struct buffer_head *s_bh, + loff_t pos_in, + struct inode *t_inode, + struct buffer_head *t_bh, + loff_t pos_out, + loff_t len); +int ocfs2_reflink_inodes_lock(struct inode *s_inode, + struct buffer_head **bh1, + struct inode *t_inode, + struct buffer_head **bh2); +void ocfs2_reflink_inodes_unlock(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh); +int ocfs2_reflink_update_dest(struct inode *dest, + struct buffer_head *d_bh, + loff_t newlen); #endif /* OCFS2_REFCOUNTTREE_H */ diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 5e65d818937b..fe53381b26b1 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -25,7 +25,7 @@ static int read_one_page(struct page *page) struct iov_iter to; struct bio_vec bv = {.bv_page = page, .bv_len = PAGE_SIZE}; - iov_iter_bvec(&to, ITER_BVEC | READ, &bv, 1, PAGE_SIZE); + iov_iter_bvec(&to, READ, &bv, 1, PAGE_SIZE); gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_readpage called with page %p\n", diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 1cc797a08a5b..9e62dcf06fc4 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -125,6 +125,7 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) struct file *new_file; loff_t old_pos = 0; loff_t new_pos = 0; + loff_t cloned; int error = 0; if (len == 0) @@ -141,11 +142,10 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) } /* Try to use clone_file_range to clone up within the same fs */ - error = do_clone_file_range(old_file, 0, new_file, 0, len); - if (!error) + cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0); + if (cloned == len) goto out; /* Couldn't clone, so now we try to copy the data */ - error = 0; /* FIXME: copy up sparse files efficiently */ while (len) { @@ -395,7 +395,6 @@ struct ovl_copy_up_ctx { struct dentry *destdir; struct qstr destname; struct dentry *workdir; - bool tmpfile; bool origin; bool indexed; bool metacopy; @@ -440,63 +439,6 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) return err; } -static int ovl_install_temp(struct ovl_copy_up_ctx *c, struct dentry *temp, - struct dentry **newdentry) -{ - int err; - struct dentry *upper; - struct inode *udir = d_inode(c->destdir); - - upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len); - if (IS_ERR(upper)) - return PTR_ERR(upper); - - if (c->tmpfile) - err = ovl_do_link(temp, udir, upper); - else - err = ovl_do_rename(d_inode(c->workdir), temp, udir, upper, 0); - - if (!err) - *newdentry = dget(c->tmpfile ? upper : temp); - dput(upper); - - return err; -} - -static struct dentry *ovl_get_tmpfile(struct ovl_copy_up_ctx *c) -{ - int err; - struct dentry *temp; - const struct cred *old_creds = NULL; - struct cred *new_creds = NULL; - struct ovl_cattr cattr = { - /* Can't properly set mode on creation because of the umask */ - .mode = c->stat.mode & S_IFMT, - .rdev = c->stat.rdev, - .link = c->link - }; - - err = security_inode_copy_up(c->dentry, &new_creds); - temp = ERR_PTR(err); - if (err < 0) - goto out; - - if (new_creds) - old_creds = override_creds(new_creds); - - if (c->tmpfile) - temp = ovl_do_tmpfile(c->workdir, c->stat.mode); - else - temp = ovl_create_temp(c->workdir, &cattr); -out: - if (new_creds) { - revert_creds(old_creds); - put_cred(new_creds); - } - - return temp; -} - static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) { int err; @@ -548,51 +490,148 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) return err; } -static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c) +struct ovl_cu_creds { + const struct cred *old; + struct cred *new; +}; + +static int ovl_prep_cu_creds(struct dentry *dentry, struct ovl_cu_creds *cc) +{ + int err; + + cc->old = cc->new = NULL; + err = security_inode_copy_up(dentry, &cc->new); + if (err < 0) + return err; + + if (cc->new) + cc->old = override_creds(cc->new); + + return 0; +} + +static void ovl_revert_cu_creds(struct ovl_cu_creds *cc) +{ + if (cc->new) { + revert_creds(cc->old); + put_cred(cc->new); + } +} + +/* + * Copyup using workdir to prepare temp file. Used when copying up directories, + * special files or when upper fs doesn't support O_TMPFILE. + */ +static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) { - struct inode *udir = c->destdir->d_inode; struct inode *inode; - struct dentry *newdentry = NULL; - struct dentry *temp; + struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir); + struct dentry *temp, *upper; + struct ovl_cu_creds cc; int err; + struct ovl_cattr cattr = { + /* Can't properly set mode on creation because of the umask */ + .mode = c->stat.mode & S_IFMT, + .rdev = c->stat.rdev, + .link = c->link + }; + + err = ovl_lock_rename_workdir(c->workdir, c->destdir); + if (err) + return err; + + err = ovl_prep_cu_creds(c->dentry, &cc); + if (err) + goto unlock; - temp = ovl_get_tmpfile(c); + temp = ovl_create_temp(c->workdir, &cattr); + ovl_revert_cu_creds(&cc); + + err = PTR_ERR(temp); if (IS_ERR(temp)) - return PTR_ERR(temp); + goto unlock; err = ovl_copy_up_inode(c, temp); if (err) - goto out; + goto cleanup; if (S_ISDIR(c->stat.mode) && c->indexed) { err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp); if (err) - goto out; + goto cleanup; } - if (c->tmpfile) { - inode_lock_nested(udir, I_MUTEX_PARENT); - err = ovl_install_temp(c, temp, &newdentry); - inode_unlock(udir); - } else { - err = ovl_install_temp(c, temp, &newdentry); - } + upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto cleanup; + + err = ovl_do_rename(wdir, temp, udir, upper, 0); + dput(upper); if (err) - goto out; + goto cleanup; if (!c->metacopy) ovl_set_upperdata(d_inode(c->dentry)); inode = d_inode(c->dentry); - ovl_inode_update(inode, newdentry); + ovl_inode_update(inode, temp); if (S_ISDIR(inode->i_mode)) ovl_set_flag(OVL_WHITEOUTS, inode); +unlock: + unlock_rename(c->workdir, c->destdir); -out: - if (err && !c->tmpfile) - ovl_cleanup(d_inode(c->workdir), temp); - dput(temp); return err; +cleanup: + ovl_cleanup(wdir, temp); + dput(temp); + goto unlock; +} + +/* Copyup using O_TMPFILE which does not require cross dir locking */ +static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) +{ + struct inode *udir = d_inode(c->destdir); + struct dentry *temp, *upper; + struct ovl_cu_creds cc; + int err; + + err = ovl_prep_cu_creds(c->dentry, &cc); + if (err) + return err; + + temp = ovl_do_tmpfile(c->workdir, c->stat.mode); + ovl_revert_cu_creds(&cc); + + if (IS_ERR(temp)) + return PTR_ERR(temp); + + err = ovl_copy_up_inode(c, temp); + if (err) + goto out_dput; + + inode_lock_nested(udir, I_MUTEX_PARENT); + + upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len); + err = PTR_ERR(upper); + if (!IS_ERR(upper)) { + err = ovl_do_link(temp, udir, upper); + dput(upper); + } + inode_unlock(udir); + + if (err) + goto out_dput; + + if (!c->metacopy) + ovl_set_upperdata(d_inode(c->dentry)); + ovl_inode_update(d_inode(c->dentry), temp); + + return 0; + +out_dput: + dput(temp); + return err; } /* @@ -646,18 +685,10 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) } /* Should we copyup with O_TMPFILE or with workdir? */ - if (S_ISREG(c->stat.mode) && ofs->tmpfile) { - c->tmpfile = true; - err = ovl_copy_up_locked(c); - } else { - err = ovl_lock_rename_workdir(c->workdir, c->destdir); - if (!err) { - err = ovl_copy_up_locked(c); - unlock_rename(c->workdir, c->destdir); - } - } - - + if (S_ISREG(c->stat.mode) && ofs->tmpfile) + err = ovl_copy_up_tmpfile(c); + else + err = ovl_copy_up_workdir(c); if (err) goto out; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 276914ae3c60..c6289147c787 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -414,13 +414,12 @@ static int ovl_set_upper_acl(struct dentry *upperdentry, const char *name, if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !acl) return 0; - size = posix_acl_to_xattr(NULL, acl, NULL, 0); + size = posix_acl_xattr_size(acl->a_count); buffer = kmalloc(size, GFP_KERNEL); if (!buffer) return -ENOMEM; - size = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - err = size; + err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); if (err < 0) goto out_free; @@ -463,6 +462,10 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (IS_ERR(upper)) goto out_unlock; + err = -ESTALE; + if (d_is_negative(upper) || !IS_WHITEOUT(d_inode(upper))) + goto out_dput; + newdentry = ovl_create_temp(workdir, cattr); err = PTR_ERR(newdentry); if (IS_ERR(newdentry)) @@ -652,7 +655,6 @@ static int ovl_link(struct dentry *old, struct inode *newdir, struct dentry *new) { int err; - bool locked = false; struct inode *inode; err = ovl_want_write(old); @@ -663,13 +665,17 @@ static int ovl_link(struct dentry *old, struct inode *newdir, if (err) goto out_drop_write; + err = ovl_copy_up(new->d_parent); + if (err) + goto out_drop_write; + if (ovl_is_metacopy_dentry(old)) { err = ovl_set_redirect(old, false); if (err) goto out_drop_write; } - err = ovl_nlink_start(old, &locked); + err = ovl_nlink_start(old); if (err) goto out_drop_write; @@ -682,7 +688,7 @@ static int ovl_link(struct dentry *old, struct inode *newdir, if (err) iput(inode); - ovl_nlink_end(old, locked); + ovl_nlink_end(old); out_drop_write: ovl_drop_write(old); out: @@ -807,7 +813,6 @@ static bool ovl_pure_upper(struct dentry *dentry) static int ovl_do_remove(struct dentry *dentry, bool is_dir) { int err; - bool locked = false; const struct cred *old_cred; struct dentry *upperdentry; bool lower_positive = ovl_lower_positive(dentry); @@ -828,7 +833,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) if (err) goto out_drop_write; - err = ovl_nlink_start(dentry, &locked); + err = ovl_nlink_start(dentry); if (err) goto out_drop_write; @@ -844,7 +849,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) else drop_nlink(dentry->d_inode); } - ovl_nlink_end(dentry, locked); + ovl_nlink_end(dentry); /* * Copy ctime @@ -1008,7 +1013,6 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, unsigned int flags) { int err; - bool locked = false; struct dentry *old_upperdir; struct dentry *new_upperdir; struct dentry *olddentry; @@ -1017,6 +1021,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, bool old_opaque; bool new_opaque; bool cleanup_whiteout = false; + bool update_nlink = false; bool overwrite = !(flags & RENAME_EXCHANGE); bool is_dir = d_is_dir(old); bool new_is_dir = d_is_dir(new); @@ -1074,10 +1079,12 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, err = ovl_copy_up(new); if (err) goto out_drop_write; - } else { - err = ovl_nlink_start(new, &locked); + } else if (d_inode(new)) { + err = ovl_nlink_start(new); if (err) goto out_drop_write; + + update_nlink = true; } old_cred = ovl_override_creds(old->d_sb); @@ -1206,7 +1213,8 @@ out_unlock: unlock_rename(new_upperdir, old_upperdir); out_revert_creds: revert_creds(old_cred); - ovl_nlink_end(new, locked); + if (update_nlink) + ovl_nlink_end(new); out_drop_write: ovl_drop_write(old); out: diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 986313da0c88..84dd957efa24 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -434,14 +434,14 @@ enum ovl_copyop { OVL_DEDUPE, }; -static ssize_t ovl_copyfile(struct file *file_in, loff_t pos_in, +static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 len, unsigned int flags, enum ovl_copyop op) + loff_t len, unsigned int flags, enum ovl_copyop op) { struct inode *inode_out = file_inode(file_out); struct fd real_in, real_out; const struct cred *old_cred; - ssize_t ret; + loff_t ret; ret = ovl_real_fdget(file_out, &real_out); if (ret) @@ -462,12 +462,13 @@ static ssize_t ovl_copyfile(struct file *file_in, loff_t pos_in, case OVL_CLONE: ret = vfs_clone_file_range(real_in.file, pos_in, - real_out.file, pos_out, len); + real_out.file, pos_out, len, flags); break; case OVL_DEDUPE: ret = vfs_dedupe_file_range_one(real_in.file, pos_in, - real_out.file, pos_out, len); + real_out.file, pos_out, len, + flags); break; } revert_creds(old_cred); @@ -489,26 +490,31 @@ static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in, OVL_COPY); } -static int ovl_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len) +static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { - return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, - OVL_CLONE); -} + enum ovl_copyop op; + + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) + return -EINVAL; + + if (remap_flags & REMAP_FILE_DEDUP) + op = OVL_DEDUPE; + else + op = OVL_CLONE; -static int ovl_dedupe_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len) -{ /* * Don't copy up because of a dedupe request, this wouldn't make sense * most of the time (data would be duplicated instead of deduplicated). */ - if (!ovl_inode_upper(file_inode(file_in)) || - !ovl_inode_upper(file_inode(file_out))) + if (op == OVL_DEDUPE && + (!ovl_inode_upper(file_inode(file_in)) || + !ovl_inode_upper(file_inode(file_out)))) return -EPERM; - return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, - OVL_DEDUPE); + return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, + remap_flags, op); } const struct file_operations ovl_file_operations = { @@ -525,6 +531,5 @@ const struct file_operations ovl_file_operations = { .compat_ioctl = ovl_compat_ioctl, .copy_file_range = ovl_copy_file_range, - .clone_file_range = ovl_clone_file_range, - .dedupe_file_range = ovl_dedupe_file_range, + .remap_file_range = ovl_remap_file_range, }; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 3b7ed5d2279c..6bcc9dedc342 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -286,13 +286,22 @@ int ovl_permission(struct inode *inode, int mask) if (err) return err; - old_cred = ovl_override_creds(inode->i_sb); - if (!upperinode && - !special_file(realinode->i_mode) && mask & MAY_WRITE) { + /* No need to do any access on underlying for special files */ + if (special_file(realinode->i_mode)) + return 0; + + /* No need to access underlying for execute */ + mask &= ~MAY_EXEC; + if ((mask & (MAY_READ | MAY_WRITE)) == 0) + return 0; + + /* Lower files get copied up, so turn write access into read */ + if (!upperinode && mask & MAY_WRITE) { mask &= ~(MAY_WRITE | MAY_APPEND); - /* Make sure mounter can read file for copy up later */ mask |= MAY_READ; } + + old_cred = ovl_override_creds(inode->i_sb); err = inode_permission(realinode, mask); revert_creds(old_cred); diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 9c0ca6a7becf..efd372312ef1 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -422,8 +422,10 @@ int ovl_verify_set_fh(struct dentry *dentry, const char *name, fh = ovl_encode_real_fh(real, is_upper); err = PTR_ERR(fh); - if (IS_ERR(fh)) + if (IS_ERR(fh)) { + fh = NULL; goto fail; + } err = ovl_verify_fh(dentry, name, fh); if (set && err == -ENODATA) diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index a3c0d9584312..5e45cb3630a0 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -271,8 +271,8 @@ bool ovl_test_flag(unsigned long flag, struct inode *inode); bool ovl_inuse_trylock(struct dentry *dentry); void ovl_inuse_unlock(struct dentry *dentry); bool ovl_need_index(struct dentry *dentry); -int ovl_nlink_start(struct dentry *dentry, bool *locked); -void ovl_nlink_end(struct dentry *dentry, bool locked); +int ovl_nlink_start(struct dentry *dentry); +void ovl_nlink_end(struct dentry *dentry); int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir); int ovl_check_metacopy_xattr(struct dentry *dentry); bool ovl_is_metacopy_dentry(struct dentry *dentry); @@ -290,6 +290,16 @@ static inline unsigned int ovl_xino_bits(struct super_block *sb) return ofs->xino_bits; } +static inline int ovl_inode_lock(struct inode *inode) +{ + return mutex_lock_interruptible(&OVL_I(inode)->lock); +} + +static inline void ovl_inode_unlock(struct inode *inode) +{ + mutex_unlock(&OVL_I(inode)->lock); +} + /* namei.c */ int ovl_check_fh_len(struct ovl_fh *fh, int fh_len); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 30adc9d408a0..0116735cc321 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -472,6 +472,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) { char *p; int err; + bool metacopy_opt = false, redirect_opt = false; config->redirect_mode = kstrdup(ovl_redirect_mode_def(), GFP_KERNEL); if (!config->redirect_mode) @@ -516,6 +517,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->redirect_mode = match_strdup(&args[0]); if (!config->redirect_mode) return -ENOMEM; + redirect_opt = true; break; case OPT_INDEX_ON: @@ -548,6 +550,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) case OPT_METACOPY_ON: config->metacopy = true; + metacopy_opt = true; break; case OPT_METACOPY_OFF: @@ -572,13 +575,32 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) if (err) return err; - /* metacopy feature with upper requires redirect_dir=on */ - if (config->upperdir && config->metacopy && !config->redirect_dir) { - pr_warn("overlayfs: metadata only copy up requires \"redirect_dir=on\", falling back to metacopy=off.\n"); - config->metacopy = false; - } else if (config->metacopy && !config->redirect_follow) { - pr_warn("overlayfs: metadata only copy up requires \"redirect_dir=follow\" on non-upper mount, falling back to metacopy=off.\n"); - config->metacopy = false; + /* + * This is to make the logic below simpler. It doesn't make any other + * difference, since config->redirect_dir is only used for upper. + */ + if (!config->upperdir && config->redirect_follow) + config->redirect_dir = true; + + /* Resolve metacopy -> redirect_dir dependency */ + if (config->metacopy && !config->redirect_dir) { + if (metacopy_opt && redirect_opt) { + pr_err("overlayfs: conflicting options: metacopy=on,redirect_dir=%s\n", + config->redirect_mode); + return -EINVAL; + } + if (redirect_opt) { + /* + * There was an explicit redirect_dir=... that resulted + * in this conflict. + */ + pr_info("overlayfs: disabling metacopy due to redirect_dir=%s\n", + config->redirect_mode); + config->metacopy = false; + } else { + /* Automatically enable redirect otherwise. */ + config->redirect_follow = config->redirect_dir = true; + } } return 0; @@ -1175,9 +1197,29 @@ out: return err; } +static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid) +{ + unsigned int i; + + if (!ofs->config.nfs_export && !(ofs->config.index && ofs->upper_mnt)) + return true; + + for (i = 0; i < ofs->numlowerfs; i++) { + /* + * We use uuid to associate an overlay lower file handle with a + * lower layer, so we can accept lower fs with null uuid as long + * as all lower layers with null uuid are on the same fs. + */ + if (uuid_equal(&ofs->lower_fs[i].sb->s_uuid, uuid)) + return false; + } + return true; +} + /* Get a unique fsid for the layer */ -static int ovl_get_fsid(struct ovl_fs *ofs, struct super_block *sb) +static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) { + struct super_block *sb = path->mnt->mnt_sb; unsigned int i; dev_t dev; int err; @@ -1191,6 +1233,14 @@ static int ovl_get_fsid(struct ovl_fs *ofs, struct super_block *sb) return i + 1; } + if (!ovl_lower_uuid_ok(ofs, &sb->s_uuid)) { + ofs->config.index = false; + ofs->config.nfs_export = false; + pr_warn("overlayfs: %s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n", + uuid_is_null(&sb->s_uuid) ? "null" : "conflicting", + path->dentry); + } + err = get_anon_bdev(&dev); if (err) { pr_err("overlayfs: failed to get anonymous bdev for lowerpath\n"); @@ -1225,7 +1275,7 @@ static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack, struct vfsmount *mnt; int fsid; - err = fsid = ovl_get_fsid(ofs, stack[i].mnt->mnt_sb); + err = fsid = ovl_get_fsid(ofs, &stack[i]); if (err < 0) goto out; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index ace4fe4c39a9..7c01327b1852 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -65,8 +65,7 @@ struct super_block *ovl_same_sb(struct super_block *sb) */ int ovl_can_decode_fh(struct super_block *sb) { - if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry || - uuid_is_null(&sb->s_uuid)) + if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry) return 0; return sb->s_export_op->encode_fh ? -1 : FILEID_INO32_GEN; @@ -522,13 +521,13 @@ bool ovl_already_copied_up(struct dentry *dentry, int flags) int ovl_copy_up_start(struct dentry *dentry, int flags) { - struct ovl_inode *oi = OVL_I(d_inode(dentry)); + struct inode *inode = d_inode(dentry); int err; - err = mutex_lock_interruptible(&oi->lock); + err = ovl_inode_lock(inode); if (!err && ovl_already_copied_up_locked(dentry, flags)) { err = 1; /* Already copied up */ - mutex_unlock(&oi->lock); + ovl_inode_unlock(inode); } return err; @@ -536,7 +535,7 @@ int ovl_copy_up_start(struct dentry *dentry, int flags) void ovl_copy_up_end(struct dentry *dentry) { - mutex_unlock(&OVL_I(d_inode(dentry))->lock); + ovl_inode_unlock(d_inode(dentry)); } bool ovl_check_origin_xattr(struct dentry *dentry) @@ -739,14 +738,14 @@ fail: * Operations that change overlay inode and upper inode nlink need to be * synchronized with copy up for persistent nlink accounting. */ -int ovl_nlink_start(struct dentry *dentry, bool *locked) +int ovl_nlink_start(struct dentry *dentry) { - struct ovl_inode *oi = OVL_I(d_inode(dentry)); + struct inode *inode = d_inode(dentry); const struct cred *old_cred; int err; - if (!d_inode(dentry)) - return 0; + if (WARN_ON(!inode)) + return -ENOENT; /* * With inodes index is enabled, we store the union overlay nlink @@ -768,11 +767,11 @@ int ovl_nlink_start(struct dentry *dentry, bool *locked) return err; } - err = mutex_lock_interruptible(&oi->lock); + err = ovl_inode_lock(inode); if (err) return err; - if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, d_inode(dentry))) + if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, inode)) goto out; old_cred = ovl_override_creds(dentry->d_sb); @@ -787,27 +786,24 @@ int ovl_nlink_start(struct dentry *dentry, bool *locked) out: if (err) - mutex_unlock(&oi->lock); - else - *locked = true; + ovl_inode_unlock(inode); return err; } -void ovl_nlink_end(struct dentry *dentry, bool locked) +void ovl_nlink_end(struct dentry *dentry) { - if (locked) { - if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) && - d_inode(dentry)->i_nlink == 0) { - const struct cred *old_cred; + struct inode *inode = d_inode(dentry); - old_cred = ovl_override_creds(dentry->d_sb); - ovl_cleanup_index(dentry); - revert_creds(old_cred); - } + if (ovl_test_flag(OVL_INDEX, inode) && inode->i_nlink == 0) { + const struct cred *old_cred; - mutex_unlock(&OVL_I(d_inode(dentry))->lock); + old_cred = ovl_override_creds(dentry->d_sb); + ovl_cleanup_index(dentry); + revert_creds(old_cred); } + + ovl_inode_unlock(inode); } int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir) diff --git a/fs/proc/base.c b/fs/proc/base.c index 7e9f07bf260d..ce3465479447 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2905,6 +2905,21 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns, } #endif /* CONFIG_LIVEPATCH */ +#ifdef CONFIG_STACKLEAK_METRICS +static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + unsigned long prev_depth = THREAD_SIZE - + (task->prev_lowest_stack & (THREAD_SIZE - 1)); + unsigned long depth = THREAD_SIZE - + (task->lowest_stack & (THREAD_SIZE - 1)); + + seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n", + prev_depth, depth); + return 0; +} +#endif /* CONFIG_STACKLEAK_METRICS */ + /* * Thread groups */ @@ -3006,6 +3021,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_LIVEPATCH ONE("patch_state", S_IRUSR, proc_pid_patch_state), #endif +#ifdef CONFIG_STACKLEAK_METRICS + ONE("stack_depth", S_IRUGO, proc_stack_depth), +#endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index fc5306a31a1d..5792f9e39466 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -516,6 +516,9 @@ int proc_fill_super(struct super_block *s, void *data, int silent) */ s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; + /* procfs dentries and inodes don't require IO to create */ + s->s_shrink.seeks = 0; + pde_get(&proc_root); root_inode = proc_get_inode(s, &proc_root); if (!root_inode) { diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index d297fe4472a9..bbcc185062bb 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -22,7 +22,7 @@ #include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/printk.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/uaccess.h> diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index d06694757201..8468baee951d 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -10,9 +10,6 @@ #include <linux/seqlock.h> #include <linux/time.h> -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - static int loadavg_proc_show(struct seq_file *m, void *v) { unsigned long avnrun[3]; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index edda898714eb..568d90e17c17 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -38,6 +38,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) long cached; long available; unsigned long pages[NR_LRU_LISTS]; + unsigned long sreclaimable, sunreclaim; int lru; si_meminfo(&i); @@ -53,6 +54,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) pages[lru] = global_node_page_state(NR_LRU_BASE + lru); available = si_mem_available(); + sreclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE); + sunreclaim = global_node_page_state(NR_SLAB_UNRECLAIMABLE); show_val_kb(m, "MemTotal: ", i.totalram); show_val_kb(m, "MemFree: ", i.freeram); @@ -94,14 +97,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "Mapped: ", global_node_page_state(NR_FILE_MAPPED)); show_val_kb(m, "Shmem: ", i.sharedram); - show_val_kb(m, "Slab: ", - global_node_page_state(NR_SLAB_RECLAIMABLE) + - global_node_page_state(NR_SLAB_UNRECLAIMABLE)); - - show_val_kb(m, "SReclaimable: ", - global_node_page_state(NR_SLAB_RECLAIMABLE)); - show_val_kb(m, "SUnreclaim: ", - global_node_page_state(NR_SLAB_UNRECLAIMABLE)); + show_val_kb(m, "KReclaimable: ", sreclaimable + + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE)); + show_val_kb(m, "Slab: ", sreclaimable + sunreclaim); + show_val_kb(m, "SReclaimable: ", sreclaimable); + show_val_kb(m, "SUnreclaim: ", sunreclaim); seq_printf(m, "KernelStack: %8lu kB\n", global_zone_page_state(NR_KERNEL_STACK_KB)); show_val_kb(m, "PageTables: ", diff --git a/fs/proc/page.c b/fs/proc/page.c index 792c78a49174..6c517b11acf8 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/compiler.h> #include <linux/fs.h> #include <linux/init.h> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5ea1d64cb0b4..47c3764c469b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -521,7 +521,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, if (!page) return; - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) mss->swap += PAGE_SIZE; else put_page(page); @@ -713,6 +713,8 @@ static void smap_gather_stats(struct vm_area_struct *vma, smaps_walk.private = mss; #ifdef CONFIG_SHMEM + /* In case of smaps_rollup, reset the value from previous vma */ + mss->check_shmem_swap = false; if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { /* * For shared or readonly shmem mappings we know that all @@ -728,7 +730,7 @@ static void smap_gather_stats(struct vm_area_struct *vma, if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || !(vma->vm_flags & VM_WRITE)) { - mss->swap = shmem_swapped; + mss->swap += shmem_swapped; } else { mss->check_shmem_swap = true; smaps_walk.pte_hole = smaps_pte_hole; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 91ae16fbd7d5..3fe90443c1bb 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -16,7 +16,7 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/printk.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/init.h> #include <linux/crash_dump.h> #include <linux/list.h> @@ -423,7 +423,7 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf) if (rc < 0) { unlock_page(page); put_page(page); - return (rc == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; + return vmf_error(rc); } SetPageUptodate(page); } diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 503086f7f7c1..0d19d191ae70 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -141,7 +141,6 @@ config PSTORE_RAM tristate "Log panic/oops to a RAM buffer" depends on PSTORE depends on HAS_IOMEM - depends on HAVE_MEMBLOCK select REED_SOLOMON select REED_SOLOMON_ENC8 select REED_SOLOMON_DEC8 diff --git a/fs/read_write.c b/fs/read_write.c index 8a2737f0d61d..bfcb4ced5664 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -331,7 +331,7 @@ COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned i } #endif -#ifdef __ARCH_WANT_SYS_LLSEEK +#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned long, offset_low, loff_t __user *, result, unsigned int, whence) @@ -1407,7 +1407,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, goto fput_in; if (!(out.file->f_mode & FMODE_WRITE)) goto fput_out; - retval = -EINVAL; in_inode = file_inode(in.file); out_inode = file_inode(out.file); out_pos = out.file->f_pos; @@ -1588,11 +1587,15 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, * Try cloning first, this is supported by more file systems, and * more efficient if both clone and copy are supported (e.g. NFS). */ - if (file_in->f_op->clone_file_range) { - ret = file_in->f_op->clone_file_range(file_in, pos_in, - file_out, pos_out, len); - if (ret == 0) { - ret = len; + if (file_in->f_op->remap_file_range) { + loff_t cloned; + + cloned = file_in->f_op->remap_file_range(file_in, pos_in, + file_out, pos_out, + min_t(loff_t, MAX_RW_COUNT, len), + REMAP_FILE_CAN_SHORTEN); + if (cloned > 0) { + ret = cloned; goto done; } } @@ -1686,11 +1689,12 @@ out2: return ret; } -static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) +static int remap_verify_area(struct file *file, loff_t pos, loff_t len, + bool write) { struct inode *inode = file_inode(file); - if (unlikely(pos < 0)) + if (unlikely(pos < 0 || len < 0)) return -EINVAL; if (unlikely((loff_t) (pos + len) < 0)) @@ -1708,22 +1712,150 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) return security_file_permission(file, write ? MAY_WRITE : MAY_READ); } +/* + * Ensure that we don't remap a partial EOF block in the middle of something + * else. Assume that the offsets have already been checked for block + * alignment. + * + * For deduplication we always scale down to the previous block because we + * can't meaningfully compare post-EOF contents. + * + * For clone we only link a partial EOF block above the destination file's EOF. + * + * Shorten the request if possible. + */ +static int generic_remap_check_len(struct inode *inode_in, + struct inode *inode_out, + loff_t pos_out, + loff_t *len, + unsigned int remap_flags) +{ + u64 blkmask = i_blocksize(inode_in) - 1; + loff_t new_len = *len; + + if ((*len & blkmask) == 0) + return 0; + + if ((remap_flags & REMAP_FILE_DEDUP) || + pos_out + *len < i_size_read(inode_out)) + new_len &= ~blkmask; + + if (new_len == *len) + return 0; + + if (remap_flags & REMAP_FILE_CAN_SHORTEN) { + *len = new_len; + return 0; + } + + return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; +} + +/* + * Read a page's worth of file data into the page cache. Return the page + * locked. + */ +static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) +{ + struct page *page; + + page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + lock_page(page); + return page; +} + +/* + * Compare extents of two files to see if they are the same. + * Caller must have locked both inodes to prevent write races. + */ +static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dest, loff_t destoff, + loff_t len, bool *is_same) +{ + loff_t src_poff; + loff_t dest_poff; + void *src_addr; + void *dest_addr; + struct page *src_page; + struct page *dest_page; + loff_t cmp_len; + bool same; + int error; + + error = -EINVAL; + same = true; + while (len) { + src_poff = srcoff & (PAGE_SIZE - 1); + dest_poff = destoff & (PAGE_SIZE - 1); + cmp_len = min(PAGE_SIZE - src_poff, + PAGE_SIZE - dest_poff); + cmp_len = min(cmp_len, len); + if (cmp_len <= 0) + goto out_error; + + src_page = vfs_dedupe_get_page(src, srcoff); + if (IS_ERR(src_page)) { + error = PTR_ERR(src_page); + goto out_error; + } + dest_page = vfs_dedupe_get_page(dest, destoff); + if (IS_ERR(dest_page)) { + error = PTR_ERR(dest_page); + unlock_page(src_page); + put_page(src_page); + goto out_error; + } + src_addr = kmap_atomic(src_page); + dest_addr = kmap_atomic(dest_page); + + flush_dcache_page(src_page); + flush_dcache_page(dest_page); + + if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) + same = false; + + kunmap_atomic(dest_addr); + kunmap_atomic(src_addr); + unlock_page(dest_page); + unlock_page(src_page); + put_page(dest_page); + put_page(src_page); + + if (!same) + break; + + srcoff += cmp_len; + destoff += cmp_len; + len -= cmp_len; + } + + *is_same = same; + return 0; + +out_error: + return error; +} /* * Check that the two inodes are eligible for cloning, the ranges make * sense, and then flush all dirty data. Caller must ensure that the * inodes have been locked against any other modifications. * - * Returns: 0 for "nothing to clone", 1 for "something to clone", or - * the usual negative error code. + * If there's an error, then the usual negative error code is returned. + * Otherwise returns 0 with *len set to the request length. */ -int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, - struct inode *inode_out, loff_t pos_out, - u64 *len, bool is_dedupe) +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags) { - loff_t bs = inode_out->i_sb->s_blocksize; - loff_t blen; - loff_t isize; + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); bool same_inode = (inode_in == inode_out); int ret; @@ -1740,50 +1872,24 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; - /* Are we going all the way to the end? */ - isize = i_size_read(inode_in); - if (isize == 0) - return 0; - /* Zero length dedupe exits immediately; reflink goes to EOF. */ if (*len == 0) { - if (is_dedupe || pos_in == isize) + loff_t isize = i_size_read(inode_in); + + if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) return 0; if (pos_in > isize) return -EINVAL; *len = isize - pos_in; + if (*len == 0) + return 0; } - /* Ensure offsets don't wrap and the input is inside i_size */ - if (pos_in + *len < pos_in || pos_out + *len < pos_out || - pos_in + *len > isize) - return -EINVAL; - - /* Don't allow dedupe past EOF in the dest file */ - if (is_dedupe) { - loff_t disize; - - disize = i_size_read(inode_out); - if (pos_out >= disize || pos_out + *len > disize) - return -EINVAL; - } - - /* If we're linking to EOF, continue to the block boundary. */ - if (pos_in + *len == isize) - blen = ALIGN(isize, bs) - pos_in; - else - blen = *len; - - /* Only reflink if we're aligned to block boundaries */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || - !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) - return -EINVAL; - - /* Don't allow overlapped reflink within the same file */ - if (same_inode) { - if (pos_out + blen > pos_in && pos_out < pos_in + blen) - return -EINVAL; - } + /* Check that we don't violate system file offset limits. */ + ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, + remap_flags); + if (ret) + return ret; /* Wait for the completion of any pending IOs on both files */ inode_dio_wait(inode_in); @@ -1803,7 +1909,7 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, /* * Check that the extents are the same. */ - if (is_dedupe) { + if (remap_flags & REMAP_FILE_DEDUP) { bool is_same = false; ret = vfs_dedupe_file_range_compare(inode_in, pos_in, @@ -1814,16 +1920,43 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, return -EBADE; } - return 1; + ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, + remap_flags); + if (ret) + return ret; + + /* If can't alter the file contents, we're done. */ + if (!(remap_flags & REMAP_FILE_DEDUP)) { + /* Update the timestamps, since we can alter file contents. */ + if (!(file_out->f_mode & FMODE_NOCMTIME)) { + ret = file_update_time(file_out); + if (ret) + return ret; + } + + /* + * Clear the security bits if the process is not being run by + * root. This keeps people from modifying setuid and setgid + * binaries. + */ + ret = file_remove_privs(file_out); + if (ret) + return ret; + } + + return 0; } -EXPORT_SYMBOL(vfs_clone_file_prep_inodes); +EXPORT_SYMBOL(generic_remap_file_range_prep); -int do_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len) +loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); - int ret; + loff_t ret; + + WARN_ON_ONCE(remap_flags); if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; @@ -1843,155 +1976,76 @@ int do_clone_file_range(struct file *file_in, loff_t pos_in, (file_out->f_flags & O_APPEND)) return -EBADF; - if (!file_in->f_op->clone_file_range) + if (!file_in->f_op->remap_file_range) return -EOPNOTSUPP; - ret = clone_verify_area(file_in, pos_in, len, false); + ret = remap_verify_area(file_in, pos_in, len, false); if (ret) return ret; - ret = clone_verify_area(file_out, pos_out, len, true); + ret = remap_verify_area(file_out, pos_out, len, true); if (ret) return ret; - if (pos_in + len > i_size_read(inode_in)) - return -EINVAL; - - ret = file_in->f_op->clone_file_range(file_in, pos_in, - file_out, pos_out, len); - if (!ret) { - fsnotify_access(file_in); - fsnotify_modify(file_out); - } + ret = file_in->f_op->remap_file_range(file_in, pos_in, + file_out, pos_out, len, remap_flags); + if (ret < 0) + return ret; + fsnotify_access(file_in); + fsnotify_modify(file_out); return ret; } EXPORT_SYMBOL(do_clone_file_range); -int vfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len) +loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { - int ret; + loff_t ret; file_start_write(file_out); - ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len); + ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len, + remap_flags); file_end_write(file_out); return ret; } EXPORT_SYMBOL(vfs_clone_file_range); -/* - * Read a page's worth of file data into the page cache. Return the page - * locked. - */ -static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) +/* Check whether we are allowed to dedupe the destination file */ +static bool allow_file_dedupe(struct file *file) { - struct address_space *mapping; - struct page *page; - pgoff_t n; - - n = offset >> PAGE_SHIFT; - mapping = inode->i_mapping; - page = read_mapping_page(mapping, n, NULL); - if (IS_ERR(page)) - return page; - if (!PageUptodate(page)) { - put_page(page); - return ERR_PTR(-EIO); - } - lock_page(page); - return page; + if (capable(CAP_SYS_ADMIN)) + return true; + if (file->f_mode & FMODE_WRITE) + return true; + if (uid_eq(current_fsuid(), file_inode(file)->i_uid)) + return true; + if (!inode_permission(file_inode(file), MAY_WRITE)) + return true; + return false; } -/* - * Compare extents of two files to see if they are the same. - * Caller must have locked both inodes to prevent write races. - */ -int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, - struct inode *dest, loff_t destoff, - loff_t len, bool *is_same) +loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, + struct file *dst_file, loff_t dst_pos, + loff_t len, unsigned int remap_flags) { - loff_t src_poff; - loff_t dest_poff; - void *src_addr; - void *dest_addr; - struct page *src_page; - struct page *dest_page; - loff_t cmp_len; - bool same; - int error; - - error = -EINVAL; - same = true; - while (len) { - src_poff = srcoff & (PAGE_SIZE - 1); - dest_poff = destoff & (PAGE_SIZE - 1); - cmp_len = min(PAGE_SIZE - src_poff, - PAGE_SIZE - dest_poff); - cmp_len = min(cmp_len, len); - if (cmp_len <= 0) - goto out_error; - - src_page = vfs_dedupe_get_page(src, srcoff); - if (IS_ERR(src_page)) { - error = PTR_ERR(src_page); - goto out_error; - } - dest_page = vfs_dedupe_get_page(dest, destoff); - if (IS_ERR(dest_page)) { - error = PTR_ERR(dest_page); - unlock_page(src_page); - put_page(src_page); - goto out_error; - } - src_addr = kmap_atomic(src_page); - dest_addr = kmap_atomic(dest_page); + loff_t ret; - flush_dcache_page(src_page); - flush_dcache_page(dest_page); - - if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) - same = false; - - kunmap_atomic(dest_addr); - kunmap_atomic(src_addr); - unlock_page(dest_page); - unlock_page(src_page); - put_page(dest_page); - put_page(src_page); - - if (!same) - break; - - srcoff += cmp_len; - destoff += cmp_len; - len -= cmp_len; - } - - *is_same = same; - return 0; - -out_error: - return error; -} -EXPORT_SYMBOL(vfs_dedupe_file_range_compare); - -int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, - struct file *dst_file, loff_t dst_pos, u64 len) -{ - s64 ret; + WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP | + REMAP_FILE_CAN_SHORTEN)); ret = mnt_want_write_file(dst_file); if (ret) return ret; - ret = clone_verify_area(dst_file, dst_pos, len, true); + ret = remap_verify_area(dst_file, dst_pos, len, true); if (ret < 0) goto out_drop_write; - ret = -EINVAL; - if (!(capable(CAP_SYS_ADMIN) || (dst_file->f_mode & FMODE_WRITE))) + ret = -EPERM; + if (!allow_file_dedupe(dst_file)) goto out_drop_write; ret = -EXDEV; @@ -2003,11 +2057,16 @@ int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, goto out_drop_write; ret = -EINVAL; - if (!dst_file->f_op->dedupe_file_range) + if (!dst_file->f_op->remap_file_range) goto out_drop_write; - ret = dst_file->f_op->dedupe_file_range(src_file, src_pos, - dst_file, dst_pos, len); + if (len == 0) { + ret = 0; + goto out_drop_write; + } + + ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file, + dst_pos, len, remap_flags | REMAP_FILE_DEDUP); out_drop_write: mnt_drop_write_file(dst_file); @@ -2024,7 +2083,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) int i; int ret; u16 count = same->dest_count; - int deduped; + loff_t deduped; if (!(file->f_mode & FMODE_READ)) return -EINVAL; @@ -2043,7 +2102,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) if (!S_ISREG(src->i_mode)) goto out; - ret = clone_verify_area(file, off, len, false); + ret = remap_verify_area(file, off, len, false); if (ret < 0) goto out; ret = 0; @@ -2075,7 +2134,8 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) } deduped = vfs_dedupe_file_range_one(file, off, dst_file, - info->dest_offset, len); + info->dest_offset, len, + REMAP_FILE_CAN_SHORTEN); if (deduped == -EBADE) info->status = FILE_DEDUPE_RANGE_DIFFERS; else if (deduped < 0) diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile index a39a562c1c10..bd29c58ccbd8 100644 --- a/fs/reiserfs/Makefile +++ b/fs/reiserfs/Makefile @@ -26,14 +26,5 @@ ifeq ($(CONFIG_REISERFS_FS_POSIX_ACL),y) reiserfs-objs += xattr_acl.o endif -# gcc -O2 (the kernel default) is overaggressive on ppc32 when many inline -# functions are used. This causes the compiler to advance the stack -# pointer out of the available stack space, corrupting kernel space, -# and causing a panic. Since this behavior only affects ppc32, this ifeq -# will work around it. If any other architecture displays this behavior, -# add it here. -ccflags-$(CONFIG_PPC32) := $(call cc-ifversion, -lt, 0400, -O1) - TAGS: etags *.c - diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 48cdfc81fe10..32d8986c26fb 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -185,6 +185,7 @@ struct reiserfs_dentry_buf { struct dir_context ctx; struct dentry *xadir; int count; + int err; struct dentry *dentries[8]; }; @@ -207,6 +208,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, dentry = lookup_one_len(name, dbuf->xadir, namelen); if (IS_ERR(dentry)) { + dbuf->err = PTR_ERR(dentry); return PTR_ERR(dentry); } else if (d_really_is_negative(dentry)) { /* A directory entry exists, but no file? */ @@ -215,6 +217,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, "not found for file %pd.\n", dentry, dbuf->xadir); dput(dentry); + dbuf->err = -EIO; return -EIO; } @@ -262,6 +265,10 @@ static int reiserfs_for_each_xattr(struct inode *inode, err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx); if (err) break; + if (buf.err) { + err = buf.err; + break; + } if (!buf.count) break; for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) { diff --git a/fs/select.c b/fs/select.c index 4a6b6e4b21cb..22b3bf89f051 100644 --- a/fs/select.c +++ b/fs/select.c @@ -1120,7 +1120,7 @@ int compat_poll_select_copy_remaining(struct timespec64 *end_time, void __user * ts.tv_sec = ts.tv_nsec = 0; if (timeval) { - struct compat_timeval rtv; + struct old_timeval32 rtv; rtv.tv_sec = ts.tv_sec; rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC; @@ -1128,7 +1128,7 @@ int compat_poll_select_copy_remaining(struct timespec64 *end_time, void __user * if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; } else { - if (!compat_put_timespec64(&ts, p)) + if (!put_old_timespec32(&ts, p)) return ret; } /* @@ -1257,10 +1257,10 @@ out_nofds: static int do_compat_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, - struct compat_timeval __user *tvp) + struct old_timeval32 __user *tvp) { struct timespec64 end_time, *to = NULL; - struct compat_timeval tv; + struct old_timeval32 tv; int ret; if (tvp) { @@ -1282,7 +1282,7 @@ static int do_compat_select(int n, compat_ulong_t __user *inp, COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, - struct compat_timeval __user *, tvp) + struct old_timeval32 __user *, tvp) { return do_compat_select(n, inp, outp, exp, tvp); } @@ -1307,7 +1307,7 @@ COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) static long do_compat_pselect(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, - struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask, + struct old_timespec32 __user *tsp, compat_sigset_t __user *sigmask, compat_size_t sigsetsize) { sigset_t ksigmask, sigsaved; @@ -1315,7 +1315,7 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp, int ret; if (tsp) { - if (compat_get_timespec64(&ts, tsp)) + if (get_old_timespec32(&ts, tsp)) return -EFAULT; to = &end_time; @@ -1355,7 +1355,7 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp, COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, - struct compat_timespec __user *, tsp, void __user *, sig) + struct old_timespec32 __user *, tsp, void __user *, sig) { compat_size_t sigsetsize = 0; compat_uptr_t up = 0; @@ -1373,7 +1373,7 @@ COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, } COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, - unsigned int, nfds, struct compat_timespec __user *, tsp, + unsigned int, nfds, struct old_timespec32 __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { sigset_t ksigmask, sigsaved; @@ -1381,7 +1381,7 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, int ret; if (tsp) { - if (compat_get_timespec64(&ts, tsp)) + if (get_old_timespec32(&ts, tsp)) return -EFAULT; to = &end_time; diff --git a/fs/splice.c b/fs/splice.c index b3daa971f597..3553f1956508 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -301,7 +301,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, struct kiocb kiocb; int idx, ret; - iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len); + iov_iter_pipe(&to, READ, pipe, len); idx = to.idx; init_sync_kiocb(&kiocb, in); kiocb.ki_pos = *ppos; @@ -386,7 +386,7 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, */ offset = *ppos & ~PAGE_MASK; - iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len + offset); + iov_iter_pipe(&to, READ, pipe, len + offset); res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &base); if (res <= 0) @@ -745,8 +745,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, left -= this_len; } - iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n, - sd.total_len - left); + iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left); ret = vfs_iter_write(out, &from, &sd.pos, 0); if (ret <= 0) break; diff --git a/fs/stat.c b/fs/stat.c index f8e6fb2c3657..adbfcd86c81b 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -280,6 +280,8 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat #endif /* __ARCH_WANT_OLD_STAT */ +#ifdef __ARCH_WANT_NEW_STAT + #if BITS_PER_LONG == 32 # define choose_32_64(a,b) a #else @@ -378,6 +380,7 @@ SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf) return error; } +#endif static int do_readlinkat(int dfd, const char __user *pathname, char __user *buf, int bufsiz) diff --git a/fs/super.c b/fs/super.c index f3a8c008e164..ca53a08497ed 100644 --- a/fs/super.c +++ b/fs/super.c @@ -442,7 +442,7 @@ void generic_shutdown_super(struct super_block *sb) sync_filesystem(sb); sb->s_flags &= ~SB_ACTIVE; - fsnotify_unmount_inodes(sb); + fsnotify_sb_delete(sb); cgroup_writeback_umount(); evict_inodes(sb); diff --git a/fs/timerfd.c b/fs/timerfd.c index d69ad801eb80..803ca070d42e 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -561,29 +561,29 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct __kernel_itimerspec __user *, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, - const struct compat_itimerspec __user *, utmr, - struct compat_itimerspec __user *, otmr) + const struct old_itimerspec32 __user *, utmr, + struct old_itimerspec32 __user *, otmr) { struct itimerspec64 new, old; int ret; - if (get_compat_itimerspec64(&new, utmr)) + if (get_old_itimerspec32(&new, utmr)) return -EFAULT; ret = do_timerfd_settime(ufd, flags, &new, &old); if (ret) return ret; - if (otmr && put_compat_itimerspec64(&old, otmr)) + if (otmr && put_old_itimerspec32(&old, otmr)) return -EFAULT; return ret; } COMPAT_SYSCALL_DEFINE2(timerfd_gettime, int, ufd, - struct compat_itimerspec __user *, otmr) + struct old_itimerspec32 __user *, otmr) { struct itimerspec64 kotmr; int ret = do_timerfd_gettime(ufd, &kotmr); if (ret) return ret; - return put_compat_itimerspec64(&kotmr, otmr) ? -EFAULT : 0; + return put_old_itimerspec32(&kotmr, otmr) ? -EFAULT : 0; } #endif diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index fcda0fc97b90..ec85aeaed54a 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -175,8 +175,8 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb, { struct udf_sb_info *sbi = UDF_SB(sb); int alloc_count = 0; - int bit, block, block_group, group_start; - int nr_groups, bitmap_nr; + int bit, block, block_group; + int bitmap_nr; struct buffer_head *bh; __u32 part_len; @@ -189,10 +189,8 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb, block_count = part_len - first_block; do { - nr_groups = udf_compute_nr_groups(sb, partition); block = first_block + (sizeof(struct spaceBitmapDesc) << 3); block_group = block >> (sb->s_blocksize_bits + 3); - group_start = block_group ? 0 : sizeof(struct spaceBitmapDesc); bitmap_nr = load_block_bitmap(sb, bitmap, block_group); if (bitmap_nr < 0) @@ -652,12 +650,6 @@ void udf_free_blocks(struct super_block *sb, struct inode *inode, } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) { udf_table_free_blocks(sb, map->s_uspace.s_table, bloc, offset, count); - } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) { - udf_bitmap_free_blocks(sb, map->s_fspace.s_bitmap, - bloc, offset, count); - } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) { - udf_table_free_blocks(sb, map->s_fspace.s_table, - bloc, offset, count); } if (inode) { @@ -684,16 +676,6 @@ inline int udf_prealloc_blocks(struct super_block *sb, map->s_uspace.s_table, partition, first_block, block_count); - else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) - allocated = udf_bitmap_prealloc_blocks(sb, - map->s_fspace.s_bitmap, - partition, first_block, - block_count); - else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) - allocated = udf_table_prealloc_blocks(sb, - map->s_fspace.s_table, - partition, first_block, - block_count); else return 0; @@ -717,14 +699,6 @@ inline udf_pblk_t udf_new_block(struct super_block *sb, block = udf_table_new_block(sb, map->s_uspace.s_table, partition, goal, err); - else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) - block = udf_bitmap_new_block(sb, - map->s_fspace.s_bitmap, - partition, goal, err); - else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) - block = udf_table_new_block(sb, - map->s_fspace.s_table, - partition, goal, err); else { *err = -EIO; return 0; diff --git a/fs/udf/super.c b/fs/udf/super.c index 6f515651a2c2..8f2f56d9a1bb 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -290,12 +290,8 @@ static void udf_free_partition(struct udf_part_map *map) if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) iput(map->s_uspace.s_table); - if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) - iput(map->s_fspace.s_table); if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) udf_sb_free_bitmap(map->s_uspace.s_bitmap); - if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) - udf_sb_free_bitmap(map->s_fspace.s_bitmap); if (map->s_partition_type == UDF_SPARABLE_MAP15) for (i = 0; i < 4; i++) brelse(map->s_type_specific.s_sparing.s_spar_map[i]); @@ -613,14 +609,11 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) struct udf_options uopt; struct udf_sb_info *sbi = UDF_SB(sb); int error = 0; - struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb); + + if (!(*flags & SB_RDONLY) && UDF_QUERY_FLAG(sb, UDF_FLAG_RW_INCOMPAT)) + return -EACCES; sync_filesystem(sb); - if (lvidiu) { - int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev); - if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & SB_RDONLY)) - return -EACCES; - } uopt.flags = sbi->s_flags; uopt.uid = sbi->s_uid; @@ -988,12 +981,62 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) return bitmap; } +static int check_partition_desc(struct super_block *sb, + struct partitionDesc *p, + struct udf_part_map *map) +{ + bool umap, utable, fmap, ftable; + struct partitionHeaderDesc *phd; + + switch (le32_to_cpu(p->accessType)) { + case PD_ACCESS_TYPE_READ_ONLY: + case PD_ACCESS_TYPE_WRITE_ONCE: + case PD_ACCESS_TYPE_REWRITABLE: + case PD_ACCESS_TYPE_NONE: + goto force_ro; + } + + /* No Partition Header Descriptor? */ + if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) && + strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03)) + goto force_ro; + + phd = (struct partitionHeaderDesc *)p->partitionContentsUse; + utable = phd->unallocSpaceTable.extLength; + umap = phd->unallocSpaceBitmap.extLength; + ftable = phd->freedSpaceTable.extLength; + fmap = phd->freedSpaceBitmap.extLength; + + /* No allocation info? */ + if (!utable && !umap && !ftable && !fmap) + goto force_ro; + + /* We don't support blocks that require erasing before overwrite */ + if (ftable || fmap) + goto force_ro; + /* UDF 2.60: 2.3.3 - no mixing of tables & bitmaps, no VAT. */ + if (utable && umap) + goto force_ro; + + if (map->s_partition_type == UDF_VIRTUAL_MAP15 || + map->s_partition_type == UDF_VIRTUAL_MAP20) + goto force_ro; + + return 0; +force_ro: + if (!sb_rdonly(sb)) + return -EACCES; + UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT); + return 0; +} + static int udf_fill_partdesc_info(struct super_block *sb, struct partitionDesc *p, int p_index) { struct udf_part_map *map; struct udf_sb_info *sbi = UDF_SB(sb); struct partitionHeaderDesc *phd; + int err; map = &sbi->s_partmaps[p_index]; @@ -1013,8 +1056,16 @@ static int udf_fill_partdesc_info(struct super_block *sb, p_index, map->s_partition_type, map->s_partition_root, map->s_partition_len); - if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) && - strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03)) + err = check_partition_desc(sb, p, map); + if (err) + return err; + + /* + * Skip loading allocation info it we cannot ever write to the fs. + * This is a correctness thing as we may have decided to force ro mount + * to avoid allocation info we don't support. + */ + if (UDF_QUERY_FLAG(sb, UDF_FLAG_RW_INCOMPAT)) return 0; phd = (struct partitionHeaderDesc *)p->partitionContentsUse; @@ -1050,40 +1101,6 @@ static int udf_fill_partdesc_info(struct super_block *sb, p_index, bitmap->s_extPosition); } - if (phd->partitionIntegrityTable.extLength) - udf_debug("partitionIntegrityTable (part %d)\n", p_index); - - if (phd->freedSpaceTable.extLength) { - struct kernel_lb_addr loc = { - .logicalBlockNum = le32_to_cpu( - phd->freedSpaceTable.extPosition), - .partitionReferenceNum = p_index, - }; - struct inode *inode; - - inode = udf_iget_special(sb, &loc); - if (IS_ERR(inode)) { - udf_debug("cannot load freedSpaceTable (part %d)\n", - p_index); - return PTR_ERR(inode); - } - map->s_fspace.s_table = inode; - map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE; - udf_debug("freedSpaceTable (part %d) @ %lu\n", - p_index, map->s_fspace.s_table->i_ino); - } - - if (phd->freedSpaceBitmap.extLength) { - struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index); - if (!bitmap) - return -ENOMEM; - map->s_fspace.s_bitmap = bitmap; - bitmap->s_extPosition = le32_to_cpu( - phd->freedSpaceBitmap.extPosition); - map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP; - udf_debug("freedSpaceBitmap (part %d) @ %u\n", - p_index, bitmap->s_extPosition); - } return 0; } @@ -1257,6 +1274,7 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block) ret = -EACCES; goto out_bh; } + UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT); ret = udf_load_vat(sb, i, type1_idx); if (ret < 0) goto out_bh; @@ -2155,10 +2173,12 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) UDF_MAX_READ_VERSION); ret = -EINVAL; goto error_out; - } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION && - !sb_rdonly(sb)) { - ret = -EACCES; - goto error_out; + } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION) { + if (!sb_rdonly(sb)) { + ret = -EACCES; + goto error_out; + } + UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT); } sbi->s_udfrev = minUDFWriteRev; @@ -2176,10 +2196,12 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) } if (sbi->s_partmaps[sbi->s_partition].s_partition_flags & - UDF_PART_FLAG_READ_ONLY && - !sb_rdonly(sb)) { - ret = -EACCES; - goto error_out; + UDF_PART_FLAG_READ_ONLY) { + if (!sb_rdonly(sb)) { + ret = -EACCES; + goto error_out; + } + UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT); } if (udf_find_fileset(sb, &fileset, &rootdir)) { @@ -2433,10 +2455,6 @@ static unsigned int udf_count_free(struct super_block *sb) accum += udf_count_free_bitmap(sb, map->s_uspace.s_bitmap); } - if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) { - accum += udf_count_free_bitmap(sb, - map->s_fspace.s_bitmap); - } if (accum) return accum; @@ -2444,11 +2462,6 @@ static unsigned int udf_count_free(struct super_block *sb) accum += udf_count_free_table(sb, map->s_uspace.s_table); } - if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) { - accum += udf_count_free_table(sb, - map->s_fspace.s_table); - } - return accum; } diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 9424d7cab790..3d83be54c474 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -30,11 +30,11 @@ #define UDF_FLAG_LASTBLOCK_SET 16 #define UDF_FLAG_BLOCKSIZE_SET 17 #define UDF_FLAG_INCONSISTENT 18 +#define UDF_FLAG_RW_INCOMPAT 19 /* Set when we find RW incompatible + * feature */ #define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001 #define UDF_PART_FLAG_UNALLOC_TABLE 0x0002 -#define UDF_PART_FLAG_FREED_BITMAP 0x0004 -#define UDF_PART_FLAG_FREED_TABLE 0x0008 #define UDF_PART_FLAG_READ_ONLY 0x0010 #define UDF_PART_FLAG_WRITE_ONCE 0x0020 #define UDF_PART_FLAG_REWRITABLE 0x0040 @@ -50,8 +50,6 @@ #define UDF_INVALID_MODE ((umode_t)-1) -#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */ - #define MF_DUPLICATE_MD 0x01 #define MF_MIRROR_FE_LOADED 0x02 @@ -93,10 +91,6 @@ struct udf_part_map { struct udf_bitmap *s_bitmap; struct inode *s_table; } s_uspace; - union { - struct udf_bitmap *s_bitmap; - struct inode *s_table; - } s_fspace; __u32 s_partition_root; __u32 s_partition_len; __u16 s_partition_type; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index bfa0ec69f924..356d2b8568c1 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1026,7 +1026,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, struct userfaultfd_ctx *fork_nctx = NULL; /* always take the fd_wqh lock before the fault_pending_wqh lock */ - spin_lock(&ctx->fd_wqh.lock); + spin_lock_irq(&ctx->fd_wqh.lock); __add_wait_queue(&ctx->fd_wqh, &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); @@ -1112,13 +1112,13 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, ret = -EAGAIN; break; } - spin_unlock(&ctx->fd_wqh.lock); + spin_unlock_irq(&ctx->fd_wqh.lock); schedule(); - spin_lock(&ctx->fd_wqh.lock); + spin_lock_irq(&ctx->fd_wqh.lock); } __remove_wait_queue(&ctx->fd_wqh, &wait); __set_current_state(TASK_RUNNING); - spin_unlock(&ctx->fd_wqh.lock); + spin_unlock_irq(&ctx->fd_wqh.lock); if (!ret && msg->event == UFFD_EVENT_FORK) { ret = resolve_userfault_fork(ctx, fork_nctx, msg); diff --git a/fs/utimes.c b/fs/utimes.c index 69d4b6ba1bfb..bdcf2daf39c1 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -8,35 +8,6 @@ #include <linux/compat.h> #include <asm/unistd.h> -#ifdef __ARCH_WANT_SYS_UTIME - -/* - * sys_utime() can be implemented in user-level using sys_utimes(). - * Is this for backwards compatibility? If so, why not move it - * into the appropriate arch directory (for those architectures that - * need it). - */ - -/* If times==NULL, set access and modification to current time, - * must be owner or have write permission. - * Else, update from *times, must be owner or super user. - */ -SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times) -{ - struct timespec64 tv[2]; - - if (times) { - if (get_user(tv[0].tv_sec, ×->actime) || - get_user(tv[1].tv_sec, ×->modtime)) - return -EFAULT; - tv[0].tv_nsec = 0; - tv[1].tv_nsec = 0; - } - return do_utimes(AT_FDCWD, filename, times ? tv : NULL, 0); -} - -#endif - static bool nsec_valid(long nsec) { if (nsec == UTIME_OMIT || nsec == UTIME_NOW) @@ -166,7 +137,7 @@ out: } SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename, - struct timespec __user *, utimes, int, flags) + struct __kernel_timespec __user *, utimes, int, flags) { struct timespec64 tstimes[2]; @@ -184,6 +155,13 @@ SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename, return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags); } +#ifdef __ARCH_WANT_SYS_UTIME +/* + * futimesat(), utimes() and utime() are older versions of utimensat() + * that are provided for compatibility with traditional C libraries. + * On modern architectures, we always use libc wrappers around + * utimensat() instead. + */ static long do_futimesat(int dfd, const char __user *filename, struct timeval __user *utimes) { @@ -225,13 +203,29 @@ SYSCALL_DEFINE2(utimes, char __user *, filename, return do_futimesat(AT_FDCWD, filename, utimes); } -#ifdef CONFIG_COMPAT +SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times) +{ + struct timespec64 tv[2]; + + if (times) { + if (get_user(tv[0].tv_sec, ×->actime) || + get_user(tv[1].tv_sec, ×->modtime)) + return -EFAULT; + tv[0].tv_nsec = 0; + tv[1].tv_nsec = 0; + } + return do_utimes(AT_FDCWD, filename, times ? tv : NULL, 0); +} +#endif + +#ifdef CONFIG_COMPAT_32BIT_TIME /* * Not all architectures have sys_utime, so implement this in terms * of sys_utimes. */ +#ifdef __ARCH_WANT_SYS_UTIME32 COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename, - struct compat_utimbuf __user *, t) + struct old_utimbuf32 __user *, t) { struct timespec64 tv[2]; @@ -244,14 +238,15 @@ COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename, } return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0); } +#endif -COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags) +COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct old_timespec32 __user *, t, int, flags) { struct timespec64 tv[2]; if (t) { - if (compat_get_timespec64(&tv[0], &t[0]) || - compat_get_timespec64(&tv[1], &t[1])) + if (get_old_timespec32(&tv[0], &t[0]) || + get_old_timespec32(&tv[1], &t[1])) return -EFAULT; if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) @@ -260,8 +255,9 @@ COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filena return do_utimes(dfd, filename, t ? tv : NULL, flags); } +#ifdef __ARCH_WANT_SYS_UTIME32 static long do_compat_futimesat(unsigned int dfd, const char __user *filename, - struct compat_timeval __user *t) + struct old_timeval32 __user *t) { struct timespec64 tv[2]; @@ -282,13 +278,14 @@ static long do_compat_futimesat(unsigned int dfd, const char __user *filename, COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, - struct compat_timeval __user *, t) + struct old_timeval32 __user *, t) { return do_compat_futimesat(dfd, filename, t); } -COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t) +COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct old_timeval32 __user *, t) { return do_compat_futimesat(AT_FDCWD, filename, t); } #endif +#endif diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 61a5ad2600e8..53c9ab8fb777 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -919,28 +919,67 @@ out_unlock: return error; } -STATIC int -xfs_file_clone_range( - struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len) -{ - return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, false); -} -STATIC int -xfs_file_dedupe_range( - struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len) +loff_t +xfs_file_remap_range( + struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + loff_t len, + unsigned int remap_flags) { - return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, true); + struct inode *inode_in = file_inode(file_in); + struct xfs_inode *src = XFS_I(inode_in); + struct inode *inode_out = file_inode(file_out); + struct xfs_inode *dest = XFS_I(inode_out); + struct xfs_mount *mp = src->i_mount; + loff_t remapped = 0; + xfs_extlen_t cowextsize; + int ret; + + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) + return -EINVAL; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return -EOPNOTSUPP; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + /* Prepare and then clone file data. */ + ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out, + &len, remap_flags); + if (ret < 0 || len == 0) + return ret; + + trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); + + ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len, + &remapped); + if (ret) + goto out_unlock; + + /* + * Carry the cowextsize hint from src to dest if we're sharing the + * entire source file to the entire destination file, the source file + * has a cowextsize hint, and the destination file does not. + */ + cowextsize = 0; + if (pos_in == 0 && len == i_size_read(inode_in) && + (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) && + pos_out == 0 && len >= i_size_read(inode_out) && + !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) + cowextsize = src->i_d.di_cowextsize; + + ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, + remap_flags); + +out_unlock: + xfs_reflink_remap_unlock(file_in, file_out); + if (ret) + trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); + return remapped > 0 ? remapped : ret; } STATIC int @@ -1175,8 +1214,7 @@ const struct file_operations xfs_file_operations = { .fsync = xfs_file_fsync, .get_unmapped_area = thp_get_unmapped_area, .fallocate = xfs_file_fallocate, - .clone_file_range = xfs_file_clone_range, - .dedupe_file_range = xfs_file_dedupe_range, + .remap_file_range = xfs_file_remap_range, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 8eaeec9d58ed..ecdb086bc23e 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -913,18 +913,18 @@ out_error: /* * Update destination inode size & cowextsize hint, if necessary. */ -STATIC int +int xfs_reflink_update_dest( struct xfs_inode *dest, xfs_off_t newlen, xfs_extlen_t cowextsize, - bool is_dedupe) + unsigned int remap_flags) { struct xfs_mount *mp = dest->i_mount; struct xfs_trans *tp; int error; - if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) + if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) return 0; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); @@ -945,10 +945,6 @@ xfs_reflink_update_dest( dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; } - if (!is_dedupe) { - xfs_trans_ichgtime(tp, dest, - XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - } xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); error = xfs_trans_commit(tp); @@ -1112,19 +1108,28 @@ out: /* * Iteratively remap one file's extents (and holes) to another's. */ -STATIC int +int xfs_reflink_remap_blocks( struct xfs_inode *src, - xfs_fileoff_t srcoff, + loff_t pos_in, struct xfs_inode *dest, - xfs_fileoff_t destoff, - xfs_filblks_t len, - xfs_off_t new_isize) + loff_t pos_out, + loff_t remap_len, + loff_t *remapped) { struct xfs_bmbt_irec imap; + xfs_fileoff_t srcoff; + xfs_fileoff_t destoff; + xfs_filblks_t len; + xfs_filblks_t range_len; + xfs_filblks_t remapped_len = 0; + xfs_off_t new_isize = pos_out + remap_len; int nimaps; int error = 0; - xfs_filblks_t range_len; + + destoff = XFS_B_TO_FSBT(src->i_mount, pos_out); + srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in); + len = XFS_B_TO_FSB(src->i_mount, remap_len); /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */ while (len) { @@ -1139,7 +1144,7 @@ xfs_reflink_remap_blocks( error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0); xfs_iunlock(src, lock_mode); if (error) - goto err; + break; ASSERT(nimaps == 1); trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE, @@ -1153,23 +1158,24 @@ xfs_reflink_remap_blocks( error = xfs_reflink_remap_extent(dest, &imap, destoff, new_isize); if (error) - goto err; + break; if (fatal_signal_pending(current)) { error = -EINTR; - goto err; + break; } /* Advance drange/srange */ srcoff += range_len; destoff += range_len; len -= range_len; + remapped_len += range_len; } - return 0; - -err: - trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_); + if (error) + trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_); + *remapped = min_t(loff_t, remap_len, + XFS_FSB_TO_B(src->i_mount, remapped_len)); return error; } @@ -1218,7 +1224,7 @@ retry: } /* Unlock both inodes after they've been prepped for a range clone. */ -STATIC void +void xfs_reflink_remap_unlock( struct file *file_in, struct file *file_out) @@ -1286,21 +1292,20 @@ xfs_reflink_zero_posteof( * stale data in the destination file. Hence we reject these clone attempts with * -EINVAL in this case. */ -STATIC int +int xfs_reflink_remap_prep( struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 *len, - bool is_dedupe) + loff_t *len, + unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); struct xfs_inode *src = XFS_I(inode_in); struct inode *inode_out = file_inode(file_out); struct xfs_inode *dest = XFS_I(inode_out); bool same_inode = (inode_in == inode_out); - u64 blkmask = i_blocksize(inode_in) - 1; ssize_t ret; /* Lock both files against IO */ @@ -1323,29 +1328,11 @@ xfs_reflink_remap_prep( if (IS_DAX(inode_in) || IS_DAX(inode_out)) goto out_unlock; - ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, - len, is_dedupe); - if (ret <= 0) + ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, + len, remap_flags); + if (ret < 0 || *len == 0) goto out_unlock; - /* - * If the dedupe data matches, chop off the partial EOF block - * from the source file so we don't try to dedupe the partial - * EOF block. - */ - if (is_dedupe) { - *len &= ~blkmask; - } else if (*len & blkmask) { - /* - * The user is attempting to share a partial EOF block, - * if it's inside the destination EOF then reject it. - */ - if (pos_out + *len < i_size_read(inode_out)) { - ret = -EINVAL; - goto out_unlock; - } - } - /* Attach dquots to dest inode before changing block map */ ret = xfs_qm_dqattach(dest); if (ret) @@ -1365,31 +1352,9 @@ xfs_reflink_remap_prep( goto out_unlock; /* Zap any page cache for the destination file's range. */ - truncate_inode_pages_range(&inode_out->i_data, pos_out, - PAGE_ALIGN(pos_out + *len) - 1); - - /* If we're altering the file contents... */ - if (!is_dedupe) { - /* - * ...update the timestamps (which will grab the ilock again - * from xfs_fs_dirty_inode, so we have to call it before we - * take the ilock). - */ - if (!(file_out->f_mode & FMODE_NOCMTIME)) { - ret = file_update_time(file_out); - if (ret) - goto out_unlock; - } - - /* - * ...clear the security bits if the process is not being run - * by root. This keeps people from modifying setuid and setgid - * binaries. - */ - ret = file_remove_privs(file_out); - if (ret) - goto out_unlock; - } + truncate_inode_pages_range(&inode_out->i_data, + round_down(pos_out, PAGE_SIZE), + round_up(pos_out + *len, PAGE_SIZE) - 1); return 1; out_unlock: @@ -1398,72 +1363,6 @@ out_unlock: } /* - * Link a range of blocks from one file to another. - */ -int -xfs_reflink_remap_range( - struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len, - bool is_dedupe) -{ - struct inode *inode_in = file_inode(file_in); - struct xfs_inode *src = XFS_I(inode_in); - struct inode *inode_out = file_inode(file_out); - struct xfs_inode *dest = XFS_I(inode_out); - struct xfs_mount *mp = src->i_mount; - xfs_fileoff_t sfsbno, dfsbno; - xfs_filblks_t fsblen; - xfs_extlen_t cowextsize; - ssize_t ret; - - if (!xfs_sb_version_hasreflink(&mp->m_sb)) - return -EOPNOTSUPP; - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - /* Prepare and then clone file data. */ - ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out, - &len, is_dedupe); - if (ret <= 0) - return ret; - - trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); - - dfsbno = XFS_B_TO_FSBT(mp, pos_out); - sfsbno = XFS_B_TO_FSBT(mp, pos_in); - fsblen = XFS_B_TO_FSB(mp, len); - ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen, - pos_out + len); - if (ret) - goto out_unlock; - - /* - * Carry the cowextsize hint from src to dest if we're sharing the - * entire source file to the entire destination file, the source file - * has a cowextsize hint, and the destination file does not. - */ - cowextsize = 0; - if (pos_in == 0 && len == i_size_read(inode_in) && - (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) && - pos_out == 0 && len >= i_size_read(inode_out) && - !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) - cowextsize = src->i_d.di_cowextsize; - - ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, - is_dedupe); - -out_unlock: - xfs_reflink_remap_unlock(file_in, file_out); - if (ret) - trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); - return ret; -} - -/* * The user wants to preemptively CoW all shared blocks in this file, * which enables us to turn off the reflink flag. Iterate all * extents which are not prealloc/delalloc to see which ranges are diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 7f47202b5639..6d73daef1f13 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -27,13 +27,24 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); extern int xfs_reflink_recover_cow(struct xfs_mount *mp); -extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len, bool is_dedupe); +extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, loff_t len, + unsigned int remap_flags); extern int xfs_reflink_inode_has_shared_extents(struct xfs_trans *tp, struct xfs_inode *ip, bool *has_shared); extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip, struct xfs_trans **tpp); extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len); +extern int xfs_reflink_remap_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, loff_t *len, + unsigned int remap_flags); +extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in, + struct xfs_inode *dest, loff_t pos_out, loff_t remap_len, + loff_t *remapped); +extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen, + xfs_extlen_t cowextsize, unsigned int remap_flags); +extern void xfs_reflink_remap_unlock(struct file *file_in, + struct file *file_out); #endif /* __XFS_REFLINK_H */ |