From c57dcb566d3d866a302a1da2e06344bec31d5bcd Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Thu, 2 Apr 2015 08:39:00 +0100 Subject: efivarfs: Ensure VariableName is NUL-terminated Some buggy firmware implementations update VariableNameSize on success such that it does not include the final NUL character which results in garbage in the efivarfs name entries. Use kzalloc on the efivar_entry (as is done in efivars.c) to ensure that the name is always NUL-terminated. The buggy firmware is: BIOS Information Vendor: Intel Corp. Version: S1200RP.86B.02.02.0005.102320140911 Release Date: 10/23/2014 BIOS Revision: 4.6 System Information Manufacturer: Intel Corporation Product Name: S1200RP_SE Signed-off-by: Ross Lagerwall Acked-by: Matthew Garrett Cc: Jeremy Kerr Cc: Signed-off-by: Matt Fleming --- fs/efivarfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index ddbce42548c9..acf9a67f6770 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -121,7 +121,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, int len, i; int err = -ENOMEM; - entry = kmalloc(sizeof(*entry), GFP_KERNEL); + entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return err; -- cgit v1.2.3 From 5de0b4d0cd153c471640b13aae6ae6d18d0a4603 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2015 16:56:45 -0400 Subject: ext4 crypto: simplify and speed up filename encryption Avoid using SHA-1 when calculating the user-visible filename when the encryption key is available, and avoid decrypting lots of filenames when searching for a directory entry in a directory block. Change-Id: If4655f144784978ba0305b597bfa1c8d7bb69e63 Signed-off-by: Theodore Ts'o --- fs/ext4/crypto_fname.c | 268 +++++++++++++++++++++++++------------------------ fs/ext4/dir.c | 2 +- fs/ext4/ext4.h | 9 +- fs/ext4/namei.c | 72 ++----------- fs/ext4/symlink.c | 2 +- 5 files changed, 149 insertions(+), 204 deletions(-) (limited to 'fs') diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index ca2f5948c1ac..7a877e609e5f 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -198,106 +198,57 @@ static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx, return oname->len; } +static const char *lookup_table = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; + /** * ext4_fname_encode_digest() - * * Encodes the input digest using characters from the set [a-zA-Z0-9_+]. * The encoded string is roughly 4/3 times the size of the input string. */ -int ext4_fname_encode_digest(char *dst, char *src, u32 len) +static int digest_encode(const char *src, int len, char *dst) { - static const char *lookup_table = - "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_+"; - u32 current_chunk, num_chunks, i; - char tmp_buf[3]; - u32 c0, c1, c2, c3; - - current_chunk = 0; - num_chunks = len/3; - for (i = 0; i < num_chunks; i++) { - c0 = src[3*i] & 0x3f; - c1 = (((src[3*i]>>6)&0x3) | ((src[3*i+1] & 0xf)<<2)) & 0x3f; - c2 = (((src[3*i+1]>>4)&0xf) | ((src[3*i+2] & 0x3)<<4)) & 0x3f; - c3 = (src[3*i+2]>>2) & 0x3f; - dst[4*i] = lookup_table[c0]; - dst[4*i+1] = lookup_table[c1]; - dst[4*i+2] = lookup_table[c2]; - dst[4*i+3] = lookup_table[c3]; - } - if (i*3 < len) { - memset(tmp_buf, 0, 3); - memcpy(tmp_buf, &src[3*i], len-3*i); - c0 = tmp_buf[0] & 0x3f; - c1 = (((tmp_buf[0]>>6)&0x3) | ((tmp_buf[1] & 0xf)<<2)) & 0x3f; - c2 = (((tmp_buf[1]>>4)&0xf) | ((tmp_buf[2] & 0x3)<<4)) & 0x3f; - c3 = (tmp_buf[2]>>2) & 0x3f; - dst[4*i] = lookup_table[c0]; - dst[4*i+1] = lookup_table[c1]; - dst[4*i+2] = lookup_table[c2]; - dst[4*i+3] = lookup_table[c3]; + int i = 0, bits = 0, ac = 0; + char *cp = dst; + + while (i < len) { + ac += (((unsigned char) src[i]) << bits); + bits += 8; + do { + *cp++ = lookup_table[ac & 0x3f]; + ac >>= 6; + bits -= 6; + } while (bits >= 6); i++; } - return (i * 4); + if (bits) + *cp++ = lookup_table[ac & 0x3f]; + return cp - dst; } -/** - * ext4_fname_hash() - - * - * This function computes the hash of the input filename, and sets the output - * buffer to the *encoded* digest. It returns the length of the digest as its - * return value. Errors are returned as negative numbers. We trust the caller - * to allocate sufficient memory to oname string. - */ -static int ext4_fname_hash(struct ext4_fname_crypto_ctx *ctx, - const struct ext4_str *iname, - struct ext4_str *oname) +static int digest_decode(const char *src, int len, char *dst) { - struct scatterlist sg; - struct hash_desc desc = { - .tfm = (struct crypto_hash *)ctx->htfm, - .flags = CRYPTO_TFM_REQ_MAY_SLEEP - }; - int res = 0; - - if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) { - res = ext4_fname_encode_digest(oname->name, iname->name, - iname->len); - oname->len = res; - return res; - } - - sg_init_one(&sg, iname->name, iname->len); - res = crypto_hash_init(&desc); - if (res) { - printk(KERN_ERR - "%s: Error initializing crypto hash; res = [%d]\n", - __func__, res); - goto out; - } - res = crypto_hash_update(&desc, &sg, iname->len); - if (res) { - printk(KERN_ERR - "%s: Error updating crypto hash; res = [%d]\n", - __func__, res); - goto out; - } - res = crypto_hash_final(&desc, - &oname->name[EXT4_FNAME_CRYPTO_DIGEST_SIZE]); - if (res) { - printk(KERN_ERR - "%s: Error finalizing crypto hash; res = [%d]\n", - __func__, res); - goto out; + int i = 0, bits = 0, ac = 0; + const char *p; + char *cp = dst; + + while (i < len) { + p = strchr(lookup_table, src[i]); + if (p == NULL || src[i] == 0) + return -2; + ac += (p - lookup_table) << bits; + bits += 6; + if (bits >= 8) { + *cp++ = ac & 0xff; + ac >>= 8; + bits -= 8; + } + i++; } - /* Encode the digest as a printable string--this will increase the - * size of the digest */ - oname->name[0] = 'I'; - res = ext4_fname_encode_digest(oname->name+1, - &oname->name[EXT4_FNAME_CRYPTO_DIGEST_SIZE], - EXT4_FNAME_CRYPTO_DIGEST_SIZE) + 1; - oname->len = res; -out: - return res; + if (ac) + return -1; + return cp - dst; } /** @@ -571,9 +522,13 @@ void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str) * ext4_fname_disk_to_usr() - converts a filename from disk space to user space */ int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, - const struct ext4_str *iname, - struct ext4_str *oname) + struct dx_hash_info *hinfo, + const struct ext4_str *iname, + struct ext4_str *oname) { + char buf[24]; + int ret; + if (ctx == NULL) return -EIO; if (iname->len < 3) { @@ -587,18 +542,33 @@ int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, } if (ctx->has_valid_key) return ext4_fname_decrypt(ctx, iname, oname); - else - return ext4_fname_hash(ctx, iname, oname); + + if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) { + ret = digest_encode(iname->name, iname->len, oname->name); + oname->len = ret; + return ret; + } + if (hinfo) { + memcpy(buf, &hinfo->hash, 4); + memcpy(buf+4, &hinfo->minor_hash, 4); + } else + memset(buf, 0, 8); + memcpy(buf + 8, iname->name + iname->len - 16, 16); + oname->name[0] = '_'; + ret = digest_encode(buf, 24, oname->name+1); + oname->len = ret + 1; + return ret + 1; } int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, + struct dx_hash_info *hinfo, const struct ext4_dir_entry_2 *de, struct ext4_str *oname) { struct ext4_str iname = {.name = (unsigned char *) de->name, .len = de->name_len }; - return _ext4_fname_disk_to_usr(ctx, &iname, oname); + return _ext4_fname_disk_to_usr(ctx, hinfo, &iname, oname); } @@ -640,10 +610,11 @@ int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx, const struct qstr *iname, struct dx_hash_info *hinfo) { - struct ext4_str tmp, tmp2; + struct ext4_str tmp; int ret = 0; + char buf[EXT4_FNAME_CRYPTO_DIGEST_SIZE+1]; - if (!ctx || !ctx->has_valid_key || + if (!ctx || ((iname->name[0] == '.') && ((iname->len == 1) || ((iname->name[1] == '.') && (iname->len == 2))))) { @@ -651,59 +622,90 @@ int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx, return 0; } + if (!ctx->has_valid_key && iname->name[0] == '_') { + if (iname->len != 33) + return -ENOENT; + ret = digest_decode(iname->name+1, iname->len, buf); + if (ret != 24) + return -ENOENT; + memcpy(&hinfo->hash, buf, 4); + memcpy(&hinfo->minor_hash, buf + 4, 4); + return 0; + } + + if (!ctx->has_valid_key && iname->name[0] != '_') { + if (iname->len > 43) + return -ENOENT; + ret = digest_decode(iname->name, iname->len, buf); + ext4fs_dirhash(buf, ret, hinfo); + return 0; + } + /* First encrypt the plaintext name */ ret = ext4_fname_crypto_alloc_buffer(ctx, iname->len, &tmp); if (ret < 0) return ret; ret = ext4_fname_encrypt(ctx, iname, &tmp); - if (ret < 0) - goto out; - - tmp2.len = (4 * ((EXT4_FNAME_CRYPTO_DIGEST_SIZE + 2) / 3)) + 1; - tmp2.name = kmalloc(tmp2.len + 1, GFP_KERNEL); - if (tmp2.name == NULL) { - ret = -ENOMEM; - goto out; + if (ret >= 0) { + ext4fs_dirhash(tmp.name, tmp.len, hinfo); + ret = 0; } - ret = ext4_fname_hash(ctx, &tmp, &tmp2); - if (ret > 0) - ext4fs_dirhash(tmp2.name, tmp2.len, hinfo); - ext4_fname_crypto_free_buffer(&tmp2); -out: ext4_fname_crypto_free_buffer(&tmp); return ret; } -/** - * ext4_fname_disk_to_htree() - converts a filename from disk space to htree-access string - */ -int ext4_fname_disk_to_hash(struct ext4_fname_crypto_ctx *ctx, - const struct ext4_dir_entry_2 *de, - struct dx_hash_info *hinfo) +int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr, + int len, const char * const name, + struct ext4_dir_entry_2 *de) { - struct ext4_str iname = {.name = (unsigned char *) de->name, - .len = de->name_len}; - struct ext4_str tmp; - int ret; + int ret = -ENOENT; + int bigname = (*name == '_'); - if (!ctx || - ((iname.name[0] == '.') && - ((iname.len == 1) || - ((iname.name[1] == '.') && (iname.len == 2))))) { - ext4fs_dirhash(iname.name, iname.len, hinfo); - return 0; + if (ctx->has_valid_key) { + if (cstr->name == NULL) { + struct qstr istr; + + ret = ext4_fname_crypto_alloc_buffer(ctx, len, cstr); + if (ret < 0) + goto errout; + istr.name = name; + istr.len = len; + ret = ext4_fname_encrypt(ctx, &istr, cstr); + if (ret < 0) + goto errout; + } + } else { + if (cstr->name == NULL) { + cstr->name = kmalloc(32, GFP_KERNEL); + if (cstr->name == NULL) + return -ENOMEM; + if ((bigname && (len != 33)) || + (!bigname && (len > 43))) + goto errout; + ret = digest_decode(name+bigname, len-bigname, + cstr->name); + if (ret < 0) { + ret = -ENOENT; + goto errout; + } + cstr->len = ret; + } + if (bigname) { + if (de->name_len < 16) + return 0; + ret = memcmp(de->name + de->name_len - 16, + cstr->name + 8, 16); + return (ret == 0) ? 1 : 0; + } } - - tmp.len = (4 * ((EXT4_FNAME_CRYPTO_DIGEST_SIZE + 2) / 3)) + 1; - tmp.name = kmalloc(tmp.len + 1, GFP_KERNEL); - if (tmp.name == NULL) - return -ENOMEM; - - ret = ext4_fname_hash(ctx, &iname, &tmp); - if (ret > 0) - ext4fs_dirhash(tmp.name, tmp.len, hinfo); - ext4_fname_crypto_free_buffer(&tmp); + if (de->name_len != cstr->len) + return 0; + ret = memcmp(de->name, cstr->name, cstr->len); + return (ret == 0) ? 1 : 0; +errout: + kfree(cstr->name); + cstr->name = NULL; return ret; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 61db51a5ce4c..5665d82d2332 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -249,7 +249,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) } else { /* Directory is encrypted */ err = ext4_fname_disk_to_usr(enc_ctx, - de, &fname_crypto_str); + NULL, de, &fname_crypto_str); if (err < 0) goto errout; if (!dir_emit(ctx, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0179654faf79..dfb113816672 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2093,9 +2093,11 @@ u32 ext4_fname_crypto_round_up(u32 size, u32 blksize); int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx, u32 ilen, struct ext4_str *crypto_str); int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, + struct dx_hash_info *hinfo, const struct ext4_str *iname, struct ext4_str *oname); int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, + struct dx_hash_info *hinfo, const struct ext4_dir_entry_2 *de, struct ext4_str *oname); int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, @@ -2104,11 +2106,12 @@ int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx, const struct qstr *iname, struct dx_hash_info *hinfo); -int ext4_fname_disk_to_hash(struct ext4_fname_crypto_ctx *ctx, - const struct ext4_dir_entry_2 *de, - struct dx_hash_info *hinfo); int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, u32 namelen); +int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr, + int len, const char * const name, + struct ext4_dir_entry_2 *de); + #ifdef CONFIG_EXT4_FS_ENCRYPTION void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 4f87127f781f..5ea737114716 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -640,7 +640,7 @@ static struct stats dx_show_leaf(struct inode *dir, ext4_put_fname_crypto_ctx(&ctx); ctx = NULL; } - res = ext4_fname_disk_to_usr(ctx, de, + res = ext4_fname_disk_to_usr(ctx, NULL, de, &fname_crypto_str); if (res < 0) { printk(KERN_WARNING "Error " @@ -653,15 +653,8 @@ static struct stats dx_show_leaf(struct inode *dir, name = fname_crypto_str.name; len = fname_crypto_str.len; } - res = ext4_fname_disk_to_hash(ctx, de, - &h); - if (res < 0) { - printk(KERN_WARNING "Error " - "converting filename " - "from disk to htree" - "\n"); - h.hash = 0xDEADBEEF; - } + ext4fs_dirhash(de->name, de->name_len, + &h); printk("%*.s:(E)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); @@ -1008,15 +1001,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, /* silently ignore the rest of the block */ break; } -#ifdef CONFIG_EXT4_FS_ENCRYPTION - err = ext4_fname_disk_to_hash(ctx, de, hinfo); - if (err < 0) { - count = err; - goto errout; - } -#else ext4fs_dirhash(de->name, de->name_len, hinfo); -#endif if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && (hinfo->minor_hash < start_minor_hash))) @@ -1032,7 +1017,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, &tmp_str); } else { /* Directory is encrypted */ - err = ext4_fname_disk_to_usr(ctx, de, + err = ext4_fname_disk_to_usr(ctx, hinfo, de, &fname_crypto_str); if (err < 0) { count = err; @@ -1193,26 +1178,10 @@ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, int count = 0; char *base = (char *) de; struct dx_hash_info h = *hinfo; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - struct ext4_fname_crypto_ctx *ctx = NULL; - int err; - - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); -#endif while ((char *) de < base + blocksize) { if (de->name_len && de->inode) { -#ifdef CONFIG_EXT4_FS_ENCRYPTION - err = ext4_fname_disk_to_hash(ctx, de, &h); - if (err < 0) { - ext4_put_fname_crypto_ctx(&ctx); - return err; - } -#else ext4fs_dirhash(de->name, de->name_len, &h); -#endif map_tail--; map_tail->hash = h.hash; map_tail->offs = ((char *) de - base)>>2; @@ -1223,9 +1192,6 @@ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, /* XXX: do we need to check rec_len == 0 case? -Chris */ de = ext4_next_entry(de, blocksize); } -#ifdef CONFIG_EXT4_FS_ENCRYPTION - ext4_put_fname_crypto_ctx(&ctx); -#endif return count; } @@ -1287,16 +1253,8 @@ static inline int ext4_match(struct ext4_fname_crypto_ctx *ctx, return 0; #ifdef CONFIG_EXT4_FS_ENCRYPTION - if (ctx) { - /* Directory is encrypted */ - res = ext4_fname_disk_to_usr(ctx, de, fname_crypto_str); - if (res < 0) - return res; - if (len != res) - return 0; - res = memcmp(name, fname_crypto_str->name, len); - return (res == 0) ? 1 : 0; - } + if (ctx) + return ext4_fname_match(ctx, fname_crypto_str, len, name, de); #endif if (len != de->name_len) return 0; @@ -1324,16 +1282,6 @@ int search_dir(struct buffer_head *bh, char *search_buf, int buf_size, if (IS_ERR(ctx)) return -1; - if (ctx != NULL) { - /* Allocate buffer to hold maximum name length */ - res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN, - &fname_crypto_str); - if (res < 0) { - ext4_put_fname_crypto_ctx(&ctx); - return -1; - } - } - de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; while ((char *) de < dlimit) { @@ -1872,14 +1820,6 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, return res; } reclen = EXT4_DIR_REC_LEN(res); - - /* Allocate buffer to hold maximum name length */ - res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN, - &fname_crypto_str); - if (res < 0) { - ext4_put_fname_crypto_ctx(&ctx); - return -1; - } } de = (struct ext4_dir_entry_2 *)buf; diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 136ca0e911fd..ce2ed286ba08 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -74,7 +74,7 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) goto errout; } pstr.name = paddr; - res = _ext4_fname_disk_to_usr(ctx, &cstr, &pstr); + res = _ext4_fname_disk_to_usr(ctx, NULL, &cstr, &pstr); if (res < 0) goto errout; /* Null-terminate the name */ -- cgit v1.2.3 From a44cd7a05496d60fd2ba8cca080e3db8f481549b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2015 16:56:50 -0400 Subject: ext4 crypto: add padding to filenames before encrypting This obscures the length of the filenames, to decrease the amount of information leakage. By default, we pad the filenames to the next 4 byte boundaries. This costs nothing, since the directory entries are aligned to 4 byte boundaries anyway. Filenames can also be padded to 8, 16, or 32 bytes, which will consume more directory space. Change-Id: Ibb7a0fb76d2c48e2061240a709358ff40b14f322 Signed-off-by: Theodore Ts'o --- fs/ext4/crypto_fname.c | 12 ++++++++++-- fs/ext4/crypto_key.c | 1 + fs/ext4/crypto_policy.c | 14 +++++++++----- fs/ext4/ext4.h | 1 + fs/ext4/ext4_crypto.h | 11 ++++++++++- 5 files changed, 31 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 7a877e609e5f..fded02f72299 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -66,6 +66,7 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, int res = 0; char iv[EXT4_CRYPTO_BLOCK_SIZE]; struct scatterlist sg[1]; + int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); char *workbuf; if (iname->len <= 0 || iname->len > ctx->lim) @@ -73,6 +74,7 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, ciphertext_len = (iname->len < EXT4_CRYPTO_BLOCK_SIZE) ? EXT4_CRYPTO_BLOCK_SIZE : iname->len; + ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding); ciphertext_len = (ciphertext_len > ctx->lim) ? ctx->lim : ciphertext_len; @@ -101,7 +103,7 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, /* Create encryption request */ sg_init_table(sg, 1); sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0); - ablkcipher_request_set_crypt(req, sg, sg, iname->len, iv); + ablkcipher_request_set_crypt(req, sg, sg, ciphertext_len, iv); res = crypto_ablkcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { BUG_ON(req->base.data != &ecr); @@ -356,6 +358,7 @@ struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx( if (IS_ERR(ctx)) return ctx; + ctx->flags = ei->i_crypt_policy_flags; if (ctx->has_valid_key) { if (ctx->key.mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) { printk_once(KERN_WARNING @@ -468,6 +471,7 @@ int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, u32 namelen) { u32 ciphertext_len; + int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); if (ctx == NULL) return -EIO; @@ -475,6 +479,7 @@ int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, return -EACCES; ciphertext_len = (namelen < EXT4_CRYPTO_BLOCK_SIZE) ? EXT4_CRYPTO_BLOCK_SIZE : namelen; + ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding); ciphertext_len = (ciphertext_len > ctx->lim) ? ctx->lim : ciphertext_len; return (int) ciphertext_len; @@ -490,10 +495,13 @@ int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx, u32 ilen, struct ext4_str *crypto_str) { unsigned int olen; + int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); if (!ctx) return -EIO; - olen = ext4_fname_crypto_round_up(ilen, EXT4_CRYPTO_BLOCK_SIZE); + if (padding < EXT4_CRYPTO_BLOCK_SIZE) + padding = EXT4_CRYPTO_BLOCK_SIZE; + olen = ext4_fname_crypto_round_up(ilen, padding); crypto_str->len = olen; if (olen < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) olen = EXT4_FNAME_CRYPTO_DIGEST_SIZE*2; diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index c8392af8abbb..52170d0b7c40 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -110,6 +110,7 @@ int ext4_generate_encryption_key(struct inode *inode) } res = 0; + ei->i_crypt_policy_flags = ctx.flags; if (S_ISREG(inode->i_mode)) crypt_key->mode = ctx.contents_encryption_mode; else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index 30eaf9e9864a..a6d6291aea16 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -37,6 +37,8 @@ static int ext4_is_encryption_context_consistent_with_policy( return 0; return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, EXT4_KEY_DESCRIPTOR_SIZE) == 0 && + (ctx.flags == + policy->flags) && (ctx.contents_encryption_mode == policy->contents_encryption_mode) && (ctx.filenames_encryption_mode == @@ -56,25 +58,25 @@ static int ext4_create_encryption_context_from_policy( printk(KERN_WARNING "%s: Invalid contents encryption mode %d\n", __func__, policy->contents_encryption_mode); - res = -EINVAL; - goto out; + return -EINVAL; } if (!ext4_valid_filenames_enc_mode(policy->filenames_encryption_mode)) { printk(KERN_WARNING "%s: Invalid filenames encryption mode %d\n", __func__, policy->filenames_encryption_mode); - res = -EINVAL; - goto out; + return -EINVAL; } + if (policy->flags & ~EXT4_POLICY_FLAGS_VALID) + return -EINVAL; ctx.contents_encryption_mode = policy->contents_encryption_mode; ctx.filenames_encryption_mode = policy->filenames_encryption_mode; + ctx.flags = policy->flags; BUILD_BUG_ON(sizeof(ctx.nonce) != EXT4_KEY_DERIVATION_NONCE_SIZE); get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, sizeof(ctx), 0); -out: if (!res) ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); return res; @@ -115,6 +117,7 @@ int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy) policy->version = 0; policy->contents_encryption_mode = ctx.contents_encryption_mode; policy->filenames_encryption_mode = ctx.filenames_encryption_mode; + policy->flags = ctx.flags; memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, EXT4_KEY_DESCRIPTOR_SIZE); return 0; @@ -176,6 +179,7 @@ int ext4_inherit_context(struct inode *parent, struct inode *child) EXT4_ENCRYPTION_MODE_AES_256_XTS; ctx.filenames_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; memset(ctx.master_key_descriptor, 0x42, EXT4_KEY_DESCRIPTOR_SIZE); res = 0; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index dfb113816672..bca1bdc67725 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -911,6 +911,7 @@ struct ext4_inode_info { /* on-disk additional length */ __u16 i_extra_isize; + char i_crypt_policy_flags; /* Indicate the inline data space. */ u16 i_inline_off; diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index c2ba35a914b6..d75159c101ce 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -20,12 +20,20 @@ struct ext4_encryption_policy { char version; char contents_encryption_mode; char filenames_encryption_mode; + char flags; char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE]; } __attribute__((__packed__)); #define EXT4_ENCRYPTION_CONTEXT_FORMAT_V1 1 #define EXT4_KEY_DERIVATION_NONCE_SIZE 16 +#define EXT4_POLICY_FLAGS_PAD_4 0x00 +#define EXT4_POLICY_FLAGS_PAD_8 0x01 +#define EXT4_POLICY_FLAGS_PAD_16 0x02 +#define EXT4_POLICY_FLAGS_PAD_32 0x03 +#define EXT4_POLICY_FLAGS_PAD_MASK 0x03 +#define EXT4_POLICY_FLAGS_VALID 0x03 + /** * Encryption context for inode * @@ -41,7 +49,7 @@ struct ext4_encryption_context { char format; char contents_encryption_mode; char filenames_encryption_mode; - char reserved; + char flags; char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE]; char nonce[EXT4_KEY_DERIVATION_NONCE_SIZE]; } __attribute__((__packed__)); @@ -120,6 +128,7 @@ struct ext4_fname_crypto_ctx { struct crypto_hash *htfm; struct page *workpage; struct ext4_encryption_key key; + unsigned flags : 8; unsigned has_valid_key : 1; unsigned ctfm_key_is_ready : 1; }; -- cgit v1.2.3 From fb63e5489f7ef5bb4d1a655984ca7ef98ffc5849 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 2 May 2015 10:29:19 -0400 Subject: ext4 crypto: do not select from EXT4_FS_ENCRYPTION This patch adds a tristate EXT4_ENCRYPTION to do the selections for EXT4_FS_ENCRYPTION because selecting from a bool causes all the selected options to be built-in, even if EXT4 itself is a module. Signed-off-by: Herbert Xu Signed-off-by: Theodore Ts'o --- fs/ext4/Kconfig | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 18228c201f7f..024f2284d3f6 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -64,8 +64,8 @@ config EXT4_FS_SECURITY If you are not using a security module that requires using extended attributes for file security labels, say N. -config EXT4_FS_ENCRYPTION - bool "Ext4 Encryption" +config EXT4_ENCRYPTION + tristate "Ext4 Encryption" depends on EXT4_FS select CRYPTO_AES select CRYPTO_CBC @@ -81,6 +81,11 @@ config EXT4_FS_ENCRYPTION efficient since it avoids caching the encrypted and decrypted pages in the page cache. +config EXT4_FS_ENCRYPTION + bool + default y + depends on EXT4_ENCRYPTION + config EXT4_DEBUG bool "EXT4 debugging support" depends on EXT4_FS -- cgit v1.2.3 From 9402bdcacdfedf7219a17e4d93300058a8e2aa4c Mon Sep 17 00:00:00 2001 From: Chanho Park Date: Sat, 2 May 2015 10:29:22 -0400 Subject: ext4 crypto: remove duplicated encryption mode definitions This patch removes duplicated encryption modes which were already in ext4.h. They were duplicated from commit 3edc18d and commit f542fb. Cc: Theodore Ts'o Cc: Michael Halcrow Cc: Andreas Dilger Signed-off-by: Chanho Park Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bca1bdc67725..1de8c7f06897 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1067,12 +1067,6 @@ extern void ext4_set_bits(void *bm, int cur, int len); /* Metadata checksum algorithm codes */ #define EXT4_CRC32C_CHKSUM 1 -/* Encryption algorithms */ -#define EXT4_ENCRYPTION_MODE_INVALID 0 -#define EXT4_ENCRYPTION_MODE_AES_256_XTS 1 -#define EXT4_ENCRYPTION_MODE_AES_256_GCM 2 -#define EXT4_ENCRYPTION_MODE_AES_256_CBC 3 - /* * Structure of the super block */ -- cgit v1.2.3 From d2dc317d564a46dfc683978a2e5a4f91434e9711 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sat, 2 May 2015 21:36:55 -0400 Subject: ext4: fix data corruption caused by unwritten and delayed extents Currently it is possible to lose whole file system block worth of data when we hit the specific interaction with unwritten and delayed extents in status extent tree. The problem is that when we insert delayed extent into extent status tree the only way to get rid of it is when we write out delayed buffer. However there is a limitation in the extent status tree implementation so that when inserting unwritten extent should there be even a single delayed block the whole unwritten extent would be marked as delayed. At this point, there is no way to get rid of the delayed extents, because there are no delayed buffers to write out. So when a we write into said unwritten extent we will convert it to written, but it still remains delayed. When we try to write into that block later ext4_da_map_blocks() will set the buffer new and delayed and map it to invalid block which causes the rest of the block to be zeroed loosing already written data. For now we can fix this by simply not allowing to set delayed status on written extent in the extent status tree. Also add WARN_ON() to make sure that we notice if this happens in the future. This problem can be easily reproduced by running the following xfs_io. xfs_io -f -c "pwrite -S 0xaa 4096 2048" \ -c "falloc 0 131072" \ -c "pwrite -S 0xbb 65536 2048" \ -c "fsync" /mnt/test/fff echo 3 > /proc/sys/vm/drop_caches xfs_io -c "pwrite -S 0xdd 67584 2048" /mnt/test/fff This can be theoretically also reproduced by at random by running fsx, but it's not very reliable, though on machines with bigger page size (like ppc) this can be seen more often (especially xfstest generic/127) Signed-off-by: Lukas Czerner Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/extents_status.c | 8 ++++++++ fs/ext4/inode.c | 2 ++ 2 files changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index d33d5a6852b9..26724aeece73 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -703,6 +703,14 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, BUG_ON(end < lblk); + if ((status & EXTENT_STATUS_DELAYED) && + (status & EXTENT_STATUS_WRITTEN)) { + ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as " + " delayed and written which can potentially " + " cause data loss.\n", lblk, len); + WARN_ON(1); + } + newes.es_lblk = lblk; newes.es_len = len; ext4_es_store_pblock_status(&newes, pblk, status); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f6b35d8a4a5b..4415cea85ced 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -532,6 +532,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && + !(status & EXTENT_STATUS_WRITTEN) && ext4_find_delalloc_range(inode, map->m_lblk, map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; @@ -636,6 +637,7 @@ found: status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && + !(status & EXTENT_STATUS_WRITTEN) && ext4_find_delalloc_range(inode, map->m_lblk, map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; -- cgit v1.2.3 From 280227a75b56ab5d35854f3a77ef74a7ad56a203 Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Sat, 2 May 2015 23:21:15 -0400 Subject: ext4: move check under lock scope to close a race. fallocate() checks that the file is extent-based and returns EOPNOTSUPP in case is not. Other tasks can convert from and to indirect and extent so it's safe to check only after grabbing the inode mutex. Signed-off-by: Davide Italiano Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/extents.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 973816bfe4a9..d74e08029643 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4927,13 +4927,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (ret) return ret; - /* - * currently supporting (pre)allocate mode for extent-based - * files _only_ - */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return -EOPNOTSUPP; - if (mode & FALLOC_FL_COLLAPSE_RANGE) return ext4_collapse_range(inode, offset, len); @@ -4955,6 +4948,14 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) mutex_lock(&inode->i_mutex); + /* + * We only support preallocation for extent-based files only + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + ret = -EOPNOTSUPP; + goto out; + } + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > i_size_read(inode)) { new_size = offset + len; -- cgit v1.2.3 From 2c869b262a10ca99cb866d04087d75311587a30c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 2 May 2015 23:58:32 -0400 Subject: ext4: fix growing of tiny filesystems The estimate of necessary transaction credits in ext4_flex_group_add() is too pessimistic. It reserves credit for sb, resize inode, and resize inode dindirect block for each group added in a flex group although they are always the same block and thus it is enough to account them only once. Also the number of modified GDT block is overestimated since we fit EXT4_DESC_PER_BLOCK(sb) descriptors in one block. Make the estimation more precise. That reduces number of requested credits enough that we can grow 20 MB filesystem (which has 1 MB journal, 79 reserved GDT blocks, and flex group size 16 by default). Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Reviewed-by: Eric Sandeen --- fs/ext4/resize.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 8a8ec6293b19..cf0c472047e3 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1432,12 +1432,15 @@ static int ext4_flex_group_add(struct super_block *sb, goto exit; /* * We will always be modifying at least the superblock and GDT - * block. If we are adding a group past the last current GDT block, + * blocks. If we are adding a group past the last current GDT block, * we will also modify the inode and the dindirect block. If we * are adding a group with superblock/GDT backups we will also * modify each of the reserved GDT dindirect blocks. */ - credit = flex_gd->count * 4 + reserved_gdb; + credit = 3; /* sb, resize inode, resize inode dindirect */ + /* GDT blocks */ + credit += 1 + DIV_ROUND_UP(flex_gd->count, EXT4_DESC_PER_BLOCK(sb)); + credit += reserved_gdb; /* Reserved GDT dindirect blocks */ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credit); if (IS_ERR(handle)) { err = PTR_ERR(handle); -- cgit v1.2.3 From 40cdc7a530c7a075557651a071354bb42b99df08 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 Apr 2015 14:50:13 +0200 Subject: nfsd/blocklayout: pretend we can send deviceid notifications Commit df52699e4fcef ("NFSv4.1: Don't cache deviceids that have no notifications") causes the Linux NFS client to stop caching deviceid's unless a server pretends to support deviceid notifications. While this behavior is stupid and the language around this area in rfc5661 is a mess carified by an errata that I submittted, Trond insists on this behavior. Not caching deviceids degrades block layout performance massively as a GETDEVICEINFO is fairly expensive. So add this hack to make the Linux client happy again. Cc: stable@vger.kernel.org Signed-off-by: Christoph Hellwig Signed-off-by: J. Bruce Fields --- fs/nfsd/blocklayout.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 03d647bf195d..cdefaa331a07 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -181,6 +181,17 @@ nfsd4_block_proc_layoutcommit(struct inode *inode, } const struct nfsd4_layout_ops bl_layout_ops = { + /* + * Pretend that we send notification to the client. This is a blatant + * lie to force recent Linux clients to cache our device IDs. + * We rarely ever change the device ID, so the harm of leaking deviceids + * for a while isn't too bad. Unfortunately RFC5661 is a complete mess + * in this regard, but I filed errata 4119 for this a while ago, and + * hopefully the Linux client will eventually start caching deviceids + * without this again. + */ + .notify_types = + NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE, .proc_getdeviceinfo = nfsd4_block_proc_getdeviceinfo, .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo, .proc_layoutget = nfsd4_block_proc_layoutget, -- cgit v1.2.3 From ebe9cb3bb13e7b9b281969cd279ce70834f7500f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 28 Apr 2015 15:41:15 +0200 Subject: nfsd: fix the check for confirmed openowner in nfs4_preprocess_stateid_op If we find a non-confirmed openowner we jump to exit the function, but do not set an error value. Fix this by factoring out a helper to do the check and properly set the error from nfsd4_validate_stateid. Cc: stable@vger.kernel.org Signed-off-by: Christoph Hellwig Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 38f2d7abe3a7..fb8c66725035 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4385,10 +4385,17 @@ static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_s return nfserr_old_stateid; } +static __be32 nfsd4_check_openowner_confirmed(struct nfs4_ol_stateid *ols) +{ + if (ols->st_stateowner->so_is_open_owner && + !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) + return nfserr_bad_stateid; + return nfs_ok; +} + static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) { struct nfs4_stid *s; - struct nfs4_ol_stateid *ols; __be32 status = nfserr_bad_stateid; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) @@ -4418,13 +4425,7 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) break; case NFS4_OPEN_STID: case NFS4_LOCK_STID: - ols = openlockstateid(s); - if (ols->st_stateowner->so_is_open_owner - && !(openowner(ols->st_stateowner)->oo_flags - & NFS4_OO_CONFIRMED)) - status = nfserr_bad_stateid; - else - status = nfs_ok; + status = nfsd4_check_openowner_confirmed(openlockstateid(s)); break; default: printk("unknown stateid type %x\n", s->sc_type); @@ -4516,8 +4517,8 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, status = nfs4_check_fh(current_fh, stp); if (status) goto out; - if (stp->st_stateowner->so_is_open_owner - && !(openowner(stp->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) + status = nfsd4_check_openowner_confirmed(stp); + if (status) goto out; status = nfs4_check_openmode(stp, flags); if (status) -- cgit v1.2.3 From 8287f009bd95a5e548059dba62a67727bb9549cd Mon Sep 17 00:00:00 2001 From: Sachin Bhamare Date: Mon, 27 Apr 2015 14:50:14 +0200 Subject: nfsd: fix pNFS return on close semantics For the sake of forgetful clients, the server should return the layouts to the file system on 'last close' of a file (assuming that there are no delegations outstanding to that particular client) or on delegreturn (assuming that there are no opens on a file from that particular client). In theory the information is all there in current data structures, but it's not efficiently available; nfs4_file->fi_ref includes references on the file across all clients, but we need a per-(client, file) count. Walking through lots of stateid's to calculate this on each close or delegreturn would be painful. This patch introduces infrastructure to maintain per-client opens and delegation counters on a per-file basis. [hch: ported to the mainline pNFS support, merged various fixes from Jeff] Signed-off-by: Sachin Bhamare Signed-off-by: Jeff Layton Signed-off-by: Christoph Hellwig Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 125 +++++++++++++++++++++++++++++++++++++++++++++++++--- fs/nfsd/state.h | 14 ++++++ fs/nfsd/xdr4.h | 1 + 3 files changed, 133 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fb8c66725035..66067a291267 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -94,6 +94,7 @@ static struct kmem_cache *lockowner_slab; static struct kmem_cache *file_slab; static struct kmem_cache *stateid_slab; static struct kmem_cache *deleg_slab; +static struct kmem_cache *odstate_slab; static void free_session(struct nfsd4_session *); @@ -281,6 +282,7 @@ put_nfs4_file(struct nfs4_file *fi) if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) { hlist_del_rcu(&fi->fi_hash); spin_unlock(&state_lock); + WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu); } @@ -471,6 +473,86 @@ static void nfs4_file_put_access(struct nfs4_file *fp, u32 access) __nfs4_file_put_access(fp, O_RDONLY); } +/* + * Allocate a new open/delegation state counter. This is needed for + * pNFS for proper return on close semantics. + * + * Note that we only allocate it for pNFS-enabled exports, otherwise + * all pointers to struct nfs4_clnt_odstate are always NULL. + */ +static struct nfs4_clnt_odstate * +alloc_clnt_odstate(struct nfs4_client *clp) +{ + struct nfs4_clnt_odstate *co; + + co = kmem_cache_zalloc(odstate_slab, GFP_KERNEL); + if (co) { + co->co_client = clp; + atomic_set(&co->co_odcount, 1); + } + return co; +} + +static void +hash_clnt_odstate_locked(struct nfs4_clnt_odstate *co) +{ + struct nfs4_file *fp = co->co_file; + + lockdep_assert_held(&fp->fi_lock); + list_add(&co->co_perfile, &fp->fi_clnt_odstate); +} + +static inline void +get_clnt_odstate(struct nfs4_clnt_odstate *co) +{ + if (co) + atomic_inc(&co->co_odcount); +} + +static void +put_clnt_odstate(struct nfs4_clnt_odstate *co) +{ + struct nfs4_file *fp; + + if (!co) + return; + + fp = co->co_file; + if (atomic_dec_and_lock(&co->co_odcount, &fp->fi_lock)) { + list_del(&co->co_perfile); + spin_unlock(&fp->fi_lock); + + nfsd4_return_all_file_layouts(co->co_client, fp); + kmem_cache_free(odstate_slab, co); + } +} + +static struct nfs4_clnt_odstate * +find_or_hash_clnt_odstate(struct nfs4_file *fp, struct nfs4_clnt_odstate *new) +{ + struct nfs4_clnt_odstate *co; + struct nfs4_client *cl; + + if (!new) + return NULL; + + cl = new->co_client; + + spin_lock(&fp->fi_lock); + list_for_each_entry(co, &fp->fi_clnt_odstate, co_perfile) { + if (co->co_client == cl) { + get_clnt_odstate(co); + goto out; + } + } + co = new; + co->co_file = fp; + hash_clnt_odstate_locked(new); +out: + spin_unlock(&fp->fi_lock); + return co; +} + struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab) { @@ -606,7 +688,8 @@ static void block_delegations(struct knfsd_fh *fh) } static struct nfs4_delegation * -alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh) +alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh, + struct nfs4_clnt_odstate *odstate) { struct nfs4_delegation *dp; long n; @@ -631,6 +714,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh) INIT_LIST_HEAD(&dp->dl_perfile); INIT_LIST_HEAD(&dp->dl_perclnt); INIT_LIST_HEAD(&dp->dl_recall_lru); + dp->dl_clnt_odstate = odstate; + get_clnt_odstate(odstate); dp->dl_type = NFS4_OPEN_DELEGATE_READ; dp->dl_retries = 1; nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client, @@ -714,6 +799,7 @@ static void destroy_delegation(struct nfs4_delegation *dp) spin_lock(&state_lock); unhash_delegation_locked(dp); spin_unlock(&state_lock); + put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_deleg_lease(dp->dl_stid.sc_file); nfs4_put_stid(&dp->dl_stid); } @@ -724,6 +810,7 @@ static void revoke_delegation(struct nfs4_delegation *dp) WARN_ON(!list_empty(&dp->dl_recall_lru)); + put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_deleg_lease(dp->dl_stid.sc_file); if (clp->cl_minorversion == 0) @@ -933,6 +1020,7 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid) { struct nfs4_ol_stateid *stp = openlockstateid(stid); + put_clnt_odstate(stp->st_clnt_odstate); release_all_access(stp); if (stp->st_stateowner) nfs4_put_stateowner(stp->st_stateowner); @@ -1634,6 +1722,7 @@ __destroy_client(struct nfs4_client *clp) while (!list_empty(&reaplist)) { dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); + put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_deleg_lease(dp->dl_stid.sc_file); nfs4_put_stid(&dp->dl_stid); } @@ -3057,6 +3146,7 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, spin_lock_init(&fp->fi_lock); INIT_LIST_HEAD(&fp->fi_stateids); INIT_LIST_HEAD(&fp->fi_delegations); + INIT_LIST_HEAD(&fp->fi_clnt_odstate); fh_copy_shallow(&fp->fi_fhandle, fh); fp->fi_deleg_file = NULL; fp->fi_had_conflict = false; @@ -3073,6 +3163,7 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, void nfsd4_free_slabs(void) { + kmem_cache_destroy(odstate_slab); kmem_cache_destroy(openowner_slab); kmem_cache_destroy(lockowner_slab); kmem_cache_destroy(file_slab); @@ -3103,8 +3194,14 @@ nfsd4_init_slabs(void) sizeof(struct nfs4_delegation), 0, 0, NULL); if (deleg_slab == NULL) goto out_free_stateid_slab; + odstate_slab = kmem_cache_create("nfsd4_odstate", + sizeof(struct nfs4_clnt_odstate), 0, 0, NULL); + if (odstate_slab == NULL) + goto out_free_deleg_slab; return 0; +out_free_deleg_slab: + kmem_cache_destroy(deleg_slab); out_free_stateid_slab: kmem_cache_destroy(stateid_slab); out_free_file_slab: @@ -3581,6 +3678,14 @@ alloc_stateid: open->op_stp = nfs4_alloc_open_stateid(clp); if (!open->op_stp) return nfserr_jukebox; + + if (nfsd4_has_session(cstate) && + (cstate->current_fh.fh_export->ex_flags & NFSEXP_PNFS)) { + open->op_odstate = alloc_clnt_odstate(clp); + if (!open->op_odstate) + return nfserr_jukebox; + } + return nfs_ok; } @@ -3869,7 +3974,7 @@ out_fput: static struct nfs4_delegation * nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, - struct nfs4_file *fp) + struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate) { int status; struct nfs4_delegation *dp; @@ -3877,7 +3982,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, if (fp->fi_had_conflict) return ERR_PTR(-EAGAIN); - dp = alloc_init_deleg(clp, fh); + dp = alloc_init_deleg(clp, fh, odstate); if (!dp) return ERR_PTR(-ENOMEM); @@ -3903,6 +4008,7 @@ out_unlock: spin_unlock(&state_lock); out: if (status) { + put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_stid(&dp->dl_stid); return ERR_PTR(status); } @@ -3980,7 +4086,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, default: goto out_no_deleg; } - dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file); + dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file, stp->st_clnt_odstate); if (IS_ERR(dp)) goto out_no_deleg; @@ -4069,6 +4175,11 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf release_open_stateid(stp); goto out; } + + stp->st_clnt_odstate = find_or_hash_clnt_odstate(fp, + open->op_odstate); + if (stp->st_clnt_odstate == open->op_odstate) + open->op_odstate = NULL; } update_stateid(&stp->st_stid.sc_stateid); memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); @@ -4129,6 +4240,8 @@ void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, kmem_cache_free(file_slab, open->op_file); if (open->op_stp) nfs4_put_stid(&open->op_stp->st_stid); + if (open->op_odstate) + kmem_cache_free(odstate_slab, open->op_odstate); } __be32 @@ -4853,9 +4966,6 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, update_stateid(&stp->st_stid.sc_stateid); memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); - nfsd4_return_all_file_layouts(stp->st_stateowner->so_client, - stp->st_stid.sc_file); - nfsd4_close_open_stateid(stp); /* put reference from nfs4_preprocess_seqid_op */ @@ -6489,6 +6599,7 @@ nfs4_state_shutdown_net(struct net *net) list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); + put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_deleg_lease(dp->dl_stid.sc_file); nfs4_put_stid(&dp->dl_stid); } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 4f3bfeb11766..bde45d90b746 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -126,6 +126,7 @@ struct nfs4_delegation { struct list_head dl_perfile; struct list_head dl_perclnt; struct list_head dl_recall_lru; /* delegation recalled */ + struct nfs4_clnt_odstate *dl_clnt_odstate; u32 dl_type; time_t dl_time; /* For recall: */ @@ -464,6 +465,17 @@ static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so) return container_of(so, struct nfs4_lockowner, lo_owner); } +/* + * Per-client state indicating no. of opens and outstanding delegations + * on a file from a particular client.'od' stands for 'open & delegation' + */ +struct nfs4_clnt_odstate { + struct nfs4_client *co_client; + struct nfs4_file *co_file; + struct list_head co_perfile; + atomic_t co_odcount; +}; + /* * nfs4_file: a file opened by some number of (open) nfs4_stateowners. * @@ -485,6 +497,7 @@ struct nfs4_file { struct list_head fi_delegations; struct rcu_head fi_rcu; }; + struct list_head fi_clnt_odstate; /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ struct file * fi_fds[3]; /* @@ -526,6 +539,7 @@ struct nfs4_ol_stateid { struct list_head st_perstateowner; struct list_head st_locks; struct nfs4_stateowner * st_stateowner; + struct nfs4_clnt_odstate * st_clnt_odstate; unsigned char st_access_bmap; unsigned char st_deny_bmap; struct nfs4_ol_stateid * st_openstp; diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index f982ae84f0cd..2f8c092be2b3 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -247,6 +247,7 @@ struct nfsd4_open { struct nfs4_openowner *op_openowner; /* used during processing */ struct nfs4_file *op_file; /* used during processing */ struct nfs4_ol_stateid *op_stp; /* used during processing */ + struct nfs4_clnt_odstate *op_odstate; /* used during processing */ struct nfs4_acl *op_acl; struct xdr_netobj op_label; }; -- cgit v1.2.3 From ef2a1b3e1067195f1d6b89d8329454775c87f033 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 30 Apr 2015 11:49:23 +0200 Subject: nfsd: split transport vs operation errors for callbacks We must only increment the sequence id if the client has seen and responded to a request. If we failed to deliver it to the client we must resend with the same sequence id. So just like the client track errors at the transport level differently from those returned in the XDR. Signed-off-by: Christoph Hellwig Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 60 ++++++++++++++++++++------------------------------ fs/nfsd/state.h | 1 + 2 files changed, 25 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 58277859a467..cd58b7cd3f1a 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -224,7 +224,7 @@ static int nfs_cb_stat_to_errno(int status) } static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected, - enum nfsstat4 *status) + int *status) { __be32 *p; u32 op; @@ -235,7 +235,7 @@ static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected, op = be32_to_cpup(p++); if (unlikely(op != expected)) goto out_unexpected; - *status = be32_to_cpup(p); + *status = nfs_cb_stat_to_errno(be32_to_cpup(p)); return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -446,22 +446,16 @@ out_overflow: static int decode_cb_sequence4res(struct xdr_stream *xdr, struct nfsd4_callback *cb) { - enum nfsstat4 nfserr; int status; if (cb->cb_minorversion == 0) return 0; - status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &nfserr); - if (unlikely(status)) - goto out; - if (unlikely(nfserr != NFS4_OK)) - goto out_default; - status = decode_cb_sequence4resok(xdr, cb); -out: - return status; -out_default: - return nfs_cb_stat_to_errno(nfserr); + status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_status); + if (unlikely(status || cb->cb_status)) + return status; + + return decode_cb_sequence4resok(xdr, cb); } /* @@ -524,26 +518,19 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, struct nfsd4_callback *cb) { struct nfs4_cb_compound_hdr hdr; - enum nfsstat4 nfserr; int status; status = decode_cb_compound4res(xdr, &hdr); if (unlikely(status)) - goto out; + return status; if (cb != NULL) { status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status)) - goto out; + if (unlikely(status || cb->cb_status)) + return status; } - status = decode_cb_op_status(xdr, OP_CB_RECALL, &nfserr); - if (unlikely(status)) - goto out; - if (unlikely(nfserr != NFS4_OK)) - status = nfs_cb_stat_to_errno(nfserr); -out: - return status; + return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status); } #ifdef CONFIG_NFSD_PNFS @@ -621,24 +608,18 @@ static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, struct nfsd4_callback *cb) { struct nfs4_cb_compound_hdr hdr; - enum nfsstat4 nfserr; int status; status = decode_cb_compound4res(xdr, &hdr); if (unlikely(status)) - goto out; + return status; + if (cb) { status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status)) - goto out; + if (unlikely(status || cb->cb_status)) + return status; } - status = decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &nfserr); - if (unlikely(status)) - goto out; - if (unlikely(nfserr != NFS4_OK)) - status = nfs_cb_stat_to_errno(nfserr); -out: - return status; + return decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &cb->cb_status); } #endif /* CONFIG_NFSD_PNFS */ @@ -918,7 +899,8 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) if (clp->cl_minorversion) { /* No need for lock, access serialized in nfsd4_cb_prepare */ - ++clp->cl_cb_session->se_cb_seq_nr; + if (!task->tk_status) + ++clp->cl_cb_session->se_cb_seq_nr; clear_bit(0, &clp->cl_cb_slot_busy); rpc_wake_up_next(&clp->cl_cb_waitq); dprintk("%s: freed slot, new seqid=%d\n", __func__, @@ -935,6 +917,11 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) if (cb->cb_done) return; + if (cb->cb_status) { + WARN_ON_ONCE(task->tk_status); + task->tk_status = cb->cb_status; + } + switch (cb->cb_ops->done(cb, task)) { case 0: task->tk_status = 0; @@ -1099,6 +1086,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, cb->cb_ops = ops; INIT_WORK(&cb->cb_work, nfsd4_run_cb_work); INIT_LIST_HEAD(&cb->cb_per_client); + cb->cb_status = 0; cb->cb_done = true; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index bde45d90b746..e791985a7318 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -68,6 +68,7 @@ struct nfsd4_callback { struct rpc_message cb_msg; struct nfsd4_callback_ops *cb_ops; struct work_struct cb_work; + int cb_status; bool cb_done; }; -- cgit v1.2.3 From cba5f62b1830c1919b47544789bc993e6e617dc6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 30 Apr 2015 11:49:24 +0200 Subject: nfsd: fix callback restarts Checking the rpc_client pointer is not a reliable way to detect backchannel changes: cl_cb_client is changed only after shutting down the rpc client, so the condition cl_cb_client = tk_client will always be true. Check the RPC_TASK_KILLED flag instead, and rewrite the code to avoid the buggy cl_callbacks list and fix the lifetime rules due to double calls of the ->prepare callback operations method for this retry case. Signed-off-by: Christoph Hellwig Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 52 ++++++++++++++++++++++---------------------------- fs/nfsd/nfs4state.c | 1 - fs/nfsd/state.h | 4 +--- 3 files changed, 24 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index cd58b7cd3f1a..4c993aaf1e6e 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -879,13 +879,6 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) if (!nfsd41_cb_get_slot(clp, task)) return; } - spin_lock(&clp->cl_lock); - if (list_empty(&cb->cb_per_client)) { - /* This is the first call, not a restart */ - cb->cb_done = false; - list_add(&cb->cb_per_client, &clp->cl_callbacks); - } - spin_unlock(&clp->cl_lock); rpc_call_start(task); } @@ -907,16 +900,21 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) clp->cl_cb_session->se_cb_seq_nr); } - if (clp->cl_cb_client != task->tk_client) { - /* We're shutting down or changing cl_cb_client; leave - * it to nfsd4_process_cb_update to restart the call if - * necessary. */ + /* + * If the backchannel connection was shut down while this + * task was queued, we need to resubmit it after setting up + * a new backchannel connection. + * + * Note that if we lost our callback connection permanently + * the submission code will error out, so we don't need to + * handle that case here. + */ + if (task->tk_flags & RPC_TASK_KILLED) { + task->tk_status = 0; + cb->cb_need_restart = true; return; } - if (cb->cb_done) - return; - if (cb->cb_status) { WARN_ON_ONCE(task->tk_status); task->tk_status = cb->cb_status; @@ -936,21 +934,17 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) default: BUG(); } - cb->cb_done = true; } static void nfsd4_cb_release(void *calldata) { struct nfsd4_callback *cb = calldata; - struct nfs4_client *clp = cb->cb_clp; - - if (cb->cb_done) { - spin_lock(&clp->cl_lock); - list_del(&cb->cb_per_client); - spin_unlock(&clp->cl_lock); + if (cb->cb_need_restart) + nfsd4_run_cb(cb); + else cb->cb_ops->release(cb); - } + } static const struct rpc_call_ops nfsd4_cb_ops = { @@ -1045,9 +1039,6 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) nfsd4_mark_cb_down(clp, err); return; } - /* Yay, the callback channel's back! Restart any callbacks: */ - list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client) - queue_work(callback_wq, &cb->cb_work); } static void @@ -1058,8 +1049,12 @@ nfsd4_run_cb_work(struct work_struct *work) struct nfs4_client *clp = cb->cb_clp; struct rpc_clnt *clnt; - if (cb->cb_ops && cb->cb_ops->prepare) - cb->cb_ops->prepare(cb); + if (cb->cb_need_restart) { + cb->cb_need_restart = false; + } else { + if (cb->cb_ops && cb->cb_ops->prepare) + cb->cb_ops->prepare(cb); + } if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK) nfsd4_process_cb_update(cb); @@ -1085,9 +1080,8 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, cb->cb_msg.rpc_resp = cb; cb->cb_ops = ops; INIT_WORK(&cb->cb_work, nfsd4_run_cb_work); - INIT_LIST_HEAD(&cb->cb_per_client); cb->cb_status = 0; - cb->cb_done = true; + cb->cb_need_restart = false; } void nfsd4_run_cb(struct nfsd4_callback *cb) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 66067a291267..039f9c8a95e8 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1626,7 +1626,6 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) INIT_LIST_HEAD(&clp->cl_openowners); INIT_LIST_HEAD(&clp->cl_delegations); INIT_LIST_HEAD(&clp->cl_lru); - INIT_LIST_HEAD(&clp->cl_callbacks); INIT_LIST_HEAD(&clp->cl_revoked); #ifdef CONFIG_NFSD_PNFS INIT_LIST_HEAD(&clp->cl_lo_states); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index e791985a7318..dbc4f85a5008 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -63,13 +63,12 @@ typedef struct { struct nfsd4_callback { struct nfs4_client *cb_clp; - struct list_head cb_per_client; u32 cb_minorversion; struct rpc_message cb_msg; struct nfsd4_callback_ops *cb_ops; struct work_struct cb_work; int cb_status; - bool cb_done; + bool cb_need_restart; }; struct nfsd4_callback_ops { @@ -334,7 +333,6 @@ struct nfs4_client { int cl_cb_state; struct nfsd4_callback cl_cb_null; struct nfsd4_session *cl_cb_session; - struct list_head cl_callbacks; /* list of in-progress callbacks */ /* for all client information that callback code might need: */ spinlock_t cl_lock; -- cgit v1.2.3 From 4bd9e9b77fc6787c45b8bb439f6511aa3478606c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 30 Apr 2015 11:49:25 +0200 Subject: nfsd: skip CB_NULL probes for 4.1 or later With sessions in v4.1 or later we don't need to manually probe the backchannel connection, so we can declare it up instantly after setting up the RPC client. Note that we really should split nfsd4_run_cb_work in the long run, this is just the least intrusive fix for now. Signed-off-by: Christoph Hellwig Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 4c993aaf1e6e..5694cfb7a47b 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1066,6 +1066,15 @@ nfsd4_run_cb_work(struct work_struct *work) cb->cb_ops->release(cb); return; } + + /* + * Don't send probe messages for 4.1 or later. + */ + if (!cb->cb_ops && clp->cl_minorversion) { + clp->cl_cb_state = NFSD4_CB_UP; + return; + } + cb->cb_msg.rpc_cred = clp->cl_cb_cred; rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb); -- cgit v1.2.3 From 5463e7c18e51152104aba9614e6abfc039a8b710 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 21 Apr 2015 10:40:54 -0700 Subject: Revert "f2fs: enhance multi-threads performance" This reports performance regression by Yuanhan Liu. The basic idea was to reduce one-point mutex, but it turns out this causes another contention like context swithes. https://lkml.org/lkml/2015/4/21/11 Until finishing the analysis on this issue, I'd like to revert this for a while. This reverts commit 78373b7319abdf15050af5b1632c4c8b8b398f33. --- fs/f2fs/data.c | 7 +++++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 1 + 3 files changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b91b0e10678e..1e1aae669fa8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1513,6 +1513,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool locked = false; int ret; long diff; @@ -1533,7 +1534,13 @@ static int f2fs_write_data_pages(struct address_space *mapping, diff = nr_pages_to_write(sbi, DATA, wbc); + if (!S_ISDIR(inode->i_mode)) { + mutex_lock(&sbi->writepages); + locked = true; + } ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); + if (locked) + mutex_unlock(&sbi->writepages); f2fs_submit_merged_bio(sbi, DATA, WRITE); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d8921cf2ba9a..8de34ab6d5b1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -625,6 +625,7 @@ struct f2fs_sb_info { struct mutex cp_mutex; /* checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ + struct mutex writepages; /* mutex for writepages() */ wait_queue_head_t cp_wait; struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 160b88346b24..b2dd1b01f076 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1035,6 +1035,7 @@ try_onemore: sbi->raw_super = raw_super; sbi->raw_super_buf = raw_super_buf; mutex_init(&sbi->gc_mutex); + mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); clear_sbi_flag(sbi, SBI_POR_DOING); -- cgit v1.2.3 From 7263b1bd0490fca68ee7eedb0b6973cb86d4701c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 22 Apr 2015 11:03:48 -0700 Subject: f2fs: fix wrong error hanlder in f2fs_follow_link The page_follow_link_light returns NULL and its error pointer was remained in nd->path. Reported-by: Dan Carpenter Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 7e3794edae42..658e8079aaf9 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -298,16 +298,14 @@ fail: static void *f2fs_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct page *page; + struct page *page = page_follow_link_light(dentry, nd); - page = page_follow_link_light(dentry, nd); - if (IS_ERR(page)) + if (IS_ERR_OR_NULL(page)) return page; /* this is broken symlink case */ if (*nd_get_link(nd) == 0) { - kunmap(page); - page_cache_release(page); + page_put_link(dentry, nd, page); return ERR_PTR(-ENOENT); } return page; -- cgit v1.2.3 From f5b697700c86d7d01489202bfd37d86665754afd Mon Sep 17 00:00:00 2001 From: Daniel Baluta Date: Tue, 5 May 2015 16:23:54 -0700 Subject: configfs: init configfs module earlier at boot time We need this earlier in the boot process to allow various subsystems to use configfs (e.g Industrial IIO). Also, debugfs is at core_initcall level and configfs should be on the same level from infrastructure point of view. Signed-off-by: Daniel Baluta Suggested-by: Lars-Peter Clausen Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/configfs/mount.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index da94e41bdbf6..537356742091 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -173,5 +173,5 @@ MODULE_LICENSE("GPL"); MODULE_VERSION("0.0.2"); MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration."); -module_init(configfs_init); +core_initcall(configfs_init); module_exit(configfs_exit); -- cgit v1.2.3 From d8fd150fe3935e1692bf57c66691e17409ebb9c1 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Tue, 5 May 2015 16:24:00 -0700 Subject: nilfs2: fix sanity check of btree level in nilfs_btree_root_broken() The range check for b-tree level parameter in nilfs_btree_root_broken() is wrong; it accepts the case of "level == NILFS_BTREE_LEVEL_MAX" even though the level is limited to values in the range of 0 to (NILFS_BTREE_LEVEL_MAX - 1). Since the level parameter is read from storage device and used to index nilfs_btree_path array whose element count is NILFS_BTREE_LEVEL_MAX, it can cause memory overrun during btree operations if the boundary value is set to the level parameter on device. This fixes the broken sanity check and adds a comment to clarify that the upper bound NILFS_BTREE_LEVEL_MAX is exclusive. Signed-off-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/btree.c | 2 +- include/linux/nilfs2_fs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 059f37137f9a..919fd5bb14a8 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -388,7 +388,7 @@ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node, nchildren = nilfs_btree_node_get_nchildren(node); if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN || - level > NILFS_BTREE_LEVEL_MAX || + level >= NILFS_BTREE_LEVEL_MAX || nchildren < 0 || nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) { pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n", diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index ff3fea3194c6..9abb763e4b86 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -460,7 +460,7 @@ struct nilfs_btree_node { /* level */ #define NILFS_BTREE_LEVEL_DATA 0 #define NILFS_BTREE_LEVEL_NODE_MIN (NILFS_BTREE_LEVEL_DATA + 1) -#define NILFS_BTREE_LEVEL_MAX 14 +#define NILFS_BTREE_LEVEL_MAX 14 /* Max level (exclusive) */ /** * struct nilfs_palloc_group_desc - block group descriptor -- cgit v1.2.3 From b1432a2a35565f538586774a03bf277c27fc267d Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Tue, 5 May 2015 16:24:02 -0700 Subject: ocfs2: dlm: fix race between purge and get lock resource There is a race window in dlm_get_lock_resource(), which may return a lock resource which has been purged. This will cause the process to hang forever in dlmlock() as the ast msg can't be handled due to its lock resource not existing. dlm_get_lock_resource { ... spin_lock(&dlm->spinlock); tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); if (tmpres) { spin_unlock(&dlm->spinlock); >>>>>>>> race window, dlm_run_purge_list() may run and purge the lock resource spin_lock(&tmpres->spinlock); ... spin_unlock(&tmpres->spinlock); } } Signed-off-by: Junxiao Bi Cc: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmmaster.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index a6944b25fd5b..fdf4b41d0609 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -757,6 +757,19 @@ lookup: if (tmpres) { spin_unlock(&dlm->spinlock); spin_lock(&tmpres->spinlock); + + /* + * Right after dlm spinlock was released, dlm_thread could have + * purged the lockres. Check if lockres got unhashed. If so + * start over. + */ + if (hlist_unhashed(&tmpres->hash_node)) { + spin_unlock(&tmpres->spinlock); + dlm_lockres_put(tmpres); + tmpres = NULL; + goto lookup; + } + /* Wait on the thread that is mastering the resource */ if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { __dlm_wait_on_lockres(tmpres); -- cgit v1.2.3 From 0ff28d9f4674d781e492bcff6f32f0fe48cf0fed Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 6 May 2015 17:26:47 +0200 Subject: splice: sendfile() at once fails for big files Using sendfile with below small program to get MD5 sums of some files, it appear that big files (over 64kbytes with 4k pages system) get a wrong MD5 sum while small files get the correct sum. This program uses sendfile() to send a file to an AF_ALG socket for hashing. /* md5sum2.c */ #include #include #include #include #include #include #include #include #include int main(int argc, char **argv) { int sk = socket(AF_ALG, SOCK_SEQPACKET, 0); struct stat st; struct sockaddr_alg sa = { .salg_family = AF_ALG, .salg_type = "hash", .salg_name = "md5", }; int n; bind(sk, (struct sockaddr*)&sa, sizeof(sa)); for (n = 1; n < argc; n++) { int size; int offset = 0; char buf[4096]; int fd; int sko; int i; fd = open(argv[n], O_RDONLY); sko = accept(sk, NULL, 0); fstat(fd, &st); size = st.st_size; sendfile(sko, fd, &offset, size); size = read(sko, buf, sizeof(buf)); for (i = 0; i < size; i++) printf("%2.2x", buf[i]); printf(" %s\n", argv[n]); close(fd); close(sko); } exit(0); } Test below is done using official linux patch files. First result is with a software based md5sum. Second result is with the program above. root@vgoip:~# ls -l patch-3.6.* -rw-r--r-- 1 root root 64011 Aug 24 12:01 patch-3.6.2.gz -rw-r--r-- 1 root root 94131 Aug 24 12:01 patch-3.6.3.gz root@vgoip:~# md5sum patch-3.6.* b3ffb9848196846f31b2ff133d2d6443 patch-3.6.2.gz c5e8f687878457db77cb7158c38a7e43 patch-3.6.3.gz root@vgoip:~# ./md5sum2 patch-3.6.* b3ffb9848196846f31b2ff133d2d6443 patch-3.6.2.gz 5fd77b24e68bb24dcc72d6e57c64790e patch-3.6.3.gz After investivation, it appears that sendfile() sends the files by blocks of 64kbytes (16 times PAGE_SIZE). The problem is that at the end of each block, the SPLICE_F_MORE flag is missing, therefore the hashing operation is reset as if it was the end of the file. This patch adds SPLICE_F_MORE to the flags when more data is pending. With the patch applied, we get the correct sums: root@vgoip:~# md5sum patch-3.6.* b3ffb9848196846f31b2ff133d2d6443 patch-3.6.2.gz c5e8f687878457db77cb7158c38a7e43 patch-3.6.3.gz root@vgoip:~# ./md5sum2 patch-3.6.* b3ffb9848196846f31b2ff133d2d6443 patch-3.6.2.gz c5e8f687878457db77cb7158c38a7e43 patch-3.6.3.gz Signed-off-by: Christophe Leroy Signed-off-by: Jens Axboe --- fs/splice.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 476024bb6546..bfe62ae40f40 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1161,7 +1161,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, long ret, bytes; umode_t i_mode; size_t len; - int i, flags; + int i, flags, more; /* * We require the input being a regular file, as we don't want to @@ -1204,6 +1204,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, * Don't block on output, we have to drain the direct pipe. */ sd->flags &= ~SPLICE_F_NONBLOCK; + more = sd->flags & SPLICE_F_MORE; while (len) { size_t read_len; @@ -1216,6 +1217,15 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, read_len = ret; sd->total_len = read_len; + /* + * If more data is pending, set SPLICE_F_MORE + * If this is the last data and SPLICE_F_MORE was not set + * initially, clears it. + */ + if (read_len < len) + sd->flags |= SPLICE_F_MORE; + else if (!more) + sd->flags &= ~SPLICE_F_MORE; /* * NOTE: nonblocking mode only applies to the input. We * must not do the output in nonblocking mode as then we -- cgit v1.2.3 From 1d3c61c2eb3fe4f96d3192212f1bdcee49ea55aa Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 6 May 2015 11:17:01 +0100 Subject: Btrfs: fix wrong mapping flags for free space inode We were passing a flags value that differed from the intention in commit 2b108268006e ("Btrfs: don't use highmem for free space cache pages"). This caused problems in a ARM machine, leaving btrfs unusable there. Reported-by: Merlijn Wajer Tested-by: Merlijn Wajer Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 41c510b7cc11..5e020d76fd07 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -86,7 +86,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, mapping_set_gfp_mask(inode->i_mapping, mapping_gfp_mask(inode->i_mapping) & - ~(GFP_NOFS & ~__GFP_HIGHMEM)); + ~(__GFP_FS | __GFP_HIGHMEM)); return inode; } -- cgit v1.2.3 From 766c4cbfacd8634d7580bac6a1b8456e63de3e84 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 May 2015 19:24:57 -0400 Subject: namei: d_is_negative() should be checked before ->d_seq validation Fetching ->d_inode, verifying ->d_seq and finding d_is_negative() to be true does *not* mean that inode we'd fetched had been NULL - that holds only while ->d_seq is still unchanged. Shift d_is_negative() checks into lookup_fast() prior to ->d_seq verification. Reported-by: Steven Rostedt Tested-by: Steven Rostedt Signed-off-by: Al Viro --- fs/namei.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 4a8d998b7274..f67cf6cef986 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1415,6 +1415,7 @@ static int lookup_fast(struct nameidata *nd, */ if (nd->flags & LOOKUP_RCU) { unsigned seq; + bool negative; dentry = __d_lookup_rcu(parent, &nd->last, &seq); if (!dentry) goto unlazy; @@ -1424,8 +1425,11 @@ static int lookup_fast(struct nameidata *nd, * the dentry name information from lookup. */ *inode = dentry->d_inode; + negative = d_is_negative(dentry); if (read_seqcount_retry(&dentry->d_seq, seq)) return -ECHILD; + if (negative) + return -ENOENT; /* * This sequence count validates that the parent had no @@ -1472,6 +1476,10 @@ unlazy: goto need_lookup; } + if (unlikely(d_is_negative(dentry))) { + dput(dentry); + return -ENOENT; + } path->mnt = mnt; path->dentry = dentry; err = follow_managed(path, nd->flags); @@ -1583,10 +1591,10 @@ static inline int walk_component(struct nameidata *nd, struct path *path, goto out_err; inode = path->dentry->d_inode; + err = -ENOENT; + if (d_is_negative(path->dentry)) + goto out_path_put; } - err = -ENOENT; - if (d_is_negative(path->dentry)) - goto out_path_put; if (should_follow_link(path->dentry, follow)) { if (nd->flags & LOOKUP_RCU) { @@ -3036,14 +3044,13 @@ retry_lookup: BUG_ON(nd->flags & LOOKUP_RCU); inode = path->dentry->d_inode; -finish_lookup: - /* we _can_ be in RCU mode here */ error = -ENOENT; if (d_is_negative(path->dentry)) { path_to_nameidata(path, nd); goto out; } - +finish_lookup: + /* we _can_ be in RCU mode here */ if (should_follow_link(path->dentry, !symlink_ok)) { if (nd->flags & LOOKUP_RCU) { if (unlikely(nd->path.mnt != path->mnt || -- cgit v1.2.3 From f15133df088ecadd141ea1907f2c96df67c729f0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 8 May 2015 22:53:15 -0400 Subject: path_openat(): fix double fput() path_openat() jumps to the wrong place after do_tmpfile() - it has already done path_cleanup() (as part of path_lookupat() called by do_tmpfile()), so doing that again can lead to double fput(). Cc: stable@vger.kernel.org # v3.11+ Signed-off-by: Al Viro --- fs/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index f67cf6cef986..fe30d3be43a8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3233,7 +3233,7 @@ static struct file *path_openat(int dfd, struct filename *pathname, if (unlikely(file->f_flags & __O_TMPFILE)) { error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened); - goto out; + goto out2; } error = path_init(dfd, pathname, flags, nd); @@ -3263,6 +3263,7 @@ static struct file *path_openat(int dfd, struct filename *pathname, } out: path_cleanup(nd); +out2: if (!(opened & FILE_OPENED)) { BUG_ON(!error); put_filp(file); -- cgit v1.2.3 From 7e96c1b0e0f495c5a7450dc4aa7c9a24ba4305bd Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 May 2015 16:36:50 -0500 Subject: mnt: Fix fs_fully_visible to verify the root directory is visible This fixes a dumb bug in fs_fully_visible that allows proc or sys to be mounted if there is a bind mount of part of /proc/ or /sys/ visible. Cc: stable@vger.kernel.org Reported-by: Eric Windisch Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 1f4f9dac6e5a..1b9e11167bae 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3179,6 +3179,12 @@ bool fs_fully_visible(struct file_system_type *type) if (mnt->mnt.mnt_sb->s_type != type) continue; + /* This mount is not fully visible if it's root directory + * is not the root directory of the filesystem. + */ + if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) + continue; + /* This mount is not fully visible if there are any child mounts * that cover anything except for empty directories. */ -- cgit v1.2.3